kevinwang676 commited on
Commit
fb4fac3
·
verified ·
1 Parent(s): 4d4f2d3

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. DiffSynth_Studio.py +15 -0
  3. LICENSE +201 -0
  4. README.md +117 -13
  5. diffsynth/__init__.py +6 -0
  6. diffsynth/controlnets/__init__.py +2 -0
  7. diffsynth/controlnets/controlnet_unit.py +53 -0
  8. diffsynth/controlnets/processors.py +51 -0
  9. diffsynth/data/__init__.py +1 -0
  10. diffsynth/data/video.py +148 -0
  11. diffsynth/extensions/ESRGAN/__init__.py +118 -0
  12. diffsynth/extensions/FastBlend/__init__.py +63 -0
  13. diffsynth/extensions/FastBlend/api.py +397 -0
  14. diffsynth/extensions/FastBlend/cupy_kernels.py +119 -0
  15. diffsynth/extensions/FastBlend/data.py +146 -0
  16. diffsynth/extensions/FastBlend/patch_match.py +298 -0
  17. diffsynth/extensions/FastBlend/runners/__init__.py +4 -0
  18. diffsynth/extensions/FastBlend/runners/accurate.py +35 -0
  19. diffsynth/extensions/FastBlend/runners/balanced.py +46 -0
  20. diffsynth/extensions/FastBlend/runners/fast.py +141 -0
  21. diffsynth/extensions/FastBlend/runners/interpolation.py +121 -0
  22. diffsynth/extensions/RIFE/__init__.py +241 -0
  23. diffsynth/models/__init__.py +814 -0
  24. diffsynth/models/attention.py +89 -0
  25. diffsynth/models/downloader.py +28 -0
  26. diffsynth/models/hunyuan_dit.py +451 -0
  27. diffsynth/models/hunyuan_dit_text_encoder.py +161 -0
  28. diffsynth/models/kolors_text_encoder.py +1363 -0
  29. diffsynth/models/sd3_dit.py +797 -0
  30. diffsynth/models/sd3_text_encoder.py +0 -0
  31. diffsynth/models/sd3_vae_decoder.py +80 -0
  32. diffsynth/models/sd3_vae_encoder.py +94 -0
  33. diffsynth/models/sd_controlnet.py +587 -0
  34. diffsynth/models/sd_ipadapter.py +56 -0
  35. diffsynth/models/sd_lora.py +60 -0
  36. diffsynth/models/sd_motion.py +198 -0
  37. diffsynth/models/sd_text_encoder.py +320 -0
  38. diffsynth/models/sd_unet.py +0 -0
  39. diffsynth/models/sd_vae_decoder.py +332 -0
  40. diffsynth/models/sd_vae_encoder.py +278 -0
  41. diffsynth/models/sdxl_ipadapter.py +121 -0
  42. diffsynth/models/sdxl_motion.py +103 -0
  43. diffsynth/models/sdxl_text_encoder.py +757 -0
  44. diffsynth/models/sdxl_unet.py +0 -0
  45. diffsynth/models/sdxl_vae_decoder.py +15 -0
  46. diffsynth/models/sdxl_vae_encoder.py +15 -0
  47. diffsynth/models/svd_image_encoder.py +504 -0
  48. diffsynth/models/svd_unet.py +0 -0
  49. diffsynth/models/svd_vae_decoder.py +577 -0
  50. diffsynth/models/svd_vae_encoder.py +138 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt filter=lfs diff=lfs merge=lfs -text
DiffSynth_Studio.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Set web page format
2
+ import streamlit as st
3
+ st.set_page_config(layout="wide")
4
+ # Diasble virtual VRAM on windows system
5
+ import torch
6
+ torch.cuda.set_per_process_memory_fraction(0.999, 0)
7
+
8
+
9
+ st.markdown("""
10
+ # DiffSynth Studio
11
+
12
+ [Source Code](https://github.com/Artiprocher/DiffSynth-Studio)
13
+
14
+ Welcome to DiffSynth Studio.
15
+ """)
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [2023] [Zhongjie Duan]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,13 +1,117 @@
1
- ---
2
- title: Diffutoon
3
- emoji: 🏃
4
- colorFrom: green
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 4.39.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DiffSynth Studio
2
+
3
+
4
+ ## Introduction
5
+
6
+ DiffSynth Studio is a Diffusion engine. We have restructured architectures including Text Encoder, UNet, VAE, among others, maintaining compatibility with models from the open-source community while enhancing computational performance. We provide many interesting features. Enjoy the magic of Diffusion models!
7
+
8
+ Until now, DiffSynth Studio has supported the following models:
9
+
10
+ * [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
11
+ * [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
12
+ * [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
13
+ * [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
14
+ * [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
15
+ * [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
16
+ * [ESRGAN](https://github.com/xinntao/ESRGAN)
17
+ * [Ip-Adapter](https://github.com/tencent-ailab/IP-Adapter)
18
+ * [AnimateDiff](https://github.com/guoyww/animatediff/)
19
+ * [ControlNet](https://github.com/lllyasviel/ControlNet)
20
+ * [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
21
+ * [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)
22
+
23
+ ## News
24
+
25
+
26
+ - **June 21, 2024.** 🔥🔥🔥 We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to achieve the generation of long videos up to 128 frames.
27
+ - [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
28
+ - Source code is released in this repo. See [`examples/ExVideo`](./examples/ExVideo/).
29
+ - Models are released on [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1).
30
+ - Technical report is released on [arXiv](https://arxiv.org/abs/2406.14130).
31
+ - You can try ExVideo in this [Demo](https://huggingface.co/spaces/modelscope/ExVideo-SVD-128f-v1)!
32
+
33
+ - **June 13, 2024.** DiffSynth Studio is transferred to ModelScope. The developers have transitioned from "I" to "we". Of course, I will still participate in development and maintenance.
34
+
35
+ - **Jan 29, 2024.** We propose Diffutoon, a fantastic solution for toon shading.
36
+ - [Project Page](https://ecnu-cilab.github.io/DiffutoonProjectPage/)
37
+ - The source codes are released in this project.
38
+ - The technical report (IJCAI 2024) is released on [arXiv](https://arxiv.org/abs/2401.16224).
39
+
40
+ - **Dec 8, 2023.** We decide to develop a new Project, aiming to release the potential of diffusion models, especially in video synthesis. The development of this project is started.
41
+
42
+ - **Nov 15, 2023.** We propose FastBlend, a powerful video deflickering algorithm.
43
+ - The sd-webui extension is released on [GitHub](https://github.com/Artiprocher/sd-webui-fastblend).
44
+ - Demo videos are shown on Bilibili, including three tasks.
45
+ - [Video deflickering](https://www.bilibili.com/video/BV1d94y1W7PE)
46
+ - [Video interpolation](https://www.bilibili.com/video/BV1Lw411m71p)
47
+ - [Image-driven video rendering](https://www.bilibili.com/video/BV1RB4y1Z7LF)
48
+ - The technical report is released on [arXiv](https://arxiv.org/abs/2311.09265).
49
+ - An unofficial ComfyUI extension developed by other users is released on [GitHub](https://github.com/AInseven/ComfyUI-fastblend).
50
+
51
+ - **Oct 1, 2023.** We release an early version of this project, namely FastSDXL. A try for building a diffusion engine.
52
+ - The source codes are released on [GitHub](https://github.com/Artiprocher/FastSDXL).
53
+ - FastSDXL includes a trainable OLSS scheduler for efficiency improvement.
54
+ - The original repo of OLSS is [here](https://github.com/alibaba/EasyNLP/tree/master/diffusion/olss_scheduler).
55
+ - The technical report (CIKM 2023) is released on [arXiv](https://arxiv.org/abs/2305.14677).
56
+ - A demo video is shown on [Bilibili](https://www.bilibili.com/video/BV1w8411y7uj).
57
+ - Since OLSS requires additional training, we don't implement it in this project.
58
+
59
+ - **Aug 29, 2023.** We propose DiffSynth, a video synthesis framework.
60
+ - [Project Page](https://ecnu-cilab.github.io/DiffSynth.github.io/).
61
+ - The source codes are released in [EasyNLP](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth).
62
+ - The technical report (ECML PKDD 2024) is released on [arXiv](https://arxiv.org/abs/2308.03463).
63
+
64
+
65
+ ## Installation
66
+
67
+ ```
68
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
69
+ cd DiffSynth-Studio
70
+ pip install -e .
71
+ ```
72
+
73
+ ## Usage (in Python code)
74
+
75
+ The Python examples are in [`examples`](./examples/). We provide an overview here.
76
+
77
+ ### Long Video Synthesis
78
+
79
+ We trained an extended video synthesis model, which can generate 128 frames. [`examples/ExVideo`](./examples/ExVideo/)
80
+
81
+ https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc
82
+
83
+ ### Image Synthesis
84
+
85
+ Generate high-resolution images, by breaking the limitation of diffusion models! [`examples/image_synthesis`](./examples/image_synthesis/).
86
+
87
+ LoRA fine-tuning is supported in [`examples/train`](./examples/train/).
88
+
89
+ |Model|Example|
90
+ |-|-|
91
+ |Stable Diffusion|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|
92
+ |Stable Diffusion XL|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|
93
+ |Stable Diffusion 3|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|
94
+ |Kolors|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/53ef6f41-da11-4701-8665-9f64392607bf)|
95
+ |Hunyuan-DiT|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|
96
+
97
+ ### Toon Shading
98
+
99
+ Render realistic videos in a flatten style and enable video editing features. [`examples/Diffutoon`](./examples/Diffutoon/)
100
+
101
+ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd
102
+
103
+ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/20528af5-5100-474a-8cdc-440b9efdd86c
104
+
105
+ ### Video Stylization
106
+
107
+ Video stylization without video models. [`examples/diffsynth`](./examples/diffsynth/)
108
+
109
+ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea
110
+
111
+ ## Usage (in WebUI)
112
+
113
+ ```
114
+ python -m streamlit run DiffSynth_Studio.py
115
+ ```
116
+
117
+ https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/93085557-73f3-4eee-a205-9829591ef954
diffsynth/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .data import *
2
+ from .models import *
3
+ from .prompts import *
4
+ from .schedulers import *
5
+ from .pipelines import *
6
+ from .controlnets import *
diffsynth/controlnets/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .controlnet_unit import ControlNetConfigUnit, ControlNetUnit, MultiControlNetManager
2
+ from .processors import Annotator
diffsynth/controlnets/controlnet_unit.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from .processors import Processor_id
4
+
5
+
6
+ class ControlNetConfigUnit:
7
+ def __init__(self, processor_id: Processor_id, model_path, scale=1.0):
8
+ self.processor_id = processor_id
9
+ self.model_path = model_path
10
+ self.scale = scale
11
+
12
+
13
+ class ControlNetUnit:
14
+ def __init__(self, processor, model, scale=1.0):
15
+ self.processor = processor
16
+ self.model = model
17
+ self.scale = scale
18
+
19
+
20
+ class MultiControlNetManager:
21
+ def __init__(self, controlnet_units=[]):
22
+ self.processors = [unit.processor for unit in controlnet_units]
23
+ self.models = [unit.model for unit in controlnet_units]
24
+ self.scales = [unit.scale for unit in controlnet_units]
25
+
26
+ def process_image(self, image, processor_id=None):
27
+ if processor_id is None:
28
+ processed_image = [processor(image) for processor in self.processors]
29
+ else:
30
+ processed_image = [self.processors[processor_id](image)]
31
+ processed_image = torch.concat([
32
+ torch.Tensor(np.array(image_, dtype=np.float32) / 255).permute(2, 0, 1).unsqueeze(0)
33
+ for image_ in processed_image
34
+ ], dim=0)
35
+ return processed_image
36
+
37
+ def __call__(
38
+ self,
39
+ sample, timestep, encoder_hidden_states, conditionings,
40
+ tiled=False, tile_size=64, tile_stride=32
41
+ ):
42
+ res_stack = None
43
+ for conditioning, model, scale in zip(conditionings, self.models, self.scales):
44
+ res_stack_ = model(
45
+ sample, timestep, encoder_hidden_states, conditioning,
46
+ tiled=tiled, tile_size=tile_size, tile_stride=tile_stride
47
+ )
48
+ res_stack_ = [res * scale for res in res_stack_]
49
+ if res_stack is None:
50
+ res_stack = res_stack_
51
+ else:
52
+ res_stack = [i + j for i, j in zip(res_stack, res_stack_)]
53
+ return res_stack
diffsynth/controlnets/processors.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing_extensions import Literal, TypeAlias
2
+ import warnings
3
+ with warnings.catch_warnings():
4
+ warnings.simplefilter("ignore")
5
+ from controlnet_aux.processor import (
6
+ CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector
7
+ )
8
+
9
+
10
+ Processor_id: TypeAlias = Literal[
11
+ "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "tile"
12
+ ]
13
+
14
+ class Annotator:
15
+ def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda'):
16
+ if processor_id == "canny":
17
+ self.processor = CannyDetector()
18
+ elif processor_id == "depth":
19
+ self.processor = MidasDetector.from_pretrained(model_path).to(device)
20
+ elif processor_id == "softedge":
21
+ self.processor = HEDdetector.from_pretrained(model_path).to(device)
22
+ elif processor_id == "lineart":
23
+ self.processor = LineartDetector.from_pretrained(model_path).to(device)
24
+ elif processor_id == "lineart_anime":
25
+ self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
26
+ elif processor_id == "openpose":
27
+ self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
28
+ elif processor_id == "tile":
29
+ self.processor = None
30
+ else:
31
+ raise ValueError(f"Unsupported processor_id: {processor_id}")
32
+
33
+ self.processor_id = processor_id
34
+ self.detect_resolution = detect_resolution
35
+
36
+ def __call__(self, image):
37
+ width, height = image.size
38
+ if self.processor_id == "openpose":
39
+ kwargs = {
40
+ "include_body": True,
41
+ "include_hand": True,
42
+ "include_face": True
43
+ }
44
+ else:
45
+ kwargs = {}
46
+ if self.processor is not None:
47
+ detect_resolution = self.detect_resolution if self.detect_resolution is not None else min(width, height)
48
+ image = self.processor(image, detect_resolution=detect_resolution, image_resolution=min(width, height), **kwargs)
49
+ image = image.resize((width, height))
50
+ return image
51
+
diffsynth/data/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .video import VideoData, save_video, save_frames
diffsynth/data/video.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import imageio, os
2
+ import numpy as np
3
+ from PIL import Image
4
+ from tqdm import tqdm
5
+
6
+
7
+ class LowMemoryVideo:
8
+ def __init__(self, file_name):
9
+ self.reader = imageio.get_reader(file_name)
10
+
11
+ def __len__(self):
12
+ return self.reader.count_frames()
13
+
14
+ def __getitem__(self, item):
15
+ return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
16
+
17
+ def __del__(self):
18
+ self.reader.close()
19
+
20
+
21
+ def split_file_name(file_name):
22
+ result = []
23
+ number = -1
24
+ for i in file_name:
25
+ if ord(i)>=ord("0") and ord(i)<=ord("9"):
26
+ if number == -1:
27
+ number = 0
28
+ number = number*10 + ord(i) - ord("0")
29
+ else:
30
+ if number != -1:
31
+ result.append(number)
32
+ number = -1
33
+ result.append(i)
34
+ if number != -1:
35
+ result.append(number)
36
+ result = tuple(result)
37
+ return result
38
+
39
+
40
+ def search_for_images(folder):
41
+ file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")]
42
+ file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
43
+ file_list = [i[1] for i in sorted(file_list)]
44
+ file_list = [os.path.join(folder, i) for i in file_list]
45
+ return file_list
46
+
47
+
48
+ class LowMemoryImageFolder:
49
+ def __init__(self, folder, file_list=None):
50
+ if file_list is None:
51
+ self.file_list = search_for_images(folder)
52
+ else:
53
+ self.file_list = [os.path.join(folder, file_name) for file_name in file_list]
54
+
55
+ def __len__(self):
56
+ return len(self.file_list)
57
+
58
+ def __getitem__(self, item):
59
+ return Image.open(self.file_list[item]).convert("RGB")
60
+
61
+ def __del__(self):
62
+ pass
63
+
64
+
65
+ def crop_and_resize(image, height, width):
66
+ image = np.array(image)
67
+ image_height, image_width, _ = image.shape
68
+ if image_height / image_width < height / width:
69
+ croped_width = int(image_height / height * width)
70
+ left = (image_width - croped_width) // 2
71
+ image = image[:, left: left+croped_width]
72
+ image = Image.fromarray(image).resize((width, height))
73
+ else:
74
+ croped_height = int(image_width / width * height)
75
+ left = (image_height - croped_height) // 2
76
+ image = image[left: left+croped_height, :]
77
+ image = Image.fromarray(image).resize((width, height))
78
+ return image
79
+
80
+
81
+ class VideoData:
82
+ def __init__(self, video_file=None, image_folder=None, height=None, width=None, **kwargs):
83
+ if video_file is not None:
84
+ self.data_type = "video"
85
+ self.data = LowMemoryVideo(video_file, **kwargs)
86
+ elif image_folder is not None:
87
+ self.data_type = "images"
88
+ self.data = LowMemoryImageFolder(image_folder, **kwargs)
89
+ else:
90
+ raise ValueError("Cannot open video or image folder")
91
+ self.length = None
92
+ self.set_shape(height, width)
93
+
94
+ def raw_data(self):
95
+ frames = []
96
+ for i in range(self.__len__()):
97
+ frames.append(self.__getitem__(i))
98
+ return frames
99
+
100
+ def set_length(self, length):
101
+ self.length = length
102
+
103
+ def set_shape(self, height, width):
104
+ self.height = height
105
+ self.width = width
106
+
107
+ def __len__(self):
108
+ if self.length is None:
109
+ return len(self.data)
110
+ else:
111
+ return self.length
112
+
113
+ def shape(self):
114
+ if self.height is not None and self.width is not None:
115
+ return self.height, self.width
116
+ else:
117
+ height, width, _ = self.__getitem__(0).shape
118
+ return height, width
119
+
120
+ def __getitem__(self, item):
121
+ frame = self.data.__getitem__(item)
122
+ width, height = frame.size
123
+ if self.height is not None and self.width is not None:
124
+ if self.height != height or self.width != width:
125
+ frame = crop_and_resize(frame, self.height, self.width)
126
+ return frame
127
+
128
+ def __del__(self):
129
+ pass
130
+
131
+ def save_images(self, folder):
132
+ os.makedirs(folder, exist_ok=True)
133
+ for i in tqdm(range(self.__len__()), desc="Saving images"):
134
+ frame = self.__getitem__(i)
135
+ frame.save(os.path.join(folder, f"{i}.png"))
136
+
137
+
138
+ def save_video(frames, save_path, fps, quality=9):
139
+ writer = imageio.get_writer(save_path, fps=fps, quality=quality)
140
+ for frame in tqdm(frames, desc="Saving video"):
141
+ frame = np.array(frame)
142
+ writer.append_data(frame)
143
+ writer.close()
144
+
145
+ def save_frames(frames, save_path):
146
+ os.makedirs(save_path, exist_ok=True)
147
+ for i, frame in enumerate(tqdm(frames, desc="Saving images")):
148
+ frame.save(os.path.join(save_path, f"{i}.png"))
diffsynth/extensions/ESRGAN/__init__.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from einops import repeat
3
+ from PIL import Image
4
+ import numpy as np
5
+
6
+
7
+ class ResidualDenseBlock(torch.nn.Module):
8
+
9
+ def __init__(self, num_feat=64, num_grow_ch=32):
10
+ super(ResidualDenseBlock, self).__init__()
11
+ self.conv1 = torch.nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
12
+ self.conv2 = torch.nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
13
+ self.conv3 = torch.nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
14
+ self.conv4 = torch.nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
15
+ self.conv5 = torch.nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
16
+ self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
17
+
18
+ def forward(self, x):
19
+ x1 = self.lrelu(self.conv1(x))
20
+ x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
21
+ x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
22
+ x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
23
+ x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
24
+ return x5 * 0.2 + x
25
+
26
+
27
+ class RRDB(torch.nn.Module):
28
+
29
+ def __init__(self, num_feat, num_grow_ch=32):
30
+ super(RRDB, self).__init__()
31
+ self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
32
+ self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
33
+ self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)
34
+
35
+ def forward(self, x):
36
+ out = self.rdb1(x)
37
+ out = self.rdb2(out)
38
+ out = self.rdb3(out)
39
+ return out * 0.2 + x
40
+
41
+
42
+ class RRDBNet(torch.nn.Module):
43
+
44
+ def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32):
45
+ super(RRDBNet, self).__init__()
46
+ self.conv_first = torch.nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
47
+ self.body = torch.torch.nn.Sequential(*[RRDB(num_feat=num_feat, num_grow_ch=num_grow_ch) for _ in range(num_block)])
48
+ self.conv_body = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
49
+ # upsample
50
+ self.conv_up1 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
51
+ self.conv_up2 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
52
+ self.conv_hr = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
53
+ self.conv_last = torch.nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
54
+ self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)
55
+
56
+ def forward(self, x):
57
+ feat = x
58
+ feat = self.conv_first(feat)
59
+ body_feat = self.conv_body(self.body(feat))
60
+ feat = feat + body_feat
61
+ # upsample
62
+ feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
63
+ feat = self.lrelu(self.conv_up1(feat))
64
+ feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
65
+ feat = self.lrelu(self.conv_up2(feat))
66
+ out = self.conv_last(self.lrelu(self.conv_hr(feat)))
67
+ return out
68
+
69
+
70
+ class ESRGAN(torch.nn.Module):
71
+ def __init__(self, model):
72
+ super().__init__()
73
+ self.model = model
74
+
75
+ @staticmethod
76
+ def from_pretrained(model_path):
77
+ model = RRDBNet()
78
+ state_dict = torch.load(model_path, map_location="cpu")["params_ema"]
79
+ model.load_state_dict(state_dict)
80
+ model.eval()
81
+ return ESRGAN(model)
82
+
83
+ def process_image(self, image):
84
+ image = torch.Tensor(np.array(image, dtype=np.float32) / 255).permute(2, 0, 1)
85
+ return image
86
+
87
+ def process_images(self, images):
88
+ images = [self.process_image(image) for image in images]
89
+ images = torch.stack(images)
90
+ return images
91
+
92
+ def decode_images(self, images):
93
+ images = (images.permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
94
+ images = [Image.fromarray(image) for image in images]
95
+ return images
96
+
97
+ @torch.no_grad()
98
+ def upscale(self, images, batch_size=4, progress_bar=lambda x:x):
99
+ # Preprocess
100
+ input_tensor = self.process_images(images)
101
+
102
+ # Interpolate
103
+ output_tensor = []
104
+ for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
105
+ batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
106
+ batch_input_tensor = input_tensor[batch_id: batch_id_]
107
+ batch_input_tensor = batch_input_tensor.to(
108
+ device=self.model.conv_first.weight.device,
109
+ dtype=self.model.conv_first.weight.dtype)
110
+ batch_output_tensor = self.model(batch_input_tensor)
111
+ output_tensor.append(batch_output_tensor.cpu())
112
+
113
+ # Output
114
+ output_tensor = torch.concat(output_tensor, dim=0)
115
+
116
+ # To images
117
+ output_images = self.decode_images(output_tensor)
118
+ return output_images
diffsynth/extensions/FastBlend/__init__.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .runners.fast import TableManager, PyramidPatchMatcher
2
+ from PIL import Image
3
+ import numpy as np
4
+ import cupy as cp
5
+
6
+
7
+ class FastBlendSmoother:
8
+ def __init__(self):
9
+ self.batch_size = 8
10
+ self.window_size = 64
11
+ self.ebsynth_config = {
12
+ "minimum_patch_size": 5,
13
+ "threads_per_block": 8,
14
+ "num_iter": 5,
15
+ "gpu_id": 0,
16
+ "guide_weight": 10.0,
17
+ "initialize": "identity",
18
+ "tracking_window_size": 0,
19
+ }
20
+
21
+ @staticmethod
22
+ def from_model_manager(model_manager):
23
+ # TODO: fetch GPU ID from model_manager
24
+ return FastBlendSmoother()
25
+
26
+ def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config):
27
+ frames_guide = [np.array(frame) for frame in frames_guide]
28
+ frames_style = [np.array(frame) for frame in frames_style]
29
+ table_manager = TableManager()
30
+ patch_match_engine = PyramidPatchMatcher(
31
+ image_height=frames_style[0].shape[0],
32
+ image_width=frames_style[0].shape[1],
33
+ channel=3,
34
+ **ebsynth_config
35
+ )
36
+ # left part
37
+ table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="FastBlend Step 1/4")
38
+ table_l = table_manager.remapping_table_to_blending_table(table_l)
39
+ table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="FastBlend Step 2/4")
40
+ # right part
41
+ table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="FastBlend Step 3/4")
42
+ table_r = table_manager.remapping_table_to_blending_table(table_r)
43
+ table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, desc="FastBlend Step 4/4")[::-1]
44
+ # merge
45
+ frames = []
46
+ for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r):
47
+ weight_m = -1
48
+ weight = weight_l + weight_m + weight_r
49
+ frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight)
50
+ frames.append(frame)
51
+ frames = [Image.fromarray(frame.clip(0, 255).astype("uint8")) for frame in frames]
52
+ return frames
53
+
54
+ def __call__(self, rendered_frames, original_frames=None, **kwargs):
55
+ frames = self.run(
56
+ original_frames, rendered_frames,
57
+ self.batch_size, self.window_size, self.ebsynth_config
58
+ )
59
+ mempool = cp.get_default_memory_pool()
60
+ pinned_mempool = cp.get_default_pinned_memory_pool()
61
+ mempool.free_all_blocks()
62
+ pinned_mempool.free_all_blocks()
63
+ return frames
diffsynth/extensions/FastBlend/api.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .runners import AccurateModeRunner, FastModeRunner, BalancedModeRunner, InterpolationModeRunner, InterpolationModeSingleFrameRunner
2
+ from .data import VideoData, get_video_fps, save_video, search_for_images
3
+ import os
4
+ import gradio as gr
5
+
6
+
7
+ def check_input_for_blending(video_guide, video_guide_folder, video_style, video_style_folder):
8
+ frames_guide = VideoData(video_guide, video_guide_folder)
9
+ frames_style = VideoData(video_style, video_style_folder)
10
+ message = ""
11
+ if len(frames_guide) < len(frames_style):
12
+ message += f"The number of frames mismatches. Only the first {len(frames_guide)} frames of style video will be used.\n"
13
+ frames_style.set_length(len(frames_guide))
14
+ elif len(frames_guide) > len(frames_style):
15
+ message += f"The number of frames mismatches. Only the first {len(frames_style)} frames of guide video will be used.\n"
16
+ frames_guide.set_length(len(frames_style))
17
+ height_guide, width_guide = frames_guide.shape()
18
+ height_style, width_style = frames_style.shape()
19
+ if height_guide != height_style or width_guide != width_style:
20
+ message += f"The shape of frames mismatches. The frames in style video will be resized to (height: {height_guide}, width: {width_guide})\n"
21
+ frames_style.set_shape(height_guide, width_guide)
22
+ return frames_guide, frames_style, message
23
+
24
+
25
+ def smooth_video(
26
+ video_guide,
27
+ video_guide_folder,
28
+ video_style,
29
+ video_style_folder,
30
+ mode,
31
+ window_size,
32
+ batch_size,
33
+ tracking_window_size,
34
+ output_path,
35
+ fps,
36
+ minimum_patch_size,
37
+ num_iter,
38
+ guide_weight,
39
+ initialize,
40
+ progress = None,
41
+ ):
42
+ # input
43
+ frames_guide, frames_style, message = check_input_for_blending(video_guide, video_guide_folder, video_style, video_style_folder)
44
+ if len(message) > 0:
45
+ print(message)
46
+ # output
47
+ if output_path == "":
48
+ if video_style is None:
49
+ output_path = os.path.join(video_style_folder, "output")
50
+ else:
51
+ output_path = os.path.join(os.path.split(video_style)[0], "output")
52
+ os.makedirs(output_path, exist_ok=True)
53
+ print("No valid output_path. Your video will be saved here:", output_path)
54
+ elif not os.path.exists(output_path):
55
+ os.makedirs(output_path, exist_ok=True)
56
+ print("Your video will be saved here:", output_path)
57
+ frames_path = os.path.join(output_path, "frames")
58
+ video_path = os.path.join(output_path, "video.mp4")
59
+ os.makedirs(frames_path, exist_ok=True)
60
+ # process
61
+ if mode == "Fast" or mode == "Balanced":
62
+ tracking_window_size = 0
63
+ ebsynth_config = {
64
+ "minimum_patch_size": minimum_patch_size,
65
+ "threads_per_block": 8,
66
+ "num_iter": num_iter,
67
+ "gpu_id": 0,
68
+ "guide_weight": guide_weight,
69
+ "initialize": initialize,
70
+ "tracking_window_size": tracking_window_size,
71
+ }
72
+ if mode == "Fast":
73
+ FastModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
74
+ elif mode == "Balanced":
75
+ BalancedModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
76
+ elif mode == "Accurate":
77
+ AccurateModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
78
+ # output
79
+ try:
80
+ fps = int(fps)
81
+ except:
82
+ fps = get_video_fps(video_style) if video_style is not None else 30
83
+ print("Fps:", fps)
84
+ print("Saving video...")
85
+ video_path = save_video(frames_path, video_path, num_frames=len(frames_style), fps=fps)
86
+ print("Success!")
87
+ print("Your frames are here:", frames_path)
88
+ print("Your video is here:", video_path)
89
+ return output_path, fps, video_path
90
+
91
+
92
+ class KeyFrameMatcher:
93
+ def __init__(self):
94
+ pass
95
+
96
+ def extract_number_from_filename(self, file_name):
97
+ result = []
98
+ number = -1
99
+ for i in file_name:
100
+ if ord(i)>=ord("0") and ord(i)<=ord("9"):
101
+ if number == -1:
102
+ number = 0
103
+ number = number*10 + ord(i) - ord("0")
104
+ else:
105
+ if number != -1:
106
+ result.append(number)
107
+ number = -1
108
+ if number != -1:
109
+ result.append(number)
110
+ result = tuple(result)
111
+ return result
112
+
113
+ def extract_number_from_filenames(self, file_names):
114
+ numbers = [self.extract_number_from_filename(file_name) for file_name in file_names]
115
+ min_length = min(len(i) for i in numbers)
116
+ for i in range(min_length-1, -1, -1):
117
+ if len(set(number[i] for number in numbers))==len(file_names):
118
+ return [number[i] for number in numbers]
119
+ return list(range(len(file_names)))
120
+
121
+ def match_using_filename(self, file_names_a, file_names_b):
122
+ file_names_b_set = set(file_names_b)
123
+ matched_file_name = []
124
+ for file_name in file_names_a:
125
+ if file_name not in file_names_b_set:
126
+ matched_file_name.append(None)
127
+ else:
128
+ matched_file_name.append(file_name)
129
+ return matched_file_name
130
+
131
+ def match_using_numbers(self, file_names_a, file_names_b):
132
+ numbers_a = self.extract_number_from_filenames(file_names_a)
133
+ numbers_b = self.extract_number_from_filenames(file_names_b)
134
+ numbers_b_dict = {number: file_name for number, file_name in zip(numbers_b, file_names_b)}
135
+ matched_file_name = []
136
+ for number in numbers_a:
137
+ if number in numbers_b_dict:
138
+ matched_file_name.append(numbers_b_dict[number])
139
+ else:
140
+ matched_file_name.append(None)
141
+ return matched_file_name
142
+
143
+ def match_filenames(self, file_names_a, file_names_b):
144
+ matched_file_name = self.match_using_filename(file_names_a, file_names_b)
145
+ if sum([i is not None for i in matched_file_name]) > 0:
146
+ return matched_file_name
147
+ matched_file_name = self.match_using_numbers(file_names_a, file_names_b)
148
+ return matched_file_name
149
+
150
+
151
+ def detect_frames(frames_path, keyframes_path):
152
+ if not os.path.exists(frames_path) and not os.path.exists(keyframes_path):
153
+ return "Please input the directory of guide video and rendered frames"
154
+ elif not os.path.exists(frames_path):
155
+ return "Please input the directory of guide video"
156
+ elif not os.path.exists(keyframes_path):
157
+ return "Please input the directory of rendered frames"
158
+ frames = [os.path.split(i)[-1] for i in search_for_images(frames_path)]
159
+ keyframes = [os.path.split(i)[-1] for i in search_for_images(keyframes_path)]
160
+ if len(frames)==0:
161
+ return f"No images detected in {frames_path}"
162
+ if len(keyframes)==0:
163
+ return f"No images detected in {keyframes_path}"
164
+ matched_keyframes = KeyFrameMatcher().match_filenames(frames, keyframes)
165
+ max_filename_length = max([len(i) for i in frames])
166
+ if sum([i is not None for i in matched_keyframes])==0:
167
+ message = ""
168
+ for frame, matched_keyframe in zip(frames, matched_keyframes):
169
+ message += frame + " " * (max_filename_length - len(frame) + 1)
170
+ message += "--> No matched keyframes\n"
171
+ else:
172
+ message = ""
173
+ for frame, matched_keyframe in zip(frames, matched_keyframes):
174
+ message += frame + " " * (max_filename_length - len(frame) + 1)
175
+ if matched_keyframe is None:
176
+ message += "--> [to be rendered]\n"
177
+ else:
178
+ message += f"--> {matched_keyframe}\n"
179
+ return message
180
+
181
+
182
+ def check_input_for_interpolating(frames_path, keyframes_path):
183
+ # search for images
184
+ frames = [os.path.split(i)[-1] for i in search_for_images(frames_path)]
185
+ keyframes = [os.path.split(i)[-1] for i in search_for_images(keyframes_path)]
186
+ # match frames
187
+ matched_keyframes = KeyFrameMatcher().match_filenames(frames, keyframes)
188
+ file_list = [file_name for file_name in matched_keyframes if file_name is not None]
189
+ index_style = [i for i, file_name in enumerate(matched_keyframes) if file_name is not None]
190
+ frames_guide = VideoData(None, frames_path)
191
+ frames_style = VideoData(None, keyframes_path, file_list=file_list)
192
+ # match shape
193
+ message = ""
194
+ height_guide, width_guide = frames_guide.shape()
195
+ height_style, width_style = frames_style.shape()
196
+ if height_guide != height_style or width_guide != width_style:
197
+ message += f"The shape of frames mismatches. The rendered keyframes will be resized to (height: {height_guide}, width: {width_guide})\n"
198
+ frames_style.set_shape(height_guide, width_guide)
199
+ return frames_guide, frames_style, index_style, message
200
+
201
+
202
+ def interpolate_video(
203
+ frames_path,
204
+ keyframes_path,
205
+ output_path,
206
+ fps,
207
+ batch_size,
208
+ tracking_window_size,
209
+ minimum_patch_size,
210
+ num_iter,
211
+ guide_weight,
212
+ initialize,
213
+ progress = None,
214
+ ):
215
+ # input
216
+ frames_guide, frames_style, index_style, message = check_input_for_interpolating(frames_path, keyframes_path)
217
+ if len(message) > 0:
218
+ print(message)
219
+ # output
220
+ if output_path == "":
221
+ output_path = os.path.join(keyframes_path, "output")
222
+ os.makedirs(output_path, exist_ok=True)
223
+ print("No valid output_path. Your video will be saved here:", output_path)
224
+ elif not os.path.exists(output_path):
225
+ os.makedirs(output_path, exist_ok=True)
226
+ print("Your video will be saved here:", output_path)
227
+ output_frames_path = os.path.join(output_path, "frames")
228
+ output_video_path = os.path.join(output_path, "video.mp4")
229
+ os.makedirs(output_frames_path, exist_ok=True)
230
+ # process
231
+ ebsynth_config = {
232
+ "minimum_patch_size": minimum_patch_size,
233
+ "threads_per_block": 8,
234
+ "num_iter": num_iter,
235
+ "gpu_id": 0,
236
+ "guide_weight": guide_weight,
237
+ "initialize": initialize,
238
+ "tracking_window_size": tracking_window_size
239
+ }
240
+ if len(index_style)==1:
241
+ InterpolationModeSingleFrameRunner().run(frames_guide, frames_style, index_style, batch_size=batch_size, ebsynth_config=ebsynth_config, save_path=output_frames_path)
242
+ else:
243
+ InterpolationModeRunner().run(frames_guide, frames_style, index_style, batch_size=batch_size, ebsynth_config=ebsynth_config, save_path=output_frames_path)
244
+ try:
245
+ fps = int(fps)
246
+ except:
247
+ fps = 30
248
+ print("Fps:", fps)
249
+ print("Saving video...")
250
+ video_path = save_video(output_frames_path, output_video_path, num_frames=len(frames_guide), fps=fps)
251
+ print("Success!")
252
+ print("Your frames are here:", output_frames_path)
253
+ print("Your video is here:", video_path)
254
+ return output_path, fps, video_path
255
+
256
+
257
+ def on_ui_tabs():
258
+ with gr.Blocks(analytics_enabled=False) as ui_component:
259
+ with gr.Tab("Blend"):
260
+ gr.Markdown("""
261
+ # Blend
262
+
263
+ Given a guide video and a style video, this algorithm will make the style video fluent according to the motion features of the guide video. Click [here](https://github.com/Artiprocher/sd-webui-fastblend/assets/35051019/208d902d-6aba-48d7-b7d5-cd120ebd306d) to see the example. Note that this extension doesn't support long videos. Please use short videos (e.g., several seconds). The algorithm is mainly designed for 512*512 resolution. Please use a larger `Minimum patch size` for higher resolution.
264
+ """)
265
+ with gr.Row():
266
+ with gr.Column():
267
+ with gr.Tab("Guide video"):
268
+ video_guide = gr.Video(label="Guide video")
269
+ with gr.Tab("Guide video (images format)"):
270
+ video_guide_folder = gr.Textbox(label="Guide video (images format)", value="")
271
+ with gr.Column():
272
+ with gr.Tab("Style video"):
273
+ video_style = gr.Video(label="Style video")
274
+ with gr.Tab("Style video (images format)"):
275
+ video_style_folder = gr.Textbox(label="Style video (images format)", value="")
276
+ with gr.Column():
277
+ output_path = gr.Textbox(label="Output directory", value="", placeholder="Leave empty to use the directory of style video")
278
+ fps = gr.Textbox(label="Fps", value="", placeholder="Leave empty to use the default fps")
279
+ video_output = gr.Video(label="Output video", interactive=False, show_share_button=True)
280
+ btn = gr.Button(value="Blend")
281
+ with gr.Row():
282
+ with gr.Column():
283
+ gr.Markdown("# Settings")
284
+ mode = gr.Radio(["Fast", "Balanced", "Accurate"], label="Inference mode", value="Fast", interactive=True)
285
+ window_size = gr.Slider(label="Sliding window size", value=15, minimum=1, maximum=1000, step=1, interactive=True)
286
+ batch_size = gr.Slider(label="Batch size", value=8, minimum=1, maximum=128, step=1, interactive=True)
287
+ tracking_window_size = gr.Slider(label="Tracking window size (only for accurate mode)", value=0, minimum=0, maximum=10, step=1, interactive=True)
288
+ gr.Markdown("## Advanced Settings")
289
+ minimum_patch_size = gr.Slider(label="Minimum patch size (odd number)", value=5, minimum=5, maximum=99, step=2, interactive=True)
290
+ num_iter = gr.Slider(label="Number of iterations", value=5, minimum=1, maximum=10, step=1, interactive=True)
291
+ guide_weight = gr.Slider(label="Guide weight", value=10.0, minimum=0.0, maximum=100.0, step=0.1, interactive=True)
292
+ initialize = gr.Radio(["identity", "random"], label="NNF initialization", value="identity", interactive=True)
293
+ with gr.Column():
294
+ gr.Markdown("""
295
+ # Reference
296
+
297
+ * Output directory: the directory to save the video.
298
+ * Inference mode
299
+
300
+ |Mode|Time|Memory|Quality|Frame by frame output|Description|
301
+ |-|-|-|-|-|-|
302
+ |Fast|■|■■■|■■|No|Blend the frames using a tree-like data structure, which requires much RAM but is fast.|
303
+ |Balanced|■■|■|■■|Yes|Blend the frames naively.|
304
+ |Accurate|■■■|■|■■■|Yes|Blend the frames and align them together for higher video quality. When [batch size] >= [sliding window size] * 2 + 1, the performance is the best.|
305
+
306
+ * Sliding window size: our algorithm will blend the frames in a sliding windows. If the size is n, each frame will be blended with the last n frames and the next n frames. A large sliding window can make the video fluent but sometimes smoggy.
307
+ * Batch size: a larger batch size makes the program faster but requires more VRAM.
308
+ * Tracking window size (only for accurate mode): The size of window in which our algorithm tracks moving objects. Empirically, 1 is enough.
309
+ * Advanced settings
310
+ * Minimum patch size (odd number): the minimum patch size used for patch matching. (Default: 5)
311
+ * Number of iterations: the number of iterations of patch matching. (Default: 5)
312
+ * Guide weight: a parameter that determines how much motion feature applied to the style video. (Default: 10)
313
+ * NNF initialization: how to initialize the NNF (Nearest Neighbor Field). (Default: identity)
314
+ """)
315
+ btn.click(
316
+ smooth_video,
317
+ inputs=[
318
+ video_guide,
319
+ video_guide_folder,
320
+ video_style,
321
+ video_style_folder,
322
+ mode,
323
+ window_size,
324
+ batch_size,
325
+ tracking_window_size,
326
+ output_path,
327
+ fps,
328
+ minimum_patch_size,
329
+ num_iter,
330
+ guide_weight,
331
+ initialize
332
+ ],
333
+ outputs=[output_path, fps, video_output]
334
+ )
335
+ with gr.Tab("Interpolate"):
336
+ gr.Markdown("""
337
+ # Interpolate
338
+
339
+ Given a guide video and some rendered keyframes, this algorithm will render the remaining frames. Click [here](https://github.com/Artiprocher/sd-webui-fastblend/assets/35051019/3490c5b4-8f67-478f-86de-f9adc2ace16a) to see the example. The algorithm is experimental and is only tested for 512*512 resolution.
340
+ """)
341
+ with gr.Row():
342
+ with gr.Column():
343
+ with gr.Row():
344
+ with gr.Column():
345
+ video_guide_folder_ = gr.Textbox(label="Guide video (images format)", value="")
346
+ with gr.Column():
347
+ rendered_keyframes_ = gr.Textbox(label="Rendered keyframes (images format)", value="")
348
+ with gr.Row():
349
+ detected_frames = gr.Textbox(label="Detected frames", value="Please input the directory of guide video and rendered frames", lines=9, max_lines=9, interactive=False)
350
+ video_guide_folder_.change(detect_frames, inputs=[video_guide_folder_, rendered_keyframes_], outputs=detected_frames)
351
+ rendered_keyframes_.change(detect_frames, inputs=[video_guide_folder_, rendered_keyframes_], outputs=detected_frames)
352
+ with gr.Column():
353
+ output_path_ = gr.Textbox(label="Output directory", value="", placeholder="Leave empty to use the directory of rendered keyframes")
354
+ fps_ = gr.Textbox(label="Fps", value="", placeholder="Leave empty to use the default fps")
355
+ video_output_ = gr.Video(label="Output video", interactive=False, show_share_button=True)
356
+ btn_ = gr.Button(value="Interpolate")
357
+ with gr.Row():
358
+ with gr.Column():
359
+ gr.Markdown("# Settings")
360
+ batch_size_ = gr.Slider(label="Batch size", value=8, minimum=1, maximum=128, step=1, interactive=True)
361
+ tracking_window_size_ = gr.Slider(label="Tracking window size", value=0, minimum=0, maximum=10, step=1, interactive=True)
362
+ gr.Markdown("## Advanced Settings")
363
+ minimum_patch_size_ = gr.Slider(label="Minimum patch size (odd number, larger is better)", value=15, minimum=5, maximum=99, step=2, interactive=True)
364
+ num_iter_ = gr.Slider(label="Number of iterations", value=5, minimum=1, maximum=10, step=1, interactive=True)
365
+ guide_weight_ = gr.Slider(label="Guide weight", value=10.0, minimum=0.0, maximum=100.0, step=0.1, interactive=True)
366
+ initialize_ = gr.Radio(["identity", "random"], label="NNF initialization", value="identity", interactive=True)
367
+ with gr.Column():
368
+ gr.Markdown("""
369
+ # Reference
370
+
371
+ * Output directory: the directory to save the video.
372
+ * Batch size: a larger batch size makes the program faster but requires more VRAM.
373
+ * Tracking window size (only for accurate mode): The size of window in which our algorithm tracks moving objects. Empirically, 1 is enough.
374
+ * Advanced settings
375
+ * Minimum patch size (odd number): the minimum patch size used for patch matching. **This parameter should be larger than that in blending. (Default: 15)**
376
+ * Number of iterations: the number of iterations of patch matching. (Default: 5)
377
+ * Guide weight: a parameter that determines how much motion feature applied to the style video. (Default: 10)
378
+ * NNF initialization: how to initialize the NNF (Nearest Neighbor Field). (Default: identity)
379
+ """)
380
+ btn_.click(
381
+ interpolate_video,
382
+ inputs=[
383
+ video_guide_folder_,
384
+ rendered_keyframes_,
385
+ output_path_,
386
+ fps_,
387
+ batch_size_,
388
+ tracking_window_size_,
389
+ minimum_patch_size_,
390
+ num_iter_,
391
+ guide_weight_,
392
+ initialize_,
393
+ ],
394
+ outputs=[output_path_, fps_, video_output_]
395
+ )
396
+
397
+ return [(ui_component, "FastBlend", "FastBlend_ui")]
diffsynth/extensions/FastBlend/cupy_kernels.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cupy as cp
2
+
3
+ remapping_kernel = cp.RawKernel(r'''
4
+ extern "C" __global__
5
+ void remap(
6
+ const int height,
7
+ const int width,
8
+ const int channel,
9
+ const int patch_size,
10
+ const int pad_size,
11
+ const float* source_style,
12
+ const int* nnf,
13
+ float* target_style
14
+ ) {
15
+ const int r = (patch_size - 1) / 2;
16
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;
17
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;
18
+ if (x >= height or y >= width) return;
19
+ const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
20
+ const int pid = (x + pad_size) * (width + pad_size * 2) + (y + pad_size);
21
+ const int min_px = x < r ? -x : -r;
22
+ const int max_px = x + r > height - 1 ? height - 1 - x : r;
23
+ const int min_py = y < r ? -y : -r;
24
+ const int max_py = y + r > width - 1 ? width - 1 - y : r;
25
+ int num = 0;
26
+ for (int px = min_px; px <= max_px; px++){
27
+ for (int py = min_py; py <= max_py; py++){
28
+ const int nid = (x + px) * width + y + py;
29
+ const int x_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 0] - px;
30
+ const int y_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 1] - py;
31
+ if (x_ < 0 or y_ < 0 or x_ >= height or y_ >= width)continue;
32
+ const int pid_ = (x_ + pad_size) * (width + pad_size * 2) + (y_ + pad_size);
33
+ num++;
34
+ for (int c = 0; c < channel; c++){
35
+ target_style[z + pid * channel + c] += source_style[z + pid_ * channel + c];
36
+ }
37
+ }
38
+ }
39
+ for (int c = 0; c < channel; c++){
40
+ target_style[z + pid * channel + c] /= num;
41
+ }
42
+ }
43
+ ''', 'remap')
44
+
45
+
46
+ patch_error_kernel = cp.RawKernel(r'''
47
+ extern "C" __global__
48
+ void patch_error(
49
+ const int height,
50
+ const int width,
51
+ const int channel,
52
+ const int patch_size,
53
+ const int pad_size,
54
+ const float* source,
55
+ const int* nnf,
56
+ const float* target,
57
+ float* error
58
+ ) {
59
+ const int r = (patch_size - 1) / 2;
60
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;
61
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;
62
+ const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
63
+ if (x >= height or y >= width) return;
64
+ const int x_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 0];
65
+ const int y_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 1];
66
+ float e = 0;
67
+ for (int px = -r; px <= r; px++){
68
+ for (int py = -r; py <= r; py++){
69
+ const int pid = (x + pad_size + px) * (width + pad_size * 2) + y + pad_size + py;
70
+ const int pid_ = (x_ + pad_size + px) * (width + pad_size * 2) + y_ + pad_size + py;
71
+ for (int c = 0; c < channel; c++){
72
+ const float diff = target[z + pid * channel + c] - source[z + pid_ * channel + c];
73
+ e += diff * diff;
74
+ }
75
+ }
76
+ }
77
+ error[blockIdx.z * height * width + x * width + y] = e;
78
+ }
79
+ ''', 'patch_error')
80
+
81
+
82
+ pairwise_patch_error_kernel = cp.RawKernel(r'''
83
+ extern "C" __global__
84
+ void pairwise_patch_error(
85
+ const int height,
86
+ const int width,
87
+ const int channel,
88
+ const int patch_size,
89
+ const int pad_size,
90
+ const float* source_a,
91
+ const int* nnf_a,
92
+ const float* source_b,
93
+ const int* nnf_b,
94
+ float* error
95
+ ) {
96
+ const int r = (patch_size - 1) / 2;
97
+ const int x = blockDim.x * blockIdx.x + threadIdx.x;
98
+ const int y = blockDim.y * blockIdx.y + threadIdx.y;
99
+ const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
100
+ if (x >= height or y >= width) return;
101
+ const int z_nnf = blockIdx.z * height * width * 2 + (x * width + y) * 2;
102
+ const int x_a = nnf_a[z_nnf + 0];
103
+ const int y_a = nnf_a[z_nnf + 1];
104
+ const int x_b = nnf_b[z_nnf + 0];
105
+ const int y_b = nnf_b[z_nnf + 1];
106
+ float e = 0;
107
+ for (int px = -r; px <= r; px++){
108
+ for (int py = -r; py <= r; py++){
109
+ const int pid_a = (x_a + pad_size + px) * (width + pad_size * 2) + y_a + pad_size + py;
110
+ const int pid_b = (x_b + pad_size + px) * (width + pad_size * 2) + y_b + pad_size + py;
111
+ for (int c = 0; c < channel; c++){
112
+ const float diff = source_a[z + pid_a * channel + c] - source_b[z + pid_b * channel + c];
113
+ e += diff * diff;
114
+ }
115
+ }
116
+ }
117
+ error[blockIdx.z * height * width + x * width + y] = e;
118
+ }
119
+ ''', 'pairwise_patch_error')
diffsynth/extensions/FastBlend/data.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import imageio, os
2
+ import numpy as np
3
+ from PIL import Image
4
+
5
+
6
+ def read_video(file_name):
7
+ reader = imageio.get_reader(file_name)
8
+ video = []
9
+ for frame in reader:
10
+ frame = np.array(frame)
11
+ video.append(frame)
12
+ reader.close()
13
+ return video
14
+
15
+
16
+ def get_video_fps(file_name):
17
+ reader = imageio.get_reader(file_name)
18
+ fps = reader.get_meta_data()["fps"]
19
+ reader.close()
20
+ return fps
21
+
22
+
23
+ def save_video(frames_path, video_path, num_frames, fps):
24
+ writer = imageio.get_writer(video_path, fps=fps, quality=9)
25
+ for i in range(num_frames):
26
+ frame = np.array(Image.open(os.path.join(frames_path, "%05d.png" % i)))
27
+ writer.append_data(frame)
28
+ writer.close()
29
+ return video_path
30
+
31
+
32
+ class LowMemoryVideo:
33
+ def __init__(self, file_name):
34
+ self.reader = imageio.get_reader(file_name)
35
+
36
+ def __len__(self):
37
+ return self.reader.count_frames()
38
+
39
+ def __getitem__(self, item):
40
+ return np.array(self.reader.get_data(item))
41
+
42
+ def __del__(self):
43
+ self.reader.close()
44
+
45
+
46
+ def split_file_name(file_name):
47
+ result = []
48
+ number = -1
49
+ for i in file_name:
50
+ if ord(i)>=ord("0") and ord(i)<=ord("9"):
51
+ if number == -1:
52
+ number = 0
53
+ number = number*10 + ord(i) - ord("0")
54
+ else:
55
+ if number != -1:
56
+ result.append(number)
57
+ number = -1
58
+ result.append(i)
59
+ if number != -1:
60
+ result.append(number)
61
+ result = tuple(result)
62
+ return result
63
+
64
+
65
+ def search_for_images(folder):
66
+ file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")]
67
+ file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
68
+ file_list = [i[1] for i in sorted(file_list)]
69
+ file_list = [os.path.join(folder, i) for i in file_list]
70
+ return file_list
71
+
72
+
73
+ def read_images(folder):
74
+ file_list = search_for_images(folder)
75
+ frames = [np.array(Image.open(i)) for i in file_list]
76
+ return frames
77
+
78
+
79
+ class LowMemoryImageFolder:
80
+ def __init__(self, folder, file_list=None):
81
+ if file_list is None:
82
+ self.file_list = search_for_images(folder)
83
+ else:
84
+ self.file_list = [os.path.join(folder, file_name) for file_name in file_list]
85
+
86
+ def __len__(self):
87
+ return len(self.file_list)
88
+
89
+ def __getitem__(self, item):
90
+ return np.array(Image.open(self.file_list[item]))
91
+
92
+ def __del__(self):
93
+ pass
94
+
95
+
96
+ class VideoData:
97
+ def __init__(self, video_file, image_folder, **kwargs):
98
+ if video_file is not None:
99
+ self.data_type = "video"
100
+ self.data = LowMemoryVideo(video_file, **kwargs)
101
+ elif image_folder is not None:
102
+ self.data_type = "images"
103
+ self.data = LowMemoryImageFolder(image_folder, **kwargs)
104
+ else:
105
+ raise ValueError("Cannot open video or image folder")
106
+ self.length = None
107
+ self.height = None
108
+ self.width = None
109
+
110
+ def raw_data(self):
111
+ frames = []
112
+ for i in range(self.__len__()):
113
+ frames.append(self.__getitem__(i))
114
+ return frames
115
+
116
+ def set_length(self, length):
117
+ self.length = length
118
+
119
+ def set_shape(self, height, width):
120
+ self.height = height
121
+ self.width = width
122
+
123
+ def __len__(self):
124
+ if self.length is None:
125
+ return len(self.data)
126
+ else:
127
+ return self.length
128
+
129
+ def shape(self):
130
+ if self.height is not None and self.width is not None:
131
+ return self.height, self.width
132
+ else:
133
+ height, width, _ = self.__getitem__(0).shape
134
+ return height, width
135
+
136
+ def __getitem__(self, item):
137
+ frame = self.data.__getitem__(item)
138
+ height, width, _ = frame.shape
139
+ if self.height is not None and self.width is not None:
140
+ if self.height != height or self.width != width:
141
+ frame = Image.fromarray(frame).resize((self.width, self.height))
142
+ frame = np.array(frame)
143
+ return frame
144
+
145
+ def __del__(self):
146
+ pass
diffsynth/extensions/FastBlend/patch_match.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .cupy_kernels import remapping_kernel, patch_error_kernel, pairwise_patch_error_kernel
2
+ import numpy as np
3
+ import cupy as cp
4
+ import cv2
5
+
6
+
7
+ class PatchMatcher:
8
+ def __init__(
9
+ self, height, width, channel, minimum_patch_size,
10
+ threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0,
11
+ random_search_steps=3, random_search_range=4,
12
+ use_mean_target_style=False, use_pairwise_patch_error=False,
13
+ tracking_window_size=0
14
+ ):
15
+ self.height = height
16
+ self.width = width
17
+ self.channel = channel
18
+ self.minimum_patch_size = minimum_patch_size
19
+ self.threads_per_block = threads_per_block
20
+ self.num_iter = num_iter
21
+ self.gpu_id = gpu_id
22
+ self.guide_weight = guide_weight
23
+ self.random_search_steps = random_search_steps
24
+ self.random_search_range = random_search_range
25
+ self.use_mean_target_style = use_mean_target_style
26
+ self.use_pairwise_patch_error = use_pairwise_patch_error
27
+ self.tracking_window_size = tracking_window_size
28
+
29
+ self.patch_size_list = [minimum_patch_size + i*2 for i in range(num_iter)][::-1]
30
+ self.pad_size = self.patch_size_list[0] // 2
31
+ self.grid = (
32
+ (height + threads_per_block - 1) // threads_per_block,
33
+ (width + threads_per_block - 1) // threads_per_block
34
+ )
35
+ self.block = (threads_per_block, threads_per_block)
36
+
37
+ def pad_image(self, image):
38
+ return cp.pad(image, ((0, 0), (self.pad_size, self.pad_size), (self.pad_size, self.pad_size), (0, 0)))
39
+
40
+ def unpad_image(self, image):
41
+ return image[:, self.pad_size: -self.pad_size, self.pad_size: -self.pad_size, :]
42
+
43
+ def apply_nnf_to_image(self, nnf, source):
44
+ batch_size = source.shape[0]
45
+ target = cp.zeros((batch_size, self.height + self.pad_size * 2, self.width + self.pad_size * 2, self.channel), dtype=cp.float32)
46
+ remapping_kernel(
47
+ self.grid + (batch_size,),
48
+ self.block,
49
+ (self.height, self.width, self.channel, self.patch_size, self.pad_size, source, nnf, target)
50
+ )
51
+ return target
52
+
53
+ def get_patch_error(self, source, nnf, target):
54
+ batch_size = source.shape[0]
55
+ error = cp.zeros((batch_size, self.height, self.width), dtype=cp.float32)
56
+ patch_error_kernel(
57
+ self.grid + (batch_size,),
58
+ self.block,
59
+ (self.height, self.width, self.channel, self.patch_size, self.pad_size, source, nnf, target, error)
60
+ )
61
+ return error
62
+
63
+ def get_pairwise_patch_error(self, source, nnf):
64
+ batch_size = source.shape[0]//2
65
+ error = cp.zeros((batch_size, self.height, self.width), dtype=cp.float32)
66
+ source_a, nnf_a = source[0::2].copy(), nnf[0::2].copy()
67
+ source_b, nnf_b = source[1::2].copy(), nnf[1::2].copy()
68
+ pairwise_patch_error_kernel(
69
+ self.grid + (batch_size,),
70
+ self.block,
71
+ (self.height, self.width, self.channel, self.patch_size, self.pad_size, source_a, nnf_a, source_b, nnf_b, error)
72
+ )
73
+ error = error.repeat(2, axis=0)
74
+ return error
75
+
76
+ def get_error(self, source_guide, target_guide, source_style, target_style, nnf):
77
+ error_guide = self.get_patch_error(source_guide, nnf, target_guide)
78
+ if self.use_mean_target_style:
79
+ target_style = self.apply_nnf_to_image(nnf, source_style)
80
+ target_style = target_style.mean(axis=0, keepdims=True)
81
+ target_style = target_style.repeat(source_guide.shape[0], axis=0)
82
+ if self.use_pairwise_patch_error:
83
+ error_style = self.get_pairwise_patch_error(source_style, nnf)
84
+ else:
85
+ error_style = self.get_patch_error(source_style, nnf, target_style)
86
+ error = error_guide * self.guide_weight + error_style
87
+ return error
88
+
89
+ def clamp_bound(self, nnf):
90
+ nnf[:,:,:,0] = cp.clip(nnf[:,:,:,0], 0, self.height-1)
91
+ nnf[:,:,:,1] = cp.clip(nnf[:,:,:,1], 0, self.width-1)
92
+ return nnf
93
+
94
+ def random_step(self, nnf, r):
95
+ batch_size = nnf.shape[0]
96
+ step = cp.random.randint(-r, r+1, size=(batch_size, self.height, self.width, 2), dtype=cp.int32)
97
+ upd_nnf = self.clamp_bound(nnf + step)
98
+ return upd_nnf
99
+
100
+ def neighboor_step(self, nnf, d):
101
+ if d==0:
102
+ upd_nnf = cp.concatenate([nnf[:, :1, :], nnf[:, :-1, :]], axis=1)
103
+ upd_nnf[:, :, :, 0] += 1
104
+ elif d==1:
105
+ upd_nnf = cp.concatenate([nnf[:, :, :1], nnf[:, :, :-1]], axis=2)
106
+ upd_nnf[:, :, :, 1] += 1
107
+ elif d==2:
108
+ upd_nnf = cp.concatenate([nnf[:, 1:, :], nnf[:, -1:, :]], axis=1)
109
+ upd_nnf[:, :, :, 0] -= 1
110
+ elif d==3:
111
+ upd_nnf = cp.concatenate([nnf[:, :, 1:], nnf[:, :, -1:]], axis=2)
112
+ upd_nnf[:, :, :, 1] -= 1
113
+ upd_nnf = self.clamp_bound(upd_nnf)
114
+ return upd_nnf
115
+
116
+ def shift_nnf(self, nnf, d):
117
+ if d>0:
118
+ d = min(nnf.shape[0], d)
119
+ upd_nnf = cp.concatenate([nnf[d:]] + [nnf[-1:]] * d, axis=0)
120
+ else:
121
+ d = max(-nnf.shape[0], d)
122
+ upd_nnf = cp.concatenate([nnf[:1]] * (-d) + [nnf[:d]], axis=0)
123
+ return upd_nnf
124
+
125
+ def track_step(self, nnf, d):
126
+ if self.use_pairwise_patch_error:
127
+ upd_nnf = cp.zeros_like(nnf)
128
+ upd_nnf[0::2] = self.shift_nnf(nnf[0::2], d)
129
+ upd_nnf[1::2] = self.shift_nnf(nnf[1::2], d)
130
+ else:
131
+ upd_nnf = self.shift_nnf(nnf, d)
132
+ return upd_nnf
133
+
134
+ def C(self, n, m):
135
+ # not used
136
+ c = 1
137
+ for i in range(1, n+1):
138
+ c *= i
139
+ for i in range(1, m+1):
140
+ c //= i
141
+ for i in range(1, n-m+1):
142
+ c //= i
143
+ return c
144
+
145
+ def bezier_step(self, nnf, r):
146
+ # not used
147
+ n = r * 2 - 1
148
+ upd_nnf = cp.zeros(shape=nnf.shape, dtype=cp.float32)
149
+ for i, d in enumerate(list(range(-r, 0)) + list(range(1, r+1))):
150
+ if d>0:
151
+ ctl_nnf = cp.concatenate([nnf[d:]] + [nnf[-1:]] * d, axis=0)
152
+ elif d<0:
153
+ ctl_nnf = cp.concatenate([nnf[:1]] * (-d) + [nnf[:d]], axis=0)
154
+ upd_nnf += ctl_nnf * (self.C(n, i) / 2**n)
155
+ upd_nnf = self.clamp_bound(upd_nnf).astype(nnf.dtype)
156
+ return upd_nnf
157
+
158
+ def update(self, source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf):
159
+ upd_err = self.get_error(source_guide, target_guide, source_style, target_style, upd_nnf)
160
+ upd_idx = (upd_err < err)
161
+ nnf[upd_idx] = upd_nnf[upd_idx]
162
+ err[upd_idx] = upd_err[upd_idx]
163
+ return nnf, err
164
+
165
+ def propagation(self, source_guide, target_guide, source_style, target_style, nnf, err):
166
+ for d in cp.random.permutation(4):
167
+ upd_nnf = self.neighboor_step(nnf, d)
168
+ nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
169
+ return nnf, err
170
+
171
+ def random_search(self, source_guide, target_guide, source_style, target_style, nnf, err):
172
+ for i in range(self.random_search_steps):
173
+ upd_nnf = self.random_step(nnf, self.random_search_range)
174
+ nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
175
+ return nnf, err
176
+
177
+ def track(self, source_guide, target_guide, source_style, target_style, nnf, err):
178
+ for d in range(1, self.tracking_window_size + 1):
179
+ upd_nnf = self.track_step(nnf, d)
180
+ nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
181
+ upd_nnf = self.track_step(nnf, -d)
182
+ nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
183
+ return nnf, err
184
+
185
+ def iteration(self, source_guide, target_guide, source_style, target_style, nnf, err):
186
+ nnf, err = self.propagation(source_guide, target_guide, source_style, target_style, nnf, err)
187
+ nnf, err = self.random_search(source_guide, target_guide, source_style, target_style, nnf, err)
188
+ nnf, err = self.track(source_guide, target_guide, source_style, target_style, nnf, err)
189
+ return nnf, err
190
+
191
+ def estimate_nnf(self, source_guide, target_guide, source_style, nnf):
192
+ with cp.cuda.Device(self.gpu_id):
193
+ source_guide = self.pad_image(source_guide)
194
+ target_guide = self.pad_image(target_guide)
195
+ source_style = self.pad_image(source_style)
196
+ for it in range(self.num_iter):
197
+ self.patch_size = self.patch_size_list[it]
198
+ target_style = self.apply_nnf_to_image(nnf, source_style)
199
+ err = self.get_error(source_guide, target_guide, source_style, target_style, nnf)
200
+ nnf, err = self.iteration(source_guide, target_guide, source_style, target_style, nnf, err)
201
+ target_style = self.unpad_image(self.apply_nnf_to_image(nnf, source_style))
202
+ return nnf, target_style
203
+
204
+
205
+ class PyramidPatchMatcher:
206
+ def __init__(
207
+ self, image_height, image_width, channel, minimum_patch_size,
208
+ threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0,
209
+ use_mean_target_style=False, use_pairwise_patch_error=False,
210
+ tracking_window_size=0,
211
+ initialize="identity"
212
+ ):
213
+ maximum_patch_size = minimum_patch_size + (num_iter - 1) * 2
214
+ self.pyramid_level = int(np.log2(min(image_height, image_width) / maximum_patch_size))
215
+ self.pyramid_heights = []
216
+ self.pyramid_widths = []
217
+ self.patch_matchers = []
218
+ self.minimum_patch_size = minimum_patch_size
219
+ self.num_iter = num_iter
220
+ self.gpu_id = gpu_id
221
+ self.initialize = initialize
222
+ for level in range(self.pyramid_level):
223
+ height = image_height//(2**(self.pyramid_level - 1 - level))
224
+ width = image_width//(2**(self.pyramid_level - 1 - level))
225
+ self.pyramid_heights.append(height)
226
+ self.pyramid_widths.append(width)
227
+ self.patch_matchers.append(PatchMatcher(
228
+ height, width, channel, minimum_patch_size=minimum_patch_size,
229
+ threads_per_block=threads_per_block, num_iter=num_iter, gpu_id=gpu_id, guide_weight=guide_weight,
230
+ use_mean_target_style=use_mean_target_style, use_pairwise_patch_error=use_pairwise_patch_error,
231
+ tracking_window_size=tracking_window_size
232
+ ))
233
+
234
+ def resample_image(self, images, level):
235
+ height, width = self.pyramid_heights[level], self.pyramid_widths[level]
236
+ images = images.get()
237
+ images_resample = []
238
+ for image in images:
239
+ image_resample = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)
240
+ images_resample.append(image_resample)
241
+ images_resample = cp.array(np.stack(images_resample), dtype=cp.float32)
242
+ return images_resample
243
+
244
+ def initialize_nnf(self, batch_size):
245
+ if self.initialize == "random":
246
+ height, width = self.pyramid_heights[0], self.pyramid_widths[0]
247
+ nnf = cp.stack([
248
+ cp.random.randint(0, height, (batch_size, height, width), dtype=cp.int32),
249
+ cp.random.randint(0, width, (batch_size, height, width), dtype=cp.int32)
250
+ ], axis=3)
251
+ elif self.initialize == "identity":
252
+ height, width = self.pyramid_heights[0], self.pyramid_widths[0]
253
+ nnf = cp.stack([
254
+ cp.repeat(cp.arange(height), width).reshape(height, width),
255
+ cp.tile(cp.arange(width), height).reshape(height, width)
256
+ ], axis=2)
257
+ nnf = cp.stack([nnf] * batch_size)
258
+ else:
259
+ raise NotImplementedError()
260
+ return nnf
261
+
262
+ def update_nnf(self, nnf, level):
263
+ # upscale
264
+ nnf = nnf.repeat(2, axis=1).repeat(2, axis=2) * 2
265
+ nnf[:,[i for i in range(nnf.shape[0]) if i&1],:,0] += 1
266
+ nnf[:,:,[i for i in range(nnf.shape[0]) if i&1],1] += 1
267
+ # check if scale is 2
268
+ height, width = self.pyramid_heights[level], self.pyramid_widths[level]
269
+ if height != nnf.shape[0] * 2 or width != nnf.shape[1] * 2:
270
+ nnf = nnf.get().astype(np.float32)
271
+ nnf = [cv2.resize(n, (width, height), interpolation=cv2.INTER_LINEAR) for n in nnf]
272
+ nnf = cp.array(np.stack(nnf), dtype=cp.int32)
273
+ nnf = self.patch_matchers[level].clamp_bound(nnf)
274
+ return nnf
275
+
276
+ def apply_nnf_to_image(self, nnf, image):
277
+ with cp.cuda.Device(self.gpu_id):
278
+ image = self.patch_matchers[-1].pad_image(image)
279
+ image = self.patch_matchers[-1].apply_nnf_to_image(nnf, image)
280
+ return image
281
+
282
+ def estimate_nnf(self, source_guide, target_guide, source_style):
283
+ with cp.cuda.Device(self.gpu_id):
284
+ if not isinstance(source_guide, cp.ndarray):
285
+ source_guide = cp.array(source_guide, dtype=cp.float32)
286
+ if not isinstance(target_guide, cp.ndarray):
287
+ target_guide = cp.array(target_guide, dtype=cp.float32)
288
+ if not isinstance(source_style, cp.ndarray):
289
+ source_style = cp.array(source_style, dtype=cp.float32)
290
+ for level in range(self.pyramid_level):
291
+ nnf = self.initialize_nnf(source_guide.shape[0]) if level==0 else self.update_nnf(nnf, level)
292
+ source_guide_ = self.resample_image(source_guide, level)
293
+ target_guide_ = self.resample_image(target_guide, level)
294
+ source_style_ = self.resample_image(source_style, level)
295
+ nnf, target_style = self.patch_matchers[level].estimate_nnf(
296
+ source_guide_, target_guide_, source_style_, nnf
297
+ )
298
+ return nnf.get(), target_style.get()
diffsynth/extensions/FastBlend/runners/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .accurate import AccurateModeRunner
2
+ from .fast import FastModeRunner
3
+ from .balanced import BalancedModeRunner
4
+ from .interpolation import InterpolationModeRunner, InterpolationModeSingleFrameRunner
diffsynth/extensions/FastBlend/runners/accurate.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..patch_match import PyramidPatchMatcher
2
+ import os
3
+ import numpy as np
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+
7
+
8
+ class AccurateModeRunner:
9
+ def __init__(self):
10
+ pass
11
+
12
+ def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Accurate Mode", save_path=None):
13
+ patch_match_engine = PyramidPatchMatcher(
14
+ image_height=frames_style[0].shape[0],
15
+ image_width=frames_style[0].shape[1],
16
+ channel=3,
17
+ use_mean_target_style=True,
18
+ **ebsynth_config
19
+ )
20
+ # run
21
+ n = len(frames_style)
22
+ for target in tqdm(range(n), desc=desc):
23
+ l, r = max(target - window_size, 0), min(target + window_size + 1, n)
24
+ remapped_frames = []
25
+ for i in range(l, r, batch_size):
26
+ j = min(i + batch_size, r)
27
+ source_guide = np.stack([frames_guide[source] for source in range(i, j)])
28
+ target_guide = np.stack([frames_guide[target]] * (j - i))
29
+ source_style = np.stack([frames_style[source] for source in range(i, j)])
30
+ _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
31
+ remapped_frames.append(target_style)
32
+ frame = np.concatenate(remapped_frames, axis=0).mean(axis=0)
33
+ frame = frame.clip(0, 255).astype("uint8")
34
+ if save_path is not None:
35
+ Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
diffsynth/extensions/FastBlend/runners/balanced.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..patch_match import PyramidPatchMatcher
2
+ import os
3
+ import numpy as np
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+
7
+
8
+ class BalancedModeRunner:
9
+ def __init__(self):
10
+ pass
11
+
12
+ def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Balanced Mode", save_path=None):
13
+ patch_match_engine = PyramidPatchMatcher(
14
+ image_height=frames_style[0].shape[0],
15
+ image_width=frames_style[0].shape[1],
16
+ channel=3,
17
+ **ebsynth_config
18
+ )
19
+ # tasks
20
+ n = len(frames_style)
21
+ tasks = []
22
+ for target in range(n):
23
+ for source in range(target - window_size, target + window_size + 1):
24
+ if source >= 0 and source < n and source != target:
25
+ tasks.append((source, target))
26
+ # run
27
+ frames = [(None, 1) for i in range(n)]
28
+ for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
29
+ tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
30
+ source_guide = np.stack([frames_guide[source] for source, target in tasks_batch])
31
+ target_guide = np.stack([frames_guide[target] for source, target in tasks_batch])
32
+ source_style = np.stack([frames_style[source] for source, target in tasks_batch])
33
+ _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
34
+ for (source, target), result in zip(tasks_batch, target_style):
35
+ frame, weight = frames[target]
36
+ if frame is None:
37
+ frame = frames_style[target]
38
+ frames[target] = (
39
+ frame * (weight / (weight + 1)) + result / (weight + 1),
40
+ weight + 1
41
+ )
42
+ if weight + 1 == min(n, target + window_size + 1) - max(0, target - window_size):
43
+ frame = frame.clip(0, 255).astype("uint8")
44
+ if save_path is not None:
45
+ Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
46
+ frames[target] = (None, 1)
diffsynth/extensions/FastBlend/runners/fast.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..patch_match import PyramidPatchMatcher
2
+ import functools, os
3
+ import numpy as np
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+
7
+
8
+ class TableManager:
9
+ def __init__(self):
10
+ pass
11
+
12
+ def task_list(self, n):
13
+ tasks = []
14
+ max_level = 1
15
+ while (1<<max_level)<=n:
16
+ max_level += 1
17
+ for i in range(n):
18
+ j = i
19
+ for level in range(max_level):
20
+ if i&(1<<level):
21
+ continue
22
+ j |= 1<<level
23
+ if j>=n:
24
+ break
25
+ meta_data = {
26
+ "source": i,
27
+ "target": j,
28
+ "level": level + 1
29
+ }
30
+ tasks.append(meta_data)
31
+ tasks.sort(key=functools.cmp_to_key(lambda u, v: u["level"]-v["level"]))
32
+ return tasks
33
+
34
+ def build_remapping_table(self, frames_guide, frames_style, patch_match_engine, batch_size, desc=""):
35
+ n = len(frames_guide)
36
+ tasks = self.task_list(n)
37
+ remapping_table = [[(frames_style[i], 1)] for i in range(n)]
38
+ for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
39
+ tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
40
+ source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch])
41
+ target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch])
42
+ source_style = np.stack([frames_style[task["source"]] for task in tasks_batch])
43
+ _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
44
+ for task, result in zip(tasks_batch, target_style):
45
+ target, level = task["target"], task["level"]
46
+ if len(remapping_table[target])==level:
47
+ remapping_table[target].append((result, 1))
48
+ else:
49
+ frame, weight = remapping_table[target][level]
50
+ remapping_table[target][level] = (
51
+ frame * (weight / (weight + 1)) + result / (weight + 1),
52
+ weight + 1
53
+ )
54
+ return remapping_table
55
+
56
+ def remapping_table_to_blending_table(self, table):
57
+ for i in range(len(table)):
58
+ for j in range(1, len(table[i])):
59
+ frame_1, weight_1 = table[i][j-1]
60
+ frame_2, weight_2 = table[i][j]
61
+ frame = (frame_1 + frame_2) / 2
62
+ weight = weight_1 + weight_2
63
+ table[i][j] = (frame, weight)
64
+ return table
65
+
66
+ def tree_query(self, leftbound, rightbound):
67
+ node_list = []
68
+ node_index = rightbound
69
+ while node_index>=leftbound:
70
+ node_level = 0
71
+ while (1<<node_level)&node_index and node_index-(1<<node_level+1)+1>=leftbound:
72
+ node_level += 1
73
+ node_list.append((node_index, node_level))
74
+ node_index -= 1<<node_level
75
+ return node_list
76
+
77
+ def process_window_sum(self, frames_guide, blending_table, patch_match_engine, window_size, batch_size, desc=""):
78
+ n = len(blending_table)
79
+ tasks = []
80
+ frames_result = []
81
+ for target in range(n):
82
+ node_list = self.tree_query(max(target-window_size, 0), target)
83
+ for source, level in node_list:
84
+ if source!=target:
85
+ meta_data = {
86
+ "source": source,
87
+ "target": target,
88
+ "level": level
89
+ }
90
+ tasks.append(meta_data)
91
+ else:
92
+ frames_result.append(blending_table[target][level])
93
+ for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
94
+ tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
95
+ source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch])
96
+ target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch])
97
+ source_style = np.stack([blending_table[task["source"]][task["level"]][0] for task in tasks_batch])
98
+ _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
99
+ for task, frame_2 in zip(tasks_batch, target_style):
100
+ source, target, level = task["source"], task["target"], task["level"]
101
+ frame_1, weight_1 = frames_result[target]
102
+ weight_2 = blending_table[source][level][1]
103
+ weight = weight_1 + weight_2
104
+ frame = frame_1 * (weight_1 / weight) + frame_2 * (weight_2 / weight)
105
+ frames_result[target] = (frame, weight)
106
+ return frames_result
107
+
108
+
109
+ class FastModeRunner:
110
+ def __init__(self):
111
+ pass
112
+
113
+ def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, save_path=None):
114
+ frames_guide = frames_guide.raw_data()
115
+ frames_style = frames_style.raw_data()
116
+ table_manager = TableManager()
117
+ patch_match_engine = PyramidPatchMatcher(
118
+ image_height=frames_style[0].shape[0],
119
+ image_width=frames_style[0].shape[1],
120
+ channel=3,
121
+ **ebsynth_config
122
+ )
123
+ # left part
124
+ table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="Fast Mode Step 1/4")
125
+ table_l = table_manager.remapping_table_to_blending_table(table_l)
126
+ table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="Fast Mode Step 2/4")
127
+ # right part
128
+ table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="Fast Mode Step 3/4")
129
+ table_r = table_manager.remapping_table_to_blending_table(table_r)
130
+ table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, desc="Fast Mode Step 4/4")[::-1]
131
+ # merge
132
+ frames = []
133
+ for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r):
134
+ weight_m = -1
135
+ weight = weight_l + weight_m + weight_r
136
+ frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight)
137
+ frames.append(frame)
138
+ frames = [frame.clip(0, 255).astype("uint8") for frame in frames]
139
+ if save_path is not None:
140
+ for target, frame in enumerate(frames):
141
+ Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
diffsynth/extensions/FastBlend/runners/interpolation.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..patch_match import PyramidPatchMatcher
2
+ import os
3
+ import numpy as np
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+
7
+
8
+ class InterpolationModeRunner:
9
+ def __init__(self):
10
+ pass
11
+
12
+ def get_index_dict(self, index_style):
13
+ index_dict = {}
14
+ for i, index in enumerate(index_style):
15
+ index_dict[index] = i
16
+ return index_dict
17
+
18
+ def get_weight(self, l, m, r):
19
+ weight_l, weight_r = abs(m - r), abs(m - l)
20
+ if weight_l + weight_r == 0:
21
+ weight_l, weight_r = 0.5, 0.5
22
+ else:
23
+ weight_l, weight_r = weight_l / (weight_l + weight_r), weight_r / (weight_l + weight_r)
24
+ return weight_l, weight_r
25
+
26
+ def get_task_group(self, index_style, n):
27
+ task_group = []
28
+ index_style = sorted(index_style)
29
+ # first frame
30
+ if index_style[0]>0:
31
+ tasks = []
32
+ for m in range(index_style[0]):
33
+ tasks.append((index_style[0], m, index_style[0]))
34
+ task_group.append(tasks)
35
+ # middle frames
36
+ for l, r in zip(index_style[:-1], index_style[1:]):
37
+ tasks = []
38
+ for m in range(l, r):
39
+ tasks.append((l, m, r))
40
+ task_group.append(tasks)
41
+ # last frame
42
+ tasks = []
43
+ for m in range(index_style[-1], n):
44
+ tasks.append((index_style[-1], m, index_style[-1]))
45
+ task_group.append(tasks)
46
+ return task_group
47
+
48
+ def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None):
49
+ patch_match_engine = PyramidPatchMatcher(
50
+ image_height=frames_style[0].shape[0],
51
+ image_width=frames_style[0].shape[1],
52
+ channel=3,
53
+ use_mean_target_style=False,
54
+ use_pairwise_patch_error=True,
55
+ **ebsynth_config
56
+ )
57
+ # task
58
+ index_dict = self.get_index_dict(index_style)
59
+ task_group = self.get_task_group(index_style, len(frames_guide))
60
+ # run
61
+ for tasks in task_group:
62
+ index_start, index_end = min([i[1] for i in tasks]), max([i[1] for i in tasks])
63
+ for batch_id in tqdm(range(0, len(tasks), batch_size), desc=f"Rendering frames {index_start}...{index_end}"):
64
+ tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
65
+ source_guide, target_guide, source_style = [], [], []
66
+ for l, m, r in tasks_batch:
67
+ # l -> m
68
+ source_guide.append(frames_guide[l])
69
+ target_guide.append(frames_guide[m])
70
+ source_style.append(frames_style[index_dict[l]])
71
+ # r -> m
72
+ source_guide.append(frames_guide[r])
73
+ target_guide.append(frames_guide[m])
74
+ source_style.append(frames_style[index_dict[r]])
75
+ source_guide = np.stack(source_guide)
76
+ target_guide = np.stack(target_guide)
77
+ source_style = np.stack(source_style)
78
+ _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
79
+ if save_path is not None:
80
+ for frame_l, frame_r, (l, m, r) in zip(target_style[0::2], target_style[1::2], tasks_batch):
81
+ weight_l, weight_r = self.get_weight(l, m, r)
82
+ frame = frame_l * weight_l + frame_r * weight_r
83
+ frame = frame.clip(0, 255).astype("uint8")
84
+ Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % m))
85
+
86
+
87
+ class InterpolationModeSingleFrameRunner:
88
+ def __init__(self):
89
+ pass
90
+
91
+ def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None):
92
+ # check input
93
+ tracking_window_size = ebsynth_config["tracking_window_size"]
94
+ if tracking_window_size * 2 >= batch_size:
95
+ raise ValueError("batch_size should be larger than track_window_size * 2")
96
+ frame_style = frames_style[0]
97
+ frame_guide = frames_guide[index_style[0]]
98
+ patch_match_engine = PyramidPatchMatcher(
99
+ image_height=frame_style.shape[0],
100
+ image_width=frame_style.shape[1],
101
+ channel=3,
102
+ **ebsynth_config
103
+ )
104
+ # run
105
+ frame_id, n = 0, len(frames_guide)
106
+ for i in tqdm(range(0, n, batch_size - tracking_window_size * 2), desc=f"Rendering frames 0...{n}"):
107
+ if i + batch_size > n:
108
+ l, r = max(n - batch_size, 0), n
109
+ else:
110
+ l, r = i, i + batch_size
111
+ source_guide = np.stack([frame_guide] * (r-l))
112
+ target_guide = np.stack([frames_guide[i] for i in range(l, r)])
113
+ source_style = np.stack([frame_style] * (r-l))
114
+ _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
115
+ for i, frame in zip(range(l, r), target_style):
116
+ if i==frame_id:
117
+ frame = frame.clip(0, 255).astype("uint8")
118
+ Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % frame_id))
119
+ frame_id += 1
120
+ if r < n and r-frame_id <= tracking_window_size:
121
+ break
diffsynth/extensions/RIFE/__init__.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
+
8
+ def warp(tenInput, tenFlow, device):
9
+ backwarp_tenGrid = {}
10
+ k = (str(tenFlow.device), str(tenFlow.size()))
11
+ if k not in backwarp_tenGrid:
12
+ tenHorizontal = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=device).view(
13
+ 1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
14
+ tenVertical = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=device).view(
15
+ 1, 1, tenFlow.shape[2], 1).expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
16
+ backwarp_tenGrid[k] = torch.cat(
17
+ [tenHorizontal, tenVertical], 1).to(device)
18
+
19
+ tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
20
+ tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)], 1)
21
+
22
+ g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1)
23
+ return torch.nn.functional.grid_sample(input=tenInput, grid=g, mode='bilinear', padding_mode='border', align_corners=True)
24
+
25
+
26
+ def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
27
+ return nn.Sequential(
28
+ nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
29
+ padding=padding, dilation=dilation, bias=True),
30
+ nn.PReLU(out_planes)
31
+ )
32
+
33
+
34
+ class IFBlock(nn.Module):
35
+ def __init__(self, in_planes, c=64):
36
+ super(IFBlock, self).__init__()
37
+ self.conv0 = nn.Sequential(conv(in_planes, c//2, 3, 2, 1), conv(c//2, c, 3, 2, 1),)
38
+ self.convblock0 = nn.Sequential(conv(c, c), conv(c, c))
39
+ self.convblock1 = nn.Sequential(conv(c, c), conv(c, c))
40
+ self.convblock2 = nn.Sequential(conv(c, c), conv(c, c))
41
+ self.convblock3 = nn.Sequential(conv(c, c), conv(c, c))
42
+ self.conv1 = nn.Sequential(nn.ConvTranspose2d(c, c//2, 4, 2, 1), nn.PReLU(c//2), nn.ConvTranspose2d(c//2, 4, 4, 2, 1))
43
+ self.conv2 = nn.Sequential(nn.ConvTranspose2d(c, c//2, 4, 2, 1), nn.PReLU(c//2), nn.ConvTranspose2d(c//2, 1, 4, 2, 1))
44
+
45
+ def forward(self, x, flow, scale=1):
46
+ x = F.interpolate(x, scale_factor= 1. / scale, mode="bilinear", align_corners=False, recompute_scale_factor=False)
47
+ flow = F.interpolate(flow, scale_factor= 1. / scale, mode="bilinear", align_corners=False, recompute_scale_factor=False) * 1. / scale
48
+ feat = self.conv0(torch.cat((x, flow), 1))
49
+ feat = self.convblock0(feat) + feat
50
+ feat = self.convblock1(feat) + feat
51
+ feat = self.convblock2(feat) + feat
52
+ feat = self.convblock3(feat) + feat
53
+ flow = self.conv1(feat)
54
+ mask = self.conv2(feat)
55
+ flow = F.interpolate(flow, scale_factor=scale, mode="bilinear", align_corners=False, recompute_scale_factor=False) * scale
56
+ mask = F.interpolate(mask, scale_factor=scale, mode="bilinear", align_corners=False, recompute_scale_factor=False)
57
+ return flow, mask
58
+
59
+
60
+ class IFNet(nn.Module):
61
+ def __init__(self):
62
+ super(IFNet, self).__init__()
63
+ self.block0 = IFBlock(7+4, c=90)
64
+ self.block1 = IFBlock(7+4, c=90)
65
+ self.block2 = IFBlock(7+4, c=90)
66
+ self.block_tea = IFBlock(10+4, c=90)
67
+
68
+ def forward(self, x, scale_list=[4, 2, 1], training=False):
69
+ if training == False:
70
+ channel = x.shape[1] // 2
71
+ img0 = x[:, :channel]
72
+ img1 = x[:, channel:]
73
+ flow_list = []
74
+ merged = []
75
+ mask_list = []
76
+ warped_img0 = img0
77
+ warped_img1 = img1
78
+ flow = (x[:, :4]).detach() * 0
79
+ mask = (x[:, :1]).detach() * 0
80
+ block = [self.block0, self.block1, self.block2]
81
+ for i in range(3):
82
+ f0, m0 = block[i](torch.cat((warped_img0[:, :3], warped_img1[:, :3], mask), 1), flow, scale=scale_list[i])
83
+ f1, m1 = block[i](torch.cat((warped_img1[:, :3], warped_img0[:, :3], -mask), 1), torch.cat((flow[:, 2:4], flow[:, :2]), 1), scale=scale_list[i])
84
+ flow = flow + (f0 + torch.cat((f1[:, 2:4], f1[:, :2]), 1)) / 2
85
+ mask = mask + (m0 + (-m1)) / 2
86
+ mask_list.append(mask)
87
+ flow_list.append(flow)
88
+ warped_img0 = warp(img0, flow[:, :2], device=x.device)
89
+ warped_img1 = warp(img1, flow[:, 2:4], device=x.device)
90
+ merged.append((warped_img0, warped_img1))
91
+ '''
92
+ c0 = self.contextnet(img0, flow[:, :2])
93
+ c1 = self.contextnet(img1, flow[:, 2:4])
94
+ tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
95
+ res = tmp[:, 1:4] * 2 - 1
96
+ '''
97
+ for i in range(3):
98
+ mask_list[i] = torch.sigmoid(mask_list[i])
99
+ merged[i] = merged[i][0] * mask_list[i] + merged[i][1] * (1 - mask_list[i])
100
+ return flow_list, mask_list[2], merged
101
+
102
+ def state_dict_converter(self):
103
+ return IFNetStateDictConverter()
104
+
105
+
106
+ class IFNetStateDictConverter:
107
+ def __init__(self):
108
+ pass
109
+
110
+ def from_diffusers(self, state_dict):
111
+ state_dict_ = {k.replace("module.", ""): v for k, v in state_dict.items()}
112
+ return state_dict_
113
+
114
+ def from_civitai(self, state_dict):
115
+ return self.from_diffusers(state_dict)
116
+
117
+
118
+ class RIFEInterpolater:
119
+ def __init__(self, model, device="cuda"):
120
+ self.model = model
121
+ self.device = device
122
+ # IFNet only does not support float16
123
+ self.torch_dtype = torch.float32
124
+
125
+ @staticmethod
126
+ def from_model_manager(model_manager):
127
+ return RIFEInterpolater(model_manager.RIFE, device=model_manager.device)
128
+
129
+ def process_image(self, image):
130
+ width, height = image.size
131
+ if width % 32 != 0 or height % 32 != 0:
132
+ width = (width + 31) // 32
133
+ height = (height + 31) // 32
134
+ image = image.resize((width, height))
135
+ image = torch.Tensor(np.array(image, dtype=np.float32)[:, :, [2,1,0]] / 255).permute(2, 0, 1)
136
+ return image
137
+
138
+ def process_images(self, images):
139
+ images = [self.process_image(image) for image in images]
140
+ images = torch.stack(images)
141
+ return images
142
+
143
+ def decode_images(self, images):
144
+ images = (images[:, [2,1,0]].permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
145
+ images = [Image.fromarray(image) for image in images]
146
+ return images
147
+
148
+ def add_interpolated_images(self, images, interpolated_images):
149
+ output_images = []
150
+ for image, interpolated_image in zip(images, interpolated_images):
151
+ output_images.append(image)
152
+ output_images.append(interpolated_image)
153
+ output_images.append(images[-1])
154
+ return output_images
155
+
156
+
157
+ @torch.no_grad()
158
+ def interpolate_(self, images, scale=1.0):
159
+ input_tensor = self.process_images(images)
160
+ input_tensor = torch.cat((input_tensor[:-1], input_tensor[1:]), dim=1)
161
+ input_tensor = input_tensor.to(device=self.device, dtype=self.torch_dtype)
162
+ flow, mask, merged = self.model(input_tensor, [4/scale, 2/scale, 1/scale])
163
+ output_images = self.decode_images(merged[2].cpu())
164
+ if output_images[0].size != images[0].size:
165
+ output_images = [image.resize(images[0].size) for image in output_images]
166
+ return output_images
167
+
168
+
169
+ @torch.no_grad()
170
+ def interpolate(self, images, scale=1.0, batch_size=4, num_iter=1, progress_bar=lambda x:x):
171
+ # Preprocess
172
+ processed_images = self.process_images(images)
173
+
174
+ for iter in range(num_iter):
175
+ # Input
176
+ input_tensor = torch.cat((processed_images[:-1], processed_images[1:]), dim=1)
177
+
178
+ # Interpolate
179
+ output_tensor = []
180
+ for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
181
+ batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
182
+ batch_input_tensor = input_tensor[batch_id: batch_id_]
183
+ batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
184
+ flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale])
185
+ output_tensor.append(merged[2].cpu())
186
+
187
+ # Output
188
+ output_tensor = torch.concat(output_tensor, dim=0).clip(0, 1)
189
+ processed_images = self.add_interpolated_images(processed_images, output_tensor)
190
+ processed_images = torch.stack(processed_images)
191
+
192
+ # To images
193
+ output_images = self.decode_images(processed_images)
194
+ if output_images[0].size != images[0].size:
195
+ output_images = [image.resize(images[0].size) for image in output_images]
196
+ return output_images
197
+
198
+
199
+ class RIFESmoother(RIFEInterpolater):
200
+ def __init__(self, model, device="cuda"):
201
+ super(RIFESmoother, self).__init__(model, device=device)
202
+
203
+ @staticmethod
204
+ def from_model_manager(model_manager):
205
+ return RIFESmoother(model_manager.RIFE, device=model_manager.device)
206
+
207
+ def process_tensors(self, input_tensor, scale=1.0, batch_size=4):
208
+ output_tensor = []
209
+ for batch_id in range(0, input_tensor.shape[0], batch_size):
210
+ batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
211
+ batch_input_tensor = input_tensor[batch_id: batch_id_]
212
+ batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
213
+ flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale])
214
+ output_tensor.append(merged[2].cpu())
215
+ output_tensor = torch.concat(output_tensor, dim=0)
216
+ return output_tensor
217
+
218
+ @torch.no_grad()
219
+ def __call__(self, rendered_frames, scale=1.0, batch_size=4, num_iter=1, **kwargs):
220
+ # Preprocess
221
+ processed_images = self.process_images(rendered_frames)
222
+
223
+ for iter in range(num_iter):
224
+ # Input
225
+ input_tensor = torch.cat((processed_images[:-2], processed_images[2:]), dim=1)
226
+
227
+ # Interpolate
228
+ output_tensor = self.process_tensors(input_tensor, scale=scale, batch_size=batch_size)
229
+
230
+ # Blend
231
+ input_tensor = torch.cat((processed_images[1:-1], output_tensor), dim=1)
232
+ output_tensor = self.process_tensors(input_tensor, scale=scale, batch_size=batch_size)
233
+
234
+ # Add to frames
235
+ processed_images[1:-1] = output_tensor
236
+
237
+ # To images
238
+ output_images = self.decode_images(processed_images)
239
+ if output_images[0].size != rendered_frames[0].size:
240
+ output_images = [image.resize(rendered_frames[0].size) for image in output_images]
241
+ return output_images
diffsynth/models/__init__.py ADDED
@@ -0,0 +1,814 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, os, json
2
+ from safetensors import safe_open
3
+ from typing_extensions import Literal, TypeAlias
4
+ from typing import List
5
+
6
+ from .downloader import download_from_huggingface, download_from_modelscope
7
+
8
+ from .sd_text_encoder import SDTextEncoder
9
+ from .sd_unet import SDUNet
10
+ from .sd_vae_encoder import SDVAEEncoder
11
+ from .sd_vae_decoder import SDVAEDecoder
12
+ from .sd_lora import SDLoRA
13
+
14
+ from .sdxl_text_encoder import SDXLTextEncoder, SDXLTextEncoder2
15
+ from .sdxl_unet import SDXLUNet
16
+ from .sdxl_vae_decoder import SDXLVAEDecoder
17
+ from .sdxl_vae_encoder import SDXLVAEEncoder
18
+
19
+ from .sd3_text_encoder import SD3TextEncoder1, SD3TextEncoder2, SD3TextEncoder3
20
+ from .sd3_dit import SD3DiT
21
+ from .sd3_vae_decoder import SD3VAEDecoder
22
+ from .sd3_vae_encoder import SD3VAEEncoder
23
+
24
+ from .sd_controlnet import SDControlNet
25
+
26
+ from .sd_motion import SDMotionModel
27
+ from .sdxl_motion import SDXLMotionModel
28
+
29
+ from .svd_image_encoder import SVDImageEncoder
30
+ from .svd_unet import SVDUNet
31
+ from .svd_vae_decoder import SVDVAEDecoder
32
+ from .svd_vae_encoder import SVDVAEEncoder
33
+
34
+ from .sd_ipadapter import SDIpAdapter, IpAdapterCLIPImageEmbedder
35
+ from .sdxl_ipadapter import SDXLIpAdapter, IpAdapterXLCLIPImageEmbedder
36
+
37
+ from .hunyuan_dit_text_encoder import HunyuanDiTCLIPTextEncoder, HunyuanDiTT5TextEncoder
38
+ from .hunyuan_dit import HunyuanDiT
39
+ from .kolors_text_encoder import ChatGLMModel
40
+
41
+
42
+ preset_models_on_huggingface = {
43
+ "HunyuanDiT": [
44
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
45
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
46
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
47
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
48
+ ],
49
+ "stable-video-diffusion-img2vid-xt": [
50
+ ("stabilityai/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
51
+ ],
52
+ "ExVideo-SVD-128f-v1": [
53
+ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
54
+ ],
55
+ }
56
+ preset_models_on_modelscope = {
57
+ # Hunyuan DiT
58
+ "HunyuanDiT": [
59
+ ("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
60
+ ("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
61
+ ("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
62
+ ("modelscope/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
63
+ ],
64
+ # Stable Video Diffusion
65
+ "stable-video-diffusion-img2vid-xt": [
66
+ ("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
67
+ ],
68
+ # ExVideo
69
+ "ExVideo-SVD-128f-v1": [
70
+ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
71
+ ],
72
+ # Stable Diffusion
73
+ "StableDiffusion_v15": [
74
+ ("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
75
+ ],
76
+ "DreamShaper_8": [
77
+ ("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
78
+ ],
79
+ "AingDiffusion_v12": [
80
+ ("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"),
81
+ ],
82
+ "Flat2DAnimerge_v45Sharp": [
83
+ ("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"),
84
+ ],
85
+ # Textual Inversion
86
+ "TextualInversion_VeryBadImageNegative_v1.3": [
87
+ ("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
88
+ ],
89
+ # Stable Diffusion XL
90
+ "StableDiffusionXL_v1": [
91
+ ("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
92
+ ],
93
+ "BluePencilXL_v200": [
94
+ ("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
95
+ ],
96
+ "StableDiffusionXL_Turbo": [
97
+ ("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
98
+ ],
99
+ # Stable Diffusion 3
100
+ "StableDiffusion3": [
101
+ ("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips_t5xxlfp16.safetensors", "models/stable_diffusion_3"),
102
+ ],
103
+ "StableDiffusion3_without_T5": [
104
+ ("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips.safetensors", "models/stable_diffusion_3"),
105
+ ],
106
+ # ControlNet
107
+ "ControlNet_v11f1p_sd15_depth": [
108
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
109
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
110
+ ],
111
+ "ControlNet_v11p_sd15_softedge": [
112
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
113
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators")
114
+ ],
115
+ "ControlNet_v11f1e_sd15_tile": [
116
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
117
+ ],
118
+ "ControlNet_v11p_sd15_lineart": [
119
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
120
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
121
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators")
122
+ ],
123
+ # AnimateDiff
124
+ "AnimateDiff_v2": [
125
+ ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
126
+ ],
127
+ "AnimateDiff_xl_beta": [
128
+ ("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
129
+ ],
130
+ # RIFE
131
+ "RIFE": [
132
+ ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
133
+ ],
134
+ # Beautiful Prompt
135
+ "BeautifulPrompt": [
136
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
137
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
138
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
139
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
140
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
141
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
142
+ ],
143
+ # Translator
144
+ "opus-mt-zh-en": [
145
+ ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
146
+ ("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
147
+ ("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
148
+ ("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
149
+ ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
150
+ ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
151
+ ("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
152
+ ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
153
+ ],
154
+ # IP-Adapter
155
+ "IP-Adapter-SD": [
156
+ ("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
157
+ ("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
158
+ ],
159
+ "IP-Adapter-SDXL": [
160
+ ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
161
+ ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
162
+ ],
163
+ # Kolors
164
+ "Kolors": [
165
+ ("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
166
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
167
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
168
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
169
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
170
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
171
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
172
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
173
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
174
+ ("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
175
+ ("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
176
+ ],
177
+ "SDXL-vae-fp16-fix": [
178
+ ("AI-ModelScope/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors", "models/sdxl-vae-fp16-fix")
179
+ ],
180
+ }
181
+ Preset_model_id: TypeAlias = Literal[
182
+ "HunyuanDiT",
183
+ "stable-video-diffusion-img2vid-xt",
184
+ "ExVideo-SVD-128f-v1",
185
+ "StableDiffusion_v15",
186
+ "DreamShaper_8",
187
+ "AingDiffusion_v12",
188
+ "Flat2DAnimerge_v45Sharp",
189
+ "TextualInversion_VeryBadImageNegative_v1.3",
190
+ "StableDiffusionXL_v1",
191
+ "BluePencilXL_v200",
192
+ "StableDiffusionXL_Turbo",
193
+ "ControlNet_v11f1p_sd15_depth",
194
+ "ControlNet_v11p_sd15_softedge",
195
+ "ControlNet_v11f1e_sd15_tile",
196
+ "ControlNet_v11p_sd15_lineart",
197
+ "AnimateDiff_v2",
198
+ "AnimateDiff_xl_beta",
199
+ "RIFE",
200
+ "BeautifulPrompt",
201
+ "opus-mt-zh-en",
202
+ "IP-Adapter-SD",
203
+ "IP-Adapter-SDXL",
204
+ "StableDiffusion3",
205
+ "StableDiffusion3_without_T5",
206
+ "Kolors",
207
+ "SDXL-vae-fp16-fix",
208
+ ]
209
+ Preset_model_website: TypeAlias = Literal[
210
+ "HuggingFace",
211
+ "ModelScope",
212
+ ]
213
+ website_to_preset_models = {
214
+ "HuggingFace": preset_models_on_huggingface,
215
+ "ModelScope": preset_models_on_modelscope,
216
+ }
217
+ website_to_download_fn = {
218
+ "HuggingFace": download_from_huggingface,
219
+ "ModelScope": download_from_modelscope,
220
+ }
221
+
222
+
223
+ def download_models(
224
+ model_id_list: List[Preset_model_id] = [],
225
+ downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
226
+ ):
227
+ downloaded_files = []
228
+ for model_id in model_id_list:
229
+ for website in downloading_priority:
230
+ if model_id in website_to_preset_models[website]:
231
+ for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]:
232
+ # Check if the file is downloaded.
233
+ file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
234
+ if file_to_download in downloaded_files:
235
+ continue
236
+ # Download
237
+ website_to_download_fn[website](model_id, origin_file_path, local_dir)
238
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
239
+ downloaded_files.append(file_to_download)
240
+ return downloaded_files
241
+
242
+
243
+ class ModelManager:
244
+ def __init__(
245
+ self,
246
+ torch_dtype=torch.float16,
247
+ device="cuda",
248
+ model_id_list: List[Preset_model_id] = [],
249
+ downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
250
+ file_path_list: List[str] = [],
251
+ ):
252
+ self.torch_dtype = torch_dtype
253
+ self.device = device
254
+ self.model = {}
255
+ self.model_path = {}
256
+ self.textual_inversion_dict = {}
257
+ downloaded_files = download_models(model_id_list, downloading_priority)
258
+ self.load_models(downloaded_files + file_path_list)
259
+
260
+ def load_model_from_origin(
261
+ self,
262
+ download_from: Preset_model_website = "ModelScope",
263
+ model_id = "",
264
+ origin_file_path = "",
265
+ local_dir = ""
266
+ ):
267
+ website_to_download_fn[download_from](model_id, origin_file_path, local_dir)
268
+ file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
269
+ self.load_model(file_to_download)
270
+
271
+ def is_stable_video_diffusion(self, state_dict):
272
+ param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight"
273
+ return param_name in state_dict
274
+
275
+ def is_RIFE(self, state_dict):
276
+ param_name = "block_tea.convblock3.0.1.weight"
277
+ return param_name in state_dict or ("module." + param_name) in state_dict
278
+
279
+ def is_beautiful_prompt(self, state_dict):
280
+ param_name = "transformer.h.9.self_attention.query_key_value.weight"
281
+ return param_name in state_dict
282
+
283
+ def is_stabe_diffusion_xl(self, state_dict):
284
+ param_name = "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight"
285
+ return param_name in state_dict
286
+
287
+ def is_stable_diffusion(self, state_dict):
288
+ if self.is_stabe_diffusion_xl(state_dict):
289
+ return False
290
+ param_name = "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight"
291
+ return param_name in state_dict
292
+
293
+ def is_controlnet(self, state_dict):
294
+ param_name = "control_model.time_embed.0.weight"
295
+ return param_name in state_dict
296
+
297
+ def is_animatediff(self, state_dict):
298
+ param_name = "mid_block.motion_modules.0.temporal_transformer.proj_out.weight"
299
+ return param_name in state_dict
300
+
301
+ def is_animatediff_xl(self, state_dict):
302
+ param_name = "up_blocks.2.motion_modules.2.temporal_transformer.transformer_blocks.0.ff_norm.weight"
303
+ return param_name in state_dict
304
+
305
+ def is_sd_lora(self, state_dict):
306
+ param_name = "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_2.lora_up.weight"
307
+ return param_name in state_dict
308
+
309
+ def is_translator(self, state_dict):
310
+ param_name = "model.encoder.layers.5.self_attn_layer_norm.weight"
311
+ return param_name in state_dict and len(state_dict) == 258
312
+
313
+ def is_ipadapter(self, state_dict):
314
+ return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([3072, 1024])
315
+
316
+ def is_ipadapter_image_encoder(self, state_dict):
317
+ param_name = "vision_model.encoder.layers.31.self_attn.v_proj.weight"
318
+ return param_name in state_dict and len(state_dict) == 521
319
+
320
+ def is_ipadapter_xl(self, state_dict):
321
+ return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([8192, 1280])
322
+
323
+ def is_ipadapter_xl_image_encoder(self, state_dict):
324
+ param_name = "vision_model.encoder.layers.47.self_attn.v_proj.weight"
325
+ return param_name in state_dict and len(state_dict) == 777
326
+
327
+ def is_hunyuan_dit_clip_text_encoder(self, state_dict):
328
+ param_name = "bert.encoder.layer.23.attention.output.dense.weight"
329
+ return param_name in state_dict
330
+
331
+ def is_hunyuan_dit_t5_text_encoder(self, state_dict):
332
+ param_name = "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
333
+ param_name_ = "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
334
+ return param_name in state_dict and param_name_ in state_dict
335
+
336
+ def is_hunyuan_dit(self, state_dict):
337
+ param_name = "final_layer.adaLN_modulation.1.weight"
338
+ return param_name in state_dict
339
+
340
+ def is_diffusers_vae(self, state_dict):
341
+ param_name = "quant_conv.weight"
342
+ return param_name in state_dict
343
+
344
+ def is_ExVideo_StableVideoDiffusion(self, state_dict):
345
+ param_name = "blocks.185.positional_embedding.embeddings"
346
+ return param_name in state_dict
347
+
348
+ def is_stable_diffusion_3(self, state_dict):
349
+ param_names = [
350
+ "text_encoders.clip_l.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight",
351
+ "text_encoders.clip_g.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight",
352
+ "model.diffusion_model.joint_blocks.9.x_block.mlp.fc2.weight",
353
+ "first_stage_model.encoder.mid.block_2.norm2.weight",
354
+ "first_stage_model.decoder.mid.block_2.norm2.weight",
355
+ ]
356
+ for param_name in param_names:
357
+ if param_name not in state_dict:
358
+ return False
359
+ return True
360
+
361
+ def is_stable_diffusion_3_t5(self, state_dict):
362
+ param_name = "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
363
+ return param_name in state_dict
364
+
365
+ def is_kolors_text_encoder(self, file_path):
366
+ file_list = os.listdir(file_path)
367
+ if "config.json" in file_list:
368
+ try:
369
+ with open(os.path.join(file_path, "config.json"), "r") as f:
370
+ config = json.load(f)
371
+ if config.get("model_type") == "chatglm":
372
+ return True
373
+ except:
374
+ pass
375
+ return False
376
+
377
+ def is_kolors_unet(self, state_dict):
378
+ return "up_blocks.2.resnets.2.time_emb_proj.weight" in state_dict and "encoder_hid_proj.weight" in state_dict
379
+
380
+ def load_stable_video_diffusion(self, state_dict, components=None, file_path="", add_positional_conv=None):
381
+ component_dict = {
382
+ "image_encoder": SVDImageEncoder,
383
+ "unet": SVDUNet,
384
+ "vae_decoder": SVDVAEDecoder,
385
+ "vae_encoder": SVDVAEEncoder,
386
+ }
387
+ if components is None:
388
+ components = ["image_encoder", "unet", "vae_decoder", "vae_encoder"]
389
+ for component in components:
390
+ if component == "unet":
391
+ self.model[component] = component_dict[component](add_positional_conv=add_positional_conv)
392
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict, add_positional_conv=add_positional_conv), strict=False)
393
+ else:
394
+ self.model[component] = component_dict[component]()
395
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
396
+ self.model[component].to(self.torch_dtype).to(self.device)
397
+ self.model_path[component] = file_path
398
+
399
+ def load_stable_diffusion(self, state_dict, components=None, file_path=""):
400
+ component_dict = {
401
+ "text_encoder": SDTextEncoder,
402
+ "unet": SDUNet,
403
+ "vae_decoder": SDVAEDecoder,
404
+ "vae_encoder": SDVAEEncoder,
405
+ "refiner": SDXLUNet,
406
+ }
407
+ if components is None:
408
+ components = ["text_encoder", "unet", "vae_decoder", "vae_encoder"]
409
+ for component in components:
410
+ if component == "text_encoder":
411
+ # Add additional token embeddings to text encoder
412
+ token_embeddings = [state_dict["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"]]
413
+ for keyword in self.textual_inversion_dict:
414
+ _, embeddings = self.textual_inversion_dict[keyword]
415
+ token_embeddings.append(embeddings.to(dtype=token_embeddings[0].dtype))
416
+ token_embeddings = torch.concat(token_embeddings, dim=0)
417
+ state_dict["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"] = token_embeddings
418
+ self.model[component] = component_dict[component](vocab_size=token_embeddings.shape[0])
419
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
420
+ self.model[component].to(self.torch_dtype).to(self.device)
421
+ else:
422
+ self.model[component] = component_dict[component]()
423
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
424
+ self.model[component].to(self.torch_dtype).to(self.device)
425
+ self.model_path[component] = file_path
426
+
427
+ def load_stable_diffusion_xl(self, state_dict, components=None, file_path=""):
428
+ component_dict = {
429
+ "text_encoder": SDXLTextEncoder,
430
+ "text_encoder_2": SDXLTextEncoder2,
431
+ "unet": SDXLUNet,
432
+ "vae_decoder": SDXLVAEDecoder,
433
+ "vae_encoder": SDXLVAEEncoder,
434
+ }
435
+ if components is None:
436
+ components = ["text_encoder", "text_encoder_2", "unet", "vae_decoder", "vae_encoder"]
437
+ for component in components:
438
+ self.model[component] = component_dict[component]()
439
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
440
+ if component in ["vae_decoder", "vae_encoder"]:
441
+ # These two model will output nan when float16 is enabled.
442
+ # The precision problem happens in the last three resnet blocks.
443
+ # I do not know how to solve this problem.
444
+ self.model[component].to(torch.float32).to(self.device)
445
+ else:
446
+ self.model[component].to(self.torch_dtype).to(self.device)
447
+ self.model_path[component] = file_path
448
+
449
+ def load_controlnet(self, state_dict, file_path=""):
450
+ component = "controlnet"
451
+ if component not in self.model:
452
+ self.model[component] = []
453
+ self.model_path[component] = []
454
+ model = SDControlNet()
455
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
456
+ model.to(self.torch_dtype).to(self.device)
457
+ self.model[component].append(model)
458
+ self.model_path[component].append(file_path)
459
+
460
+ def load_animatediff(self, state_dict, file_path=""):
461
+ component = "motion_modules"
462
+ model = SDMotionModel()
463
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
464
+ model.to(self.torch_dtype).to(self.device)
465
+ self.model[component] = model
466
+ self.model_path[component] = file_path
467
+
468
+ def load_animatediff_xl(self, state_dict, file_path=""):
469
+ component = "motion_modules_xl"
470
+ model = SDXLMotionModel()
471
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
472
+ model.to(self.torch_dtype).to(self.device)
473
+ self.model[component] = model
474
+ self.model_path[component] = file_path
475
+
476
+ def load_beautiful_prompt(self, state_dict, file_path=""):
477
+ component = "beautiful_prompt"
478
+ from transformers import AutoModelForCausalLM
479
+ model_folder = os.path.dirname(file_path)
480
+ model = AutoModelForCausalLM.from_pretrained(
481
+ model_folder, state_dict=state_dict, local_files_only=True, torch_dtype=self.torch_dtype
482
+ ).to(self.device).eval()
483
+ self.model[component] = model
484
+ self.model_path[component] = file_path
485
+
486
+ def load_RIFE(self, state_dict, file_path=""):
487
+ component = "RIFE"
488
+ from ..extensions.RIFE import IFNet
489
+ model = IFNet().eval()
490
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
491
+ model.to(torch.float32).to(self.device)
492
+ self.model[component] = model
493
+ self.model_path[component] = file_path
494
+
495
+ def load_sd_lora(self, state_dict, alpha):
496
+ SDLoRA().add_lora_to_text_encoder(self.model["text_encoder"], state_dict, alpha=alpha, device=self.device)
497
+ SDLoRA().add_lora_to_unet(self.model["unet"], state_dict, alpha=alpha, device=self.device)
498
+
499
+ def load_translator(self, state_dict, file_path=""):
500
+ # This model is lightweight, we do not place it on GPU.
501
+ component = "translator"
502
+ from transformers import AutoModelForSeq2SeqLM
503
+ model_folder = os.path.dirname(file_path)
504
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_folder).eval()
505
+ self.model[component] = model
506
+ self.model_path[component] = file_path
507
+
508
+ def load_ipadapter(self, state_dict, file_path=""):
509
+ component = "ipadapter"
510
+ model = SDIpAdapter()
511
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
512
+ model.to(self.torch_dtype).to(self.device)
513
+ self.model[component] = model
514
+ self.model_path[component] = file_path
515
+
516
+ def load_ipadapter_image_encoder(self, state_dict, file_path=""):
517
+ component = "ipadapter_image_encoder"
518
+ model = IpAdapterCLIPImageEmbedder()
519
+ model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
520
+ model.to(self.torch_dtype).to(self.device)
521
+ self.model[component] = model
522
+ self.model_path[component] = file_path
523
+
524
+ def load_ipadapter_xl(self, state_dict, file_path=""):
525
+ component = "ipadapter_xl"
526
+ model = SDXLIpAdapter()
527
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
528
+ model.to(self.torch_dtype).to(self.device)
529
+ self.model[component] = model
530
+ self.model_path[component] = file_path
531
+
532
+ def load_ipadapter_xl_image_encoder(self, state_dict, file_path=""):
533
+ component = "ipadapter_xl_image_encoder"
534
+ model = IpAdapterXLCLIPImageEmbedder()
535
+ model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
536
+ model.to(self.torch_dtype).to(self.device)
537
+ self.model[component] = model
538
+ self.model_path[component] = file_path
539
+
540
+ def load_hunyuan_dit_clip_text_encoder(self, state_dict, file_path=""):
541
+ component = "hunyuan_dit_clip_text_encoder"
542
+ model = HunyuanDiTCLIPTextEncoder()
543
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
544
+ model.to(self.torch_dtype).to(self.device)
545
+ self.model[component] = model
546
+ self.model_path[component] = file_path
547
+
548
+ def load_hunyuan_dit_t5_text_encoder(self, state_dict, file_path=""):
549
+ component = "hunyuan_dit_t5_text_encoder"
550
+ model = HunyuanDiTT5TextEncoder()
551
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
552
+ model.to(self.torch_dtype).to(self.device)
553
+ self.model[component] = model
554
+ self.model_path[component] = file_path
555
+
556
+ def load_hunyuan_dit(self, state_dict, file_path=""):
557
+ component = "hunyuan_dit"
558
+ model = HunyuanDiT()
559
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
560
+ model.to(self.torch_dtype).to(self.device)
561
+ self.model[component] = model
562
+ self.model_path[component] = file_path
563
+
564
+ def load_diffusers_vae(self, state_dict, file_path=""):
565
+ # TODO: detect SD and SDXL
566
+ component = "vae_encoder"
567
+ model = SDXLVAEEncoder()
568
+ model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
569
+ model.to(torch.float32).to(self.device)
570
+ self.model[component] = model
571
+ self.model_path[component] = file_path
572
+ component = "vae_decoder"
573
+ model = SDXLVAEDecoder()
574
+ model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
575
+ model.to(torch.float32).to(self.device)
576
+ self.model[component] = model
577
+ self.model_path[component] = file_path
578
+
579
+ def load_ExVideo_StableVideoDiffusion(self, state_dict, file_path=""):
580
+ unet_state_dict = self.model["unet"].state_dict()
581
+ self.model["unet"].to("cpu")
582
+ del self.model["unet"]
583
+ add_positional_conv = state_dict["blocks.185.positional_embedding.embeddings"].shape[0]
584
+ self.model["unet"] = SVDUNet(add_positional_conv=add_positional_conv)
585
+ self.model["unet"].load_state_dict(unet_state_dict, strict=False)
586
+ self.model["unet"].load_state_dict(state_dict, strict=False)
587
+ self.model["unet"].to(self.torch_dtype).to(self.device)
588
+
589
+ def load_stable_diffusion_3(self, state_dict, components=None, file_path=""):
590
+ component_dict = {
591
+ "sd3_text_encoder_1": SD3TextEncoder1,
592
+ "sd3_text_encoder_2": SD3TextEncoder2,
593
+ "sd3_text_encoder_3": SD3TextEncoder3,
594
+ "sd3_dit": SD3DiT,
595
+ "sd3_vae_decoder": SD3VAEDecoder,
596
+ "sd3_vae_encoder": SD3VAEEncoder,
597
+ }
598
+ if components is None:
599
+ components = ["sd3_text_encoder_1", "sd3_text_encoder_2", "sd3_text_encoder_3", "sd3_dit", "sd3_vae_decoder", "sd3_vae_encoder"]
600
+ for component in components:
601
+ if component == "sd3_text_encoder_3":
602
+ if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" not in state_dict:
603
+ continue
604
+ if component == "sd3_text_encoder_1":
605
+ # Add additional token embeddings to text encoder
606
+ token_embeddings = [state_dict["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"]]
607
+ for keyword in self.textual_inversion_dict:
608
+ _, embeddings = self.textual_inversion_dict[keyword]
609
+ token_embeddings.append(embeddings.to(dtype=token_embeddings[0].dtype))
610
+ token_embeddings = torch.concat(token_embeddings, dim=0)
611
+ state_dict["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"] = token_embeddings
612
+ self.model[component] = component_dict[component](vocab_size=token_embeddings.shape[0])
613
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
614
+ self.model[component].to(self.torch_dtype).to(self.device)
615
+ else:
616
+ self.model[component] = component_dict[component]()
617
+ self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
618
+ self.model[component].to(self.torch_dtype).to(self.device)
619
+ self.model_path[component] = file_path
620
+
621
+ def load_stable_diffusion_3_t5(self, state_dict, file_path=""):
622
+ component = "sd3_text_encoder_3"
623
+ model = SD3TextEncoder3()
624
+ model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
625
+ model.to(self.torch_dtype).to(self.device)
626
+ self.model[component] = model
627
+ self.model_path[component] = file_path
628
+
629
+ def load_kolors_text_encoder(self, state_dict=None, file_path=""):
630
+ component = "kolors_text_encoder"
631
+ model = ChatGLMModel.from_pretrained(file_path, torch_dtype=self.torch_dtype)
632
+ model = model.to(dtype=self.torch_dtype, device=self.device)
633
+ self.model[component] = model
634
+ self.model_path[component] = file_path
635
+
636
+ def load_kolors_unet(self, state_dict, file_path=""):
637
+ component = "kolors_unet"
638
+ model = SDXLUNet(is_kolors=True)
639
+ model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
640
+ model.to(self.torch_dtype).to(self.device)
641
+ self.model[component] = model
642
+ self.model_path[component] = file_path
643
+
644
+ def search_for_embeddings(self, state_dict):
645
+ embeddings = []
646
+ for k in state_dict:
647
+ if isinstance(state_dict[k], torch.Tensor):
648
+ embeddings.append(state_dict[k])
649
+ elif isinstance(state_dict[k], dict):
650
+ embeddings += self.search_for_embeddings(state_dict[k])
651
+ return embeddings
652
+
653
+ def load_textual_inversions(self, folder):
654
+ # Store additional tokens here
655
+ self.textual_inversion_dict = {}
656
+
657
+ # Load every textual inversion file
658
+ for file_name in os.listdir(folder):
659
+ if os.path.isdir(os.path.join(folder, file_name)) or \
660
+ not (file_name.endswith(".bin") or \
661
+ file_name.endswith(".safetensors") or \
662
+ file_name.endswith(".pth") or \
663
+ file_name.endswith(".pt")):
664
+ continue
665
+ keyword = os.path.splitext(file_name)[0]
666
+ state_dict = load_state_dict(os.path.join(folder, file_name))
667
+
668
+ # Search for embeddings
669
+ for embeddings in self.search_for_embeddings(state_dict):
670
+ if len(embeddings.shape) == 2 and embeddings.shape[1] == 768:
671
+ tokens = [f"{keyword}_{i}" for i in range(embeddings.shape[0])]
672
+ self.textual_inversion_dict[keyword] = (tokens, embeddings)
673
+ break
674
+
675
+ def load_model(self, file_path, components=None, lora_alphas=[]):
676
+ if os.path.isdir(file_path):
677
+ if self.is_kolors_text_encoder(file_path):
678
+ self.load_kolors_text_encoder(file_path=file_path)
679
+ return
680
+ state_dict = load_state_dict(file_path, torch_dtype=self.torch_dtype)
681
+ if self.is_stable_video_diffusion(state_dict):
682
+ self.load_stable_video_diffusion(state_dict, file_path=file_path)
683
+ elif self.is_animatediff(state_dict):
684
+ self.load_animatediff(state_dict, file_path=file_path)
685
+ elif self.is_animatediff_xl(state_dict):
686
+ self.load_animatediff_xl(state_dict, file_path=file_path)
687
+ elif self.is_controlnet(state_dict):
688
+ self.load_controlnet(state_dict, file_path=file_path)
689
+ elif self.is_stabe_diffusion_xl(state_dict):
690
+ self.load_stable_diffusion_xl(state_dict, components=components, file_path=file_path)
691
+ elif self.is_stable_diffusion(state_dict):
692
+ self.load_stable_diffusion(state_dict, components=components, file_path=file_path)
693
+ elif self.is_sd_lora(state_dict):
694
+ self.load_sd_lora(state_dict, alpha=lora_alphas.pop(0))
695
+ elif self.is_beautiful_prompt(state_dict):
696
+ self.load_beautiful_prompt(state_dict, file_path=file_path)
697
+ elif self.is_RIFE(state_dict):
698
+ self.load_RIFE(state_dict, file_path=file_path)
699
+ elif self.is_translator(state_dict):
700
+ self.load_translator(state_dict, file_path=file_path)
701
+ elif self.is_ipadapter(state_dict):
702
+ self.load_ipadapter(state_dict, file_path=file_path)
703
+ elif self.is_ipadapter_image_encoder(state_dict):
704
+ self.load_ipadapter_image_encoder(state_dict, file_path=file_path)
705
+ elif self.is_ipadapter_xl(state_dict):
706
+ self.load_ipadapter_xl(state_dict, file_path=file_path)
707
+ elif self.is_ipadapter_xl_image_encoder(state_dict):
708
+ self.load_ipadapter_xl_image_encoder(state_dict, file_path=file_path)
709
+ elif self.is_hunyuan_dit_clip_text_encoder(state_dict):
710
+ self.load_hunyuan_dit_clip_text_encoder(state_dict, file_path=file_path)
711
+ elif self.is_hunyuan_dit_t5_text_encoder(state_dict):
712
+ self.load_hunyuan_dit_t5_text_encoder(state_dict, file_path=file_path)
713
+ elif self.is_hunyuan_dit(state_dict):
714
+ self.load_hunyuan_dit(state_dict, file_path=file_path)
715
+ elif self.is_diffusers_vae(state_dict):
716
+ self.load_diffusers_vae(state_dict, file_path=file_path)
717
+ elif self.is_ExVideo_StableVideoDiffusion(state_dict):
718
+ self.load_ExVideo_StableVideoDiffusion(state_dict, file_path=file_path)
719
+ elif self.is_stable_diffusion_3(state_dict):
720
+ self.load_stable_diffusion_3(state_dict, components=components, file_path=file_path)
721
+ elif self.is_stable_diffusion_3_t5(state_dict):
722
+ self.load_stable_diffusion_3_t5(state_dict, file_path=file_path)
723
+ elif self.is_kolors_unet(state_dict):
724
+ self.load_kolors_unet(state_dict, file_path=file_path)
725
+
726
+ def load_models(self, file_path_list, lora_alphas=[]):
727
+ for file_path in file_path_list:
728
+ self.load_model(file_path, lora_alphas=lora_alphas)
729
+
730
+ def to(self, device):
731
+ for component in self.model:
732
+ if isinstance(self.model[component], list):
733
+ for model in self.model[component]:
734
+ model.to(device)
735
+ else:
736
+ self.model[component].to(device)
737
+ torch.cuda.empty_cache()
738
+
739
+ def get_model_with_model_path(self, model_path):
740
+ for component in self.model_path:
741
+ if isinstance(self.model_path[component], str):
742
+ if os.path.samefile(self.model_path[component], model_path):
743
+ return self.model[component]
744
+ elif isinstance(self.model_path[component], list):
745
+ for i, model_path_ in enumerate(self.model_path[component]):
746
+ if os.path.samefile(model_path_, model_path):
747
+ return self.model[component][i]
748
+ raise ValueError(f"Please load model {model_path} before you use it.")
749
+
750
+ def __getattr__(self, __name):
751
+ if __name in self.model:
752
+ return self.model[__name]
753
+ else:
754
+ return super.__getattribute__(__name)
755
+
756
+
757
+ def load_state_dict(file_path, torch_dtype=None):
758
+ if file_path.endswith(".safetensors"):
759
+ return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
760
+ else:
761
+ return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
762
+
763
+
764
+ def load_state_dict_from_safetensors(file_path, torch_dtype=None):
765
+ state_dict = {}
766
+ with safe_open(file_path, framework="pt", device="cpu") as f:
767
+ for k in f.keys():
768
+ state_dict[k] = f.get_tensor(k)
769
+ if torch_dtype is not None:
770
+ state_dict[k] = state_dict[k].to(torch_dtype)
771
+ return state_dict
772
+
773
+
774
+ def load_state_dict_from_bin(file_path, torch_dtype=None):
775
+ state_dict = torch.load(file_path, map_location="cpu")
776
+ if torch_dtype is not None:
777
+ for i in state_dict:
778
+ if isinstance(state_dict[i], torch.Tensor):
779
+ state_dict[i] = state_dict[i].to(torch_dtype)
780
+ return state_dict
781
+
782
+
783
+ def search_parameter(param, state_dict):
784
+ for name, param_ in state_dict.items():
785
+ if param.numel() == param_.numel():
786
+ if param.shape == param_.shape:
787
+ if torch.dist(param, param_) < 1e-6:
788
+ return name
789
+ else:
790
+ if torch.dist(param.flatten(), param_.flatten()) < 1e-6:
791
+ return name
792
+ return None
793
+
794
+
795
+ def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
796
+ matched_keys = set()
797
+ with torch.no_grad():
798
+ for name in source_state_dict:
799
+ rename = search_parameter(source_state_dict[name], target_state_dict)
800
+ if rename is not None:
801
+ print(f'"{name}": "{rename}",')
802
+ matched_keys.add(rename)
803
+ elif split_qkv and len(source_state_dict[name].shape)>=1 and source_state_dict[name].shape[0]%3==0:
804
+ length = source_state_dict[name].shape[0] // 3
805
+ rename = []
806
+ for i in range(3):
807
+ rename.append(search_parameter(source_state_dict[name][i*length: i*length+length], target_state_dict))
808
+ if None not in rename:
809
+ print(f'"{name}": {rename},')
810
+ for rename_ in rename:
811
+ matched_keys.add(rename_)
812
+ for name in target_state_dict:
813
+ if name not in matched_keys:
814
+ print("Cannot find", name, target_state_dict[name].shape)
diffsynth/models/attention.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from einops import rearrange
3
+
4
+
5
+ def low_version_attention(query, key, value, attn_bias=None):
6
+ scale = 1 / query.shape[-1] ** 0.5
7
+ query = query * scale
8
+ attn = torch.matmul(query, key.transpose(-2, -1))
9
+ if attn_bias is not None:
10
+ attn = attn + attn_bias
11
+ attn = attn.softmax(-1)
12
+ return attn @ value
13
+
14
+
15
+ class Attention(torch.nn.Module):
16
+
17
+ def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
18
+ super().__init__()
19
+ dim_inner = head_dim * num_heads
20
+ kv_dim = kv_dim if kv_dim is not None else q_dim
21
+ self.num_heads = num_heads
22
+ self.head_dim = head_dim
23
+
24
+ self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
25
+ self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
26
+ self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
27
+ self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
28
+
29
+ def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0):
30
+ batch_size = q.shape[0]
31
+ ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
32
+ ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
33
+ ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
34
+ hidden_states = hidden_states + scale * ip_hidden_states
35
+ return hidden_states
36
+
37
+ def torch_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
38
+ if encoder_hidden_states is None:
39
+ encoder_hidden_states = hidden_states
40
+
41
+ batch_size = encoder_hidden_states.shape[0]
42
+
43
+ q = self.to_q(hidden_states)
44
+ k = self.to_k(encoder_hidden_states)
45
+ v = self.to_v(encoder_hidden_states)
46
+
47
+ q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
48
+ k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
49
+ v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
50
+
51
+ if qkv_preprocessor is not None:
52
+ q, k, v = qkv_preprocessor(q, k, v)
53
+
54
+ hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
55
+ if ipadapter_kwargs is not None:
56
+ hidden_states = self.interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs)
57
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
58
+ hidden_states = hidden_states.to(q.dtype)
59
+
60
+ hidden_states = self.to_out(hidden_states)
61
+
62
+ return hidden_states
63
+
64
+ def xformers_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
65
+ if encoder_hidden_states is None:
66
+ encoder_hidden_states = hidden_states
67
+
68
+ q = self.to_q(hidden_states)
69
+ k = self.to_k(encoder_hidden_states)
70
+ v = self.to_v(encoder_hidden_states)
71
+
72
+ q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads)
73
+ k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads)
74
+ v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads)
75
+
76
+ if attn_mask is not None:
77
+ hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask)
78
+ else:
79
+ import xformers.ops as xops
80
+ hidden_states = xops.memory_efficient_attention(q, k, v)
81
+ hidden_states = rearrange(hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads)
82
+
83
+ hidden_states = hidden_states.to(q.dtype)
84
+ hidden_states = self.to_out(hidden_states)
85
+
86
+ return hidden_states
87
+
88
+ def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
89
+ return self.torch_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask, ipadapter_kwargs=ipadapter_kwargs, qkv_preprocessor=qkv_preprocessor)
diffsynth/models/downloader.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ from modelscope import snapshot_download
3
+ import os, shutil
4
+
5
+
6
+ def download_from_modelscope(model_id, origin_file_path, local_dir):
7
+ os.makedirs(local_dir, exist_ok=True)
8
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
9
+ print(f"{os.path.basename(origin_file_path)} has been already in {local_dir}.")
10
+ return
11
+ else:
12
+ print(f"Start downloading {os.path.join(local_dir, os.path.basename(origin_file_path))}")
13
+ snapshot_download(model_id, allow_file_pattern=origin_file_path, local_dir=local_dir)
14
+ downloaded_file_path = os.path.join(local_dir, origin_file_path)
15
+ target_file_path = os.path.join(local_dir, os.path.split(origin_file_path)[-1])
16
+ if downloaded_file_path != target_file_path:
17
+ shutil.move(downloaded_file_path, target_file_path)
18
+ shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
19
+
20
+
21
+ def download_from_huggingface(model_id, origin_file_path, local_dir):
22
+ os.makedirs(local_dir, exist_ok=True)
23
+ if os.path.basename(origin_file_path) in os.listdir(local_dir):
24
+ print(f"{os.path.basename(origin_file_path)} has been already in {local_dir}.")
25
+ return
26
+ else:
27
+ print(f"Start downloading {os.path.join(local_dir, os.path.basename(origin_file_path))}")
28
+ hf_hub_download(model_id, origin_file_path, local_dir=local_dir)
diffsynth/models/hunyuan_dit.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .attention import Attention
2
+ from .tiler import TileWorker
3
+ from einops import repeat, rearrange
4
+ import math
5
+ import torch
6
+
7
+
8
+ class HunyuanDiTRotaryEmbedding(torch.nn.Module):
9
+
10
+ def __init__(self, q_norm_shape=88, k_norm_shape=88, rotary_emb_on_k=True):
11
+ super().__init__()
12
+ self.q_norm = torch.nn.LayerNorm((q_norm_shape,), elementwise_affine=True, eps=1e-06)
13
+ self.k_norm = torch.nn.LayerNorm((k_norm_shape,), elementwise_affine=True, eps=1e-06)
14
+ self.rotary_emb_on_k = rotary_emb_on_k
15
+ self.k_cache, self.v_cache = [], []
16
+
17
+ def reshape_for_broadcast(self, freqs_cis, x):
18
+ ndim = x.ndim
19
+ shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
20
+ return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
21
+
22
+ def rotate_half(self, x):
23
+ x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
24
+ return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
25
+
26
+ def apply_rotary_emb(self, xq, xk, freqs_cis):
27
+ xk_out = None
28
+ cos, sin = self.reshape_for_broadcast(freqs_cis, xq)
29
+ cos, sin = cos.to(xq.device), sin.to(xq.device)
30
+ xq_out = (xq.float() * cos + self.rotate_half(xq.float()) * sin).type_as(xq)
31
+ if xk is not None:
32
+ xk_out = (xk.float() * cos + self.rotate_half(xk.float()) * sin).type_as(xk)
33
+ return xq_out, xk_out
34
+
35
+ def forward(self, q, k, v, freqs_cis_img, to_cache=False):
36
+ # norm
37
+ q = self.q_norm(q)
38
+ k = self.k_norm(k)
39
+
40
+ # RoPE
41
+ if self.rotary_emb_on_k:
42
+ q, k = self.apply_rotary_emb(q, k, freqs_cis_img)
43
+ else:
44
+ q, _ = self.apply_rotary_emb(q, None, freqs_cis_img)
45
+
46
+ if to_cache:
47
+ self.k_cache.append(k)
48
+ self.v_cache.append(v)
49
+ elif len(self.k_cache) > 0 and len(self.v_cache) > 0:
50
+ k = torch.concat([k] + self.k_cache, dim=2)
51
+ v = torch.concat([v] + self.v_cache, dim=2)
52
+ self.k_cache, self.v_cache = [], []
53
+ return q, k, v
54
+
55
+
56
+ class FP32_Layernorm(torch.nn.LayerNorm):
57
+ def forward(self, inputs):
58
+ origin_dtype = inputs.dtype
59
+ return torch.nn.functional.layer_norm(inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps).to(origin_dtype)
60
+
61
+
62
+ class FP32_SiLU(torch.nn.SiLU):
63
+ def forward(self, inputs):
64
+ origin_dtype = inputs.dtype
65
+ return torch.nn.functional.silu(inputs.float(), inplace=False).to(origin_dtype)
66
+
67
+
68
+ class HunyuanDiTFinalLayer(torch.nn.Module):
69
+ def __init__(self, final_hidden_size=1408, condition_dim=1408, patch_size=2, out_channels=8):
70
+ super().__init__()
71
+ self.norm_final = torch.nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
72
+ self.linear = torch.nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True)
73
+ self.adaLN_modulation = torch.nn.Sequential(
74
+ FP32_SiLU(),
75
+ torch.nn.Linear(condition_dim, 2 * final_hidden_size, bias=True)
76
+ )
77
+
78
+ def modulate(self, x, shift, scale):
79
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
80
+
81
+ def forward(self, hidden_states, condition_emb):
82
+ shift, scale = self.adaLN_modulation(condition_emb).chunk(2, dim=1)
83
+ hidden_states = self.modulate(self.norm_final(hidden_states), shift, scale)
84
+ hidden_states = self.linear(hidden_states)
85
+ return hidden_states
86
+
87
+
88
+ class HunyuanDiTBlock(torch.nn.Module):
89
+
90
+ def __init__(
91
+ self,
92
+ hidden_dim=1408,
93
+ condition_dim=1408,
94
+ num_heads=16,
95
+ mlp_ratio=4.3637,
96
+ text_dim=1024,
97
+ skip_connection=False
98
+ ):
99
+ super().__init__()
100
+ self.norm1 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
101
+ self.rota1 = HunyuanDiTRotaryEmbedding(hidden_dim//num_heads, hidden_dim//num_heads)
102
+ self.attn1 = Attention(hidden_dim, num_heads, hidden_dim//num_heads, bias_q=True, bias_kv=True, bias_out=True)
103
+ self.norm2 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
104
+ self.rota2 = HunyuanDiTRotaryEmbedding(hidden_dim//num_heads, hidden_dim//num_heads, rotary_emb_on_k=False)
105
+ self.attn2 = Attention(hidden_dim, num_heads, hidden_dim//num_heads, kv_dim=text_dim, bias_q=True, bias_kv=True, bias_out=True)
106
+ self.norm3 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
107
+ self.modulation = torch.nn.Sequential(FP32_SiLU(), torch.nn.Linear(condition_dim, hidden_dim, bias=True))
108
+ self.mlp = torch.nn.Sequential(
109
+ torch.nn.Linear(hidden_dim, int(hidden_dim*mlp_ratio), bias=True),
110
+ torch.nn.GELU(approximate="tanh"),
111
+ torch.nn.Linear(int(hidden_dim*mlp_ratio), hidden_dim, bias=True)
112
+ )
113
+ if skip_connection:
114
+ self.skip_norm = FP32_Layernorm((hidden_dim * 2,), eps=1e-6, elementwise_affine=True)
115
+ self.skip_linear = torch.nn.Linear(hidden_dim * 2, hidden_dim, bias=True)
116
+ else:
117
+ self.skip_norm, self.skip_linear = None, None
118
+
119
+ def forward(self, hidden_states, condition_emb, text_emb, freq_cis_img, residual=None, to_cache=False):
120
+ # Long Skip Connection
121
+ if self.skip_norm is not None and self.skip_linear is not None:
122
+ hidden_states = torch.cat([hidden_states, residual], dim=-1)
123
+ hidden_states = self.skip_norm(hidden_states)
124
+ hidden_states = self.skip_linear(hidden_states)
125
+
126
+ # Self-Attention
127
+ shift_msa = self.modulation(condition_emb).unsqueeze(dim=1)
128
+ attn_input = self.norm1(hidden_states) + shift_msa
129
+ hidden_states = hidden_states + self.attn1(attn_input, qkv_preprocessor=lambda q, k, v: self.rota1(q, k, v, freq_cis_img, to_cache=to_cache))
130
+
131
+ # Cross-Attention
132
+ attn_input = self.norm3(hidden_states)
133
+ hidden_states = hidden_states + self.attn2(attn_input, text_emb, qkv_preprocessor=lambda q, k, v: self.rota2(q, k, v, freq_cis_img))
134
+
135
+ # FFN Layer
136
+ mlp_input = self.norm2(hidden_states)
137
+ hidden_states = hidden_states + self.mlp(mlp_input)
138
+ return hidden_states
139
+
140
+
141
+ class AttentionPool(torch.nn.Module):
142
+ def __init__(self, spacial_dim, embed_dim, num_heads, output_dim = None):
143
+ super().__init__()
144
+ self.positional_embedding = torch.nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
145
+ self.k_proj = torch.nn.Linear(embed_dim, embed_dim)
146
+ self.q_proj = torch.nn.Linear(embed_dim, embed_dim)
147
+ self.v_proj = torch.nn.Linear(embed_dim, embed_dim)
148
+ self.c_proj = torch.nn.Linear(embed_dim, output_dim or embed_dim)
149
+ self.num_heads = num_heads
150
+
151
+ def forward(self, x):
152
+ x = x.permute(1, 0, 2) # NLC -> LNC
153
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (L+1)NC
154
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (L+1)NC
155
+ x, _ = torch.nn.functional.multi_head_attention_forward(
156
+ query=x[:1], key=x, value=x,
157
+ embed_dim_to_check=x.shape[-1],
158
+ num_heads=self.num_heads,
159
+ q_proj_weight=self.q_proj.weight,
160
+ k_proj_weight=self.k_proj.weight,
161
+ v_proj_weight=self.v_proj.weight,
162
+ in_proj_weight=None,
163
+ in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
164
+ bias_k=None,
165
+ bias_v=None,
166
+ add_zero_attn=False,
167
+ dropout_p=0,
168
+ out_proj_weight=self.c_proj.weight,
169
+ out_proj_bias=self.c_proj.bias,
170
+ use_separate_proj_weight=True,
171
+ training=self.training,
172
+ need_weights=False
173
+ )
174
+ return x.squeeze(0)
175
+
176
+
177
+ class PatchEmbed(torch.nn.Module):
178
+ def __init__(
179
+ self,
180
+ patch_size=(2, 2),
181
+ in_chans=4,
182
+ embed_dim=1408,
183
+ bias=True,
184
+ ):
185
+ super().__init__()
186
+ self.proj = torch.nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
187
+
188
+ def forward(self, x):
189
+ x = self.proj(x)
190
+ x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
191
+ return x
192
+
193
+
194
+ def timestep_embedding(t, dim, max_period=10000, repeat_only=False):
195
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
196
+ if not repeat_only:
197
+ half = dim // 2
198
+ freqs = torch.exp(
199
+ -math.log(max_period)
200
+ * torch.arange(start=0, end=half, dtype=torch.float32)
201
+ / half
202
+ ).to(device=t.device) # size: [dim/2], 一个指数衰减的曲线
203
+ args = t[:, None].float() * freqs[None]
204
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
205
+ if dim % 2:
206
+ embedding = torch.cat(
207
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
208
+ )
209
+ else:
210
+ embedding = repeat(t, "b -> b d", d=dim)
211
+ return embedding
212
+
213
+
214
+ class TimestepEmbedder(torch.nn.Module):
215
+ def __init__(self, hidden_size=1408, frequency_embedding_size=256):
216
+ super().__init__()
217
+ self.mlp = torch.nn.Sequential(
218
+ torch.nn.Linear(frequency_embedding_size, hidden_size, bias=True),
219
+ torch.nn.SiLU(),
220
+ torch.nn.Linear(hidden_size, hidden_size, bias=True),
221
+ )
222
+ self.frequency_embedding_size = frequency_embedding_size
223
+
224
+ def forward(self, t):
225
+ t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype)
226
+ t_emb = self.mlp(t_freq)
227
+ return t_emb
228
+
229
+
230
+ class HunyuanDiT(torch.nn.Module):
231
+ def __init__(self, num_layers_down=21, num_layers_up=19, in_channels=4, out_channels=8, hidden_dim=1408, text_dim=1024, t5_dim=2048, text_length=77, t5_length=256):
232
+ super().__init__()
233
+
234
+ # Embedders
235
+ self.text_emb_padding = torch.nn.Parameter(torch.randn(text_length + t5_length, text_dim, dtype=torch.float32))
236
+ self.t5_embedder = torch.nn.Sequential(
237
+ torch.nn.Linear(t5_dim, t5_dim * 4, bias=True),
238
+ FP32_SiLU(),
239
+ torch.nn.Linear(t5_dim * 4, text_dim, bias=True),
240
+ )
241
+ self.t5_pooler = AttentionPool(t5_length, t5_dim, num_heads=8, output_dim=1024)
242
+ self.style_embedder = torch.nn.Parameter(torch.randn(hidden_dim))
243
+ self.patch_embedder = PatchEmbed(in_chans=in_channels)
244
+ self.timestep_embedder = TimestepEmbedder()
245
+ self.extra_embedder = torch.nn.Sequential(
246
+ torch.nn.Linear(256 * 6 + 1024 + hidden_dim, hidden_dim * 4),
247
+ FP32_SiLU(),
248
+ torch.nn.Linear(hidden_dim * 4, hidden_dim),
249
+ )
250
+
251
+ # Transformer blocks
252
+ self.num_layers_down = num_layers_down
253
+ self.num_layers_up = num_layers_up
254
+ self.blocks = torch.nn.ModuleList(
255
+ [HunyuanDiTBlock(skip_connection=False) for _ in range(num_layers_down)] + \
256
+ [HunyuanDiTBlock(skip_connection=True) for _ in range(num_layers_up)]
257
+ )
258
+
259
+ # Output layers
260
+ self.final_layer = HunyuanDiTFinalLayer()
261
+ self.out_channels = out_channels
262
+
263
+ def prepare_text_emb(self, text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5):
264
+ text_emb_mask = text_emb_mask.bool()
265
+ text_emb_mask_t5 = text_emb_mask_t5.bool()
266
+ text_emb_t5 = self.t5_embedder(text_emb_t5)
267
+ text_emb = torch.cat([text_emb, text_emb_t5], dim=1)
268
+ text_emb_mask = torch.cat([text_emb_mask, text_emb_mask_t5], dim=-1)
269
+ text_emb = torch.where(text_emb_mask.unsqueeze(2), text_emb, self.text_emb_padding.to(text_emb))
270
+ return text_emb
271
+
272
+ def prepare_extra_emb(self, text_emb_t5, timestep, size_emb, dtype, batch_size):
273
+ # Text embedding
274
+ pooled_text_emb_t5 = self.t5_pooler(text_emb_t5)
275
+
276
+ # Timestep embedding
277
+ timestep_emb = self.timestep_embedder(timestep)
278
+
279
+ # Size embedding
280
+ size_emb = timestep_embedding(size_emb.view(-1), 256).to(dtype)
281
+ size_emb = size_emb.view(-1, 6 * 256)
282
+
283
+ # Style embedding
284
+ style_emb = repeat(self.style_embedder, "D -> B D", B=batch_size)
285
+
286
+ # Concatenate all extra vectors
287
+ extra_emb = torch.cat([pooled_text_emb_t5, size_emb, style_emb], dim=1)
288
+ condition_emb = timestep_emb + self.extra_embedder(extra_emb)
289
+
290
+ return condition_emb
291
+
292
+ def unpatchify(self, x, h, w):
293
+ return rearrange(x, "B (H W) (P Q C) -> B C (H P) (W Q)", H=h, W=w, P=2, Q=2)
294
+
295
+ def build_mask(self, data, is_bound):
296
+ _, _, H, W = data.shape
297
+ h = repeat(torch.arange(H), "H -> H W", H=H, W=W)
298
+ w = repeat(torch.arange(W), "W -> H W", H=H, W=W)
299
+ border_width = (H + W) // 4
300
+ pad = torch.ones_like(h) * border_width
301
+ mask = torch.stack([
302
+ pad if is_bound[0] else h + 1,
303
+ pad if is_bound[1] else H - h,
304
+ pad if is_bound[2] else w + 1,
305
+ pad if is_bound[3] else W - w
306
+ ]).min(dim=0).values
307
+ mask = mask.clip(1, border_width)
308
+ mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
309
+ mask = rearrange(mask, "H W -> 1 H W")
310
+ return mask
311
+
312
+ def tiled_block_forward(self, block, hidden_states, condition_emb, text_emb, freq_cis_img, residual, torch_dtype, data_device, computation_device, tile_size, tile_stride):
313
+ B, C, H, W = hidden_states.shape
314
+
315
+ weight = torch.zeros((1, 1, H, W), dtype=torch_dtype, device=data_device)
316
+ values = torch.zeros((B, C, H, W), dtype=torch_dtype, device=data_device)
317
+
318
+ # Split tasks
319
+ tasks = []
320
+ for h in range(0, H, tile_stride):
321
+ for w in range(0, W, tile_stride):
322
+ if (h-tile_stride >= 0 and h-tile_stride+tile_size >= H) or (w-tile_stride >= 0 and w-tile_stride+tile_size >= W):
323
+ continue
324
+ h_, w_ = h + tile_size, w + tile_size
325
+ if h_ > H: h, h_ = H - tile_size, H
326
+ if w_ > W: w, w_ = W - tile_size, W
327
+ tasks.append((h, h_, w, w_))
328
+
329
+ # Run
330
+ for hl, hr, wl, wr in tasks:
331
+ hidden_states_batch = hidden_states[:, :, hl:hr, wl:wr].to(computation_device)
332
+ hidden_states_batch = rearrange(hidden_states_batch, "B C H W -> B (H W) C")
333
+ if residual is not None:
334
+ residual_batch = residual[:, :, hl:hr, wl:wr].to(computation_device)
335
+ residual_batch = rearrange(residual_batch, "B C H W -> B (H W) C")
336
+ else:
337
+ residual_batch = None
338
+
339
+ # Forward
340
+ hidden_states_batch = block(hidden_states_batch, condition_emb, text_emb, freq_cis_img, residual_batch).to(data_device)
341
+ hidden_states_batch = rearrange(hidden_states_batch, "B (H W) C -> B C H W", H=hr-hl)
342
+
343
+ mask = self.build_mask(hidden_states_batch, is_bound=(hl==0, hr>=H, wl==0, wr>=W))
344
+ values[:, :, hl:hr, wl:wr] += hidden_states_batch * mask
345
+ weight[:, :, hl:hr, wl:wr] += mask
346
+ values /= weight
347
+ return values
348
+
349
+ def forward(
350
+ self, hidden_states, text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5, timestep, size_emb, freq_cis_img,
351
+ tiled=False, tile_size=64, tile_stride=32,
352
+ to_cache=False,
353
+ use_gradient_checkpointing=False,
354
+ ):
355
+ # Embeddings
356
+ text_emb = self.prepare_text_emb(text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5)
357
+ condition_emb = self.prepare_extra_emb(text_emb_t5, timestep, size_emb, hidden_states.dtype, hidden_states.shape[0])
358
+
359
+ # Input
360
+ height, width = hidden_states.shape[-2], hidden_states.shape[-1]
361
+ hidden_states = self.patch_embedder(hidden_states)
362
+
363
+ # Blocks
364
+ def create_custom_forward(module):
365
+ def custom_forward(*inputs):
366
+ return module(*inputs)
367
+ return custom_forward
368
+ if tiled:
369
+ hidden_states = rearrange(hidden_states, "B (H W) C -> B C H W", H=height//2)
370
+ residuals = []
371
+ for block_id, block in enumerate(self.blocks):
372
+ residual = residuals.pop() if block_id >= self.num_layers_down else None
373
+ hidden_states = self.tiled_block_forward(
374
+ block, hidden_states, condition_emb, text_emb, freq_cis_img, residual,
375
+ torch_dtype=hidden_states.dtype, data_device=hidden_states.device, computation_device=hidden_states.device,
376
+ tile_size=tile_size, tile_stride=tile_stride
377
+ )
378
+ if block_id < self.num_layers_down - 2:
379
+ residuals.append(hidden_states)
380
+ hidden_states = rearrange(hidden_states, "B C H W -> B (H W) C")
381
+ else:
382
+ residuals = []
383
+ for block_id, block in enumerate(self.blocks):
384
+ residual = residuals.pop() if block_id >= self.num_layers_down else None
385
+ if self.training and use_gradient_checkpointing:
386
+ hidden_states = torch.utils.checkpoint.checkpoint(
387
+ create_custom_forward(block),
388
+ hidden_states, condition_emb, text_emb, freq_cis_img, residual,
389
+ use_reentrant=False,
390
+ )
391
+ else:
392
+ hidden_states = block(hidden_states, condition_emb, text_emb, freq_cis_img, residual, to_cache=to_cache)
393
+ if block_id < self.num_layers_down - 2:
394
+ residuals.append(hidden_states)
395
+
396
+ # Output
397
+ hidden_states = self.final_layer(hidden_states, condition_emb)
398
+ hidden_states = self.unpatchify(hidden_states, height//2, width//2)
399
+ hidden_states, _ = hidden_states.chunk(2, dim=1)
400
+ return hidden_states
401
+
402
+ def state_dict_converter(self):
403
+ return HunyuanDiTStateDictConverter()
404
+
405
+
406
+
407
+ class HunyuanDiTStateDictConverter():
408
+ def __init__(self):
409
+ pass
410
+
411
+ def from_diffusers(self, state_dict):
412
+ state_dict_ = {}
413
+ for name, param in state_dict.items():
414
+ name_ = name
415
+ name_ = name_.replace(".default_modulation.", ".modulation.")
416
+ name_ = name_.replace(".mlp.fc1.", ".mlp.0.")
417
+ name_ = name_.replace(".mlp.fc2.", ".mlp.2.")
418
+ name_ = name_.replace(".attn1.q_norm.", ".rota1.q_norm.")
419
+ name_ = name_.replace(".attn2.q_norm.", ".rota2.q_norm.")
420
+ name_ = name_.replace(".attn1.k_norm.", ".rota1.k_norm.")
421
+ name_ = name_.replace(".attn2.k_norm.", ".rota2.k_norm.")
422
+ name_ = name_.replace(".q_proj.", ".to_q.")
423
+ name_ = name_.replace(".out_proj.", ".to_out.")
424
+ name_ = name_.replace("text_embedding_padding", "text_emb_padding")
425
+ name_ = name_.replace("mlp_t5.0.", "t5_embedder.0.")
426
+ name_ = name_.replace("mlp_t5.2.", "t5_embedder.2.")
427
+ name_ = name_.replace("pooler.", "t5_pooler.")
428
+ name_ = name_.replace("x_embedder.", "patch_embedder.")
429
+ name_ = name_.replace("t_embedder.", "timestep_embedder.")
430
+ name_ = name_.replace("t5_pooler.to_q.", "t5_pooler.q_proj.")
431
+ name_ = name_.replace("style_embedder.weight", "style_embedder")
432
+ if ".kv_proj." in name_:
433
+ param_k = param[:param.shape[0]//2]
434
+ param_v = param[param.shape[0]//2:]
435
+ state_dict_[name_.replace(".kv_proj.", ".to_k.")] = param_k
436
+ state_dict_[name_.replace(".kv_proj.", ".to_v.")] = param_v
437
+ elif ".Wqkv." in name_:
438
+ param_q = param[:param.shape[0]//3]
439
+ param_k = param[param.shape[0]//3:param.shape[0]//3*2]
440
+ param_v = param[param.shape[0]//3*2:]
441
+ state_dict_[name_.replace(".Wqkv.", ".to_q.")] = param_q
442
+ state_dict_[name_.replace(".Wqkv.", ".to_k.")] = param_k
443
+ state_dict_[name_.replace(".Wqkv.", ".to_v.")] = param_v
444
+ elif "style_embedder" in name_:
445
+ state_dict_[name_] = param.squeeze()
446
+ else:
447
+ state_dict_[name_] = param
448
+ return state_dict_
449
+
450
+ def from_civitai(self, state_dict):
451
+ return self.from_diffusers(state_dict)
diffsynth/models/hunyuan_dit_text_encoder.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertModel, BertConfig, T5EncoderModel, T5Config
2
+ import torch
3
+
4
+
5
+
6
+ class HunyuanDiTCLIPTextEncoder(BertModel):
7
+ def __init__(self):
8
+ config = BertConfig(
9
+ _name_or_path = "",
10
+ architectures = ["BertModel"],
11
+ attention_probs_dropout_prob = 0.1,
12
+ bos_token_id = 0,
13
+ classifier_dropout = None,
14
+ directionality = "bidi",
15
+ eos_token_id = 2,
16
+ hidden_act = "gelu",
17
+ hidden_dropout_prob = 0.1,
18
+ hidden_size = 1024,
19
+ initializer_range = 0.02,
20
+ intermediate_size = 4096,
21
+ layer_norm_eps = 1e-12,
22
+ max_position_embeddings = 512,
23
+ model_type = "bert",
24
+ num_attention_heads = 16,
25
+ num_hidden_layers = 24,
26
+ output_past = True,
27
+ pad_token_id = 0,
28
+ pooler_fc_size = 768,
29
+ pooler_num_attention_heads = 12,
30
+ pooler_num_fc_layers = 3,
31
+ pooler_size_per_head = 128,
32
+ pooler_type = "first_token_transform",
33
+ position_embedding_type = "absolute",
34
+ torch_dtype = "float32",
35
+ transformers_version = "4.37.2",
36
+ type_vocab_size = 2,
37
+ use_cache = True,
38
+ vocab_size = 47020
39
+ )
40
+ super().__init__(config, add_pooling_layer=False)
41
+ self.eval()
42
+
43
+ def forward(self, input_ids, attention_mask, clip_skip=1):
44
+ input_shape = input_ids.size()
45
+
46
+ batch_size, seq_length = input_shape
47
+ device = input_ids.device
48
+
49
+ past_key_values_length = 0
50
+
51
+ if attention_mask is None:
52
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
53
+
54
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
55
+
56
+ embedding_output = self.embeddings(
57
+ input_ids=input_ids,
58
+ position_ids=None,
59
+ token_type_ids=None,
60
+ inputs_embeds=None,
61
+ past_key_values_length=0,
62
+ )
63
+ encoder_outputs = self.encoder(
64
+ embedding_output,
65
+ attention_mask=extended_attention_mask,
66
+ head_mask=None,
67
+ encoder_hidden_states=None,
68
+ encoder_attention_mask=None,
69
+ past_key_values=None,
70
+ use_cache=False,
71
+ output_attentions=False,
72
+ output_hidden_states=True,
73
+ return_dict=True,
74
+ )
75
+ all_hidden_states = encoder_outputs.hidden_states
76
+ prompt_emb = all_hidden_states[-clip_skip]
77
+ if clip_skip > 1:
78
+ mean, std = all_hidden_states[-1].mean(), all_hidden_states[-1].std()
79
+ prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean
80
+ return prompt_emb
81
+
82
+ def state_dict_converter(self):
83
+ return HunyuanDiTCLIPTextEncoderStateDictConverter()
84
+
85
+
86
+
87
+ class HunyuanDiTT5TextEncoder(T5EncoderModel):
88
+ def __init__(self):
89
+ config = T5Config(
90
+ _name_or_path = "../HunyuanDiT/t2i/mt5",
91
+ architectures = ["MT5ForConditionalGeneration"],
92
+ classifier_dropout = 0.0,
93
+ d_ff = 5120,
94
+ d_kv = 64,
95
+ d_model = 2048,
96
+ decoder_start_token_id = 0,
97
+ dense_act_fn = "gelu_new",
98
+ dropout_rate = 0.1,
99
+ eos_token_id = 1,
100
+ feed_forward_proj = "gated-gelu",
101
+ initializer_factor = 1.0,
102
+ is_encoder_decoder = True,
103
+ is_gated_act = True,
104
+ layer_norm_epsilon = 1e-06,
105
+ model_type = "t5",
106
+ num_decoder_layers = 24,
107
+ num_heads = 32,
108
+ num_layers = 24,
109
+ output_past = True,
110
+ pad_token_id = 0,
111
+ relative_attention_max_distance = 128,
112
+ relative_attention_num_buckets = 32,
113
+ tie_word_embeddings = False,
114
+ tokenizer_class = "T5Tokenizer",
115
+ transformers_version = "4.37.2",
116
+ use_cache = True,
117
+ vocab_size = 250112
118
+ )
119
+ super().__init__(config)
120
+ self.eval()
121
+
122
+ def forward(self, input_ids, attention_mask, clip_skip=1):
123
+ outputs = super().forward(
124
+ input_ids=input_ids,
125
+ attention_mask=attention_mask,
126
+ output_hidden_states=True,
127
+ )
128
+ prompt_emb = outputs.hidden_states[-clip_skip]
129
+ if clip_skip > 1:
130
+ mean, std = outputs.hidden_states[-1].mean(), outputs.hidden_states[-1].std()
131
+ prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean
132
+ return prompt_emb
133
+
134
+ def state_dict_converter(self):
135
+ return HunyuanDiTT5TextEncoderStateDictConverter()
136
+
137
+
138
+
139
+ class HunyuanDiTCLIPTextEncoderStateDictConverter():
140
+ def __init__(self):
141
+ pass
142
+
143
+ def from_diffusers(self, state_dict):
144
+ state_dict_ = {name[5:]: param for name, param in state_dict.items() if name.startswith("bert.")}
145
+ return state_dict_
146
+
147
+ def from_civitai(self, state_dict):
148
+ return self.from_diffusers(state_dict)
149
+
150
+
151
+ class HunyuanDiTT5TextEncoderStateDictConverter():
152
+ def __init__(self):
153
+ pass
154
+
155
+ def from_diffusers(self, state_dict):
156
+ state_dict_ = {name: param for name, param in state_dict.items() if name.startswith("encoder.")}
157
+ state_dict_["shared.weight"] = state_dict["shared.weight"]
158
+ return state_dict_
159
+
160
+ def from_civitai(self, state_dict):
161
+ return self.from_diffusers(state_dict)
diffsynth/models/kolors_text_encoder.py ADDED
@@ -0,0 +1,1363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This model is copied from https://github.com/Kwai-Kolors/Kolors/tree/master/kolors/models.
3
+ We didn't modify this model.
4
+ The tensor operation is performed in the prompter.
5
+ """
6
+
7
+
8
+ """ PyTorch ChatGLM model. """
9
+
10
+ import math
11
+ import copy
12
+ import warnings
13
+ import re
14
+ import sys
15
+
16
+ import torch
17
+ import torch.utils.checkpoint
18
+ import torch.nn.functional as F
19
+ from torch import nn
20
+ from torch.nn import CrossEntropyLoss, LayerNorm
21
+ from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
22
+ from torch.nn.utils import skip_init
23
+ from typing import Optional, Tuple, Union, List, Callable, Dict, Any
24
+ from copy import deepcopy
25
+
26
+ from transformers.modeling_outputs import (
27
+ BaseModelOutputWithPast,
28
+ CausalLMOutputWithPast,
29
+ SequenceClassifierOutputWithPast,
30
+ )
31
+ from transformers.modeling_utils import PreTrainedModel
32
+ from transformers.utils import logging
33
+ from transformers.generation.logits_process import LogitsProcessor
34
+ from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
35
+ from transformers import PretrainedConfig
36
+
37
+
38
+
39
+ class ChatGLMConfig(PretrainedConfig):
40
+ model_type = "chatglm"
41
+ def __init__(
42
+ self,
43
+ num_layers=28,
44
+ padded_vocab_size=65024,
45
+ hidden_size=4096,
46
+ ffn_hidden_size=13696,
47
+ kv_channels=128,
48
+ num_attention_heads=32,
49
+ seq_length=2048,
50
+ hidden_dropout=0.0,
51
+ classifier_dropout=None,
52
+ attention_dropout=0.0,
53
+ layernorm_epsilon=1e-5,
54
+ rmsnorm=True,
55
+ apply_residual_connection_post_layernorm=False,
56
+ post_layer_norm=True,
57
+ add_bias_linear=False,
58
+ add_qkv_bias=False,
59
+ bias_dropout_fusion=True,
60
+ multi_query_attention=False,
61
+ multi_query_group_num=1,
62
+ apply_query_key_layer_scaling=True,
63
+ attention_softmax_in_fp32=True,
64
+ fp32_residual_connection=False,
65
+ quantization_bit=0,
66
+ pre_seq_len=None,
67
+ prefix_projection=False,
68
+ **kwargs
69
+ ):
70
+ self.num_layers = num_layers
71
+ self.vocab_size = padded_vocab_size
72
+ self.padded_vocab_size = padded_vocab_size
73
+ self.hidden_size = hidden_size
74
+ self.ffn_hidden_size = ffn_hidden_size
75
+ self.kv_channels = kv_channels
76
+ self.num_attention_heads = num_attention_heads
77
+ self.seq_length = seq_length
78
+ self.hidden_dropout = hidden_dropout
79
+ self.classifier_dropout = classifier_dropout
80
+ self.attention_dropout = attention_dropout
81
+ self.layernorm_epsilon = layernorm_epsilon
82
+ self.rmsnorm = rmsnorm
83
+ self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
84
+ self.post_layer_norm = post_layer_norm
85
+ self.add_bias_linear = add_bias_linear
86
+ self.add_qkv_bias = add_qkv_bias
87
+ self.bias_dropout_fusion = bias_dropout_fusion
88
+ self.multi_query_attention = multi_query_attention
89
+ self.multi_query_group_num = multi_query_group_num
90
+ self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
91
+ self.attention_softmax_in_fp32 = attention_softmax_in_fp32
92
+ self.fp32_residual_connection = fp32_residual_connection
93
+ self.quantization_bit = quantization_bit
94
+ self.pre_seq_len = pre_seq_len
95
+ self.prefix_projection = prefix_projection
96
+ super().__init__(**kwargs)
97
+
98
+
99
+
100
+ # flags required to enable jit fusion kernels
101
+
102
+ if sys.platform != 'darwin':
103
+ torch._C._jit_set_profiling_mode(False)
104
+ torch._C._jit_set_profiling_executor(False)
105
+ torch._C._jit_override_can_fuse_on_cpu(True)
106
+ torch._C._jit_override_can_fuse_on_gpu(True)
107
+
108
+ logger = logging.get_logger(__name__)
109
+
110
+ _CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
111
+ _CONFIG_FOR_DOC = "ChatGLM6BConfig"
112
+
113
+ CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
114
+ "THUDM/chatglm3-6b-base",
115
+ # See all ChatGLM models at https://huggingface.co/models?filter=chatglm
116
+ ]
117
+
118
+
119
+ def default_init(cls, *args, **kwargs):
120
+ return cls(*args, **kwargs)
121
+
122
+
123
+ class InvalidScoreLogitsProcessor(LogitsProcessor):
124
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
125
+ if torch.isnan(scores).any() or torch.isinf(scores).any():
126
+ scores.zero_()
127
+ scores[..., 5] = 5e4
128
+ return scores
129
+
130
+
131
+ class PrefixEncoder(torch.nn.Module):
132
+ """
133
+ The torch.nn model to encode the prefix
134
+ Input shape: (batch-size, prefix-length)
135
+ Output shape: (batch-size, prefix-length, 2*layers*hidden)
136
+ """
137
+
138
+ def __init__(self, config: ChatGLMConfig):
139
+ super().__init__()
140
+ self.prefix_projection = config.prefix_projection
141
+ if self.prefix_projection:
142
+ # Use a two-layer MLP to encode the prefix
143
+ kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
144
+ self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
145
+ self.trans = torch.nn.Sequential(
146
+ torch.nn.Linear(kv_size, config.hidden_size),
147
+ torch.nn.Tanh(),
148
+ torch.nn.Linear(config.hidden_size, kv_size)
149
+ )
150
+ else:
151
+ self.embedding = torch.nn.Embedding(config.pre_seq_len,
152
+ config.num_layers * config.kv_channels * config.multi_query_group_num * 2)
153
+
154
+ def forward(self, prefix: torch.Tensor):
155
+ if self.prefix_projection:
156
+ prefix_tokens = self.embedding(prefix)
157
+ past_key_values = self.trans(prefix_tokens)
158
+ else:
159
+ past_key_values = self.embedding(prefix)
160
+ return past_key_values
161
+
162
+
163
+ def split_tensor_along_last_dim(
164
+ tensor: torch.Tensor,
165
+ num_partitions: int,
166
+ contiguous_split_chunks: bool = False,
167
+ ) -> List[torch.Tensor]:
168
+ """Split a tensor along its last dimension.
169
+
170
+ Arguments:
171
+ tensor: input tensor.
172
+ num_partitions: number of partitions to split the tensor
173
+ contiguous_split_chunks: If True, make each chunk contiguous
174
+ in memory.
175
+
176
+ Returns:
177
+ A list of Tensors
178
+ """
179
+ # Get the size and dimension.
180
+ last_dim = tensor.dim() - 1
181
+ last_dim_size = tensor.size()[last_dim] // num_partitions
182
+ # Split.
183
+ tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
184
+ # Note: torch.split does not create contiguous tensors by default.
185
+ if contiguous_split_chunks:
186
+ return tuple(chunk.contiguous() for chunk in tensor_list)
187
+
188
+ return tensor_list
189
+
190
+
191
+ class RotaryEmbedding(nn.Module):
192
+ def __init__(self, dim, original_impl=False, device=None, dtype=None):
193
+ super().__init__()
194
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
195
+ self.register_buffer("inv_freq", inv_freq)
196
+ self.dim = dim
197
+ self.original_impl = original_impl
198
+
199
+ def forward_impl(
200
+ self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
201
+ ):
202
+ """Enhanced Transformer with Rotary Position Embedding.
203
+
204
+ Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
205
+ transformers/rope/__init__.py. MIT License:
206
+ https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
207
+ """
208
+ # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
209
+ theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
210
+
211
+ # Create position indexes `[0, 1, ..., seq_len - 1]`
212
+ seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)
213
+
214
+ # Calculate the product of position index and $\theta_i$
215
+ idx_theta = torch.outer(seq_idx, theta).float()
216
+
217
+ cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
218
+
219
+ # this is to mimic the behaviour of complex32, else we will get different results
220
+ if dtype in (torch.float16, torch.bfloat16, torch.int8):
221
+ cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
222
+ return cache
223
+
224
+ def forward(self, max_seq_len, offset=0):
225
+ return self.forward_impl(
226
+ max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
227
+ )
228
+
229
+
230
+ @torch.jit.script
231
+ def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
232
+ # x: [sq, b, np, hn]
233
+ sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
234
+ rot_dim = rope_cache.shape[-2] * 2
235
+ x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
236
+ # truncate to support variable sizes
237
+ rope_cache = rope_cache[:sq]
238
+ xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
239
+ rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
240
+ x_out2 = torch.stack(
241
+ [
242
+ xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
243
+ xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
244
+ ],
245
+ -1,
246
+ )
247
+ x_out2 = x_out2.flatten(3)
248
+ return torch.cat((x_out2, x_pass), dim=-1)
249
+
250
+
251
+ class RMSNorm(torch.nn.Module):
252
+ def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
253
+ super().__init__()
254
+ self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
255
+ self.eps = eps
256
+
257
+ def forward(self, hidden_states: torch.Tensor):
258
+ input_dtype = hidden_states.dtype
259
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
260
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
261
+
262
+ return (self.weight * hidden_states).to(input_dtype)
263
+
264
+
265
+ class CoreAttention(torch.nn.Module):
266
+ def __init__(self, config: ChatGLMConfig, layer_number):
267
+ super(CoreAttention, self).__init__()
268
+
269
+ self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
270
+ self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
271
+ if self.apply_query_key_layer_scaling:
272
+ self.attention_softmax_in_fp32 = True
273
+ self.layer_number = max(1, layer_number)
274
+
275
+ projection_size = config.kv_channels * config.num_attention_heads
276
+
277
+ # Per attention head and per partition values.
278
+ self.hidden_size_per_partition = projection_size
279
+ self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
280
+ self.num_attention_heads_per_partition = config.num_attention_heads
281
+
282
+ coeff = None
283
+ self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
284
+ if self.apply_query_key_layer_scaling:
285
+ coeff = self.layer_number
286
+ self.norm_factor *= coeff
287
+ self.coeff = coeff
288
+
289
+ self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
290
+
291
+ def forward(self, query_layer, key_layer, value_layer, attention_mask):
292
+ pytorch_major_version = int(torch.__version__.split('.')[0])
293
+ if pytorch_major_version >= 2:
294
+ query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
295
+ if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
296
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
297
+ is_causal=True)
298
+ else:
299
+ if attention_mask is not None:
300
+ attention_mask = ~attention_mask
301
+ context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
302
+ attention_mask)
303
+ context_layer = context_layer.permute(2, 0, 1, 3)
304
+ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
305
+ context_layer = context_layer.reshape(*new_context_layer_shape)
306
+ else:
307
+ # Raw attention scores
308
+
309
+ # [b, np, sq, sk]
310
+ output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
311
+
312
+ # [sq, b, np, hn] -> [sq, b * np, hn]
313
+ query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
314
+ # [sk, b, np, hn] -> [sk, b * np, hn]
315
+ key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
316
+
317
+ # preallocting input tensor: [b * np, sq, sk]
318
+ matmul_input_buffer = torch.empty(
319
+ output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
320
+ device=query_layer.device
321
+ )
322
+
323
+ # Raw attention scores. [b * np, sq, sk]
324
+ matmul_result = torch.baddbmm(
325
+ matmul_input_buffer,
326
+ query_layer.transpose(0, 1), # [b * np, sq, hn]
327
+ key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
328
+ beta=0.0,
329
+ alpha=(1.0 / self.norm_factor),
330
+ )
331
+
332
+ # change view to [b, np, sq, sk]
333
+ attention_scores = matmul_result.view(*output_size)
334
+
335
+ # ===========================
336
+ # Attention probs and dropout
337
+ # ===========================
338
+
339
+ # attention scores and attention mask [b, np, sq, sk]
340
+ if self.attention_softmax_in_fp32:
341
+ attention_scores = attention_scores.float()
342
+ if self.coeff is not None:
343
+ attention_scores = attention_scores * self.coeff
344
+ if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
345
+ attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
346
+ device=attention_scores.device, dtype=torch.bool)
347
+ attention_mask.tril_()
348
+ attention_mask = ~attention_mask
349
+ if attention_mask is not None:
350
+ attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
351
+ attention_probs = F.softmax(attention_scores, dim=-1)
352
+ attention_probs = attention_probs.type_as(value_layer)
353
+
354
+ # This is actually dropping out entire tokens to attend to, which might
355
+ # seem a bit unusual, but is taken from the original Transformer paper.
356
+ attention_probs = self.attention_dropout(attention_probs)
357
+ # =========================
358
+ # Context layer. [sq, b, hp]
359
+ # =========================
360
+
361
+ # value_layer -> context layer.
362
+ # [sk, b, np, hn] --> [b, np, sq, hn]
363
+
364
+ # context layer shape: [b, np, sq, hn]
365
+ output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
366
+ # change view [sk, b * np, hn]
367
+ value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
368
+ # change view [b * np, sq, sk]
369
+ attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
370
+ # matmul: [b * np, sq, hn]
371
+ context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
372
+ # change view [b, np, sq, hn]
373
+ context_layer = context_layer.view(*output_size)
374
+ # [b, np, sq, hn] --> [sq, b, np, hn]
375
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
376
+ # [sq, b, np, hn] --> [sq, b, hp]
377
+ new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
378
+ context_layer = context_layer.view(*new_context_layer_shape)
379
+
380
+ return context_layer
381
+
382
+
383
+ class SelfAttention(torch.nn.Module):
384
+ """Parallel self-attention layer abstract class.
385
+
386
+ Self-attention layer takes input with size [s, b, h]
387
+ and returns output of the same size.
388
+ """
389
+
390
+ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
391
+ super(SelfAttention, self).__init__()
392
+ self.layer_number = max(1, layer_number)
393
+
394
+ self.projection_size = config.kv_channels * config.num_attention_heads
395
+
396
+ # Per attention head and per partition values.
397
+ self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
398
+ self.num_attention_heads_per_partition = config.num_attention_heads
399
+
400
+ self.multi_query_attention = config.multi_query_attention
401
+ self.qkv_hidden_size = 3 * self.projection_size
402
+ if self.multi_query_attention:
403
+ self.num_multi_query_groups_per_partition = config.multi_query_group_num
404
+ self.qkv_hidden_size = (
405
+ self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
406
+ )
407
+ self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
408
+ bias=config.add_bias_linear or config.add_qkv_bias,
409
+ device=device, **_config_to_kwargs(config)
410
+ )
411
+
412
+ self.core_attention = CoreAttention(config, self.layer_number)
413
+
414
+ # Output.
415
+ self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
416
+ device=device, **_config_to_kwargs(config)
417
+ )
418
+
419
+ def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
420
+ if self.multi_query_attention:
421
+ num_attention_heads = self.num_multi_query_groups_per_partition
422
+ else:
423
+ num_attention_heads = self.num_attention_heads_per_partition
424
+ return torch.empty(
425
+ inference_max_sequence_len,
426
+ batch_size,
427
+ num_attention_heads,
428
+ self.hidden_size_per_attention_head,
429
+ dtype=dtype,
430
+ device=device,
431
+ )
432
+
433
+ def forward(
434
+ self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
435
+ ):
436
+ # hidden_states: [sq, b, h]
437
+
438
+ # =================================================
439
+ # Pre-allocate memory for key-values for inference.
440
+ # =================================================
441
+ # =====================
442
+ # Query, Key, and Value
443
+ # =====================
444
+
445
+ # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
446
+ mixed_x_layer = self.query_key_value(hidden_states)
447
+
448
+ if self.multi_query_attention:
449
+ (query_layer, key_layer, value_layer) = mixed_x_layer.split(
450
+ [
451
+ self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
452
+ self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
453
+ self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
454
+ ],
455
+ dim=-1,
456
+ )
457
+ query_layer = query_layer.view(
458
+ query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
459
+ )
460
+ key_layer = key_layer.view(
461
+ key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
462
+ )
463
+ value_layer = value_layer.view(
464
+ value_layer.size()[:-1]
465
+ + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
466
+ )
467
+ else:
468
+ new_tensor_shape = mixed_x_layer.size()[:-1] + \
469
+ (self.num_attention_heads_per_partition,
470
+ 3 * self.hidden_size_per_attention_head)
471
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
472
+
473
+ # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
474
+ (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
475
+
476
+ # apply relative positional encoding (rotary embedding)
477
+ if rotary_pos_emb is not None:
478
+ query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
479
+ key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
480
+
481
+ # adjust key and value for inference
482
+ if kv_cache is not None:
483
+ cache_k, cache_v = kv_cache
484
+ key_layer = torch.cat((cache_k, key_layer), dim=0)
485
+ value_layer = torch.cat((cache_v, value_layer), dim=0)
486
+ if use_cache:
487
+ kv_cache = (key_layer, value_layer)
488
+ else:
489
+ kv_cache = None
490
+
491
+ if self.multi_query_attention:
492
+ key_layer = key_layer.unsqueeze(-2)
493
+ key_layer = key_layer.expand(
494
+ -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
495
+ )
496
+ key_layer = key_layer.contiguous().view(
497
+ key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
498
+ )
499
+ value_layer = value_layer.unsqueeze(-2)
500
+ value_layer = value_layer.expand(
501
+ -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
502
+ )
503
+ value_layer = value_layer.contiguous().view(
504
+ value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
505
+ )
506
+
507
+ # ==================================
508
+ # core attention computation
509
+ # ==================================
510
+
511
+ context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
512
+
513
+ # =================
514
+ # Output. [sq, b, h]
515
+ # =================
516
+
517
+ output = self.dense(context_layer)
518
+
519
+ return output, kv_cache
520
+
521
+
522
+ def _config_to_kwargs(args):
523
+ common_kwargs = {
524
+ "dtype": args.torch_dtype,
525
+ }
526
+ return common_kwargs
527
+
528
+
529
+ class MLP(torch.nn.Module):
530
+ """MLP.
531
+
532
+ MLP will take the input with h hidden state, project it to 4*h
533
+ hidden dimension, perform nonlinear transformation, and project the
534
+ state back into h hidden dimension.
535
+ """
536
+
537
+ def __init__(self, config: ChatGLMConfig, device=None):
538
+ super(MLP, self).__init__()
539
+
540
+ self.add_bias = config.add_bias_linear
541
+
542
+ # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
543
+ self.dense_h_to_4h = nn.Linear(
544
+ config.hidden_size,
545
+ config.ffn_hidden_size * 2,
546
+ bias=self.add_bias,
547
+ device=device,
548
+ **_config_to_kwargs(config)
549
+ )
550
+
551
+ def swiglu(x):
552
+ x = torch.chunk(x, 2, dim=-1)
553
+ return F.silu(x[0]) * x[1]
554
+
555
+ self.activation_func = swiglu
556
+
557
+ # Project back to h.
558
+ self.dense_4h_to_h = nn.Linear(
559
+ config.ffn_hidden_size,
560
+ config.hidden_size,
561
+ bias=self.add_bias,
562
+ device=device,
563
+ **_config_to_kwargs(config)
564
+ )
565
+
566
+ def forward(self, hidden_states):
567
+ # [s, b, 4hp]
568
+ intermediate_parallel = self.dense_h_to_4h(hidden_states)
569
+ intermediate_parallel = self.activation_func(intermediate_parallel)
570
+ # [s, b, h]
571
+ output = self.dense_4h_to_h(intermediate_parallel)
572
+ return output
573
+
574
+
575
+ class GLMBlock(torch.nn.Module):
576
+ """A single transformer layer.
577
+
578
+ Transformer layer takes input with size [s, b, h] and returns an
579
+ output of the same size.
580
+ """
581
+
582
+ def __init__(self, config: ChatGLMConfig, layer_number, device=None):
583
+ super(GLMBlock, self).__init__()
584
+ self.layer_number = layer_number
585
+
586
+ self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
587
+
588
+ self.fp32_residual_connection = config.fp32_residual_connection
589
+
590
+ LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
591
+ # Layernorm on the input data.
592
+ self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
593
+ dtype=config.torch_dtype)
594
+
595
+ # Self attention.
596
+ self.self_attention = SelfAttention(config, layer_number, device=device)
597
+ self.hidden_dropout = config.hidden_dropout
598
+
599
+ # Layernorm on the attention output
600
+ self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
601
+ dtype=config.torch_dtype)
602
+
603
+ # MLP
604
+ self.mlp = MLP(config, device=device)
605
+
606
+ def forward(
607
+ self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
608
+ ):
609
+ # hidden_states: [s, b, h]
610
+
611
+ # Layer norm at the beginning of the transformer layer.
612
+ layernorm_output = self.input_layernorm(hidden_states)
613
+ # Self attention.
614
+ attention_output, kv_cache = self.self_attention(
615
+ layernorm_output,
616
+ attention_mask,
617
+ rotary_pos_emb,
618
+ kv_cache=kv_cache,
619
+ use_cache=use_cache
620
+ )
621
+
622
+ # Residual connection.
623
+ if self.apply_residual_connection_post_layernorm:
624
+ residual = layernorm_output
625
+ else:
626
+ residual = hidden_states
627
+
628
+ layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
629
+ layernorm_input = residual + layernorm_input
630
+
631
+ # Layer norm post the self attention.
632
+ layernorm_output = self.post_attention_layernorm(layernorm_input)
633
+
634
+ # MLP.
635
+ mlp_output = self.mlp(layernorm_output)
636
+
637
+ # Second residual connection.
638
+ if self.apply_residual_connection_post_layernorm:
639
+ residual = layernorm_output
640
+ else:
641
+ residual = layernorm_input
642
+
643
+ output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
644
+ output = residual + output
645
+
646
+ return output, kv_cache
647
+
648
+
649
+ class GLMTransformer(torch.nn.Module):
650
+ """Transformer class."""
651
+
652
+ def __init__(self, config: ChatGLMConfig, device=None):
653
+ super(GLMTransformer, self).__init__()
654
+
655
+ self.fp32_residual_connection = config.fp32_residual_connection
656
+ self.post_layer_norm = config.post_layer_norm
657
+
658
+ # Number of layers.
659
+ self.num_layers = config.num_layers
660
+
661
+ # Transformer layers.
662
+ def build_layer(layer_number):
663
+ return GLMBlock(config, layer_number, device=device)
664
+
665
+ self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
666
+
667
+ if self.post_layer_norm:
668
+ LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
669
+ # Final layer norm before output.
670
+ self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
671
+ dtype=config.torch_dtype)
672
+
673
+ self.gradient_checkpointing = False
674
+
675
+ def _get_layer(self, layer_number):
676
+ return self.layers[layer_number]
677
+
678
+ def forward(
679
+ self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
680
+ use_cache: Optional[bool] = True,
681
+ output_hidden_states: Optional[bool] = False,
682
+ ):
683
+ if not kv_caches:
684
+ kv_caches = [None for _ in range(self.num_layers)]
685
+ presents = () if use_cache else None
686
+ if self.gradient_checkpointing and self.training:
687
+ if use_cache:
688
+ logger.warning_once(
689
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
690
+ )
691
+ use_cache = False
692
+
693
+ all_self_attentions = None
694
+ all_hidden_states = () if output_hidden_states else None
695
+ for index in range(self.num_layers):
696
+ if output_hidden_states:
697
+ all_hidden_states = all_hidden_states + (hidden_states,)
698
+
699
+ layer = self._get_layer(index)
700
+ if self.gradient_checkpointing and self.training:
701
+ layer_ret = torch.utils.checkpoint.checkpoint(
702
+ layer,
703
+ hidden_states,
704
+ attention_mask,
705
+ rotary_pos_emb,
706
+ kv_caches[index],
707
+ use_cache
708
+ )
709
+ else:
710
+ layer_ret = layer(
711
+ hidden_states,
712
+ attention_mask,
713
+ rotary_pos_emb,
714
+ kv_cache=kv_caches[index],
715
+ use_cache=use_cache
716
+ )
717
+ hidden_states, kv_cache = layer_ret
718
+ if use_cache:
719
+ presents = presents + (kv_cache,)
720
+
721
+ if output_hidden_states:
722
+ all_hidden_states = all_hidden_states + (hidden_states,)
723
+
724
+ # Final layer norm.
725
+ if self.post_layer_norm:
726
+ hidden_states = self.final_layernorm(hidden_states)
727
+
728
+ return hidden_states, presents, all_hidden_states, all_self_attentions
729
+
730
+
731
+ class ChatGLMPreTrainedModel(PreTrainedModel):
732
+ """
733
+ An abstract class to handle weights initialization and
734
+ a simple interface for downloading and loading pretrained models.
735
+ """
736
+
737
+ is_parallelizable = False
738
+ supports_gradient_checkpointing = True
739
+ config_class = ChatGLMConfig
740
+ base_model_prefix = "transformer"
741
+ _no_split_modules = ["GLMBlock"]
742
+
743
+ def _init_weights(self, module: nn.Module):
744
+ """Initialize the weights."""
745
+ return
746
+
747
+ def get_masks(self, input_ids, past_key_values, padding_mask=None):
748
+ batch_size, seq_length = input_ids.shape
749
+ full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
750
+ full_attention_mask.tril_()
751
+ past_length = 0
752
+ if past_key_values:
753
+ past_length = past_key_values[0][0].shape[0]
754
+ if past_length:
755
+ full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
756
+ device=input_ids.device), full_attention_mask), dim=-1)
757
+ if padding_mask is not None:
758
+ full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
759
+ if not past_length and padding_mask is not None:
760
+ full_attention_mask -= padding_mask.unsqueeze(-1) - 1
761
+ full_attention_mask = (full_attention_mask < 0.5).bool()
762
+ full_attention_mask.unsqueeze_(1)
763
+ return full_attention_mask
764
+
765
+ def get_position_ids(self, input_ids, device):
766
+ batch_size, seq_length = input_ids.shape
767
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
768
+ return position_ids
769
+
770
+ def _set_gradient_checkpointing(self, module, value=False):
771
+ if isinstance(module, GLMTransformer):
772
+ module.gradient_checkpointing = value
773
+
774
+
775
+ class Embedding(torch.nn.Module):
776
+ """Language model embeddings."""
777
+
778
+ def __init__(self, config: ChatGLMConfig, device=None):
779
+ super(Embedding, self).__init__()
780
+
781
+ self.hidden_size = config.hidden_size
782
+ # Word embeddings (parallel).
783
+ self.word_embeddings = nn.Embedding(
784
+ config.padded_vocab_size,
785
+ self.hidden_size,
786
+ dtype=config.torch_dtype,
787
+ device=device
788
+ )
789
+ self.fp32_residual_connection = config.fp32_residual_connection
790
+
791
+ def forward(self, input_ids):
792
+ # Embeddings.
793
+ words_embeddings = self.word_embeddings(input_ids)
794
+ embeddings = words_embeddings
795
+ # Data format change to avoid explicit tranposes : [b s h] --> [s b h].
796
+ embeddings = embeddings.transpose(0, 1).contiguous()
797
+ # If the input flag for fp32 residual connection is set, convert for float.
798
+ if self.fp32_residual_connection:
799
+ embeddings = embeddings.float()
800
+ return embeddings
801
+
802
+
803
+ class ChatGLMModel(ChatGLMPreTrainedModel):
804
+ def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
805
+ super().__init__(config)
806
+ if empty_init:
807
+ init_method = skip_init
808
+ else:
809
+ init_method = default_init
810
+ init_kwargs = {}
811
+ if device is not None:
812
+ init_kwargs["device"] = device
813
+ self.embedding = init_method(Embedding, config, **init_kwargs)
814
+ self.num_layers = config.num_layers
815
+ self.multi_query_group_num = config.multi_query_group_num
816
+ self.kv_channels = config.kv_channels
817
+
818
+ # Rotary positional embeddings
819
+ self.seq_length = config.seq_length
820
+ rotary_dim = (
821
+ config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
822
+ )
823
+
824
+ self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
825
+ dtype=config.torch_dtype)
826
+ self.encoder = init_method(GLMTransformer, config, **init_kwargs)
827
+ self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
828
+ dtype=config.torch_dtype, **init_kwargs)
829
+ self.pre_seq_len = config.pre_seq_len
830
+ self.prefix_projection = config.prefix_projection
831
+ if self.pre_seq_len is not None:
832
+ for param in self.parameters():
833
+ param.requires_grad = False
834
+ self.prefix_tokens = torch.arange(self.pre_seq_len).long()
835
+ self.prefix_encoder = PrefixEncoder(config)
836
+ self.dropout = torch.nn.Dropout(0.1)
837
+
838
+ def get_input_embeddings(self):
839
+ return self.embedding.word_embeddings
840
+
841
+ def get_prompt(self, batch_size, device, dtype=torch.half):
842
+ prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
843
+ past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
844
+ past_key_values = past_key_values.view(
845
+ batch_size,
846
+ self.pre_seq_len,
847
+ self.num_layers * 2,
848
+ self.multi_query_group_num,
849
+ self.kv_channels
850
+ )
851
+ # seq_len, b, nh, hidden_size
852
+ past_key_values = self.dropout(past_key_values)
853
+ past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
854
+ return past_key_values
855
+
856
+ def forward(
857
+ self,
858
+ input_ids,
859
+ position_ids: Optional[torch.Tensor] = None,
860
+ attention_mask: Optional[torch.BoolTensor] = None,
861
+ full_attention_mask: Optional[torch.BoolTensor] = None,
862
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
863
+ inputs_embeds: Optional[torch.Tensor] = None,
864
+ use_cache: Optional[bool] = None,
865
+ output_hidden_states: Optional[bool] = None,
866
+ return_dict: Optional[bool] = None,
867
+ ):
868
+ output_hidden_states = (
869
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
870
+ )
871
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
872
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
873
+
874
+ batch_size, seq_length = input_ids.shape
875
+
876
+ if inputs_embeds is None:
877
+ inputs_embeds = self.embedding(input_ids)
878
+
879
+ if self.pre_seq_len is not None:
880
+ if past_key_values is None:
881
+ past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
882
+ dtype=inputs_embeds.dtype)
883
+ if attention_mask is not None:
884
+ attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
885
+ attention_mask], dim=-1)
886
+
887
+ if full_attention_mask is None:
888
+ if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
889
+ full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
890
+
891
+ # Rotary positional embeddings
892
+ rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
893
+ if position_ids is not None:
894
+ rotary_pos_emb = rotary_pos_emb[position_ids]
895
+ else:
896
+ rotary_pos_emb = rotary_pos_emb[None, :seq_length]
897
+ rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
898
+
899
+ # Run encoder.
900
+ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
901
+ inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
902
+ kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
903
+ )
904
+
905
+ if not return_dict:
906
+ return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
907
+
908
+ return BaseModelOutputWithPast(
909
+ last_hidden_state=hidden_states,
910
+ past_key_values=presents,
911
+ hidden_states=all_hidden_states,
912
+ attentions=all_self_attentions,
913
+ )
914
+
915
+ def quantize(self, weight_bit_width: int):
916
+ from .quantization import quantize
917
+ quantize(self.encoder, weight_bit_width)
918
+ return self
919
+
920
+
921
+ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
922
+ def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
923
+ super().__init__(config)
924
+
925
+ self.max_sequence_length = config.max_length
926
+ self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
927
+ self.config = config
928
+ self.quantized = False
929
+
930
+ if self.config.quantization_bit:
931
+ self.quantize(self.config.quantization_bit, empty_init=True)
932
+
933
+ def _update_model_kwargs_for_generation(
934
+ self,
935
+ outputs: ModelOutput,
936
+ model_kwargs: Dict[str, Any],
937
+ is_encoder_decoder: bool = False,
938
+ standardize_cache_format: bool = False,
939
+ ) -> Dict[str, Any]:
940
+ # update past_key_values
941
+ model_kwargs["past_key_values"] = self._extract_past_from_model_output(
942
+ outputs, standardize_cache_format=standardize_cache_format
943
+ )
944
+
945
+ # update attention mask
946
+ if "attention_mask" in model_kwargs:
947
+ attention_mask = model_kwargs["attention_mask"]
948
+ model_kwargs["attention_mask"] = torch.cat(
949
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
950
+ )
951
+
952
+ # update position ids
953
+ if "position_ids" in model_kwargs:
954
+ position_ids = model_kwargs["position_ids"]
955
+ new_position_id = position_ids[..., -1:].clone()
956
+ new_position_id += 1
957
+ model_kwargs["position_ids"] = torch.cat(
958
+ [position_ids, new_position_id], dim=-1
959
+ )
960
+
961
+ model_kwargs["is_first_forward"] = False
962
+ return model_kwargs
963
+
964
+ def prepare_inputs_for_generation(
965
+ self,
966
+ input_ids: torch.LongTensor,
967
+ past_key_values: Optional[torch.Tensor] = None,
968
+ attention_mask: Optional[torch.Tensor] = None,
969
+ position_ids: Optional[torch.Tensor] = None,
970
+ use_cache: Optional[bool] = None,
971
+ is_first_forward: bool = True,
972
+ **kwargs
973
+ ) -> dict:
974
+ # only last token for input_ids if past is not None
975
+ if position_ids is None:
976
+ position_ids = self.get_position_ids(input_ids, device=input_ids.device)
977
+ if not is_first_forward:
978
+ if past_key_values is not None:
979
+ position_ids = position_ids[..., -1:]
980
+ input_ids = input_ids[:, -1:]
981
+ return {
982
+ "input_ids": input_ids,
983
+ "past_key_values": past_key_values,
984
+ "position_ids": position_ids,
985
+ "attention_mask": attention_mask,
986
+ "return_last_logit": True,
987
+ "use_cache": use_cache
988
+ }
989
+
990
+ def forward(
991
+ self,
992
+ input_ids: Optional[torch.Tensor] = None,
993
+ position_ids: Optional[torch.Tensor] = None,
994
+ attention_mask: Optional[torch.Tensor] = None,
995
+ past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
996
+ inputs_embeds: Optional[torch.Tensor] = None,
997
+ labels: Optional[torch.Tensor] = None,
998
+ use_cache: Optional[bool] = None,
999
+ output_attentions: Optional[bool] = None,
1000
+ output_hidden_states: Optional[bool] = None,
1001
+ return_dict: Optional[bool] = None,
1002
+ return_last_logit: Optional[bool] = False,
1003
+ ):
1004
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1005
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1006
+
1007
+ transformer_outputs = self.transformer(
1008
+ input_ids=input_ids,
1009
+ position_ids=position_ids,
1010
+ attention_mask=attention_mask,
1011
+ past_key_values=past_key_values,
1012
+ inputs_embeds=inputs_embeds,
1013
+ use_cache=use_cache,
1014
+ output_hidden_states=output_hidden_states,
1015
+ return_dict=return_dict,
1016
+ )
1017
+
1018
+ hidden_states = transformer_outputs[0]
1019
+ if return_last_logit:
1020
+ hidden_states = hidden_states[-1:]
1021
+ lm_logits = self.transformer.output_layer(hidden_states)
1022
+ lm_logits = lm_logits.transpose(0, 1).contiguous()
1023
+
1024
+ loss = None
1025
+ if labels is not None:
1026
+ lm_logits = lm_logits.to(torch.float32)
1027
+
1028
+ # Shift so that tokens < n predict n
1029
+ shift_logits = lm_logits[..., :-1, :].contiguous()
1030
+ shift_labels = labels[..., 1:].contiguous()
1031
+ # Flatten the tokens
1032
+ loss_fct = CrossEntropyLoss(ignore_index=-100)
1033
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
1034
+
1035
+ lm_logits = lm_logits.to(hidden_states.dtype)
1036
+ loss = loss.to(hidden_states.dtype)
1037
+
1038
+ if not return_dict:
1039
+ output = (lm_logits,) + transformer_outputs[1:]
1040
+ return ((loss,) + output) if loss is not None else output
1041
+
1042
+ return CausalLMOutputWithPast(
1043
+ loss=loss,
1044
+ logits=lm_logits,
1045
+ past_key_values=transformer_outputs.past_key_values,
1046
+ hidden_states=transformer_outputs.hidden_states,
1047
+ attentions=transformer_outputs.attentions,
1048
+ )
1049
+
1050
+ @staticmethod
1051
+ def _reorder_cache(
1052
+ past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
1053
+ ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
1054
+ """
1055
+ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
1056
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
1057
+ beam_idx at every generation step.
1058
+
1059
+ Output shares the same memory storage as `past`.
1060
+ """
1061
+ return tuple(
1062
+ (
1063
+ layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
1064
+ layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
1065
+ )
1066
+ for layer_past in past
1067
+ )
1068
+
1069
+ def process_response(self, output, history):
1070
+ content = ""
1071
+ history = deepcopy(history)
1072
+ for response in output.split("<|assistant|>"):
1073
+ metadata, content = response.split("\n", maxsplit=1)
1074
+ if not metadata.strip():
1075
+ content = content.strip()
1076
+ history.append({"role": "assistant", "metadata": metadata, "content": content})
1077
+ content = content.replace("[[训练时间]]", "2023年")
1078
+ else:
1079
+ history.append({"role": "assistant", "metadata": metadata, "content": content})
1080
+ if history[0]["role"] == "system" and "tools" in history[0]:
1081
+ content = "\n".join(content.split("\n")[1:-1])
1082
+ def tool_call(**kwargs):
1083
+ return kwargs
1084
+ parameters = eval(content)
1085
+ content = {"name": metadata.strip(), "parameters": parameters}
1086
+ else:
1087
+ content = {"name": metadata.strip(), "content": content}
1088
+ return content, history
1089
+
1090
+ @torch.inference_mode()
1091
+ def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
1092
+ max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
1093
+ **kwargs):
1094
+ if history is None:
1095
+ history = []
1096
+ if logits_processor is None:
1097
+ logits_processor = LogitsProcessorList()
1098
+ logits_processor.append(InvalidScoreLogitsProcessor())
1099
+ gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
1100
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
1101
+ inputs = tokenizer.build_chat_input(query, history=history, role=role)
1102
+ inputs = inputs.to(self.device)
1103
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
1104
+ tokenizer.get_command("<|observation|>")]
1105
+ outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
1106
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
1107
+ response = tokenizer.decode(outputs)
1108
+ history.append({"role": role, "content": query})
1109
+ response, history = self.process_response(response, history)
1110
+ return response, history
1111
+
1112
+ @torch.inference_mode()
1113
+ def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
1114
+ past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
1115
+ logits_processor=None, return_past_key_values=False, **kwargs):
1116
+ if history is None:
1117
+ history = []
1118
+ if logits_processor is None:
1119
+ logits_processor = LogitsProcessorList()
1120
+ logits_processor.append(InvalidScoreLogitsProcessor())
1121
+ eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
1122
+ tokenizer.get_command("<|observation|>")]
1123
+ gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
1124
+ "temperature": temperature, "logits_processor": logits_processor, **kwargs}
1125
+ if past_key_values is None:
1126
+ inputs = tokenizer.build_chat_input(query, history=history, role=role)
1127
+ else:
1128
+ inputs = tokenizer.build_chat_input(query, role=role)
1129
+ inputs = inputs.to(self.device)
1130
+ if past_key_values is not None:
1131
+ past_length = past_key_values[0][0].shape[0]
1132
+ if self.transformer.pre_seq_len is not None:
1133
+ past_length -= self.transformer.pre_seq_len
1134
+ inputs.position_ids += past_length
1135
+ attention_mask = inputs.attention_mask
1136
+ attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
1137
+ inputs['attention_mask'] = attention_mask
1138
+ history.append({"role": role, "content": query})
1139
+ for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
1140
+ eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
1141
+ **gen_kwargs):
1142
+ if return_past_key_values:
1143
+ outputs, past_key_values = outputs
1144
+ outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
1145
+ response = tokenizer.decode(outputs)
1146
+ if response and response[-1] != "�":
1147
+ response, new_history = self.process_response(response, history)
1148
+ if return_past_key_values:
1149
+ yield response, new_history, past_key_values
1150
+ else:
1151
+ yield response, new_history
1152
+
1153
+ @torch.inference_mode()
1154
+ def stream_generate(
1155
+ self,
1156
+ input_ids,
1157
+ generation_config: Optional[GenerationConfig] = None,
1158
+ logits_processor: Optional[LogitsProcessorList] = None,
1159
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
1160
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
1161
+ return_past_key_values=False,
1162
+ **kwargs,
1163
+ ):
1164
+ batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
1165
+
1166
+ if generation_config is None:
1167
+ generation_config = self.generation_config
1168
+ generation_config = copy.deepcopy(generation_config)
1169
+ model_kwargs = generation_config.update(**kwargs)
1170
+ model_kwargs["use_cache"] = generation_config.use_cache
1171
+ bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
1172
+
1173
+ if isinstance(eos_token_id, int):
1174
+ eos_token_id = [eos_token_id]
1175
+ eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
1176
+
1177
+ has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
1178
+ if has_default_max_length and generation_config.max_new_tokens is None:
1179
+ warnings.warn(
1180
+ f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
1181
+ "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
1182
+ " recommend using `max_new_tokens` to control the maximum length of the generation.",
1183
+ UserWarning,
1184
+ )
1185
+ elif generation_config.max_new_tokens is not None:
1186
+ generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
1187
+ if not has_default_max_length:
1188
+ logger.warn(
1189
+ f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
1190
+ f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
1191
+ "Please refer to the documentation for more information. "
1192
+ "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
1193
+ UserWarning,
1194
+ )
1195
+
1196
+ if input_ids_seq_length >= generation_config.max_length:
1197
+ input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
1198
+ logger.warning(
1199
+ f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
1200
+ f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
1201
+ " increasing `max_new_tokens`."
1202
+ )
1203
+
1204
+ # 2. Set generation parameters if not already defined
1205
+ logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
1206
+ stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
1207
+
1208
+ logits_processor = self._get_logits_processor(
1209
+ generation_config=generation_config,
1210
+ input_ids_seq_length=input_ids_seq_length,
1211
+ encoder_input_ids=input_ids,
1212
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
1213
+ logits_processor=logits_processor,
1214
+ )
1215
+
1216
+ stopping_criteria = self._get_stopping_criteria(
1217
+ generation_config=generation_config, stopping_criteria=stopping_criteria
1218
+ )
1219
+ logits_warper = self._get_logits_warper(generation_config)
1220
+
1221
+ unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
1222
+ scores = None
1223
+ while True:
1224
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
1225
+ # forward pass to get next token
1226
+ outputs = self(
1227
+ **model_inputs,
1228
+ return_dict=True,
1229
+ output_attentions=False,
1230
+ output_hidden_states=False,
1231
+ )
1232
+
1233
+ next_token_logits = outputs.logits[:, -1, :]
1234
+
1235
+ # pre-process distribution
1236
+ next_token_scores = logits_processor(input_ids, next_token_logits)
1237
+ next_token_scores = logits_warper(input_ids, next_token_scores)
1238
+
1239
+ # sample
1240
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
1241
+ if generation_config.do_sample:
1242
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1243
+ else:
1244
+ next_tokens = torch.argmax(probs, dim=-1)
1245
+ # update generated ids, model inputs, and length for next step
1246
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
1247
+ model_kwargs = self._update_model_kwargs_for_generation(
1248
+ outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
1249
+ )
1250
+ unfinished_sequences = unfinished_sequences.mul(
1251
+ next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
1252
+ )
1253
+ if return_past_key_values:
1254
+ yield input_ids, outputs.past_key_values
1255
+ else:
1256
+ yield input_ids
1257
+ # stop when each sentence is finished, or if we exceed the maximum length
1258
+ if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
1259
+ break
1260
+
1261
+ def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
1262
+ if bits == 0:
1263
+ return
1264
+
1265
+ from .quantization import quantize
1266
+
1267
+ if self.quantized:
1268
+ logger.info("Already quantized.")
1269
+ return self
1270
+
1271
+ self.quantized = True
1272
+
1273
+ self.config.quantization_bit = bits
1274
+
1275
+ self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
1276
+ **kwargs)
1277
+ return self
1278
+
1279
+
1280
+ class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
1281
+ def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
1282
+ super().__init__(config)
1283
+
1284
+ self.num_labels = config.num_labels
1285
+ self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
1286
+
1287
+ self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
1288
+ if config.classifier_dropout is not None:
1289
+ self.dropout = nn.Dropout(config.classifier_dropout)
1290
+ else:
1291
+ self.dropout = None
1292
+ self.config = config
1293
+
1294
+ if self.config.quantization_bit:
1295
+ self.quantize(self.config.quantization_bit, empty_init=True)
1296
+
1297
+ def forward(
1298
+ self,
1299
+ input_ids: Optional[torch.LongTensor] = None,
1300
+ position_ids: Optional[torch.LongTensor] = None,
1301
+ attention_mask: Optional[torch.Tensor] = None,
1302
+ full_attention_mask: Optional[torch.Tensor] = None,
1303
+ past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
1304
+ inputs_embeds: Optional[torch.LongTensor] = None,
1305
+ labels: Optional[torch.LongTensor] = None,
1306
+ use_cache: Optional[bool] = None,
1307
+ output_hidden_states: Optional[bool] = None,
1308
+ return_dict: Optional[bool] = None,
1309
+ ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
1310
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1311
+
1312
+ transformer_outputs = self.transformer(
1313
+ input_ids=input_ids,
1314
+ position_ids=position_ids,
1315
+ attention_mask=attention_mask,
1316
+ full_attention_mask=full_attention_mask,
1317
+ past_key_values=past_key_values,
1318
+ inputs_embeds=inputs_embeds,
1319
+ use_cache=use_cache,
1320
+ output_hidden_states=output_hidden_states,
1321
+ return_dict=return_dict,
1322
+ )
1323
+
1324
+ hidden_states = transformer_outputs[0]
1325
+ pooled_hidden_states = hidden_states[-1]
1326
+ if self.dropout is not None:
1327
+ pooled_hidden_states = self.dropout(pooled_hidden_states)
1328
+ logits = self.classifier_head(pooled_hidden_states)
1329
+
1330
+ loss = None
1331
+ if labels is not None:
1332
+ if self.config.problem_type is None:
1333
+ if self.num_labels == 1:
1334
+ self.config.problem_type = "regression"
1335
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1336
+ self.config.problem_type = "single_label_classification"
1337
+ else:
1338
+ self.config.problem_type = "multi_label_classification"
1339
+
1340
+ if self.config.problem_type == "regression":
1341
+ loss_fct = MSELoss()
1342
+ if self.num_labels == 1:
1343
+ loss = loss_fct(logits.squeeze().float(), labels.squeeze())
1344
+ else:
1345
+ loss = loss_fct(logits.float(), labels)
1346
+ elif self.config.problem_type == "single_label_classification":
1347
+ loss_fct = CrossEntropyLoss()
1348
+ loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
1349
+ elif self.config.problem_type == "multi_label_classification":
1350
+ loss_fct = BCEWithLogitsLoss()
1351
+ loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))
1352
+
1353
+ if not return_dict:
1354
+ output = (logits,) + transformer_outputs[1:]
1355
+ return ((loss,) + output) if loss is not None else output
1356
+
1357
+ return SequenceClassifierOutputWithPast(
1358
+ loss=loss,
1359
+ logits=logits,
1360
+ past_key_values=transformer_outputs.past_key_values,
1361
+ hidden_states=transformer_outputs.hidden_states,
1362
+ attentions=transformer_outputs.attentions,
1363
+ )
diffsynth/models/sd3_dit.py ADDED
@@ -0,0 +1,797 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from einops import rearrange
3
+ from .svd_unet import TemporalTimesteps
4
+ from .tiler import TileWorker
5
+
6
+
7
+
8
+ class PatchEmbed(torch.nn.Module):
9
+ def __init__(self, patch_size=2, in_channels=16, embed_dim=1536, pos_embed_max_size=192):
10
+ super().__init__()
11
+ self.pos_embed_max_size = pos_embed_max_size
12
+ self.patch_size = patch_size
13
+
14
+ self.proj = torch.nn.Conv2d(in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size)
15
+ self.pos_embed = torch.nn.Parameter(torch.zeros(1, self.pos_embed_max_size, self.pos_embed_max_size, 1536))
16
+
17
+ def cropped_pos_embed(self, height, width):
18
+ height = height // self.patch_size
19
+ width = width // self.patch_size
20
+ top = (self.pos_embed_max_size - height) // 2
21
+ left = (self.pos_embed_max_size - width) // 2
22
+ spatial_pos_embed = self.pos_embed[:, top : top + height, left : left + width, :].flatten(1, 2)
23
+ return spatial_pos_embed
24
+
25
+ def forward(self, latent):
26
+ height, width = latent.shape[-2:]
27
+ latent = self.proj(latent)
28
+ latent = latent.flatten(2).transpose(1, 2)
29
+ pos_embed = self.cropped_pos_embed(height, width)
30
+ return latent + pos_embed
31
+
32
+
33
+
34
+ class TimestepEmbeddings(torch.nn.Module):
35
+ def __init__(self, dim_in, dim_out):
36
+ super().__init__()
37
+ self.time_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0)
38
+ self.timestep_embedder = torch.nn.Sequential(
39
+ torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
40
+ )
41
+
42
+ def forward(self, timestep, dtype):
43
+ time_emb = self.time_proj(timestep).to(dtype)
44
+ time_emb = self.timestep_embedder(time_emb)
45
+ return time_emb
46
+
47
+
48
+
49
+ class AdaLayerNorm(torch.nn.Module):
50
+ def __init__(self, dim, single=False):
51
+ super().__init__()
52
+ self.single = single
53
+ self.linear = torch.nn.Linear(dim, dim * (2 if single else 6))
54
+ self.norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
55
+
56
+ def forward(self, x, emb):
57
+ emb = self.linear(torch.nn.functional.silu(emb))
58
+ if self.single:
59
+ scale, shift = emb.unsqueeze(1).chunk(2, dim=2)
60
+ x = self.norm(x) * (1 + scale) + shift
61
+ return x
62
+ else:
63
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.unsqueeze(1).chunk(6, dim=2)
64
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
65
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
66
+
67
+
68
+
69
+ class JointAttention(torch.nn.Module):
70
+ def __init__(self, dim_a, dim_b, num_heads, head_dim, only_out_a=False):
71
+ super().__init__()
72
+ self.num_heads = num_heads
73
+ self.head_dim = head_dim
74
+ self.only_out_a = only_out_a
75
+
76
+ self.a_to_qkv = torch.nn.Linear(dim_a, dim_a * 3)
77
+ self.b_to_qkv = torch.nn.Linear(dim_b, dim_b * 3)
78
+
79
+ self.a_to_out = torch.nn.Linear(dim_a, dim_a)
80
+ if not only_out_a:
81
+ self.b_to_out = torch.nn.Linear(dim_b, dim_b)
82
+
83
+ def forward(self, hidden_states_a, hidden_states_b):
84
+ batch_size = hidden_states_a.shape[0]
85
+
86
+ qkv = torch.concat([self.a_to_qkv(hidden_states_a), self.b_to_qkv(hidden_states_b)], dim=1)
87
+ qkv = qkv.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
88
+ q, k, v = qkv.chunk(3, dim=1)
89
+
90
+ hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v)
91
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
92
+ hidden_states = hidden_states.to(q.dtype)
93
+ hidden_states_a, hidden_states_b = hidden_states[:, :hidden_states_a.shape[1]], hidden_states[:, hidden_states_a.shape[1]:]
94
+ hidden_states_a = self.a_to_out(hidden_states_a)
95
+ if self.only_out_a:
96
+ return hidden_states_a
97
+ else:
98
+ hidden_states_b = self.b_to_out(hidden_states_b)
99
+ return hidden_states_a, hidden_states_b
100
+
101
+
102
+
103
+ class JointTransformerBlock(torch.nn.Module):
104
+ def __init__(self, dim, num_attention_heads):
105
+ super().__init__()
106
+ self.norm1_a = AdaLayerNorm(dim)
107
+ self.norm1_b = AdaLayerNorm(dim)
108
+
109
+ self.attn = JointAttention(dim, dim, num_attention_heads, dim // num_attention_heads)
110
+
111
+ self.norm2_a = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
112
+ self.ff_a = torch.nn.Sequential(
113
+ torch.nn.Linear(dim, dim*4),
114
+ torch.nn.GELU(approximate="tanh"),
115
+ torch.nn.Linear(dim*4, dim)
116
+ )
117
+
118
+ self.norm2_b = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
119
+ self.ff_b = torch.nn.Sequential(
120
+ torch.nn.Linear(dim, dim*4),
121
+ torch.nn.GELU(approximate="tanh"),
122
+ torch.nn.Linear(dim*4, dim)
123
+ )
124
+
125
+
126
+ def forward(self, hidden_states_a, hidden_states_b, temb):
127
+ norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
128
+ norm_hidden_states_b, gate_msa_b, shift_mlp_b, scale_mlp_b, gate_mlp_b = self.norm1_b(hidden_states_b, emb=temb)
129
+
130
+ # Attention
131
+ attn_output_a, attn_output_b = self.attn(norm_hidden_states_a, norm_hidden_states_b)
132
+
133
+ # Part A
134
+ hidden_states_a = hidden_states_a + gate_msa_a * attn_output_a
135
+ norm_hidden_states_a = self.norm2_a(hidden_states_a) * (1 + scale_mlp_a) + shift_mlp_a
136
+ hidden_states_a = hidden_states_a + gate_mlp_a * self.ff_a(norm_hidden_states_a)
137
+
138
+ # Part B
139
+ hidden_states_b = hidden_states_b + gate_msa_b * attn_output_b
140
+ norm_hidden_states_b = self.norm2_b(hidden_states_b) * (1 + scale_mlp_b) + shift_mlp_b
141
+ hidden_states_b = hidden_states_b + gate_mlp_b * self.ff_b(norm_hidden_states_b)
142
+
143
+ return hidden_states_a, hidden_states_b
144
+
145
+
146
+
147
+ class JointTransformerFinalBlock(torch.nn.Module):
148
+ def __init__(self, dim, num_attention_heads):
149
+ super().__init__()
150
+ self.norm1_a = AdaLayerNorm(dim)
151
+ self.norm1_b = AdaLayerNorm(dim, single=True)
152
+
153
+ self.attn = JointAttention(dim, dim, num_attention_heads, dim // num_attention_heads, only_out_a=True)
154
+
155
+ self.norm2_a = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
156
+ self.ff_a = torch.nn.Sequential(
157
+ torch.nn.Linear(dim, dim*4),
158
+ torch.nn.GELU(approximate="tanh"),
159
+ torch.nn.Linear(dim*4, dim)
160
+ )
161
+
162
+
163
+ def forward(self, hidden_states_a, hidden_states_b, temb):
164
+ norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
165
+ norm_hidden_states_b = self.norm1_b(hidden_states_b, emb=temb)
166
+
167
+ # Attention
168
+ attn_output_a = self.attn(norm_hidden_states_a, norm_hidden_states_b)
169
+
170
+ # Part A
171
+ hidden_states_a = hidden_states_a + gate_msa_a * attn_output_a
172
+ norm_hidden_states_a = self.norm2_a(hidden_states_a) * (1 + scale_mlp_a) + shift_mlp_a
173
+ hidden_states_a = hidden_states_a + gate_mlp_a * self.ff_a(norm_hidden_states_a)
174
+
175
+ return hidden_states_a, hidden_states_b
176
+
177
+
178
+
179
+ class SD3DiT(torch.nn.Module):
180
+ def __init__(self):
181
+ super().__init__()
182
+ self.pos_embedder = PatchEmbed(patch_size=2, in_channels=16, embed_dim=1536, pos_embed_max_size=192)
183
+ self.time_embedder = TimestepEmbeddings(256, 1536)
184
+ self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(2048, 1536), torch.nn.SiLU(), torch.nn.Linear(1536, 1536))
185
+ self.context_embedder = torch.nn.Linear(4096, 1536)
186
+ self.blocks = torch.nn.ModuleList([JointTransformerBlock(1536, 24) for _ in range(23)] + [JointTransformerFinalBlock(1536, 24)])
187
+ self.norm_out = AdaLayerNorm(1536, single=True)
188
+ self.proj_out = torch.nn.Linear(1536, 64)
189
+
190
+ def tiled_forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tile_size=128, tile_stride=64):
191
+ # Due to the global positional embedding, we cannot implement layer-wise tiled forward.
192
+ hidden_states = TileWorker().tiled_forward(
193
+ lambda x: self.forward(x, timestep, prompt_emb, pooled_prompt_emb),
194
+ hidden_states,
195
+ tile_size,
196
+ tile_stride,
197
+ tile_device=hidden_states.device,
198
+ tile_dtype=hidden_states.dtype
199
+ )
200
+ return hidden_states
201
+
202
+ def forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tiled=False, tile_size=128, tile_stride=64, use_gradient_checkpointing=False):
203
+ if tiled:
204
+ return self.tiled_forward(hidden_states, timestep, prompt_emb, pooled_prompt_emb, tile_size, tile_stride)
205
+ conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
206
+ prompt_emb = self.context_embedder(prompt_emb)
207
+
208
+ height, width = hidden_states.shape[-2:]
209
+ hidden_states = self.pos_embedder(hidden_states)
210
+
211
+ def create_custom_forward(module):
212
+ def custom_forward(*inputs):
213
+ return module(*inputs)
214
+ return custom_forward
215
+
216
+ for block in self.blocks:
217
+ if self.training and use_gradient_checkpointing:
218
+ hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
219
+ create_custom_forward(block),
220
+ hidden_states, prompt_emb, conditioning,
221
+ use_reentrant=False,
222
+ )
223
+ else:
224
+ hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning)
225
+
226
+ hidden_states = self.norm_out(hidden_states, conditioning)
227
+ hidden_states = self.proj_out(hidden_states)
228
+ hidden_states = rearrange(hidden_states, "B (H W) (P Q C) -> B C (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
229
+ return hidden_states
230
+
231
+ def state_dict_converter(self):
232
+ return SD3DiTStateDictConverter()
233
+
234
+
235
+
236
+ class SD3DiTStateDictConverter:
237
+ def __init__(self):
238
+ pass
239
+
240
+ def from_diffusers(self, state_dict):
241
+ rename_dict = {
242
+ "context_embedder": "context_embedder",
243
+ "pos_embed.pos_embed": "pos_embedder.pos_embed",
244
+ "pos_embed.proj": "pos_embedder.proj",
245
+ "time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
246
+ "time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
247
+ "time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
248
+ "time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
249
+ "norm_out.linear": "norm_out.linear",
250
+ "proj_out": "proj_out",
251
+
252
+ "norm1.linear": "norm1_a.linear",
253
+ "norm1_context.linear": "norm1_b.linear",
254
+ "attn.to_q": "attn.a_to_q",
255
+ "attn.to_k": "attn.a_to_k",
256
+ "attn.to_v": "attn.a_to_v",
257
+ "attn.to_out.0": "attn.a_to_out",
258
+ "attn.add_q_proj": "attn.b_to_q",
259
+ "attn.add_k_proj": "attn.b_to_k",
260
+ "attn.add_v_proj": "attn.b_to_v",
261
+ "attn.to_add_out": "attn.b_to_out",
262
+ "ff.net.0.proj": "ff_a.0",
263
+ "ff.net.2": "ff_a.2",
264
+ "ff_context.net.0.proj": "ff_b.0",
265
+ "ff_context.net.2": "ff_b.2",
266
+ }
267
+ state_dict_ = {}
268
+ for name, param in state_dict.items():
269
+ if name in rename_dict:
270
+ if name == "pos_embed.pos_embed":
271
+ param = param.reshape((1, 192, 192, 1536))
272
+ state_dict_[rename_dict[name]] = param
273
+ elif name.endswith(".weight") or name.endswith(".bias"):
274
+ suffix = ".weight" if name.endswith(".weight") else ".bias"
275
+ prefix = name[:-len(suffix)]
276
+ if prefix in rename_dict:
277
+ state_dict_[rename_dict[prefix] + suffix] = param
278
+ elif prefix.startswith("transformer_blocks."):
279
+ names = prefix.split(".")
280
+ names[0] = "blocks"
281
+ middle = ".".join(names[2:])
282
+ if middle in rename_dict:
283
+ name_ = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
284
+ state_dict_[name_] = param
285
+ return state_dict_
286
+
287
+ def from_civitai(self, state_dict):
288
+ rename_dict = {
289
+ "model.diffusion_model.context_embedder.bias": "context_embedder.bias",
290
+ "model.diffusion_model.context_embedder.weight": "context_embedder.weight",
291
+ "model.diffusion_model.final_layer.linear.bias": "proj_out.bias",
292
+ "model.diffusion_model.final_layer.linear.weight": "proj_out.weight",
293
+ "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias": "blocks.0.norm1_b.linear.bias",
294
+ "model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.weight": "blocks.0.norm1_b.linear.weight",
295
+ "model.diffusion_model.joint_blocks.0.context_block.attn.proj.bias": "blocks.0.attn.b_to_out.bias",
296
+ "model.diffusion_model.joint_blocks.0.context_block.attn.proj.weight": "blocks.0.attn.b_to_out.weight",
297
+ "model.diffusion_model.joint_blocks.0.context_block.attn.qkv.bias": ['blocks.0.attn.b_to_q.bias', 'blocks.0.attn.b_to_k.bias', 'blocks.0.attn.b_to_v.bias'],
298
+ "model.diffusion_model.joint_blocks.0.context_block.attn.qkv.weight": ['blocks.0.attn.b_to_q.weight', 'blocks.0.attn.b_to_k.weight', 'blocks.0.attn.b_to_v.weight'],
299
+ "model.diffusion_model.joint_blocks.0.context_block.mlp.fc1.bias": "blocks.0.ff_b.0.bias",
300
+ "model.diffusion_model.joint_blocks.0.context_block.mlp.fc1.weight": "blocks.0.ff_b.0.weight",
301
+ "model.diffusion_model.joint_blocks.0.context_block.mlp.fc2.bias": "blocks.0.ff_b.2.bias",
302
+ "model.diffusion_model.joint_blocks.0.context_block.mlp.fc2.weight": "blocks.0.ff_b.2.weight",
303
+ "model.diffusion_model.joint_blocks.0.x_block.adaLN_modulation.1.bias": "blocks.0.norm1_a.linear.bias",
304
+ "model.diffusion_model.joint_blocks.0.x_block.adaLN_modulation.1.weight": "blocks.0.norm1_a.linear.weight",
305
+ "model.diffusion_model.joint_blocks.0.x_block.attn.proj.bias": "blocks.0.attn.a_to_out.bias",
306
+ "model.diffusion_model.joint_blocks.0.x_block.attn.proj.weight": "blocks.0.attn.a_to_out.weight",
307
+ "model.diffusion_model.joint_blocks.0.x_block.attn.qkv.bias": ['blocks.0.attn.a_to_q.bias', 'blocks.0.attn.a_to_k.bias', 'blocks.0.attn.a_to_v.bias'],
308
+ "model.diffusion_model.joint_blocks.0.x_block.attn.qkv.weight": ['blocks.0.attn.a_to_q.weight', 'blocks.0.attn.a_to_k.weight', 'blocks.0.attn.a_to_v.weight'],
309
+ "model.diffusion_model.joint_blocks.0.x_block.mlp.fc1.bias": "blocks.0.ff_a.0.bias",
310
+ "model.diffusion_model.joint_blocks.0.x_block.mlp.fc1.weight": "blocks.0.ff_a.0.weight",
311
+ "model.diffusion_model.joint_blocks.0.x_block.mlp.fc2.bias": "blocks.0.ff_a.2.bias",
312
+ "model.diffusion_model.joint_blocks.0.x_block.mlp.fc2.weight": "blocks.0.ff_a.2.weight",
313
+ "model.diffusion_model.joint_blocks.1.context_block.adaLN_modulation.1.bias": "blocks.1.norm1_b.linear.bias",
314
+ "model.diffusion_model.joint_blocks.1.context_block.adaLN_modulation.1.weight": "blocks.1.norm1_b.linear.weight",
315
+ "model.diffusion_model.joint_blocks.1.context_block.attn.proj.bias": "blocks.1.attn.b_to_out.bias",
316
+ "model.diffusion_model.joint_blocks.1.context_block.attn.proj.weight": "blocks.1.attn.b_to_out.weight",
317
+ "model.diffusion_model.joint_blocks.1.context_block.attn.qkv.bias": ['blocks.1.attn.b_to_q.bias', 'blocks.1.attn.b_to_k.bias', 'blocks.1.attn.b_to_v.bias'],
318
+ "model.diffusion_model.joint_blocks.1.context_block.attn.qkv.weight": ['blocks.1.attn.b_to_q.weight', 'blocks.1.attn.b_to_k.weight', 'blocks.1.attn.b_to_v.weight'],
319
+ "model.diffusion_model.joint_blocks.1.context_block.mlp.fc1.bias": "blocks.1.ff_b.0.bias",
320
+ "model.diffusion_model.joint_blocks.1.context_block.mlp.fc1.weight": "blocks.1.ff_b.0.weight",
321
+ "model.diffusion_model.joint_blocks.1.context_block.mlp.fc2.bias": "blocks.1.ff_b.2.bias",
322
+ "model.diffusion_model.joint_blocks.1.context_block.mlp.fc2.weight": "blocks.1.ff_b.2.weight",
323
+ "model.diffusion_model.joint_blocks.1.x_block.adaLN_modulation.1.bias": "blocks.1.norm1_a.linear.bias",
324
+ "model.diffusion_model.joint_blocks.1.x_block.adaLN_modulation.1.weight": "blocks.1.norm1_a.linear.weight",
325
+ "model.diffusion_model.joint_blocks.1.x_block.attn.proj.bias": "blocks.1.attn.a_to_out.bias",
326
+ "model.diffusion_model.joint_blocks.1.x_block.attn.proj.weight": "blocks.1.attn.a_to_out.weight",
327
+ "model.diffusion_model.joint_blocks.1.x_block.attn.qkv.bias": ['blocks.1.attn.a_to_q.bias', 'blocks.1.attn.a_to_k.bias', 'blocks.1.attn.a_to_v.bias'],
328
+ "model.diffusion_model.joint_blocks.1.x_block.attn.qkv.weight": ['blocks.1.attn.a_to_q.weight', 'blocks.1.attn.a_to_k.weight', 'blocks.1.attn.a_to_v.weight'],
329
+ "model.diffusion_model.joint_blocks.1.x_block.mlp.fc1.bias": "blocks.1.ff_a.0.bias",
330
+ "model.diffusion_model.joint_blocks.1.x_block.mlp.fc1.weight": "blocks.1.ff_a.0.weight",
331
+ "model.diffusion_model.joint_blocks.1.x_block.mlp.fc2.bias": "blocks.1.ff_a.2.bias",
332
+ "model.diffusion_model.joint_blocks.1.x_block.mlp.fc2.weight": "blocks.1.ff_a.2.weight",
333
+ "model.diffusion_model.joint_blocks.10.context_block.adaLN_modulation.1.bias": "blocks.10.norm1_b.linear.bias",
334
+ "model.diffusion_model.joint_blocks.10.context_block.adaLN_modulation.1.weight": "blocks.10.norm1_b.linear.weight",
335
+ "model.diffusion_model.joint_blocks.10.context_block.attn.proj.bias": "blocks.10.attn.b_to_out.bias",
336
+ "model.diffusion_model.joint_blocks.10.context_block.attn.proj.weight": "blocks.10.attn.b_to_out.weight",
337
+ "model.diffusion_model.joint_blocks.10.context_block.attn.qkv.bias": ['blocks.10.attn.b_to_q.bias', 'blocks.10.attn.b_to_k.bias', 'blocks.10.attn.b_to_v.bias'],
338
+ "model.diffusion_model.joint_blocks.10.context_block.attn.qkv.weight": ['blocks.10.attn.b_to_q.weight', 'blocks.10.attn.b_to_k.weight', 'blocks.10.attn.b_to_v.weight'],
339
+ "model.diffusion_model.joint_blocks.10.context_block.mlp.fc1.bias": "blocks.10.ff_b.0.bias",
340
+ "model.diffusion_model.joint_blocks.10.context_block.mlp.fc1.weight": "blocks.10.ff_b.0.weight",
341
+ "model.diffusion_model.joint_blocks.10.context_block.mlp.fc2.bias": "blocks.10.ff_b.2.bias",
342
+ "model.diffusion_model.joint_blocks.10.context_block.mlp.fc2.weight": "blocks.10.ff_b.2.weight",
343
+ "model.diffusion_model.joint_blocks.10.x_block.adaLN_modulation.1.bias": "blocks.10.norm1_a.linear.bias",
344
+ "model.diffusion_model.joint_blocks.10.x_block.adaLN_modulation.1.weight": "blocks.10.norm1_a.linear.weight",
345
+ "model.diffusion_model.joint_blocks.10.x_block.attn.proj.bias": "blocks.10.attn.a_to_out.bias",
346
+ "model.diffusion_model.joint_blocks.10.x_block.attn.proj.weight": "blocks.10.attn.a_to_out.weight",
347
+ "model.diffusion_model.joint_blocks.10.x_block.attn.qkv.bias": ['blocks.10.attn.a_to_q.bias', 'blocks.10.attn.a_to_k.bias', 'blocks.10.attn.a_to_v.bias'],
348
+ "model.diffusion_model.joint_blocks.10.x_block.attn.qkv.weight": ['blocks.10.attn.a_to_q.weight', 'blocks.10.attn.a_to_k.weight', 'blocks.10.attn.a_to_v.weight'],
349
+ "model.diffusion_model.joint_blocks.10.x_block.mlp.fc1.bias": "blocks.10.ff_a.0.bias",
350
+ "model.diffusion_model.joint_blocks.10.x_block.mlp.fc1.weight": "blocks.10.ff_a.0.weight",
351
+ "model.diffusion_model.joint_blocks.10.x_block.mlp.fc2.bias": "blocks.10.ff_a.2.bias",
352
+ "model.diffusion_model.joint_blocks.10.x_block.mlp.fc2.weight": "blocks.10.ff_a.2.weight",
353
+ "model.diffusion_model.joint_blocks.11.context_block.adaLN_modulation.1.bias": "blocks.11.norm1_b.linear.bias",
354
+ "model.diffusion_model.joint_blocks.11.context_block.adaLN_modulation.1.weight": "blocks.11.norm1_b.linear.weight",
355
+ "model.diffusion_model.joint_blocks.11.context_block.attn.proj.bias": "blocks.11.attn.b_to_out.bias",
356
+ "model.diffusion_model.joint_blocks.11.context_block.attn.proj.weight": "blocks.11.attn.b_to_out.weight",
357
+ "model.diffusion_model.joint_blocks.11.context_block.attn.qkv.bias": ['blocks.11.attn.b_to_q.bias', 'blocks.11.attn.b_to_k.bias', 'blocks.11.attn.b_to_v.bias'],
358
+ "model.diffusion_model.joint_blocks.11.context_block.attn.qkv.weight": ['blocks.11.attn.b_to_q.weight', 'blocks.11.attn.b_to_k.weight', 'blocks.11.attn.b_to_v.weight'],
359
+ "model.diffusion_model.joint_blocks.11.context_block.mlp.fc1.bias": "blocks.11.ff_b.0.bias",
360
+ "model.diffusion_model.joint_blocks.11.context_block.mlp.fc1.weight": "blocks.11.ff_b.0.weight",
361
+ "model.diffusion_model.joint_blocks.11.context_block.mlp.fc2.bias": "blocks.11.ff_b.2.bias",
362
+ "model.diffusion_model.joint_blocks.11.context_block.mlp.fc2.weight": "blocks.11.ff_b.2.weight",
363
+ "model.diffusion_model.joint_blocks.11.x_block.adaLN_modulation.1.bias": "blocks.11.norm1_a.linear.bias",
364
+ "model.diffusion_model.joint_blocks.11.x_block.adaLN_modulation.1.weight": "blocks.11.norm1_a.linear.weight",
365
+ "model.diffusion_model.joint_blocks.11.x_block.attn.proj.bias": "blocks.11.attn.a_to_out.bias",
366
+ "model.diffusion_model.joint_blocks.11.x_block.attn.proj.weight": "blocks.11.attn.a_to_out.weight",
367
+ "model.diffusion_model.joint_blocks.11.x_block.attn.qkv.bias": ['blocks.11.attn.a_to_q.bias', 'blocks.11.attn.a_to_k.bias', 'blocks.11.attn.a_to_v.bias'],
368
+ "model.diffusion_model.joint_blocks.11.x_block.attn.qkv.weight": ['blocks.11.attn.a_to_q.weight', 'blocks.11.attn.a_to_k.weight', 'blocks.11.attn.a_to_v.weight'],
369
+ "model.diffusion_model.joint_blocks.11.x_block.mlp.fc1.bias": "blocks.11.ff_a.0.bias",
370
+ "model.diffusion_model.joint_blocks.11.x_block.mlp.fc1.weight": "blocks.11.ff_a.0.weight",
371
+ "model.diffusion_model.joint_blocks.11.x_block.mlp.fc2.bias": "blocks.11.ff_a.2.bias",
372
+ "model.diffusion_model.joint_blocks.11.x_block.mlp.fc2.weight": "blocks.11.ff_a.2.weight",
373
+ "model.diffusion_model.joint_blocks.12.context_block.adaLN_modulation.1.bias": "blocks.12.norm1_b.linear.bias",
374
+ "model.diffusion_model.joint_blocks.12.context_block.adaLN_modulation.1.weight": "blocks.12.norm1_b.linear.weight",
375
+ "model.diffusion_model.joint_blocks.12.context_block.attn.proj.bias": "blocks.12.attn.b_to_out.bias",
376
+ "model.diffusion_model.joint_blocks.12.context_block.attn.proj.weight": "blocks.12.attn.b_to_out.weight",
377
+ "model.diffusion_model.joint_blocks.12.context_block.attn.qkv.bias": ['blocks.12.attn.b_to_q.bias', 'blocks.12.attn.b_to_k.bias', 'blocks.12.attn.b_to_v.bias'],
378
+ "model.diffusion_model.joint_blocks.12.context_block.attn.qkv.weight": ['blocks.12.attn.b_to_q.weight', 'blocks.12.attn.b_to_k.weight', 'blocks.12.attn.b_to_v.weight'],
379
+ "model.diffusion_model.joint_blocks.12.context_block.mlp.fc1.bias": "blocks.12.ff_b.0.bias",
380
+ "model.diffusion_model.joint_blocks.12.context_block.mlp.fc1.weight": "blocks.12.ff_b.0.weight",
381
+ "model.diffusion_model.joint_blocks.12.context_block.mlp.fc2.bias": "blocks.12.ff_b.2.bias",
382
+ "model.diffusion_model.joint_blocks.12.context_block.mlp.fc2.weight": "blocks.12.ff_b.2.weight",
383
+ "model.diffusion_model.joint_blocks.12.x_block.adaLN_modulation.1.bias": "blocks.12.norm1_a.linear.bias",
384
+ "model.diffusion_model.joint_blocks.12.x_block.adaLN_modulation.1.weight": "blocks.12.norm1_a.linear.weight",
385
+ "model.diffusion_model.joint_blocks.12.x_block.attn.proj.bias": "blocks.12.attn.a_to_out.bias",
386
+ "model.diffusion_model.joint_blocks.12.x_block.attn.proj.weight": "blocks.12.attn.a_to_out.weight",
387
+ "model.diffusion_model.joint_blocks.12.x_block.attn.qkv.bias": ['blocks.12.attn.a_to_q.bias', 'blocks.12.attn.a_to_k.bias', 'blocks.12.attn.a_to_v.bias'],
388
+ "model.diffusion_model.joint_blocks.12.x_block.attn.qkv.weight": ['blocks.12.attn.a_to_q.weight', 'blocks.12.attn.a_to_k.weight', 'blocks.12.attn.a_to_v.weight'],
389
+ "model.diffusion_model.joint_blocks.12.x_block.mlp.fc1.bias": "blocks.12.ff_a.0.bias",
390
+ "model.diffusion_model.joint_blocks.12.x_block.mlp.fc1.weight": "blocks.12.ff_a.0.weight",
391
+ "model.diffusion_model.joint_blocks.12.x_block.mlp.fc2.bias": "blocks.12.ff_a.2.bias",
392
+ "model.diffusion_model.joint_blocks.12.x_block.mlp.fc2.weight": "blocks.12.ff_a.2.weight",
393
+ "model.diffusion_model.joint_blocks.13.context_block.adaLN_modulation.1.bias": "blocks.13.norm1_b.linear.bias",
394
+ "model.diffusion_model.joint_blocks.13.context_block.adaLN_modulation.1.weight": "blocks.13.norm1_b.linear.weight",
395
+ "model.diffusion_model.joint_blocks.13.context_block.attn.proj.bias": "blocks.13.attn.b_to_out.bias",
396
+ "model.diffusion_model.joint_blocks.13.context_block.attn.proj.weight": "blocks.13.attn.b_to_out.weight",
397
+ "model.diffusion_model.joint_blocks.13.context_block.attn.qkv.bias": ['blocks.13.attn.b_to_q.bias', 'blocks.13.attn.b_to_k.bias', 'blocks.13.attn.b_to_v.bias'],
398
+ "model.diffusion_model.joint_blocks.13.context_block.attn.qkv.weight": ['blocks.13.attn.b_to_q.weight', 'blocks.13.attn.b_to_k.weight', 'blocks.13.attn.b_to_v.weight'],
399
+ "model.diffusion_model.joint_blocks.13.context_block.mlp.fc1.bias": "blocks.13.ff_b.0.bias",
400
+ "model.diffusion_model.joint_blocks.13.context_block.mlp.fc1.weight": "blocks.13.ff_b.0.weight",
401
+ "model.diffusion_model.joint_blocks.13.context_block.mlp.fc2.bias": "blocks.13.ff_b.2.bias",
402
+ "model.diffusion_model.joint_blocks.13.context_block.mlp.fc2.weight": "blocks.13.ff_b.2.weight",
403
+ "model.diffusion_model.joint_blocks.13.x_block.adaLN_modulation.1.bias": "blocks.13.norm1_a.linear.bias",
404
+ "model.diffusion_model.joint_blocks.13.x_block.adaLN_modulation.1.weight": "blocks.13.norm1_a.linear.weight",
405
+ "model.diffusion_model.joint_blocks.13.x_block.attn.proj.bias": "blocks.13.attn.a_to_out.bias",
406
+ "model.diffusion_model.joint_blocks.13.x_block.attn.proj.weight": "blocks.13.attn.a_to_out.weight",
407
+ "model.diffusion_model.joint_blocks.13.x_block.attn.qkv.bias": ['blocks.13.attn.a_to_q.bias', 'blocks.13.attn.a_to_k.bias', 'blocks.13.attn.a_to_v.bias'],
408
+ "model.diffusion_model.joint_blocks.13.x_block.attn.qkv.weight": ['blocks.13.attn.a_to_q.weight', 'blocks.13.attn.a_to_k.weight', 'blocks.13.attn.a_to_v.weight'],
409
+ "model.diffusion_model.joint_blocks.13.x_block.mlp.fc1.bias": "blocks.13.ff_a.0.bias",
410
+ "model.diffusion_model.joint_blocks.13.x_block.mlp.fc1.weight": "blocks.13.ff_a.0.weight",
411
+ "model.diffusion_model.joint_blocks.13.x_block.mlp.fc2.bias": "blocks.13.ff_a.2.bias",
412
+ "model.diffusion_model.joint_blocks.13.x_block.mlp.fc2.weight": "blocks.13.ff_a.2.weight",
413
+ "model.diffusion_model.joint_blocks.14.context_block.adaLN_modulation.1.bias": "blocks.14.norm1_b.linear.bias",
414
+ "model.diffusion_model.joint_blocks.14.context_block.adaLN_modulation.1.weight": "blocks.14.norm1_b.linear.weight",
415
+ "model.diffusion_model.joint_blocks.14.context_block.attn.proj.bias": "blocks.14.attn.b_to_out.bias",
416
+ "model.diffusion_model.joint_blocks.14.context_block.attn.proj.weight": "blocks.14.attn.b_to_out.weight",
417
+ "model.diffusion_model.joint_blocks.14.context_block.attn.qkv.bias": ['blocks.14.attn.b_to_q.bias', 'blocks.14.attn.b_to_k.bias', 'blocks.14.attn.b_to_v.bias'],
418
+ "model.diffusion_model.joint_blocks.14.context_block.attn.qkv.weight": ['blocks.14.attn.b_to_q.weight', 'blocks.14.attn.b_to_k.weight', 'blocks.14.attn.b_to_v.weight'],
419
+ "model.diffusion_model.joint_blocks.14.context_block.mlp.fc1.bias": "blocks.14.ff_b.0.bias",
420
+ "model.diffusion_model.joint_blocks.14.context_block.mlp.fc1.weight": "blocks.14.ff_b.0.weight",
421
+ "model.diffusion_model.joint_blocks.14.context_block.mlp.fc2.bias": "blocks.14.ff_b.2.bias",
422
+ "model.diffusion_model.joint_blocks.14.context_block.mlp.fc2.weight": "blocks.14.ff_b.2.weight",
423
+ "model.diffusion_model.joint_blocks.14.x_block.adaLN_modulation.1.bias": "blocks.14.norm1_a.linear.bias",
424
+ "model.diffusion_model.joint_blocks.14.x_block.adaLN_modulation.1.weight": "blocks.14.norm1_a.linear.weight",
425
+ "model.diffusion_model.joint_blocks.14.x_block.attn.proj.bias": "blocks.14.attn.a_to_out.bias",
426
+ "model.diffusion_model.joint_blocks.14.x_block.attn.proj.weight": "blocks.14.attn.a_to_out.weight",
427
+ "model.diffusion_model.joint_blocks.14.x_block.attn.qkv.bias": ['blocks.14.attn.a_to_q.bias', 'blocks.14.attn.a_to_k.bias', 'blocks.14.attn.a_to_v.bias'],
428
+ "model.diffusion_model.joint_blocks.14.x_block.attn.qkv.weight": ['blocks.14.attn.a_to_q.weight', 'blocks.14.attn.a_to_k.weight', 'blocks.14.attn.a_to_v.weight'],
429
+ "model.diffusion_model.joint_blocks.14.x_block.mlp.fc1.bias": "blocks.14.ff_a.0.bias",
430
+ "model.diffusion_model.joint_blocks.14.x_block.mlp.fc1.weight": "blocks.14.ff_a.0.weight",
431
+ "model.diffusion_model.joint_blocks.14.x_block.mlp.fc2.bias": "blocks.14.ff_a.2.bias",
432
+ "model.diffusion_model.joint_blocks.14.x_block.mlp.fc2.weight": "blocks.14.ff_a.2.weight",
433
+ "model.diffusion_model.joint_blocks.15.context_block.adaLN_modulation.1.bias": "blocks.15.norm1_b.linear.bias",
434
+ "model.diffusion_model.joint_blocks.15.context_block.adaLN_modulation.1.weight": "blocks.15.norm1_b.linear.weight",
435
+ "model.diffusion_model.joint_blocks.15.context_block.attn.proj.bias": "blocks.15.attn.b_to_out.bias",
436
+ "model.diffusion_model.joint_blocks.15.context_block.attn.proj.weight": "blocks.15.attn.b_to_out.weight",
437
+ "model.diffusion_model.joint_blocks.15.context_block.attn.qkv.bias": ['blocks.15.attn.b_to_q.bias', 'blocks.15.attn.b_to_k.bias', 'blocks.15.attn.b_to_v.bias'],
438
+ "model.diffusion_model.joint_blocks.15.context_block.attn.qkv.weight": ['blocks.15.attn.b_to_q.weight', 'blocks.15.attn.b_to_k.weight', 'blocks.15.attn.b_to_v.weight'],
439
+ "model.diffusion_model.joint_blocks.15.context_block.mlp.fc1.bias": "blocks.15.ff_b.0.bias",
440
+ "model.diffusion_model.joint_blocks.15.context_block.mlp.fc1.weight": "blocks.15.ff_b.0.weight",
441
+ "model.diffusion_model.joint_blocks.15.context_block.mlp.fc2.bias": "blocks.15.ff_b.2.bias",
442
+ "model.diffusion_model.joint_blocks.15.context_block.mlp.fc2.weight": "blocks.15.ff_b.2.weight",
443
+ "model.diffusion_model.joint_blocks.15.x_block.adaLN_modulation.1.bias": "blocks.15.norm1_a.linear.bias",
444
+ "model.diffusion_model.joint_blocks.15.x_block.adaLN_modulation.1.weight": "blocks.15.norm1_a.linear.weight",
445
+ "model.diffusion_model.joint_blocks.15.x_block.attn.proj.bias": "blocks.15.attn.a_to_out.bias",
446
+ "model.diffusion_model.joint_blocks.15.x_block.attn.proj.weight": "blocks.15.attn.a_to_out.weight",
447
+ "model.diffusion_model.joint_blocks.15.x_block.attn.qkv.bias": ['blocks.15.attn.a_to_q.bias', 'blocks.15.attn.a_to_k.bias', 'blocks.15.attn.a_to_v.bias'],
448
+ "model.diffusion_model.joint_blocks.15.x_block.attn.qkv.weight": ['blocks.15.attn.a_to_q.weight', 'blocks.15.attn.a_to_k.weight', 'blocks.15.attn.a_to_v.weight'],
449
+ "model.diffusion_model.joint_blocks.15.x_block.mlp.fc1.bias": "blocks.15.ff_a.0.bias",
450
+ "model.diffusion_model.joint_blocks.15.x_block.mlp.fc1.weight": "blocks.15.ff_a.0.weight",
451
+ "model.diffusion_model.joint_blocks.15.x_block.mlp.fc2.bias": "blocks.15.ff_a.2.bias",
452
+ "model.diffusion_model.joint_blocks.15.x_block.mlp.fc2.weight": "blocks.15.ff_a.2.weight",
453
+ "model.diffusion_model.joint_blocks.16.context_block.adaLN_modulation.1.bias": "blocks.16.norm1_b.linear.bias",
454
+ "model.diffusion_model.joint_blocks.16.context_block.adaLN_modulation.1.weight": "blocks.16.norm1_b.linear.weight",
455
+ "model.diffusion_model.joint_blocks.16.context_block.attn.proj.bias": "blocks.16.attn.b_to_out.bias",
456
+ "model.diffusion_model.joint_blocks.16.context_block.attn.proj.weight": "blocks.16.attn.b_to_out.weight",
457
+ "model.diffusion_model.joint_blocks.16.context_block.attn.qkv.bias": ['blocks.16.attn.b_to_q.bias', 'blocks.16.attn.b_to_k.bias', 'blocks.16.attn.b_to_v.bias'],
458
+ "model.diffusion_model.joint_blocks.16.context_block.attn.qkv.weight": ['blocks.16.attn.b_to_q.weight', 'blocks.16.attn.b_to_k.weight', 'blocks.16.attn.b_to_v.weight'],
459
+ "model.diffusion_model.joint_blocks.16.context_block.mlp.fc1.bias": "blocks.16.ff_b.0.bias",
460
+ "model.diffusion_model.joint_blocks.16.context_block.mlp.fc1.weight": "blocks.16.ff_b.0.weight",
461
+ "model.diffusion_model.joint_blocks.16.context_block.mlp.fc2.bias": "blocks.16.ff_b.2.bias",
462
+ "model.diffusion_model.joint_blocks.16.context_block.mlp.fc2.weight": "blocks.16.ff_b.2.weight",
463
+ "model.diffusion_model.joint_blocks.16.x_block.adaLN_modulation.1.bias": "blocks.16.norm1_a.linear.bias",
464
+ "model.diffusion_model.joint_blocks.16.x_block.adaLN_modulation.1.weight": "blocks.16.norm1_a.linear.weight",
465
+ "model.diffusion_model.joint_blocks.16.x_block.attn.proj.bias": "blocks.16.attn.a_to_out.bias",
466
+ "model.diffusion_model.joint_blocks.16.x_block.attn.proj.weight": "blocks.16.attn.a_to_out.weight",
467
+ "model.diffusion_model.joint_blocks.16.x_block.attn.qkv.bias": ['blocks.16.attn.a_to_q.bias', 'blocks.16.attn.a_to_k.bias', 'blocks.16.attn.a_to_v.bias'],
468
+ "model.diffusion_model.joint_blocks.16.x_block.attn.qkv.weight": ['blocks.16.attn.a_to_q.weight', 'blocks.16.attn.a_to_k.weight', 'blocks.16.attn.a_to_v.weight'],
469
+ "model.diffusion_model.joint_blocks.16.x_block.mlp.fc1.bias": "blocks.16.ff_a.0.bias",
470
+ "model.diffusion_model.joint_blocks.16.x_block.mlp.fc1.weight": "blocks.16.ff_a.0.weight",
471
+ "model.diffusion_model.joint_blocks.16.x_block.mlp.fc2.bias": "blocks.16.ff_a.2.bias",
472
+ "model.diffusion_model.joint_blocks.16.x_block.mlp.fc2.weight": "blocks.16.ff_a.2.weight",
473
+ "model.diffusion_model.joint_blocks.17.context_block.adaLN_modulation.1.bias": "blocks.17.norm1_b.linear.bias",
474
+ "model.diffusion_model.joint_blocks.17.context_block.adaLN_modulation.1.weight": "blocks.17.norm1_b.linear.weight",
475
+ "model.diffusion_model.joint_blocks.17.context_block.attn.proj.bias": "blocks.17.attn.b_to_out.bias",
476
+ "model.diffusion_model.joint_blocks.17.context_block.attn.proj.weight": "blocks.17.attn.b_to_out.weight",
477
+ "model.diffusion_model.joint_blocks.17.context_block.attn.qkv.bias": ['blocks.17.attn.b_to_q.bias', 'blocks.17.attn.b_to_k.bias', 'blocks.17.attn.b_to_v.bias'],
478
+ "model.diffusion_model.joint_blocks.17.context_block.attn.qkv.weight": ['blocks.17.attn.b_to_q.weight', 'blocks.17.attn.b_to_k.weight', 'blocks.17.attn.b_to_v.weight'],
479
+ "model.diffusion_model.joint_blocks.17.context_block.mlp.fc1.bias": "blocks.17.ff_b.0.bias",
480
+ "model.diffusion_model.joint_blocks.17.context_block.mlp.fc1.weight": "blocks.17.ff_b.0.weight",
481
+ "model.diffusion_model.joint_blocks.17.context_block.mlp.fc2.bias": "blocks.17.ff_b.2.bias",
482
+ "model.diffusion_model.joint_blocks.17.context_block.mlp.fc2.weight": "blocks.17.ff_b.2.weight",
483
+ "model.diffusion_model.joint_blocks.17.x_block.adaLN_modulation.1.bias": "blocks.17.norm1_a.linear.bias",
484
+ "model.diffusion_model.joint_blocks.17.x_block.adaLN_modulation.1.weight": "blocks.17.norm1_a.linear.weight",
485
+ "model.diffusion_model.joint_blocks.17.x_block.attn.proj.bias": "blocks.17.attn.a_to_out.bias",
486
+ "model.diffusion_model.joint_blocks.17.x_block.attn.proj.weight": "blocks.17.attn.a_to_out.weight",
487
+ "model.diffusion_model.joint_blocks.17.x_block.attn.qkv.bias": ['blocks.17.attn.a_to_q.bias', 'blocks.17.attn.a_to_k.bias', 'blocks.17.attn.a_to_v.bias'],
488
+ "model.diffusion_model.joint_blocks.17.x_block.attn.qkv.weight": ['blocks.17.attn.a_to_q.weight', 'blocks.17.attn.a_to_k.weight', 'blocks.17.attn.a_to_v.weight'],
489
+ "model.diffusion_model.joint_blocks.17.x_block.mlp.fc1.bias": "blocks.17.ff_a.0.bias",
490
+ "model.diffusion_model.joint_blocks.17.x_block.mlp.fc1.weight": "blocks.17.ff_a.0.weight",
491
+ "model.diffusion_model.joint_blocks.17.x_block.mlp.fc2.bias": "blocks.17.ff_a.2.bias",
492
+ "model.diffusion_model.joint_blocks.17.x_block.mlp.fc2.weight": "blocks.17.ff_a.2.weight",
493
+ "model.diffusion_model.joint_blocks.18.context_block.adaLN_modulation.1.bias": "blocks.18.norm1_b.linear.bias",
494
+ "model.diffusion_model.joint_blocks.18.context_block.adaLN_modulation.1.weight": "blocks.18.norm1_b.linear.weight",
495
+ "model.diffusion_model.joint_blocks.18.context_block.attn.proj.bias": "blocks.18.attn.b_to_out.bias",
496
+ "model.diffusion_model.joint_blocks.18.context_block.attn.proj.weight": "blocks.18.attn.b_to_out.weight",
497
+ "model.diffusion_model.joint_blocks.18.context_block.attn.qkv.bias": ['blocks.18.attn.b_to_q.bias', 'blocks.18.attn.b_to_k.bias', 'blocks.18.attn.b_to_v.bias'],
498
+ "model.diffusion_model.joint_blocks.18.context_block.attn.qkv.weight": ['blocks.18.attn.b_to_q.weight', 'blocks.18.attn.b_to_k.weight', 'blocks.18.attn.b_to_v.weight'],
499
+ "model.diffusion_model.joint_blocks.18.context_block.mlp.fc1.bias": "blocks.18.ff_b.0.bias",
500
+ "model.diffusion_model.joint_blocks.18.context_block.mlp.fc1.weight": "blocks.18.ff_b.0.weight",
501
+ "model.diffusion_model.joint_blocks.18.context_block.mlp.fc2.bias": "blocks.18.ff_b.2.bias",
502
+ "model.diffusion_model.joint_blocks.18.context_block.mlp.fc2.weight": "blocks.18.ff_b.2.weight",
503
+ "model.diffusion_model.joint_blocks.18.x_block.adaLN_modulation.1.bias": "blocks.18.norm1_a.linear.bias",
504
+ "model.diffusion_model.joint_blocks.18.x_block.adaLN_modulation.1.weight": "blocks.18.norm1_a.linear.weight",
505
+ "model.diffusion_model.joint_blocks.18.x_block.attn.proj.bias": "blocks.18.attn.a_to_out.bias",
506
+ "model.diffusion_model.joint_blocks.18.x_block.attn.proj.weight": "blocks.18.attn.a_to_out.weight",
507
+ "model.diffusion_model.joint_blocks.18.x_block.attn.qkv.bias": ['blocks.18.attn.a_to_q.bias', 'blocks.18.attn.a_to_k.bias', 'blocks.18.attn.a_to_v.bias'],
508
+ "model.diffusion_model.joint_blocks.18.x_block.attn.qkv.weight": ['blocks.18.attn.a_to_q.weight', 'blocks.18.attn.a_to_k.weight', 'blocks.18.attn.a_to_v.weight'],
509
+ "model.diffusion_model.joint_blocks.18.x_block.mlp.fc1.bias": "blocks.18.ff_a.0.bias",
510
+ "model.diffusion_model.joint_blocks.18.x_block.mlp.fc1.weight": "blocks.18.ff_a.0.weight",
511
+ "model.diffusion_model.joint_blocks.18.x_block.mlp.fc2.bias": "blocks.18.ff_a.2.bias",
512
+ "model.diffusion_model.joint_blocks.18.x_block.mlp.fc2.weight": "blocks.18.ff_a.2.weight",
513
+ "model.diffusion_model.joint_blocks.19.context_block.adaLN_modulation.1.bias": "blocks.19.norm1_b.linear.bias",
514
+ "model.diffusion_model.joint_blocks.19.context_block.adaLN_modulation.1.weight": "blocks.19.norm1_b.linear.weight",
515
+ "model.diffusion_model.joint_blocks.19.context_block.attn.proj.bias": "blocks.19.attn.b_to_out.bias",
516
+ "model.diffusion_model.joint_blocks.19.context_block.attn.proj.weight": "blocks.19.attn.b_to_out.weight",
517
+ "model.diffusion_model.joint_blocks.19.context_block.attn.qkv.bias": ['blocks.19.attn.b_to_q.bias', 'blocks.19.attn.b_to_k.bias', 'blocks.19.attn.b_to_v.bias'],
518
+ "model.diffusion_model.joint_blocks.19.context_block.attn.qkv.weight": ['blocks.19.attn.b_to_q.weight', 'blocks.19.attn.b_to_k.weight', 'blocks.19.attn.b_to_v.weight'],
519
+ "model.diffusion_model.joint_blocks.19.context_block.mlp.fc1.bias": "blocks.19.ff_b.0.bias",
520
+ "model.diffusion_model.joint_blocks.19.context_block.mlp.fc1.weight": "blocks.19.ff_b.0.weight",
521
+ "model.diffusion_model.joint_blocks.19.context_block.mlp.fc2.bias": "blocks.19.ff_b.2.bias",
522
+ "model.diffusion_model.joint_blocks.19.context_block.mlp.fc2.weight": "blocks.19.ff_b.2.weight",
523
+ "model.diffusion_model.joint_blocks.19.x_block.adaLN_modulation.1.bias": "blocks.19.norm1_a.linear.bias",
524
+ "model.diffusion_model.joint_blocks.19.x_block.adaLN_modulation.1.weight": "blocks.19.norm1_a.linear.weight",
525
+ "model.diffusion_model.joint_blocks.19.x_block.attn.proj.bias": "blocks.19.attn.a_to_out.bias",
526
+ "model.diffusion_model.joint_blocks.19.x_block.attn.proj.weight": "blocks.19.attn.a_to_out.weight",
527
+ "model.diffusion_model.joint_blocks.19.x_block.attn.qkv.bias": ['blocks.19.attn.a_to_q.bias', 'blocks.19.attn.a_to_k.bias', 'blocks.19.attn.a_to_v.bias'],
528
+ "model.diffusion_model.joint_blocks.19.x_block.attn.qkv.weight": ['blocks.19.attn.a_to_q.weight', 'blocks.19.attn.a_to_k.weight', 'blocks.19.attn.a_to_v.weight'],
529
+ "model.diffusion_model.joint_blocks.19.x_block.mlp.fc1.bias": "blocks.19.ff_a.0.bias",
530
+ "model.diffusion_model.joint_blocks.19.x_block.mlp.fc1.weight": "blocks.19.ff_a.0.weight",
531
+ "model.diffusion_model.joint_blocks.19.x_block.mlp.fc2.bias": "blocks.19.ff_a.2.bias",
532
+ "model.diffusion_model.joint_blocks.19.x_block.mlp.fc2.weight": "blocks.19.ff_a.2.weight",
533
+ "model.diffusion_model.joint_blocks.2.context_block.adaLN_modulation.1.bias": "blocks.2.norm1_b.linear.bias",
534
+ "model.diffusion_model.joint_blocks.2.context_block.adaLN_modulation.1.weight": "blocks.2.norm1_b.linear.weight",
535
+ "model.diffusion_model.joint_blocks.2.context_block.attn.proj.bias": "blocks.2.attn.b_to_out.bias",
536
+ "model.diffusion_model.joint_blocks.2.context_block.attn.proj.weight": "blocks.2.attn.b_to_out.weight",
537
+ "model.diffusion_model.joint_blocks.2.context_block.attn.qkv.bias": ['blocks.2.attn.b_to_q.bias', 'blocks.2.attn.b_to_k.bias', 'blocks.2.attn.b_to_v.bias'],
538
+ "model.diffusion_model.joint_blocks.2.context_block.attn.qkv.weight": ['blocks.2.attn.b_to_q.weight', 'blocks.2.attn.b_to_k.weight', 'blocks.2.attn.b_to_v.weight'],
539
+ "model.diffusion_model.joint_blocks.2.context_block.mlp.fc1.bias": "blocks.2.ff_b.0.bias",
540
+ "model.diffusion_model.joint_blocks.2.context_block.mlp.fc1.weight": "blocks.2.ff_b.0.weight",
541
+ "model.diffusion_model.joint_blocks.2.context_block.mlp.fc2.bias": "blocks.2.ff_b.2.bias",
542
+ "model.diffusion_model.joint_blocks.2.context_block.mlp.fc2.weight": "blocks.2.ff_b.2.weight",
543
+ "model.diffusion_model.joint_blocks.2.x_block.adaLN_modulation.1.bias": "blocks.2.norm1_a.linear.bias",
544
+ "model.diffusion_model.joint_blocks.2.x_block.adaLN_modulation.1.weight": "blocks.2.norm1_a.linear.weight",
545
+ "model.diffusion_model.joint_blocks.2.x_block.attn.proj.bias": "blocks.2.attn.a_to_out.bias",
546
+ "model.diffusion_model.joint_blocks.2.x_block.attn.proj.weight": "blocks.2.attn.a_to_out.weight",
547
+ "model.diffusion_model.joint_blocks.2.x_block.attn.qkv.bias": ['blocks.2.attn.a_to_q.bias', 'blocks.2.attn.a_to_k.bias', 'blocks.2.attn.a_to_v.bias'],
548
+ "model.diffusion_model.joint_blocks.2.x_block.attn.qkv.weight": ['blocks.2.attn.a_to_q.weight', 'blocks.2.attn.a_to_k.weight', 'blocks.2.attn.a_to_v.weight'],
549
+ "model.diffusion_model.joint_blocks.2.x_block.mlp.fc1.bias": "blocks.2.ff_a.0.bias",
550
+ "model.diffusion_model.joint_blocks.2.x_block.mlp.fc1.weight": "blocks.2.ff_a.0.weight",
551
+ "model.diffusion_model.joint_blocks.2.x_block.mlp.fc2.bias": "blocks.2.ff_a.2.bias",
552
+ "model.diffusion_model.joint_blocks.2.x_block.mlp.fc2.weight": "blocks.2.ff_a.2.weight",
553
+ "model.diffusion_model.joint_blocks.20.context_block.adaLN_modulation.1.bias": "blocks.20.norm1_b.linear.bias",
554
+ "model.diffusion_model.joint_blocks.20.context_block.adaLN_modulation.1.weight": "blocks.20.norm1_b.linear.weight",
555
+ "model.diffusion_model.joint_blocks.20.context_block.attn.proj.bias": "blocks.20.attn.b_to_out.bias",
556
+ "model.diffusion_model.joint_blocks.20.context_block.attn.proj.weight": "blocks.20.attn.b_to_out.weight",
557
+ "model.diffusion_model.joint_blocks.20.context_block.attn.qkv.bias": ['blocks.20.attn.b_to_q.bias', 'blocks.20.attn.b_to_k.bias', 'blocks.20.attn.b_to_v.bias'],
558
+ "model.diffusion_model.joint_blocks.20.context_block.attn.qkv.weight": ['blocks.20.attn.b_to_q.weight', 'blocks.20.attn.b_to_k.weight', 'blocks.20.attn.b_to_v.weight'],
559
+ "model.diffusion_model.joint_blocks.20.context_block.mlp.fc1.bias": "blocks.20.ff_b.0.bias",
560
+ "model.diffusion_model.joint_blocks.20.context_block.mlp.fc1.weight": "blocks.20.ff_b.0.weight",
561
+ "model.diffusion_model.joint_blocks.20.context_block.mlp.fc2.bias": "blocks.20.ff_b.2.bias",
562
+ "model.diffusion_model.joint_blocks.20.context_block.mlp.fc2.weight": "blocks.20.ff_b.2.weight",
563
+ "model.diffusion_model.joint_blocks.20.x_block.adaLN_modulation.1.bias": "blocks.20.norm1_a.linear.bias",
564
+ "model.diffusion_model.joint_blocks.20.x_block.adaLN_modulation.1.weight": "blocks.20.norm1_a.linear.weight",
565
+ "model.diffusion_model.joint_blocks.20.x_block.attn.proj.bias": "blocks.20.attn.a_to_out.bias",
566
+ "model.diffusion_model.joint_blocks.20.x_block.attn.proj.weight": "blocks.20.attn.a_to_out.weight",
567
+ "model.diffusion_model.joint_blocks.20.x_block.attn.qkv.bias": ['blocks.20.attn.a_to_q.bias', 'blocks.20.attn.a_to_k.bias', 'blocks.20.attn.a_to_v.bias'],
568
+ "model.diffusion_model.joint_blocks.20.x_block.attn.qkv.weight": ['blocks.20.attn.a_to_q.weight', 'blocks.20.attn.a_to_k.weight', 'blocks.20.attn.a_to_v.weight'],
569
+ "model.diffusion_model.joint_blocks.20.x_block.mlp.fc1.bias": "blocks.20.ff_a.0.bias",
570
+ "model.diffusion_model.joint_blocks.20.x_block.mlp.fc1.weight": "blocks.20.ff_a.0.weight",
571
+ "model.diffusion_model.joint_blocks.20.x_block.mlp.fc2.bias": "blocks.20.ff_a.2.bias",
572
+ "model.diffusion_model.joint_blocks.20.x_block.mlp.fc2.weight": "blocks.20.ff_a.2.weight",
573
+ "model.diffusion_model.joint_blocks.21.context_block.adaLN_modulation.1.bias": "blocks.21.norm1_b.linear.bias",
574
+ "model.diffusion_model.joint_blocks.21.context_block.adaLN_modulation.1.weight": "blocks.21.norm1_b.linear.weight",
575
+ "model.diffusion_model.joint_blocks.21.context_block.attn.proj.bias": "blocks.21.attn.b_to_out.bias",
576
+ "model.diffusion_model.joint_blocks.21.context_block.attn.proj.weight": "blocks.21.attn.b_to_out.weight",
577
+ "model.diffusion_model.joint_blocks.21.context_block.attn.qkv.bias": ['blocks.21.attn.b_to_q.bias', 'blocks.21.attn.b_to_k.bias', 'blocks.21.attn.b_to_v.bias'],
578
+ "model.diffusion_model.joint_blocks.21.context_block.attn.qkv.weight": ['blocks.21.attn.b_to_q.weight', 'blocks.21.attn.b_to_k.weight', 'blocks.21.attn.b_to_v.weight'],
579
+ "model.diffusion_model.joint_blocks.21.context_block.mlp.fc1.bias": "blocks.21.ff_b.0.bias",
580
+ "model.diffusion_model.joint_blocks.21.context_block.mlp.fc1.weight": "blocks.21.ff_b.0.weight",
581
+ "model.diffusion_model.joint_blocks.21.context_block.mlp.fc2.bias": "blocks.21.ff_b.2.bias",
582
+ "model.diffusion_model.joint_blocks.21.context_block.mlp.fc2.weight": "blocks.21.ff_b.2.weight",
583
+ "model.diffusion_model.joint_blocks.21.x_block.adaLN_modulation.1.bias": "blocks.21.norm1_a.linear.bias",
584
+ "model.diffusion_model.joint_blocks.21.x_block.adaLN_modulation.1.weight": "blocks.21.norm1_a.linear.weight",
585
+ "model.diffusion_model.joint_blocks.21.x_block.attn.proj.bias": "blocks.21.attn.a_to_out.bias",
586
+ "model.diffusion_model.joint_blocks.21.x_block.attn.proj.weight": "blocks.21.attn.a_to_out.weight",
587
+ "model.diffusion_model.joint_blocks.21.x_block.attn.qkv.bias": ['blocks.21.attn.a_to_q.bias', 'blocks.21.attn.a_to_k.bias', 'blocks.21.attn.a_to_v.bias'],
588
+ "model.diffusion_model.joint_blocks.21.x_block.attn.qkv.weight": ['blocks.21.attn.a_to_q.weight', 'blocks.21.attn.a_to_k.weight', 'blocks.21.attn.a_to_v.weight'],
589
+ "model.diffusion_model.joint_blocks.21.x_block.mlp.fc1.bias": "blocks.21.ff_a.0.bias",
590
+ "model.diffusion_model.joint_blocks.21.x_block.mlp.fc1.weight": "blocks.21.ff_a.0.weight",
591
+ "model.diffusion_model.joint_blocks.21.x_block.mlp.fc2.bias": "blocks.21.ff_a.2.bias",
592
+ "model.diffusion_model.joint_blocks.21.x_block.mlp.fc2.weight": "blocks.21.ff_a.2.weight",
593
+ "model.diffusion_model.joint_blocks.22.context_block.adaLN_modulation.1.bias": "blocks.22.norm1_b.linear.bias",
594
+ "model.diffusion_model.joint_blocks.22.context_block.adaLN_modulation.1.weight": "blocks.22.norm1_b.linear.weight",
595
+ "model.diffusion_model.joint_blocks.22.context_block.attn.proj.bias": "blocks.22.attn.b_to_out.bias",
596
+ "model.diffusion_model.joint_blocks.22.context_block.attn.proj.weight": "blocks.22.attn.b_to_out.weight",
597
+ "model.diffusion_model.joint_blocks.22.context_block.attn.qkv.bias": ['blocks.22.attn.b_to_q.bias', 'blocks.22.attn.b_to_k.bias', 'blocks.22.attn.b_to_v.bias'],
598
+ "model.diffusion_model.joint_blocks.22.context_block.attn.qkv.weight": ['blocks.22.attn.b_to_q.weight', 'blocks.22.attn.b_to_k.weight', 'blocks.22.attn.b_to_v.weight'],
599
+ "model.diffusion_model.joint_blocks.22.context_block.mlp.fc1.bias": "blocks.22.ff_b.0.bias",
600
+ "model.diffusion_model.joint_blocks.22.context_block.mlp.fc1.weight": "blocks.22.ff_b.0.weight",
601
+ "model.diffusion_model.joint_blocks.22.context_block.mlp.fc2.bias": "blocks.22.ff_b.2.bias",
602
+ "model.diffusion_model.joint_blocks.22.context_block.mlp.fc2.weight": "blocks.22.ff_b.2.weight",
603
+ "model.diffusion_model.joint_blocks.22.x_block.adaLN_modulation.1.bias": "blocks.22.norm1_a.linear.bias",
604
+ "model.diffusion_model.joint_blocks.22.x_block.adaLN_modulation.1.weight": "blocks.22.norm1_a.linear.weight",
605
+ "model.diffusion_model.joint_blocks.22.x_block.attn.proj.bias": "blocks.22.attn.a_to_out.bias",
606
+ "model.diffusion_model.joint_blocks.22.x_block.attn.proj.weight": "blocks.22.attn.a_to_out.weight",
607
+ "model.diffusion_model.joint_blocks.22.x_block.attn.qkv.bias": ['blocks.22.attn.a_to_q.bias', 'blocks.22.attn.a_to_k.bias', 'blocks.22.attn.a_to_v.bias'],
608
+ "model.diffusion_model.joint_blocks.22.x_block.attn.qkv.weight": ['blocks.22.attn.a_to_q.weight', 'blocks.22.attn.a_to_k.weight', 'blocks.22.attn.a_to_v.weight'],
609
+ "model.diffusion_model.joint_blocks.22.x_block.mlp.fc1.bias": "blocks.22.ff_a.0.bias",
610
+ "model.diffusion_model.joint_blocks.22.x_block.mlp.fc1.weight": "blocks.22.ff_a.0.weight",
611
+ "model.diffusion_model.joint_blocks.22.x_block.mlp.fc2.bias": "blocks.22.ff_a.2.bias",
612
+ "model.diffusion_model.joint_blocks.22.x_block.mlp.fc2.weight": "blocks.22.ff_a.2.weight",
613
+ "model.diffusion_model.joint_blocks.23.context_block.attn.qkv.bias": ['blocks.23.attn.b_to_q.bias', 'blocks.23.attn.b_to_k.bias', 'blocks.23.attn.b_to_v.bias'],
614
+ "model.diffusion_model.joint_blocks.23.context_block.attn.qkv.weight": ['blocks.23.attn.b_to_q.weight', 'blocks.23.attn.b_to_k.weight', 'blocks.23.attn.b_to_v.weight'],
615
+ "model.diffusion_model.joint_blocks.23.x_block.adaLN_modulation.1.bias": "blocks.23.norm1_a.linear.bias",
616
+ "model.diffusion_model.joint_blocks.23.x_block.adaLN_modulation.1.weight": "blocks.23.norm1_a.linear.weight",
617
+ "model.diffusion_model.joint_blocks.23.x_block.attn.proj.bias": "blocks.23.attn.a_to_out.bias",
618
+ "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight": "blocks.23.attn.a_to_out.weight",
619
+ "model.diffusion_model.joint_blocks.23.x_block.attn.qkv.bias": ['blocks.23.attn.a_to_q.bias', 'blocks.23.attn.a_to_k.bias', 'blocks.23.attn.a_to_v.bias'],
620
+ "model.diffusion_model.joint_blocks.23.x_block.attn.qkv.weight": ['blocks.23.attn.a_to_q.weight', 'blocks.23.attn.a_to_k.weight', 'blocks.23.attn.a_to_v.weight'],
621
+ "model.diffusion_model.joint_blocks.23.x_block.mlp.fc1.bias": "blocks.23.ff_a.0.bias",
622
+ "model.diffusion_model.joint_blocks.23.x_block.mlp.fc1.weight": "blocks.23.ff_a.0.weight",
623
+ "model.diffusion_model.joint_blocks.23.x_block.mlp.fc2.bias": "blocks.23.ff_a.2.bias",
624
+ "model.diffusion_model.joint_blocks.23.x_block.mlp.fc2.weight": "blocks.23.ff_a.2.weight",
625
+ "model.diffusion_model.joint_blocks.3.context_block.adaLN_modulation.1.bias": "blocks.3.norm1_b.linear.bias",
626
+ "model.diffusion_model.joint_blocks.3.context_block.adaLN_modulation.1.weight": "blocks.3.norm1_b.linear.weight",
627
+ "model.diffusion_model.joint_blocks.3.context_block.attn.proj.bias": "blocks.3.attn.b_to_out.bias",
628
+ "model.diffusion_model.joint_blocks.3.context_block.attn.proj.weight": "blocks.3.attn.b_to_out.weight",
629
+ "model.diffusion_model.joint_blocks.3.context_block.attn.qkv.bias": ['blocks.3.attn.b_to_q.bias', 'blocks.3.attn.b_to_k.bias', 'blocks.3.attn.b_to_v.bias'],
630
+ "model.diffusion_model.joint_blocks.3.context_block.attn.qkv.weight": ['blocks.3.attn.b_to_q.weight', 'blocks.3.attn.b_to_k.weight', 'blocks.3.attn.b_to_v.weight'],
631
+ "model.diffusion_model.joint_blocks.3.context_block.mlp.fc1.bias": "blocks.3.ff_b.0.bias",
632
+ "model.diffusion_model.joint_blocks.3.context_block.mlp.fc1.weight": "blocks.3.ff_b.0.weight",
633
+ "model.diffusion_model.joint_blocks.3.context_block.mlp.fc2.bias": "blocks.3.ff_b.2.bias",
634
+ "model.diffusion_model.joint_blocks.3.context_block.mlp.fc2.weight": "blocks.3.ff_b.2.weight",
635
+ "model.diffusion_model.joint_blocks.3.x_block.adaLN_modulation.1.bias": "blocks.3.norm1_a.linear.bias",
636
+ "model.diffusion_model.joint_blocks.3.x_block.adaLN_modulation.1.weight": "blocks.3.norm1_a.linear.weight",
637
+ "model.diffusion_model.joint_blocks.3.x_block.attn.proj.bias": "blocks.3.attn.a_to_out.bias",
638
+ "model.diffusion_model.joint_blocks.3.x_block.attn.proj.weight": "blocks.3.attn.a_to_out.weight",
639
+ "model.diffusion_model.joint_blocks.3.x_block.attn.qkv.bias": ['blocks.3.attn.a_to_q.bias', 'blocks.3.attn.a_to_k.bias', 'blocks.3.attn.a_to_v.bias'],
640
+ "model.diffusion_model.joint_blocks.3.x_block.attn.qkv.weight": ['blocks.3.attn.a_to_q.weight', 'blocks.3.attn.a_to_k.weight', 'blocks.3.attn.a_to_v.weight'],
641
+ "model.diffusion_model.joint_blocks.3.x_block.mlp.fc1.bias": "blocks.3.ff_a.0.bias",
642
+ "model.diffusion_model.joint_blocks.3.x_block.mlp.fc1.weight": "blocks.3.ff_a.0.weight",
643
+ "model.diffusion_model.joint_blocks.3.x_block.mlp.fc2.bias": "blocks.3.ff_a.2.bias",
644
+ "model.diffusion_model.joint_blocks.3.x_block.mlp.fc2.weight": "blocks.3.ff_a.2.weight",
645
+ "model.diffusion_model.joint_blocks.4.context_block.adaLN_modulation.1.bias": "blocks.4.norm1_b.linear.bias",
646
+ "model.diffusion_model.joint_blocks.4.context_block.adaLN_modulation.1.weight": "blocks.4.norm1_b.linear.weight",
647
+ "model.diffusion_model.joint_blocks.4.context_block.attn.proj.bias": "blocks.4.attn.b_to_out.bias",
648
+ "model.diffusion_model.joint_blocks.4.context_block.attn.proj.weight": "blocks.4.attn.b_to_out.weight",
649
+ "model.diffusion_model.joint_blocks.4.context_block.attn.qkv.bias": ['blocks.4.attn.b_to_q.bias', 'blocks.4.attn.b_to_k.bias', 'blocks.4.attn.b_to_v.bias'],
650
+ "model.diffusion_model.joint_blocks.4.context_block.attn.qkv.weight": ['blocks.4.attn.b_to_q.weight', 'blocks.4.attn.b_to_k.weight', 'blocks.4.attn.b_to_v.weight'],
651
+ "model.diffusion_model.joint_blocks.4.context_block.mlp.fc1.bias": "blocks.4.ff_b.0.bias",
652
+ "model.diffusion_model.joint_blocks.4.context_block.mlp.fc1.weight": "blocks.4.ff_b.0.weight",
653
+ "model.diffusion_model.joint_blocks.4.context_block.mlp.fc2.bias": "blocks.4.ff_b.2.bias",
654
+ "model.diffusion_model.joint_blocks.4.context_block.mlp.fc2.weight": "blocks.4.ff_b.2.weight",
655
+ "model.diffusion_model.joint_blocks.4.x_block.adaLN_modulation.1.bias": "blocks.4.norm1_a.linear.bias",
656
+ "model.diffusion_model.joint_blocks.4.x_block.adaLN_modulation.1.weight": "blocks.4.norm1_a.linear.weight",
657
+ "model.diffusion_model.joint_blocks.4.x_block.attn.proj.bias": "blocks.4.attn.a_to_out.bias",
658
+ "model.diffusion_model.joint_blocks.4.x_block.attn.proj.weight": "blocks.4.attn.a_to_out.weight",
659
+ "model.diffusion_model.joint_blocks.4.x_block.attn.qkv.bias": ['blocks.4.attn.a_to_q.bias', 'blocks.4.attn.a_to_k.bias', 'blocks.4.attn.a_to_v.bias'],
660
+ "model.diffusion_model.joint_blocks.4.x_block.attn.qkv.weight": ['blocks.4.attn.a_to_q.weight', 'blocks.4.attn.a_to_k.weight', 'blocks.4.attn.a_to_v.weight'],
661
+ "model.diffusion_model.joint_blocks.4.x_block.mlp.fc1.bias": "blocks.4.ff_a.0.bias",
662
+ "model.diffusion_model.joint_blocks.4.x_block.mlp.fc1.weight": "blocks.4.ff_a.0.weight",
663
+ "model.diffusion_model.joint_blocks.4.x_block.mlp.fc2.bias": "blocks.4.ff_a.2.bias",
664
+ "model.diffusion_model.joint_blocks.4.x_block.mlp.fc2.weight": "blocks.4.ff_a.2.weight",
665
+ "model.diffusion_model.joint_blocks.5.context_block.adaLN_modulation.1.bias": "blocks.5.norm1_b.linear.bias",
666
+ "model.diffusion_model.joint_blocks.5.context_block.adaLN_modulation.1.weight": "blocks.5.norm1_b.linear.weight",
667
+ "model.diffusion_model.joint_blocks.5.context_block.attn.proj.bias": "blocks.5.attn.b_to_out.bias",
668
+ "model.diffusion_model.joint_blocks.5.context_block.attn.proj.weight": "blocks.5.attn.b_to_out.weight",
669
+ "model.diffusion_model.joint_blocks.5.context_block.attn.qkv.bias": ['blocks.5.attn.b_to_q.bias', 'blocks.5.attn.b_to_k.bias', 'blocks.5.attn.b_to_v.bias'],
670
+ "model.diffusion_model.joint_blocks.5.context_block.attn.qkv.weight": ['blocks.5.attn.b_to_q.weight', 'blocks.5.attn.b_to_k.weight', 'blocks.5.attn.b_to_v.weight'],
671
+ "model.diffusion_model.joint_blocks.5.context_block.mlp.fc1.bias": "blocks.5.ff_b.0.bias",
672
+ "model.diffusion_model.joint_blocks.5.context_block.mlp.fc1.weight": "blocks.5.ff_b.0.weight",
673
+ "model.diffusion_model.joint_blocks.5.context_block.mlp.fc2.bias": "blocks.5.ff_b.2.bias",
674
+ "model.diffusion_model.joint_blocks.5.context_block.mlp.fc2.weight": "blocks.5.ff_b.2.weight",
675
+ "model.diffusion_model.joint_blocks.5.x_block.adaLN_modulation.1.bias": "blocks.5.norm1_a.linear.bias",
676
+ "model.diffusion_model.joint_blocks.5.x_block.adaLN_modulation.1.weight": "blocks.5.norm1_a.linear.weight",
677
+ "model.diffusion_model.joint_blocks.5.x_block.attn.proj.bias": "blocks.5.attn.a_to_out.bias",
678
+ "model.diffusion_model.joint_blocks.5.x_block.attn.proj.weight": "blocks.5.attn.a_to_out.weight",
679
+ "model.diffusion_model.joint_blocks.5.x_block.attn.qkv.bias": ['blocks.5.attn.a_to_q.bias', 'blocks.5.attn.a_to_k.bias', 'blocks.5.attn.a_to_v.bias'],
680
+ "model.diffusion_model.joint_blocks.5.x_block.attn.qkv.weight": ['blocks.5.attn.a_to_q.weight', 'blocks.5.attn.a_to_k.weight', 'blocks.5.attn.a_to_v.weight'],
681
+ "model.diffusion_model.joint_blocks.5.x_block.mlp.fc1.bias": "blocks.5.ff_a.0.bias",
682
+ "model.diffusion_model.joint_blocks.5.x_block.mlp.fc1.weight": "blocks.5.ff_a.0.weight",
683
+ "model.diffusion_model.joint_blocks.5.x_block.mlp.fc2.bias": "blocks.5.ff_a.2.bias",
684
+ "model.diffusion_model.joint_blocks.5.x_block.mlp.fc2.weight": "blocks.5.ff_a.2.weight",
685
+ "model.diffusion_model.joint_blocks.6.context_block.adaLN_modulation.1.bias": "blocks.6.norm1_b.linear.bias",
686
+ "model.diffusion_model.joint_blocks.6.context_block.adaLN_modulation.1.weight": "blocks.6.norm1_b.linear.weight",
687
+ "model.diffusion_model.joint_blocks.6.context_block.attn.proj.bias": "blocks.6.attn.b_to_out.bias",
688
+ "model.diffusion_model.joint_blocks.6.context_block.attn.proj.weight": "blocks.6.attn.b_to_out.weight",
689
+ "model.diffusion_model.joint_blocks.6.context_block.attn.qkv.bias": ['blocks.6.attn.b_to_q.bias', 'blocks.6.attn.b_to_k.bias', 'blocks.6.attn.b_to_v.bias'],
690
+ "model.diffusion_model.joint_blocks.6.context_block.attn.qkv.weight": ['blocks.6.attn.b_to_q.weight', 'blocks.6.attn.b_to_k.weight', 'blocks.6.attn.b_to_v.weight'],
691
+ "model.diffusion_model.joint_blocks.6.context_block.mlp.fc1.bias": "blocks.6.ff_b.0.bias",
692
+ "model.diffusion_model.joint_blocks.6.context_block.mlp.fc1.weight": "blocks.6.ff_b.0.weight",
693
+ "model.diffusion_model.joint_blocks.6.context_block.mlp.fc2.bias": "blocks.6.ff_b.2.bias",
694
+ "model.diffusion_model.joint_blocks.6.context_block.mlp.fc2.weight": "blocks.6.ff_b.2.weight",
695
+ "model.diffusion_model.joint_blocks.6.x_block.adaLN_modulation.1.bias": "blocks.6.norm1_a.linear.bias",
696
+ "model.diffusion_model.joint_blocks.6.x_block.adaLN_modulation.1.weight": "blocks.6.norm1_a.linear.weight",
697
+ "model.diffusion_model.joint_blocks.6.x_block.attn.proj.bias": "blocks.6.attn.a_to_out.bias",
698
+ "model.diffusion_model.joint_blocks.6.x_block.attn.proj.weight": "blocks.6.attn.a_to_out.weight",
699
+ "model.diffusion_model.joint_blocks.6.x_block.attn.qkv.bias": ['blocks.6.attn.a_to_q.bias', 'blocks.6.attn.a_to_k.bias', 'blocks.6.attn.a_to_v.bias'],
700
+ "model.diffusion_model.joint_blocks.6.x_block.attn.qkv.weight": ['blocks.6.attn.a_to_q.weight', 'blocks.6.attn.a_to_k.weight', 'blocks.6.attn.a_to_v.weight'],
701
+ "model.diffusion_model.joint_blocks.6.x_block.mlp.fc1.bias": "blocks.6.ff_a.0.bias",
702
+ "model.diffusion_model.joint_blocks.6.x_block.mlp.fc1.weight": "blocks.6.ff_a.0.weight",
703
+ "model.diffusion_model.joint_blocks.6.x_block.mlp.fc2.bias": "blocks.6.ff_a.2.bias",
704
+ "model.diffusion_model.joint_blocks.6.x_block.mlp.fc2.weight": "blocks.6.ff_a.2.weight",
705
+ "model.diffusion_model.joint_blocks.7.context_block.adaLN_modulation.1.bias": "blocks.7.norm1_b.linear.bias",
706
+ "model.diffusion_model.joint_blocks.7.context_block.adaLN_modulation.1.weight": "blocks.7.norm1_b.linear.weight",
707
+ "model.diffusion_model.joint_blocks.7.context_block.attn.proj.bias": "blocks.7.attn.b_to_out.bias",
708
+ "model.diffusion_model.joint_blocks.7.context_block.attn.proj.weight": "blocks.7.attn.b_to_out.weight",
709
+ "model.diffusion_model.joint_blocks.7.context_block.attn.qkv.bias": ['blocks.7.attn.b_to_q.bias', 'blocks.7.attn.b_to_k.bias', 'blocks.7.attn.b_to_v.bias'],
710
+ "model.diffusion_model.joint_blocks.7.context_block.attn.qkv.weight": ['blocks.7.attn.b_to_q.weight', 'blocks.7.attn.b_to_k.weight', 'blocks.7.attn.b_to_v.weight'],
711
+ "model.diffusion_model.joint_blocks.7.context_block.mlp.fc1.bias": "blocks.7.ff_b.0.bias",
712
+ "model.diffusion_model.joint_blocks.7.context_block.mlp.fc1.weight": "blocks.7.ff_b.0.weight",
713
+ "model.diffusion_model.joint_blocks.7.context_block.mlp.fc2.bias": "blocks.7.ff_b.2.bias",
714
+ "model.diffusion_model.joint_blocks.7.context_block.mlp.fc2.weight": "blocks.7.ff_b.2.weight",
715
+ "model.diffusion_model.joint_blocks.7.x_block.adaLN_modulation.1.bias": "blocks.7.norm1_a.linear.bias",
716
+ "model.diffusion_model.joint_blocks.7.x_block.adaLN_modulation.1.weight": "blocks.7.norm1_a.linear.weight",
717
+ "model.diffusion_model.joint_blocks.7.x_block.attn.proj.bias": "blocks.7.attn.a_to_out.bias",
718
+ "model.diffusion_model.joint_blocks.7.x_block.attn.proj.weight": "blocks.7.attn.a_to_out.weight",
719
+ "model.diffusion_model.joint_blocks.7.x_block.attn.qkv.bias": ['blocks.7.attn.a_to_q.bias', 'blocks.7.attn.a_to_k.bias', 'blocks.7.attn.a_to_v.bias'],
720
+ "model.diffusion_model.joint_blocks.7.x_block.attn.qkv.weight": ['blocks.7.attn.a_to_q.weight', 'blocks.7.attn.a_to_k.weight', 'blocks.7.attn.a_to_v.weight'],
721
+ "model.diffusion_model.joint_blocks.7.x_block.mlp.fc1.bias": "blocks.7.ff_a.0.bias",
722
+ "model.diffusion_model.joint_blocks.7.x_block.mlp.fc1.weight": "blocks.7.ff_a.0.weight",
723
+ "model.diffusion_model.joint_blocks.7.x_block.mlp.fc2.bias": "blocks.7.ff_a.2.bias",
724
+ "model.diffusion_model.joint_blocks.7.x_block.mlp.fc2.weight": "blocks.7.ff_a.2.weight",
725
+ "model.diffusion_model.joint_blocks.8.context_block.adaLN_modulation.1.bias": "blocks.8.norm1_b.linear.bias",
726
+ "model.diffusion_model.joint_blocks.8.context_block.adaLN_modulation.1.weight": "blocks.8.norm1_b.linear.weight",
727
+ "model.diffusion_model.joint_blocks.8.context_block.attn.proj.bias": "blocks.8.attn.b_to_out.bias",
728
+ "model.diffusion_model.joint_blocks.8.context_block.attn.proj.weight": "blocks.8.attn.b_to_out.weight",
729
+ "model.diffusion_model.joint_blocks.8.context_block.attn.qkv.bias": ['blocks.8.attn.b_to_q.bias', 'blocks.8.attn.b_to_k.bias', 'blocks.8.attn.b_to_v.bias'],
730
+ "model.diffusion_model.joint_blocks.8.context_block.attn.qkv.weight": ['blocks.8.attn.b_to_q.weight', 'blocks.8.attn.b_to_k.weight', 'blocks.8.attn.b_to_v.weight'],
731
+ "model.diffusion_model.joint_blocks.8.context_block.mlp.fc1.bias": "blocks.8.ff_b.0.bias",
732
+ "model.diffusion_model.joint_blocks.8.context_block.mlp.fc1.weight": "blocks.8.ff_b.0.weight",
733
+ "model.diffusion_model.joint_blocks.8.context_block.mlp.fc2.bias": "blocks.8.ff_b.2.bias",
734
+ "model.diffusion_model.joint_blocks.8.context_block.mlp.fc2.weight": "blocks.8.ff_b.2.weight",
735
+ "model.diffusion_model.joint_blocks.8.x_block.adaLN_modulation.1.bias": "blocks.8.norm1_a.linear.bias",
736
+ "model.diffusion_model.joint_blocks.8.x_block.adaLN_modulation.1.weight": "blocks.8.norm1_a.linear.weight",
737
+ "model.diffusion_model.joint_blocks.8.x_block.attn.proj.bias": "blocks.8.attn.a_to_out.bias",
738
+ "model.diffusion_model.joint_blocks.8.x_block.attn.proj.weight": "blocks.8.attn.a_to_out.weight",
739
+ "model.diffusion_model.joint_blocks.8.x_block.attn.qkv.bias": ['blocks.8.attn.a_to_q.bias', 'blocks.8.attn.a_to_k.bias', 'blocks.8.attn.a_to_v.bias'],
740
+ "model.diffusion_model.joint_blocks.8.x_block.attn.qkv.weight": ['blocks.8.attn.a_to_q.weight', 'blocks.8.attn.a_to_k.weight', 'blocks.8.attn.a_to_v.weight'],
741
+ "model.diffusion_model.joint_blocks.8.x_block.mlp.fc1.bias": "blocks.8.ff_a.0.bias",
742
+ "model.diffusion_model.joint_blocks.8.x_block.mlp.fc1.weight": "blocks.8.ff_a.0.weight",
743
+ "model.diffusion_model.joint_blocks.8.x_block.mlp.fc2.bias": "blocks.8.ff_a.2.bias",
744
+ "model.diffusion_model.joint_blocks.8.x_block.mlp.fc2.weight": "blocks.8.ff_a.2.weight",
745
+ "model.diffusion_model.joint_blocks.9.context_block.adaLN_modulation.1.bias": "blocks.9.norm1_b.linear.bias",
746
+ "model.diffusion_model.joint_blocks.9.context_block.adaLN_modulation.1.weight": "blocks.9.norm1_b.linear.weight",
747
+ "model.diffusion_model.joint_blocks.9.context_block.attn.proj.bias": "blocks.9.attn.b_to_out.bias",
748
+ "model.diffusion_model.joint_blocks.9.context_block.attn.proj.weight": "blocks.9.attn.b_to_out.weight",
749
+ "model.diffusion_model.joint_blocks.9.context_block.attn.qkv.bias": ['blocks.9.attn.b_to_q.bias', 'blocks.9.attn.b_to_k.bias', 'blocks.9.attn.b_to_v.bias'],
750
+ "model.diffusion_model.joint_blocks.9.context_block.attn.qkv.weight": ['blocks.9.attn.b_to_q.weight', 'blocks.9.attn.b_to_k.weight', 'blocks.9.attn.b_to_v.weight'],
751
+ "model.diffusion_model.joint_blocks.9.context_block.mlp.fc1.bias": "blocks.9.ff_b.0.bias",
752
+ "model.diffusion_model.joint_blocks.9.context_block.mlp.fc1.weight": "blocks.9.ff_b.0.weight",
753
+ "model.diffusion_model.joint_blocks.9.context_block.mlp.fc2.bias": "blocks.9.ff_b.2.bias",
754
+ "model.diffusion_model.joint_blocks.9.context_block.mlp.fc2.weight": "blocks.9.ff_b.2.weight",
755
+ "model.diffusion_model.joint_blocks.9.x_block.adaLN_modulation.1.bias": "blocks.9.norm1_a.linear.bias",
756
+ "model.diffusion_model.joint_blocks.9.x_block.adaLN_modulation.1.weight": "blocks.9.norm1_a.linear.weight",
757
+ "model.diffusion_model.joint_blocks.9.x_block.attn.proj.bias": "blocks.9.attn.a_to_out.bias",
758
+ "model.diffusion_model.joint_blocks.9.x_block.attn.proj.weight": "blocks.9.attn.a_to_out.weight",
759
+ "model.diffusion_model.joint_blocks.9.x_block.attn.qkv.bias": ['blocks.9.attn.a_to_q.bias', 'blocks.9.attn.a_to_k.bias', 'blocks.9.attn.a_to_v.bias'],
760
+ "model.diffusion_model.joint_blocks.9.x_block.attn.qkv.weight": ['blocks.9.attn.a_to_q.weight', 'blocks.9.attn.a_to_k.weight', 'blocks.9.attn.a_to_v.weight'],
761
+ "model.diffusion_model.joint_blocks.9.x_block.mlp.fc1.bias": "blocks.9.ff_a.0.bias",
762
+ "model.diffusion_model.joint_blocks.9.x_block.mlp.fc1.weight": "blocks.9.ff_a.0.weight",
763
+ "model.diffusion_model.joint_blocks.9.x_block.mlp.fc2.bias": "blocks.9.ff_a.2.bias",
764
+ "model.diffusion_model.joint_blocks.9.x_block.mlp.fc2.weight": "blocks.9.ff_a.2.weight",
765
+ "model.diffusion_model.pos_embed": "pos_embedder.pos_embed",
766
+ "model.diffusion_model.t_embedder.mlp.0.bias": "time_embedder.timestep_embedder.0.bias",
767
+ "model.diffusion_model.t_embedder.mlp.0.weight": "time_embedder.timestep_embedder.0.weight",
768
+ "model.diffusion_model.t_embedder.mlp.2.bias": "time_embedder.timestep_embedder.2.bias",
769
+ "model.diffusion_model.t_embedder.mlp.2.weight": "time_embedder.timestep_embedder.2.weight",
770
+ "model.diffusion_model.x_embedder.proj.bias": "pos_embedder.proj.bias",
771
+ "model.diffusion_model.x_embedder.proj.weight": "pos_embedder.proj.weight",
772
+ "model.diffusion_model.y_embedder.mlp.0.bias": "pooled_text_embedder.0.bias",
773
+ "model.diffusion_model.y_embedder.mlp.0.weight": "pooled_text_embedder.0.weight",
774
+ "model.diffusion_model.y_embedder.mlp.2.bias": "pooled_text_embedder.2.bias",
775
+ "model.diffusion_model.y_embedder.mlp.2.weight": "pooled_text_embedder.2.weight",
776
+
777
+ "model.diffusion_model.joint_blocks.23.context_block.adaLN_modulation.1.weight": "blocks.23.norm1_b.linear.weight",
778
+ "model.diffusion_model.joint_blocks.23.context_block.adaLN_modulation.1.bias": "blocks.23.norm1_b.linear.bias",
779
+ "model.diffusion_model.final_layer.adaLN_modulation.1.weight": "norm_out.linear.weight",
780
+ "model.diffusion_model.final_layer.adaLN_modulation.1.bias": "norm_out.linear.bias",
781
+ }
782
+ state_dict_ = {}
783
+ for name in state_dict:
784
+ if name in rename_dict:
785
+ param = state_dict[name]
786
+ if name.startswith("model.diffusion_model.joint_blocks.23.context_block.adaLN_modulation.1."):
787
+ param = torch.concat([param[1536:], param[:1536]], axis=0)
788
+ elif name.startswith("model.diffusion_model.final_layer.adaLN_modulation.1."):
789
+ param = torch.concat([param[1536:], param[:1536]], axis=0)
790
+ elif name == "model.diffusion_model.pos_embed":
791
+ param = param.reshape((1, 192, 192, 1536))
792
+ if isinstance(rename_dict[name], str):
793
+ state_dict_[rename_dict[name]] = param
794
+ else:
795
+ name_ = rename_dict[name][0].replace(".a_to_q.", ".a_to_qkv.").replace(".b_to_q.", ".b_to_qkv.")
796
+ state_dict_[name_] = param
797
+ return state_dict_
diffsynth/models/sd3_text_encoder.py ADDED
The diff for this file is too large to render. See raw diff
 
diffsynth/models/sd3_vae_decoder.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_vae_decoder import VAEAttentionBlock, SDVAEDecoderStateDictConverter
3
+ from .sd_unet import ResnetBlock, UpSampler
4
+ from .tiler import TileWorker
5
+
6
+
7
+
8
+ class SD3VAEDecoder(torch.nn.Module):
9
+ def __init__(self):
10
+ super().__init__()
11
+ self.scaling_factor = 1.5305 # Different from SD 1.x
12
+ self.shift_factor = 0.0609 # Different from SD 1.x
13
+ self.conv_in = torch.nn.Conv2d(16, 512, kernel_size=3, padding=1) # Different from SD 1.x
14
+
15
+ self.blocks = torch.nn.ModuleList([
16
+ # UNetMidBlock2D
17
+ ResnetBlock(512, 512, eps=1e-6),
18
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
19
+ ResnetBlock(512, 512, eps=1e-6),
20
+ # UpDecoderBlock2D
21
+ ResnetBlock(512, 512, eps=1e-6),
22
+ ResnetBlock(512, 512, eps=1e-6),
23
+ ResnetBlock(512, 512, eps=1e-6),
24
+ UpSampler(512),
25
+ # UpDecoderBlock2D
26
+ ResnetBlock(512, 512, eps=1e-6),
27
+ ResnetBlock(512, 512, eps=1e-6),
28
+ ResnetBlock(512, 512, eps=1e-6),
29
+ UpSampler(512),
30
+ # UpDecoderBlock2D
31
+ ResnetBlock(512, 256, eps=1e-6),
32
+ ResnetBlock(256, 256, eps=1e-6),
33
+ ResnetBlock(256, 256, eps=1e-6),
34
+ UpSampler(256),
35
+ # UpDecoderBlock2D
36
+ ResnetBlock(256, 128, eps=1e-6),
37
+ ResnetBlock(128, 128, eps=1e-6),
38
+ ResnetBlock(128, 128, eps=1e-6),
39
+ ])
40
+
41
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6)
42
+ self.conv_act = torch.nn.SiLU()
43
+ self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
44
+
45
+ def tiled_forward(self, sample, tile_size=64, tile_stride=32):
46
+ hidden_states = TileWorker().tiled_forward(
47
+ lambda x: self.forward(x),
48
+ sample,
49
+ tile_size,
50
+ tile_stride,
51
+ tile_device=sample.device,
52
+ tile_dtype=sample.dtype
53
+ )
54
+ return hidden_states
55
+
56
+ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
57
+ # For VAE Decoder, we do not need to apply the tiler on each layer.
58
+ if tiled:
59
+ return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
60
+
61
+ # 1. pre-process
62
+ hidden_states = sample / self.scaling_factor + self.shift_factor
63
+ hidden_states = self.conv_in(hidden_states)
64
+ time_emb = None
65
+ text_emb = None
66
+ res_stack = None
67
+
68
+ # 2. blocks
69
+ for i, block in enumerate(self.blocks):
70
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
71
+
72
+ # 3. output
73
+ hidden_states = self.conv_norm_out(hidden_states)
74
+ hidden_states = self.conv_act(hidden_states)
75
+ hidden_states = self.conv_out(hidden_states)
76
+
77
+ return hidden_states
78
+
79
+ def state_dict_converter(self):
80
+ return SDVAEDecoderStateDictConverter()
diffsynth/models/sd3_vae_encoder.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_unet import ResnetBlock, DownSampler
3
+ from .sd_vae_encoder import VAEAttentionBlock, SDVAEEncoderStateDictConverter
4
+ from .tiler import TileWorker
5
+ from einops import rearrange
6
+
7
+
8
+ class SD3VAEEncoder(torch.nn.Module):
9
+ def __init__(self):
10
+ super().__init__()
11
+ self.scaling_factor = 1.5305 # Different from SD 1.x
12
+ self.shift_factor = 0.0609 # Different from SD 1.x
13
+ self.conv_in = torch.nn.Conv2d(3, 128, kernel_size=3, padding=1)
14
+
15
+ self.blocks = torch.nn.ModuleList([
16
+ # DownEncoderBlock2D
17
+ ResnetBlock(128, 128, eps=1e-6),
18
+ ResnetBlock(128, 128, eps=1e-6),
19
+ DownSampler(128, padding=0, extra_padding=True),
20
+ # DownEncoderBlock2D
21
+ ResnetBlock(128, 256, eps=1e-6),
22
+ ResnetBlock(256, 256, eps=1e-6),
23
+ DownSampler(256, padding=0, extra_padding=True),
24
+ # DownEncoderBlock2D
25
+ ResnetBlock(256, 512, eps=1e-6),
26
+ ResnetBlock(512, 512, eps=1e-6),
27
+ DownSampler(512, padding=0, extra_padding=True),
28
+ # DownEncoderBlock2D
29
+ ResnetBlock(512, 512, eps=1e-6),
30
+ ResnetBlock(512, 512, eps=1e-6),
31
+ # UNetMidBlock2D
32
+ ResnetBlock(512, 512, eps=1e-6),
33
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
34
+ ResnetBlock(512, 512, eps=1e-6),
35
+ ])
36
+
37
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6)
38
+ self.conv_act = torch.nn.SiLU()
39
+ self.conv_out = torch.nn.Conv2d(512, 32, kernel_size=3, padding=1)
40
+
41
+ def tiled_forward(self, sample, tile_size=64, tile_stride=32):
42
+ hidden_states = TileWorker().tiled_forward(
43
+ lambda x: self.forward(x),
44
+ sample,
45
+ tile_size,
46
+ tile_stride,
47
+ tile_device=sample.device,
48
+ tile_dtype=sample.dtype
49
+ )
50
+ return hidden_states
51
+
52
+ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
53
+ # For VAE Decoder, we do not need to apply the tiler on each layer.
54
+ if tiled:
55
+ return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
56
+
57
+ # 1. pre-process
58
+ hidden_states = self.conv_in(sample)
59
+ time_emb = None
60
+ text_emb = None
61
+ res_stack = None
62
+
63
+ # 2. blocks
64
+ for i, block in enumerate(self.blocks):
65
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
66
+
67
+ # 3. output
68
+ hidden_states = self.conv_norm_out(hidden_states)
69
+ hidden_states = self.conv_act(hidden_states)
70
+ hidden_states = self.conv_out(hidden_states)
71
+ hidden_states = hidden_states[:, :16]
72
+ hidden_states = (hidden_states - self.shift_factor) * self.scaling_factor
73
+
74
+ return hidden_states
75
+
76
+ def encode_video(self, sample, batch_size=8):
77
+ B = sample.shape[0]
78
+ hidden_states = []
79
+
80
+ for i in range(0, sample.shape[2], batch_size):
81
+
82
+ j = min(i + batch_size, sample.shape[2])
83
+ sample_batch = rearrange(sample[:,:,i:j], "B C T H W -> (B T) C H W")
84
+
85
+ hidden_states_batch = self(sample_batch)
86
+ hidden_states_batch = rearrange(hidden_states_batch, "(B T) C H W -> B C T H W", B=B)
87
+
88
+ hidden_states.append(hidden_states_batch)
89
+
90
+ hidden_states = torch.concat(hidden_states, dim=2)
91
+ return hidden_states
92
+
93
+ def state_dict_converter(self):
94
+ return SDVAEEncoderStateDictConverter()
diffsynth/models/sd_controlnet.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_unet import Timesteps, ResnetBlock, AttentionBlock, PushBlock, DownSampler
3
+ from .tiler import TileWorker
4
+
5
+
6
+ class ControlNetConditioningLayer(torch.nn.Module):
7
+ def __init__(self, channels = (3, 16, 32, 96, 256, 320)):
8
+ super().__init__()
9
+ self.blocks = torch.nn.ModuleList([])
10
+ self.blocks.append(torch.nn.Conv2d(channels[0], channels[1], kernel_size=3, padding=1))
11
+ self.blocks.append(torch.nn.SiLU())
12
+ for i in range(1, len(channels) - 2):
13
+ self.blocks.append(torch.nn.Conv2d(channels[i], channels[i], kernel_size=3, padding=1))
14
+ self.blocks.append(torch.nn.SiLU())
15
+ self.blocks.append(torch.nn.Conv2d(channels[i], channels[i+1], kernel_size=3, padding=1, stride=2))
16
+ self.blocks.append(torch.nn.SiLU())
17
+ self.blocks.append(torch.nn.Conv2d(channels[-2], channels[-1], kernel_size=3, padding=1))
18
+
19
+ def forward(self, conditioning):
20
+ for block in self.blocks:
21
+ conditioning = block(conditioning)
22
+ return conditioning
23
+
24
+
25
+ class SDControlNet(torch.nn.Module):
26
+ def __init__(self, global_pool=False):
27
+ super().__init__()
28
+ self.time_proj = Timesteps(320)
29
+ self.time_embedding = torch.nn.Sequential(
30
+ torch.nn.Linear(320, 1280),
31
+ torch.nn.SiLU(),
32
+ torch.nn.Linear(1280, 1280)
33
+ )
34
+ self.conv_in = torch.nn.Conv2d(4, 320, kernel_size=3, padding=1)
35
+
36
+ self.controlnet_conv_in = ControlNetConditioningLayer(channels=(3, 16, 32, 96, 256, 320))
37
+
38
+ self.blocks = torch.nn.ModuleList([
39
+ # CrossAttnDownBlock2D
40
+ ResnetBlock(320, 320, 1280),
41
+ AttentionBlock(8, 40, 320, 1, 768),
42
+ PushBlock(),
43
+ ResnetBlock(320, 320, 1280),
44
+ AttentionBlock(8, 40, 320, 1, 768),
45
+ PushBlock(),
46
+ DownSampler(320),
47
+ PushBlock(),
48
+ # CrossAttnDownBlock2D
49
+ ResnetBlock(320, 640, 1280),
50
+ AttentionBlock(8, 80, 640, 1, 768),
51
+ PushBlock(),
52
+ ResnetBlock(640, 640, 1280),
53
+ AttentionBlock(8, 80, 640, 1, 768),
54
+ PushBlock(),
55
+ DownSampler(640),
56
+ PushBlock(),
57
+ # CrossAttnDownBlock2D
58
+ ResnetBlock(640, 1280, 1280),
59
+ AttentionBlock(8, 160, 1280, 1, 768),
60
+ PushBlock(),
61
+ ResnetBlock(1280, 1280, 1280),
62
+ AttentionBlock(8, 160, 1280, 1, 768),
63
+ PushBlock(),
64
+ DownSampler(1280),
65
+ PushBlock(),
66
+ # DownBlock2D
67
+ ResnetBlock(1280, 1280, 1280),
68
+ PushBlock(),
69
+ ResnetBlock(1280, 1280, 1280),
70
+ PushBlock(),
71
+ # UNetMidBlock2DCrossAttn
72
+ ResnetBlock(1280, 1280, 1280),
73
+ AttentionBlock(8, 160, 1280, 1, 768),
74
+ ResnetBlock(1280, 1280, 1280),
75
+ PushBlock()
76
+ ])
77
+
78
+ self.controlnet_blocks = torch.nn.ModuleList([
79
+ torch.nn.Conv2d(320, 320, kernel_size=(1, 1)),
80
+ torch.nn.Conv2d(320, 320, kernel_size=(1, 1), bias=False),
81
+ torch.nn.Conv2d(320, 320, kernel_size=(1, 1), bias=False),
82
+ torch.nn.Conv2d(320, 320, kernel_size=(1, 1), bias=False),
83
+ torch.nn.Conv2d(640, 640, kernel_size=(1, 1)),
84
+ torch.nn.Conv2d(640, 640, kernel_size=(1, 1), bias=False),
85
+ torch.nn.Conv2d(640, 640, kernel_size=(1, 1), bias=False),
86
+ torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1)),
87
+ torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
88
+ torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
89
+ torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
90
+ torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
91
+ torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
92
+ ])
93
+
94
+ self.global_pool = global_pool
95
+
96
+ def forward(
97
+ self,
98
+ sample, timestep, encoder_hidden_states, conditioning,
99
+ tiled=False, tile_size=64, tile_stride=32,
100
+ ):
101
+ # 1. time
102
+ time_emb = self.time_proj(timestep[None]).to(sample.dtype)
103
+ time_emb = self.time_embedding(time_emb)
104
+ time_emb = time_emb.repeat(sample.shape[0], 1)
105
+
106
+ # 2. pre-process
107
+ height, width = sample.shape[2], sample.shape[3]
108
+ hidden_states = self.conv_in(sample) + self.controlnet_conv_in(conditioning)
109
+ text_emb = encoder_hidden_states
110
+ res_stack = [hidden_states]
111
+
112
+ # 3. blocks
113
+ for i, block in enumerate(self.blocks):
114
+ if tiled and not isinstance(block, PushBlock):
115
+ _, _, inter_height, _ = hidden_states.shape
116
+ resize_scale = inter_height / height
117
+ hidden_states = TileWorker().tiled_forward(
118
+ lambda x: block(x, time_emb, text_emb, res_stack)[0],
119
+ hidden_states,
120
+ int(tile_size * resize_scale),
121
+ int(tile_stride * resize_scale),
122
+ tile_device=hidden_states.device,
123
+ tile_dtype=hidden_states.dtype
124
+ )
125
+ else:
126
+ hidden_states, _, _, _ = block(hidden_states, time_emb, text_emb, res_stack)
127
+
128
+ # 4. ControlNet blocks
129
+ controlnet_res_stack = [block(res) for block, res in zip(self.controlnet_blocks, res_stack)]
130
+
131
+ # pool
132
+ if self.global_pool:
133
+ controlnet_res_stack = [res.mean(dim=(2, 3), keepdim=True) for res in controlnet_res_stack]
134
+
135
+ return controlnet_res_stack
136
+
137
+ def state_dict_converter(self):
138
+ return SDControlNetStateDictConverter()
139
+
140
+
141
+ class SDControlNetStateDictConverter:
142
+ def __init__(self):
143
+ pass
144
+
145
+ def from_diffusers(self, state_dict):
146
+ # architecture
147
+ block_types = [
148
+ 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'DownSampler', 'PushBlock',
149
+ 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'DownSampler', 'PushBlock',
150
+ 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'DownSampler', 'PushBlock',
151
+ 'ResnetBlock', 'PushBlock', 'ResnetBlock', 'PushBlock',
152
+ 'ResnetBlock', 'AttentionBlock', 'ResnetBlock',
153
+ 'PopBlock', 'ResnetBlock', 'PopBlock', 'ResnetBlock', 'PopBlock', 'ResnetBlock', 'UpSampler',
154
+ 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'UpSampler',
155
+ 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'UpSampler',
156
+ 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock'
157
+ ]
158
+
159
+ # controlnet_rename_dict
160
+ controlnet_rename_dict = {
161
+ "controlnet_cond_embedding.conv_in.weight": "controlnet_conv_in.blocks.0.weight",
162
+ "controlnet_cond_embedding.conv_in.bias": "controlnet_conv_in.blocks.0.bias",
163
+ "controlnet_cond_embedding.blocks.0.weight": "controlnet_conv_in.blocks.2.weight",
164
+ "controlnet_cond_embedding.blocks.0.bias": "controlnet_conv_in.blocks.2.bias",
165
+ "controlnet_cond_embedding.blocks.1.weight": "controlnet_conv_in.blocks.4.weight",
166
+ "controlnet_cond_embedding.blocks.1.bias": "controlnet_conv_in.blocks.4.bias",
167
+ "controlnet_cond_embedding.blocks.2.weight": "controlnet_conv_in.blocks.6.weight",
168
+ "controlnet_cond_embedding.blocks.2.bias": "controlnet_conv_in.blocks.6.bias",
169
+ "controlnet_cond_embedding.blocks.3.weight": "controlnet_conv_in.blocks.8.weight",
170
+ "controlnet_cond_embedding.blocks.3.bias": "controlnet_conv_in.blocks.8.bias",
171
+ "controlnet_cond_embedding.blocks.4.weight": "controlnet_conv_in.blocks.10.weight",
172
+ "controlnet_cond_embedding.blocks.4.bias": "controlnet_conv_in.blocks.10.bias",
173
+ "controlnet_cond_embedding.blocks.5.weight": "controlnet_conv_in.blocks.12.weight",
174
+ "controlnet_cond_embedding.blocks.5.bias": "controlnet_conv_in.blocks.12.bias",
175
+ "controlnet_cond_embedding.conv_out.weight": "controlnet_conv_in.blocks.14.weight",
176
+ "controlnet_cond_embedding.conv_out.bias": "controlnet_conv_in.blocks.14.bias",
177
+ }
178
+
179
+ # Rename each parameter
180
+ name_list = sorted([name for name in state_dict])
181
+ rename_dict = {}
182
+ block_id = {"ResnetBlock": -1, "AttentionBlock": -1, "DownSampler": -1, "UpSampler": -1}
183
+ last_block_type_with_id = {"ResnetBlock": "", "AttentionBlock": "", "DownSampler": "", "UpSampler": ""}
184
+ for name in name_list:
185
+ names = name.split(".")
186
+ if names[0] in ["conv_in", "conv_norm_out", "conv_out"]:
187
+ pass
188
+ elif name in controlnet_rename_dict:
189
+ names = controlnet_rename_dict[name].split(".")
190
+ elif names[0] == "controlnet_down_blocks":
191
+ names[0] = "controlnet_blocks"
192
+ elif names[0] == "controlnet_mid_block":
193
+ names = ["controlnet_blocks", "12", names[-1]]
194
+ elif names[0] in ["time_embedding", "add_embedding"]:
195
+ if names[0] == "add_embedding":
196
+ names[0] = "add_time_embedding"
197
+ names[1] = {"linear_1": "0", "linear_2": "2"}[names[1]]
198
+ elif names[0] in ["down_blocks", "mid_block", "up_blocks"]:
199
+ if names[0] == "mid_block":
200
+ names.insert(1, "0")
201
+ block_type = {"resnets": "ResnetBlock", "attentions": "AttentionBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}[names[2]]
202
+ block_type_with_id = ".".join(names[:4])
203
+ if block_type_with_id != last_block_type_with_id[block_type]:
204
+ block_id[block_type] += 1
205
+ last_block_type_with_id[block_type] = block_type_with_id
206
+ while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
207
+ block_id[block_type] += 1
208
+ block_type_with_id = ".".join(names[:4])
209
+ names = ["blocks", str(block_id[block_type])] + names[4:]
210
+ if "ff" in names:
211
+ ff_index = names.index("ff")
212
+ component = ".".join(names[ff_index:ff_index+3])
213
+ component = {"ff.net.0": "act_fn", "ff.net.2": "ff"}[component]
214
+ names = names[:ff_index] + [component] + names[ff_index+3:]
215
+ if "to_out" in names:
216
+ names.pop(names.index("to_out") + 1)
217
+ else:
218
+ raise ValueError(f"Unknown parameters: {name}")
219
+ rename_dict[name] = ".".join(names)
220
+
221
+ # Convert state_dict
222
+ state_dict_ = {}
223
+ for name, param in state_dict.items():
224
+ if ".proj_in." in name or ".proj_out." in name:
225
+ param = param.squeeze()
226
+ if rename_dict[name] in [
227
+ "controlnet_blocks.1.bias", "controlnet_blocks.2.bias", "controlnet_blocks.3.bias", "controlnet_blocks.5.bias", "controlnet_blocks.6.bias",
228
+ "controlnet_blocks.8.bias", "controlnet_blocks.9.bias", "controlnet_blocks.10.bias", "controlnet_blocks.11.bias", "controlnet_blocks.12.bias"
229
+ ]:
230
+ continue
231
+ state_dict_[rename_dict[name]] = param
232
+ return state_dict_
233
+
234
+ def from_civitai(self, state_dict):
235
+ if "mid_block.resnets.1.time_emb_proj.weight" in state_dict:
236
+ # For controlnets in diffusers format
237
+ return self.from_diffusers(state_dict)
238
+ rename_dict = {
239
+ "control_model.time_embed.0.weight": "time_embedding.0.weight",
240
+ "control_model.time_embed.0.bias": "time_embedding.0.bias",
241
+ "control_model.time_embed.2.weight": "time_embedding.2.weight",
242
+ "control_model.time_embed.2.bias": "time_embedding.2.bias",
243
+ "control_model.input_blocks.0.0.weight": "conv_in.weight",
244
+ "control_model.input_blocks.0.0.bias": "conv_in.bias",
245
+ "control_model.input_blocks.1.0.in_layers.0.weight": "blocks.0.norm1.weight",
246
+ "control_model.input_blocks.1.0.in_layers.0.bias": "blocks.0.norm1.bias",
247
+ "control_model.input_blocks.1.0.in_layers.2.weight": "blocks.0.conv1.weight",
248
+ "control_model.input_blocks.1.0.in_layers.2.bias": "blocks.0.conv1.bias",
249
+ "control_model.input_blocks.1.0.emb_layers.1.weight": "blocks.0.time_emb_proj.weight",
250
+ "control_model.input_blocks.1.0.emb_layers.1.bias": "blocks.0.time_emb_proj.bias",
251
+ "control_model.input_blocks.1.0.out_layers.0.weight": "blocks.0.norm2.weight",
252
+ "control_model.input_blocks.1.0.out_layers.0.bias": "blocks.0.norm2.bias",
253
+ "control_model.input_blocks.1.0.out_layers.3.weight": "blocks.0.conv2.weight",
254
+ "control_model.input_blocks.1.0.out_layers.3.bias": "blocks.0.conv2.bias",
255
+ "control_model.input_blocks.1.1.norm.weight": "blocks.1.norm.weight",
256
+ "control_model.input_blocks.1.1.norm.bias": "blocks.1.norm.bias",
257
+ "control_model.input_blocks.1.1.proj_in.weight": "blocks.1.proj_in.weight",
258
+ "control_model.input_blocks.1.1.proj_in.bias": "blocks.1.proj_in.bias",
259
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_q.weight": "blocks.1.transformer_blocks.0.attn1.to_q.weight",
260
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_k.weight": "blocks.1.transformer_blocks.0.attn1.to_k.weight",
261
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_v.weight": "blocks.1.transformer_blocks.0.attn1.to_v.weight",
262
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.1.transformer_blocks.0.attn1.to_out.weight",
263
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.1.transformer_blocks.0.attn1.to_out.bias",
264
+ "control_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.1.transformer_blocks.0.act_fn.proj.weight",
265
+ "control_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.1.transformer_blocks.0.act_fn.proj.bias",
266
+ "control_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.weight": "blocks.1.transformer_blocks.0.ff.weight",
267
+ "control_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.bias": "blocks.1.transformer_blocks.0.ff.bias",
268
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_q.weight": "blocks.1.transformer_blocks.0.attn2.to_q.weight",
269
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight": "blocks.1.transformer_blocks.0.attn2.to_k.weight",
270
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_v.weight": "blocks.1.transformer_blocks.0.attn2.to_v.weight",
271
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.1.transformer_blocks.0.attn2.to_out.weight",
272
+ "control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.1.transformer_blocks.0.attn2.to_out.bias",
273
+ "control_model.input_blocks.1.1.transformer_blocks.0.norm1.weight": "blocks.1.transformer_blocks.0.norm1.weight",
274
+ "control_model.input_blocks.1.1.transformer_blocks.0.norm1.bias": "blocks.1.transformer_blocks.0.norm1.bias",
275
+ "control_model.input_blocks.1.1.transformer_blocks.0.norm2.weight": "blocks.1.transformer_blocks.0.norm2.weight",
276
+ "control_model.input_blocks.1.1.transformer_blocks.0.norm2.bias": "blocks.1.transformer_blocks.0.norm2.bias",
277
+ "control_model.input_blocks.1.1.transformer_blocks.0.norm3.weight": "blocks.1.transformer_blocks.0.norm3.weight",
278
+ "control_model.input_blocks.1.1.transformer_blocks.0.norm3.bias": "blocks.1.transformer_blocks.0.norm3.bias",
279
+ "control_model.input_blocks.1.1.proj_out.weight": "blocks.1.proj_out.weight",
280
+ "control_model.input_blocks.1.1.proj_out.bias": "blocks.1.proj_out.bias",
281
+ "control_model.input_blocks.2.0.in_layers.0.weight": "blocks.3.norm1.weight",
282
+ "control_model.input_blocks.2.0.in_layers.0.bias": "blocks.3.norm1.bias",
283
+ "control_model.input_blocks.2.0.in_layers.2.weight": "blocks.3.conv1.weight",
284
+ "control_model.input_blocks.2.0.in_layers.2.bias": "blocks.3.conv1.bias",
285
+ "control_model.input_blocks.2.0.emb_layers.1.weight": "blocks.3.time_emb_proj.weight",
286
+ "control_model.input_blocks.2.0.emb_layers.1.bias": "blocks.3.time_emb_proj.bias",
287
+ "control_model.input_blocks.2.0.out_layers.0.weight": "blocks.3.norm2.weight",
288
+ "control_model.input_blocks.2.0.out_layers.0.bias": "blocks.3.norm2.bias",
289
+ "control_model.input_blocks.2.0.out_layers.3.weight": "blocks.3.conv2.weight",
290
+ "control_model.input_blocks.2.0.out_layers.3.bias": "blocks.3.conv2.bias",
291
+ "control_model.input_blocks.2.1.norm.weight": "blocks.4.norm.weight",
292
+ "control_model.input_blocks.2.1.norm.bias": "blocks.4.norm.bias",
293
+ "control_model.input_blocks.2.1.proj_in.weight": "blocks.4.proj_in.weight",
294
+ "control_model.input_blocks.2.1.proj_in.bias": "blocks.4.proj_in.bias",
295
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_q.weight": "blocks.4.transformer_blocks.0.attn1.to_q.weight",
296
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_k.weight": "blocks.4.transformer_blocks.0.attn1.to_k.weight",
297
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_v.weight": "blocks.4.transformer_blocks.0.attn1.to_v.weight",
298
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.4.transformer_blocks.0.attn1.to_out.weight",
299
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.4.transformer_blocks.0.attn1.to_out.bias",
300
+ "control_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.4.transformer_blocks.0.act_fn.proj.weight",
301
+ "control_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.4.transformer_blocks.0.act_fn.proj.bias",
302
+ "control_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.weight": "blocks.4.transformer_blocks.0.ff.weight",
303
+ "control_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.bias": "blocks.4.transformer_blocks.0.ff.bias",
304
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_q.weight": "blocks.4.transformer_blocks.0.attn2.to_q.weight",
305
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight": "blocks.4.transformer_blocks.0.attn2.to_k.weight",
306
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_v.weight": "blocks.4.transformer_blocks.0.attn2.to_v.weight",
307
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.4.transformer_blocks.0.attn2.to_out.weight",
308
+ "control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.4.transformer_blocks.0.attn2.to_out.bias",
309
+ "control_model.input_blocks.2.1.transformer_blocks.0.norm1.weight": "blocks.4.transformer_blocks.0.norm1.weight",
310
+ "control_model.input_blocks.2.1.transformer_blocks.0.norm1.bias": "blocks.4.transformer_blocks.0.norm1.bias",
311
+ "control_model.input_blocks.2.1.transformer_blocks.0.norm2.weight": "blocks.4.transformer_blocks.0.norm2.weight",
312
+ "control_model.input_blocks.2.1.transformer_blocks.0.norm2.bias": "blocks.4.transformer_blocks.0.norm2.bias",
313
+ "control_model.input_blocks.2.1.transformer_blocks.0.norm3.weight": "blocks.4.transformer_blocks.0.norm3.weight",
314
+ "control_model.input_blocks.2.1.transformer_blocks.0.norm3.bias": "blocks.4.transformer_blocks.0.norm3.bias",
315
+ "control_model.input_blocks.2.1.proj_out.weight": "blocks.4.proj_out.weight",
316
+ "control_model.input_blocks.2.1.proj_out.bias": "blocks.4.proj_out.bias",
317
+ "control_model.input_blocks.3.0.op.weight": "blocks.6.conv.weight",
318
+ "control_model.input_blocks.3.0.op.bias": "blocks.6.conv.bias",
319
+ "control_model.input_blocks.4.0.in_layers.0.weight": "blocks.8.norm1.weight",
320
+ "control_model.input_blocks.4.0.in_layers.0.bias": "blocks.8.norm1.bias",
321
+ "control_model.input_blocks.4.0.in_layers.2.weight": "blocks.8.conv1.weight",
322
+ "control_model.input_blocks.4.0.in_layers.2.bias": "blocks.8.conv1.bias",
323
+ "control_model.input_blocks.4.0.emb_layers.1.weight": "blocks.8.time_emb_proj.weight",
324
+ "control_model.input_blocks.4.0.emb_layers.1.bias": "blocks.8.time_emb_proj.bias",
325
+ "control_model.input_blocks.4.0.out_layers.0.weight": "blocks.8.norm2.weight",
326
+ "control_model.input_blocks.4.0.out_layers.0.bias": "blocks.8.norm2.bias",
327
+ "control_model.input_blocks.4.0.out_layers.3.weight": "blocks.8.conv2.weight",
328
+ "control_model.input_blocks.4.0.out_layers.3.bias": "blocks.8.conv2.bias",
329
+ "control_model.input_blocks.4.0.skip_connection.weight": "blocks.8.conv_shortcut.weight",
330
+ "control_model.input_blocks.4.0.skip_connection.bias": "blocks.8.conv_shortcut.bias",
331
+ "control_model.input_blocks.4.1.norm.weight": "blocks.9.norm.weight",
332
+ "control_model.input_blocks.4.1.norm.bias": "blocks.9.norm.bias",
333
+ "control_model.input_blocks.4.1.proj_in.weight": "blocks.9.proj_in.weight",
334
+ "control_model.input_blocks.4.1.proj_in.bias": "blocks.9.proj_in.bias",
335
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_q.weight": "blocks.9.transformer_blocks.0.attn1.to_q.weight",
336
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_k.weight": "blocks.9.transformer_blocks.0.attn1.to_k.weight",
337
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_v.weight": "blocks.9.transformer_blocks.0.attn1.to_v.weight",
338
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.9.transformer_blocks.0.attn1.to_out.weight",
339
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.9.transformer_blocks.0.attn1.to_out.bias",
340
+ "control_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.9.transformer_blocks.0.act_fn.proj.weight",
341
+ "control_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.9.transformer_blocks.0.act_fn.proj.bias",
342
+ "control_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.weight": "blocks.9.transformer_blocks.0.ff.weight",
343
+ "control_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.bias": "blocks.9.transformer_blocks.0.ff.bias",
344
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_q.weight": "blocks.9.transformer_blocks.0.attn2.to_q.weight",
345
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight": "blocks.9.transformer_blocks.0.attn2.to_k.weight",
346
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_v.weight": "blocks.9.transformer_blocks.0.attn2.to_v.weight",
347
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.9.transformer_blocks.0.attn2.to_out.weight",
348
+ "control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.9.transformer_blocks.0.attn2.to_out.bias",
349
+ "control_model.input_blocks.4.1.transformer_blocks.0.norm1.weight": "blocks.9.transformer_blocks.0.norm1.weight",
350
+ "control_model.input_blocks.4.1.transformer_blocks.0.norm1.bias": "blocks.9.transformer_blocks.0.norm1.bias",
351
+ "control_model.input_blocks.4.1.transformer_blocks.0.norm2.weight": "blocks.9.transformer_blocks.0.norm2.weight",
352
+ "control_model.input_blocks.4.1.transformer_blocks.0.norm2.bias": "blocks.9.transformer_blocks.0.norm2.bias",
353
+ "control_model.input_blocks.4.1.transformer_blocks.0.norm3.weight": "blocks.9.transformer_blocks.0.norm3.weight",
354
+ "control_model.input_blocks.4.1.transformer_blocks.0.norm3.bias": "blocks.9.transformer_blocks.0.norm3.bias",
355
+ "control_model.input_blocks.4.1.proj_out.weight": "blocks.9.proj_out.weight",
356
+ "control_model.input_blocks.4.1.proj_out.bias": "blocks.9.proj_out.bias",
357
+ "control_model.input_blocks.5.0.in_layers.0.weight": "blocks.11.norm1.weight",
358
+ "control_model.input_blocks.5.0.in_layers.0.bias": "blocks.11.norm1.bias",
359
+ "control_model.input_blocks.5.0.in_layers.2.weight": "blocks.11.conv1.weight",
360
+ "control_model.input_blocks.5.0.in_layers.2.bias": "blocks.11.conv1.bias",
361
+ "control_model.input_blocks.5.0.emb_layers.1.weight": "blocks.11.time_emb_proj.weight",
362
+ "control_model.input_blocks.5.0.emb_layers.1.bias": "blocks.11.time_emb_proj.bias",
363
+ "control_model.input_blocks.5.0.out_layers.0.weight": "blocks.11.norm2.weight",
364
+ "control_model.input_blocks.5.0.out_layers.0.bias": "blocks.11.norm2.bias",
365
+ "control_model.input_blocks.5.0.out_layers.3.weight": "blocks.11.conv2.weight",
366
+ "control_model.input_blocks.5.0.out_layers.3.bias": "blocks.11.conv2.bias",
367
+ "control_model.input_blocks.5.1.norm.weight": "blocks.12.norm.weight",
368
+ "control_model.input_blocks.5.1.norm.bias": "blocks.12.norm.bias",
369
+ "control_model.input_blocks.5.1.proj_in.weight": "blocks.12.proj_in.weight",
370
+ "control_model.input_blocks.5.1.proj_in.bias": "blocks.12.proj_in.bias",
371
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_q.weight": "blocks.12.transformer_blocks.0.attn1.to_q.weight",
372
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_k.weight": "blocks.12.transformer_blocks.0.attn1.to_k.weight",
373
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_v.weight": "blocks.12.transformer_blocks.0.attn1.to_v.weight",
374
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.12.transformer_blocks.0.attn1.to_out.weight",
375
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.12.transformer_blocks.0.attn1.to_out.bias",
376
+ "control_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.12.transformer_blocks.0.act_fn.proj.weight",
377
+ "control_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.12.transformer_blocks.0.act_fn.proj.bias",
378
+ "control_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.weight": "blocks.12.transformer_blocks.0.ff.weight",
379
+ "control_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.bias": "blocks.12.transformer_blocks.0.ff.bias",
380
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_q.weight": "blocks.12.transformer_blocks.0.attn2.to_q.weight",
381
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight": "blocks.12.transformer_blocks.0.attn2.to_k.weight",
382
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_v.weight": "blocks.12.transformer_blocks.0.attn2.to_v.weight",
383
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.12.transformer_blocks.0.attn2.to_out.weight",
384
+ "control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.12.transformer_blocks.0.attn2.to_out.bias",
385
+ "control_model.input_blocks.5.1.transformer_blocks.0.norm1.weight": "blocks.12.transformer_blocks.0.norm1.weight",
386
+ "control_model.input_blocks.5.1.transformer_blocks.0.norm1.bias": "blocks.12.transformer_blocks.0.norm1.bias",
387
+ "control_model.input_blocks.5.1.transformer_blocks.0.norm2.weight": "blocks.12.transformer_blocks.0.norm2.weight",
388
+ "control_model.input_blocks.5.1.transformer_blocks.0.norm2.bias": "blocks.12.transformer_blocks.0.norm2.bias",
389
+ "control_model.input_blocks.5.1.transformer_blocks.0.norm3.weight": "blocks.12.transformer_blocks.0.norm3.weight",
390
+ "control_model.input_blocks.5.1.transformer_blocks.0.norm3.bias": "blocks.12.transformer_blocks.0.norm3.bias",
391
+ "control_model.input_blocks.5.1.proj_out.weight": "blocks.12.proj_out.weight",
392
+ "control_model.input_blocks.5.1.proj_out.bias": "blocks.12.proj_out.bias",
393
+ "control_model.input_blocks.6.0.op.weight": "blocks.14.conv.weight",
394
+ "control_model.input_blocks.6.0.op.bias": "blocks.14.conv.bias",
395
+ "control_model.input_blocks.7.0.in_layers.0.weight": "blocks.16.norm1.weight",
396
+ "control_model.input_blocks.7.0.in_layers.0.bias": "blocks.16.norm1.bias",
397
+ "control_model.input_blocks.7.0.in_layers.2.weight": "blocks.16.conv1.weight",
398
+ "control_model.input_blocks.7.0.in_layers.2.bias": "blocks.16.conv1.bias",
399
+ "control_model.input_blocks.7.0.emb_layers.1.weight": "blocks.16.time_emb_proj.weight",
400
+ "control_model.input_blocks.7.0.emb_layers.1.bias": "blocks.16.time_emb_proj.bias",
401
+ "control_model.input_blocks.7.0.out_layers.0.weight": "blocks.16.norm2.weight",
402
+ "control_model.input_blocks.7.0.out_layers.0.bias": "blocks.16.norm2.bias",
403
+ "control_model.input_blocks.7.0.out_layers.3.weight": "blocks.16.conv2.weight",
404
+ "control_model.input_blocks.7.0.out_layers.3.bias": "blocks.16.conv2.bias",
405
+ "control_model.input_blocks.7.0.skip_connection.weight": "blocks.16.conv_shortcut.weight",
406
+ "control_model.input_blocks.7.0.skip_connection.bias": "blocks.16.conv_shortcut.bias",
407
+ "control_model.input_blocks.7.1.norm.weight": "blocks.17.norm.weight",
408
+ "control_model.input_blocks.7.1.norm.bias": "blocks.17.norm.bias",
409
+ "control_model.input_blocks.7.1.proj_in.weight": "blocks.17.proj_in.weight",
410
+ "control_model.input_blocks.7.1.proj_in.bias": "blocks.17.proj_in.bias",
411
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_q.weight": "blocks.17.transformer_blocks.0.attn1.to_q.weight",
412
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_k.weight": "blocks.17.transformer_blocks.0.attn1.to_k.weight",
413
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_v.weight": "blocks.17.transformer_blocks.0.attn1.to_v.weight",
414
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.17.transformer_blocks.0.attn1.to_out.weight",
415
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.17.transformer_blocks.0.attn1.to_out.bias",
416
+ "control_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.17.transformer_blocks.0.act_fn.proj.weight",
417
+ "control_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.17.transformer_blocks.0.act_fn.proj.bias",
418
+ "control_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.weight": "blocks.17.transformer_blocks.0.ff.weight",
419
+ "control_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.bias": "blocks.17.transformer_blocks.0.ff.bias",
420
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_q.weight": "blocks.17.transformer_blocks.0.attn2.to_q.weight",
421
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_k.weight": "blocks.17.transformer_blocks.0.attn2.to_k.weight",
422
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_v.weight": "blocks.17.transformer_blocks.0.attn2.to_v.weight",
423
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.17.transformer_blocks.0.attn2.to_out.weight",
424
+ "control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.17.transformer_blocks.0.attn2.to_out.bias",
425
+ "control_model.input_blocks.7.1.transformer_blocks.0.norm1.weight": "blocks.17.transformer_blocks.0.norm1.weight",
426
+ "control_model.input_blocks.7.1.transformer_blocks.0.norm1.bias": "blocks.17.transformer_blocks.0.norm1.bias",
427
+ "control_model.input_blocks.7.1.transformer_blocks.0.norm2.weight": "blocks.17.transformer_blocks.0.norm2.weight",
428
+ "control_model.input_blocks.7.1.transformer_blocks.0.norm2.bias": "blocks.17.transformer_blocks.0.norm2.bias",
429
+ "control_model.input_blocks.7.1.transformer_blocks.0.norm3.weight": "blocks.17.transformer_blocks.0.norm3.weight",
430
+ "control_model.input_blocks.7.1.transformer_blocks.0.norm3.bias": "blocks.17.transformer_blocks.0.norm3.bias",
431
+ "control_model.input_blocks.7.1.proj_out.weight": "blocks.17.proj_out.weight",
432
+ "control_model.input_blocks.7.1.proj_out.bias": "blocks.17.proj_out.bias",
433
+ "control_model.input_blocks.8.0.in_layers.0.weight": "blocks.19.norm1.weight",
434
+ "control_model.input_blocks.8.0.in_layers.0.bias": "blocks.19.norm1.bias",
435
+ "control_model.input_blocks.8.0.in_layers.2.weight": "blocks.19.conv1.weight",
436
+ "control_model.input_blocks.8.0.in_layers.2.bias": "blocks.19.conv1.bias",
437
+ "control_model.input_blocks.8.0.emb_layers.1.weight": "blocks.19.time_emb_proj.weight",
438
+ "control_model.input_blocks.8.0.emb_layers.1.bias": "blocks.19.time_emb_proj.bias",
439
+ "control_model.input_blocks.8.0.out_layers.0.weight": "blocks.19.norm2.weight",
440
+ "control_model.input_blocks.8.0.out_layers.0.bias": "blocks.19.norm2.bias",
441
+ "control_model.input_blocks.8.0.out_layers.3.weight": "blocks.19.conv2.weight",
442
+ "control_model.input_blocks.8.0.out_layers.3.bias": "blocks.19.conv2.bias",
443
+ "control_model.input_blocks.8.1.norm.weight": "blocks.20.norm.weight",
444
+ "control_model.input_blocks.8.1.norm.bias": "blocks.20.norm.bias",
445
+ "control_model.input_blocks.8.1.proj_in.weight": "blocks.20.proj_in.weight",
446
+ "control_model.input_blocks.8.1.proj_in.bias": "blocks.20.proj_in.bias",
447
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_q.weight": "blocks.20.transformer_blocks.0.attn1.to_q.weight",
448
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_k.weight": "blocks.20.transformer_blocks.0.attn1.to_k.weight",
449
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_v.weight": "blocks.20.transformer_blocks.0.attn1.to_v.weight",
450
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.20.transformer_blocks.0.attn1.to_out.weight",
451
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.20.transformer_blocks.0.attn1.to_out.bias",
452
+ "control_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.20.transformer_blocks.0.act_fn.proj.weight",
453
+ "control_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.20.transformer_blocks.0.act_fn.proj.bias",
454
+ "control_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.weight": "blocks.20.transformer_blocks.0.ff.weight",
455
+ "control_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.bias": "blocks.20.transformer_blocks.0.ff.bias",
456
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_q.weight": "blocks.20.transformer_blocks.0.attn2.to_q.weight",
457
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_k.weight": "blocks.20.transformer_blocks.0.attn2.to_k.weight",
458
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_v.weight": "blocks.20.transformer_blocks.0.attn2.to_v.weight",
459
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.20.transformer_blocks.0.attn2.to_out.weight",
460
+ "control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.20.transformer_blocks.0.attn2.to_out.bias",
461
+ "control_model.input_blocks.8.1.transformer_blocks.0.norm1.weight": "blocks.20.transformer_blocks.0.norm1.weight",
462
+ "control_model.input_blocks.8.1.transformer_blocks.0.norm1.bias": "blocks.20.transformer_blocks.0.norm1.bias",
463
+ "control_model.input_blocks.8.1.transformer_blocks.0.norm2.weight": "blocks.20.transformer_blocks.0.norm2.weight",
464
+ "control_model.input_blocks.8.1.transformer_blocks.0.norm2.bias": "blocks.20.transformer_blocks.0.norm2.bias",
465
+ "control_model.input_blocks.8.1.transformer_blocks.0.norm3.weight": "blocks.20.transformer_blocks.0.norm3.weight",
466
+ "control_model.input_blocks.8.1.transformer_blocks.0.norm3.bias": "blocks.20.transformer_blocks.0.norm3.bias",
467
+ "control_model.input_blocks.8.1.proj_out.weight": "blocks.20.proj_out.weight",
468
+ "control_model.input_blocks.8.1.proj_out.bias": "blocks.20.proj_out.bias",
469
+ "control_model.input_blocks.9.0.op.weight": "blocks.22.conv.weight",
470
+ "control_model.input_blocks.9.0.op.bias": "blocks.22.conv.bias",
471
+ "control_model.input_blocks.10.0.in_layers.0.weight": "blocks.24.norm1.weight",
472
+ "control_model.input_blocks.10.0.in_layers.0.bias": "blocks.24.norm1.bias",
473
+ "control_model.input_blocks.10.0.in_layers.2.weight": "blocks.24.conv1.weight",
474
+ "control_model.input_blocks.10.0.in_layers.2.bias": "blocks.24.conv1.bias",
475
+ "control_model.input_blocks.10.0.emb_layers.1.weight": "blocks.24.time_emb_proj.weight",
476
+ "control_model.input_blocks.10.0.emb_layers.1.bias": "blocks.24.time_emb_proj.bias",
477
+ "control_model.input_blocks.10.0.out_layers.0.weight": "blocks.24.norm2.weight",
478
+ "control_model.input_blocks.10.0.out_layers.0.bias": "blocks.24.norm2.bias",
479
+ "control_model.input_blocks.10.0.out_layers.3.weight": "blocks.24.conv2.weight",
480
+ "control_model.input_blocks.10.0.out_layers.3.bias": "blocks.24.conv2.bias",
481
+ "control_model.input_blocks.11.0.in_layers.0.weight": "blocks.26.norm1.weight",
482
+ "control_model.input_blocks.11.0.in_layers.0.bias": "blocks.26.norm1.bias",
483
+ "control_model.input_blocks.11.0.in_layers.2.weight": "blocks.26.conv1.weight",
484
+ "control_model.input_blocks.11.0.in_layers.2.bias": "blocks.26.conv1.bias",
485
+ "control_model.input_blocks.11.0.emb_layers.1.weight": "blocks.26.time_emb_proj.weight",
486
+ "control_model.input_blocks.11.0.emb_layers.1.bias": "blocks.26.time_emb_proj.bias",
487
+ "control_model.input_blocks.11.0.out_layers.0.weight": "blocks.26.norm2.weight",
488
+ "control_model.input_blocks.11.0.out_layers.0.bias": "blocks.26.norm2.bias",
489
+ "control_model.input_blocks.11.0.out_layers.3.weight": "blocks.26.conv2.weight",
490
+ "control_model.input_blocks.11.0.out_layers.3.bias": "blocks.26.conv2.bias",
491
+ "control_model.zero_convs.0.0.weight": "controlnet_blocks.0.weight",
492
+ "control_model.zero_convs.0.0.bias": "controlnet_blocks.0.bias",
493
+ "control_model.zero_convs.1.0.weight": "controlnet_blocks.1.weight",
494
+ "control_model.zero_convs.1.0.bias": "controlnet_blocks.0.bias",
495
+ "control_model.zero_convs.2.0.weight": "controlnet_blocks.2.weight",
496
+ "control_model.zero_convs.2.0.bias": "controlnet_blocks.0.bias",
497
+ "control_model.zero_convs.3.0.weight": "controlnet_blocks.3.weight",
498
+ "control_model.zero_convs.3.0.bias": "controlnet_blocks.0.bias",
499
+ "control_model.zero_convs.4.0.weight": "controlnet_blocks.4.weight",
500
+ "control_model.zero_convs.4.0.bias": "controlnet_blocks.4.bias",
501
+ "control_model.zero_convs.5.0.weight": "controlnet_blocks.5.weight",
502
+ "control_model.zero_convs.5.0.bias": "controlnet_blocks.4.bias",
503
+ "control_model.zero_convs.6.0.weight": "controlnet_blocks.6.weight",
504
+ "control_model.zero_convs.6.0.bias": "controlnet_blocks.4.bias",
505
+ "control_model.zero_convs.7.0.weight": "controlnet_blocks.7.weight",
506
+ "control_model.zero_convs.7.0.bias": "controlnet_blocks.7.bias",
507
+ "control_model.zero_convs.8.0.weight": "controlnet_blocks.8.weight",
508
+ "control_model.zero_convs.8.0.bias": "controlnet_blocks.7.bias",
509
+ "control_model.zero_convs.9.0.weight": "controlnet_blocks.9.weight",
510
+ "control_model.zero_convs.9.0.bias": "controlnet_blocks.7.bias",
511
+ "control_model.zero_convs.10.0.weight": "controlnet_blocks.10.weight",
512
+ "control_model.zero_convs.10.0.bias": "controlnet_blocks.7.bias",
513
+ "control_model.zero_convs.11.0.weight": "controlnet_blocks.11.weight",
514
+ "control_model.zero_convs.11.0.bias": "controlnet_blocks.7.bias",
515
+ "control_model.input_hint_block.0.weight": "controlnet_conv_in.blocks.0.weight",
516
+ "control_model.input_hint_block.0.bias": "controlnet_conv_in.blocks.0.bias",
517
+ "control_model.input_hint_block.2.weight": "controlnet_conv_in.blocks.2.weight",
518
+ "control_model.input_hint_block.2.bias": "controlnet_conv_in.blocks.2.bias",
519
+ "control_model.input_hint_block.4.weight": "controlnet_conv_in.blocks.4.weight",
520
+ "control_model.input_hint_block.4.bias": "controlnet_conv_in.blocks.4.bias",
521
+ "control_model.input_hint_block.6.weight": "controlnet_conv_in.blocks.6.weight",
522
+ "control_model.input_hint_block.6.bias": "controlnet_conv_in.blocks.6.bias",
523
+ "control_model.input_hint_block.8.weight": "controlnet_conv_in.blocks.8.weight",
524
+ "control_model.input_hint_block.8.bias": "controlnet_conv_in.blocks.8.bias",
525
+ "control_model.input_hint_block.10.weight": "controlnet_conv_in.blocks.10.weight",
526
+ "control_model.input_hint_block.10.bias": "controlnet_conv_in.blocks.10.bias",
527
+ "control_model.input_hint_block.12.weight": "controlnet_conv_in.blocks.12.weight",
528
+ "control_model.input_hint_block.12.bias": "controlnet_conv_in.blocks.12.bias",
529
+ "control_model.input_hint_block.14.weight": "controlnet_conv_in.blocks.14.weight",
530
+ "control_model.input_hint_block.14.bias": "controlnet_conv_in.blocks.14.bias",
531
+ "control_model.middle_block.0.in_layers.0.weight": "blocks.28.norm1.weight",
532
+ "control_model.middle_block.0.in_layers.0.bias": "blocks.28.norm1.bias",
533
+ "control_model.middle_block.0.in_layers.2.weight": "blocks.28.conv1.weight",
534
+ "control_model.middle_block.0.in_layers.2.bias": "blocks.28.conv1.bias",
535
+ "control_model.middle_block.0.emb_layers.1.weight": "blocks.28.time_emb_proj.weight",
536
+ "control_model.middle_block.0.emb_layers.1.bias": "blocks.28.time_emb_proj.bias",
537
+ "control_model.middle_block.0.out_layers.0.weight": "blocks.28.norm2.weight",
538
+ "control_model.middle_block.0.out_layers.0.bias": "blocks.28.norm2.bias",
539
+ "control_model.middle_block.0.out_layers.3.weight": "blocks.28.conv2.weight",
540
+ "control_model.middle_block.0.out_layers.3.bias": "blocks.28.conv2.bias",
541
+ "control_model.middle_block.1.norm.weight": "blocks.29.norm.weight",
542
+ "control_model.middle_block.1.norm.bias": "blocks.29.norm.bias",
543
+ "control_model.middle_block.1.proj_in.weight": "blocks.29.proj_in.weight",
544
+ "control_model.middle_block.1.proj_in.bias": "blocks.29.proj_in.bias",
545
+ "control_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight": "blocks.29.transformer_blocks.0.attn1.to_q.weight",
546
+ "control_model.middle_block.1.transformer_blocks.0.attn1.to_k.weight": "blocks.29.transformer_blocks.0.attn1.to_k.weight",
547
+ "control_model.middle_block.1.transformer_blocks.0.attn1.to_v.weight": "blocks.29.transformer_blocks.0.attn1.to_v.weight",
548
+ "control_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.29.transformer_blocks.0.attn1.to_out.weight",
549
+ "control_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.29.transformer_blocks.0.attn1.to_out.bias",
550
+ "control_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.29.transformer_blocks.0.act_fn.proj.weight",
551
+ "control_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.29.transformer_blocks.0.act_fn.proj.bias",
552
+ "control_model.middle_block.1.transformer_blocks.0.ff.net.2.weight": "blocks.29.transformer_blocks.0.ff.weight",
553
+ "control_model.middle_block.1.transformer_blocks.0.ff.net.2.bias": "blocks.29.transformer_blocks.0.ff.bias",
554
+ "control_model.middle_block.1.transformer_blocks.0.attn2.to_q.weight": "blocks.29.transformer_blocks.0.attn2.to_q.weight",
555
+ "control_model.middle_block.1.transformer_blocks.0.attn2.to_k.weight": "blocks.29.transformer_blocks.0.attn2.to_k.weight",
556
+ "control_model.middle_block.1.transformer_blocks.0.attn2.to_v.weight": "blocks.29.transformer_blocks.0.attn2.to_v.weight",
557
+ "control_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.29.transformer_blocks.0.attn2.to_out.weight",
558
+ "control_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.29.transformer_blocks.0.attn2.to_out.bias",
559
+ "control_model.middle_block.1.transformer_blocks.0.norm1.weight": "blocks.29.transformer_blocks.0.norm1.weight",
560
+ "control_model.middle_block.1.transformer_blocks.0.norm1.bias": "blocks.29.transformer_blocks.0.norm1.bias",
561
+ "control_model.middle_block.1.transformer_blocks.0.norm2.weight": "blocks.29.transformer_blocks.0.norm2.weight",
562
+ "control_model.middle_block.1.transformer_blocks.0.norm2.bias": "blocks.29.transformer_blocks.0.norm2.bias",
563
+ "control_model.middle_block.1.transformer_blocks.0.norm3.weight": "blocks.29.transformer_blocks.0.norm3.weight",
564
+ "control_model.middle_block.1.transformer_blocks.0.norm3.bias": "blocks.29.transformer_blocks.0.norm3.bias",
565
+ "control_model.middle_block.1.proj_out.weight": "blocks.29.proj_out.weight",
566
+ "control_model.middle_block.1.proj_out.bias": "blocks.29.proj_out.bias",
567
+ "control_model.middle_block.2.in_layers.0.weight": "blocks.30.norm1.weight",
568
+ "control_model.middle_block.2.in_layers.0.bias": "blocks.30.norm1.bias",
569
+ "control_model.middle_block.2.in_layers.2.weight": "blocks.30.conv1.weight",
570
+ "control_model.middle_block.2.in_layers.2.bias": "blocks.30.conv1.bias",
571
+ "control_model.middle_block.2.emb_layers.1.weight": "blocks.30.time_emb_proj.weight",
572
+ "control_model.middle_block.2.emb_layers.1.bias": "blocks.30.time_emb_proj.bias",
573
+ "control_model.middle_block.2.out_layers.0.weight": "blocks.30.norm2.weight",
574
+ "control_model.middle_block.2.out_layers.0.bias": "blocks.30.norm2.bias",
575
+ "control_model.middle_block.2.out_layers.3.weight": "blocks.30.conv2.weight",
576
+ "control_model.middle_block.2.out_layers.3.bias": "blocks.30.conv2.bias",
577
+ "control_model.middle_block_out.0.weight": "controlnet_blocks.12.weight",
578
+ "control_model.middle_block_out.0.bias": "controlnet_blocks.7.bias",
579
+ }
580
+ state_dict_ = {}
581
+ for name in state_dict:
582
+ if name in rename_dict:
583
+ param = state_dict[name]
584
+ if ".proj_in." in name or ".proj_out." in name:
585
+ param = param.squeeze()
586
+ state_dict_[rename_dict[name]] = param
587
+ return state_dict_
diffsynth/models/sd_ipadapter.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .svd_image_encoder import SVDImageEncoder
2
+ from .sdxl_ipadapter import IpAdapterImageProjModel, IpAdapterModule, SDXLIpAdapterStateDictConverter
3
+ from transformers import CLIPImageProcessor
4
+ import torch
5
+
6
+
7
+ class IpAdapterCLIPImageEmbedder(SVDImageEncoder):
8
+ def __init__(self):
9
+ super().__init__()
10
+ self.image_processor = CLIPImageProcessor()
11
+
12
+ def forward(self, image):
13
+ pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values
14
+ pixel_values = pixel_values.to(device=self.embeddings.class_embedding.device, dtype=self.embeddings.class_embedding.dtype)
15
+ return super().forward(pixel_values)
16
+
17
+
18
+ class SDIpAdapter(torch.nn.Module):
19
+ def __init__(self):
20
+ super().__init__()
21
+ shape_list = [(768, 320)] * 2 + [(768, 640)] * 2 + [(768, 1280)] * 5 + [(768, 640)] * 3 + [(768, 320)] * 3 + [(768, 1280)] * 1
22
+ self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(*shape) for shape in shape_list])
23
+ self.image_proj = IpAdapterImageProjModel(cross_attention_dim=768, clip_embeddings_dim=1024, clip_extra_context_tokens=4)
24
+ self.set_full_adapter()
25
+
26
+ def set_full_adapter(self):
27
+ block_ids = [1, 4, 9, 12, 17, 20, 40, 43, 46, 50, 53, 56, 60, 63, 66, 29]
28
+ self.call_block_id = {(i, 0): j for j, i in enumerate(block_ids)}
29
+
30
+ def set_less_adapter(self):
31
+ # IP-Adapter for SD v1.5 doesn't support this feature.
32
+ self.set_full_adapter()
33
+
34
+ def forward(self, hidden_states, scale=1.0):
35
+ hidden_states = self.image_proj(hidden_states)
36
+ hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1])
37
+ ip_kv_dict = {}
38
+ for (block_id, transformer_id) in self.call_block_id:
39
+ ipadapter_id = self.call_block_id[(block_id, transformer_id)]
40
+ ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states)
41
+ if block_id not in ip_kv_dict:
42
+ ip_kv_dict[block_id] = {}
43
+ ip_kv_dict[block_id][transformer_id] = {
44
+ "ip_k": ip_k,
45
+ "ip_v": ip_v,
46
+ "scale": scale
47
+ }
48
+ return ip_kv_dict
49
+
50
+ def state_dict_converter(self):
51
+ return SDIpAdapterStateDictConverter()
52
+
53
+
54
+ class SDIpAdapterStateDictConverter(SDXLIpAdapterStateDictConverter):
55
+ def __init__(self):
56
+ pass
diffsynth/models/sd_lora.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_unet import SDUNetStateDictConverter, SDUNet
3
+ from .sd_text_encoder import SDTextEncoderStateDictConverter, SDTextEncoder
4
+
5
+
6
+ class SDLoRA:
7
+ def __init__(self):
8
+ pass
9
+
10
+ def convert_state_dict(self, state_dict, lora_prefix="lora_unet_", alpha=1.0, device="cuda"):
11
+ special_keys = {
12
+ "down.blocks": "down_blocks",
13
+ "up.blocks": "up_blocks",
14
+ "mid.block": "mid_block",
15
+ "proj.in": "proj_in",
16
+ "proj.out": "proj_out",
17
+ "transformer.blocks": "transformer_blocks",
18
+ "to.q": "to_q",
19
+ "to.k": "to_k",
20
+ "to.v": "to_v",
21
+ "to.out": "to_out",
22
+ }
23
+ state_dict_ = {}
24
+ for key in state_dict:
25
+ if ".lora_up" not in key:
26
+ continue
27
+ if not key.startswith(lora_prefix):
28
+ continue
29
+ weight_up = state_dict[key].to(device="cuda", dtype=torch.float16)
30
+ weight_down = state_dict[key.replace(".lora_up", ".lora_down")].to(device="cuda", dtype=torch.float16)
31
+ if len(weight_up.shape) == 4:
32
+ weight_up = weight_up.squeeze(3).squeeze(2).to(torch.float32)
33
+ weight_down = weight_down.squeeze(3).squeeze(2).to(torch.float32)
34
+ lora_weight = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
35
+ else:
36
+ lora_weight = alpha * torch.mm(weight_up, weight_down)
37
+ target_name = key.split(".")[0].replace("_", ".")[len(lora_prefix):] + ".weight"
38
+ for special_key in special_keys:
39
+ target_name = target_name.replace(special_key, special_keys[special_key])
40
+ state_dict_[target_name] = lora_weight.cpu()
41
+ return state_dict_
42
+
43
+ def add_lora_to_unet(self, unet: SDUNet, state_dict_lora, alpha=1.0, device="cuda"):
44
+ state_dict_unet = unet.state_dict()
45
+ state_dict_lora = self.convert_state_dict(state_dict_lora, lora_prefix="lora_unet_", alpha=alpha, device=device)
46
+ state_dict_lora = SDUNetStateDictConverter().from_diffusers(state_dict_lora)
47
+ if len(state_dict_lora) > 0:
48
+ for name in state_dict_lora:
49
+ state_dict_unet[name] += state_dict_lora[name].to(device=device)
50
+ unet.load_state_dict(state_dict_unet)
51
+
52
+ def add_lora_to_text_encoder(self, text_encoder: SDTextEncoder, state_dict_lora, alpha=1.0, device="cuda"):
53
+ state_dict_text_encoder = text_encoder.state_dict()
54
+ state_dict_lora = self.convert_state_dict(state_dict_lora, lora_prefix="lora_te_", alpha=alpha, device=device)
55
+ state_dict_lora = SDTextEncoderStateDictConverter().from_diffusers(state_dict_lora)
56
+ if len(state_dict_lora) > 0:
57
+ for name in state_dict_lora:
58
+ state_dict_text_encoder[name] += state_dict_lora[name].to(device=device)
59
+ text_encoder.load_state_dict(state_dict_text_encoder)
60
+
diffsynth/models/sd_motion.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .sd_unet import SDUNet, Attention, GEGLU
2
+ import torch
3
+ from einops import rearrange, repeat
4
+
5
+
6
+ class TemporalTransformerBlock(torch.nn.Module):
7
+
8
+ def __init__(self, dim, num_attention_heads, attention_head_dim, max_position_embeddings=32):
9
+ super().__init__()
10
+
11
+ # 1. Self-Attn
12
+ self.pe1 = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, dim))
13
+ self.norm1 = torch.nn.LayerNorm(dim, elementwise_affine=True)
14
+ self.attn1 = Attention(q_dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, bias_out=True)
15
+
16
+ # 2. Cross-Attn
17
+ self.pe2 = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, dim))
18
+ self.norm2 = torch.nn.LayerNorm(dim, elementwise_affine=True)
19
+ self.attn2 = Attention(q_dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, bias_out=True)
20
+
21
+ # 3. Feed-forward
22
+ self.norm3 = torch.nn.LayerNorm(dim, elementwise_affine=True)
23
+ self.act_fn = GEGLU(dim, dim * 4)
24
+ self.ff = torch.nn.Linear(dim * 4, dim)
25
+
26
+
27
+ def forward(self, hidden_states, batch_size=1):
28
+
29
+ # 1. Self-Attention
30
+ norm_hidden_states = self.norm1(hidden_states)
31
+ norm_hidden_states = rearrange(norm_hidden_states, "(b f) h c -> (b h) f c", b=batch_size)
32
+ attn_output = self.attn1(norm_hidden_states + self.pe1[:, :norm_hidden_states.shape[1]])
33
+ attn_output = rearrange(attn_output, "(b h) f c -> (b f) h c", b=batch_size)
34
+ hidden_states = attn_output + hidden_states
35
+
36
+ # 2. Cross-Attention
37
+ norm_hidden_states = self.norm2(hidden_states)
38
+ norm_hidden_states = rearrange(norm_hidden_states, "(b f) h c -> (b h) f c", b=batch_size)
39
+ attn_output = self.attn2(norm_hidden_states + self.pe2[:, :norm_hidden_states.shape[1]])
40
+ attn_output = rearrange(attn_output, "(b h) f c -> (b f) h c", b=batch_size)
41
+ hidden_states = attn_output + hidden_states
42
+
43
+ # 3. Feed-forward
44
+ norm_hidden_states = self.norm3(hidden_states)
45
+ ff_output = self.act_fn(norm_hidden_states)
46
+ ff_output = self.ff(ff_output)
47
+ hidden_states = ff_output + hidden_states
48
+
49
+ return hidden_states
50
+
51
+
52
+ class TemporalBlock(torch.nn.Module):
53
+
54
+ def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5):
55
+ super().__init__()
56
+ inner_dim = num_attention_heads * attention_head_dim
57
+
58
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
59
+ self.proj_in = torch.nn.Linear(in_channels, inner_dim)
60
+
61
+ self.transformer_blocks = torch.nn.ModuleList([
62
+ TemporalTransformerBlock(
63
+ inner_dim,
64
+ num_attention_heads,
65
+ attention_head_dim
66
+ )
67
+ for d in range(num_layers)
68
+ ])
69
+
70
+ self.proj_out = torch.nn.Linear(inner_dim, in_channels)
71
+
72
+ def forward(self, hidden_states, time_emb, text_emb, res_stack, batch_size=1):
73
+ batch, _, height, width = hidden_states.shape
74
+ residual = hidden_states
75
+
76
+ hidden_states = self.norm(hidden_states)
77
+ inner_dim = hidden_states.shape[1]
78
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
79
+ hidden_states = self.proj_in(hidden_states)
80
+
81
+ for block in self.transformer_blocks:
82
+ hidden_states = block(
83
+ hidden_states,
84
+ batch_size=batch_size
85
+ )
86
+
87
+ hidden_states = self.proj_out(hidden_states)
88
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
89
+ hidden_states = hidden_states + residual
90
+
91
+ return hidden_states, time_emb, text_emb, res_stack
92
+
93
+
94
+ class SDMotionModel(torch.nn.Module):
95
+ def __init__(self):
96
+ super().__init__()
97
+ self.motion_modules = torch.nn.ModuleList([
98
+ TemporalBlock(8, 40, 320, eps=1e-6),
99
+ TemporalBlock(8, 40, 320, eps=1e-6),
100
+ TemporalBlock(8, 80, 640, eps=1e-6),
101
+ TemporalBlock(8, 80, 640, eps=1e-6),
102
+ TemporalBlock(8, 160, 1280, eps=1e-6),
103
+ TemporalBlock(8, 160, 1280, eps=1e-6),
104
+ TemporalBlock(8, 160, 1280, eps=1e-6),
105
+ TemporalBlock(8, 160, 1280, eps=1e-6),
106
+ TemporalBlock(8, 160, 1280, eps=1e-6),
107
+ TemporalBlock(8, 160, 1280, eps=1e-6),
108
+ TemporalBlock(8, 160, 1280, eps=1e-6),
109
+ TemporalBlock(8, 160, 1280, eps=1e-6),
110
+ TemporalBlock(8, 160, 1280, eps=1e-6),
111
+ TemporalBlock(8, 160, 1280, eps=1e-6),
112
+ TemporalBlock(8, 160, 1280, eps=1e-6),
113
+ TemporalBlock(8, 80, 640, eps=1e-6),
114
+ TemporalBlock(8, 80, 640, eps=1e-6),
115
+ TemporalBlock(8, 80, 640, eps=1e-6),
116
+ TemporalBlock(8, 40, 320, eps=1e-6),
117
+ TemporalBlock(8, 40, 320, eps=1e-6),
118
+ TemporalBlock(8, 40, 320, eps=1e-6),
119
+ ])
120
+ self.call_block_id = {
121
+ 1: 0,
122
+ 4: 1,
123
+ 9: 2,
124
+ 12: 3,
125
+ 17: 4,
126
+ 20: 5,
127
+ 24: 6,
128
+ 26: 7,
129
+ 29: 8,
130
+ 32: 9,
131
+ 34: 10,
132
+ 36: 11,
133
+ 40: 12,
134
+ 43: 13,
135
+ 46: 14,
136
+ 50: 15,
137
+ 53: 16,
138
+ 56: 17,
139
+ 60: 18,
140
+ 63: 19,
141
+ 66: 20
142
+ }
143
+
144
+ def forward(self):
145
+ pass
146
+
147
+ def state_dict_converter(self):
148
+ return SDMotionModelStateDictConverter()
149
+
150
+
151
+ class SDMotionModelStateDictConverter:
152
+ def __init__(self):
153
+ pass
154
+
155
+ def from_diffusers(self, state_dict):
156
+ rename_dict = {
157
+ "norm": "norm",
158
+ "proj_in": "proj_in",
159
+ "transformer_blocks.0.attention_blocks.0.to_q": "transformer_blocks.0.attn1.to_q",
160
+ "transformer_blocks.0.attention_blocks.0.to_k": "transformer_blocks.0.attn1.to_k",
161
+ "transformer_blocks.0.attention_blocks.0.to_v": "transformer_blocks.0.attn1.to_v",
162
+ "transformer_blocks.0.attention_blocks.0.to_out.0": "transformer_blocks.0.attn1.to_out",
163
+ "transformer_blocks.0.attention_blocks.0.pos_encoder": "transformer_blocks.0.pe1",
164
+ "transformer_blocks.0.attention_blocks.1.to_q": "transformer_blocks.0.attn2.to_q",
165
+ "transformer_blocks.0.attention_blocks.1.to_k": "transformer_blocks.0.attn2.to_k",
166
+ "transformer_blocks.0.attention_blocks.1.to_v": "transformer_blocks.0.attn2.to_v",
167
+ "transformer_blocks.0.attention_blocks.1.to_out.0": "transformer_blocks.0.attn2.to_out",
168
+ "transformer_blocks.0.attention_blocks.1.pos_encoder": "transformer_blocks.0.pe2",
169
+ "transformer_blocks.0.norms.0": "transformer_blocks.0.norm1",
170
+ "transformer_blocks.0.norms.1": "transformer_blocks.0.norm2",
171
+ "transformer_blocks.0.ff.net.0.proj": "transformer_blocks.0.act_fn.proj",
172
+ "transformer_blocks.0.ff.net.2": "transformer_blocks.0.ff",
173
+ "transformer_blocks.0.ff_norm": "transformer_blocks.0.norm3",
174
+ "proj_out": "proj_out",
175
+ }
176
+ name_list = sorted([i for i in state_dict if i.startswith("down_blocks.")])
177
+ name_list += sorted([i for i in state_dict if i.startswith("mid_block.")])
178
+ name_list += sorted([i for i in state_dict if i.startswith("up_blocks.")])
179
+ state_dict_ = {}
180
+ last_prefix, module_id = "", -1
181
+ for name in name_list:
182
+ names = name.split(".")
183
+ prefix_index = names.index("temporal_transformer") + 1
184
+ prefix = ".".join(names[:prefix_index])
185
+ if prefix != last_prefix:
186
+ last_prefix = prefix
187
+ module_id += 1
188
+ middle_name = ".".join(names[prefix_index:-1])
189
+ suffix = names[-1]
190
+ if "pos_encoder" in names:
191
+ rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name]])
192
+ else:
193
+ rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name], suffix])
194
+ state_dict_[rename] = state_dict[name]
195
+ return state_dict_
196
+
197
+ def from_civitai(self, state_dict):
198
+ return self.from_diffusers(state_dict)
diffsynth/models/sd_text_encoder.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .attention import Attention
3
+
4
+
5
+ class CLIPEncoderLayer(torch.nn.Module):
6
+ def __init__(self, embed_dim, intermediate_size, num_heads=12, head_dim=64, use_quick_gelu=True):
7
+ super().__init__()
8
+ self.attn = Attention(q_dim=embed_dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
9
+ self.layer_norm1 = torch.nn.LayerNorm(embed_dim)
10
+ self.layer_norm2 = torch.nn.LayerNorm(embed_dim)
11
+ self.fc1 = torch.nn.Linear(embed_dim, intermediate_size)
12
+ self.fc2 = torch.nn.Linear(intermediate_size, embed_dim)
13
+
14
+ self.use_quick_gelu = use_quick_gelu
15
+
16
+ def quickGELU(self, x):
17
+ return x * torch.sigmoid(1.702 * x)
18
+
19
+ def forward(self, hidden_states, attn_mask=None):
20
+ residual = hidden_states
21
+
22
+ hidden_states = self.layer_norm1(hidden_states)
23
+ hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
24
+ hidden_states = residual + hidden_states
25
+
26
+ residual = hidden_states
27
+ hidden_states = self.layer_norm2(hidden_states)
28
+ hidden_states = self.fc1(hidden_states)
29
+ if self.use_quick_gelu:
30
+ hidden_states = self.quickGELU(hidden_states)
31
+ else:
32
+ hidden_states = torch.nn.functional.gelu(hidden_states)
33
+ hidden_states = self.fc2(hidden_states)
34
+ hidden_states = residual + hidden_states
35
+
36
+ return hidden_states
37
+
38
+
39
+ class SDTextEncoder(torch.nn.Module):
40
+ def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=12, encoder_intermediate_size=3072):
41
+ super().__init__()
42
+
43
+ # token_embedding
44
+ self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
45
+
46
+ # position_embeds (This is a fixed tensor)
47
+ self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
48
+
49
+ # encoders
50
+ self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
51
+
52
+ # attn_mask
53
+ self.attn_mask = self.attention_mask(max_position_embeddings)
54
+
55
+ # final_layer_norm
56
+ self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
57
+
58
+ def attention_mask(self, length):
59
+ mask = torch.empty(length, length)
60
+ mask.fill_(float("-inf"))
61
+ mask.triu_(1)
62
+ return mask
63
+
64
+ def forward(self, input_ids, clip_skip=1):
65
+ embeds = self.token_embedding(input_ids) + self.position_embeds
66
+ attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
67
+ for encoder_id, encoder in enumerate(self.encoders):
68
+ embeds = encoder(embeds, attn_mask=attn_mask)
69
+ if encoder_id + clip_skip == len(self.encoders):
70
+ break
71
+ embeds = self.final_layer_norm(embeds)
72
+ return embeds
73
+
74
+ def state_dict_converter(self):
75
+ return SDTextEncoderStateDictConverter()
76
+
77
+
78
+ class SDTextEncoderStateDictConverter:
79
+ def __init__(self):
80
+ pass
81
+
82
+ def from_diffusers(self, state_dict):
83
+ rename_dict = {
84
+ "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
85
+ "text_model.embeddings.position_embedding.weight": "position_embeds",
86
+ "text_model.final_layer_norm.weight": "final_layer_norm.weight",
87
+ "text_model.final_layer_norm.bias": "final_layer_norm.bias"
88
+ }
89
+ attn_rename_dict = {
90
+ "self_attn.q_proj": "attn.to_q",
91
+ "self_attn.k_proj": "attn.to_k",
92
+ "self_attn.v_proj": "attn.to_v",
93
+ "self_attn.out_proj": "attn.to_out",
94
+ "layer_norm1": "layer_norm1",
95
+ "layer_norm2": "layer_norm2",
96
+ "mlp.fc1": "fc1",
97
+ "mlp.fc2": "fc2",
98
+ }
99
+ state_dict_ = {}
100
+ for name in state_dict:
101
+ if name in rename_dict:
102
+ param = state_dict[name]
103
+ if name == "text_model.embeddings.position_embedding.weight":
104
+ param = param.reshape((1, param.shape[0], param.shape[1]))
105
+ state_dict_[rename_dict[name]] = param
106
+ elif name.startswith("text_model.encoder.layers."):
107
+ param = state_dict[name]
108
+ names = name.split(".")
109
+ layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
110
+ name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
111
+ state_dict_[name_] = param
112
+ return state_dict_
113
+
114
+ def from_civitai(self, state_dict):
115
+ rename_dict = {
116
+ "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
117
+ "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias": "encoders.0.layer_norm1.bias",
118
+ "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight": "encoders.0.layer_norm1.weight",
119
+ "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias": "encoders.0.layer_norm2.bias",
120
+ "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight": "encoders.0.layer_norm2.weight",
121
+ "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias": "encoders.0.fc1.bias",
122
+ "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight": "encoders.0.fc1.weight",
123
+ "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias": "encoders.0.fc2.bias",
124
+ "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight": "encoders.0.fc2.weight",
125
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias": "encoders.0.attn.to_k.bias",
126
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight": "encoders.0.attn.to_k.weight",
127
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias": "encoders.0.attn.to_out.bias",
128
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight": "encoders.0.attn.to_out.weight",
129
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias": "encoders.0.attn.to_q.bias",
130
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight": "encoders.0.attn.to_q.weight",
131
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias": "encoders.0.attn.to_v.bias",
132
+ "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight": "encoders.0.attn.to_v.weight",
133
+ "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias": "encoders.1.layer_norm1.bias",
134
+ "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight": "encoders.1.layer_norm1.weight",
135
+ "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias": "encoders.1.layer_norm2.bias",
136
+ "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight": "encoders.1.layer_norm2.weight",
137
+ "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias": "encoders.1.fc1.bias",
138
+ "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight": "encoders.1.fc1.weight",
139
+ "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias": "encoders.1.fc2.bias",
140
+ "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight": "encoders.1.fc2.weight",
141
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias": "encoders.1.attn.to_k.bias",
142
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight": "encoders.1.attn.to_k.weight",
143
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias": "encoders.1.attn.to_out.bias",
144
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight": "encoders.1.attn.to_out.weight",
145
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias": "encoders.1.attn.to_q.bias",
146
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight": "encoders.1.attn.to_q.weight",
147
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias": "encoders.1.attn.to_v.bias",
148
+ "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight": "encoders.1.attn.to_v.weight",
149
+ "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias": "encoders.10.layer_norm1.bias",
150
+ "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight": "encoders.10.layer_norm1.weight",
151
+ "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias": "encoders.10.layer_norm2.bias",
152
+ "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight": "encoders.10.layer_norm2.weight",
153
+ "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias": "encoders.10.fc1.bias",
154
+ "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight": "encoders.10.fc1.weight",
155
+ "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias": "encoders.10.fc2.bias",
156
+ "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight": "encoders.10.fc2.weight",
157
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias": "encoders.10.attn.to_k.bias",
158
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight": "encoders.10.attn.to_k.weight",
159
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias": "encoders.10.attn.to_out.bias",
160
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight": "encoders.10.attn.to_out.weight",
161
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias": "encoders.10.attn.to_q.bias",
162
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight": "encoders.10.attn.to_q.weight",
163
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias": "encoders.10.attn.to_v.bias",
164
+ "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight": "encoders.10.attn.to_v.weight",
165
+ "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias": "encoders.11.layer_norm1.bias",
166
+ "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight": "encoders.11.layer_norm1.weight",
167
+ "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias": "encoders.11.layer_norm2.bias",
168
+ "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight": "encoders.11.layer_norm2.weight",
169
+ "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias": "encoders.11.fc1.bias",
170
+ "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight": "encoders.11.fc1.weight",
171
+ "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias": "encoders.11.fc2.bias",
172
+ "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight": "encoders.11.fc2.weight",
173
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias": "encoders.11.attn.to_k.bias",
174
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight": "encoders.11.attn.to_k.weight",
175
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias": "encoders.11.attn.to_out.bias",
176
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight": "encoders.11.attn.to_out.weight",
177
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias": "encoders.11.attn.to_q.bias",
178
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight": "encoders.11.attn.to_q.weight",
179
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias": "encoders.11.attn.to_v.bias",
180
+ "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight": "encoders.11.attn.to_v.weight",
181
+ "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias": "encoders.2.layer_norm1.bias",
182
+ "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight": "encoders.2.layer_norm1.weight",
183
+ "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias": "encoders.2.layer_norm2.bias",
184
+ "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight": "encoders.2.layer_norm2.weight",
185
+ "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias": "encoders.2.fc1.bias",
186
+ "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight": "encoders.2.fc1.weight",
187
+ "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias": "encoders.2.fc2.bias",
188
+ "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight": "encoders.2.fc2.weight",
189
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias": "encoders.2.attn.to_k.bias",
190
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight": "encoders.2.attn.to_k.weight",
191
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias": "encoders.2.attn.to_out.bias",
192
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight": "encoders.2.attn.to_out.weight",
193
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias": "encoders.2.attn.to_q.bias",
194
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight": "encoders.2.attn.to_q.weight",
195
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias": "encoders.2.attn.to_v.bias",
196
+ "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight": "encoders.2.attn.to_v.weight",
197
+ "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias": "encoders.3.layer_norm1.bias",
198
+ "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight": "encoders.3.layer_norm1.weight",
199
+ "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias": "encoders.3.layer_norm2.bias",
200
+ "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight": "encoders.3.layer_norm2.weight",
201
+ "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias": "encoders.3.fc1.bias",
202
+ "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight": "encoders.3.fc1.weight",
203
+ "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias": "encoders.3.fc2.bias",
204
+ "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight": "encoders.3.fc2.weight",
205
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias": "encoders.3.attn.to_k.bias",
206
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight": "encoders.3.attn.to_k.weight",
207
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias": "encoders.3.attn.to_out.bias",
208
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight": "encoders.3.attn.to_out.weight",
209
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias": "encoders.3.attn.to_q.bias",
210
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight": "encoders.3.attn.to_q.weight",
211
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias": "encoders.3.attn.to_v.bias",
212
+ "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight": "encoders.3.attn.to_v.weight",
213
+ "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias": "encoders.4.layer_norm1.bias",
214
+ "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight": "encoders.4.layer_norm1.weight",
215
+ "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias": "encoders.4.layer_norm2.bias",
216
+ "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight": "encoders.4.layer_norm2.weight",
217
+ "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias": "encoders.4.fc1.bias",
218
+ "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight": "encoders.4.fc1.weight",
219
+ "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias": "encoders.4.fc2.bias",
220
+ "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight": "encoders.4.fc2.weight",
221
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias": "encoders.4.attn.to_k.bias",
222
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight": "encoders.4.attn.to_k.weight",
223
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias": "encoders.4.attn.to_out.bias",
224
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight": "encoders.4.attn.to_out.weight",
225
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias": "encoders.4.attn.to_q.bias",
226
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight": "encoders.4.attn.to_q.weight",
227
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias": "encoders.4.attn.to_v.bias",
228
+ "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight": "encoders.4.attn.to_v.weight",
229
+ "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias": "encoders.5.layer_norm1.bias",
230
+ "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight": "encoders.5.layer_norm1.weight",
231
+ "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias": "encoders.5.layer_norm2.bias",
232
+ "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight": "encoders.5.layer_norm2.weight",
233
+ "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias": "encoders.5.fc1.bias",
234
+ "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight": "encoders.5.fc1.weight",
235
+ "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias": "encoders.5.fc2.bias",
236
+ "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight": "encoders.5.fc2.weight",
237
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias": "encoders.5.attn.to_k.bias",
238
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight": "encoders.5.attn.to_k.weight",
239
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias": "encoders.5.attn.to_out.bias",
240
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight": "encoders.5.attn.to_out.weight",
241
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias": "encoders.5.attn.to_q.bias",
242
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight": "encoders.5.attn.to_q.weight",
243
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias": "encoders.5.attn.to_v.bias",
244
+ "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight": "encoders.5.attn.to_v.weight",
245
+ "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias": "encoders.6.layer_norm1.bias",
246
+ "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight": "encoders.6.layer_norm1.weight",
247
+ "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias": "encoders.6.layer_norm2.bias",
248
+ "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight": "encoders.6.layer_norm2.weight",
249
+ "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias": "encoders.6.fc1.bias",
250
+ "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight": "encoders.6.fc1.weight",
251
+ "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias": "encoders.6.fc2.bias",
252
+ "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight": "encoders.6.fc2.weight",
253
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias": "encoders.6.attn.to_k.bias",
254
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight": "encoders.6.attn.to_k.weight",
255
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias": "encoders.6.attn.to_out.bias",
256
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight": "encoders.6.attn.to_out.weight",
257
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias": "encoders.6.attn.to_q.bias",
258
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight": "encoders.6.attn.to_q.weight",
259
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias": "encoders.6.attn.to_v.bias",
260
+ "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight": "encoders.6.attn.to_v.weight",
261
+ "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias": "encoders.7.layer_norm1.bias",
262
+ "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight": "encoders.7.layer_norm1.weight",
263
+ "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias": "encoders.7.layer_norm2.bias",
264
+ "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight": "encoders.7.layer_norm2.weight",
265
+ "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias": "encoders.7.fc1.bias",
266
+ "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight": "encoders.7.fc1.weight",
267
+ "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias": "encoders.7.fc2.bias",
268
+ "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight": "encoders.7.fc2.weight",
269
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias": "encoders.7.attn.to_k.bias",
270
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight": "encoders.7.attn.to_k.weight",
271
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias": "encoders.7.attn.to_out.bias",
272
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight": "encoders.7.attn.to_out.weight",
273
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias": "encoders.7.attn.to_q.bias",
274
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight": "encoders.7.attn.to_q.weight",
275
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias": "encoders.7.attn.to_v.bias",
276
+ "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight": "encoders.7.attn.to_v.weight",
277
+ "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias": "encoders.8.layer_norm1.bias",
278
+ "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight": "encoders.8.layer_norm1.weight",
279
+ "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias": "encoders.8.layer_norm2.bias",
280
+ "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight": "encoders.8.layer_norm2.weight",
281
+ "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias": "encoders.8.fc1.bias",
282
+ "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight": "encoders.8.fc1.weight",
283
+ "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias": "encoders.8.fc2.bias",
284
+ "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight": "encoders.8.fc2.weight",
285
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias": "encoders.8.attn.to_k.bias",
286
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight": "encoders.8.attn.to_k.weight",
287
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias": "encoders.8.attn.to_out.bias",
288
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight": "encoders.8.attn.to_out.weight",
289
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias": "encoders.8.attn.to_q.bias",
290
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight": "encoders.8.attn.to_q.weight",
291
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias": "encoders.8.attn.to_v.bias",
292
+ "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight": "encoders.8.attn.to_v.weight",
293
+ "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias": "encoders.9.layer_norm1.bias",
294
+ "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight": "encoders.9.layer_norm1.weight",
295
+ "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias": "encoders.9.layer_norm2.bias",
296
+ "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight": "encoders.9.layer_norm2.weight",
297
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias": "encoders.9.fc1.bias",
298
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight": "encoders.9.fc1.weight",
299
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias": "encoders.9.fc2.bias",
300
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight": "encoders.9.fc2.weight",
301
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias": "encoders.9.attn.to_k.bias",
302
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight": "encoders.9.attn.to_k.weight",
303
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias": "encoders.9.attn.to_out.bias",
304
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight": "encoders.9.attn.to_out.weight",
305
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias": "encoders.9.attn.to_q.bias",
306
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight": "encoders.9.attn.to_q.weight",
307
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias": "encoders.9.attn.to_v.bias",
308
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight": "encoders.9.attn.to_v.weight",
309
+ "cond_stage_model.transformer.text_model.final_layer_norm.bias": "final_layer_norm.bias",
310
+ "cond_stage_model.transformer.text_model.final_layer_norm.weight": "final_layer_norm.weight",
311
+ "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight": "position_embeds"
312
+ }
313
+ state_dict_ = {}
314
+ for name in state_dict:
315
+ if name in rename_dict:
316
+ param = state_dict[name]
317
+ if name == "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight":
318
+ param = param.reshape((1, param.shape[0], param.shape[1]))
319
+ state_dict_[rename_dict[name]] = param
320
+ return state_dict_
diffsynth/models/sd_unet.py ADDED
The diff for this file is too large to render. See raw diff
 
diffsynth/models/sd_vae_decoder.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .attention import Attention
3
+ from .sd_unet import ResnetBlock, UpSampler
4
+ from .tiler import TileWorker
5
+
6
+
7
+ class VAEAttentionBlock(torch.nn.Module):
8
+
9
+ def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5):
10
+ super().__init__()
11
+ inner_dim = num_attention_heads * attention_head_dim
12
+
13
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
14
+
15
+ self.transformer_blocks = torch.nn.ModuleList([
16
+ Attention(
17
+ inner_dim,
18
+ num_attention_heads,
19
+ attention_head_dim,
20
+ bias_q=True,
21
+ bias_kv=True,
22
+ bias_out=True
23
+ )
24
+ for d in range(num_layers)
25
+ ])
26
+
27
+ def forward(self, hidden_states, time_emb, text_emb, res_stack):
28
+ batch, _, height, width = hidden_states.shape
29
+ residual = hidden_states
30
+
31
+ hidden_states = self.norm(hidden_states)
32
+ inner_dim = hidden_states.shape[1]
33
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
34
+
35
+ for block in self.transformer_blocks:
36
+ hidden_states = block(hidden_states)
37
+
38
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
39
+ hidden_states = hidden_states + residual
40
+
41
+ return hidden_states, time_emb, text_emb, res_stack
42
+
43
+
44
+ class SDVAEDecoder(torch.nn.Module):
45
+ def __init__(self):
46
+ super().__init__()
47
+ self.scaling_factor = 0.18215
48
+ self.post_quant_conv = torch.nn.Conv2d(4, 4, kernel_size=1)
49
+ self.conv_in = torch.nn.Conv2d(4, 512, kernel_size=3, padding=1)
50
+
51
+ self.blocks = torch.nn.ModuleList([
52
+ # UNetMidBlock2D
53
+ ResnetBlock(512, 512, eps=1e-6),
54
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
55
+ ResnetBlock(512, 512, eps=1e-6),
56
+ # UpDecoderBlock2D
57
+ ResnetBlock(512, 512, eps=1e-6),
58
+ ResnetBlock(512, 512, eps=1e-6),
59
+ ResnetBlock(512, 512, eps=1e-6),
60
+ UpSampler(512),
61
+ # UpDecoderBlock2D
62
+ ResnetBlock(512, 512, eps=1e-6),
63
+ ResnetBlock(512, 512, eps=1e-6),
64
+ ResnetBlock(512, 512, eps=1e-6),
65
+ UpSampler(512),
66
+ # UpDecoderBlock2D
67
+ ResnetBlock(512, 256, eps=1e-6),
68
+ ResnetBlock(256, 256, eps=1e-6),
69
+ ResnetBlock(256, 256, eps=1e-6),
70
+ UpSampler(256),
71
+ # UpDecoderBlock2D
72
+ ResnetBlock(256, 128, eps=1e-6),
73
+ ResnetBlock(128, 128, eps=1e-6),
74
+ ResnetBlock(128, 128, eps=1e-6),
75
+ ])
76
+
77
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-5)
78
+ self.conv_act = torch.nn.SiLU()
79
+ self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
80
+
81
+ def tiled_forward(self, sample, tile_size=64, tile_stride=32):
82
+ hidden_states = TileWorker().tiled_forward(
83
+ lambda x: self.forward(x),
84
+ sample,
85
+ tile_size,
86
+ tile_stride,
87
+ tile_device=sample.device,
88
+ tile_dtype=sample.dtype
89
+ )
90
+ return hidden_states
91
+
92
+ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
93
+ # For VAE Decoder, we do not need to apply the tiler on each layer.
94
+ if tiled:
95
+ return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
96
+
97
+ # 1. pre-process
98
+ sample = sample / self.scaling_factor
99
+ hidden_states = self.post_quant_conv(sample)
100
+ hidden_states = self.conv_in(hidden_states)
101
+ time_emb = None
102
+ text_emb = None
103
+ res_stack = None
104
+
105
+ # 2. blocks
106
+ for i, block in enumerate(self.blocks):
107
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
108
+
109
+ # 3. output
110
+ hidden_states = self.conv_norm_out(hidden_states)
111
+ hidden_states = self.conv_act(hidden_states)
112
+ hidden_states = self.conv_out(hidden_states)
113
+
114
+ return hidden_states
115
+
116
+ def state_dict_converter(self):
117
+ return SDVAEDecoderStateDictConverter()
118
+
119
+
120
+ class SDVAEDecoderStateDictConverter:
121
+ def __init__(self):
122
+ pass
123
+
124
+ def from_diffusers(self, state_dict):
125
+ # architecture
126
+ block_types = [
127
+ 'ResnetBlock', 'VAEAttentionBlock', 'ResnetBlock',
128
+ 'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
129
+ 'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
130
+ 'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
131
+ 'ResnetBlock', 'ResnetBlock', 'ResnetBlock'
132
+ ]
133
+
134
+ # Rename each parameter
135
+ local_rename_dict = {
136
+ "post_quant_conv": "post_quant_conv",
137
+ "decoder.conv_in": "conv_in",
138
+ "decoder.mid_block.attentions.0.group_norm": "blocks.1.norm",
139
+ "decoder.mid_block.attentions.0.to_q": "blocks.1.transformer_blocks.0.to_q",
140
+ "decoder.mid_block.attentions.0.to_k": "blocks.1.transformer_blocks.0.to_k",
141
+ "decoder.mid_block.attentions.0.to_v": "blocks.1.transformer_blocks.0.to_v",
142
+ "decoder.mid_block.attentions.0.to_out.0": "blocks.1.transformer_blocks.0.to_out",
143
+ "decoder.mid_block.resnets.0.norm1": "blocks.0.norm1",
144
+ "decoder.mid_block.resnets.0.conv1": "blocks.0.conv1",
145
+ "decoder.mid_block.resnets.0.norm2": "blocks.0.norm2",
146
+ "decoder.mid_block.resnets.0.conv2": "blocks.0.conv2",
147
+ "decoder.mid_block.resnets.1.norm1": "blocks.2.norm1",
148
+ "decoder.mid_block.resnets.1.conv1": "blocks.2.conv1",
149
+ "decoder.mid_block.resnets.1.norm2": "blocks.2.norm2",
150
+ "decoder.mid_block.resnets.1.conv2": "blocks.2.conv2",
151
+ "decoder.conv_norm_out": "conv_norm_out",
152
+ "decoder.conv_out": "conv_out",
153
+ }
154
+ name_list = sorted([name for name in state_dict])
155
+ rename_dict = {}
156
+ block_id = {"ResnetBlock": 2, "DownSampler": 2, "UpSampler": 2}
157
+ last_block_type_with_id = {"ResnetBlock": "", "DownSampler": "", "UpSampler": ""}
158
+ for name in name_list:
159
+ names = name.split(".")
160
+ name_prefix = ".".join(names[:-1])
161
+ if name_prefix in local_rename_dict:
162
+ rename_dict[name] = local_rename_dict[name_prefix] + "." + names[-1]
163
+ elif name.startswith("decoder.up_blocks"):
164
+ block_type = {"resnets": "ResnetBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}[names[3]]
165
+ block_type_with_id = ".".join(names[:5])
166
+ if block_type_with_id != last_block_type_with_id[block_type]:
167
+ block_id[block_type] += 1
168
+ last_block_type_with_id[block_type] = block_type_with_id
169
+ while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
170
+ block_id[block_type] += 1
171
+ block_type_with_id = ".".join(names[:5])
172
+ names = ["blocks", str(block_id[block_type])] + names[5:]
173
+ rename_dict[name] = ".".join(names)
174
+
175
+ # Convert state_dict
176
+ state_dict_ = {}
177
+ for name, param in state_dict.items():
178
+ if name in rename_dict:
179
+ state_dict_[rename_dict[name]] = param
180
+ return state_dict_
181
+
182
+ def from_civitai(self, state_dict):
183
+ rename_dict = {
184
+ "first_stage_model.decoder.conv_in.bias": "conv_in.bias",
185
+ "first_stage_model.decoder.conv_in.weight": "conv_in.weight",
186
+ "first_stage_model.decoder.conv_out.bias": "conv_out.bias",
187
+ "first_stage_model.decoder.conv_out.weight": "conv_out.weight",
188
+ "first_stage_model.decoder.mid.attn_1.k.bias": "blocks.1.transformer_blocks.0.to_k.bias",
189
+ "first_stage_model.decoder.mid.attn_1.k.weight": "blocks.1.transformer_blocks.0.to_k.weight",
190
+ "first_stage_model.decoder.mid.attn_1.norm.bias": "blocks.1.norm.bias",
191
+ "first_stage_model.decoder.mid.attn_1.norm.weight": "blocks.1.norm.weight",
192
+ "first_stage_model.decoder.mid.attn_1.proj_out.bias": "blocks.1.transformer_blocks.0.to_out.bias",
193
+ "first_stage_model.decoder.mid.attn_1.proj_out.weight": "blocks.1.transformer_blocks.0.to_out.weight",
194
+ "first_stage_model.decoder.mid.attn_1.q.bias": "blocks.1.transformer_blocks.0.to_q.bias",
195
+ "first_stage_model.decoder.mid.attn_1.q.weight": "blocks.1.transformer_blocks.0.to_q.weight",
196
+ "first_stage_model.decoder.mid.attn_1.v.bias": "blocks.1.transformer_blocks.0.to_v.bias",
197
+ "first_stage_model.decoder.mid.attn_1.v.weight": "blocks.1.transformer_blocks.0.to_v.weight",
198
+ "first_stage_model.decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
199
+ "first_stage_model.decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
200
+ "first_stage_model.decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
201
+ "first_stage_model.decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
202
+ "first_stage_model.decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
203
+ "first_stage_model.decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
204
+ "first_stage_model.decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
205
+ "first_stage_model.decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
206
+ "first_stage_model.decoder.mid.block_2.conv1.bias": "blocks.2.conv1.bias",
207
+ "first_stage_model.decoder.mid.block_2.conv1.weight": "blocks.2.conv1.weight",
208
+ "first_stage_model.decoder.mid.block_2.conv2.bias": "blocks.2.conv2.bias",
209
+ "first_stage_model.decoder.mid.block_2.conv2.weight": "blocks.2.conv2.weight",
210
+ "first_stage_model.decoder.mid.block_2.norm1.bias": "blocks.2.norm1.bias",
211
+ "first_stage_model.decoder.mid.block_2.norm1.weight": "blocks.2.norm1.weight",
212
+ "first_stage_model.decoder.mid.block_2.norm2.bias": "blocks.2.norm2.bias",
213
+ "first_stage_model.decoder.mid.block_2.norm2.weight": "blocks.2.norm2.weight",
214
+ "first_stage_model.decoder.norm_out.bias": "conv_norm_out.bias",
215
+ "first_stage_model.decoder.norm_out.weight": "conv_norm_out.weight",
216
+ "first_stage_model.decoder.up.0.block.0.conv1.bias": "blocks.15.conv1.bias",
217
+ "first_stage_model.decoder.up.0.block.0.conv1.weight": "blocks.15.conv1.weight",
218
+ "first_stage_model.decoder.up.0.block.0.conv2.bias": "blocks.15.conv2.bias",
219
+ "first_stage_model.decoder.up.0.block.0.conv2.weight": "blocks.15.conv2.weight",
220
+ "first_stage_model.decoder.up.0.block.0.nin_shortcut.bias": "blocks.15.conv_shortcut.bias",
221
+ "first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": "blocks.15.conv_shortcut.weight",
222
+ "first_stage_model.decoder.up.0.block.0.norm1.bias": "blocks.15.norm1.bias",
223
+ "first_stage_model.decoder.up.0.block.0.norm1.weight": "blocks.15.norm1.weight",
224
+ "first_stage_model.decoder.up.0.block.0.norm2.bias": "blocks.15.norm2.bias",
225
+ "first_stage_model.decoder.up.0.block.0.norm2.weight": "blocks.15.norm2.weight",
226
+ "first_stage_model.decoder.up.0.block.1.conv1.bias": "blocks.16.conv1.bias",
227
+ "first_stage_model.decoder.up.0.block.1.conv1.weight": "blocks.16.conv1.weight",
228
+ "first_stage_model.decoder.up.0.block.1.conv2.bias": "blocks.16.conv2.bias",
229
+ "first_stage_model.decoder.up.0.block.1.conv2.weight": "blocks.16.conv2.weight",
230
+ "first_stage_model.decoder.up.0.block.1.norm1.bias": "blocks.16.norm1.bias",
231
+ "first_stage_model.decoder.up.0.block.1.norm1.weight": "blocks.16.norm1.weight",
232
+ "first_stage_model.decoder.up.0.block.1.norm2.bias": "blocks.16.norm2.bias",
233
+ "first_stage_model.decoder.up.0.block.1.norm2.weight": "blocks.16.norm2.weight",
234
+ "first_stage_model.decoder.up.0.block.2.conv1.bias": "blocks.17.conv1.bias",
235
+ "first_stage_model.decoder.up.0.block.2.conv1.weight": "blocks.17.conv1.weight",
236
+ "first_stage_model.decoder.up.0.block.2.conv2.bias": "blocks.17.conv2.bias",
237
+ "first_stage_model.decoder.up.0.block.2.conv2.weight": "blocks.17.conv2.weight",
238
+ "first_stage_model.decoder.up.0.block.2.norm1.bias": "blocks.17.norm1.bias",
239
+ "first_stage_model.decoder.up.0.block.2.norm1.weight": "blocks.17.norm1.weight",
240
+ "first_stage_model.decoder.up.0.block.2.norm2.bias": "blocks.17.norm2.bias",
241
+ "first_stage_model.decoder.up.0.block.2.norm2.weight": "blocks.17.norm2.weight",
242
+ "first_stage_model.decoder.up.1.block.0.conv1.bias": "blocks.11.conv1.bias",
243
+ "first_stage_model.decoder.up.1.block.0.conv1.weight": "blocks.11.conv1.weight",
244
+ "first_stage_model.decoder.up.1.block.0.conv2.bias": "blocks.11.conv2.bias",
245
+ "first_stage_model.decoder.up.1.block.0.conv2.weight": "blocks.11.conv2.weight",
246
+ "first_stage_model.decoder.up.1.block.0.nin_shortcut.bias": "blocks.11.conv_shortcut.bias",
247
+ "first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": "blocks.11.conv_shortcut.weight",
248
+ "first_stage_model.decoder.up.1.block.0.norm1.bias": "blocks.11.norm1.bias",
249
+ "first_stage_model.decoder.up.1.block.0.norm1.weight": "blocks.11.norm1.weight",
250
+ "first_stage_model.decoder.up.1.block.0.norm2.bias": "blocks.11.norm2.bias",
251
+ "first_stage_model.decoder.up.1.block.0.norm2.weight": "blocks.11.norm2.weight",
252
+ "first_stage_model.decoder.up.1.block.1.conv1.bias": "blocks.12.conv1.bias",
253
+ "first_stage_model.decoder.up.1.block.1.conv1.weight": "blocks.12.conv1.weight",
254
+ "first_stage_model.decoder.up.1.block.1.conv2.bias": "blocks.12.conv2.bias",
255
+ "first_stage_model.decoder.up.1.block.1.conv2.weight": "blocks.12.conv2.weight",
256
+ "first_stage_model.decoder.up.1.block.1.norm1.bias": "blocks.12.norm1.bias",
257
+ "first_stage_model.decoder.up.1.block.1.norm1.weight": "blocks.12.norm1.weight",
258
+ "first_stage_model.decoder.up.1.block.1.norm2.bias": "blocks.12.norm2.bias",
259
+ "first_stage_model.decoder.up.1.block.1.norm2.weight": "blocks.12.norm2.weight",
260
+ "first_stage_model.decoder.up.1.block.2.conv1.bias": "blocks.13.conv1.bias",
261
+ "first_stage_model.decoder.up.1.block.2.conv1.weight": "blocks.13.conv1.weight",
262
+ "first_stage_model.decoder.up.1.block.2.conv2.bias": "blocks.13.conv2.bias",
263
+ "first_stage_model.decoder.up.1.block.2.conv2.weight": "blocks.13.conv2.weight",
264
+ "first_stage_model.decoder.up.1.block.2.norm1.bias": "blocks.13.norm1.bias",
265
+ "first_stage_model.decoder.up.1.block.2.norm1.weight": "blocks.13.norm1.weight",
266
+ "first_stage_model.decoder.up.1.block.2.norm2.bias": "blocks.13.norm2.bias",
267
+ "first_stage_model.decoder.up.1.block.2.norm2.weight": "blocks.13.norm2.weight",
268
+ "first_stage_model.decoder.up.1.upsample.conv.bias": "blocks.14.conv.bias",
269
+ "first_stage_model.decoder.up.1.upsample.conv.weight": "blocks.14.conv.weight",
270
+ "first_stage_model.decoder.up.2.block.0.conv1.bias": "blocks.7.conv1.bias",
271
+ "first_stage_model.decoder.up.2.block.0.conv1.weight": "blocks.7.conv1.weight",
272
+ "first_stage_model.decoder.up.2.block.0.conv2.bias": "blocks.7.conv2.bias",
273
+ "first_stage_model.decoder.up.2.block.0.conv2.weight": "blocks.7.conv2.weight",
274
+ "first_stage_model.decoder.up.2.block.0.norm1.bias": "blocks.7.norm1.bias",
275
+ "first_stage_model.decoder.up.2.block.0.norm1.weight": "blocks.7.norm1.weight",
276
+ "first_stage_model.decoder.up.2.block.0.norm2.bias": "blocks.7.norm2.bias",
277
+ "first_stage_model.decoder.up.2.block.0.norm2.weight": "blocks.7.norm2.weight",
278
+ "first_stage_model.decoder.up.2.block.1.conv1.bias": "blocks.8.conv1.bias",
279
+ "first_stage_model.decoder.up.2.block.1.conv1.weight": "blocks.8.conv1.weight",
280
+ "first_stage_model.decoder.up.2.block.1.conv2.bias": "blocks.8.conv2.bias",
281
+ "first_stage_model.decoder.up.2.block.1.conv2.weight": "blocks.8.conv2.weight",
282
+ "first_stage_model.decoder.up.2.block.1.norm1.bias": "blocks.8.norm1.bias",
283
+ "first_stage_model.decoder.up.2.block.1.norm1.weight": "blocks.8.norm1.weight",
284
+ "first_stage_model.decoder.up.2.block.1.norm2.bias": "blocks.8.norm2.bias",
285
+ "first_stage_model.decoder.up.2.block.1.norm2.weight": "blocks.8.norm2.weight",
286
+ "first_stage_model.decoder.up.2.block.2.conv1.bias": "blocks.9.conv1.bias",
287
+ "first_stage_model.decoder.up.2.block.2.conv1.weight": "blocks.9.conv1.weight",
288
+ "first_stage_model.decoder.up.2.block.2.conv2.bias": "blocks.9.conv2.bias",
289
+ "first_stage_model.decoder.up.2.block.2.conv2.weight": "blocks.9.conv2.weight",
290
+ "first_stage_model.decoder.up.2.block.2.norm1.bias": "blocks.9.norm1.bias",
291
+ "first_stage_model.decoder.up.2.block.2.norm1.weight": "blocks.9.norm1.weight",
292
+ "first_stage_model.decoder.up.2.block.2.norm2.bias": "blocks.9.norm2.bias",
293
+ "first_stage_model.decoder.up.2.block.2.norm2.weight": "blocks.9.norm2.weight",
294
+ "first_stage_model.decoder.up.2.upsample.conv.bias": "blocks.10.conv.bias",
295
+ "first_stage_model.decoder.up.2.upsample.conv.weight": "blocks.10.conv.weight",
296
+ "first_stage_model.decoder.up.3.block.0.conv1.bias": "blocks.3.conv1.bias",
297
+ "first_stage_model.decoder.up.3.block.0.conv1.weight": "blocks.3.conv1.weight",
298
+ "first_stage_model.decoder.up.3.block.0.conv2.bias": "blocks.3.conv2.bias",
299
+ "first_stage_model.decoder.up.3.block.0.conv2.weight": "blocks.3.conv2.weight",
300
+ "first_stage_model.decoder.up.3.block.0.norm1.bias": "blocks.3.norm1.bias",
301
+ "first_stage_model.decoder.up.3.block.0.norm1.weight": "blocks.3.norm1.weight",
302
+ "first_stage_model.decoder.up.3.block.0.norm2.bias": "blocks.3.norm2.bias",
303
+ "first_stage_model.decoder.up.3.block.0.norm2.weight": "blocks.3.norm2.weight",
304
+ "first_stage_model.decoder.up.3.block.1.conv1.bias": "blocks.4.conv1.bias",
305
+ "first_stage_model.decoder.up.3.block.1.conv1.weight": "blocks.4.conv1.weight",
306
+ "first_stage_model.decoder.up.3.block.1.conv2.bias": "blocks.4.conv2.bias",
307
+ "first_stage_model.decoder.up.3.block.1.conv2.weight": "blocks.4.conv2.weight",
308
+ "first_stage_model.decoder.up.3.block.1.norm1.bias": "blocks.4.norm1.bias",
309
+ "first_stage_model.decoder.up.3.block.1.norm1.weight": "blocks.4.norm1.weight",
310
+ "first_stage_model.decoder.up.3.block.1.norm2.bias": "blocks.4.norm2.bias",
311
+ "first_stage_model.decoder.up.3.block.1.norm2.weight": "blocks.4.norm2.weight",
312
+ "first_stage_model.decoder.up.3.block.2.conv1.bias": "blocks.5.conv1.bias",
313
+ "first_stage_model.decoder.up.3.block.2.conv1.weight": "blocks.5.conv1.weight",
314
+ "first_stage_model.decoder.up.3.block.2.conv2.bias": "blocks.5.conv2.bias",
315
+ "first_stage_model.decoder.up.3.block.2.conv2.weight": "blocks.5.conv2.weight",
316
+ "first_stage_model.decoder.up.3.block.2.norm1.bias": "blocks.5.norm1.bias",
317
+ "first_stage_model.decoder.up.3.block.2.norm1.weight": "blocks.5.norm1.weight",
318
+ "first_stage_model.decoder.up.3.block.2.norm2.bias": "blocks.5.norm2.bias",
319
+ "first_stage_model.decoder.up.3.block.2.norm2.weight": "blocks.5.norm2.weight",
320
+ "first_stage_model.decoder.up.3.upsample.conv.bias": "blocks.6.conv.bias",
321
+ "first_stage_model.decoder.up.3.upsample.conv.weight": "blocks.6.conv.weight",
322
+ "first_stage_model.post_quant_conv.bias": "post_quant_conv.bias",
323
+ "first_stage_model.post_quant_conv.weight": "post_quant_conv.weight",
324
+ }
325
+ state_dict_ = {}
326
+ for name in state_dict:
327
+ if name in rename_dict:
328
+ param = state_dict[name]
329
+ if "transformer_blocks" in rename_dict[name]:
330
+ param = param.squeeze()
331
+ state_dict_[rename_dict[name]] = param
332
+ return state_dict_
diffsynth/models/sd_vae_encoder.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_unet import ResnetBlock, DownSampler
3
+ from .sd_vae_decoder import VAEAttentionBlock
4
+ from .tiler import TileWorker
5
+ from einops import rearrange
6
+
7
+
8
+ class SDVAEEncoder(torch.nn.Module):
9
+ def __init__(self):
10
+ super().__init__()
11
+ self.scaling_factor = 0.18215
12
+ self.quant_conv = torch.nn.Conv2d(8, 8, kernel_size=1)
13
+ self.conv_in = torch.nn.Conv2d(3, 128, kernel_size=3, padding=1)
14
+
15
+ self.blocks = torch.nn.ModuleList([
16
+ # DownEncoderBlock2D
17
+ ResnetBlock(128, 128, eps=1e-6),
18
+ ResnetBlock(128, 128, eps=1e-6),
19
+ DownSampler(128, padding=0, extra_padding=True),
20
+ # DownEncoderBlock2D
21
+ ResnetBlock(128, 256, eps=1e-6),
22
+ ResnetBlock(256, 256, eps=1e-6),
23
+ DownSampler(256, padding=0, extra_padding=True),
24
+ # DownEncoderBlock2D
25
+ ResnetBlock(256, 512, eps=1e-6),
26
+ ResnetBlock(512, 512, eps=1e-6),
27
+ DownSampler(512, padding=0, extra_padding=True),
28
+ # DownEncoderBlock2D
29
+ ResnetBlock(512, 512, eps=1e-6),
30
+ ResnetBlock(512, 512, eps=1e-6),
31
+ # UNetMidBlock2D
32
+ ResnetBlock(512, 512, eps=1e-6),
33
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
34
+ ResnetBlock(512, 512, eps=1e-6),
35
+ ])
36
+
37
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6)
38
+ self.conv_act = torch.nn.SiLU()
39
+ self.conv_out = torch.nn.Conv2d(512, 8, kernel_size=3, padding=1)
40
+
41
+ def tiled_forward(self, sample, tile_size=64, tile_stride=32):
42
+ hidden_states = TileWorker().tiled_forward(
43
+ lambda x: self.forward(x),
44
+ sample,
45
+ tile_size,
46
+ tile_stride,
47
+ tile_device=sample.device,
48
+ tile_dtype=sample.dtype
49
+ )
50
+ return hidden_states
51
+
52
+ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
53
+ # For VAE Decoder, we do not need to apply the tiler on each layer.
54
+ if tiled:
55
+ return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
56
+
57
+ # 1. pre-process
58
+ hidden_states = self.conv_in(sample)
59
+ time_emb = None
60
+ text_emb = None
61
+ res_stack = None
62
+
63
+ # 2. blocks
64
+ for i, block in enumerate(self.blocks):
65
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
66
+
67
+ # 3. output
68
+ hidden_states = self.conv_norm_out(hidden_states)
69
+ hidden_states = self.conv_act(hidden_states)
70
+ hidden_states = self.conv_out(hidden_states)
71
+ hidden_states = self.quant_conv(hidden_states)
72
+ hidden_states = hidden_states[:, :4]
73
+ hidden_states *= self.scaling_factor
74
+
75
+ return hidden_states
76
+
77
+ def encode_video(self, sample, batch_size=8):
78
+ B = sample.shape[0]
79
+ hidden_states = []
80
+
81
+ for i in range(0, sample.shape[2], batch_size):
82
+
83
+ j = min(i + batch_size, sample.shape[2])
84
+ sample_batch = rearrange(sample[:,:,i:j], "B C T H W -> (B T) C H W")
85
+
86
+ hidden_states_batch = self(sample_batch)
87
+ hidden_states_batch = rearrange(hidden_states_batch, "(B T) C H W -> B C T H W", B=B)
88
+
89
+ hidden_states.append(hidden_states_batch)
90
+
91
+ hidden_states = torch.concat(hidden_states, dim=2)
92
+ return hidden_states
93
+
94
+ def state_dict_converter(self):
95
+ return SDVAEEncoderStateDictConverter()
96
+
97
+
98
+ class SDVAEEncoderStateDictConverter:
99
+ def __init__(self):
100
+ pass
101
+
102
+ def from_diffusers(self, state_dict):
103
+ # architecture
104
+ block_types = [
105
+ 'ResnetBlock', 'ResnetBlock', 'DownSampler',
106
+ 'ResnetBlock', 'ResnetBlock', 'DownSampler',
107
+ 'ResnetBlock', 'ResnetBlock', 'DownSampler',
108
+ 'ResnetBlock', 'ResnetBlock',
109
+ 'ResnetBlock', 'VAEAttentionBlock', 'ResnetBlock'
110
+ ]
111
+
112
+ # Rename each parameter
113
+ local_rename_dict = {
114
+ "quant_conv": "quant_conv",
115
+ "encoder.conv_in": "conv_in",
116
+ "encoder.mid_block.attentions.0.group_norm": "blocks.12.norm",
117
+ "encoder.mid_block.attentions.0.to_q": "blocks.12.transformer_blocks.0.to_q",
118
+ "encoder.mid_block.attentions.0.to_k": "blocks.12.transformer_blocks.0.to_k",
119
+ "encoder.mid_block.attentions.0.to_v": "blocks.12.transformer_blocks.0.to_v",
120
+ "encoder.mid_block.attentions.0.to_out.0": "blocks.12.transformer_blocks.0.to_out",
121
+ "encoder.mid_block.resnets.0.norm1": "blocks.11.norm1",
122
+ "encoder.mid_block.resnets.0.conv1": "blocks.11.conv1",
123
+ "encoder.mid_block.resnets.0.norm2": "blocks.11.norm2",
124
+ "encoder.mid_block.resnets.0.conv2": "blocks.11.conv2",
125
+ "encoder.mid_block.resnets.1.norm1": "blocks.13.norm1",
126
+ "encoder.mid_block.resnets.1.conv1": "blocks.13.conv1",
127
+ "encoder.mid_block.resnets.1.norm2": "blocks.13.norm2",
128
+ "encoder.mid_block.resnets.1.conv2": "blocks.13.conv2",
129
+ "encoder.conv_norm_out": "conv_norm_out",
130
+ "encoder.conv_out": "conv_out",
131
+ }
132
+ name_list = sorted([name for name in state_dict])
133
+ rename_dict = {}
134
+ block_id = {"ResnetBlock": -1, "DownSampler": -1, "UpSampler": -1}
135
+ last_block_type_with_id = {"ResnetBlock": "", "DownSampler": "", "UpSampler": ""}
136
+ for name in name_list:
137
+ names = name.split(".")
138
+ name_prefix = ".".join(names[:-1])
139
+ if name_prefix in local_rename_dict:
140
+ rename_dict[name] = local_rename_dict[name_prefix] + "." + names[-1]
141
+ elif name.startswith("encoder.down_blocks"):
142
+ block_type = {"resnets": "ResnetBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}[names[3]]
143
+ block_type_with_id = ".".join(names[:5])
144
+ if block_type_with_id != last_block_type_with_id[block_type]:
145
+ block_id[block_type] += 1
146
+ last_block_type_with_id[block_type] = block_type_with_id
147
+ while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
148
+ block_id[block_type] += 1
149
+ block_type_with_id = ".".join(names[:5])
150
+ names = ["blocks", str(block_id[block_type])] + names[5:]
151
+ rename_dict[name] = ".".join(names)
152
+
153
+ # Convert state_dict
154
+ state_dict_ = {}
155
+ for name, param in state_dict.items():
156
+ if name in rename_dict:
157
+ state_dict_[rename_dict[name]] = param
158
+ return state_dict_
159
+
160
+ def from_civitai(self, state_dict):
161
+ rename_dict = {
162
+ "first_stage_model.encoder.conv_in.bias": "conv_in.bias",
163
+ "first_stage_model.encoder.conv_in.weight": "conv_in.weight",
164
+ "first_stage_model.encoder.conv_out.bias": "conv_out.bias",
165
+ "first_stage_model.encoder.conv_out.weight": "conv_out.weight",
166
+ "first_stage_model.encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias",
167
+ "first_stage_model.encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight",
168
+ "first_stage_model.encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias",
169
+ "first_stage_model.encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight",
170
+ "first_stage_model.encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias",
171
+ "first_stage_model.encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight",
172
+ "first_stage_model.encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias",
173
+ "first_stage_model.encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight",
174
+ "first_stage_model.encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias",
175
+ "first_stage_model.encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight",
176
+ "first_stage_model.encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias",
177
+ "first_stage_model.encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight",
178
+ "first_stage_model.encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias",
179
+ "first_stage_model.encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight",
180
+ "first_stage_model.encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias",
181
+ "first_stage_model.encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight",
182
+ "first_stage_model.encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias",
183
+ "first_stage_model.encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight",
184
+ "first_stage_model.encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias",
185
+ "first_stage_model.encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight",
186
+ "first_stage_model.encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias",
187
+ "first_stage_model.encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight",
188
+ "first_stage_model.encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias",
189
+ "first_stage_model.encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight",
190
+ "first_stage_model.encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias",
191
+ "first_stage_model.encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight",
192
+ "first_stage_model.encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias",
193
+ "first_stage_model.encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight",
194
+ "first_stage_model.encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias",
195
+ "first_stage_model.encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight",
196
+ "first_stage_model.encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias",
197
+ "first_stage_model.encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight",
198
+ "first_stage_model.encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias",
199
+ "first_stage_model.encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight",
200
+ "first_stage_model.encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias",
201
+ "first_stage_model.encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight",
202
+ "first_stage_model.encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias",
203
+ "first_stage_model.encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight",
204
+ "first_stage_model.encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias",
205
+ "first_stage_model.encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight",
206
+ "first_stage_model.encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias",
207
+ "first_stage_model.encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight",
208
+ "first_stage_model.encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias",
209
+ "first_stage_model.encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight",
210
+ "first_stage_model.encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias",
211
+ "first_stage_model.encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight",
212
+ "first_stage_model.encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias",
213
+ "first_stage_model.encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight",
214
+ "first_stage_model.encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias",
215
+ "first_stage_model.encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight",
216
+ "first_stage_model.encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias",
217
+ "first_stage_model.encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight",
218
+ "first_stage_model.encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias",
219
+ "first_stage_model.encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight",
220
+ "first_stage_model.encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias",
221
+ "first_stage_model.encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight",
222
+ "first_stage_model.encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias",
223
+ "first_stage_model.encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight",
224
+ "first_stage_model.encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias",
225
+ "first_stage_model.encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight",
226
+ "first_stage_model.encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias",
227
+ "first_stage_model.encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight",
228
+ "first_stage_model.encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias",
229
+ "first_stage_model.encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight",
230
+ "first_stage_model.encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias",
231
+ "first_stage_model.encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight",
232
+ "first_stage_model.encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias",
233
+ "first_stage_model.encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight",
234
+ "first_stage_model.encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias",
235
+ "first_stage_model.encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight",
236
+ "first_stage_model.encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias",
237
+ "first_stage_model.encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight",
238
+ "first_stage_model.encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias",
239
+ "first_stage_model.encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight",
240
+ "first_stage_model.encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias",
241
+ "first_stage_model.encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight",
242
+ "first_stage_model.encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias",
243
+ "first_stage_model.encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight",
244
+ "first_stage_model.encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias",
245
+ "first_stage_model.encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight",
246
+ "first_stage_model.encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias",
247
+ "first_stage_model.encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight",
248
+ "first_stage_model.encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias",
249
+ "first_stage_model.encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight",
250
+ "first_stage_model.encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias",
251
+ "first_stage_model.encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight",
252
+ "first_stage_model.encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias",
253
+ "first_stage_model.encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight",
254
+ "first_stage_model.encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias",
255
+ "first_stage_model.encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight",
256
+ "first_stage_model.encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias",
257
+ "first_stage_model.encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight",
258
+ "first_stage_model.encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias",
259
+ "first_stage_model.encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight",
260
+ "first_stage_model.encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias",
261
+ "first_stage_model.encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight",
262
+ "first_stage_model.encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias",
263
+ "first_stage_model.encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight",
264
+ "first_stage_model.encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias",
265
+ "first_stage_model.encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight",
266
+ "first_stage_model.encoder.norm_out.bias": "conv_norm_out.bias",
267
+ "first_stage_model.encoder.norm_out.weight": "conv_norm_out.weight",
268
+ "first_stage_model.quant_conv.bias": "quant_conv.bias",
269
+ "first_stage_model.quant_conv.weight": "quant_conv.weight",
270
+ }
271
+ state_dict_ = {}
272
+ for name in state_dict:
273
+ if name in rename_dict:
274
+ param = state_dict[name]
275
+ if "transformer_blocks" in rename_dict[name]:
276
+ param = param.squeeze()
277
+ state_dict_[rename_dict[name]] = param
278
+ return state_dict_
diffsynth/models/sdxl_ipadapter.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .svd_image_encoder import SVDImageEncoder
2
+ from transformers import CLIPImageProcessor
3
+ import torch
4
+
5
+
6
+ class IpAdapterXLCLIPImageEmbedder(SVDImageEncoder):
7
+ def __init__(self):
8
+ super().__init__(embed_dim=1664, encoder_intermediate_size=8192, projection_dim=1280, num_encoder_layers=48, num_heads=16, head_dim=104)
9
+ self.image_processor = CLIPImageProcessor()
10
+
11
+ def forward(self, image):
12
+ pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values
13
+ pixel_values = pixel_values.to(device=self.embeddings.class_embedding.device, dtype=self.embeddings.class_embedding.dtype)
14
+ return super().forward(pixel_values)
15
+
16
+
17
+ class IpAdapterImageProjModel(torch.nn.Module):
18
+ def __init__(self, cross_attention_dim=2048, clip_embeddings_dim=1280, clip_extra_context_tokens=4):
19
+ super().__init__()
20
+ self.cross_attention_dim = cross_attention_dim
21
+ self.clip_extra_context_tokens = clip_extra_context_tokens
22
+ self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
23
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
24
+
25
+ def forward(self, image_embeds):
26
+ clip_extra_context_tokens = self.proj(image_embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
27
+ clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
28
+ return clip_extra_context_tokens
29
+
30
+
31
+ class IpAdapterModule(torch.nn.Module):
32
+ def __init__(self, input_dim, output_dim):
33
+ super().__init__()
34
+ self.to_k_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
35
+ self.to_v_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
36
+
37
+ def forward(self, hidden_states):
38
+ ip_k = self.to_k_ip(hidden_states)
39
+ ip_v = self.to_v_ip(hidden_states)
40
+ return ip_k, ip_v
41
+
42
+
43
+ class SDXLIpAdapter(torch.nn.Module):
44
+ def __init__(self):
45
+ super().__init__()
46
+ shape_list = [(2048, 640)] * 4 + [(2048, 1280)] * 50 + [(2048, 640)] * 6 + [(2048, 1280)] * 10
47
+ self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(*shape) for shape in shape_list])
48
+ self.image_proj = IpAdapterImageProjModel()
49
+ self.set_full_adapter()
50
+
51
+ def set_full_adapter(self):
52
+ map_list = sum([
53
+ [(7, i) for i in range(2)],
54
+ [(10, i) for i in range(2)],
55
+ [(15, i) for i in range(10)],
56
+ [(18, i) for i in range(10)],
57
+ [(25, i) for i in range(10)],
58
+ [(28, i) for i in range(10)],
59
+ [(31, i) for i in range(10)],
60
+ [(35, i) for i in range(2)],
61
+ [(38, i) for i in range(2)],
62
+ [(41, i) for i in range(2)],
63
+ [(21, i) for i in range(10)],
64
+ ], [])
65
+ self.call_block_id = {i: j for j, i in enumerate(map_list)}
66
+
67
+ def set_less_adapter(self):
68
+ map_list = sum([
69
+ [(7, i) for i in range(2)],
70
+ [(10, i) for i in range(2)],
71
+ [(15, i) for i in range(10)],
72
+ [(18, i) for i in range(10)],
73
+ [(25, i) for i in range(10)],
74
+ [(28, i) for i in range(10)],
75
+ [(31, i) for i in range(10)],
76
+ [(35, i) for i in range(2)],
77
+ [(38, i) for i in range(2)],
78
+ [(41, i) for i in range(2)],
79
+ [(21, i) for i in range(10)],
80
+ ], [])
81
+ self.call_block_id = {i: j for j, i in enumerate(map_list) if j>=34 and j<44}
82
+
83
+ def forward(self, hidden_states, scale=1.0):
84
+ hidden_states = self.image_proj(hidden_states)
85
+ hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1])
86
+ ip_kv_dict = {}
87
+ for (block_id, transformer_id) in self.call_block_id:
88
+ ipadapter_id = self.call_block_id[(block_id, transformer_id)]
89
+ ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states)
90
+ if block_id not in ip_kv_dict:
91
+ ip_kv_dict[block_id] = {}
92
+ ip_kv_dict[block_id][transformer_id] = {
93
+ "ip_k": ip_k,
94
+ "ip_v": ip_v,
95
+ "scale": scale
96
+ }
97
+ return ip_kv_dict
98
+
99
+ def state_dict_converter(self):
100
+ return SDXLIpAdapterStateDictConverter()
101
+
102
+
103
+ class SDXLIpAdapterStateDictConverter:
104
+ def __init__(self):
105
+ pass
106
+
107
+ def from_diffusers(self, state_dict):
108
+ state_dict_ = {}
109
+ for name in state_dict["ip_adapter"]:
110
+ names = name.split(".")
111
+ layer_id = str(int(names[0]) // 2)
112
+ name_ = ".".join(["ipadapter_modules"] + [layer_id] + names[1:])
113
+ state_dict_[name_] = state_dict["ip_adapter"][name]
114
+ for name in state_dict["image_proj"]:
115
+ name_ = "image_proj." + name
116
+ state_dict_[name_] = state_dict["image_proj"][name]
117
+ return state_dict_
118
+
119
+ def from_civitai(self, state_dict):
120
+ return self.from_diffusers(state_dict)
121
+
diffsynth/models/sdxl_motion.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .sd_motion import TemporalBlock
2
+ import torch
3
+
4
+
5
+
6
+ class SDXLMotionModel(torch.nn.Module):
7
+ def __init__(self):
8
+ super().__init__()
9
+ self.motion_modules = torch.nn.ModuleList([
10
+ TemporalBlock(8, 320//8, 320, eps=1e-6),
11
+ TemporalBlock(8, 320//8, 320, eps=1e-6),
12
+
13
+ TemporalBlock(8, 640//8, 640, eps=1e-6),
14
+ TemporalBlock(8, 640//8, 640, eps=1e-6),
15
+
16
+ TemporalBlock(8, 1280//8, 1280, eps=1e-6),
17
+ TemporalBlock(8, 1280//8, 1280, eps=1e-6),
18
+
19
+ TemporalBlock(8, 1280//8, 1280, eps=1e-6),
20
+ TemporalBlock(8, 1280//8, 1280, eps=1e-6),
21
+ TemporalBlock(8, 1280//8, 1280, eps=1e-6),
22
+
23
+ TemporalBlock(8, 640//8, 640, eps=1e-6),
24
+ TemporalBlock(8, 640//8, 640, eps=1e-6),
25
+ TemporalBlock(8, 640//8, 640, eps=1e-6),
26
+
27
+ TemporalBlock(8, 320//8, 320, eps=1e-6),
28
+ TemporalBlock(8, 320//8, 320, eps=1e-6),
29
+ TemporalBlock(8, 320//8, 320, eps=1e-6),
30
+ ])
31
+ self.call_block_id = {
32
+ 0: 0,
33
+ 2: 1,
34
+ 7: 2,
35
+ 10: 3,
36
+ 15: 4,
37
+ 18: 5,
38
+ 25: 6,
39
+ 28: 7,
40
+ 31: 8,
41
+ 35: 9,
42
+ 38: 10,
43
+ 41: 11,
44
+ 44: 12,
45
+ 46: 13,
46
+ 48: 14,
47
+ }
48
+
49
+ def forward(self):
50
+ pass
51
+
52
+ def state_dict_converter(self):
53
+ return SDMotionModelStateDictConverter()
54
+
55
+
56
+ class SDMotionModelStateDictConverter:
57
+ def __init__(self):
58
+ pass
59
+
60
+ def from_diffusers(self, state_dict):
61
+ rename_dict = {
62
+ "norm": "norm",
63
+ "proj_in": "proj_in",
64
+ "transformer_blocks.0.attention_blocks.0.to_q": "transformer_blocks.0.attn1.to_q",
65
+ "transformer_blocks.0.attention_blocks.0.to_k": "transformer_blocks.0.attn1.to_k",
66
+ "transformer_blocks.0.attention_blocks.0.to_v": "transformer_blocks.0.attn1.to_v",
67
+ "transformer_blocks.0.attention_blocks.0.to_out.0": "transformer_blocks.0.attn1.to_out",
68
+ "transformer_blocks.0.attention_blocks.0.pos_encoder": "transformer_blocks.0.pe1",
69
+ "transformer_blocks.0.attention_blocks.1.to_q": "transformer_blocks.0.attn2.to_q",
70
+ "transformer_blocks.0.attention_blocks.1.to_k": "transformer_blocks.0.attn2.to_k",
71
+ "transformer_blocks.0.attention_blocks.1.to_v": "transformer_blocks.0.attn2.to_v",
72
+ "transformer_blocks.0.attention_blocks.1.to_out.0": "transformer_blocks.0.attn2.to_out",
73
+ "transformer_blocks.0.attention_blocks.1.pos_encoder": "transformer_blocks.0.pe2",
74
+ "transformer_blocks.0.norms.0": "transformer_blocks.0.norm1",
75
+ "transformer_blocks.0.norms.1": "transformer_blocks.0.norm2",
76
+ "transformer_blocks.0.ff.net.0.proj": "transformer_blocks.0.act_fn.proj",
77
+ "transformer_blocks.0.ff.net.2": "transformer_blocks.0.ff",
78
+ "transformer_blocks.0.ff_norm": "transformer_blocks.0.norm3",
79
+ "proj_out": "proj_out",
80
+ }
81
+ name_list = sorted([i for i in state_dict if i.startswith("down_blocks.")])
82
+ name_list += sorted([i for i in state_dict if i.startswith("mid_block.")])
83
+ name_list += sorted([i for i in state_dict if i.startswith("up_blocks.")])
84
+ state_dict_ = {}
85
+ last_prefix, module_id = "", -1
86
+ for name in name_list:
87
+ names = name.split(".")
88
+ prefix_index = names.index("temporal_transformer") + 1
89
+ prefix = ".".join(names[:prefix_index])
90
+ if prefix != last_prefix:
91
+ last_prefix = prefix
92
+ module_id += 1
93
+ middle_name = ".".join(names[prefix_index:-1])
94
+ suffix = names[-1]
95
+ if "pos_encoder" in names:
96
+ rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name]])
97
+ else:
98
+ rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name], suffix])
99
+ state_dict_[rename] = state_dict[name]
100
+ return state_dict_
101
+
102
+ def from_civitai(self, state_dict):
103
+ return self.from_diffusers(state_dict)
diffsynth/models/sdxl_text_encoder.py ADDED
@@ -0,0 +1,757 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_text_encoder import CLIPEncoderLayer
3
+
4
+
5
+ class SDXLTextEncoder(torch.nn.Module):
6
+ def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=11, encoder_intermediate_size=3072):
7
+ super().__init__()
8
+
9
+ # token_embedding
10
+ self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
11
+
12
+ # position_embeds (This is a fixed tensor)
13
+ self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
14
+
15
+ # encoders
16
+ self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
17
+
18
+ # attn_mask
19
+ self.attn_mask = self.attention_mask(max_position_embeddings)
20
+
21
+ # The text encoder is different to that in Stable Diffusion 1.x.
22
+ # It does not include final_layer_norm.
23
+
24
+ def attention_mask(self, length):
25
+ mask = torch.empty(length, length)
26
+ mask.fill_(float("-inf"))
27
+ mask.triu_(1)
28
+ return mask
29
+
30
+ def forward(self, input_ids, clip_skip=1):
31
+ embeds = self.token_embedding(input_ids) + self.position_embeds
32
+ attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
33
+ for encoder_id, encoder in enumerate(self.encoders):
34
+ embeds = encoder(embeds, attn_mask=attn_mask)
35
+ if encoder_id + clip_skip == len(self.encoders):
36
+ break
37
+ return embeds
38
+
39
+ def state_dict_converter(self):
40
+ return SDXLTextEncoderStateDictConverter()
41
+
42
+
43
+ class SDXLTextEncoder2(torch.nn.Module):
44
+ def __init__(self, embed_dim=1280, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=32, encoder_intermediate_size=5120):
45
+ super().__init__()
46
+
47
+ # token_embedding
48
+ self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
49
+
50
+ # position_embeds (This is a fixed tensor)
51
+ self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
52
+
53
+ # encoders
54
+ self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=20, head_dim=64, use_quick_gelu=False) for _ in range(num_encoder_layers)])
55
+
56
+ # attn_mask
57
+ self.attn_mask = self.attention_mask(max_position_embeddings)
58
+
59
+ # final_layer_norm
60
+ self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
61
+
62
+ # text_projection
63
+ self.text_projection = torch.nn.Linear(embed_dim, embed_dim, bias=False)
64
+
65
+ def attention_mask(self, length):
66
+ mask = torch.empty(length, length)
67
+ mask.fill_(float("-inf"))
68
+ mask.triu_(1)
69
+ return mask
70
+
71
+ def forward(self, input_ids, clip_skip=2):
72
+ embeds = self.token_embedding(input_ids) + self.position_embeds
73
+ attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
74
+ for encoder_id, encoder in enumerate(self.encoders):
75
+ embeds = encoder(embeds, attn_mask=attn_mask)
76
+ if encoder_id + clip_skip == len(self.encoders):
77
+ hidden_states = embeds
78
+ embeds = self.final_layer_norm(embeds)
79
+ pooled_embeds = embeds[torch.arange(embeds.shape[0]), input_ids.to(dtype=torch.int).argmax(dim=-1)]
80
+ pooled_embeds = self.text_projection(pooled_embeds)
81
+ return pooled_embeds, hidden_states
82
+
83
+ def state_dict_converter(self):
84
+ return SDXLTextEncoder2StateDictConverter()
85
+
86
+
87
+ class SDXLTextEncoderStateDictConverter:
88
+ def __init__(self):
89
+ pass
90
+
91
+ def from_diffusers(self, state_dict):
92
+ rename_dict = {
93
+ "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
94
+ "text_model.embeddings.position_embedding.weight": "position_embeds",
95
+ "text_model.final_layer_norm.weight": "final_layer_norm.weight",
96
+ "text_model.final_layer_norm.bias": "final_layer_norm.bias"
97
+ }
98
+ attn_rename_dict = {
99
+ "self_attn.q_proj": "attn.to_q",
100
+ "self_attn.k_proj": "attn.to_k",
101
+ "self_attn.v_proj": "attn.to_v",
102
+ "self_attn.out_proj": "attn.to_out",
103
+ "layer_norm1": "layer_norm1",
104
+ "layer_norm2": "layer_norm2",
105
+ "mlp.fc1": "fc1",
106
+ "mlp.fc2": "fc2",
107
+ }
108
+ state_dict_ = {}
109
+ for name in state_dict:
110
+ if name in rename_dict:
111
+ param = state_dict[name]
112
+ if name == "text_model.embeddings.position_embedding.weight":
113
+ param = param.reshape((1, param.shape[0], param.shape[1]))
114
+ state_dict_[rename_dict[name]] = param
115
+ elif name.startswith("text_model.encoder.layers."):
116
+ param = state_dict[name]
117
+ names = name.split(".")
118
+ layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
119
+ name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
120
+ state_dict_[name_] = param
121
+ return state_dict_
122
+
123
+ def from_civitai(self, state_dict):
124
+ rename_dict = {
125
+ "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight": "position_embeds",
126
+ "conditioner.embedders.0.transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
127
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm1.bias": "encoders.0.layer_norm1.bias",
128
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm1.weight": "encoders.0.layer_norm1.weight",
129
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm2.bias": "encoders.0.layer_norm2.bias",
130
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm2.weight": "encoders.0.layer_norm2.weight",
131
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc1.bias": "encoders.0.fc1.bias",
132
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc1.weight": "encoders.0.fc1.weight",
133
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc2.bias": "encoders.0.fc2.bias",
134
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc2.weight": "encoders.0.fc2.weight",
135
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias": "encoders.0.attn.to_k.bias",
136
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight": "encoders.0.attn.to_k.weight",
137
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias": "encoders.0.attn.to_out.bias",
138
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight": "encoders.0.attn.to_out.weight",
139
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias": "encoders.0.attn.to_q.bias",
140
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight": "encoders.0.attn.to_q.weight",
141
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias": "encoders.0.attn.to_v.bias",
142
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight": "encoders.0.attn.to_v.weight",
143
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm1.bias": "encoders.1.layer_norm1.bias",
144
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm1.weight": "encoders.1.layer_norm1.weight",
145
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm2.bias": "encoders.1.layer_norm2.bias",
146
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm2.weight": "encoders.1.layer_norm2.weight",
147
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc1.bias": "encoders.1.fc1.bias",
148
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc1.weight": "encoders.1.fc1.weight",
149
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc2.bias": "encoders.1.fc2.bias",
150
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc2.weight": "encoders.1.fc2.weight",
151
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias": "encoders.1.attn.to_k.bias",
152
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight": "encoders.1.attn.to_k.weight",
153
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias": "encoders.1.attn.to_out.bias",
154
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight": "encoders.1.attn.to_out.weight",
155
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias": "encoders.1.attn.to_q.bias",
156
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight": "encoders.1.attn.to_q.weight",
157
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias": "encoders.1.attn.to_v.bias",
158
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight": "encoders.1.attn.to_v.weight",
159
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm1.bias": "encoders.10.layer_norm1.bias",
160
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm1.weight": "encoders.10.layer_norm1.weight",
161
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm2.bias": "encoders.10.layer_norm2.bias",
162
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm2.weight": "encoders.10.layer_norm2.weight",
163
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc1.bias": "encoders.10.fc1.bias",
164
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc1.weight": "encoders.10.fc1.weight",
165
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc2.bias": "encoders.10.fc2.bias",
166
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc2.weight": "encoders.10.fc2.weight",
167
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias": "encoders.10.attn.to_k.bias",
168
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight": "encoders.10.attn.to_k.weight",
169
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias": "encoders.10.attn.to_out.bias",
170
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight": "encoders.10.attn.to_out.weight",
171
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias": "encoders.10.attn.to_q.bias",
172
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight": "encoders.10.attn.to_q.weight",
173
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias": "encoders.10.attn.to_v.bias",
174
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight": "encoders.10.attn.to_v.weight",
175
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm1.bias": "encoders.2.layer_norm1.bias",
176
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm1.weight": "encoders.2.layer_norm1.weight",
177
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm2.bias": "encoders.2.layer_norm2.bias",
178
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm2.weight": "encoders.2.layer_norm2.weight",
179
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc1.bias": "encoders.2.fc1.bias",
180
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc1.weight": "encoders.2.fc1.weight",
181
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc2.bias": "encoders.2.fc2.bias",
182
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc2.weight": "encoders.2.fc2.weight",
183
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias": "encoders.2.attn.to_k.bias",
184
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight": "encoders.2.attn.to_k.weight",
185
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias": "encoders.2.attn.to_out.bias",
186
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight": "encoders.2.attn.to_out.weight",
187
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias": "encoders.2.attn.to_q.bias",
188
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight": "encoders.2.attn.to_q.weight",
189
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias": "encoders.2.attn.to_v.bias",
190
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight": "encoders.2.attn.to_v.weight",
191
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm1.bias": "encoders.3.layer_norm1.bias",
192
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm1.weight": "encoders.3.layer_norm1.weight",
193
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm2.bias": "encoders.3.layer_norm2.bias",
194
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm2.weight": "encoders.3.layer_norm2.weight",
195
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc1.bias": "encoders.3.fc1.bias",
196
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc1.weight": "encoders.3.fc1.weight",
197
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc2.bias": "encoders.3.fc2.bias",
198
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc2.weight": "encoders.3.fc2.weight",
199
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias": "encoders.3.attn.to_k.bias",
200
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight": "encoders.3.attn.to_k.weight",
201
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias": "encoders.3.attn.to_out.bias",
202
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight": "encoders.3.attn.to_out.weight",
203
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias": "encoders.3.attn.to_q.bias",
204
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight": "encoders.3.attn.to_q.weight",
205
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias": "encoders.3.attn.to_v.bias",
206
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight": "encoders.3.attn.to_v.weight",
207
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm1.bias": "encoders.4.layer_norm1.bias",
208
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm1.weight": "encoders.4.layer_norm1.weight",
209
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm2.bias": "encoders.4.layer_norm2.bias",
210
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm2.weight": "encoders.4.layer_norm2.weight",
211
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc1.bias": "encoders.4.fc1.bias",
212
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc1.weight": "encoders.4.fc1.weight",
213
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc2.bias": "encoders.4.fc2.bias",
214
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc2.weight": "encoders.4.fc2.weight",
215
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias": "encoders.4.attn.to_k.bias",
216
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight": "encoders.4.attn.to_k.weight",
217
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias": "encoders.4.attn.to_out.bias",
218
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight": "encoders.4.attn.to_out.weight",
219
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias": "encoders.4.attn.to_q.bias",
220
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight": "encoders.4.attn.to_q.weight",
221
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias": "encoders.4.attn.to_v.bias",
222
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight": "encoders.4.attn.to_v.weight",
223
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm1.bias": "encoders.5.layer_norm1.bias",
224
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm1.weight": "encoders.5.layer_norm1.weight",
225
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm2.bias": "encoders.5.layer_norm2.bias",
226
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm2.weight": "encoders.5.layer_norm2.weight",
227
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc1.bias": "encoders.5.fc1.bias",
228
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc1.weight": "encoders.5.fc1.weight",
229
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc2.bias": "encoders.5.fc2.bias",
230
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc2.weight": "encoders.5.fc2.weight",
231
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias": "encoders.5.attn.to_k.bias",
232
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight": "encoders.5.attn.to_k.weight",
233
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias": "encoders.5.attn.to_out.bias",
234
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight": "encoders.5.attn.to_out.weight",
235
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias": "encoders.5.attn.to_q.bias",
236
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight": "encoders.5.attn.to_q.weight",
237
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias": "encoders.5.attn.to_v.bias",
238
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight": "encoders.5.attn.to_v.weight",
239
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm1.bias": "encoders.6.layer_norm1.bias",
240
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm1.weight": "encoders.6.layer_norm1.weight",
241
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm2.bias": "encoders.6.layer_norm2.bias",
242
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm2.weight": "encoders.6.layer_norm2.weight",
243
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc1.bias": "encoders.6.fc1.bias",
244
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc1.weight": "encoders.6.fc1.weight",
245
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc2.bias": "encoders.6.fc2.bias",
246
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc2.weight": "encoders.6.fc2.weight",
247
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias": "encoders.6.attn.to_k.bias",
248
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight": "encoders.6.attn.to_k.weight",
249
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias": "encoders.6.attn.to_out.bias",
250
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight": "encoders.6.attn.to_out.weight",
251
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias": "encoders.6.attn.to_q.bias",
252
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight": "encoders.6.attn.to_q.weight",
253
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias": "encoders.6.attn.to_v.bias",
254
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight": "encoders.6.attn.to_v.weight",
255
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm1.bias": "encoders.7.layer_norm1.bias",
256
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm1.weight": "encoders.7.layer_norm1.weight",
257
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm2.bias": "encoders.7.layer_norm2.bias",
258
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm2.weight": "encoders.7.layer_norm2.weight",
259
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc1.bias": "encoders.7.fc1.bias",
260
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc1.weight": "encoders.7.fc1.weight",
261
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc2.bias": "encoders.7.fc2.bias",
262
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc2.weight": "encoders.7.fc2.weight",
263
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias": "encoders.7.attn.to_k.bias",
264
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight": "encoders.7.attn.to_k.weight",
265
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias": "encoders.7.attn.to_out.bias",
266
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight": "encoders.7.attn.to_out.weight",
267
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias": "encoders.7.attn.to_q.bias",
268
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight": "encoders.7.attn.to_q.weight",
269
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias": "encoders.7.attn.to_v.bias",
270
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight": "encoders.7.attn.to_v.weight",
271
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm1.bias": "encoders.8.layer_norm1.bias",
272
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm1.weight": "encoders.8.layer_norm1.weight",
273
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm2.bias": "encoders.8.layer_norm2.bias",
274
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm2.weight": "encoders.8.layer_norm2.weight",
275
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc1.bias": "encoders.8.fc1.bias",
276
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc1.weight": "encoders.8.fc1.weight",
277
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc2.bias": "encoders.8.fc2.bias",
278
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc2.weight": "encoders.8.fc2.weight",
279
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias": "encoders.8.attn.to_k.bias",
280
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight": "encoders.8.attn.to_k.weight",
281
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias": "encoders.8.attn.to_out.bias",
282
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight": "encoders.8.attn.to_out.weight",
283
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias": "encoders.8.attn.to_q.bias",
284
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight": "encoders.8.attn.to_q.weight",
285
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias": "encoders.8.attn.to_v.bias",
286
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight": "encoders.8.attn.to_v.weight",
287
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm1.bias": "encoders.9.layer_norm1.bias",
288
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm1.weight": "encoders.9.layer_norm1.weight",
289
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm2.bias": "encoders.9.layer_norm2.bias",
290
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm2.weight": "encoders.9.layer_norm2.weight",
291
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc1.bias": "encoders.9.fc1.bias",
292
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc1.weight": "encoders.9.fc1.weight",
293
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc2.bias": "encoders.9.fc2.bias",
294
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc2.weight": "encoders.9.fc2.weight",
295
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias": "encoders.9.attn.to_k.bias",
296
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight": "encoders.9.attn.to_k.weight",
297
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias": "encoders.9.attn.to_out.bias",
298
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight": "encoders.9.attn.to_out.weight",
299
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias": "encoders.9.attn.to_q.bias",
300
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight": "encoders.9.attn.to_q.weight",
301
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias": "encoders.9.attn.to_v.bias",
302
+ "conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight": "encoders.9.attn.to_v.weight",
303
+ }
304
+ state_dict_ = {}
305
+ for name in state_dict:
306
+ if name in rename_dict:
307
+ param = state_dict[name]
308
+ if name == "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight":
309
+ param = param.reshape((1, param.shape[0], param.shape[1]))
310
+ state_dict_[rename_dict[name]] = param
311
+ return state_dict_
312
+
313
+
314
+ class SDXLTextEncoder2StateDictConverter:
315
+ def __init__(self):
316
+ pass
317
+
318
+ def from_diffusers(self, state_dict):
319
+ rename_dict = {
320
+ "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
321
+ "text_model.embeddings.position_embedding.weight": "position_embeds",
322
+ "text_model.final_layer_norm.weight": "final_layer_norm.weight",
323
+ "text_model.final_layer_norm.bias": "final_layer_norm.bias",
324
+ "text_projection.weight": "text_projection.weight"
325
+ }
326
+ attn_rename_dict = {
327
+ "self_attn.q_proj": "attn.to_q",
328
+ "self_attn.k_proj": "attn.to_k",
329
+ "self_attn.v_proj": "attn.to_v",
330
+ "self_attn.out_proj": "attn.to_out",
331
+ "layer_norm1": "layer_norm1",
332
+ "layer_norm2": "layer_norm2",
333
+ "mlp.fc1": "fc1",
334
+ "mlp.fc2": "fc2",
335
+ }
336
+ state_dict_ = {}
337
+ for name in state_dict:
338
+ if name in rename_dict:
339
+ param = state_dict[name]
340
+ if name == "text_model.embeddings.position_embedding.weight":
341
+ param = param.reshape((1, param.shape[0], param.shape[1]))
342
+ state_dict_[rename_dict[name]] = param
343
+ elif name.startswith("text_model.encoder.layers."):
344
+ param = state_dict[name]
345
+ names = name.split(".")
346
+ layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
347
+ name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
348
+ state_dict_[name_] = param
349
+ return state_dict_
350
+
351
+ def from_civitai(self, state_dict):
352
+ rename_dict = {
353
+ "conditioner.embedders.1.model.ln_final.bias": "final_layer_norm.bias",
354
+ "conditioner.embedders.1.model.ln_final.weight": "final_layer_norm.weight",
355
+ "conditioner.embedders.1.model.positional_embedding": "position_embeds",
356
+ "conditioner.embedders.1.model.token_embedding.weight": "token_embedding.weight",
357
+ "conditioner.embedders.1.model.transformer.resblocks.0.attn.in_proj_bias": ['encoders.0.attn.to_q.bias', 'encoders.0.attn.to_k.bias', 'encoders.0.attn.to_v.bias'],
358
+ "conditioner.embedders.1.model.transformer.resblocks.0.attn.in_proj_weight": ['encoders.0.attn.to_q.weight', 'encoders.0.attn.to_k.weight', 'encoders.0.attn.to_v.weight'],
359
+ "conditioner.embedders.1.model.transformer.resblocks.0.attn.out_proj.bias": "encoders.0.attn.to_out.bias",
360
+ "conditioner.embedders.1.model.transformer.resblocks.0.attn.out_proj.weight": "encoders.0.attn.to_out.weight",
361
+ "conditioner.embedders.1.model.transformer.resblocks.0.ln_1.bias": "encoders.0.layer_norm1.bias",
362
+ "conditioner.embedders.1.model.transformer.resblocks.0.ln_1.weight": "encoders.0.layer_norm1.weight",
363
+ "conditioner.embedders.1.model.transformer.resblocks.0.ln_2.bias": "encoders.0.layer_norm2.bias",
364
+ "conditioner.embedders.1.model.transformer.resblocks.0.ln_2.weight": "encoders.0.layer_norm2.weight",
365
+ "conditioner.embedders.1.model.transformer.resblocks.0.mlp.c_fc.bias": "encoders.0.fc1.bias",
366
+ "conditioner.embedders.1.model.transformer.resblocks.0.mlp.c_fc.weight": "encoders.0.fc1.weight",
367
+ "conditioner.embedders.1.model.transformer.resblocks.0.mlp.c_proj.bias": "encoders.0.fc2.bias",
368
+ "conditioner.embedders.1.model.transformer.resblocks.0.mlp.c_proj.weight": "encoders.0.fc2.weight",
369
+ "conditioner.embedders.1.model.transformer.resblocks.1.attn.in_proj_bias": ['encoders.1.attn.to_q.bias', 'encoders.1.attn.to_k.bias', 'encoders.1.attn.to_v.bias'],
370
+ "conditioner.embedders.1.model.transformer.resblocks.1.attn.in_proj_weight": ['encoders.1.attn.to_q.weight', 'encoders.1.attn.to_k.weight', 'encoders.1.attn.to_v.weight'],
371
+ "conditioner.embedders.1.model.transformer.resblocks.1.attn.out_proj.bias": "encoders.1.attn.to_out.bias",
372
+ "conditioner.embedders.1.model.transformer.resblocks.1.attn.out_proj.weight": "encoders.1.attn.to_out.weight",
373
+ "conditioner.embedders.1.model.transformer.resblocks.1.ln_1.bias": "encoders.1.layer_norm1.bias",
374
+ "conditioner.embedders.1.model.transformer.resblocks.1.ln_1.weight": "encoders.1.layer_norm1.weight",
375
+ "conditioner.embedders.1.model.transformer.resblocks.1.ln_2.bias": "encoders.1.layer_norm2.bias",
376
+ "conditioner.embedders.1.model.transformer.resblocks.1.ln_2.weight": "encoders.1.layer_norm2.weight",
377
+ "conditioner.embedders.1.model.transformer.resblocks.1.mlp.c_fc.bias": "encoders.1.fc1.bias",
378
+ "conditioner.embedders.1.model.transformer.resblocks.1.mlp.c_fc.weight": "encoders.1.fc1.weight",
379
+ "conditioner.embedders.1.model.transformer.resblocks.1.mlp.c_proj.bias": "encoders.1.fc2.bias",
380
+ "conditioner.embedders.1.model.transformer.resblocks.1.mlp.c_proj.weight": "encoders.1.fc2.weight",
381
+ "conditioner.embedders.1.model.transformer.resblocks.10.attn.in_proj_bias": ['encoders.10.attn.to_q.bias', 'encoders.10.attn.to_k.bias', 'encoders.10.attn.to_v.bias'],
382
+ "conditioner.embedders.1.model.transformer.resblocks.10.attn.in_proj_weight": ['encoders.10.attn.to_q.weight', 'encoders.10.attn.to_k.weight', 'encoders.10.attn.to_v.weight'],
383
+ "conditioner.embedders.1.model.transformer.resblocks.10.attn.out_proj.bias": "encoders.10.attn.to_out.bias",
384
+ "conditioner.embedders.1.model.transformer.resblocks.10.attn.out_proj.weight": "encoders.10.attn.to_out.weight",
385
+ "conditioner.embedders.1.model.transformer.resblocks.10.ln_1.bias": "encoders.10.layer_norm1.bias",
386
+ "conditioner.embedders.1.model.transformer.resblocks.10.ln_1.weight": "encoders.10.layer_norm1.weight",
387
+ "conditioner.embedders.1.model.transformer.resblocks.10.ln_2.bias": "encoders.10.layer_norm2.bias",
388
+ "conditioner.embedders.1.model.transformer.resblocks.10.ln_2.weight": "encoders.10.layer_norm2.weight",
389
+ "conditioner.embedders.1.model.transformer.resblocks.10.mlp.c_fc.bias": "encoders.10.fc1.bias",
390
+ "conditioner.embedders.1.model.transformer.resblocks.10.mlp.c_fc.weight": "encoders.10.fc1.weight",
391
+ "conditioner.embedders.1.model.transformer.resblocks.10.mlp.c_proj.bias": "encoders.10.fc2.bias",
392
+ "conditioner.embedders.1.model.transformer.resblocks.10.mlp.c_proj.weight": "encoders.10.fc2.weight",
393
+ "conditioner.embedders.1.model.transformer.resblocks.11.attn.in_proj_bias": ['encoders.11.attn.to_q.bias', 'encoders.11.attn.to_k.bias', 'encoders.11.attn.to_v.bias'],
394
+ "conditioner.embedders.1.model.transformer.resblocks.11.attn.in_proj_weight": ['encoders.11.attn.to_q.weight', 'encoders.11.attn.to_k.weight', 'encoders.11.attn.to_v.weight'],
395
+ "conditioner.embedders.1.model.transformer.resblocks.11.attn.out_proj.bias": "encoders.11.attn.to_out.bias",
396
+ "conditioner.embedders.1.model.transformer.resblocks.11.attn.out_proj.weight": "encoders.11.attn.to_out.weight",
397
+ "conditioner.embedders.1.model.transformer.resblocks.11.ln_1.bias": "encoders.11.layer_norm1.bias",
398
+ "conditioner.embedders.1.model.transformer.resblocks.11.ln_1.weight": "encoders.11.layer_norm1.weight",
399
+ "conditioner.embedders.1.model.transformer.resblocks.11.ln_2.bias": "encoders.11.layer_norm2.bias",
400
+ "conditioner.embedders.1.model.transformer.resblocks.11.ln_2.weight": "encoders.11.layer_norm2.weight",
401
+ "conditioner.embedders.1.model.transformer.resblocks.11.mlp.c_fc.bias": "encoders.11.fc1.bias",
402
+ "conditioner.embedders.1.model.transformer.resblocks.11.mlp.c_fc.weight": "encoders.11.fc1.weight",
403
+ "conditioner.embedders.1.model.transformer.resblocks.11.mlp.c_proj.bias": "encoders.11.fc2.bias",
404
+ "conditioner.embedders.1.model.transformer.resblocks.11.mlp.c_proj.weight": "encoders.11.fc2.weight",
405
+ "conditioner.embedders.1.model.transformer.resblocks.12.attn.in_proj_bias": ['encoders.12.attn.to_q.bias', 'encoders.12.attn.to_k.bias', 'encoders.12.attn.to_v.bias'],
406
+ "conditioner.embedders.1.model.transformer.resblocks.12.attn.in_proj_weight": ['encoders.12.attn.to_q.weight', 'encoders.12.attn.to_k.weight', 'encoders.12.attn.to_v.weight'],
407
+ "conditioner.embedders.1.model.transformer.resblocks.12.attn.out_proj.bias": "encoders.12.attn.to_out.bias",
408
+ "conditioner.embedders.1.model.transformer.resblocks.12.attn.out_proj.weight": "encoders.12.attn.to_out.weight",
409
+ "conditioner.embedders.1.model.transformer.resblocks.12.ln_1.bias": "encoders.12.layer_norm1.bias",
410
+ "conditioner.embedders.1.model.transformer.resblocks.12.ln_1.weight": "encoders.12.layer_norm1.weight",
411
+ "conditioner.embedders.1.model.transformer.resblocks.12.ln_2.bias": "encoders.12.layer_norm2.bias",
412
+ "conditioner.embedders.1.model.transformer.resblocks.12.ln_2.weight": "encoders.12.layer_norm2.weight",
413
+ "conditioner.embedders.1.model.transformer.resblocks.12.mlp.c_fc.bias": "encoders.12.fc1.bias",
414
+ "conditioner.embedders.1.model.transformer.resblocks.12.mlp.c_fc.weight": "encoders.12.fc1.weight",
415
+ "conditioner.embedders.1.model.transformer.resblocks.12.mlp.c_proj.bias": "encoders.12.fc2.bias",
416
+ "conditioner.embedders.1.model.transformer.resblocks.12.mlp.c_proj.weight": "encoders.12.fc2.weight",
417
+ "conditioner.embedders.1.model.transformer.resblocks.13.attn.in_proj_bias": ['encoders.13.attn.to_q.bias', 'encoders.13.attn.to_k.bias', 'encoders.13.attn.to_v.bias'],
418
+ "conditioner.embedders.1.model.transformer.resblocks.13.attn.in_proj_weight": ['encoders.13.attn.to_q.weight', 'encoders.13.attn.to_k.weight', 'encoders.13.attn.to_v.weight'],
419
+ "conditioner.embedders.1.model.transformer.resblocks.13.attn.out_proj.bias": "encoders.13.attn.to_out.bias",
420
+ "conditioner.embedders.1.model.transformer.resblocks.13.attn.out_proj.weight": "encoders.13.attn.to_out.weight",
421
+ "conditioner.embedders.1.model.transformer.resblocks.13.ln_1.bias": "encoders.13.layer_norm1.bias",
422
+ "conditioner.embedders.1.model.transformer.resblocks.13.ln_1.weight": "encoders.13.layer_norm1.weight",
423
+ "conditioner.embedders.1.model.transformer.resblocks.13.ln_2.bias": "encoders.13.layer_norm2.bias",
424
+ "conditioner.embedders.1.model.transformer.resblocks.13.ln_2.weight": "encoders.13.layer_norm2.weight",
425
+ "conditioner.embedders.1.model.transformer.resblocks.13.mlp.c_fc.bias": "encoders.13.fc1.bias",
426
+ "conditioner.embedders.1.model.transformer.resblocks.13.mlp.c_fc.weight": "encoders.13.fc1.weight",
427
+ "conditioner.embedders.1.model.transformer.resblocks.13.mlp.c_proj.bias": "encoders.13.fc2.bias",
428
+ "conditioner.embedders.1.model.transformer.resblocks.13.mlp.c_proj.weight": "encoders.13.fc2.weight",
429
+ "conditioner.embedders.1.model.transformer.resblocks.14.attn.in_proj_bias": ['encoders.14.attn.to_q.bias', 'encoders.14.attn.to_k.bias', 'encoders.14.attn.to_v.bias'],
430
+ "conditioner.embedders.1.model.transformer.resblocks.14.attn.in_proj_weight": ['encoders.14.attn.to_q.weight', 'encoders.14.attn.to_k.weight', 'encoders.14.attn.to_v.weight'],
431
+ "conditioner.embedders.1.model.transformer.resblocks.14.attn.out_proj.bias": "encoders.14.attn.to_out.bias",
432
+ "conditioner.embedders.1.model.transformer.resblocks.14.attn.out_proj.weight": "encoders.14.attn.to_out.weight",
433
+ "conditioner.embedders.1.model.transformer.resblocks.14.ln_1.bias": "encoders.14.layer_norm1.bias",
434
+ "conditioner.embedders.1.model.transformer.resblocks.14.ln_1.weight": "encoders.14.layer_norm1.weight",
435
+ "conditioner.embedders.1.model.transformer.resblocks.14.ln_2.bias": "encoders.14.layer_norm2.bias",
436
+ "conditioner.embedders.1.model.transformer.resblocks.14.ln_2.weight": "encoders.14.layer_norm2.weight",
437
+ "conditioner.embedders.1.model.transformer.resblocks.14.mlp.c_fc.bias": "encoders.14.fc1.bias",
438
+ "conditioner.embedders.1.model.transformer.resblocks.14.mlp.c_fc.weight": "encoders.14.fc1.weight",
439
+ "conditioner.embedders.1.model.transformer.resblocks.14.mlp.c_proj.bias": "encoders.14.fc2.bias",
440
+ "conditioner.embedders.1.model.transformer.resblocks.14.mlp.c_proj.weight": "encoders.14.fc2.weight",
441
+ "conditioner.embedders.1.model.transformer.resblocks.15.attn.in_proj_bias": ['encoders.15.attn.to_q.bias', 'encoders.15.attn.to_k.bias', 'encoders.15.attn.to_v.bias'],
442
+ "conditioner.embedders.1.model.transformer.resblocks.15.attn.in_proj_weight": ['encoders.15.attn.to_q.weight', 'encoders.15.attn.to_k.weight', 'encoders.15.attn.to_v.weight'],
443
+ "conditioner.embedders.1.model.transformer.resblocks.15.attn.out_proj.bias": "encoders.15.attn.to_out.bias",
444
+ "conditioner.embedders.1.model.transformer.resblocks.15.attn.out_proj.weight": "encoders.15.attn.to_out.weight",
445
+ "conditioner.embedders.1.model.transformer.resblocks.15.ln_1.bias": "encoders.15.layer_norm1.bias",
446
+ "conditioner.embedders.1.model.transformer.resblocks.15.ln_1.weight": "encoders.15.layer_norm1.weight",
447
+ "conditioner.embedders.1.model.transformer.resblocks.15.ln_2.bias": "encoders.15.layer_norm2.bias",
448
+ "conditioner.embedders.1.model.transformer.resblocks.15.ln_2.weight": "encoders.15.layer_norm2.weight",
449
+ "conditioner.embedders.1.model.transformer.resblocks.15.mlp.c_fc.bias": "encoders.15.fc1.bias",
450
+ "conditioner.embedders.1.model.transformer.resblocks.15.mlp.c_fc.weight": "encoders.15.fc1.weight",
451
+ "conditioner.embedders.1.model.transformer.resblocks.15.mlp.c_proj.bias": "encoders.15.fc2.bias",
452
+ "conditioner.embedders.1.model.transformer.resblocks.15.mlp.c_proj.weight": "encoders.15.fc2.weight",
453
+ "conditioner.embedders.1.model.transformer.resblocks.16.attn.in_proj_bias": ['encoders.16.attn.to_q.bias', 'encoders.16.attn.to_k.bias', 'encoders.16.attn.to_v.bias'],
454
+ "conditioner.embedders.1.model.transformer.resblocks.16.attn.in_proj_weight": ['encoders.16.attn.to_q.weight', 'encoders.16.attn.to_k.weight', 'encoders.16.attn.to_v.weight'],
455
+ "conditioner.embedders.1.model.transformer.resblocks.16.attn.out_proj.bias": "encoders.16.attn.to_out.bias",
456
+ "conditioner.embedders.1.model.transformer.resblocks.16.attn.out_proj.weight": "encoders.16.attn.to_out.weight",
457
+ "conditioner.embedders.1.model.transformer.resblocks.16.ln_1.bias": "encoders.16.layer_norm1.bias",
458
+ "conditioner.embedders.1.model.transformer.resblocks.16.ln_1.weight": "encoders.16.layer_norm1.weight",
459
+ "conditioner.embedders.1.model.transformer.resblocks.16.ln_2.bias": "encoders.16.layer_norm2.bias",
460
+ "conditioner.embedders.1.model.transformer.resblocks.16.ln_2.weight": "encoders.16.layer_norm2.weight",
461
+ "conditioner.embedders.1.model.transformer.resblocks.16.mlp.c_fc.bias": "encoders.16.fc1.bias",
462
+ "conditioner.embedders.1.model.transformer.resblocks.16.mlp.c_fc.weight": "encoders.16.fc1.weight",
463
+ "conditioner.embedders.1.model.transformer.resblocks.16.mlp.c_proj.bias": "encoders.16.fc2.bias",
464
+ "conditioner.embedders.1.model.transformer.resblocks.16.mlp.c_proj.weight": "encoders.16.fc2.weight",
465
+ "conditioner.embedders.1.model.transformer.resblocks.17.attn.in_proj_bias": ['encoders.17.attn.to_q.bias', 'encoders.17.attn.to_k.bias', 'encoders.17.attn.to_v.bias'],
466
+ "conditioner.embedders.1.model.transformer.resblocks.17.attn.in_proj_weight": ['encoders.17.attn.to_q.weight', 'encoders.17.attn.to_k.weight', 'encoders.17.attn.to_v.weight'],
467
+ "conditioner.embedders.1.model.transformer.resblocks.17.attn.out_proj.bias": "encoders.17.attn.to_out.bias",
468
+ "conditioner.embedders.1.model.transformer.resblocks.17.attn.out_proj.weight": "encoders.17.attn.to_out.weight",
469
+ "conditioner.embedders.1.model.transformer.resblocks.17.ln_1.bias": "encoders.17.layer_norm1.bias",
470
+ "conditioner.embedders.1.model.transformer.resblocks.17.ln_1.weight": "encoders.17.layer_norm1.weight",
471
+ "conditioner.embedders.1.model.transformer.resblocks.17.ln_2.bias": "encoders.17.layer_norm2.bias",
472
+ "conditioner.embedders.1.model.transformer.resblocks.17.ln_2.weight": "encoders.17.layer_norm2.weight",
473
+ "conditioner.embedders.1.model.transformer.resblocks.17.mlp.c_fc.bias": "encoders.17.fc1.bias",
474
+ "conditioner.embedders.1.model.transformer.resblocks.17.mlp.c_fc.weight": "encoders.17.fc1.weight",
475
+ "conditioner.embedders.1.model.transformer.resblocks.17.mlp.c_proj.bias": "encoders.17.fc2.bias",
476
+ "conditioner.embedders.1.model.transformer.resblocks.17.mlp.c_proj.weight": "encoders.17.fc2.weight",
477
+ "conditioner.embedders.1.model.transformer.resblocks.18.attn.in_proj_bias": ['encoders.18.attn.to_q.bias', 'encoders.18.attn.to_k.bias', 'encoders.18.attn.to_v.bias'],
478
+ "conditioner.embedders.1.model.transformer.resblocks.18.attn.in_proj_weight": ['encoders.18.attn.to_q.weight', 'encoders.18.attn.to_k.weight', 'encoders.18.attn.to_v.weight'],
479
+ "conditioner.embedders.1.model.transformer.resblocks.18.attn.out_proj.bias": "encoders.18.attn.to_out.bias",
480
+ "conditioner.embedders.1.model.transformer.resblocks.18.attn.out_proj.weight": "encoders.18.attn.to_out.weight",
481
+ "conditioner.embedders.1.model.transformer.resblocks.18.ln_1.bias": "encoders.18.layer_norm1.bias",
482
+ "conditioner.embedders.1.model.transformer.resblocks.18.ln_1.weight": "encoders.18.layer_norm1.weight",
483
+ "conditioner.embedders.1.model.transformer.resblocks.18.ln_2.bias": "encoders.18.layer_norm2.bias",
484
+ "conditioner.embedders.1.model.transformer.resblocks.18.ln_2.weight": "encoders.18.layer_norm2.weight",
485
+ "conditioner.embedders.1.model.transformer.resblocks.18.mlp.c_fc.bias": "encoders.18.fc1.bias",
486
+ "conditioner.embedders.1.model.transformer.resblocks.18.mlp.c_fc.weight": "encoders.18.fc1.weight",
487
+ "conditioner.embedders.1.model.transformer.resblocks.18.mlp.c_proj.bias": "encoders.18.fc2.bias",
488
+ "conditioner.embedders.1.model.transformer.resblocks.18.mlp.c_proj.weight": "encoders.18.fc2.weight",
489
+ "conditioner.embedders.1.model.transformer.resblocks.19.attn.in_proj_bias": ['encoders.19.attn.to_q.bias', 'encoders.19.attn.to_k.bias', 'encoders.19.attn.to_v.bias'],
490
+ "conditioner.embedders.1.model.transformer.resblocks.19.attn.in_proj_weight": ['encoders.19.attn.to_q.weight', 'encoders.19.attn.to_k.weight', 'encoders.19.attn.to_v.weight'],
491
+ "conditioner.embedders.1.model.transformer.resblocks.19.attn.out_proj.bias": "encoders.19.attn.to_out.bias",
492
+ "conditioner.embedders.1.model.transformer.resblocks.19.attn.out_proj.weight": "encoders.19.attn.to_out.weight",
493
+ "conditioner.embedders.1.model.transformer.resblocks.19.ln_1.bias": "encoders.19.layer_norm1.bias",
494
+ "conditioner.embedders.1.model.transformer.resblocks.19.ln_1.weight": "encoders.19.layer_norm1.weight",
495
+ "conditioner.embedders.1.model.transformer.resblocks.19.ln_2.bias": "encoders.19.layer_norm2.bias",
496
+ "conditioner.embedders.1.model.transformer.resblocks.19.ln_2.weight": "encoders.19.layer_norm2.weight",
497
+ "conditioner.embedders.1.model.transformer.resblocks.19.mlp.c_fc.bias": "encoders.19.fc1.bias",
498
+ "conditioner.embedders.1.model.transformer.resblocks.19.mlp.c_fc.weight": "encoders.19.fc1.weight",
499
+ "conditioner.embedders.1.model.transformer.resblocks.19.mlp.c_proj.bias": "encoders.19.fc2.bias",
500
+ "conditioner.embedders.1.model.transformer.resblocks.19.mlp.c_proj.weight": "encoders.19.fc2.weight",
501
+ "conditioner.embedders.1.model.transformer.resblocks.2.attn.in_proj_bias": ['encoders.2.attn.to_q.bias', 'encoders.2.attn.to_k.bias', 'encoders.2.attn.to_v.bias'],
502
+ "conditioner.embedders.1.model.transformer.resblocks.2.attn.in_proj_weight": ['encoders.2.attn.to_q.weight', 'encoders.2.attn.to_k.weight', 'encoders.2.attn.to_v.weight'],
503
+ "conditioner.embedders.1.model.transformer.resblocks.2.attn.out_proj.bias": "encoders.2.attn.to_out.bias",
504
+ "conditioner.embedders.1.model.transformer.resblocks.2.attn.out_proj.weight": "encoders.2.attn.to_out.weight",
505
+ "conditioner.embedders.1.model.transformer.resblocks.2.ln_1.bias": "encoders.2.layer_norm1.bias",
506
+ "conditioner.embedders.1.model.transformer.resblocks.2.ln_1.weight": "encoders.2.layer_norm1.weight",
507
+ "conditioner.embedders.1.model.transformer.resblocks.2.ln_2.bias": "encoders.2.layer_norm2.bias",
508
+ "conditioner.embedders.1.model.transformer.resblocks.2.ln_2.weight": "encoders.2.layer_norm2.weight",
509
+ "conditioner.embedders.1.model.transformer.resblocks.2.mlp.c_fc.bias": "encoders.2.fc1.bias",
510
+ "conditioner.embedders.1.model.transformer.resblocks.2.mlp.c_fc.weight": "encoders.2.fc1.weight",
511
+ "conditioner.embedders.1.model.transformer.resblocks.2.mlp.c_proj.bias": "encoders.2.fc2.bias",
512
+ "conditioner.embedders.1.model.transformer.resblocks.2.mlp.c_proj.weight": "encoders.2.fc2.weight",
513
+ "conditioner.embedders.1.model.transformer.resblocks.20.attn.in_proj_bias": ['encoders.20.attn.to_q.bias', 'encoders.20.attn.to_k.bias', 'encoders.20.attn.to_v.bias'],
514
+ "conditioner.embedders.1.model.transformer.resblocks.20.attn.in_proj_weight": ['encoders.20.attn.to_q.weight', 'encoders.20.attn.to_k.weight', 'encoders.20.attn.to_v.weight'],
515
+ "conditioner.embedders.1.model.transformer.resblocks.20.attn.out_proj.bias": "encoders.20.attn.to_out.bias",
516
+ "conditioner.embedders.1.model.transformer.resblocks.20.attn.out_proj.weight": "encoders.20.attn.to_out.weight",
517
+ "conditioner.embedders.1.model.transformer.resblocks.20.ln_1.bias": "encoders.20.layer_norm1.bias",
518
+ "conditioner.embedders.1.model.transformer.resblocks.20.ln_1.weight": "encoders.20.layer_norm1.weight",
519
+ "conditioner.embedders.1.model.transformer.resblocks.20.ln_2.bias": "encoders.20.layer_norm2.bias",
520
+ "conditioner.embedders.1.model.transformer.resblocks.20.ln_2.weight": "encoders.20.layer_norm2.weight",
521
+ "conditioner.embedders.1.model.transformer.resblocks.20.mlp.c_fc.bias": "encoders.20.fc1.bias",
522
+ "conditioner.embedders.1.model.transformer.resblocks.20.mlp.c_fc.weight": "encoders.20.fc1.weight",
523
+ "conditioner.embedders.1.model.transformer.resblocks.20.mlp.c_proj.bias": "encoders.20.fc2.bias",
524
+ "conditioner.embedders.1.model.transformer.resblocks.20.mlp.c_proj.weight": "encoders.20.fc2.weight",
525
+ "conditioner.embedders.1.model.transformer.resblocks.21.attn.in_proj_bias": ['encoders.21.attn.to_q.bias', 'encoders.21.attn.to_k.bias', 'encoders.21.attn.to_v.bias'],
526
+ "conditioner.embedders.1.model.transformer.resblocks.21.attn.in_proj_weight": ['encoders.21.attn.to_q.weight', 'encoders.21.attn.to_k.weight', 'encoders.21.attn.to_v.weight'],
527
+ "conditioner.embedders.1.model.transformer.resblocks.21.attn.out_proj.bias": "encoders.21.attn.to_out.bias",
528
+ "conditioner.embedders.1.model.transformer.resblocks.21.attn.out_proj.weight": "encoders.21.attn.to_out.weight",
529
+ "conditioner.embedders.1.model.transformer.resblocks.21.ln_1.bias": "encoders.21.layer_norm1.bias",
530
+ "conditioner.embedders.1.model.transformer.resblocks.21.ln_1.weight": "encoders.21.layer_norm1.weight",
531
+ "conditioner.embedders.1.model.transformer.resblocks.21.ln_2.bias": "encoders.21.layer_norm2.bias",
532
+ "conditioner.embedders.1.model.transformer.resblocks.21.ln_2.weight": "encoders.21.layer_norm2.weight",
533
+ "conditioner.embedders.1.model.transformer.resblocks.21.mlp.c_fc.bias": "encoders.21.fc1.bias",
534
+ "conditioner.embedders.1.model.transformer.resblocks.21.mlp.c_fc.weight": "encoders.21.fc1.weight",
535
+ "conditioner.embedders.1.model.transformer.resblocks.21.mlp.c_proj.bias": "encoders.21.fc2.bias",
536
+ "conditioner.embedders.1.model.transformer.resblocks.21.mlp.c_proj.weight": "encoders.21.fc2.weight",
537
+ "conditioner.embedders.1.model.transformer.resblocks.22.attn.in_proj_bias": ['encoders.22.attn.to_q.bias', 'encoders.22.attn.to_k.bias', 'encoders.22.attn.to_v.bias'],
538
+ "conditioner.embedders.1.model.transformer.resblocks.22.attn.in_proj_weight": ['encoders.22.attn.to_q.weight', 'encoders.22.attn.to_k.weight', 'encoders.22.attn.to_v.weight'],
539
+ "conditioner.embedders.1.model.transformer.resblocks.22.attn.out_proj.bias": "encoders.22.attn.to_out.bias",
540
+ "conditioner.embedders.1.model.transformer.resblocks.22.attn.out_proj.weight": "encoders.22.attn.to_out.weight",
541
+ "conditioner.embedders.1.model.transformer.resblocks.22.ln_1.bias": "encoders.22.layer_norm1.bias",
542
+ "conditioner.embedders.1.model.transformer.resblocks.22.ln_1.weight": "encoders.22.layer_norm1.weight",
543
+ "conditioner.embedders.1.model.transformer.resblocks.22.ln_2.bias": "encoders.22.layer_norm2.bias",
544
+ "conditioner.embedders.1.model.transformer.resblocks.22.ln_2.weight": "encoders.22.layer_norm2.weight",
545
+ "conditioner.embedders.1.model.transformer.resblocks.22.mlp.c_fc.bias": "encoders.22.fc1.bias",
546
+ "conditioner.embedders.1.model.transformer.resblocks.22.mlp.c_fc.weight": "encoders.22.fc1.weight",
547
+ "conditioner.embedders.1.model.transformer.resblocks.22.mlp.c_proj.bias": "encoders.22.fc2.bias",
548
+ "conditioner.embedders.1.model.transformer.resblocks.22.mlp.c_proj.weight": "encoders.22.fc2.weight",
549
+ "conditioner.embedders.1.model.transformer.resblocks.23.attn.in_proj_bias": ['encoders.23.attn.to_q.bias', 'encoders.23.attn.to_k.bias', 'encoders.23.attn.to_v.bias'],
550
+ "conditioner.embedders.1.model.transformer.resblocks.23.attn.in_proj_weight": ['encoders.23.attn.to_q.weight', 'encoders.23.attn.to_k.weight', 'encoders.23.attn.to_v.weight'],
551
+ "conditioner.embedders.1.model.transformer.resblocks.23.attn.out_proj.bias": "encoders.23.attn.to_out.bias",
552
+ "conditioner.embedders.1.model.transformer.resblocks.23.attn.out_proj.weight": "encoders.23.attn.to_out.weight",
553
+ "conditioner.embedders.1.model.transformer.resblocks.23.ln_1.bias": "encoders.23.layer_norm1.bias",
554
+ "conditioner.embedders.1.model.transformer.resblocks.23.ln_1.weight": "encoders.23.layer_norm1.weight",
555
+ "conditioner.embedders.1.model.transformer.resblocks.23.ln_2.bias": "encoders.23.layer_norm2.bias",
556
+ "conditioner.embedders.1.model.transformer.resblocks.23.ln_2.weight": "encoders.23.layer_norm2.weight",
557
+ "conditioner.embedders.1.model.transformer.resblocks.23.mlp.c_fc.bias": "encoders.23.fc1.bias",
558
+ "conditioner.embedders.1.model.transformer.resblocks.23.mlp.c_fc.weight": "encoders.23.fc1.weight",
559
+ "conditioner.embedders.1.model.transformer.resblocks.23.mlp.c_proj.bias": "encoders.23.fc2.bias",
560
+ "conditioner.embedders.1.model.transformer.resblocks.23.mlp.c_proj.weight": "encoders.23.fc2.weight",
561
+ "conditioner.embedders.1.model.transformer.resblocks.24.attn.in_proj_bias": ['encoders.24.attn.to_q.bias', 'encoders.24.attn.to_k.bias', 'encoders.24.attn.to_v.bias'],
562
+ "conditioner.embedders.1.model.transformer.resblocks.24.attn.in_proj_weight": ['encoders.24.attn.to_q.weight', 'encoders.24.attn.to_k.weight', 'encoders.24.attn.to_v.weight'],
563
+ "conditioner.embedders.1.model.transformer.resblocks.24.attn.out_proj.bias": "encoders.24.attn.to_out.bias",
564
+ "conditioner.embedders.1.model.transformer.resblocks.24.attn.out_proj.weight": "encoders.24.attn.to_out.weight",
565
+ "conditioner.embedders.1.model.transformer.resblocks.24.ln_1.bias": "encoders.24.layer_norm1.bias",
566
+ "conditioner.embedders.1.model.transformer.resblocks.24.ln_1.weight": "encoders.24.layer_norm1.weight",
567
+ "conditioner.embedders.1.model.transformer.resblocks.24.ln_2.bias": "encoders.24.layer_norm2.bias",
568
+ "conditioner.embedders.1.model.transformer.resblocks.24.ln_2.weight": "encoders.24.layer_norm2.weight",
569
+ "conditioner.embedders.1.model.transformer.resblocks.24.mlp.c_fc.bias": "encoders.24.fc1.bias",
570
+ "conditioner.embedders.1.model.transformer.resblocks.24.mlp.c_fc.weight": "encoders.24.fc1.weight",
571
+ "conditioner.embedders.1.model.transformer.resblocks.24.mlp.c_proj.bias": "encoders.24.fc2.bias",
572
+ "conditioner.embedders.1.model.transformer.resblocks.24.mlp.c_proj.weight": "encoders.24.fc2.weight",
573
+ "conditioner.embedders.1.model.transformer.resblocks.25.attn.in_proj_bias": ['encoders.25.attn.to_q.bias', 'encoders.25.attn.to_k.bias', 'encoders.25.attn.to_v.bias'],
574
+ "conditioner.embedders.1.model.transformer.resblocks.25.attn.in_proj_weight": ['encoders.25.attn.to_q.weight', 'encoders.25.attn.to_k.weight', 'encoders.25.attn.to_v.weight'],
575
+ "conditioner.embedders.1.model.transformer.resblocks.25.attn.out_proj.bias": "encoders.25.attn.to_out.bias",
576
+ "conditioner.embedders.1.model.transformer.resblocks.25.attn.out_proj.weight": "encoders.25.attn.to_out.weight",
577
+ "conditioner.embedders.1.model.transformer.resblocks.25.ln_1.bias": "encoders.25.layer_norm1.bias",
578
+ "conditioner.embedders.1.model.transformer.resblocks.25.ln_1.weight": "encoders.25.layer_norm1.weight",
579
+ "conditioner.embedders.1.model.transformer.resblocks.25.ln_2.bias": "encoders.25.layer_norm2.bias",
580
+ "conditioner.embedders.1.model.transformer.resblocks.25.ln_2.weight": "encoders.25.layer_norm2.weight",
581
+ "conditioner.embedders.1.model.transformer.resblocks.25.mlp.c_fc.bias": "encoders.25.fc1.bias",
582
+ "conditioner.embedders.1.model.transformer.resblocks.25.mlp.c_fc.weight": "encoders.25.fc1.weight",
583
+ "conditioner.embedders.1.model.transformer.resblocks.25.mlp.c_proj.bias": "encoders.25.fc2.bias",
584
+ "conditioner.embedders.1.model.transformer.resblocks.25.mlp.c_proj.weight": "encoders.25.fc2.weight",
585
+ "conditioner.embedders.1.model.transformer.resblocks.26.attn.in_proj_bias": ['encoders.26.attn.to_q.bias', 'encoders.26.attn.to_k.bias', 'encoders.26.attn.to_v.bias'],
586
+ "conditioner.embedders.1.model.transformer.resblocks.26.attn.in_proj_weight": ['encoders.26.attn.to_q.weight', 'encoders.26.attn.to_k.weight', 'encoders.26.attn.to_v.weight'],
587
+ "conditioner.embedders.1.model.transformer.resblocks.26.attn.out_proj.bias": "encoders.26.attn.to_out.bias",
588
+ "conditioner.embedders.1.model.transformer.resblocks.26.attn.out_proj.weight": "encoders.26.attn.to_out.weight",
589
+ "conditioner.embedders.1.model.transformer.resblocks.26.ln_1.bias": "encoders.26.layer_norm1.bias",
590
+ "conditioner.embedders.1.model.transformer.resblocks.26.ln_1.weight": "encoders.26.layer_norm1.weight",
591
+ "conditioner.embedders.1.model.transformer.resblocks.26.ln_2.bias": "encoders.26.layer_norm2.bias",
592
+ "conditioner.embedders.1.model.transformer.resblocks.26.ln_2.weight": "encoders.26.layer_norm2.weight",
593
+ "conditioner.embedders.1.model.transformer.resblocks.26.mlp.c_fc.bias": "encoders.26.fc1.bias",
594
+ "conditioner.embedders.1.model.transformer.resblocks.26.mlp.c_fc.weight": "encoders.26.fc1.weight",
595
+ "conditioner.embedders.1.model.transformer.resblocks.26.mlp.c_proj.bias": "encoders.26.fc2.bias",
596
+ "conditioner.embedders.1.model.transformer.resblocks.26.mlp.c_proj.weight": "encoders.26.fc2.weight",
597
+ "conditioner.embedders.1.model.transformer.resblocks.27.attn.in_proj_bias": ['encoders.27.attn.to_q.bias', 'encoders.27.attn.to_k.bias', 'encoders.27.attn.to_v.bias'],
598
+ "conditioner.embedders.1.model.transformer.resblocks.27.attn.in_proj_weight": ['encoders.27.attn.to_q.weight', 'encoders.27.attn.to_k.weight', 'encoders.27.attn.to_v.weight'],
599
+ "conditioner.embedders.1.model.transformer.resblocks.27.attn.out_proj.bias": "encoders.27.attn.to_out.bias",
600
+ "conditioner.embedders.1.model.transformer.resblocks.27.attn.out_proj.weight": "encoders.27.attn.to_out.weight",
601
+ "conditioner.embedders.1.model.transformer.resblocks.27.ln_1.bias": "encoders.27.layer_norm1.bias",
602
+ "conditioner.embedders.1.model.transformer.resblocks.27.ln_1.weight": "encoders.27.layer_norm1.weight",
603
+ "conditioner.embedders.1.model.transformer.resblocks.27.ln_2.bias": "encoders.27.layer_norm2.bias",
604
+ "conditioner.embedders.1.model.transformer.resblocks.27.ln_2.weight": "encoders.27.layer_norm2.weight",
605
+ "conditioner.embedders.1.model.transformer.resblocks.27.mlp.c_fc.bias": "encoders.27.fc1.bias",
606
+ "conditioner.embedders.1.model.transformer.resblocks.27.mlp.c_fc.weight": "encoders.27.fc1.weight",
607
+ "conditioner.embedders.1.model.transformer.resblocks.27.mlp.c_proj.bias": "encoders.27.fc2.bias",
608
+ "conditioner.embedders.1.model.transformer.resblocks.27.mlp.c_proj.weight": "encoders.27.fc2.weight",
609
+ "conditioner.embedders.1.model.transformer.resblocks.28.attn.in_proj_bias": ['encoders.28.attn.to_q.bias', 'encoders.28.attn.to_k.bias', 'encoders.28.attn.to_v.bias'],
610
+ "conditioner.embedders.1.model.transformer.resblocks.28.attn.in_proj_weight": ['encoders.28.attn.to_q.weight', 'encoders.28.attn.to_k.weight', 'encoders.28.attn.to_v.weight'],
611
+ "conditioner.embedders.1.model.transformer.resblocks.28.attn.out_proj.bias": "encoders.28.attn.to_out.bias",
612
+ "conditioner.embedders.1.model.transformer.resblocks.28.attn.out_proj.weight": "encoders.28.attn.to_out.weight",
613
+ "conditioner.embedders.1.model.transformer.resblocks.28.ln_1.bias": "encoders.28.layer_norm1.bias",
614
+ "conditioner.embedders.1.model.transformer.resblocks.28.ln_1.weight": "encoders.28.layer_norm1.weight",
615
+ "conditioner.embedders.1.model.transformer.resblocks.28.ln_2.bias": "encoders.28.layer_norm2.bias",
616
+ "conditioner.embedders.1.model.transformer.resblocks.28.ln_2.weight": "encoders.28.layer_norm2.weight",
617
+ "conditioner.embedders.1.model.transformer.resblocks.28.mlp.c_fc.bias": "encoders.28.fc1.bias",
618
+ "conditioner.embedders.1.model.transformer.resblocks.28.mlp.c_fc.weight": "encoders.28.fc1.weight",
619
+ "conditioner.embedders.1.model.transformer.resblocks.28.mlp.c_proj.bias": "encoders.28.fc2.bias",
620
+ "conditioner.embedders.1.model.transformer.resblocks.28.mlp.c_proj.weight": "encoders.28.fc2.weight",
621
+ "conditioner.embedders.1.model.transformer.resblocks.29.attn.in_proj_bias": ['encoders.29.attn.to_q.bias', 'encoders.29.attn.to_k.bias', 'encoders.29.attn.to_v.bias'],
622
+ "conditioner.embedders.1.model.transformer.resblocks.29.attn.in_proj_weight": ['encoders.29.attn.to_q.weight', 'encoders.29.attn.to_k.weight', 'encoders.29.attn.to_v.weight'],
623
+ "conditioner.embedders.1.model.transformer.resblocks.29.attn.out_proj.bias": "encoders.29.attn.to_out.bias",
624
+ "conditioner.embedders.1.model.transformer.resblocks.29.attn.out_proj.weight": "encoders.29.attn.to_out.weight",
625
+ "conditioner.embedders.1.model.transformer.resblocks.29.ln_1.bias": "encoders.29.layer_norm1.bias",
626
+ "conditioner.embedders.1.model.transformer.resblocks.29.ln_1.weight": "encoders.29.layer_norm1.weight",
627
+ "conditioner.embedders.1.model.transformer.resblocks.29.ln_2.bias": "encoders.29.layer_norm2.bias",
628
+ "conditioner.embedders.1.model.transformer.resblocks.29.ln_2.weight": "encoders.29.layer_norm2.weight",
629
+ "conditioner.embedders.1.model.transformer.resblocks.29.mlp.c_fc.bias": "encoders.29.fc1.bias",
630
+ "conditioner.embedders.1.model.transformer.resblocks.29.mlp.c_fc.weight": "encoders.29.fc1.weight",
631
+ "conditioner.embedders.1.model.transformer.resblocks.29.mlp.c_proj.bias": "encoders.29.fc2.bias",
632
+ "conditioner.embedders.1.model.transformer.resblocks.29.mlp.c_proj.weight": "encoders.29.fc2.weight",
633
+ "conditioner.embedders.1.model.transformer.resblocks.3.attn.in_proj_bias": ['encoders.3.attn.to_q.bias', 'encoders.3.attn.to_k.bias', 'encoders.3.attn.to_v.bias'],
634
+ "conditioner.embedders.1.model.transformer.resblocks.3.attn.in_proj_weight": ['encoders.3.attn.to_q.weight', 'encoders.3.attn.to_k.weight', 'encoders.3.attn.to_v.weight'],
635
+ "conditioner.embedders.1.model.transformer.resblocks.3.attn.out_proj.bias": "encoders.3.attn.to_out.bias",
636
+ "conditioner.embedders.1.model.transformer.resblocks.3.attn.out_proj.weight": "encoders.3.attn.to_out.weight",
637
+ "conditioner.embedders.1.model.transformer.resblocks.3.ln_1.bias": "encoders.3.layer_norm1.bias",
638
+ "conditioner.embedders.1.model.transformer.resblocks.3.ln_1.weight": "encoders.3.layer_norm1.weight",
639
+ "conditioner.embedders.1.model.transformer.resblocks.3.ln_2.bias": "encoders.3.layer_norm2.bias",
640
+ "conditioner.embedders.1.model.transformer.resblocks.3.ln_2.weight": "encoders.3.layer_norm2.weight",
641
+ "conditioner.embedders.1.model.transformer.resblocks.3.mlp.c_fc.bias": "encoders.3.fc1.bias",
642
+ "conditioner.embedders.1.model.transformer.resblocks.3.mlp.c_fc.weight": "encoders.3.fc1.weight",
643
+ "conditioner.embedders.1.model.transformer.resblocks.3.mlp.c_proj.bias": "encoders.3.fc2.bias",
644
+ "conditioner.embedders.1.model.transformer.resblocks.3.mlp.c_proj.weight": "encoders.3.fc2.weight",
645
+ "conditioner.embedders.1.model.transformer.resblocks.30.attn.in_proj_bias": ['encoders.30.attn.to_q.bias', 'encoders.30.attn.to_k.bias', 'encoders.30.attn.to_v.bias'],
646
+ "conditioner.embedders.1.model.transformer.resblocks.30.attn.in_proj_weight": ['encoders.30.attn.to_q.weight', 'encoders.30.attn.to_k.weight', 'encoders.30.attn.to_v.weight'],
647
+ "conditioner.embedders.1.model.transformer.resblocks.30.attn.out_proj.bias": "encoders.30.attn.to_out.bias",
648
+ "conditioner.embedders.1.model.transformer.resblocks.30.attn.out_proj.weight": "encoders.30.attn.to_out.weight",
649
+ "conditioner.embedders.1.model.transformer.resblocks.30.ln_1.bias": "encoders.30.layer_norm1.bias",
650
+ "conditioner.embedders.1.model.transformer.resblocks.30.ln_1.weight": "encoders.30.layer_norm1.weight",
651
+ "conditioner.embedders.1.model.transformer.resblocks.30.ln_2.bias": "encoders.30.layer_norm2.bias",
652
+ "conditioner.embedders.1.model.transformer.resblocks.30.ln_2.weight": "encoders.30.layer_norm2.weight",
653
+ "conditioner.embedders.1.model.transformer.resblocks.30.mlp.c_fc.bias": "encoders.30.fc1.bias",
654
+ "conditioner.embedders.1.model.transformer.resblocks.30.mlp.c_fc.weight": "encoders.30.fc1.weight",
655
+ "conditioner.embedders.1.model.transformer.resblocks.30.mlp.c_proj.bias": "encoders.30.fc2.bias",
656
+ "conditioner.embedders.1.model.transformer.resblocks.30.mlp.c_proj.weight": "encoders.30.fc2.weight",
657
+ "conditioner.embedders.1.model.transformer.resblocks.31.attn.in_proj_bias": ['encoders.31.attn.to_q.bias', 'encoders.31.attn.to_k.bias', 'encoders.31.attn.to_v.bias'],
658
+ "conditioner.embedders.1.model.transformer.resblocks.31.attn.in_proj_weight": ['encoders.31.attn.to_q.weight', 'encoders.31.attn.to_k.weight', 'encoders.31.attn.to_v.weight'],
659
+ "conditioner.embedders.1.model.transformer.resblocks.31.attn.out_proj.bias": "encoders.31.attn.to_out.bias",
660
+ "conditioner.embedders.1.model.transformer.resblocks.31.attn.out_proj.weight": "encoders.31.attn.to_out.weight",
661
+ "conditioner.embedders.1.model.transformer.resblocks.31.ln_1.bias": "encoders.31.layer_norm1.bias",
662
+ "conditioner.embedders.1.model.transformer.resblocks.31.ln_1.weight": "encoders.31.layer_norm1.weight",
663
+ "conditioner.embedders.1.model.transformer.resblocks.31.ln_2.bias": "encoders.31.layer_norm2.bias",
664
+ "conditioner.embedders.1.model.transformer.resblocks.31.ln_2.weight": "encoders.31.layer_norm2.weight",
665
+ "conditioner.embedders.1.model.transformer.resblocks.31.mlp.c_fc.bias": "encoders.31.fc1.bias",
666
+ "conditioner.embedders.1.model.transformer.resblocks.31.mlp.c_fc.weight": "encoders.31.fc1.weight",
667
+ "conditioner.embedders.1.model.transformer.resblocks.31.mlp.c_proj.bias": "encoders.31.fc2.bias",
668
+ "conditioner.embedders.1.model.transformer.resblocks.31.mlp.c_proj.weight": "encoders.31.fc2.weight",
669
+ "conditioner.embedders.1.model.transformer.resblocks.4.attn.in_proj_bias": ['encoders.4.attn.to_q.bias', 'encoders.4.attn.to_k.bias', 'encoders.4.attn.to_v.bias'],
670
+ "conditioner.embedders.1.model.transformer.resblocks.4.attn.in_proj_weight": ['encoders.4.attn.to_q.weight', 'encoders.4.attn.to_k.weight', 'encoders.4.attn.to_v.weight'],
671
+ "conditioner.embedders.1.model.transformer.resblocks.4.attn.out_proj.bias": "encoders.4.attn.to_out.bias",
672
+ "conditioner.embedders.1.model.transformer.resblocks.4.attn.out_proj.weight": "encoders.4.attn.to_out.weight",
673
+ "conditioner.embedders.1.model.transformer.resblocks.4.ln_1.bias": "encoders.4.layer_norm1.bias",
674
+ "conditioner.embedders.1.model.transformer.resblocks.4.ln_1.weight": "encoders.4.layer_norm1.weight",
675
+ "conditioner.embedders.1.model.transformer.resblocks.4.ln_2.bias": "encoders.4.layer_norm2.bias",
676
+ "conditioner.embedders.1.model.transformer.resblocks.4.ln_2.weight": "encoders.4.layer_norm2.weight",
677
+ "conditioner.embedders.1.model.transformer.resblocks.4.mlp.c_fc.bias": "encoders.4.fc1.bias",
678
+ "conditioner.embedders.1.model.transformer.resblocks.4.mlp.c_fc.weight": "encoders.4.fc1.weight",
679
+ "conditioner.embedders.1.model.transformer.resblocks.4.mlp.c_proj.bias": "encoders.4.fc2.bias",
680
+ "conditioner.embedders.1.model.transformer.resblocks.4.mlp.c_proj.weight": "encoders.4.fc2.weight",
681
+ "conditioner.embedders.1.model.transformer.resblocks.5.attn.in_proj_bias": ['encoders.5.attn.to_q.bias', 'encoders.5.attn.to_k.bias', 'encoders.5.attn.to_v.bias'],
682
+ "conditioner.embedders.1.model.transformer.resblocks.5.attn.in_proj_weight": ['encoders.5.attn.to_q.weight', 'encoders.5.attn.to_k.weight', 'encoders.5.attn.to_v.weight'],
683
+ "conditioner.embedders.1.model.transformer.resblocks.5.attn.out_proj.bias": "encoders.5.attn.to_out.bias",
684
+ "conditioner.embedders.1.model.transformer.resblocks.5.attn.out_proj.weight": "encoders.5.attn.to_out.weight",
685
+ "conditioner.embedders.1.model.transformer.resblocks.5.ln_1.bias": "encoders.5.layer_norm1.bias",
686
+ "conditioner.embedders.1.model.transformer.resblocks.5.ln_1.weight": "encoders.5.layer_norm1.weight",
687
+ "conditioner.embedders.1.model.transformer.resblocks.5.ln_2.bias": "encoders.5.layer_norm2.bias",
688
+ "conditioner.embedders.1.model.transformer.resblocks.5.ln_2.weight": "encoders.5.layer_norm2.weight",
689
+ "conditioner.embedders.1.model.transformer.resblocks.5.mlp.c_fc.bias": "encoders.5.fc1.bias",
690
+ "conditioner.embedders.1.model.transformer.resblocks.5.mlp.c_fc.weight": "encoders.5.fc1.weight",
691
+ "conditioner.embedders.1.model.transformer.resblocks.5.mlp.c_proj.bias": "encoders.5.fc2.bias",
692
+ "conditioner.embedders.1.model.transformer.resblocks.5.mlp.c_proj.weight": "encoders.5.fc2.weight",
693
+ "conditioner.embedders.1.model.transformer.resblocks.6.attn.in_proj_bias": ['encoders.6.attn.to_q.bias', 'encoders.6.attn.to_k.bias', 'encoders.6.attn.to_v.bias'],
694
+ "conditioner.embedders.1.model.transformer.resblocks.6.attn.in_proj_weight": ['encoders.6.attn.to_q.weight', 'encoders.6.attn.to_k.weight', 'encoders.6.attn.to_v.weight'],
695
+ "conditioner.embedders.1.model.transformer.resblocks.6.attn.out_proj.bias": "encoders.6.attn.to_out.bias",
696
+ "conditioner.embedders.1.model.transformer.resblocks.6.attn.out_proj.weight": "encoders.6.attn.to_out.weight",
697
+ "conditioner.embedders.1.model.transformer.resblocks.6.ln_1.bias": "encoders.6.layer_norm1.bias",
698
+ "conditioner.embedders.1.model.transformer.resblocks.6.ln_1.weight": "encoders.6.layer_norm1.weight",
699
+ "conditioner.embedders.1.model.transformer.resblocks.6.ln_2.bias": "encoders.6.layer_norm2.bias",
700
+ "conditioner.embedders.1.model.transformer.resblocks.6.ln_2.weight": "encoders.6.layer_norm2.weight",
701
+ "conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_fc.bias": "encoders.6.fc1.bias",
702
+ "conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_fc.weight": "encoders.6.fc1.weight",
703
+ "conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_proj.bias": "encoders.6.fc2.bias",
704
+ "conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_proj.weight": "encoders.6.fc2.weight",
705
+ "conditioner.embedders.1.model.transformer.resblocks.7.attn.in_proj_bias": ['encoders.7.attn.to_q.bias', 'encoders.7.attn.to_k.bias', 'encoders.7.attn.to_v.bias'],
706
+ "conditioner.embedders.1.model.transformer.resblocks.7.attn.in_proj_weight": ['encoders.7.attn.to_q.weight', 'encoders.7.attn.to_k.weight', 'encoders.7.attn.to_v.weight'],
707
+ "conditioner.embedders.1.model.transformer.resblocks.7.attn.out_proj.bias": "encoders.7.attn.to_out.bias",
708
+ "conditioner.embedders.1.model.transformer.resblocks.7.attn.out_proj.weight": "encoders.7.attn.to_out.weight",
709
+ "conditioner.embedders.1.model.transformer.resblocks.7.ln_1.bias": "encoders.7.layer_norm1.bias",
710
+ "conditioner.embedders.1.model.transformer.resblocks.7.ln_1.weight": "encoders.7.layer_norm1.weight",
711
+ "conditioner.embedders.1.model.transformer.resblocks.7.ln_2.bias": "encoders.7.layer_norm2.bias",
712
+ "conditioner.embedders.1.model.transformer.resblocks.7.ln_2.weight": "encoders.7.layer_norm2.weight",
713
+ "conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_fc.bias": "encoders.7.fc1.bias",
714
+ "conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_fc.weight": "encoders.7.fc1.weight",
715
+ "conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_proj.bias": "encoders.7.fc2.bias",
716
+ "conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_proj.weight": "encoders.7.fc2.weight",
717
+ "conditioner.embedders.1.model.transformer.resblocks.8.attn.in_proj_bias": ['encoders.8.attn.to_q.bias', 'encoders.8.attn.to_k.bias', 'encoders.8.attn.to_v.bias'],
718
+ "conditioner.embedders.1.model.transformer.resblocks.8.attn.in_proj_weight": ['encoders.8.attn.to_q.weight', 'encoders.8.attn.to_k.weight', 'encoders.8.attn.to_v.weight'],
719
+ "conditioner.embedders.1.model.transformer.resblocks.8.attn.out_proj.bias": "encoders.8.attn.to_out.bias",
720
+ "conditioner.embedders.1.model.transformer.resblocks.8.attn.out_proj.weight": "encoders.8.attn.to_out.weight",
721
+ "conditioner.embedders.1.model.transformer.resblocks.8.ln_1.bias": "encoders.8.layer_norm1.bias",
722
+ "conditioner.embedders.1.model.transformer.resblocks.8.ln_1.weight": "encoders.8.layer_norm1.weight",
723
+ "conditioner.embedders.1.model.transformer.resblocks.8.ln_2.bias": "encoders.8.layer_norm2.bias",
724
+ "conditioner.embedders.1.model.transformer.resblocks.8.ln_2.weight": "encoders.8.layer_norm2.weight",
725
+ "conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_fc.bias": "encoders.8.fc1.bias",
726
+ "conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_fc.weight": "encoders.8.fc1.weight",
727
+ "conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_proj.bias": "encoders.8.fc2.bias",
728
+ "conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_proj.weight": "encoders.8.fc2.weight",
729
+ "conditioner.embedders.1.model.transformer.resblocks.9.attn.in_proj_bias": ['encoders.9.attn.to_q.bias', 'encoders.9.attn.to_k.bias', 'encoders.9.attn.to_v.bias'],
730
+ "conditioner.embedders.1.model.transformer.resblocks.9.attn.in_proj_weight": ['encoders.9.attn.to_q.weight', 'encoders.9.attn.to_k.weight', 'encoders.9.attn.to_v.weight'],
731
+ "conditioner.embedders.1.model.transformer.resblocks.9.attn.out_proj.bias": "encoders.9.attn.to_out.bias",
732
+ "conditioner.embedders.1.model.transformer.resblocks.9.attn.out_proj.weight": "encoders.9.attn.to_out.weight",
733
+ "conditioner.embedders.1.model.transformer.resblocks.9.ln_1.bias": "encoders.9.layer_norm1.bias",
734
+ "conditioner.embedders.1.model.transformer.resblocks.9.ln_1.weight": "encoders.9.layer_norm1.weight",
735
+ "conditioner.embedders.1.model.transformer.resblocks.9.ln_2.bias": "encoders.9.layer_norm2.bias",
736
+ "conditioner.embedders.1.model.transformer.resblocks.9.ln_2.weight": "encoders.9.layer_norm2.weight",
737
+ "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_fc.bias": "encoders.9.fc1.bias",
738
+ "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_fc.weight": "encoders.9.fc1.weight",
739
+ "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias": "encoders.9.fc2.bias",
740
+ "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.weight": "encoders.9.fc2.weight",
741
+ "conditioner.embedders.1.model.text_projection": "text_projection.weight",
742
+ }
743
+ state_dict_ = {}
744
+ for name in state_dict:
745
+ if name in rename_dict:
746
+ param = state_dict[name]
747
+ if name == "conditioner.embedders.1.model.positional_embedding":
748
+ param = param.reshape((1, param.shape[0], param.shape[1]))
749
+ elif name == "conditioner.embedders.1.model.text_projection":
750
+ param = param.T
751
+ if isinstance(rename_dict[name], str):
752
+ state_dict_[rename_dict[name]] = param
753
+ else:
754
+ length = param.shape[0] // 3
755
+ for i, rename in enumerate(rename_dict[name]):
756
+ state_dict_[rename] = param[i*length: i*length+length]
757
+ return state_dict_
diffsynth/models/sdxl_unet.py ADDED
The diff for this file is too large to render. See raw diff
 
diffsynth/models/sdxl_vae_decoder.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .sd_vae_decoder import SDVAEDecoder, SDVAEDecoderStateDictConverter
2
+
3
+
4
+ class SDXLVAEDecoder(SDVAEDecoder):
5
+ def __init__(self):
6
+ super().__init__()
7
+ self.scaling_factor = 0.13025
8
+
9
+ def state_dict_converter(self):
10
+ return SDXLVAEDecoderStateDictConverter()
11
+
12
+
13
+ class SDXLVAEDecoderStateDictConverter(SDVAEDecoderStateDictConverter):
14
+ def __init__(self):
15
+ super().__init__()
diffsynth/models/sdxl_vae_encoder.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .sd_vae_encoder import SDVAEEncoderStateDictConverter, SDVAEEncoder
2
+
3
+
4
+ class SDXLVAEEncoder(SDVAEEncoder):
5
+ def __init__(self):
6
+ super().__init__()
7
+ self.scaling_factor = 0.13025
8
+
9
+ def state_dict_converter(self):
10
+ return SDXLVAEEncoderStateDictConverter()
11
+
12
+
13
+ class SDXLVAEEncoderStateDictConverter(SDVAEEncoderStateDictConverter):
14
+ def __init__(self):
15
+ super().__init__()
diffsynth/models/svd_image_encoder.py ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .sd_text_encoder import CLIPEncoderLayer
3
+
4
+
5
+ class CLIPVisionEmbeddings(torch.nn.Module):
6
+ def __init__(self, embed_dim=1280, image_size=224, patch_size=14, num_channels=3):
7
+ super().__init__()
8
+
9
+ # class_embeds (This is a fixed tensor)
10
+ self.class_embedding = torch.nn.Parameter(torch.randn(1, 1, embed_dim))
11
+
12
+ # position_embeds
13
+ self.patch_embedding = torch.nn.Conv2d(in_channels=num_channels, out_channels=embed_dim, kernel_size=patch_size, stride=patch_size, bias=False)
14
+
15
+ # position_embeds (This is a fixed tensor)
16
+ self.position_embeds = torch.nn.Parameter(torch.zeros(1, (image_size // patch_size) ** 2 + 1, embed_dim))
17
+
18
+ def forward(self, pixel_values):
19
+ batch_size = pixel_values.shape[0]
20
+ patch_embeds = self.patch_embedding(pixel_values)
21
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
22
+ class_embeds = self.class_embedding.repeat(batch_size, 1, 1)
23
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + self.position_embeds
24
+ return embeddings
25
+
26
+
27
+ class SVDImageEncoder(torch.nn.Module):
28
+ def __init__(self, embed_dim=1280, layer_norm_eps=1e-5, num_encoder_layers=32, encoder_intermediate_size=5120, projection_dim=1024, num_heads=16, head_dim=80):
29
+ super().__init__()
30
+ self.embeddings = CLIPVisionEmbeddings(embed_dim=embed_dim)
31
+ self.pre_layernorm = torch.nn.LayerNorm(embed_dim, eps=layer_norm_eps)
32
+ self.encoders = torch.nn.ModuleList([
33
+ CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=num_heads, head_dim=head_dim, use_quick_gelu=False)
34
+ for _ in range(num_encoder_layers)])
35
+ self.post_layernorm = torch.nn.LayerNorm(embed_dim, eps=layer_norm_eps)
36
+ self.visual_projection = torch.nn.Linear(embed_dim, projection_dim, bias=False)
37
+
38
+ def forward(self, pixel_values):
39
+ embeds = self.embeddings(pixel_values)
40
+ embeds = self.pre_layernorm(embeds)
41
+ for encoder_id, encoder in enumerate(self.encoders):
42
+ embeds = encoder(embeds)
43
+ embeds = self.post_layernorm(embeds[:, 0, :])
44
+ embeds = self.visual_projection(embeds)
45
+ return embeds
46
+
47
+ def state_dict_converter(self):
48
+ return SVDImageEncoderStateDictConverter()
49
+
50
+
51
+ class SVDImageEncoderStateDictConverter:
52
+ def __init__(self):
53
+ pass
54
+
55
+ def from_diffusers(self, state_dict):
56
+ rename_dict = {
57
+ "vision_model.embeddings.patch_embedding.weight": "embeddings.patch_embedding.weight",
58
+ "vision_model.embeddings.class_embedding": "embeddings.class_embedding",
59
+ "vision_model.embeddings.position_embedding.weight": "embeddings.position_embeds",
60
+ "vision_model.pre_layrnorm.weight": "pre_layernorm.weight",
61
+ "vision_model.pre_layrnorm.bias": "pre_layernorm.bias",
62
+ "vision_model.post_layernorm.weight": "post_layernorm.weight",
63
+ "vision_model.post_layernorm.bias": "post_layernorm.bias",
64
+ "visual_projection.weight": "visual_projection.weight"
65
+ }
66
+ attn_rename_dict = {
67
+ "self_attn.q_proj": "attn.to_q",
68
+ "self_attn.k_proj": "attn.to_k",
69
+ "self_attn.v_proj": "attn.to_v",
70
+ "self_attn.out_proj": "attn.to_out",
71
+ "layer_norm1": "layer_norm1",
72
+ "layer_norm2": "layer_norm2",
73
+ "mlp.fc1": "fc1",
74
+ "mlp.fc2": "fc2",
75
+ }
76
+ state_dict_ = {}
77
+ for name in state_dict:
78
+ if name in rename_dict:
79
+ param = state_dict[name]
80
+ if name == "vision_model.embeddings.class_embedding":
81
+ param = state_dict[name].view(1, 1, -1)
82
+ elif name == "vision_model.embeddings.position_embedding.weight":
83
+ param = state_dict[name].unsqueeze(0)
84
+ state_dict_[rename_dict[name]] = param
85
+ elif name.startswith("vision_model.encoder.layers."):
86
+ param = state_dict[name]
87
+ names = name.split(".")
88
+ layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
89
+ name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
90
+ state_dict_[name_] = param
91
+ return state_dict_
92
+
93
+ def from_civitai(self, state_dict):
94
+ rename_dict = {
95
+ "conditioner.embedders.0.open_clip.model.visual.class_embedding": "embeddings.class_embedding",
96
+ "conditioner.embedders.0.open_clip.model.visual.conv1.weight": "embeddings.patch_embedding.weight",
97
+ "conditioner.embedders.0.open_clip.model.visual.ln_post.bias": "post_layernorm.bias",
98
+ "conditioner.embedders.0.open_clip.model.visual.ln_post.weight": "post_layernorm.weight",
99
+ "conditioner.embedders.0.open_clip.model.visual.ln_pre.bias": "pre_layernorm.bias",
100
+ "conditioner.embedders.0.open_clip.model.visual.ln_pre.weight": "pre_layernorm.weight",
101
+ "conditioner.embedders.0.open_clip.model.visual.positional_embedding": "embeddings.position_embeds",
102
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.in_proj_bias": ['encoders.0.attn.to_q.bias', 'encoders.0.attn.to_k.bias', 'encoders.0.attn.to_v.bias'],
103
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.in_proj_weight": ['encoders.0.attn.to_q.weight', 'encoders.0.attn.to_k.weight', 'encoders.0.attn.to_v.weight'],
104
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.out_proj.bias": "encoders.0.attn.to_out.bias",
105
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.out_proj.weight": "encoders.0.attn.to_out.weight",
106
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_1.bias": "encoders.0.layer_norm1.bias",
107
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_1.weight": "encoders.0.layer_norm1.weight",
108
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_2.bias": "encoders.0.layer_norm2.bias",
109
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_2.weight": "encoders.0.layer_norm2.weight",
110
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_fc.bias": "encoders.0.fc1.bias",
111
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_fc.weight": "encoders.0.fc1.weight",
112
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_proj.bias": "encoders.0.fc2.bias",
113
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_proj.weight": "encoders.0.fc2.weight",
114
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.in_proj_bias": ['encoders.1.attn.to_q.bias', 'encoders.1.attn.to_k.bias', 'encoders.1.attn.to_v.bias'],
115
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.in_proj_weight": ['encoders.1.attn.to_q.weight', 'encoders.1.attn.to_k.weight', 'encoders.1.attn.to_v.weight'],
116
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.out_proj.bias": "encoders.1.attn.to_out.bias",
117
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.out_proj.weight": "encoders.1.attn.to_out.weight",
118
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_1.bias": "encoders.1.layer_norm1.bias",
119
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_1.weight": "encoders.1.layer_norm1.weight",
120
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_2.bias": "encoders.1.layer_norm2.bias",
121
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_2.weight": "encoders.1.layer_norm2.weight",
122
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_fc.bias": "encoders.1.fc1.bias",
123
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_fc.weight": "encoders.1.fc1.weight",
124
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_proj.bias": "encoders.1.fc2.bias",
125
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_proj.weight": "encoders.1.fc2.weight",
126
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.in_proj_bias": ['encoders.10.attn.to_q.bias', 'encoders.10.attn.to_k.bias', 'encoders.10.attn.to_v.bias'],
127
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.in_proj_weight": ['encoders.10.attn.to_q.weight', 'encoders.10.attn.to_k.weight', 'encoders.10.attn.to_v.weight'],
128
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.out_proj.bias": "encoders.10.attn.to_out.bias",
129
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.out_proj.weight": "encoders.10.attn.to_out.weight",
130
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_1.bias": "encoders.10.layer_norm1.bias",
131
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_1.weight": "encoders.10.layer_norm1.weight",
132
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_2.bias": "encoders.10.layer_norm2.bias",
133
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_2.weight": "encoders.10.layer_norm2.weight",
134
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_fc.bias": "encoders.10.fc1.bias",
135
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_fc.weight": "encoders.10.fc1.weight",
136
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_proj.bias": "encoders.10.fc2.bias",
137
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_proj.weight": "encoders.10.fc2.weight",
138
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.in_proj_bias": ['encoders.11.attn.to_q.bias', 'encoders.11.attn.to_k.bias', 'encoders.11.attn.to_v.bias'],
139
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.in_proj_weight": ['encoders.11.attn.to_q.weight', 'encoders.11.attn.to_k.weight', 'encoders.11.attn.to_v.weight'],
140
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.out_proj.bias": "encoders.11.attn.to_out.bias",
141
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.out_proj.weight": "encoders.11.attn.to_out.weight",
142
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_1.bias": "encoders.11.layer_norm1.bias",
143
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_1.weight": "encoders.11.layer_norm1.weight",
144
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_2.bias": "encoders.11.layer_norm2.bias",
145
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_2.weight": "encoders.11.layer_norm2.weight",
146
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_fc.bias": "encoders.11.fc1.bias",
147
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_fc.weight": "encoders.11.fc1.weight",
148
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_proj.bias": "encoders.11.fc2.bias",
149
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_proj.weight": "encoders.11.fc2.weight",
150
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.in_proj_bias": ['encoders.12.attn.to_q.bias', 'encoders.12.attn.to_k.bias', 'encoders.12.attn.to_v.bias'],
151
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.in_proj_weight": ['encoders.12.attn.to_q.weight', 'encoders.12.attn.to_k.weight', 'encoders.12.attn.to_v.weight'],
152
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.out_proj.bias": "encoders.12.attn.to_out.bias",
153
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.out_proj.weight": "encoders.12.attn.to_out.weight",
154
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_1.bias": "encoders.12.layer_norm1.bias",
155
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_1.weight": "encoders.12.layer_norm1.weight",
156
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_2.bias": "encoders.12.layer_norm2.bias",
157
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_2.weight": "encoders.12.layer_norm2.weight",
158
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_fc.bias": "encoders.12.fc1.bias",
159
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_fc.weight": "encoders.12.fc1.weight",
160
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_proj.bias": "encoders.12.fc2.bias",
161
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_proj.weight": "encoders.12.fc2.weight",
162
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.in_proj_bias": ['encoders.13.attn.to_q.bias', 'encoders.13.attn.to_k.bias', 'encoders.13.attn.to_v.bias'],
163
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.in_proj_weight": ['encoders.13.attn.to_q.weight', 'encoders.13.attn.to_k.weight', 'encoders.13.attn.to_v.weight'],
164
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.out_proj.bias": "encoders.13.attn.to_out.bias",
165
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.out_proj.weight": "encoders.13.attn.to_out.weight",
166
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_1.bias": "encoders.13.layer_norm1.bias",
167
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_1.weight": "encoders.13.layer_norm1.weight",
168
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_2.bias": "encoders.13.layer_norm2.bias",
169
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_2.weight": "encoders.13.layer_norm2.weight",
170
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_fc.bias": "encoders.13.fc1.bias",
171
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_fc.weight": "encoders.13.fc1.weight",
172
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_proj.bias": "encoders.13.fc2.bias",
173
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_proj.weight": "encoders.13.fc2.weight",
174
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.in_proj_bias": ['encoders.14.attn.to_q.bias', 'encoders.14.attn.to_k.bias', 'encoders.14.attn.to_v.bias'],
175
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.in_proj_weight": ['encoders.14.attn.to_q.weight', 'encoders.14.attn.to_k.weight', 'encoders.14.attn.to_v.weight'],
176
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.out_proj.bias": "encoders.14.attn.to_out.bias",
177
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.out_proj.weight": "encoders.14.attn.to_out.weight",
178
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_1.bias": "encoders.14.layer_norm1.bias",
179
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_1.weight": "encoders.14.layer_norm1.weight",
180
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_2.bias": "encoders.14.layer_norm2.bias",
181
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_2.weight": "encoders.14.layer_norm2.weight",
182
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_fc.bias": "encoders.14.fc1.bias",
183
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_fc.weight": "encoders.14.fc1.weight",
184
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_proj.bias": "encoders.14.fc2.bias",
185
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_proj.weight": "encoders.14.fc2.weight",
186
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.in_proj_bias": ['encoders.15.attn.to_q.bias', 'encoders.15.attn.to_k.bias', 'encoders.15.attn.to_v.bias'],
187
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.in_proj_weight": ['encoders.15.attn.to_q.weight', 'encoders.15.attn.to_k.weight', 'encoders.15.attn.to_v.weight'],
188
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.out_proj.bias": "encoders.15.attn.to_out.bias",
189
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.out_proj.weight": "encoders.15.attn.to_out.weight",
190
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_1.bias": "encoders.15.layer_norm1.bias",
191
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_1.weight": "encoders.15.layer_norm1.weight",
192
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_2.bias": "encoders.15.layer_norm2.bias",
193
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_2.weight": "encoders.15.layer_norm2.weight",
194
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_fc.bias": "encoders.15.fc1.bias",
195
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_fc.weight": "encoders.15.fc1.weight",
196
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_proj.bias": "encoders.15.fc2.bias",
197
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_proj.weight": "encoders.15.fc2.weight",
198
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.in_proj_bias": ['encoders.16.attn.to_q.bias', 'encoders.16.attn.to_k.bias', 'encoders.16.attn.to_v.bias'],
199
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.in_proj_weight": ['encoders.16.attn.to_q.weight', 'encoders.16.attn.to_k.weight', 'encoders.16.attn.to_v.weight'],
200
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.out_proj.bias": "encoders.16.attn.to_out.bias",
201
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.out_proj.weight": "encoders.16.attn.to_out.weight",
202
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_1.bias": "encoders.16.layer_norm1.bias",
203
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_1.weight": "encoders.16.layer_norm1.weight",
204
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_2.bias": "encoders.16.layer_norm2.bias",
205
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_2.weight": "encoders.16.layer_norm2.weight",
206
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_fc.bias": "encoders.16.fc1.bias",
207
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_fc.weight": "encoders.16.fc1.weight",
208
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_proj.bias": "encoders.16.fc2.bias",
209
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_proj.weight": "encoders.16.fc2.weight",
210
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.in_proj_bias": ['encoders.17.attn.to_q.bias', 'encoders.17.attn.to_k.bias', 'encoders.17.attn.to_v.bias'],
211
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.in_proj_weight": ['encoders.17.attn.to_q.weight', 'encoders.17.attn.to_k.weight', 'encoders.17.attn.to_v.weight'],
212
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.out_proj.bias": "encoders.17.attn.to_out.bias",
213
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.out_proj.weight": "encoders.17.attn.to_out.weight",
214
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_1.bias": "encoders.17.layer_norm1.bias",
215
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_1.weight": "encoders.17.layer_norm1.weight",
216
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_2.bias": "encoders.17.layer_norm2.bias",
217
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_2.weight": "encoders.17.layer_norm2.weight",
218
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_fc.bias": "encoders.17.fc1.bias",
219
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_fc.weight": "encoders.17.fc1.weight",
220
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_proj.bias": "encoders.17.fc2.bias",
221
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_proj.weight": "encoders.17.fc2.weight",
222
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.in_proj_bias": ['encoders.18.attn.to_q.bias', 'encoders.18.attn.to_k.bias', 'encoders.18.attn.to_v.bias'],
223
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.in_proj_weight": ['encoders.18.attn.to_q.weight', 'encoders.18.attn.to_k.weight', 'encoders.18.attn.to_v.weight'],
224
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.out_proj.bias": "encoders.18.attn.to_out.bias",
225
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.out_proj.weight": "encoders.18.attn.to_out.weight",
226
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_1.bias": "encoders.18.layer_norm1.bias",
227
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_1.weight": "encoders.18.layer_norm1.weight",
228
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_2.bias": "encoders.18.layer_norm2.bias",
229
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_2.weight": "encoders.18.layer_norm2.weight",
230
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_fc.bias": "encoders.18.fc1.bias",
231
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_fc.weight": "encoders.18.fc1.weight",
232
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_proj.bias": "encoders.18.fc2.bias",
233
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_proj.weight": "encoders.18.fc2.weight",
234
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.in_proj_bias": ['encoders.19.attn.to_q.bias', 'encoders.19.attn.to_k.bias', 'encoders.19.attn.to_v.bias'],
235
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.in_proj_weight": ['encoders.19.attn.to_q.weight', 'encoders.19.attn.to_k.weight', 'encoders.19.attn.to_v.weight'],
236
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.out_proj.bias": "encoders.19.attn.to_out.bias",
237
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.out_proj.weight": "encoders.19.attn.to_out.weight",
238
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_1.bias": "encoders.19.layer_norm1.bias",
239
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_1.weight": "encoders.19.layer_norm1.weight",
240
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_2.bias": "encoders.19.layer_norm2.bias",
241
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_2.weight": "encoders.19.layer_norm2.weight",
242
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_fc.bias": "encoders.19.fc1.bias",
243
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_fc.weight": "encoders.19.fc1.weight",
244
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_proj.bias": "encoders.19.fc2.bias",
245
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_proj.weight": "encoders.19.fc2.weight",
246
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.in_proj_bias": ['encoders.2.attn.to_q.bias', 'encoders.2.attn.to_k.bias', 'encoders.2.attn.to_v.bias'],
247
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.in_proj_weight": ['encoders.2.attn.to_q.weight', 'encoders.2.attn.to_k.weight', 'encoders.2.attn.to_v.weight'],
248
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.out_proj.bias": "encoders.2.attn.to_out.bias",
249
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.out_proj.weight": "encoders.2.attn.to_out.weight",
250
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_1.bias": "encoders.2.layer_norm1.bias",
251
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_1.weight": "encoders.2.layer_norm1.weight",
252
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_2.bias": "encoders.2.layer_norm2.bias",
253
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_2.weight": "encoders.2.layer_norm2.weight",
254
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_fc.bias": "encoders.2.fc1.bias",
255
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_fc.weight": "encoders.2.fc1.weight",
256
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_proj.bias": "encoders.2.fc2.bias",
257
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_proj.weight": "encoders.2.fc2.weight",
258
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.in_proj_bias": ['encoders.20.attn.to_q.bias', 'encoders.20.attn.to_k.bias', 'encoders.20.attn.to_v.bias'],
259
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.in_proj_weight": ['encoders.20.attn.to_q.weight', 'encoders.20.attn.to_k.weight', 'encoders.20.attn.to_v.weight'],
260
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.out_proj.bias": "encoders.20.attn.to_out.bias",
261
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.out_proj.weight": "encoders.20.attn.to_out.weight",
262
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_1.bias": "encoders.20.layer_norm1.bias",
263
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_1.weight": "encoders.20.layer_norm1.weight",
264
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_2.bias": "encoders.20.layer_norm2.bias",
265
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_2.weight": "encoders.20.layer_norm2.weight",
266
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_fc.bias": "encoders.20.fc1.bias",
267
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_fc.weight": "encoders.20.fc1.weight",
268
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_proj.bias": "encoders.20.fc2.bias",
269
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_proj.weight": "encoders.20.fc2.weight",
270
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.in_proj_bias": ['encoders.21.attn.to_q.bias', 'encoders.21.attn.to_k.bias', 'encoders.21.attn.to_v.bias'],
271
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.in_proj_weight": ['encoders.21.attn.to_q.weight', 'encoders.21.attn.to_k.weight', 'encoders.21.attn.to_v.weight'],
272
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.out_proj.bias": "encoders.21.attn.to_out.bias",
273
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.out_proj.weight": "encoders.21.attn.to_out.weight",
274
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_1.bias": "encoders.21.layer_norm1.bias",
275
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_1.weight": "encoders.21.layer_norm1.weight",
276
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_2.bias": "encoders.21.layer_norm2.bias",
277
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_2.weight": "encoders.21.layer_norm2.weight",
278
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_fc.bias": "encoders.21.fc1.bias",
279
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_fc.weight": "encoders.21.fc1.weight",
280
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_proj.bias": "encoders.21.fc2.bias",
281
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_proj.weight": "encoders.21.fc2.weight",
282
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.in_proj_bias": ['encoders.22.attn.to_q.bias', 'encoders.22.attn.to_k.bias', 'encoders.22.attn.to_v.bias'],
283
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.in_proj_weight": ['encoders.22.attn.to_q.weight', 'encoders.22.attn.to_k.weight', 'encoders.22.attn.to_v.weight'],
284
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.out_proj.bias": "encoders.22.attn.to_out.bias",
285
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.out_proj.weight": "encoders.22.attn.to_out.weight",
286
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_1.bias": "encoders.22.layer_norm1.bias",
287
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_1.weight": "encoders.22.layer_norm1.weight",
288
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_2.bias": "encoders.22.layer_norm2.bias",
289
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_2.weight": "encoders.22.layer_norm2.weight",
290
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_fc.bias": "encoders.22.fc1.bias",
291
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_fc.weight": "encoders.22.fc1.weight",
292
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_proj.bias": "encoders.22.fc2.bias",
293
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_proj.weight": "encoders.22.fc2.weight",
294
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.in_proj_bias": ['encoders.23.attn.to_q.bias', 'encoders.23.attn.to_k.bias', 'encoders.23.attn.to_v.bias'],
295
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.in_proj_weight": ['encoders.23.attn.to_q.weight', 'encoders.23.attn.to_k.weight', 'encoders.23.attn.to_v.weight'],
296
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.out_proj.bias": "encoders.23.attn.to_out.bias",
297
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.out_proj.weight": "encoders.23.attn.to_out.weight",
298
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_1.bias": "encoders.23.layer_norm1.bias",
299
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_1.weight": "encoders.23.layer_norm1.weight",
300
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_2.bias": "encoders.23.layer_norm2.bias",
301
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_2.weight": "encoders.23.layer_norm2.weight",
302
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_fc.bias": "encoders.23.fc1.bias",
303
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_fc.weight": "encoders.23.fc1.weight",
304
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_proj.bias": "encoders.23.fc2.bias",
305
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_proj.weight": "encoders.23.fc2.weight",
306
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.in_proj_bias": ['encoders.24.attn.to_q.bias', 'encoders.24.attn.to_k.bias', 'encoders.24.attn.to_v.bias'],
307
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.in_proj_weight": ['encoders.24.attn.to_q.weight', 'encoders.24.attn.to_k.weight', 'encoders.24.attn.to_v.weight'],
308
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.out_proj.bias": "encoders.24.attn.to_out.bias",
309
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.out_proj.weight": "encoders.24.attn.to_out.weight",
310
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_1.bias": "encoders.24.layer_norm1.bias",
311
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_1.weight": "encoders.24.layer_norm1.weight",
312
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_2.bias": "encoders.24.layer_norm2.bias",
313
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_2.weight": "encoders.24.layer_norm2.weight",
314
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_fc.bias": "encoders.24.fc1.bias",
315
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_fc.weight": "encoders.24.fc1.weight",
316
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_proj.bias": "encoders.24.fc2.bias",
317
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_proj.weight": "encoders.24.fc2.weight",
318
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.in_proj_bias": ['encoders.25.attn.to_q.bias', 'encoders.25.attn.to_k.bias', 'encoders.25.attn.to_v.bias'],
319
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.in_proj_weight": ['encoders.25.attn.to_q.weight', 'encoders.25.attn.to_k.weight', 'encoders.25.attn.to_v.weight'],
320
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.out_proj.bias": "encoders.25.attn.to_out.bias",
321
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.out_proj.weight": "encoders.25.attn.to_out.weight",
322
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_1.bias": "encoders.25.layer_norm1.bias",
323
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_1.weight": "encoders.25.layer_norm1.weight",
324
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_2.bias": "encoders.25.layer_norm2.bias",
325
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_2.weight": "encoders.25.layer_norm2.weight",
326
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_fc.bias": "encoders.25.fc1.bias",
327
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_fc.weight": "encoders.25.fc1.weight",
328
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_proj.bias": "encoders.25.fc2.bias",
329
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_proj.weight": "encoders.25.fc2.weight",
330
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.in_proj_bias": ['encoders.26.attn.to_q.bias', 'encoders.26.attn.to_k.bias', 'encoders.26.attn.to_v.bias'],
331
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.in_proj_weight": ['encoders.26.attn.to_q.weight', 'encoders.26.attn.to_k.weight', 'encoders.26.attn.to_v.weight'],
332
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.out_proj.bias": "encoders.26.attn.to_out.bias",
333
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.out_proj.weight": "encoders.26.attn.to_out.weight",
334
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_1.bias": "encoders.26.layer_norm1.bias",
335
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_1.weight": "encoders.26.layer_norm1.weight",
336
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_2.bias": "encoders.26.layer_norm2.bias",
337
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_2.weight": "encoders.26.layer_norm2.weight",
338
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_fc.bias": "encoders.26.fc1.bias",
339
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_fc.weight": "encoders.26.fc1.weight",
340
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_proj.bias": "encoders.26.fc2.bias",
341
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_proj.weight": "encoders.26.fc2.weight",
342
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.in_proj_bias": ['encoders.27.attn.to_q.bias', 'encoders.27.attn.to_k.bias', 'encoders.27.attn.to_v.bias'],
343
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.in_proj_weight": ['encoders.27.attn.to_q.weight', 'encoders.27.attn.to_k.weight', 'encoders.27.attn.to_v.weight'],
344
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.out_proj.bias": "encoders.27.attn.to_out.bias",
345
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.out_proj.weight": "encoders.27.attn.to_out.weight",
346
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_1.bias": "encoders.27.layer_norm1.bias",
347
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_1.weight": "encoders.27.layer_norm1.weight",
348
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_2.bias": "encoders.27.layer_norm2.bias",
349
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_2.weight": "encoders.27.layer_norm2.weight",
350
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_fc.bias": "encoders.27.fc1.bias",
351
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_fc.weight": "encoders.27.fc1.weight",
352
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_proj.bias": "encoders.27.fc2.bias",
353
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_proj.weight": "encoders.27.fc2.weight",
354
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.in_proj_bias": ['encoders.28.attn.to_q.bias', 'encoders.28.attn.to_k.bias', 'encoders.28.attn.to_v.bias'],
355
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.in_proj_weight": ['encoders.28.attn.to_q.weight', 'encoders.28.attn.to_k.weight', 'encoders.28.attn.to_v.weight'],
356
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.out_proj.bias": "encoders.28.attn.to_out.bias",
357
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.out_proj.weight": "encoders.28.attn.to_out.weight",
358
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_1.bias": "encoders.28.layer_norm1.bias",
359
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_1.weight": "encoders.28.layer_norm1.weight",
360
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_2.bias": "encoders.28.layer_norm2.bias",
361
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_2.weight": "encoders.28.layer_norm2.weight",
362
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_fc.bias": "encoders.28.fc1.bias",
363
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_fc.weight": "encoders.28.fc1.weight",
364
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_proj.bias": "encoders.28.fc2.bias",
365
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_proj.weight": "encoders.28.fc2.weight",
366
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.in_proj_bias": ['encoders.29.attn.to_q.bias', 'encoders.29.attn.to_k.bias', 'encoders.29.attn.to_v.bias'],
367
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.in_proj_weight": ['encoders.29.attn.to_q.weight', 'encoders.29.attn.to_k.weight', 'encoders.29.attn.to_v.weight'],
368
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.out_proj.bias": "encoders.29.attn.to_out.bias",
369
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.out_proj.weight": "encoders.29.attn.to_out.weight",
370
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_1.bias": "encoders.29.layer_norm1.bias",
371
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_1.weight": "encoders.29.layer_norm1.weight",
372
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_2.bias": "encoders.29.layer_norm2.bias",
373
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_2.weight": "encoders.29.layer_norm2.weight",
374
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_fc.bias": "encoders.29.fc1.bias",
375
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_fc.weight": "encoders.29.fc1.weight",
376
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_proj.bias": "encoders.29.fc2.bias",
377
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_proj.weight": "encoders.29.fc2.weight",
378
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.in_proj_bias": ['encoders.3.attn.to_q.bias', 'encoders.3.attn.to_k.bias', 'encoders.3.attn.to_v.bias'],
379
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.in_proj_weight": ['encoders.3.attn.to_q.weight', 'encoders.3.attn.to_k.weight', 'encoders.3.attn.to_v.weight'],
380
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.out_proj.bias": "encoders.3.attn.to_out.bias",
381
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.out_proj.weight": "encoders.3.attn.to_out.weight",
382
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_1.bias": "encoders.3.layer_norm1.bias",
383
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_1.weight": "encoders.3.layer_norm1.weight",
384
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_2.bias": "encoders.3.layer_norm2.bias",
385
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_2.weight": "encoders.3.layer_norm2.weight",
386
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_fc.bias": "encoders.3.fc1.bias",
387
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_fc.weight": "encoders.3.fc1.weight",
388
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_proj.bias": "encoders.3.fc2.bias",
389
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_proj.weight": "encoders.3.fc2.weight",
390
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.in_proj_bias": ['encoders.30.attn.to_q.bias', 'encoders.30.attn.to_k.bias', 'encoders.30.attn.to_v.bias'],
391
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.in_proj_weight": ['encoders.30.attn.to_q.weight', 'encoders.30.attn.to_k.weight', 'encoders.30.attn.to_v.weight'],
392
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.out_proj.bias": "encoders.30.attn.to_out.bias",
393
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.out_proj.weight": "encoders.30.attn.to_out.weight",
394
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_1.bias": "encoders.30.layer_norm1.bias",
395
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_1.weight": "encoders.30.layer_norm1.weight",
396
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_2.bias": "encoders.30.layer_norm2.bias",
397
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_2.weight": "encoders.30.layer_norm2.weight",
398
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_fc.bias": "encoders.30.fc1.bias",
399
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_fc.weight": "encoders.30.fc1.weight",
400
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_proj.bias": "encoders.30.fc2.bias",
401
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_proj.weight": "encoders.30.fc2.weight",
402
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.in_proj_bias": ['encoders.31.attn.to_q.bias', 'encoders.31.attn.to_k.bias', 'encoders.31.attn.to_v.bias'],
403
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.in_proj_weight": ['encoders.31.attn.to_q.weight', 'encoders.31.attn.to_k.weight', 'encoders.31.attn.to_v.weight'],
404
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.out_proj.bias": "encoders.31.attn.to_out.bias",
405
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.out_proj.weight": "encoders.31.attn.to_out.weight",
406
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_1.bias": "encoders.31.layer_norm1.bias",
407
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_1.weight": "encoders.31.layer_norm1.weight",
408
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_2.bias": "encoders.31.layer_norm2.bias",
409
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_2.weight": "encoders.31.layer_norm2.weight",
410
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_fc.bias": "encoders.31.fc1.bias",
411
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_fc.weight": "encoders.31.fc1.weight",
412
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_proj.bias": "encoders.31.fc2.bias",
413
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_proj.weight": "encoders.31.fc2.weight",
414
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.in_proj_bias": ['encoders.4.attn.to_q.bias', 'encoders.4.attn.to_k.bias', 'encoders.4.attn.to_v.bias'],
415
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.in_proj_weight": ['encoders.4.attn.to_q.weight', 'encoders.4.attn.to_k.weight', 'encoders.4.attn.to_v.weight'],
416
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.out_proj.bias": "encoders.4.attn.to_out.bias",
417
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.out_proj.weight": "encoders.4.attn.to_out.weight",
418
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_1.bias": "encoders.4.layer_norm1.bias",
419
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_1.weight": "encoders.4.layer_norm1.weight",
420
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_2.bias": "encoders.4.layer_norm2.bias",
421
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_2.weight": "encoders.4.layer_norm2.weight",
422
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_fc.bias": "encoders.4.fc1.bias",
423
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_fc.weight": "encoders.4.fc1.weight",
424
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_proj.bias": "encoders.4.fc2.bias",
425
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_proj.weight": "encoders.4.fc2.weight",
426
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.in_proj_bias": ['encoders.5.attn.to_q.bias', 'encoders.5.attn.to_k.bias', 'encoders.5.attn.to_v.bias'],
427
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.in_proj_weight": ['encoders.5.attn.to_q.weight', 'encoders.5.attn.to_k.weight', 'encoders.5.attn.to_v.weight'],
428
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.out_proj.bias": "encoders.5.attn.to_out.bias",
429
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.out_proj.weight": "encoders.5.attn.to_out.weight",
430
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_1.bias": "encoders.5.layer_norm1.bias",
431
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_1.weight": "encoders.5.layer_norm1.weight",
432
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_2.bias": "encoders.5.layer_norm2.bias",
433
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_2.weight": "encoders.5.layer_norm2.weight",
434
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_fc.bias": "encoders.5.fc1.bias",
435
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_fc.weight": "encoders.5.fc1.weight",
436
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_proj.bias": "encoders.5.fc2.bias",
437
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_proj.weight": "encoders.5.fc2.weight",
438
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.in_proj_bias": ['encoders.6.attn.to_q.bias', 'encoders.6.attn.to_k.bias', 'encoders.6.attn.to_v.bias'],
439
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.in_proj_weight": ['encoders.6.attn.to_q.weight', 'encoders.6.attn.to_k.weight', 'encoders.6.attn.to_v.weight'],
440
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.out_proj.bias": "encoders.6.attn.to_out.bias",
441
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.out_proj.weight": "encoders.6.attn.to_out.weight",
442
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_1.bias": "encoders.6.layer_norm1.bias",
443
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_1.weight": "encoders.6.layer_norm1.weight",
444
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_2.bias": "encoders.6.layer_norm2.bias",
445
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_2.weight": "encoders.6.layer_norm2.weight",
446
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_fc.bias": "encoders.6.fc1.bias",
447
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_fc.weight": "encoders.6.fc1.weight",
448
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_proj.bias": "encoders.6.fc2.bias",
449
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_proj.weight": "encoders.6.fc2.weight",
450
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.in_proj_bias": ['encoders.7.attn.to_q.bias', 'encoders.7.attn.to_k.bias', 'encoders.7.attn.to_v.bias'],
451
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.in_proj_weight": ['encoders.7.attn.to_q.weight', 'encoders.7.attn.to_k.weight', 'encoders.7.attn.to_v.weight'],
452
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.out_proj.bias": "encoders.7.attn.to_out.bias",
453
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.out_proj.weight": "encoders.7.attn.to_out.weight",
454
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_1.bias": "encoders.7.layer_norm1.bias",
455
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_1.weight": "encoders.7.layer_norm1.weight",
456
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_2.bias": "encoders.7.layer_norm2.bias",
457
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_2.weight": "encoders.7.layer_norm2.weight",
458
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_fc.bias": "encoders.7.fc1.bias",
459
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_fc.weight": "encoders.7.fc1.weight",
460
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_proj.bias": "encoders.7.fc2.bias",
461
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_proj.weight": "encoders.7.fc2.weight",
462
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.in_proj_bias": ['encoders.8.attn.to_q.bias', 'encoders.8.attn.to_k.bias', 'encoders.8.attn.to_v.bias'],
463
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.in_proj_weight": ['encoders.8.attn.to_q.weight', 'encoders.8.attn.to_k.weight', 'encoders.8.attn.to_v.weight'],
464
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.out_proj.bias": "encoders.8.attn.to_out.bias",
465
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.out_proj.weight": "encoders.8.attn.to_out.weight",
466
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_1.bias": "encoders.8.layer_norm1.bias",
467
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_1.weight": "encoders.8.layer_norm1.weight",
468
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_2.bias": "encoders.8.layer_norm2.bias",
469
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_2.weight": "encoders.8.layer_norm2.weight",
470
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_fc.bias": "encoders.8.fc1.bias",
471
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_fc.weight": "encoders.8.fc1.weight",
472
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_proj.bias": "encoders.8.fc2.bias",
473
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_proj.weight": "encoders.8.fc2.weight",
474
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.in_proj_bias": ['encoders.9.attn.to_q.bias', 'encoders.9.attn.to_k.bias', 'encoders.9.attn.to_v.bias'],
475
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.in_proj_weight": ['encoders.9.attn.to_q.weight', 'encoders.9.attn.to_k.weight', 'encoders.9.attn.to_v.weight'],
476
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.out_proj.bias": "encoders.9.attn.to_out.bias",
477
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.out_proj.weight": "encoders.9.attn.to_out.weight",
478
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_1.bias": "encoders.9.layer_norm1.bias",
479
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_1.weight": "encoders.9.layer_norm1.weight",
480
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_2.bias": "encoders.9.layer_norm2.bias",
481
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_2.weight": "encoders.9.layer_norm2.weight",
482
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_fc.bias": "encoders.9.fc1.bias",
483
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_fc.weight": "encoders.9.fc1.weight",
484
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_proj.bias": "encoders.9.fc2.bias",
485
+ "conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_proj.weight": "encoders.9.fc2.weight",
486
+ "conditioner.embedders.0.open_clip.model.visual.proj": "visual_projection.weight",
487
+ }
488
+ state_dict_ = {}
489
+ for name in state_dict:
490
+ if name in rename_dict:
491
+ param = state_dict[name]
492
+ if name == "conditioner.embedders.0.open_clip.model.visual.class_embedding":
493
+ param = param.reshape((1, 1, param.shape[0]))
494
+ elif name == "conditioner.embedders.0.open_clip.model.visual.positional_embedding":
495
+ param = param.reshape((1, param.shape[0], param.shape[1]))
496
+ elif name == "conditioner.embedders.0.open_clip.model.visual.proj":
497
+ param = param.T
498
+ if isinstance(rename_dict[name], str):
499
+ state_dict_[rename_dict[name]] = param
500
+ else:
501
+ length = param.shape[0] // 3
502
+ for i, rename in enumerate(rename_dict[name]):
503
+ state_dict_[rename] = param[i*length: i*length+length]
504
+ return state_dict_
diffsynth/models/svd_unet.py ADDED
The diff for this file is too large to render. See raw diff
 
diffsynth/models/svd_vae_decoder.py ADDED
@@ -0,0 +1,577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .attention import Attention
3
+ from .sd_unet import ResnetBlock, UpSampler
4
+ from .tiler import TileWorker
5
+ from einops import rearrange, repeat
6
+
7
+
8
+ class VAEAttentionBlock(torch.nn.Module):
9
+
10
+ def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5):
11
+ super().__init__()
12
+ inner_dim = num_attention_heads * attention_head_dim
13
+
14
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
15
+
16
+ self.transformer_blocks = torch.nn.ModuleList([
17
+ Attention(
18
+ inner_dim,
19
+ num_attention_heads,
20
+ attention_head_dim,
21
+ bias_q=True,
22
+ bias_kv=True,
23
+ bias_out=True
24
+ )
25
+ for d in range(num_layers)
26
+ ])
27
+
28
+ def forward(self, hidden_states, time_emb, text_emb, res_stack):
29
+ batch, _, height, width = hidden_states.shape
30
+ residual = hidden_states
31
+
32
+ hidden_states = self.norm(hidden_states)
33
+ inner_dim = hidden_states.shape[1]
34
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
35
+
36
+ for block in self.transformer_blocks:
37
+ hidden_states = block(hidden_states)
38
+
39
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
40
+ hidden_states = hidden_states + residual
41
+
42
+ return hidden_states, time_emb, text_emb, res_stack
43
+
44
+
45
+ class TemporalResnetBlock(torch.nn.Module):
46
+
47
+ def __init__(self, in_channels, out_channels, groups=32, eps=1e-5):
48
+ super().__init__()
49
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
50
+ self.conv1 = torch.nn.Conv3d(in_channels, out_channels, kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0))
51
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
52
+ self.conv2 = torch.nn.Conv3d(out_channels, out_channels, kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0))
53
+ self.nonlinearity = torch.nn.SiLU()
54
+ self.mix_factor = torch.nn.Parameter(torch.Tensor([0.5]))
55
+
56
+ def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
57
+ x_spatial = hidden_states
58
+ x = rearrange(hidden_states, "T C H W -> 1 C T H W")
59
+ x = self.norm1(x)
60
+ x = self.nonlinearity(x)
61
+ x = self.conv1(x)
62
+ x = self.norm2(x)
63
+ x = self.nonlinearity(x)
64
+ x = self.conv2(x)
65
+ x_temporal = hidden_states + x[0].permute(1, 0, 2, 3)
66
+ alpha = torch.sigmoid(self.mix_factor)
67
+ hidden_states = alpha * x_temporal + (1 - alpha) * x_spatial
68
+ return hidden_states, time_emb, text_emb, res_stack
69
+
70
+
71
+ class SVDVAEDecoder(torch.nn.Module):
72
+ def __init__(self):
73
+ super().__init__()
74
+ self.scaling_factor = 0.18215
75
+ self.conv_in = torch.nn.Conv2d(4, 512, kernel_size=3, padding=1)
76
+
77
+ self.blocks = torch.nn.ModuleList([
78
+ # UNetMidBlock
79
+ ResnetBlock(512, 512, eps=1e-6),
80
+ TemporalResnetBlock(512, 512, eps=1e-6),
81
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
82
+ ResnetBlock(512, 512, eps=1e-6),
83
+ TemporalResnetBlock(512, 512, eps=1e-6),
84
+ # UpDecoderBlock
85
+ ResnetBlock(512, 512, eps=1e-6),
86
+ TemporalResnetBlock(512, 512, eps=1e-6),
87
+ ResnetBlock(512, 512, eps=1e-6),
88
+ TemporalResnetBlock(512, 512, eps=1e-6),
89
+ ResnetBlock(512, 512, eps=1e-6),
90
+ TemporalResnetBlock(512, 512, eps=1e-6),
91
+ UpSampler(512),
92
+ # UpDecoderBlock
93
+ ResnetBlock(512, 512, eps=1e-6),
94
+ TemporalResnetBlock(512, 512, eps=1e-6),
95
+ ResnetBlock(512, 512, eps=1e-6),
96
+ TemporalResnetBlock(512, 512, eps=1e-6),
97
+ ResnetBlock(512, 512, eps=1e-6),
98
+ TemporalResnetBlock(512, 512, eps=1e-6),
99
+ UpSampler(512),
100
+ # UpDecoderBlock
101
+ ResnetBlock(512, 256, eps=1e-6),
102
+ TemporalResnetBlock(256, 256, eps=1e-6),
103
+ ResnetBlock(256, 256, eps=1e-6),
104
+ TemporalResnetBlock(256, 256, eps=1e-6),
105
+ ResnetBlock(256, 256, eps=1e-6),
106
+ TemporalResnetBlock(256, 256, eps=1e-6),
107
+ UpSampler(256),
108
+ # UpDecoderBlock
109
+ ResnetBlock(256, 128, eps=1e-6),
110
+ TemporalResnetBlock(128, 128, eps=1e-6),
111
+ ResnetBlock(128, 128, eps=1e-6),
112
+ TemporalResnetBlock(128, 128, eps=1e-6),
113
+ ResnetBlock(128, 128, eps=1e-6),
114
+ TemporalResnetBlock(128, 128, eps=1e-6),
115
+ ])
116
+
117
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-5)
118
+ self.conv_act = torch.nn.SiLU()
119
+ self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
120
+ self.time_conv_out = torch.nn.Conv3d(3, 3, kernel_size=(3, 1, 1), padding=(1, 0, 0))
121
+
122
+
123
+ def forward(self, sample):
124
+ # 1. pre-process
125
+ hidden_states = rearrange(sample, "C T H W -> T C H W")
126
+ hidden_states = hidden_states / self.scaling_factor
127
+ hidden_states = self.conv_in(hidden_states)
128
+ time_emb, text_emb, res_stack = None, None, None
129
+
130
+ # 2. blocks
131
+ for i, block in enumerate(self.blocks):
132
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
133
+
134
+ # 3. output
135
+ hidden_states = self.conv_norm_out(hidden_states)
136
+ hidden_states = self.conv_act(hidden_states)
137
+ hidden_states = self.conv_out(hidden_states)
138
+ hidden_states = rearrange(hidden_states, "T C H W -> C T H W")
139
+ hidden_states = self.time_conv_out(hidden_states)
140
+
141
+ return hidden_states
142
+
143
+
144
+ def build_mask(self, data, is_bound):
145
+ _, T, H, W = data.shape
146
+ t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W)
147
+ h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W)
148
+ w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W)
149
+ border_width = (T + H + W) // 6
150
+ pad = torch.ones_like(t) * border_width
151
+ mask = torch.stack([
152
+ pad if is_bound[0] else t + 1,
153
+ pad if is_bound[1] else T - t,
154
+ pad if is_bound[2] else h + 1,
155
+ pad if is_bound[3] else H - h,
156
+ pad if is_bound[4] else w + 1,
157
+ pad if is_bound[5] else W - w
158
+ ]).min(dim=0).values
159
+ mask = mask.clip(1, border_width)
160
+ mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
161
+ mask = rearrange(mask, "T H W -> 1 T H W")
162
+ return mask
163
+
164
+
165
+ def decode_video(
166
+ self, sample,
167
+ batch_time=8, batch_height=128, batch_width=128,
168
+ stride_time=4, stride_height=32, stride_width=32,
169
+ progress_bar=lambda x:x
170
+ ):
171
+ sample = sample.permute(1, 0, 2, 3)
172
+ data_device = sample.device
173
+ computation_device = self.conv_in.weight.device
174
+ torch_dtype = sample.dtype
175
+ _, T, H, W = sample.shape
176
+
177
+ weight = torch.zeros((1, T, H*8, W*8), dtype=torch_dtype, device=data_device)
178
+ values = torch.zeros((3, T, H*8, W*8), dtype=torch_dtype, device=data_device)
179
+
180
+ # Split tasks
181
+ tasks = []
182
+ for t in range(0, T, stride_time):
183
+ for h in range(0, H, stride_height):
184
+ for w in range(0, W, stride_width):
185
+ if (t-stride_time >= 0 and t-stride_time+batch_time >= T)\
186
+ or (h-stride_height >= 0 and h-stride_height+batch_height >= H)\
187
+ or (w-stride_width >= 0 and w-stride_width+batch_width >= W):
188
+ continue
189
+ tasks.append((t, t+batch_time, h, h+batch_height, w, w+batch_width))
190
+
191
+ # Run
192
+ for tl, tr, hl, hr, wl, wr in progress_bar(tasks):
193
+ sample_batch = sample[:, tl:tr, hl:hr, wl:wr].to(computation_device)
194
+ sample_batch = self.forward(sample_batch).to(data_device)
195
+ mask = self.build_mask(sample_batch, is_bound=(tl==0, tr>=T, hl==0, hr>=H, wl==0, wr>=W))
196
+ values[:, tl:tr, hl*8:hr*8, wl*8:wr*8] += sample_batch * mask
197
+ weight[:, tl:tr, hl*8:hr*8, wl*8:wr*8] += mask
198
+ values /= weight
199
+ return values
200
+
201
+
202
+ def state_dict_converter(self):
203
+ return SVDVAEDecoderStateDictConverter()
204
+
205
+
206
+ class SVDVAEDecoderStateDictConverter:
207
+ def __init__(self):
208
+ pass
209
+
210
+ def from_diffusers(self, state_dict):
211
+ static_rename_dict = {
212
+ "decoder.conv_in": "conv_in",
213
+ "decoder.mid_block.attentions.0.group_norm": "blocks.2.norm",
214
+ "decoder.mid_block.attentions.0.to_q": "blocks.2.transformer_blocks.0.to_q",
215
+ "decoder.mid_block.attentions.0.to_k": "blocks.2.transformer_blocks.0.to_k",
216
+ "decoder.mid_block.attentions.0.to_v": "blocks.2.transformer_blocks.0.to_v",
217
+ "decoder.mid_block.attentions.0.to_out.0": "blocks.2.transformer_blocks.0.to_out",
218
+ "decoder.up_blocks.0.upsamplers.0.conv": "blocks.11.conv",
219
+ "decoder.up_blocks.1.upsamplers.0.conv": "blocks.18.conv",
220
+ "decoder.up_blocks.2.upsamplers.0.conv": "blocks.25.conv",
221
+ "decoder.conv_norm_out": "conv_norm_out",
222
+ "decoder.conv_out": "conv_out",
223
+ "decoder.time_conv_out": "time_conv_out"
224
+ }
225
+ prefix_rename_dict = {
226
+ "decoder.mid_block.resnets.0.spatial_res_block": "blocks.0",
227
+ "decoder.mid_block.resnets.0.temporal_res_block": "blocks.1",
228
+ "decoder.mid_block.resnets.0.time_mixer": "blocks.1",
229
+ "decoder.mid_block.resnets.1.spatial_res_block": "blocks.3",
230
+ "decoder.mid_block.resnets.1.temporal_res_block": "blocks.4",
231
+ "decoder.mid_block.resnets.1.time_mixer": "blocks.4",
232
+
233
+ "decoder.up_blocks.0.resnets.0.spatial_res_block": "blocks.5",
234
+ "decoder.up_blocks.0.resnets.0.temporal_res_block": "blocks.6",
235
+ "decoder.up_blocks.0.resnets.0.time_mixer": "blocks.6",
236
+ "decoder.up_blocks.0.resnets.1.spatial_res_block": "blocks.7",
237
+ "decoder.up_blocks.0.resnets.1.temporal_res_block": "blocks.8",
238
+ "decoder.up_blocks.0.resnets.1.time_mixer": "blocks.8",
239
+ "decoder.up_blocks.0.resnets.2.spatial_res_block": "blocks.9",
240
+ "decoder.up_blocks.0.resnets.2.temporal_res_block": "blocks.10",
241
+ "decoder.up_blocks.0.resnets.2.time_mixer": "blocks.10",
242
+
243
+ "decoder.up_blocks.1.resnets.0.spatial_res_block": "blocks.12",
244
+ "decoder.up_blocks.1.resnets.0.temporal_res_block": "blocks.13",
245
+ "decoder.up_blocks.1.resnets.0.time_mixer": "blocks.13",
246
+ "decoder.up_blocks.1.resnets.1.spatial_res_block": "blocks.14",
247
+ "decoder.up_blocks.1.resnets.1.temporal_res_block": "blocks.15",
248
+ "decoder.up_blocks.1.resnets.1.time_mixer": "blocks.15",
249
+ "decoder.up_blocks.1.resnets.2.spatial_res_block": "blocks.16",
250
+ "decoder.up_blocks.1.resnets.2.temporal_res_block": "blocks.17",
251
+ "decoder.up_blocks.1.resnets.2.time_mixer": "blocks.17",
252
+
253
+ "decoder.up_blocks.2.resnets.0.spatial_res_block": "blocks.19",
254
+ "decoder.up_blocks.2.resnets.0.temporal_res_block": "blocks.20",
255
+ "decoder.up_blocks.2.resnets.0.time_mixer": "blocks.20",
256
+ "decoder.up_blocks.2.resnets.1.spatial_res_block": "blocks.21",
257
+ "decoder.up_blocks.2.resnets.1.temporal_res_block": "blocks.22",
258
+ "decoder.up_blocks.2.resnets.1.time_mixer": "blocks.22",
259
+ "decoder.up_blocks.2.resnets.2.spatial_res_block": "blocks.23",
260
+ "decoder.up_blocks.2.resnets.2.temporal_res_block": "blocks.24",
261
+ "decoder.up_blocks.2.resnets.2.time_mixer": "blocks.24",
262
+
263
+ "decoder.up_blocks.3.resnets.0.spatial_res_block": "blocks.26",
264
+ "decoder.up_blocks.3.resnets.0.temporal_res_block": "blocks.27",
265
+ "decoder.up_blocks.3.resnets.0.time_mixer": "blocks.27",
266
+ "decoder.up_blocks.3.resnets.1.spatial_res_block": "blocks.28",
267
+ "decoder.up_blocks.3.resnets.1.temporal_res_block": "blocks.29",
268
+ "decoder.up_blocks.3.resnets.1.time_mixer": "blocks.29",
269
+ "decoder.up_blocks.3.resnets.2.spatial_res_block": "blocks.30",
270
+ "decoder.up_blocks.3.resnets.2.temporal_res_block": "blocks.31",
271
+ "decoder.up_blocks.3.resnets.2.time_mixer": "blocks.31",
272
+ }
273
+ suffix_rename_dict = {
274
+ "norm1.weight": "norm1.weight",
275
+ "conv1.weight": "conv1.weight",
276
+ "norm2.weight": "norm2.weight",
277
+ "conv2.weight": "conv2.weight",
278
+ "conv_shortcut.weight": "conv_shortcut.weight",
279
+ "norm1.bias": "norm1.bias",
280
+ "conv1.bias": "conv1.bias",
281
+ "norm2.bias": "norm2.bias",
282
+ "conv2.bias": "conv2.bias",
283
+ "conv_shortcut.bias": "conv_shortcut.bias",
284
+ "mix_factor": "mix_factor",
285
+ }
286
+
287
+ state_dict_ = {}
288
+ for name in static_rename_dict:
289
+ state_dict_[static_rename_dict[name] + ".weight"] = state_dict[name + ".weight"]
290
+ state_dict_[static_rename_dict[name] + ".bias"] = state_dict[name + ".bias"]
291
+ for prefix_name in prefix_rename_dict:
292
+ for suffix_name in suffix_rename_dict:
293
+ name = prefix_name + "." + suffix_name
294
+ name_ = prefix_rename_dict[prefix_name] + "." + suffix_rename_dict[suffix_name]
295
+ if name in state_dict:
296
+ state_dict_[name_] = state_dict[name]
297
+
298
+ return state_dict_
299
+
300
+
301
+ def from_civitai(self, state_dict):
302
+ rename_dict = {
303
+ "first_stage_model.decoder.conv_in.bias": "conv_in.bias",
304
+ "first_stage_model.decoder.conv_in.weight": "conv_in.weight",
305
+ "first_stage_model.decoder.conv_out.bias": "conv_out.bias",
306
+ "first_stage_model.decoder.conv_out.time_mix_conv.bias": "time_conv_out.bias",
307
+ "first_stage_model.decoder.conv_out.time_mix_conv.weight": "time_conv_out.weight",
308
+ "first_stage_model.decoder.conv_out.weight": "conv_out.weight",
309
+ "first_stage_model.decoder.mid.attn_1.k.bias": "blocks.2.transformer_blocks.0.to_k.bias",
310
+ "first_stage_model.decoder.mid.attn_1.k.weight": "blocks.2.transformer_blocks.0.to_k.weight",
311
+ "first_stage_model.decoder.mid.attn_1.norm.bias": "blocks.2.norm.bias",
312
+ "first_stage_model.decoder.mid.attn_1.norm.weight": "blocks.2.norm.weight",
313
+ "first_stage_model.decoder.mid.attn_1.proj_out.bias": "blocks.2.transformer_blocks.0.to_out.bias",
314
+ "first_stage_model.decoder.mid.attn_1.proj_out.weight": "blocks.2.transformer_blocks.0.to_out.weight",
315
+ "first_stage_model.decoder.mid.attn_1.q.bias": "blocks.2.transformer_blocks.0.to_q.bias",
316
+ "first_stage_model.decoder.mid.attn_1.q.weight": "blocks.2.transformer_blocks.0.to_q.weight",
317
+ "first_stage_model.decoder.mid.attn_1.v.bias": "blocks.2.transformer_blocks.0.to_v.bias",
318
+ "first_stage_model.decoder.mid.attn_1.v.weight": "blocks.2.transformer_blocks.0.to_v.weight",
319
+ "first_stage_model.decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
320
+ "first_stage_model.decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
321
+ "first_stage_model.decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
322
+ "first_stage_model.decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
323
+ "first_stage_model.decoder.mid.block_1.mix_factor": "blocks.1.mix_factor",
324
+ "first_stage_model.decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
325
+ "first_stage_model.decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
326
+ "first_stage_model.decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
327
+ "first_stage_model.decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
328
+ "first_stage_model.decoder.mid.block_1.time_stack.in_layers.0.bias": "blocks.1.norm1.bias",
329
+ "first_stage_model.decoder.mid.block_1.time_stack.in_layers.0.weight": "blocks.1.norm1.weight",
330
+ "first_stage_model.decoder.mid.block_1.time_stack.in_layers.2.bias": "blocks.1.conv1.bias",
331
+ "first_stage_model.decoder.mid.block_1.time_stack.in_layers.2.weight": "blocks.1.conv1.weight",
332
+ "first_stage_model.decoder.mid.block_1.time_stack.out_layers.0.bias": "blocks.1.norm2.bias",
333
+ "first_stage_model.decoder.mid.block_1.time_stack.out_layers.0.weight": "blocks.1.norm2.weight",
334
+ "first_stage_model.decoder.mid.block_1.time_stack.out_layers.3.bias": "blocks.1.conv2.bias",
335
+ "first_stage_model.decoder.mid.block_1.time_stack.out_layers.3.weight": "blocks.1.conv2.weight",
336
+ "first_stage_model.decoder.mid.block_2.conv1.bias": "blocks.3.conv1.bias",
337
+ "first_stage_model.decoder.mid.block_2.conv1.weight": "blocks.3.conv1.weight",
338
+ "first_stage_model.decoder.mid.block_2.conv2.bias": "blocks.3.conv2.bias",
339
+ "first_stage_model.decoder.mid.block_2.conv2.weight": "blocks.3.conv2.weight",
340
+ "first_stage_model.decoder.mid.block_2.mix_factor": "blocks.4.mix_factor",
341
+ "first_stage_model.decoder.mid.block_2.norm1.bias": "blocks.3.norm1.bias",
342
+ "first_stage_model.decoder.mid.block_2.norm1.weight": "blocks.3.norm1.weight",
343
+ "first_stage_model.decoder.mid.block_2.norm2.bias": "blocks.3.norm2.bias",
344
+ "first_stage_model.decoder.mid.block_2.norm2.weight": "blocks.3.norm2.weight",
345
+ "first_stage_model.decoder.mid.block_2.time_stack.in_layers.0.bias": "blocks.4.norm1.bias",
346
+ "first_stage_model.decoder.mid.block_2.time_stack.in_layers.0.weight": "blocks.4.norm1.weight",
347
+ "first_stage_model.decoder.mid.block_2.time_stack.in_layers.2.bias": "blocks.4.conv1.bias",
348
+ "first_stage_model.decoder.mid.block_2.time_stack.in_layers.2.weight": "blocks.4.conv1.weight",
349
+ "first_stage_model.decoder.mid.block_2.time_stack.out_layers.0.bias": "blocks.4.norm2.bias",
350
+ "first_stage_model.decoder.mid.block_2.time_stack.out_layers.0.weight": "blocks.4.norm2.weight",
351
+ "first_stage_model.decoder.mid.block_2.time_stack.out_layers.3.bias": "blocks.4.conv2.bias",
352
+ "first_stage_model.decoder.mid.block_2.time_stack.out_layers.3.weight": "blocks.4.conv2.weight",
353
+ "first_stage_model.decoder.norm_out.bias": "conv_norm_out.bias",
354
+ "first_stage_model.decoder.norm_out.weight": "conv_norm_out.weight",
355
+ "first_stage_model.decoder.up.0.block.0.conv1.bias": "blocks.26.conv1.bias",
356
+ "first_stage_model.decoder.up.0.block.0.conv1.weight": "blocks.26.conv1.weight",
357
+ "first_stage_model.decoder.up.0.block.0.conv2.bias": "blocks.26.conv2.bias",
358
+ "first_stage_model.decoder.up.0.block.0.conv2.weight": "blocks.26.conv2.weight",
359
+ "first_stage_model.decoder.up.0.block.0.mix_factor": "blocks.27.mix_factor",
360
+ "first_stage_model.decoder.up.0.block.0.nin_shortcut.bias": "blocks.26.conv_shortcut.bias",
361
+ "first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": "blocks.26.conv_shortcut.weight",
362
+ "first_stage_model.decoder.up.0.block.0.norm1.bias": "blocks.26.norm1.bias",
363
+ "first_stage_model.decoder.up.0.block.0.norm1.weight": "blocks.26.norm1.weight",
364
+ "first_stage_model.decoder.up.0.block.0.norm2.bias": "blocks.26.norm2.bias",
365
+ "first_stage_model.decoder.up.0.block.0.norm2.weight": "blocks.26.norm2.weight",
366
+ "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.0.bias": "blocks.27.norm1.bias",
367
+ "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.0.weight": "blocks.27.norm1.weight",
368
+ "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.2.bias": "blocks.27.conv1.bias",
369
+ "first_stage_model.decoder.up.0.block.0.time_stack.in_layers.2.weight": "blocks.27.conv1.weight",
370
+ "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.0.bias": "blocks.27.norm2.bias",
371
+ "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.0.weight": "blocks.27.norm2.weight",
372
+ "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.3.bias": "blocks.27.conv2.bias",
373
+ "first_stage_model.decoder.up.0.block.0.time_stack.out_layers.3.weight": "blocks.27.conv2.weight",
374
+ "first_stage_model.decoder.up.0.block.1.conv1.bias": "blocks.28.conv1.bias",
375
+ "first_stage_model.decoder.up.0.block.1.conv1.weight": "blocks.28.conv1.weight",
376
+ "first_stage_model.decoder.up.0.block.1.conv2.bias": "blocks.28.conv2.bias",
377
+ "first_stage_model.decoder.up.0.block.1.conv2.weight": "blocks.28.conv2.weight",
378
+ "first_stage_model.decoder.up.0.block.1.mix_factor": "blocks.29.mix_factor",
379
+ "first_stage_model.decoder.up.0.block.1.norm1.bias": "blocks.28.norm1.bias",
380
+ "first_stage_model.decoder.up.0.block.1.norm1.weight": "blocks.28.norm1.weight",
381
+ "first_stage_model.decoder.up.0.block.1.norm2.bias": "blocks.28.norm2.bias",
382
+ "first_stage_model.decoder.up.0.block.1.norm2.weight": "blocks.28.norm2.weight",
383
+ "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.0.bias": "blocks.29.norm1.bias",
384
+ "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.0.weight": "blocks.29.norm1.weight",
385
+ "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.2.bias": "blocks.29.conv1.bias",
386
+ "first_stage_model.decoder.up.0.block.1.time_stack.in_layers.2.weight": "blocks.29.conv1.weight",
387
+ "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.0.bias": "blocks.29.norm2.bias",
388
+ "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.0.weight": "blocks.29.norm2.weight",
389
+ "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.3.bias": "blocks.29.conv2.bias",
390
+ "first_stage_model.decoder.up.0.block.1.time_stack.out_layers.3.weight": "blocks.29.conv2.weight",
391
+ "first_stage_model.decoder.up.0.block.2.conv1.bias": "blocks.30.conv1.bias",
392
+ "first_stage_model.decoder.up.0.block.2.conv1.weight": "blocks.30.conv1.weight",
393
+ "first_stage_model.decoder.up.0.block.2.conv2.bias": "blocks.30.conv2.bias",
394
+ "first_stage_model.decoder.up.0.block.2.conv2.weight": "blocks.30.conv2.weight",
395
+ "first_stage_model.decoder.up.0.block.2.mix_factor": "blocks.31.mix_factor",
396
+ "first_stage_model.decoder.up.0.block.2.norm1.bias": "blocks.30.norm1.bias",
397
+ "first_stage_model.decoder.up.0.block.2.norm1.weight": "blocks.30.norm1.weight",
398
+ "first_stage_model.decoder.up.0.block.2.norm2.bias": "blocks.30.norm2.bias",
399
+ "first_stage_model.decoder.up.0.block.2.norm2.weight": "blocks.30.norm2.weight",
400
+ "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.0.bias": "blocks.31.norm1.bias",
401
+ "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.0.weight": "blocks.31.norm1.weight",
402
+ "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.2.bias": "blocks.31.conv1.bias",
403
+ "first_stage_model.decoder.up.0.block.2.time_stack.in_layers.2.weight": "blocks.31.conv1.weight",
404
+ "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.0.bias": "blocks.31.norm2.bias",
405
+ "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.0.weight": "blocks.31.norm2.weight",
406
+ "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.3.bias": "blocks.31.conv2.bias",
407
+ "first_stage_model.decoder.up.0.block.2.time_stack.out_layers.3.weight": "blocks.31.conv2.weight",
408
+ "first_stage_model.decoder.up.1.block.0.conv1.bias": "blocks.19.conv1.bias",
409
+ "first_stage_model.decoder.up.1.block.0.conv1.weight": "blocks.19.conv1.weight",
410
+ "first_stage_model.decoder.up.1.block.0.conv2.bias": "blocks.19.conv2.bias",
411
+ "first_stage_model.decoder.up.1.block.0.conv2.weight": "blocks.19.conv2.weight",
412
+ "first_stage_model.decoder.up.1.block.0.mix_factor": "blocks.20.mix_factor",
413
+ "first_stage_model.decoder.up.1.block.0.nin_shortcut.bias": "blocks.19.conv_shortcut.bias",
414
+ "first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": "blocks.19.conv_shortcut.weight",
415
+ "first_stage_model.decoder.up.1.block.0.norm1.bias": "blocks.19.norm1.bias",
416
+ "first_stage_model.decoder.up.1.block.0.norm1.weight": "blocks.19.norm1.weight",
417
+ "first_stage_model.decoder.up.1.block.0.norm2.bias": "blocks.19.norm2.bias",
418
+ "first_stage_model.decoder.up.1.block.0.norm2.weight": "blocks.19.norm2.weight",
419
+ "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.0.bias": "blocks.20.norm1.bias",
420
+ "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.0.weight": "blocks.20.norm1.weight",
421
+ "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.2.bias": "blocks.20.conv1.bias",
422
+ "first_stage_model.decoder.up.1.block.0.time_stack.in_layers.2.weight": "blocks.20.conv1.weight",
423
+ "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.0.bias": "blocks.20.norm2.bias",
424
+ "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.0.weight": "blocks.20.norm2.weight",
425
+ "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.3.bias": "blocks.20.conv2.bias",
426
+ "first_stage_model.decoder.up.1.block.0.time_stack.out_layers.3.weight": "blocks.20.conv2.weight",
427
+ "first_stage_model.decoder.up.1.block.1.conv1.bias": "blocks.21.conv1.bias",
428
+ "first_stage_model.decoder.up.1.block.1.conv1.weight": "blocks.21.conv1.weight",
429
+ "first_stage_model.decoder.up.1.block.1.conv2.bias": "blocks.21.conv2.bias",
430
+ "first_stage_model.decoder.up.1.block.1.conv2.weight": "blocks.21.conv2.weight",
431
+ "first_stage_model.decoder.up.1.block.1.mix_factor": "blocks.22.mix_factor",
432
+ "first_stage_model.decoder.up.1.block.1.norm1.bias": "blocks.21.norm1.bias",
433
+ "first_stage_model.decoder.up.1.block.1.norm1.weight": "blocks.21.norm1.weight",
434
+ "first_stage_model.decoder.up.1.block.1.norm2.bias": "blocks.21.norm2.bias",
435
+ "first_stage_model.decoder.up.1.block.1.norm2.weight": "blocks.21.norm2.weight",
436
+ "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.0.bias": "blocks.22.norm1.bias",
437
+ "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.0.weight": "blocks.22.norm1.weight",
438
+ "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.2.bias": "blocks.22.conv1.bias",
439
+ "first_stage_model.decoder.up.1.block.1.time_stack.in_layers.2.weight": "blocks.22.conv1.weight",
440
+ "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.0.bias": "blocks.22.norm2.bias",
441
+ "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.0.weight": "blocks.22.norm2.weight",
442
+ "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.3.bias": "blocks.22.conv2.bias",
443
+ "first_stage_model.decoder.up.1.block.1.time_stack.out_layers.3.weight": "blocks.22.conv2.weight",
444
+ "first_stage_model.decoder.up.1.block.2.conv1.bias": "blocks.23.conv1.bias",
445
+ "first_stage_model.decoder.up.1.block.2.conv1.weight": "blocks.23.conv1.weight",
446
+ "first_stage_model.decoder.up.1.block.2.conv2.bias": "blocks.23.conv2.bias",
447
+ "first_stage_model.decoder.up.1.block.2.conv2.weight": "blocks.23.conv2.weight",
448
+ "first_stage_model.decoder.up.1.block.2.mix_factor": "blocks.24.mix_factor",
449
+ "first_stage_model.decoder.up.1.block.2.norm1.bias": "blocks.23.norm1.bias",
450
+ "first_stage_model.decoder.up.1.block.2.norm1.weight": "blocks.23.norm1.weight",
451
+ "first_stage_model.decoder.up.1.block.2.norm2.bias": "blocks.23.norm2.bias",
452
+ "first_stage_model.decoder.up.1.block.2.norm2.weight": "blocks.23.norm2.weight",
453
+ "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.0.bias": "blocks.24.norm1.bias",
454
+ "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.0.weight": "blocks.24.norm1.weight",
455
+ "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.2.bias": "blocks.24.conv1.bias",
456
+ "first_stage_model.decoder.up.1.block.2.time_stack.in_layers.2.weight": "blocks.24.conv1.weight",
457
+ "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.0.bias": "blocks.24.norm2.bias",
458
+ "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.0.weight": "blocks.24.norm2.weight",
459
+ "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.3.bias": "blocks.24.conv2.bias",
460
+ "first_stage_model.decoder.up.1.block.2.time_stack.out_layers.3.weight": "blocks.24.conv2.weight",
461
+ "first_stage_model.decoder.up.1.upsample.conv.bias": "blocks.25.conv.bias",
462
+ "first_stage_model.decoder.up.1.upsample.conv.weight": "blocks.25.conv.weight",
463
+ "first_stage_model.decoder.up.2.block.0.conv1.bias": "blocks.12.conv1.bias",
464
+ "first_stage_model.decoder.up.2.block.0.conv1.weight": "blocks.12.conv1.weight",
465
+ "first_stage_model.decoder.up.2.block.0.conv2.bias": "blocks.12.conv2.bias",
466
+ "first_stage_model.decoder.up.2.block.0.conv2.weight": "blocks.12.conv2.weight",
467
+ "first_stage_model.decoder.up.2.block.0.mix_factor": "blocks.13.mix_factor",
468
+ "first_stage_model.decoder.up.2.block.0.norm1.bias": "blocks.12.norm1.bias",
469
+ "first_stage_model.decoder.up.2.block.0.norm1.weight": "blocks.12.norm1.weight",
470
+ "first_stage_model.decoder.up.2.block.0.norm2.bias": "blocks.12.norm2.bias",
471
+ "first_stage_model.decoder.up.2.block.0.norm2.weight": "blocks.12.norm2.weight",
472
+ "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.0.bias": "blocks.13.norm1.bias",
473
+ "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.0.weight": "blocks.13.norm1.weight",
474
+ "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.2.bias": "blocks.13.conv1.bias",
475
+ "first_stage_model.decoder.up.2.block.0.time_stack.in_layers.2.weight": "blocks.13.conv1.weight",
476
+ "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.0.bias": "blocks.13.norm2.bias",
477
+ "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.0.weight": "blocks.13.norm2.weight",
478
+ "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.3.bias": "blocks.13.conv2.bias",
479
+ "first_stage_model.decoder.up.2.block.0.time_stack.out_layers.3.weight": "blocks.13.conv2.weight",
480
+ "first_stage_model.decoder.up.2.block.1.conv1.bias": "blocks.14.conv1.bias",
481
+ "first_stage_model.decoder.up.2.block.1.conv1.weight": "blocks.14.conv1.weight",
482
+ "first_stage_model.decoder.up.2.block.1.conv2.bias": "blocks.14.conv2.bias",
483
+ "first_stage_model.decoder.up.2.block.1.conv2.weight": "blocks.14.conv2.weight",
484
+ "first_stage_model.decoder.up.2.block.1.mix_factor": "blocks.15.mix_factor",
485
+ "first_stage_model.decoder.up.2.block.1.norm1.bias": "blocks.14.norm1.bias",
486
+ "first_stage_model.decoder.up.2.block.1.norm1.weight": "blocks.14.norm1.weight",
487
+ "first_stage_model.decoder.up.2.block.1.norm2.bias": "blocks.14.norm2.bias",
488
+ "first_stage_model.decoder.up.2.block.1.norm2.weight": "blocks.14.norm2.weight",
489
+ "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.0.bias": "blocks.15.norm1.bias",
490
+ "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.0.weight": "blocks.15.norm1.weight",
491
+ "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.2.bias": "blocks.15.conv1.bias",
492
+ "first_stage_model.decoder.up.2.block.1.time_stack.in_layers.2.weight": "blocks.15.conv1.weight",
493
+ "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.0.bias": "blocks.15.norm2.bias",
494
+ "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.0.weight": "blocks.15.norm2.weight",
495
+ "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.3.bias": "blocks.15.conv2.bias",
496
+ "first_stage_model.decoder.up.2.block.1.time_stack.out_layers.3.weight": "blocks.15.conv2.weight",
497
+ "first_stage_model.decoder.up.2.block.2.conv1.bias": "blocks.16.conv1.bias",
498
+ "first_stage_model.decoder.up.2.block.2.conv1.weight": "blocks.16.conv1.weight",
499
+ "first_stage_model.decoder.up.2.block.2.conv2.bias": "blocks.16.conv2.bias",
500
+ "first_stage_model.decoder.up.2.block.2.conv2.weight": "blocks.16.conv2.weight",
501
+ "first_stage_model.decoder.up.2.block.2.mix_factor": "blocks.17.mix_factor",
502
+ "first_stage_model.decoder.up.2.block.2.norm1.bias": "blocks.16.norm1.bias",
503
+ "first_stage_model.decoder.up.2.block.2.norm1.weight": "blocks.16.norm1.weight",
504
+ "first_stage_model.decoder.up.2.block.2.norm2.bias": "blocks.16.norm2.bias",
505
+ "first_stage_model.decoder.up.2.block.2.norm2.weight": "blocks.16.norm2.weight",
506
+ "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.0.bias": "blocks.17.norm1.bias",
507
+ "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.0.weight": "blocks.17.norm1.weight",
508
+ "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.2.bias": "blocks.17.conv1.bias",
509
+ "first_stage_model.decoder.up.2.block.2.time_stack.in_layers.2.weight": "blocks.17.conv1.weight",
510
+ "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.0.bias": "blocks.17.norm2.bias",
511
+ "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.0.weight": "blocks.17.norm2.weight",
512
+ "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.3.bias": "blocks.17.conv2.bias",
513
+ "first_stage_model.decoder.up.2.block.2.time_stack.out_layers.3.weight": "blocks.17.conv2.weight",
514
+ "first_stage_model.decoder.up.2.upsample.conv.bias": "blocks.18.conv.bias",
515
+ "first_stage_model.decoder.up.2.upsample.conv.weight": "blocks.18.conv.weight",
516
+ "first_stage_model.decoder.up.3.block.0.conv1.bias": "blocks.5.conv1.bias",
517
+ "first_stage_model.decoder.up.3.block.0.conv1.weight": "blocks.5.conv1.weight",
518
+ "first_stage_model.decoder.up.3.block.0.conv2.bias": "blocks.5.conv2.bias",
519
+ "first_stage_model.decoder.up.3.block.0.conv2.weight": "blocks.5.conv2.weight",
520
+ "first_stage_model.decoder.up.3.block.0.mix_factor": "blocks.6.mix_factor",
521
+ "first_stage_model.decoder.up.3.block.0.norm1.bias": "blocks.5.norm1.bias",
522
+ "first_stage_model.decoder.up.3.block.0.norm1.weight": "blocks.5.norm1.weight",
523
+ "first_stage_model.decoder.up.3.block.0.norm2.bias": "blocks.5.norm2.bias",
524
+ "first_stage_model.decoder.up.3.block.0.norm2.weight": "blocks.5.norm2.weight",
525
+ "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.0.bias": "blocks.6.norm1.bias",
526
+ "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.0.weight": "blocks.6.norm1.weight",
527
+ "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.2.bias": "blocks.6.conv1.bias",
528
+ "first_stage_model.decoder.up.3.block.0.time_stack.in_layers.2.weight": "blocks.6.conv1.weight",
529
+ "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.0.bias": "blocks.6.norm2.bias",
530
+ "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.0.weight": "blocks.6.norm2.weight",
531
+ "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.3.bias": "blocks.6.conv2.bias",
532
+ "first_stage_model.decoder.up.3.block.0.time_stack.out_layers.3.weight": "blocks.6.conv2.weight",
533
+ "first_stage_model.decoder.up.3.block.1.conv1.bias": "blocks.7.conv1.bias",
534
+ "first_stage_model.decoder.up.3.block.1.conv1.weight": "blocks.7.conv1.weight",
535
+ "first_stage_model.decoder.up.3.block.1.conv2.bias": "blocks.7.conv2.bias",
536
+ "first_stage_model.decoder.up.3.block.1.conv2.weight": "blocks.7.conv2.weight",
537
+ "first_stage_model.decoder.up.3.block.1.mix_factor": "blocks.8.mix_factor",
538
+ "first_stage_model.decoder.up.3.block.1.norm1.bias": "blocks.7.norm1.bias",
539
+ "first_stage_model.decoder.up.3.block.1.norm1.weight": "blocks.7.norm1.weight",
540
+ "first_stage_model.decoder.up.3.block.1.norm2.bias": "blocks.7.norm2.bias",
541
+ "first_stage_model.decoder.up.3.block.1.norm2.weight": "blocks.7.norm2.weight",
542
+ "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.0.bias": "blocks.8.norm1.bias",
543
+ "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.0.weight": "blocks.8.norm1.weight",
544
+ "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.2.bias": "blocks.8.conv1.bias",
545
+ "first_stage_model.decoder.up.3.block.1.time_stack.in_layers.2.weight": "blocks.8.conv1.weight",
546
+ "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.0.bias": "blocks.8.norm2.bias",
547
+ "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.0.weight": "blocks.8.norm2.weight",
548
+ "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.3.bias": "blocks.8.conv2.bias",
549
+ "first_stage_model.decoder.up.3.block.1.time_stack.out_layers.3.weight": "blocks.8.conv2.weight",
550
+ "first_stage_model.decoder.up.3.block.2.conv1.bias": "blocks.9.conv1.bias",
551
+ "first_stage_model.decoder.up.3.block.2.conv1.weight": "blocks.9.conv1.weight",
552
+ "first_stage_model.decoder.up.3.block.2.conv2.bias": "blocks.9.conv2.bias",
553
+ "first_stage_model.decoder.up.3.block.2.conv2.weight": "blocks.9.conv2.weight",
554
+ "first_stage_model.decoder.up.3.block.2.mix_factor": "blocks.10.mix_factor",
555
+ "first_stage_model.decoder.up.3.block.2.norm1.bias": "blocks.9.norm1.bias",
556
+ "first_stage_model.decoder.up.3.block.2.norm1.weight": "blocks.9.norm1.weight",
557
+ "first_stage_model.decoder.up.3.block.2.norm2.bias": "blocks.9.norm2.bias",
558
+ "first_stage_model.decoder.up.3.block.2.norm2.weight": "blocks.9.norm2.weight",
559
+ "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.0.bias": "blocks.10.norm1.bias",
560
+ "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.0.weight": "blocks.10.norm1.weight",
561
+ "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.2.bias": "blocks.10.conv1.bias",
562
+ "first_stage_model.decoder.up.3.block.2.time_stack.in_layers.2.weight": "blocks.10.conv1.weight",
563
+ "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.0.bias": "blocks.10.norm2.bias",
564
+ "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.0.weight": "blocks.10.norm2.weight",
565
+ "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.3.bias": "blocks.10.conv2.bias",
566
+ "first_stage_model.decoder.up.3.block.2.time_stack.out_layers.3.weight": "blocks.10.conv2.weight",
567
+ "first_stage_model.decoder.up.3.upsample.conv.bias": "blocks.11.conv.bias",
568
+ "first_stage_model.decoder.up.3.upsample.conv.weight": "blocks.11.conv.weight",
569
+ }
570
+ state_dict_ = {}
571
+ for name in state_dict:
572
+ if name in rename_dict:
573
+ param = state_dict[name]
574
+ if "blocks.2.transformer_blocks.0" in rename_dict[name]:
575
+ param = param.squeeze()
576
+ state_dict_[rename_dict[name]] = param
577
+ return state_dict_
diffsynth/models/svd_vae_encoder.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .sd_vae_encoder import SDVAEEncoderStateDictConverter, SDVAEEncoder
2
+
3
+
4
+ class SVDVAEEncoder(SDVAEEncoder):
5
+ def __init__(self):
6
+ super().__init__()
7
+ self.scaling_factor = 0.13025
8
+
9
+ def state_dict_converter(self):
10
+ return SVDVAEEncoderStateDictConverter()
11
+
12
+
13
+ class SVDVAEEncoderStateDictConverter(SDVAEEncoderStateDictConverter):
14
+ def __init__(self):
15
+ super().__init__()
16
+
17
+ def from_diffusers(self, state_dict):
18
+ return super().from_diffusers(state_dict)
19
+
20
+ def from_civitai(self, state_dict):
21
+ rename_dict = {
22
+ "conditioner.embedders.3.encoder.encoder.conv_in.bias": "conv_in.bias",
23
+ "conditioner.embedders.3.encoder.encoder.conv_in.weight": "conv_in.weight",
24
+ "conditioner.embedders.3.encoder.encoder.conv_out.bias": "conv_out.bias",
25
+ "conditioner.embedders.3.encoder.encoder.conv_out.weight": "conv_out.weight",
26
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias",
27
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight",
28
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias",
29
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight",
30
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias",
31
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight",
32
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias",
33
+ "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight",
34
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias",
35
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight",
36
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias",
37
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight",
38
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias",
39
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight",
40
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias",
41
+ "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight",
42
+ "conditioner.embedders.3.encoder.encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias",
43
+ "conditioner.embedders.3.encoder.encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight",
44
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias",
45
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight",
46
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias",
47
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight",
48
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias",
49
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight",
50
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias",
51
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight",
52
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias",
53
+ "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight",
54
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias",
55
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight",
56
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias",
57
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight",
58
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias",
59
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight",
60
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias",
61
+ "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight",
62
+ "conditioner.embedders.3.encoder.encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias",
63
+ "conditioner.embedders.3.encoder.encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight",
64
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias",
65
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight",
66
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias",
67
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight",
68
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias",
69
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight",
70
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias",
71
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight",
72
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias",
73
+ "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight",
74
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias",
75
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight",
76
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias",
77
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight",
78
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias",
79
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight",
80
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias",
81
+ "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight",
82
+ "conditioner.embedders.3.encoder.encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias",
83
+ "conditioner.embedders.3.encoder.encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight",
84
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias",
85
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight",
86
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias",
87
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight",
88
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias",
89
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight",
90
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias",
91
+ "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight",
92
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias",
93
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight",
94
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias",
95
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight",
96
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias",
97
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight",
98
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias",
99
+ "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight",
100
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias",
101
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight",
102
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias",
103
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight",
104
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias",
105
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight",
106
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias",
107
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight",
108
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias",
109
+ "conditioner.embedders.3.encoder.encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight",
110
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias",
111
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight",
112
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias",
113
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight",
114
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias",
115
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight",
116
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias",
117
+ "conditioner.embedders.3.encoder.encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight",
118
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias",
119
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight",
120
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias",
121
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight",
122
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias",
123
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight",
124
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias",
125
+ "conditioner.embedders.3.encoder.encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight",
126
+ "conditioner.embedders.3.encoder.encoder.norm_out.bias": "conv_norm_out.bias",
127
+ "conditioner.embedders.3.encoder.encoder.norm_out.weight": "conv_norm_out.weight",
128
+ "conditioner.embedders.3.encoder.quant_conv.bias": "quant_conv.bias",
129
+ "conditioner.embedders.3.encoder.quant_conv.weight": "quant_conv.weight",
130
+ }
131
+ state_dict_ = {}
132
+ for name in state_dict:
133
+ if name in rename_dict:
134
+ param = state_dict[name]
135
+ if "transformer_blocks" in rename_dict[name]:
136
+ param = param.squeeze()
137
+ state_dict_[rename_dict[name]] = param
138
+ return state_dict_