kevinwang676 committed fb4fac3 (1 parent: 4d4f2d3)
Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +1 -0
- DiffSynth_Studio.py +15 -0
- LICENSE +201 -0
- README.md +117 -13
- diffsynth/__init__.py +6 -0
- diffsynth/controlnets/__init__.py +2 -0
- diffsynth/controlnets/controlnet_unit.py +53 -0
- diffsynth/controlnets/processors.py +51 -0
- diffsynth/data/__init__.py +1 -0
- diffsynth/data/video.py +148 -0
- diffsynth/extensions/ESRGAN/__init__.py +118 -0
- diffsynth/extensions/FastBlend/__init__.py +63 -0
- diffsynth/extensions/FastBlend/api.py +397 -0
- diffsynth/extensions/FastBlend/cupy_kernels.py +119 -0
- diffsynth/extensions/FastBlend/data.py +146 -0
- diffsynth/extensions/FastBlend/patch_match.py +298 -0
- diffsynth/extensions/FastBlend/runners/__init__.py +4 -0
- diffsynth/extensions/FastBlend/runners/accurate.py +35 -0
- diffsynth/extensions/FastBlend/runners/balanced.py +46 -0
- diffsynth/extensions/FastBlend/runners/fast.py +141 -0
- diffsynth/extensions/FastBlend/runners/interpolation.py +121 -0
- diffsynth/extensions/RIFE/__init__.py +241 -0
- diffsynth/models/__init__.py +814 -0
- diffsynth/models/attention.py +89 -0
- diffsynth/models/downloader.py +28 -0
- diffsynth/models/hunyuan_dit.py +451 -0
- diffsynth/models/hunyuan_dit_text_encoder.py +161 -0
- diffsynth/models/kolors_text_encoder.py +1363 -0
- diffsynth/models/sd3_dit.py +797 -0
- diffsynth/models/sd3_text_encoder.py +0 -0
- diffsynth/models/sd3_vae_decoder.py +80 -0
- diffsynth/models/sd3_vae_encoder.py +94 -0
- diffsynth/models/sd_controlnet.py +587 -0
- diffsynth/models/sd_ipadapter.py +56 -0
- diffsynth/models/sd_lora.py +60 -0
- diffsynth/models/sd_motion.py +198 -0
- diffsynth/models/sd_text_encoder.py +320 -0
- diffsynth/models/sd_unet.py +0 -0
- diffsynth/models/sd_vae_decoder.py +332 -0
- diffsynth/models/sd_vae_encoder.py +278 -0
- diffsynth/models/sdxl_ipadapter.py +121 -0
- diffsynth/models/sdxl_motion.py +103 -0
- diffsynth/models/sdxl_text_encoder.py +757 -0
- diffsynth/models/sdxl_unet.py +0 -0
- diffsynth/models/sdxl_vae_decoder.py +15 -0
- diffsynth/models/sdxl_vae_encoder.py +15 -0
- diffsynth/models/svd_image_encoder.py +504 -0
- diffsynth/models/svd_unet.py +0 -0
- diffsynth/models/svd_vae_decoder.py +577 -0
- diffsynth/models/svd_vae_encoder.py +138 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+diffsynth/tokenizer_configs/kolors/tokenizer/vocab.txt filter=lfs diff=lfs merge=lfs -text
DiffSynth_Studio.py ADDED
@@ -0,0 +1,15 @@
# Set web page format
import streamlit as st
st.set_page_config(layout="wide")
# Disable virtual VRAM on Windows systems
import torch
torch.cuda.set_per_process_memory_fraction(0.999, 0)


st.markdown("""
# DiffSynth Studio

[Source Code](https://github.com/Artiprocher/DiffSynth-Studio)

Welcome to DiffSynth Studio.
""")
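The entry script pins 99.9% of GPU memory to the process so that Windows does not silently spill into shared "virtual VRAM"; on a machine without CUDA this call raises an error. A minimal sketch, assuming you want the same cap but guarded for CPU-only machines (the guard is an assumption, not part of the commit):

```python
import torch

# Cap per-process CUDA memory only when a GPU is present, so the
# Streamlit entry point also starts on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.999, 0)
```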
LICENSE ADDED
@@ -0,0 +1,201 @@
                              Apache License
                        Version 2.0, January 2004
                     http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
(the standard Apache-2.0 Sections 1-9 and appendix boilerplate, applied as follows)

Copyright [2023] [Zhongjie Duan]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README.md CHANGED
@@ -1,13 +1,117 @@
# DiffSynth Studio

## Introduction

DiffSynth Studio is a Diffusion engine. We have restructured the architectures of the Text Encoder, UNet, VAE, and other components, maintaining compatibility with models from the open-source community while improving computational performance. We provide many interesting features. Enjoy the magic of Diffusion models!

So far, DiffSynth Studio supports the following models:

* [ExVideo](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1)
* [Kolors](https://huggingface.co/Kwai-Kolors/Kolors)
* [Stable Diffusion 3](https://huggingface.co/stabilityai/stable-diffusion-3-medium)
* [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt)
* [Hunyuan-DiT](https://github.com/Tencent/HunyuanDiT)
* [RIFE](https://github.com/hzwer/ECCV2022-RIFE)
* [ESRGAN](https://github.com/xinntao/ESRGAN)
* [IP-Adapter](https://github.com/tencent-ailab/IP-Adapter)
* [AnimateDiff](https://github.com/guoyww/animatediff/)
* [ControlNet](https://github.com/lllyasviel/ControlNet)
* [Stable Diffusion XL](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
* [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5)

## News

- **June 21, 2024.** 🔥🔥🔥 We propose ExVideo, a post-tuning technique aimed at enhancing the capability of video generation models. We have extended Stable Video Diffusion to generate long videos of up to 128 frames.
  - [Project Page](https://ecnu-cilab.github.io/ExVideoProjectPage/)
  - The source code is released in this repo. See [`examples/ExVideo`](./examples/ExVideo/).
  - Models are released on [HuggingFace](https://huggingface.co/ECNU-CILab/ExVideo-SVD-128f-v1) and [ModelScope](https://modelscope.cn/models/ECNU-CILab/ExVideo-SVD-128f-v1).
  - The technical report is released on [arXiv](https://arxiv.org/abs/2406.14130).
  - You can try ExVideo in this [Demo](https://huggingface.co/spaces/modelscope/ExVideo-SVD-128f-v1)!

- **June 13, 2024.** DiffSynth Studio has been transferred to ModelScope. The developers have transitioned from "I" to "we". Of course, I will still participate in development and maintenance.

- **Jan 29, 2024.** We propose Diffutoon, a fantastic solution for toon shading.
  - [Project Page](https://ecnu-cilab.github.io/DiffutoonProjectPage/)
  - The source code is released in this project.
  - The technical report (IJCAI 2024) is released on [arXiv](https://arxiv.org/abs/2401.16224).

- **Dec 8, 2023.** We decided to develop a new project aimed at unleashing the potential of diffusion models, especially in video synthesis. Development of this project has started.

- **Nov 15, 2023.** We propose FastBlend, a powerful video deflickering algorithm.
  - The sd-webui extension is released on [GitHub](https://github.com/Artiprocher/sd-webui-fastblend).
  - Demo videos are shown on Bilibili, covering three tasks:
    - [Video deflickering](https://www.bilibili.com/video/BV1d94y1W7PE)
    - [Video interpolation](https://www.bilibili.com/video/BV1Lw411m71p)
    - [Image-driven video rendering](https://www.bilibili.com/video/BV1RB4y1Z7LF)
  - The technical report is released on [arXiv](https://arxiv.org/abs/2311.09265).
  - An unofficial ComfyUI extension developed by other users is released on [GitHub](https://github.com/AInseven/ComfyUI-fastblend).

- **Oct 1, 2023.** We release an early version of this project, namely FastSDXL, a first attempt at building a diffusion engine.
  - The source code is released on [GitHub](https://github.com/Artiprocher/FastSDXL).
  - FastSDXL includes a trainable OLSS scheduler for efficiency improvement.
  - The original repo of OLSS is [here](https://github.com/alibaba/EasyNLP/tree/master/diffusion/olss_scheduler).
  - The technical report (CIKM 2023) is released on [arXiv](https://arxiv.org/abs/2305.14677).
  - A demo video is shown on [Bilibili](https://www.bilibili.com/video/BV1w8411y7uj).
  - Since OLSS requires additional training, we don't implement it in this project.

- **Aug 29, 2023.** We propose DiffSynth, a video synthesis framework.
  - [Project Page](https://ecnu-cilab.github.io/DiffSynth.github.io/).
  - The source code is released in [EasyNLP](https://github.com/alibaba/EasyNLP/tree/master/diffusion/DiffSynth).
  - The technical report (ECML PKDD 2024) is released on [arXiv](https://arxiv.org/abs/2308.03463).

## Installation

```
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

## Usage (in Python code)

The Python examples are in [`examples`](./examples/). We provide an overview here.

### Long Video Synthesis

We trained an extended video synthesis model that can generate 128 frames. See [`examples/ExVideo`](./examples/ExVideo/).

https://github.com/modelscope/DiffSynth-Studio/assets/35051019/d97f6aa9-8064-4b5b-9d49-ed6001bb9acc

### Image Synthesis

Generate high-resolution images by breaking the resolution limits of diffusion models! See [`examples/image_synthesis`](./examples/image_synthesis/).

LoRA fine-tuning is supported in [`examples/train`](./examples/train/).

|Model|Example|
|-|-|
|Stable Diffusion|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/6fc84611-8da6-4a1f-8fee-9a34eba3b4a5)|
|Stable Diffusion XL|![1024](https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/67687748-e738-438c-aee5-96096f09ac90)|
|Stable Diffusion 3|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/4df346db-6f91-420a-b4c1-26e205376098)|
|Kolors|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/53ef6f41-da11-4701-8665-9f64392607bf)|
|Hunyuan-DiT|![image_1024](https://github.com/modelscope/DiffSynth-Studio/assets/35051019/60b022c8-df3f-4541-95ab-bf39f2fa8bb5)|

### Toon Shading

Render realistic videos in a flat, cartoon-like style and enable video editing features. See [`examples/Diffutoon`](./examples/Diffutoon/).

https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/b54c05c5-d747-4709-be5e-b39af82404dd

https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/20528af5-5100-474a-8cdc-440b9efdd86c

### Video Stylization

Video stylization without video models. See [`examples/diffsynth`](./examples/diffsynth/).

https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/59fb2f7b-8de0-4481-b79f-0c3a7361a1ea

## Usage (in WebUI)

```
python -m streamlit run DiffSynth_Studio.py
```

https://github.com/Artiprocher/DiffSynth-Studio/assets/35051019/93085557-73f3-4eee-a205-9829591ef954
diffsynth/__init__.py ADDED
@@ -0,0 +1,6 @@
from .data import *
from .models import *
from .prompts import *
from .schedulers import *
from .pipelines import *
from .controlnets import *
diffsynth/controlnets/__init__.py ADDED
@@ -0,0 +1,2 @@
from .controlnet_unit import ControlNetConfigUnit, ControlNetUnit, MultiControlNetManager
from .processors import Annotator
diffsynth/controlnets/controlnet_unit.py ADDED
@@ -0,0 +1,53 @@
import torch
import numpy as np
from .processors import Processor_id


class ControlNetConfigUnit:
    def __init__(self, processor_id: Processor_id, model_path, scale=1.0):
        self.processor_id = processor_id
        self.model_path = model_path
        self.scale = scale


class ControlNetUnit:
    def __init__(self, processor, model, scale=1.0):
        self.processor = processor
        self.model = model
        self.scale = scale


class MultiControlNetManager:
    def __init__(self, controlnet_units=[]):
        self.processors = [unit.processor for unit in controlnet_units]
        self.models = [unit.model for unit in controlnet_units]
        self.scales = [unit.scale for unit in controlnet_units]

    def process_image(self, image, processor_id=None):
        if processor_id is None:
            processed_image = [processor(image) for processor in self.processors]
        else:
            processed_image = [self.processors[processor_id](image)]
        processed_image = torch.concat([
            torch.Tensor(np.array(image_, dtype=np.float32) / 255).permute(2, 0, 1).unsqueeze(0)
            for image_ in processed_image
        ], dim=0)
        return processed_image

    def __call__(
        self,
        sample, timestep, encoder_hidden_states, conditionings,
        tiled=False, tile_size=64, tile_stride=32
    ):
        res_stack = None
        for conditioning, model, scale in zip(conditionings, self.models, self.scales):
            res_stack_ = model(
                sample, timestep, encoder_hidden_states, conditioning,
                tiled=tiled, tile_size=tile_size, tile_stride=tile_stride
            )
            res_stack_ = [res * scale for res in res_stack_]
            if res_stack is None:
                res_stack = res_stack_
            else:
                res_stack = [i + j for i, j in zip(res_stack, res_stack_)]
        return res_stack
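`MultiControlNetManager` sums the scaled residual stacks returned by each ControlNet. A minimal sketch of wiring a unit together; `Annotator` comes from `processors.py` in this commit, while `DummyControlNet` is a stand-in for a real ControlNet model and the image path is a placeholder:

```python
from PIL import Image
from diffsynth.controlnets import ControlNetUnit, MultiControlNetManager, Annotator

class DummyControlNet:
    # Stand-in with the call signature MultiControlNetManager expects;
    # a real ControlNet loaded from a checkpoint would be used instead.
    def __call__(self, sample, timestep, encoder_hidden_states, conditioning,
                 tiled=False, tile_size=64, tile_stride=32):
        return [sample * 0.0]  # one residual tensor per injection point in a real model

unit = ControlNetUnit(processor=Annotator("canny"), model=DummyControlNet(), scale=0.8)
manager = MultiControlNetManager([unit])

image = Image.open("condition.png").convert("RGB")   # placeholder path
conditionings = manager.process_image(image)          # tensor of shape (num_units, 3, H, W) in [0, 1]
```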
diffsynth/controlnets/processors.py ADDED
@@ -0,0 +1,51 @@
from typing_extensions import Literal, TypeAlias
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from controlnet_aux.processor import (
        CannyDetector, MidasDetector, HEDdetector, LineartDetector, LineartAnimeDetector, OpenposeDetector
    )


Processor_id: TypeAlias = Literal[
    "canny", "depth", "softedge", "lineart", "lineart_anime", "openpose", "tile"
]

class Annotator:
    def __init__(self, processor_id: Processor_id, model_path="models/Annotators", detect_resolution=None, device='cuda'):
        if processor_id == "canny":
            self.processor = CannyDetector()
        elif processor_id == "depth":
            self.processor = MidasDetector.from_pretrained(model_path).to(device)
        elif processor_id == "softedge":
            self.processor = HEDdetector.from_pretrained(model_path).to(device)
        elif processor_id == "lineart":
            self.processor = LineartDetector.from_pretrained(model_path).to(device)
        elif processor_id == "lineart_anime":
            self.processor = LineartAnimeDetector.from_pretrained(model_path).to(device)
        elif processor_id == "openpose":
            self.processor = OpenposeDetector.from_pretrained(model_path).to(device)
        elif processor_id == "tile":
            self.processor = None
        else:
            raise ValueError(f"Unsupported processor_id: {processor_id}")

        self.processor_id = processor_id
        self.detect_resolution = detect_resolution

    def __call__(self, image):
        width, height = image.size
        if self.processor_id == "openpose":
            kwargs = {
                "include_body": True,
                "include_hand": True,
                "include_face": True
            }
        else:
            kwargs = {}
        if self.processor is not None:
            detect_resolution = self.detect_resolution if self.detect_resolution is not None else min(width, height)
            image = self.processor(image, detect_resolution=detect_resolution, image_resolution=min(width, height), **kwargs)
        image = image.resize((width, height))
        return image
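A minimal sketch of the annotator on its own: `"canny"` needs no pretrained weights, while the other processor IDs load detectors from `model_path` (default `"models/Annotators"`) onto `device`. The input path is a placeholder:

```python
from PIL import Image
from diffsynth.controlnets import Annotator

annotator = Annotator("canny")                    # Canny edges; no weights required
image = Image.open("input.png").convert("RGB")    # placeholder path
edges = annotator(image)                          # PIL image, resized back to the input size
edges.save("canny.png")
```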
diffsynth/data/__init__.py ADDED
@@ -0,0 +1 @@
from .video import VideoData, save_video, save_frames
diffsynth/data/video.py ADDED
@@ -0,0 +1,148 @@
import imageio, os
import numpy as np
from PIL import Image
from tqdm import tqdm


class LowMemoryVideo:
    def __init__(self, file_name):
        self.reader = imageio.get_reader(file_name)

    def __len__(self):
        return self.reader.count_frames()

    def __getitem__(self, item):
        return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")

    def __del__(self):
        self.reader.close()


def split_file_name(file_name):
    result = []
    number = -1
    for i in file_name:
        if ord(i)>=ord("0") and ord(i)<=ord("9"):
            if number == -1:
                number = 0
            number = number*10 + ord(i) - ord("0")
        else:
            if number != -1:
                result.append(number)
                number = -1
            result.append(i)
    if number != -1:
        result.append(number)
    result = tuple(result)
    return result


def search_for_images(folder):
    file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")]
    file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
    file_list = [i[1] for i in sorted(file_list)]
    file_list = [os.path.join(folder, i) for i in file_list]
    return file_list


class LowMemoryImageFolder:
    def __init__(self, folder, file_list=None):
        if file_list is None:
            self.file_list = search_for_images(folder)
        else:
            self.file_list = [os.path.join(folder, file_name) for file_name in file_list]

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, item):
        return Image.open(self.file_list[item]).convert("RGB")

    def __del__(self):
        pass


def crop_and_resize(image, height, width):
    image = np.array(image)
    image_height, image_width, _ = image.shape
    if image_height / image_width < height / width:
        croped_width = int(image_height / height * width)
        left = (image_width - croped_width) // 2
        image = image[:, left: left+croped_width]
        image = Image.fromarray(image).resize((width, height))
    else:
        croped_height = int(image_width / width * height)
        left = (image_height - croped_height) // 2
        image = image[left: left+croped_height, :]
        image = Image.fromarray(image).resize((width, height))
    return image


class VideoData:
    def __init__(self, video_file=None, image_folder=None, height=None, width=None, **kwargs):
        if video_file is not None:
            self.data_type = "video"
            self.data = LowMemoryVideo(video_file, **kwargs)
        elif image_folder is not None:
            self.data_type = "images"
            self.data = LowMemoryImageFolder(image_folder, **kwargs)
        else:
            raise ValueError("Cannot open video or image folder")
        self.length = None
        self.set_shape(height, width)

    def raw_data(self):
        frames = []
        for i in range(self.__len__()):
            frames.append(self.__getitem__(i))
        return frames

    def set_length(self, length):
        self.length = length

    def set_shape(self, height, width):
        self.height = height
        self.width = width

    def __len__(self):
        if self.length is None:
            return len(self.data)
        else:
            return self.length

    def shape(self):
        if self.height is not None and self.width is not None:
            return self.height, self.width
        else:
            height, width, _ = self.__getitem__(0).shape
            return height, width

    def __getitem__(self, item):
        frame = self.data.__getitem__(item)
        width, height = frame.size
        if self.height is not None and self.width is not None:
            if self.height != height or self.width != width:
                frame = crop_and_resize(frame, self.height, self.width)
        return frame

    def __del__(self):
        pass

    def save_images(self, folder):
        os.makedirs(folder, exist_ok=True)
        for i in tqdm(range(self.__len__()), desc="Saving images"):
            frame = self.__getitem__(i)
            frame.save(os.path.join(folder, f"{i}.png"))


def save_video(frames, save_path, fps, quality=9):
    writer = imageio.get_writer(save_path, fps=fps, quality=quality)
    for frame in tqdm(frames, desc="Saving video"):
        frame = np.array(frame)
        writer.append_data(frame)
    writer.close()

def save_frames(frames, save_path):
    os.makedirs(save_path, exist_ok=True)
    for i, frame in enumerate(tqdm(frames, desc="Saving images")):
        frame.save(os.path.join(save_path, f"{i}.png"))
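A minimal sketch of the `VideoData` interface defined above: frames are decoded lazily through `LowMemoryVideo` / `LowMemoryImageFolder` and are center-cropped and resized on access once a target shape is set. The video path is a placeholder:

```python
from diffsynth.data import VideoData, save_frames

video = VideoData(video_file="input.mp4", height=512, width=512)   # placeholder path
print(len(video), video.shape())      # frame count and (height, width)
first_frame = video[0]                # PIL image, center-cropped and resized to 512x512
save_frames(video.raw_data()[:8], "frames_out")   # dump the first 8 frames as PNGs
```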
diffsynth/extensions/ESRGAN/__init__.py ADDED
@@ -0,0 +1,118 @@
import torch
from einops import repeat
from PIL import Image
import numpy as np


class ResidualDenseBlock(torch.nn.Module):

    def __init__(self, num_feat=64, num_grow_ch=32):
        super(ResidualDenseBlock, self).__init__()
        self.conv1 = torch.nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1)
        self.conv2 = torch.nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1)
        self.conv3 = torch.nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1)
        self.conv4 = torch.nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1)
        self.conv5 = torch.nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1)
        self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        x1 = self.lrelu(self.conv1(x))
        x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
        x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
        x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
        x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
        return x5 * 0.2 + x


class RRDB(torch.nn.Module):

    def __init__(self, num_feat, num_grow_ch=32):
        super(RRDB, self).__init__()
        self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch)
        self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch)
        self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch)

    def forward(self, x):
        out = self.rdb1(x)
        out = self.rdb2(out)
        out = self.rdb3(out)
        return out * 0.2 + x


class RRDBNet(torch.nn.Module):

    def __init__(self, num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32):
        super(RRDBNet, self).__init__()
        self.conv_first = torch.nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)
        self.body = torch.torch.nn.Sequential(*[RRDB(num_feat=num_feat, num_grow_ch=num_grow_ch) for _ in range(num_block)])
        self.conv_body = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        # upsample
        self.conv_up1 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_up2 = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_hr = torch.nn.Conv2d(num_feat, num_feat, 3, 1, 1)
        self.conv_last = torch.nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)
        self.lrelu = torch.nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        feat = x
        feat = self.conv_first(feat)
        body_feat = self.conv_body(self.body(feat))
        feat = feat + body_feat
        # upsample
        feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
        feat = self.lrelu(self.conv_up1(feat))
        feat = repeat(feat, "B C H W -> B C (H 2) (W 2)")
        feat = self.lrelu(self.conv_up2(feat))
        out = self.conv_last(self.lrelu(self.conv_hr(feat)))
        return out


class ESRGAN(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    @staticmethod
    def from_pretrained(model_path):
        model = RRDBNet()
        state_dict = torch.load(model_path, map_location="cpu")["params_ema"]
        model.load_state_dict(state_dict)
        model.eval()
        return ESRGAN(model)

    def process_image(self, image):
        image = torch.Tensor(np.array(image, dtype=np.float32) / 255).permute(2, 0, 1)
        return image

    def process_images(self, images):
        images = [self.process_image(image) for image in images]
        images = torch.stack(images)
        return images

    def decode_images(self, images):
        images = (images.permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
        images = [Image.fromarray(image) for image in images]
        return images

    @torch.no_grad()
    def upscale(self, images, batch_size=4, progress_bar=lambda x:x):
        # Preprocess
        input_tensor = self.process_images(images)

        # Interpolate
        output_tensor = []
        for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
            batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
            batch_input_tensor = input_tensor[batch_id: batch_id_]
            batch_input_tensor = batch_input_tensor.to(
                device=self.model.conv_first.weight.device,
                dtype=self.model.conv_first.weight.dtype)
            batch_output_tensor = self.model(batch_input_tensor)
            output_tensor.append(batch_output_tensor.cpu())

        # Output
        output_tensor = torch.concat(output_tensor, dim=0)

        # To images
        output_images = self.decode_images(output_tensor)
        return output_images
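The `ESRGAN` wrapper above upscales PIL images by 4x (two nearest-neighbour 2x repeats around the RRDB body). A minimal sketch, assuming an RRDBNet-style checkpoint that stores its weights under `"params_ema"` as `from_pretrained` expects; the paths are placeholders:

```python
from PIL import Image
from diffsynth.extensions.ESRGAN import ESRGAN

sr = ESRGAN.from_pretrained("models/ESRGAN_x4.pth")      # placeholder checkpoint path
frames = [Image.open("frame_0.png").convert("RGB")]      # placeholder input
upscaled = sr.upscale(frames, batch_size=1)
upscaled[0].save("frame_0_4x.png")
```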
diffsynth/extensions/FastBlend/__init__.py ADDED
@@ -0,0 +1,63 @@
from .runners.fast import TableManager, PyramidPatchMatcher
from PIL import Image
import numpy as np
import cupy as cp


class FastBlendSmoother:
    def __init__(self):
        self.batch_size = 8
        self.window_size = 64
        self.ebsynth_config = {
            "minimum_patch_size": 5,
            "threads_per_block": 8,
            "num_iter": 5,
            "gpu_id": 0,
            "guide_weight": 10.0,
            "initialize": "identity",
            "tracking_window_size": 0,
        }

    @staticmethod
    def from_model_manager(model_manager):
        # TODO: fetch GPU ID from model_manager
        return FastBlendSmoother()

    def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config):
        frames_guide = [np.array(frame) for frame in frames_guide]
        frames_style = [np.array(frame) for frame in frames_style]
        table_manager = TableManager()
        patch_match_engine = PyramidPatchMatcher(
            image_height=frames_style[0].shape[0],
            image_width=frames_style[0].shape[1],
            channel=3,
            **ebsynth_config
        )
        # left part
        table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="FastBlend Step 1/4")
        table_l = table_manager.remapping_table_to_blending_table(table_l)
        table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="FastBlend Step 2/4")
        # right part
        table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="FastBlend Step 3/4")
        table_r = table_manager.remapping_table_to_blending_table(table_r)
        table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, desc="FastBlend Step 4/4")[::-1]
        # merge
        frames = []
        for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r):
            weight_m = -1
            weight = weight_l + weight_m + weight_r
            frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight)
            frames.append(frame)
        frames = [Image.fromarray(frame.clip(0, 255).astype("uint8")) for frame in frames]
        return frames

    def __call__(self, rendered_frames, original_frames=None, **kwargs):
        frames = self.run(
            original_frames, rendered_frames,
            self.batch_size, self.window_size, self.ebsynth_config
        )
        mempool = cp.get_default_memory_pool()
        pinned_mempool = cp.get_default_pinned_memory_pool()
        mempool.free_all_blocks()
        pinned_mempool.free_all_blocks()
        return frames
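A minimal sketch of deflickering a rendered sequence with `FastBlendSmoother`: the original (guide) frames supply motion, the rendered (style) frames are blended over a sliding window; a CUDA GPU with CuPy is required. The paths are placeholders:

```python
from diffsynth.data import VideoData
from diffsynth.extensions.FastBlend import FastBlendSmoother

guide = VideoData(video_file="original.mp4", height=512, width=512).raw_data()   # placeholder path
style = VideoData(video_file="rendered.mp4", height=512, width=512).raw_data()   # placeholder path

smoother = FastBlendSmoother()     # defaults: batch_size=8, window_size=64
smoother.window_size = 32          # a smaller window is enough for short clips
smoothed = smoother(rendered_frames=style, original_frames=guide)
```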
diffsynth/extensions/FastBlend/api.py ADDED
@@ -0,0 +1,397 @@
from .runners import AccurateModeRunner, FastModeRunner, BalancedModeRunner, InterpolationModeRunner, InterpolationModeSingleFrameRunner
from .data import VideoData, get_video_fps, save_video, search_for_images
import os
import gradio as gr


def check_input_for_blending(video_guide, video_guide_folder, video_style, video_style_folder):
    frames_guide = VideoData(video_guide, video_guide_folder)
    frames_style = VideoData(video_style, video_style_folder)
    message = ""
    if len(frames_guide) < len(frames_style):
        message += f"The number of frames mismatches. Only the first {len(frames_guide)} frames of style video will be used.\n"
        frames_style.set_length(len(frames_guide))
    elif len(frames_guide) > len(frames_style):
        message += f"The number of frames mismatches. Only the first {len(frames_style)} frames of guide video will be used.\n"
        frames_guide.set_length(len(frames_style))
    height_guide, width_guide = frames_guide.shape()
    height_style, width_style = frames_style.shape()
    if height_guide != height_style or width_guide != width_style:
        message += f"The shape of frames mismatches. The frames in style video will be resized to (height: {height_guide}, width: {width_guide})\n"
        frames_style.set_shape(height_guide, width_guide)
    return frames_guide, frames_style, message


def smooth_video(
    video_guide,
    video_guide_folder,
    video_style,
    video_style_folder,
    mode,
    window_size,
    batch_size,
    tracking_window_size,
    output_path,
    fps,
    minimum_patch_size,
    num_iter,
    guide_weight,
    initialize,
    progress = None,
):
    # input
    frames_guide, frames_style, message = check_input_for_blending(video_guide, video_guide_folder, video_style, video_style_folder)
    if len(message) > 0:
        print(message)
    # output
    if output_path == "":
        if video_style is None:
            output_path = os.path.join(video_style_folder, "output")
        else:
            output_path = os.path.join(os.path.split(video_style)[0], "output")
        os.makedirs(output_path, exist_ok=True)
        print("No valid output_path. Your video will be saved here:", output_path)
    elif not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)
        print("Your video will be saved here:", output_path)
    frames_path = os.path.join(output_path, "frames")
    video_path = os.path.join(output_path, "video.mp4")
    os.makedirs(frames_path, exist_ok=True)
    # process
    if mode == "Fast" or mode == "Balanced":
        tracking_window_size = 0
    ebsynth_config = {
        "minimum_patch_size": minimum_patch_size,
        "threads_per_block": 8,
        "num_iter": num_iter,
        "gpu_id": 0,
        "guide_weight": guide_weight,
        "initialize": initialize,
        "tracking_window_size": tracking_window_size,
    }
    if mode == "Fast":
        FastModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
    elif mode == "Balanced":
        BalancedModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
    elif mode == "Accurate":
        AccurateModeRunner().run(frames_guide, frames_style, batch_size=batch_size, window_size=window_size, ebsynth_config=ebsynth_config, save_path=frames_path)
    # output
    try:
        fps = int(fps)
    except:
        fps = get_video_fps(video_style) if video_style is not None else 30
    print("Fps:", fps)
    print("Saving video...")
    video_path = save_video(frames_path, video_path, num_frames=len(frames_style), fps=fps)
    print("Success!")
    print("Your frames are here:", frames_path)
    print("Your video is here:", video_path)
    return output_path, fps, video_path


class KeyFrameMatcher:
    def __init__(self):
        pass

    def extract_number_from_filename(self, file_name):
        result = []
        number = -1
        for i in file_name:
            if ord(i)>=ord("0") and ord(i)<=ord("9"):
                if number == -1:
                    number = 0
                number = number*10 + ord(i) - ord("0")
            else:
                if number != -1:
                    result.append(number)
                    number = -1
        if number != -1:
            result.append(number)
        result = tuple(result)
        return result

    def extract_number_from_filenames(self, file_names):
        numbers = [self.extract_number_from_filename(file_name) for file_name in file_names]
        min_length = min(len(i) for i in numbers)
        for i in range(min_length-1, -1, -1):
            if len(set(number[i] for number in numbers))==len(file_names):
                return [number[i] for number in numbers]
        return list(range(len(file_names)))

    def match_using_filename(self, file_names_a, file_names_b):
        file_names_b_set = set(file_names_b)
        matched_file_name = []
        for file_name in file_names_a:
            if file_name not in file_names_b_set:
                matched_file_name.append(None)
            else:
                matched_file_name.append(file_name)
        return matched_file_name

    def match_using_numbers(self, file_names_a, file_names_b):
        numbers_a = self.extract_number_from_filenames(file_names_a)
        numbers_b = self.extract_number_from_filenames(file_names_b)
        numbers_b_dict = {number: file_name for number, file_name in zip(numbers_b, file_names_b)}
        matched_file_name = []
        for number in numbers_a:
            if number in numbers_b_dict:
                matched_file_name.append(numbers_b_dict[number])
            else:
                matched_file_name.append(None)
        return matched_file_name

    def match_filenames(self, file_names_a, file_names_b):
        matched_file_name = self.match_using_filename(file_names_a, file_names_b)
        if sum([i is not None for i in matched_file_name]) > 0:
            return matched_file_name
        matched_file_name = self.match_using_numbers(file_names_a, file_names_b)
        return matched_file_name


def detect_frames(frames_path, keyframes_path):
    if not os.path.exists(frames_path) and not os.path.exists(keyframes_path):
        return "Please input the directory of guide video and rendered frames"
    elif not os.path.exists(frames_path):
        return "Please input the directory of guide video"
    elif not os.path.exists(keyframes_path):
        return "Please input the directory of rendered frames"
    frames = [os.path.split(i)[-1] for i in search_for_images(frames_path)]
    keyframes = [os.path.split(i)[-1] for i in search_for_images(keyframes_path)]
    if len(frames)==0:
        return f"No images detected in {frames_path}"
    if len(keyframes)==0:
        return f"No images detected in {keyframes_path}"
    matched_keyframes = KeyFrameMatcher().match_filenames(frames, keyframes)
    max_filename_length = max([len(i) for i in frames])
    if sum([i is not None for i in matched_keyframes])==0:
        message = ""
        for frame, matched_keyframe in zip(frames, matched_keyframes):
            message += frame + " " * (max_filename_length - len(frame) + 1)
            message += "--> No matched keyframes\n"
    else:
        message = ""
        for frame, matched_keyframe in zip(frames, matched_keyframes):
            message += frame + " " * (max_filename_length - len(frame) + 1)
            if matched_keyframe is None:
                message += "--> [to be rendered]\n"
            else:
                message += f"--> {matched_keyframe}\n"
    return message


def check_input_for_interpolating(frames_path, keyframes_path):
    # search for images
    frames = [os.path.split(i)[-1] for i in search_for_images(frames_path)]
    keyframes = [os.path.split(i)[-1] for i in search_for_images(keyframes_path)]
    # match frames
    matched_keyframes = KeyFrameMatcher().match_filenames(frames, keyframes)
    file_list = [file_name for file_name in matched_keyframes if file_name is not None]
    index_style = [i for i, file_name in enumerate(matched_keyframes) if file_name is not None]
    frames_guide = VideoData(None, frames_path)
    frames_style = VideoData(None, keyframes_path, file_list=file_list)
    # match shape
    message = ""
    height_guide, width_guide = frames_guide.shape()
    height_style, width_style = frames_style.shape()
    if height_guide != height_style or width_guide != width_style:
        message += f"The shape of frames mismatches. The rendered keyframes will be resized to (height: {height_guide}, width: {width_guide})\n"
        frames_style.set_shape(height_guide, width_guide)
    return frames_guide, frames_style, index_style, message


def interpolate_video(
    frames_path,
    keyframes_path,
    output_path,
    fps,
    batch_size,
    tracking_window_size,
    minimum_patch_size,
    num_iter,
    guide_weight,
    initialize,
    progress = None,
):
    # input
    frames_guide, frames_style, index_style, message = check_input_for_interpolating(frames_path, keyframes_path)
    if len(message) > 0:
        print(message)
    # output
    if output_path == "":
        output_path = os.path.join(keyframes_path, "output")
        os.makedirs(output_path, exist_ok=True)
        print("No valid output_path. Your video will be saved here:", output_path)
    elif not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)
        print("Your video will be saved here:", output_path)
    output_frames_path = os.path.join(output_path, "frames")
    output_video_path = os.path.join(output_path, "video.mp4")
    os.makedirs(output_frames_path, exist_ok=True)
    # process
    ebsynth_config = {
        "minimum_patch_size": minimum_patch_size,
        "threads_per_block": 8,
        "num_iter": num_iter,
        "gpu_id": 0,
        "guide_weight": guide_weight,
        "initialize": initialize,
        "tracking_window_size": tracking_window_size
    }
    if len(index_style)==1:
        InterpolationModeSingleFrameRunner().run(frames_guide, frames_style, index_style, batch_size=batch_size, ebsynth_config=ebsynth_config, save_path=output_frames_path)
    else:
        InterpolationModeRunner().run(frames_guide, frames_style, index_style, batch_size=batch_size, ebsynth_config=ebsynth_config, save_path=output_frames_path)
    try:
        fps = int(fps)
    except:
        fps = 30
    print("Fps:", fps)
    print("Saving video...")
    video_path = save_video(output_frames_path, output_video_path, num_frames=len(frames_guide), fps=fps)
    print("Success!")
    print("Your frames are here:", output_frames_path)
    print("Your video is here:", video_path)
    return output_path, fps, video_path


def on_ui_tabs():
    with gr.Blocks(analytics_enabled=False) as ui_component:
        with gr.Tab("Blend"):
            gr.Markdown("""
            # Blend

            Given a guide video and a style video, this algorithm will make the style video fluent according to the motion features of the guide video. Click [here](https://github.com/Artiprocher/sd-webui-fastblend/assets/35051019/208d902d-6aba-48d7-b7d5-cd120ebd306d) to see the example. Note that this extension doesn't support long videos. Please use short videos (e.g., several seconds). The algorithm is mainly designed for 512*512 resolution. Please use a larger `Minimum patch size` for higher resolution.
            """)
            with gr.Row():
                with gr.Column():
                    with gr.Tab("Guide video"):
                        video_guide = gr.Video(label="Guide video")
                    with gr.Tab("Guide video (images format)"):
                        video_guide_folder = gr.Textbox(label="Guide video (images format)", value="")
                with gr.Column():
                    with gr.Tab("Style video"):
                        video_style = gr.Video(label="Style video")
                    with gr.Tab("Style video (images format)"):
                        video_style_folder = gr.Textbox(label="Style video (images format)", value="")
                with gr.Column():
                    output_path = gr.Textbox(label="Output directory", value="", placeholder="Leave empty to use the directory of style video")
                    fps = gr.Textbox(label="Fps", value="", placeholder="Leave empty to use the default fps")
                    video_output = gr.Video(label="Output video", interactive=False, show_share_button=True)
                    btn = gr.Button(value="Blend")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("# Settings")
                    mode = gr.Radio(["Fast", "Balanced", "Accurate"], label="Inference mode", value="Fast", interactive=True)
                    window_size = gr.Slider(label="Sliding window size", value=15, minimum=1, maximum=1000, step=1, interactive=True)
                    batch_size = gr.Slider(label="Batch size", value=8, minimum=1, maximum=128, step=1, interactive=True)
                    tracking_window_size = gr.Slider(label="Tracking window size (only for accurate mode)", value=0, minimum=0, maximum=10, step=1, interactive=True)
                    gr.Markdown("## Advanced Settings")
                    minimum_patch_size = gr.Slider(label="Minimum patch size (odd number)", value=5, minimum=5, maximum=99, step=2, interactive=True)
                    num_iter = gr.Slider(label="Number of iterations", value=5, minimum=1, maximum=10, step=1, interactive=True)
                    guide_weight = gr.Slider(label="Guide weight", value=10.0, minimum=0.0, maximum=100.0, step=0.1, interactive=True)
|
292 |
+
initialize = gr.Radio(["identity", "random"], label="NNF initialization", value="identity", interactive=True)
|
293 |
+
with gr.Column():
|
294 |
+
gr.Markdown("""
|
295 |
+
# Reference
|
296 |
+
|
297 |
+
* Output directory: the directory to save the video.
|
298 |
+
* Inference mode
|
299 |
+
|
300 |
+
|Mode|Time|Memory|Quality|Frame by frame output|Description|
|
301 |
+
|-|-|-|-|-|-|
|
302 |
+
|Fast|■|■■■|■■|No|Blend the frames using a tree-like data structure, which requires much RAM but is fast.|
|
303 |
+
|Balanced|■■|■|■■|Yes|Blend the frames naively.|
|
304 |
+
|Accurate|■■■|■|■■■|Yes|Blend the frames and align them together for higher video quality. When [batch size] >= [sliding window size] * 2 + 1, the performance is the best.|
|
305 |
+
|
306 |
+
* Sliding window size: our algorithm will blend the frames in a sliding windows. If the size is n, each frame will be blended with the last n frames and the next n frames. A large sliding window can make the video fluent but sometimes smoggy.
|
307 |
+
* Batch size: a larger batch size makes the program faster but requires more VRAM.
|
308 |
+
* Tracking window size (only for accurate mode): The size of window in which our algorithm tracks moving objects. Empirically, 1 is enough.
|
309 |
+
* Advanced settings
|
310 |
+
* Minimum patch size (odd number): the minimum patch size used for patch matching. (Default: 5)
|
311 |
+
* Number of iterations: the number of iterations of patch matching. (Default: 5)
|
312 |
+
* Guide weight: a parameter that determines how much motion feature applied to the style video. (Default: 10)
|
313 |
+
* NNF initialization: how to initialize the NNF (Nearest Neighbor Field). (Default: identity)
|
314 |
+
""")
|
315 |
+
btn.click(
|
316 |
+
smooth_video,
|
317 |
+
inputs=[
|
318 |
+
video_guide,
|
319 |
+
video_guide_folder,
|
320 |
+
video_style,
|
321 |
+
video_style_folder,
|
322 |
+
mode,
|
323 |
+
window_size,
|
324 |
+
batch_size,
|
325 |
+
tracking_window_size,
|
326 |
+
output_path,
|
327 |
+
fps,
|
328 |
+
minimum_patch_size,
|
329 |
+
num_iter,
|
330 |
+
guide_weight,
|
331 |
+
initialize
|
332 |
+
],
|
333 |
+
outputs=[output_path, fps, video_output]
|
334 |
+
)
|
335 |
+
with gr.Tab("Interpolate"):
|
336 |
+
gr.Markdown("""
|
337 |
+
# Interpolate
|
338 |
+
|
339 |
+
Given a guide video and some rendered keyframes, this algorithm will render the remaining frames. Click [here](https://github.com/Artiprocher/sd-webui-fastblend/assets/35051019/3490c5b4-8f67-478f-86de-f9adc2ace16a) to see the example. The algorithm is experimental and is only tested for 512*512 resolution.
|
340 |
+
""")
|
341 |
+
with gr.Row():
|
342 |
+
with gr.Column():
|
343 |
+
with gr.Row():
|
344 |
+
with gr.Column():
|
345 |
+
video_guide_folder_ = gr.Textbox(label="Guide video (images format)", value="")
|
346 |
+
with gr.Column():
|
347 |
+
rendered_keyframes_ = gr.Textbox(label="Rendered keyframes (images format)", value="")
|
348 |
+
with gr.Row():
|
349 |
+
detected_frames = gr.Textbox(label="Detected frames", value="Please input the directory of guide video and rendered frames", lines=9, max_lines=9, interactive=False)
|
350 |
+
video_guide_folder_.change(detect_frames, inputs=[video_guide_folder_, rendered_keyframes_], outputs=detected_frames)
|
351 |
+
rendered_keyframes_.change(detect_frames, inputs=[video_guide_folder_, rendered_keyframes_], outputs=detected_frames)
|
352 |
+
with gr.Column():
|
353 |
+
output_path_ = gr.Textbox(label="Output directory", value="", placeholder="Leave empty to use the directory of rendered keyframes")
|
354 |
+
fps_ = gr.Textbox(label="Fps", value="", placeholder="Leave empty to use the default fps")
|
355 |
+
video_output_ = gr.Video(label="Output video", interactive=False, show_share_button=True)
|
356 |
+
btn_ = gr.Button(value="Interpolate")
|
357 |
+
with gr.Row():
|
358 |
+
with gr.Column():
|
359 |
+
gr.Markdown("# Settings")
|
360 |
+
batch_size_ = gr.Slider(label="Batch size", value=8, minimum=1, maximum=128, step=1, interactive=True)
|
361 |
+
tracking_window_size_ = gr.Slider(label="Tracking window size", value=0, minimum=0, maximum=10, step=1, interactive=True)
|
362 |
+
gr.Markdown("## Advanced Settings")
|
363 |
+
minimum_patch_size_ = gr.Slider(label="Minimum patch size (odd number, larger is better)", value=15, minimum=5, maximum=99, step=2, interactive=True)
|
364 |
+
num_iter_ = gr.Slider(label="Number of iterations", value=5, minimum=1, maximum=10, step=1, interactive=True)
|
365 |
+
guide_weight_ = gr.Slider(label="Guide weight", value=10.0, minimum=0.0, maximum=100.0, step=0.1, interactive=True)
|
366 |
+
initialize_ = gr.Radio(["identity", "random"], label="NNF initialization", value="identity", interactive=True)
|
367 |
+
with gr.Column():
|
368 |
+
gr.Markdown("""
|
369 |
+
# Reference
|
370 |
+
|
371 |
+
* Output directory: the directory to save the video.
|
372 |
+
* Batch size: a larger batch size makes the program faster but requires more VRAM.
|
373 |
+
* Tracking window size (only for accurate mode): The size of window in which our algorithm tracks moving objects. Empirically, 1 is enough.
|
374 |
+
* Advanced settings
|
375 |
+
* Minimum patch size (odd number): the minimum patch size used for patch matching. **This parameter should be larger than that in blending. (Default: 15)**
|
376 |
+
* Number of iterations: the number of iterations of patch matching. (Default: 5)
|
377 |
+
* Guide weight: a parameter that determines how much motion feature applied to the style video. (Default: 10)
|
378 |
+
* NNF initialization: how to initialize the NNF (Nearest Neighbor Field). (Default: identity)
|
379 |
+
""")
|
380 |
+
btn_.click(
|
381 |
+
interpolate_video,
|
382 |
+
inputs=[
|
383 |
+
video_guide_folder_,
|
384 |
+
rendered_keyframes_,
|
385 |
+
output_path_,
|
386 |
+
fps_,
|
387 |
+
batch_size_,
|
388 |
+
tracking_window_size_,
|
389 |
+
minimum_patch_size_,
|
390 |
+
num_iter_,
|
391 |
+
guide_weight_,
|
392 |
+
initialize_,
|
393 |
+
],
|
394 |
+
outputs=[output_path_, fps_, video_output_]
|
395 |
+
)
|
396 |
+
|
397 |
+
return [(ui_component, "FastBlend", "FastBlend_ui")]
|
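For reference, a minimal sketch of driving `interpolate_video` outside the Gradio UI; the directory paths are placeholders and the keyword values simply mirror the defaults exposed by the sliders above.

# Illustrative sketch (not part of the commit): calling interpolate_video directly.
# The paths are hypothetical; the arguments mirror the UI defaults.
output_path, fps, video_path = interpolate_video(
    frames_path="path/to/guide_frames",          # guide video as an image folder
    keyframes_path="path/to/rendered_keyframes",
    output_path="",                              # empty -> saved next to the keyframes
    fps="30",
    batch_size=8,
    tracking_window_size=0,
    minimum_patch_size=15,
    num_iter=5,
    guide_weight=10.0,
    initialize="identity",
)
print(video_path)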
diffsynth/extensions/FastBlend/cupy_kernels.py
ADDED
@@ -0,0 +1,119 @@
import cupy as cp

remapping_kernel = cp.RawKernel(r'''
extern "C" __global__
void remap(
    const int height,
    const int width,
    const int channel,
    const int patch_size,
    const int pad_size,
    const float* source_style,
    const int* nnf,
    float* target_style
) {
    const int r = (patch_size - 1) / 2;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x >= height or y >= width) return;
    const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
    const int pid = (x + pad_size) * (width + pad_size * 2) + (y + pad_size);
    const int min_px = x < r ? -x : -r;
    const int max_px = x + r > height - 1 ? height - 1 - x : r;
    const int min_py = y < r ? -y : -r;
    const int max_py = y + r > width - 1 ? width - 1 - y : r;
    int num = 0;
    for (int px = min_px; px <= max_px; px++){
        for (int py = min_py; py <= max_py; py++){
            const int nid = (x + px) * width + y + py;
            const int x_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 0] - px;
            const int y_ = nnf[blockIdx.z * height * width * 2 + nid*2 + 1] - py;
            if (x_ < 0 or y_ < 0 or x_ >= height or y_ >= width)continue;
            const int pid_ = (x_ + pad_size) * (width + pad_size * 2) + (y_ + pad_size);
            num++;
            for (int c = 0; c < channel; c++){
                target_style[z + pid * channel + c] += source_style[z + pid_ * channel + c];
            }
        }
    }
    for (int c = 0; c < channel; c++){
        target_style[z + pid * channel + c] /= num;
    }
}
''', 'remap')


patch_error_kernel = cp.RawKernel(r'''
extern "C" __global__
void patch_error(
    const int height,
    const int width,
    const int channel,
    const int patch_size,
    const int pad_size,
    const float* source,
    const int* nnf,
    const float* target,
    float* error
) {
    const int r = (patch_size - 1) / 2;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
    if (x >= height or y >= width) return;
    const int x_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 0];
    const int y_ = nnf[blockIdx.z * height * width * 2 + (x * width + y)*2 + 1];
    float e = 0;
    for (int px = -r; px <= r; px++){
        for (int py = -r; py <= r; py++){
            const int pid = (x + pad_size + px) * (width + pad_size * 2) + y + pad_size + py;
            const int pid_ = (x_ + pad_size + px) * (width + pad_size * 2) + y_ + pad_size + py;
            for (int c = 0; c < channel; c++){
                const float diff = target[z + pid * channel + c] - source[z + pid_ * channel + c];
                e += diff * diff;
            }
        }
    }
    error[blockIdx.z * height * width + x * width + y] = e;
}
''', 'patch_error')


pairwise_patch_error_kernel = cp.RawKernel(r'''
extern "C" __global__
void pairwise_patch_error(
    const int height,
    const int width,
    const int channel,
    const int patch_size,
    const int pad_size,
    const float* source_a,
    const int* nnf_a,
    const float* source_b,
    const int* nnf_b,
    float* error
) {
    const int r = (patch_size - 1) / 2;
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    const int z = blockIdx.z * (height + pad_size * 2) * (width + pad_size * 2) * channel;
    if (x >= height or y >= width) return;
    const int z_nnf = blockIdx.z * height * width * 2 + (x * width + y) * 2;
    const int x_a = nnf_a[z_nnf + 0];
    const int y_a = nnf_a[z_nnf + 1];
    const int x_b = nnf_b[z_nnf + 0];
    const int y_b = nnf_b[z_nnf + 1];
    float e = 0;
    for (int px = -r; px <= r; px++){
        for (int py = -r; py <= r; py++){
            const int pid_a = (x_a + pad_size + px) * (width + pad_size * 2) + y_a + pad_size + py;
            const int pid_b = (x_b + pad_size + px) * (width + pad_size * 2) + y_b + pad_size + py;
            for (int c = 0; c < channel; c++){
                const float diff = source_a[z + pid_a * channel + c] - source_b[z + pid_b * channel + c];
                e += diff * diff;
            }
        }
    }
    error[blockIdx.z * height * width + x * width + y] = e;
}
''', 'pairwise_patch_error')
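As a minimal sketch of how these `cp.RawKernel` objects are launched (the same grid/block/args pattern `PatchMatcher.apply_nnf_to_image` uses in patch_match.py below), with toy shapes that are only assumptions for illustration:

# Illustrative sketch (not part of the commit): launching remapping_kernel by hand.
# The shapes are toy values; the real setup lives in PatchMatcher.
import cupy as cp

height, width, channel, patch_size, pad_size, batch = 64, 64, 3, 5, 6, 1
threads = 8
grid = ((height + threads - 1) // threads, (width + threads - 1) // threads, batch)
block = (threads, threads)

source_style = cp.zeros((batch, height + 2*pad_size, width + 2*pad_size, channel), dtype=cp.float32)
nnf = cp.zeros((batch, height, width, 2), dtype=cp.int32)   # trivial NNF mapping every pixel to (0, 0)
target_style = cp.zeros_like(source_style)

remapping_kernel(grid, block, (height, width, channel, patch_size, pad_size,
                               source_style, nnf, target_style))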
diffsynth/extensions/FastBlend/data.py
ADDED
@@ -0,0 +1,146 @@
import imageio, os
import numpy as np
from PIL import Image


def read_video(file_name):
    reader = imageio.get_reader(file_name)
    video = []
    for frame in reader:
        frame = np.array(frame)
        video.append(frame)
    reader.close()
    return video


def get_video_fps(file_name):
    reader = imageio.get_reader(file_name)
    fps = reader.get_meta_data()["fps"]
    reader.close()
    return fps


def save_video(frames_path, video_path, num_frames, fps):
    writer = imageio.get_writer(video_path, fps=fps, quality=9)
    for i in range(num_frames):
        frame = np.array(Image.open(os.path.join(frames_path, "%05d.png" % i)))
        writer.append_data(frame)
    writer.close()
    return video_path


class LowMemoryVideo:
    def __init__(self, file_name):
        self.reader = imageio.get_reader(file_name)

    def __len__(self):
        return self.reader.count_frames()

    def __getitem__(self, item):
        return np.array(self.reader.get_data(item))

    def __del__(self):
        self.reader.close()


def split_file_name(file_name):
    result = []
    number = -1
    for i in file_name:
        if ord(i)>=ord("0") and ord(i)<=ord("9"):
            if number == -1:
                number = 0
            number = number*10 + ord(i) - ord("0")
        else:
            if number != -1:
                result.append(number)
                number = -1
            result.append(i)
    if number != -1:
        result.append(number)
    result = tuple(result)
    return result


def search_for_images(folder):
    file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")]
    file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
    file_list = [i[1] for i in sorted(file_list)]
    file_list = [os.path.join(folder, i) for i in file_list]
    return file_list


def read_images(folder):
    file_list = search_for_images(folder)
    frames = [np.array(Image.open(i)) for i in file_list]
    return frames


class LowMemoryImageFolder:
    def __init__(self, folder, file_list=None):
        if file_list is None:
            self.file_list = search_for_images(folder)
        else:
            self.file_list = [os.path.join(folder, file_name) for file_name in file_list]

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, item):
        return np.array(Image.open(self.file_list[item]))

    def __del__(self):
        pass


class VideoData:
    def __init__(self, video_file, image_folder, **kwargs):
        if video_file is not None:
            self.data_type = "video"
            self.data = LowMemoryVideo(video_file, **kwargs)
        elif image_folder is not None:
            self.data_type = "images"
            self.data = LowMemoryImageFolder(image_folder, **kwargs)
        else:
            raise ValueError("Cannot open video or image folder")
        self.length = None
        self.height = None
        self.width = None

    def raw_data(self):
        frames = []
        for i in range(self.__len__()):
            frames.append(self.__getitem__(i))
        return frames

    def set_length(self, length):
        self.length = length

    def set_shape(self, height, width):
        self.height = height
        self.width = width

    def __len__(self):
        if self.length is None:
            return len(self.data)
        else:
            return self.length

    def shape(self):
        if self.height is not None and self.width is not None:
            return self.height, self.width
        else:
            height, width, _ = self.__getitem__(0).shape
            return height, width

    def __getitem__(self, item):
        frame = self.data.__getitem__(item)
        height, width, _ = frame.shape
        if self.height is not None and self.width is not None:
            if self.height != height or self.width != width:
                frame = Image.fromarray(frame).resize((self.width, self.height))
                frame = np.array(frame)
        return frame

    def __del__(self):
        pass
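A small usage sketch of the helpers above, assuming a hypothetical folder of numbered PNG frames:

# Illustrative sketch (not part of the commit): reading frames and writing a video.
# "path/to/frames" is a placeholder folder containing 00000.png, 00001.png, ...
frames = VideoData(None, "path/to/frames")   # image-folder mode
print(len(frames), frames.shape())           # number of frames, (height, width)
first = frames[0]                            # numpy array of shape (H, W, 3)

# Re-encode the folder as an mp4 (frames must be named %05d.png starting at 0).
save_video("path/to/frames", "path/to/frames/video.mp4", num_frames=len(frames), fps=30)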
diffsynth/extensions/FastBlend/patch_match.py
ADDED
@@ -0,0 +1,298 @@
from .cupy_kernels import remapping_kernel, patch_error_kernel, pairwise_patch_error_kernel
import numpy as np
import cupy as cp
import cv2


class PatchMatcher:
    def __init__(
        self, height, width, channel, minimum_patch_size,
        threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0,
        random_search_steps=3, random_search_range=4,
        use_mean_target_style=False, use_pairwise_patch_error=False,
        tracking_window_size=0
    ):
        self.height = height
        self.width = width
        self.channel = channel
        self.minimum_patch_size = minimum_patch_size
        self.threads_per_block = threads_per_block
        self.num_iter = num_iter
        self.gpu_id = gpu_id
        self.guide_weight = guide_weight
        self.random_search_steps = random_search_steps
        self.random_search_range = random_search_range
        self.use_mean_target_style = use_mean_target_style
        self.use_pairwise_patch_error = use_pairwise_patch_error
        self.tracking_window_size = tracking_window_size

        self.patch_size_list = [minimum_patch_size + i*2 for i in range(num_iter)][::-1]
        self.pad_size = self.patch_size_list[0] // 2
        self.grid = (
            (height + threads_per_block - 1) // threads_per_block,
            (width + threads_per_block - 1) // threads_per_block
        )
        self.block = (threads_per_block, threads_per_block)

    def pad_image(self, image):
        return cp.pad(image, ((0, 0), (self.pad_size, self.pad_size), (self.pad_size, self.pad_size), (0, 0)))

    def unpad_image(self, image):
        return image[:, self.pad_size: -self.pad_size, self.pad_size: -self.pad_size, :]

    def apply_nnf_to_image(self, nnf, source):
        batch_size = source.shape[0]
        target = cp.zeros((batch_size, self.height + self.pad_size * 2, self.width + self.pad_size * 2, self.channel), dtype=cp.float32)
        remapping_kernel(
            self.grid + (batch_size,),
            self.block,
            (self.height, self.width, self.channel, self.patch_size, self.pad_size, source, nnf, target)
        )
        return target

    def get_patch_error(self, source, nnf, target):
        batch_size = source.shape[0]
        error = cp.zeros((batch_size, self.height, self.width), dtype=cp.float32)
        patch_error_kernel(
            self.grid + (batch_size,),
            self.block,
            (self.height, self.width, self.channel, self.patch_size, self.pad_size, source, nnf, target, error)
        )
        return error

    def get_pairwise_patch_error(self, source, nnf):
        batch_size = source.shape[0]//2
        error = cp.zeros((batch_size, self.height, self.width), dtype=cp.float32)
        source_a, nnf_a = source[0::2].copy(), nnf[0::2].copy()
        source_b, nnf_b = source[1::2].copy(), nnf[1::2].copy()
        pairwise_patch_error_kernel(
            self.grid + (batch_size,),
            self.block,
            (self.height, self.width, self.channel, self.patch_size, self.pad_size, source_a, nnf_a, source_b, nnf_b, error)
        )
        error = error.repeat(2, axis=0)
        return error

    def get_error(self, source_guide, target_guide, source_style, target_style, nnf):
        error_guide = self.get_patch_error(source_guide, nnf, target_guide)
        if self.use_mean_target_style:
            target_style = self.apply_nnf_to_image(nnf, source_style)
            target_style = target_style.mean(axis=0, keepdims=True)
            target_style = target_style.repeat(source_guide.shape[0], axis=0)
        if self.use_pairwise_patch_error:
            error_style = self.get_pairwise_patch_error(source_style, nnf)
        else:
            error_style = self.get_patch_error(source_style, nnf, target_style)
        error = error_guide * self.guide_weight + error_style
        return error

    def clamp_bound(self, nnf):
        nnf[:,:,:,0] = cp.clip(nnf[:,:,:,0], 0, self.height-1)
        nnf[:,:,:,1] = cp.clip(nnf[:,:,:,1], 0, self.width-1)
        return nnf

    def random_step(self, nnf, r):
        batch_size = nnf.shape[0]
        step = cp.random.randint(-r, r+1, size=(batch_size, self.height, self.width, 2), dtype=cp.int32)
        upd_nnf = self.clamp_bound(nnf + step)
        return upd_nnf

    def neighboor_step(self, nnf, d):
        if d==0:
            upd_nnf = cp.concatenate([nnf[:, :1, :], nnf[:, :-1, :]], axis=1)
            upd_nnf[:, :, :, 0] += 1
        elif d==1:
            upd_nnf = cp.concatenate([nnf[:, :, :1], nnf[:, :, :-1]], axis=2)
            upd_nnf[:, :, :, 1] += 1
        elif d==2:
            upd_nnf = cp.concatenate([nnf[:, 1:, :], nnf[:, -1:, :]], axis=1)
            upd_nnf[:, :, :, 0] -= 1
        elif d==3:
            upd_nnf = cp.concatenate([nnf[:, :, 1:], nnf[:, :, -1:]], axis=2)
            upd_nnf[:, :, :, 1] -= 1
        upd_nnf = self.clamp_bound(upd_nnf)
        return upd_nnf

    def shift_nnf(self, nnf, d):
        if d>0:
            d = min(nnf.shape[0], d)
            upd_nnf = cp.concatenate([nnf[d:]] + [nnf[-1:]] * d, axis=0)
        else:
            d = max(-nnf.shape[0], d)
            upd_nnf = cp.concatenate([nnf[:1]] * (-d) + [nnf[:d]], axis=0)
        return upd_nnf

    def track_step(self, nnf, d):
        if self.use_pairwise_patch_error:
            upd_nnf = cp.zeros_like(nnf)
            upd_nnf[0::2] = self.shift_nnf(nnf[0::2], d)
            upd_nnf[1::2] = self.shift_nnf(nnf[1::2], d)
        else:
            upd_nnf = self.shift_nnf(nnf, d)
        return upd_nnf

    def C(self, n, m):
        # not used
        c = 1
        for i in range(1, n+1):
            c *= i
        for i in range(1, m+1):
            c //= i
        for i in range(1, n-m+1):
            c //= i
        return c

    def bezier_step(self, nnf, r):
        # not used
        n = r * 2 - 1
        upd_nnf = cp.zeros(shape=nnf.shape, dtype=cp.float32)
        for i, d in enumerate(list(range(-r, 0)) + list(range(1, r+1))):
            if d>0:
                ctl_nnf = cp.concatenate([nnf[d:]] + [nnf[-1:]] * d, axis=0)
            elif d<0:
                ctl_nnf = cp.concatenate([nnf[:1]] * (-d) + [nnf[:d]], axis=0)
            upd_nnf += ctl_nnf * (self.C(n, i) / 2**n)
        upd_nnf = self.clamp_bound(upd_nnf).astype(nnf.dtype)
        return upd_nnf

    def update(self, source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf):
        upd_err = self.get_error(source_guide, target_guide, source_style, target_style, upd_nnf)
        upd_idx = (upd_err < err)
        nnf[upd_idx] = upd_nnf[upd_idx]
        err[upd_idx] = upd_err[upd_idx]
        return nnf, err

    def propagation(self, source_guide, target_guide, source_style, target_style, nnf, err):
        for d in cp.random.permutation(4):
            upd_nnf = self.neighboor_step(nnf, d)
            nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
        return nnf, err

    def random_search(self, source_guide, target_guide, source_style, target_style, nnf, err):
        for i in range(self.random_search_steps):
            upd_nnf = self.random_step(nnf, self.random_search_range)
            nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
        return nnf, err

    def track(self, source_guide, target_guide, source_style, target_style, nnf, err):
        for d in range(1, self.tracking_window_size + 1):
            upd_nnf = self.track_step(nnf, d)
            nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
            upd_nnf = self.track_step(nnf, -d)
            nnf, err = self.update(source_guide, target_guide, source_style, target_style, nnf, err, upd_nnf)
        return nnf, err

    def iteration(self, source_guide, target_guide, source_style, target_style, nnf, err):
        nnf, err = self.propagation(source_guide, target_guide, source_style, target_style, nnf, err)
        nnf, err = self.random_search(source_guide, target_guide, source_style, target_style, nnf, err)
        nnf, err = self.track(source_guide, target_guide, source_style, target_style, nnf, err)
        return nnf, err

    def estimate_nnf(self, source_guide, target_guide, source_style, nnf):
        with cp.cuda.Device(self.gpu_id):
            source_guide = self.pad_image(source_guide)
            target_guide = self.pad_image(target_guide)
            source_style = self.pad_image(source_style)
            for it in range(self.num_iter):
                self.patch_size = self.patch_size_list[it]
                target_style = self.apply_nnf_to_image(nnf, source_style)
                err = self.get_error(source_guide, target_guide, source_style, target_style, nnf)
                nnf, err = self.iteration(source_guide, target_guide, source_style, target_style, nnf, err)
            target_style = self.unpad_image(self.apply_nnf_to_image(nnf, source_style))
        return nnf, target_style


class PyramidPatchMatcher:
    def __init__(
        self, image_height, image_width, channel, minimum_patch_size,
        threads_per_block=8, num_iter=5, gpu_id=0, guide_weight=10.0,
        use_mean_target_style=False, use_pairwise_patch_error=False,
        tracking_window_size=0,
        initialize="identity"
    ):
        maximum_patch_size = minimum_patch_size + (num_iter - 1) * 2
        self.pyramid_level = int(np.log2(min(image_height, image_width) / maximum_patch_size))
        self.pyramid_heights = []
        self.pyramid_widths = []
        self.patch_matchers = []
        self.minimum_patch_size = minimum_patch_size
        self.num_iter = num_iter
        self.gpu_id = gpu_id
        self.initialize = initialize
        for level in range(self.pyramid_level):
            height = image_height//(2**(self.pyramid_level - 1 - level))
            width = image_width//(2**(self.pyramid_level - 1 - level))
            self.pyramid_heights.append(height)
            self.pyramid_widths.append(width)
            self.patch_matchers.append(PatchMatcher(
                height, width, channel, minimum_patch_size=minimum_patch_size,
                threads_per_block=threads_per_block, num_iter=num_iter, gpu_id=gpu_id, guide_weight=guide_weight,
                use_mean_target_style=use_mean_target_style, use_pairwise_patch_error=use_pairwise_patch_error,
                tracking_window_size=tracking_window_size
            ))

    def resample_image(self, images, level):
        height, width = self.pyramid_heights[level], self.pyramid_widths[level]
        images = images.get()
        images_resample = []
        for image in images:
            image_resample = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)
            images_resample.append(image_resample)
        images_resample = cp.array(np.stack(images_resample), dtype=cp.float32)
        return images_resample

    def initialize_nnf(self, batch_size):
        if self.initialize == "random":
            height, width = self.pyramid_heights[0], self.pyramid_widths[0]
            nnf = cp.stack([
                cp.random.randint(0, height, (batch_size, height, width), dtype=cp.int32),
                cp.random.randint(0, width, (batch_size, height, width), dtype=cp.int32)
            ], axis=3)
        elif self.initialize == "identity":
            height, width = self.pyramid_heights[0], self.pyramid_widths[0]
            nnf = cp.stack([
                cp.repeat(cp.arange(height), width).reshape(height, width),
                cp.tile(cp.arange(width), height).reshape(height, width)
            ], axis=2)
            nnf = cp.stack([nnf] * batch_size)
        else:
            raise NotImplementedError()
        return nnf

    def update_nnf(self, nnf, level):
        # upscale
        nnf = nnf.repeat(2, axis=1).repeat(2, axis=2) * 2
        nnf[:,[i for i in range(nnf.shape[0]) if i&1],:,0] += 1
        nnf[:,:,[i for i in range(nnf.shape[0]) if i&1],1] += 1
        # check if scale is 2
        height, width = self.pyramid_heights[level], self.pyramid_widths[level]
        if height != nnf.shape[0] * 2 or width != nnf.shape[1] * 2:
            nnf = nnf.get().astype(np.float32)
            nnf = [cv2.resize(n, (width, height), interpolation=cv2.INTER_LINEAR) for n in nnf]
            nnf = cp.array(np.stack(nnf), dtype=cp.int32)
            nnf = self.patch_matchers[level].clamp_bound(nnf)
        return nnf

    def apply_nnf_to_image(self, nnf, image):
        with cp.cuda.Device(self.gpu_id):
            image = self.patch_matchers[-1].pad_image(image)
            image = self.patch_matchers[-1].apply_nnf_to_image(nnf, image)
        return image

    def estimate_nnf(self, source_guide, target_guide, source_style):
        with cp.cuda.Device(self.gpu_id):
            if not isinstance(source_guide, cp.ndarray):
                source_guide = cp.array(source_guide, dtype=cp.float32)
            if not isinstance(target_guide, cp.ndarray):
                target_guide = cp.array(target_guide, dtype=cp.float32)
            if not isinstance(source_style, cp.ndarray):
                source_style = cp.array(source_style, dtype=cp.float32)
            for level in range(self.pyramid_level):
                nnf = self.initialize_nnf(source_guide.shape[0]) if level==0 else self.update_nnf(nnf, level)
                source_guide_ = self.resample_image(source_guide, level)
                target_guide_ = self.resample_image(target_guide, level)
                source_style_ = self.resample_image(source_style, level)
                nnf, target_style = self.patch_matchers[level].estimate_nnf(
                    source_guide_, target_guide_, source_style_, nnf
                )
        return nnf.get(), target_style.get()
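A minimal sketch of how the runners below call `PyramidPatchMatcher.estimate_nnf`: batches of guide and style frames go in as arrays of shape (batch, height, width, 3) in the 0-255 range, and the remapped style frames come back as float arrays of the same shape. The random data and sizes here are only for illustration and assume a CUDA-capable GPU with CuPy installed.

# Illustrative sketch (not part of the commit): one PatchMatch remapping step.
import numpy as np

matcher = PyramidPatchMatcher(
    image_height=256, image_width=256, channel=3,
    minimum_patch_size=5, num_iter=5, guide_weight=10.0, initialize="identity",
)
source_guide = np.random.randint(0, 256, (2, 256, 256, 3)).astype(np.float32)
target_guide = np.random.randint(0, 256, (2, 256, 256, 3)).astype(np.float32)
source_style = np.random.randint(0, 256, (2, 256, 256, 3)).astype(np.float32)
nnf, target_style = matcher.estimate_nnf(source_guide, target_guide, source_style)
print(nnf.shape, target_style.shape)   # (2, 256, 256, 2), (2, 256, 256, 3)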
diffsynth/extensions/FastBlend/runners/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .accurate import AccurateModeRunner
from .fast import FastModeRunner
from .balanced import BalancedModeRunner
from .interpolation import InterpolationModeRunner, InterpolationModeSingleFrameRunner
diffsynth/extensions/FastBlend/runners/accurate.py
ADDED
@@ -0,0 +1,35 @@
from ..patch_match import PyramidPatchMatcher
import os
import numpy as np
from PIL import Image
from tqdm import tqdm


class AccurateModeRunner:
    def __init__(self):
        pass

    def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Accurate Mode", save_path=None):
        patch_match_engine = PyramidPatchMatcher(
            image_height=frames_style[0].shape[0],
            image_width=frames_style[0].shape[1],
            channel=3,
            use_mean_target_style=True,
            **ebsynth_config
        )
        # run
        n = len(frames_style)
        for target in tqdm(range(n), desc=desc):
            l, r = max(target - window_size, 0), min(target + window_size + 1, n)
            remapped_frames = []
            for i in range(l, r, batch_size):
                j = min(i + batch_size, r)
                source_guide = np.stack([frames_guide[source] for source in range(i, j)])
                target_guide = np.stack([frames_guide[target]] * (j - i))
                source_style = np.stack([frames_style[source] for source in range(i, j)])
                _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
                remapped_frames.append(target_style)
            frame = np.concatenate(remapped_frames, axis=0).mean(axis=0)
            frame = frame.clip(0, 255).astype("uint8")
            if save_path is not None:
                Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
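For context, a hedged sketch of invoking this runner directly; `frames_guide` and `frames_style` are assumed to be equally long lists of H x W x 3 uint8 arrays, and the config dict mirrors the one built in api.py above.

# Illustrative sketch (not part of the commit): blending a short clip in accurate mode.
ebsynth_config = {"minimum_patch_size": 5, "threads_per_block": 8, "num_iter": 5,
                  "gpu_id": 0, "guide_weight": 10.0, "initialize": "identity",
                  "tracking_window_size": 0}
AccurateModeRunner().run(frames_guide, frames_style, batch_size=8, window_size=15,
                         ebsynth_config=ebsynth_config, save_path="path/to/output_frames")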
diffsynth/extensions/FastBlend/runners/balanced.py
ADDED
@@ -0,0 +1,46 @@
from ..patch_match import PyramidPatchMatcher
import os
import numpy as np
from PIL import Image
from tqdm import tqdm


class BalancedModeRunner:
    def __init__(self):
        pass

    def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, desc="Balanced Mode", save_path=None):
        patch_match_engine = PyramidPatchMatcher(
            image_height=frames_style[0].shape[0],
            image_width=frames_style[0].shape[1],
            channel=3,
            **ebsynth_config
        )
        # tasks
        n = len(frames_style)
        tasks = []
        for target in range(n):
            for source in range(target - window_size, target + window_size + 1):
                if source >= 0 and source < n and source != target:
                    tasks.append((source, target))
        # run
        frames = [(None, 1) for i in range(n)]
        for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
            tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
            source_guide = np.stack([frames_guide[source] for source, target in tasks_batch])
            target_guide = np.stack([frames_guide[target] for source, target in tasks_batch])
            source_style = np.stack([frames_style[source] for source, target in tasks_batch])
            _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
            for (source, target), result in zip(tasks_batch, target_style):
                frame, weight = frames[target]
                if frame is None:
                    frame = frames_style[target]
                frames[target] = (
                    frame * (weight / (weight + 1)) + result / (weight + 1),
                    weight + 1
                )
                if weight + 1 == min(n, target + window_size + 1) - max(0, target - window_size):
                    frame = frame.clip(0, 255).astype("uint8")
                    if save_path is not None:
                        Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
                    frames[target] = (None, 1)
diffsynth/extensions/FastBlend/runners/fast.py
ADDED
@@ -0,0 +1,141 @@
from ..patch_match import PyramidPatchMatcher
import functools, os
import numpy as np
from PIL import Image
from tqdm import tqdm


class TableManager:
    def __init__(self):
        pass

    def task_list(self, n):
        tasks = []
        max_level = 1
        while (1<<max_level)<=n:
            max_level += 1
        for i in range(n):
            j = i
            for level in range(max_level):
                if i&(1<<level):
                    continue
                j |= 1<<level
                if j>=n:
                    break
                meta_data = {
                    "source": i,
                    "target": j,
                    "level": level + 1
                }
                tasks.append(meta_data)
        tasks.sort(key=functools.cmp_to_key(lambda u, v: u["level"]-v["level"]))
        return tasks

    def build_remapping_table(self, frames_guide, frames_style, patch_match_engine, batch_size, desc=""):
        n = len(frames_guide)
        tasks = self.task_list(n)
        remapping_table = [[(frames_style[i], 1)] for i in range(n)]
        for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
            tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
            source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch])
            target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch])
            source_style = np.stack([frames_style[task["source"]] for task in tasks_batch])
            _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
            for task, result in zip(tasks_batch, target_style):
                target, level = task["target"], task["level"]
                if len(remapping_table[target])==level:
                    remapping_table[target].append((result, 1))
                else:
                    frame, weight = remapping_table[target][level]
                    remapping_table[target][level] = (
                        frame * (weight / (weight + 1)) + result / (weight + 1),
                        weight + 1
                    )
        return remapping_table

    def remapping_table_to_blending_table(self, table):
        for i in range(len(table)):
            for j in range(1, len(table[i])):
                frame_1, weight_1 = table[i][j-1]
                frame_2, weight_2 = table[i][j]
                frame = (frame_1 + frame_2) / 2
                weight = weight_1 + weight_2
                table[i][j] = (frame, weight)
        return table

    def tree_query(self, leftbound, rightbound):
        node_list = []
        node_index = rightbound
        while node_index>=leftbound:
            node_level = 0
            while (1<<node_level)&node_index and node_index-(1<<node_level+1)+1>=leftbound:
                node_level += 1
            node_list.append((node_index, node_level))
            node_index -= 1<<node_level
        return node_list

    def process_window_sum(self, frames_guide, blending_table, patch_match_engine, window_size, batch_size, desc=""):
        n = len(blending_table)
        tasks = []
        frames_result = []
        for target in range(n):
            node_list = self.tree_query(max(target-window_size, 0), target)
            for source, level in node_list:
                if source!=target:
                    meta_data = {
                        "source": source,
                        "target": target,
                        "level": level
                    }
                    tasks.append(meta_data)
                else:
                    frames_result.append(blending_table[target][level])
        for batch_id in tqdm(range(0, len(tasks), batch_size), desc=desc):
            tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
            source_guide = np.stack([frames_guide[task["source"]] for task in tasks_batch])
            target_guide = np.stack([frames_guide[task["target"]] for task in tasks_batch])
            source_style = np.stack([blending_table[task["source"]][task["level"]][0] for task in tasks_batch])
            _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
            for task, frame_2 in zip(tasks_batch, target_style):
                source, target, level = task["source"], task["target"], task["level"]
                frame_1, weight_1 = frames_result[target]
                weight_2 = blending_table[source][level][1]
                weight = weight_1 + weight_2
                frame = frame_1 * (weight_1 / weight) + frame_2 * (weight_2 / weight)
                frames_result[target] = (frame, weight)
        return frames_result


class FastModeRunner:
    def __init__(self):
        pass

    def run(self, frames_guide, frames_style, batch_size, window_size, ebsynth_config, save_path=None):
        frames_guide = frames_guide.raw_data()
        frames_style = frames_style.raw_data()
        table_manager = TableManager()
        patch_match_engine = PyramidPatchMatcher(
            image_height=frames_style[0].shape[0],
            image_width=frames_style[0].shape[1],
            channel=3,
            **ebsynth_config
        )
        # left part
        table_l = table_manager.build_remapping_table(frames_guide, frames_style, patch_match_engine, batch_size, desc="Fast Mode Step 1/4")
        table_l = table_manager.remapping_table_to_blending_table(table_l)
        table_l = table_manager.process_window_sum(frames_guide, table_l, patch_match_engine, window_size, batch_size, desc="Fast Mode Step 2/4")
        # right part
        table_r = table_manager.build_remapping_table(frames_guide[::-1], frames_style[::-1], patch_match_engine, batch_size, desc="Fast Mode Step 3/4")
        table_r = table_manager.remapping_table_to_blending_table(table_r)
        table_r = table_manager.process_window_sum(frames_guide[::-1], table_r, patch_match_engine, window_size, batch_size, desc="Fast Mode Step 4/4")[::-1]
        # merge
        frames = []
        for (frame_l, weight_l), frame_m, (frame_r, weight_r) in zip(table_l, frames_style, table_r):
            weight_m = -1
            weight = weight_l + weight_m + weight_r
            frame = frame_l * (weight_l / weight) + frame_m * (weight_m / weight) + frame_r * (weight_r / weight)
            frames.append(frame)
        frames = [frame.clip(0, 255).astype("uint8") for frame in frames]
        if save_path is not None:
            for target, frame in enumerate(frames):
                Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % target))
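To make the tree structure in `TableManager` concrete, a small worked example (computed from the implementation above): `task_list` remaps each frame i onto targets obtained by successively setting its zero bits, and `tree_query` decomposes a blending window into O(log n) such nodes.

# Illustrative sketch (not part of the commit): inspecting the Fast-mode tree.
tm = TableManager()
print(tm.tree_query(0, 10))
# [(10, 0), (9, 1), (7, 3)]  -> frames 0..10 covered by nodes of size 1, 2 and 8
print([(t["source"], t["target"], t["level"]) for t in tm.task_list(4)])
# [(0, 1, 1), (2, 3, 1), (0, 3, 2), (1, 3, 2)]  (sorted by level)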
diffsynth/extensions/FastBlend/runners/interpolation.py
ADDED
@@ -0,0 +1,121 @@
from ..patch_match import PyramidPatchMatcher
import os
import numpy as np
from PIL import Image
from tqdm import tqdm


class InterpolationModeRunner:
    def __init__(self):
        pass

    def get_index_dict(self, index_style):
        index_dict = {}
        for i, index in enumerate(index_style):
            index_dict[index] = i
        return index_dict

    def get_weight(self, l, m, r):
        weight_l, weight_r = abs(m - r), abs(m - l)
        if weight_l + weight_r == 0:
            weight_l, weight_r = 0.5, 0.5
        else:
            weight_l, weight_r = weight_l / (weight_l + weight_r), weight_r / (weight_l + weight_r)
        return weight_l, weight_r

    def get_task_group(self, index_style, n):
        task_group = []
        index_style = sorted(index_style)
        # first frame
        if index_style[0]>0:
            tasks = []
            for m in range(index_style[0]):
                tasks.append((index_style[0], m, index_style[0]))
            task_group.append(tasks)
        # middle frames
        for l, r in zip(index_style[:-1], index_style[1:]):
            tasks = []
            for m in range(l, r):
                tasks.append((l, m, r))
            task_group.append(tasks)
        # last frame
        tasks = []
        for m in range(index_style[-1], n):
            tasks.append((index_style[-1], m, index_style[-1]))
        task_group.append(tasks)
        return task_group

    def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None):
        patch_match_engine = PyramidPatchMatcher(
            image_height=frames_style[0].shape[0],
            image_width=frames_style[0].shape[1],
            channel=3,
            use_mean_target_style=False,
            use_pairwise_patch_error=True,
            **ebsynth_config
        )
        # task
        index_dict = self.get_index_dict(index_style)
        task_group = self.get_task_group(index_style, len(frames_guide))
        # run
        for tasks in task_group:
            index_start, index_end = min([i[1] for i in tasks]), max([i[1] for i in tasks])
            for batch_id in tqdm(range(0, len(tasks), batch_size), desc=f"Rendering frames {index_start}...{index_end}"):
                tasks_batch = tasks[batch_id: min(batch_id+batch_size, len(tasks))]
                source_guide, target_guide, source_style = [], [], []
                for l, m, r in tasks_batch:
                    # l -> m
                    source_guide.append(frames_guide[l])
                    target_guide.append(frames_guide[m])
                    source_style.append(frames_style[index_dict[l]])
                    # r -> m
                    source_guide.append(frames_guide[r])
                    target_guide.append(frames_guide[m])
                    source_style.append(frames_style[index_dict[r]])
                source_guide = np.stack(source_guide)
                target_guide = np.stack(target_guide)
                source_style = np.stack(source_style)
                _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
                if save_path is not None:
                    for frame_l, frame_r, (l, m, r) in zip(target_style[0::2], target_style[1::2], tasks_batch):
                        weight_l, weight_r = self.get_weight(l, m, r)
                        frame = frame_l * weight_l + frame_r * weight_r
                        frame = frame.clip(0, 255).astype("uint8")
                        Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % m))


class InterpolationModeSingleFrameRunner:
    def __init__(self):
        pass

    def run(self, frames_guide, frames_style, index_style, batch_size, ebsynth_config, save_path=None):
        # check input
        tracking_window_size = ebsynth_config["tracking_window_size"]
        if tracking_window_size * 2 >= batch_size:
            raise ValueError("batch_size should be larger than track_window_size * 2")
        frame_style = frames_style[0]
        frame_guide = frames_guide[index_style[0]]
        patch_match_engine = PyramidPatchMatcher(
            image_height=frame_style.shape[0],
            image_width=frame_style.shape[1],
            channel=3,
            **ebsynth_config
        )
        # run
        frame_id, n = 0, len(frames_guide)
        for i in tqdm(range(0, n, batch_size - tracking_window_size * 2), desc=f"Rendering frames 0...{n}"):
            if i + batch_size > n:
                l, r = max(n - batch_size, 0), n
            else:
                l, r = i, i + batch_size
            source_guide = np.stack([frame_guide] * (r-l))
            target_guide = np.stack([frames_guide[i] for i in range(l, r)])
            source_style = np.stack([frame_style] * (r-l))
            _, target_style = patch_match_engine.estimate_nnf(source_guide, target_guide, source_style)
            for i, frame in zip(range(l, r), target_style):
                if i==frame_id:
                    frame = frame.clip(0, 255).astype("uint8")
                    Image.fromarray(frame).save(os.path.join(save_path, "%05d.png" % frame_id))
                    frame_id += 1
                if r < n and r-frame_id <= tracking_window_size:
                    break
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
from PIL import Image
|
6 |
+
|
7 |
+
|
8 |
+
def warp(tenInput, tenFlow, device):
|
9 |
+
backwarp_tenGrid = {}
|
10 |
+
k = (str(tenFlow.device), str(tenFlow.size()))
|
11 |
+
if k not in backwarp_tenGrid:
|
12 |
+
tenHorizontal = torch.linspace(-1.0, 1.0, tenFlow.shape[3], device=device).view(
|
13 |
+
1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, tenFlow.shape[2], -1)
|
14 |
+
tenVertical = torch.linspace(-1.0, 1.0, tenFlow.shape[2], device=device).view(
|
15 |
+
1, 1, tenFlow.shape[2], 1).expand(tenFlow.shape[0], -1, -1, tenFlow.shape[3])
|
16 |
+
backwarp_tenGrid[k] = torch.cat(
|
17 |
+
[tenHorizontal, tenVertical], 1).to(device)
|
18 |
+
|
19 |
+
tenFlow = torch.cat([tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0),
|
20 |
+
tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0)], 1)
|
21 |
+
|
22 |
+
g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1)
|
23 |
+
return torch.nn.functional.grid_sample(input=tenInput, grid=g, mode='bilinear', padding_mode='border', align_corners=True)
|
24 |
+
|
25 |
+
|
26 |
+
def conv(in_planes, out_planes, kernel_size=3, stride=1, padding=1, dilation=1):
|
27 |
+
return nn.Sequential(
|
28 |
+
nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
|
29 |
+
padding=padding, dilation=dilation, bias=True),
|
30 |
+
nn.PReLU(out_planes)
|
31 |
+
)
|
32 |
+
|
33 |
+
|
34 |
+
class IFBlock(nn.Module):
|
35 |
+
def __init__(self, in_planes, c=64):
|
36 |
+
super(IFBlock, self).__init__()
|
37 |
+
self.conv0 = nn.Sequential(conv(in_planes, c//2, 3, 2, 1), conv(c//2, c, 3, 2, 1),)
|
38 |
+
self.convblock0 = nn.Sequential(conv(c, c), conv(c, c))
|
39 |
+
self.convblock1 = nn.Sequential(conv(c, c), conv(c, c))
|
40 |
+
self.convblock2 = nn.Sequential(conv(c, c), conv(c, c))
|
41 |
+
self.convblock3 = nn.Sequential(conv(c, c), conv(c, c))
|
42 |
+
self.conv1 = nn.Sequential(nn.ConvTranspose2d(c, c//2, 4, 2, 1), nn.PReLU(c//2), nn.ConvTranspose2d(c//2, 4, 4, 2, 1))
|
43 |
+
self.conv2 = nn.Sequential(nn.ConvTranspose2d(c, c//2, 4, 2, 1), nn.PReLU(c//2), nn.ConvTranspose2d(c//2, 1, 4, 2, 1))
|
44 |
+
|
45 |
+
def forward(self, x, flow, scale=1):
|
46 |
+
x = F.interpolate(x, scale_factor= 1. / scale, mode="bilinear", align_corners=False, recompute_scale_factor=False)
|
47 |
+
flow = F.interpolate(flow, scale_factor= 1. / scale, mode="bilinear", align_corners=False, recompute_scale_factor=False) * 1. / scale
|
48 |
+
feat = self.conv0(torch.cat((x, flow), 1))
|
49 |
+
feat = self.convblock0(feat) + feat
|
50 |
+
feat = self.convblock1(feat) + feat
|
51 |
+
feat = self.convblock2(feat) + feat
|
52 |
+
feat = self.convblock3(feat) + feat
|
53 |
+
flow = self.conv1(feat)
|
54 |
+
mask = self.conv2(feat)
|
55 |
+
flow = F.interpolate(flow, scale_factor=scale, mode="bilinear", align_corners=False, recompute_scale_factor=False) * scale
|
56 |
+
mask = F.interpolate(mask, scale_factor=scale, mode="bilinear", align_corners=False, recompute_scale_factor=False)
|
57 |
+
return flow, mask
|
58 |
+
|
59 |
+
|
60 |
+
class IFNet(nn.Module):
|
61 |
+
def __init__(self):
|
62 |
+
super(IFNet, self).__init__()
|
63 |
+
self.block0 = IFBlock(7+4, c=90)
|
64 |
+
self.block1 = IFBlock(7+4, c=90)
|
65 |
+
self.block2 = IFBlock(7+4, c=90)
|
66 |
+
self.block_tea = IFBlock(10+4, c=90)
|
67 |
+
|
68 |
+
def forward(self, x, scale_list=[4, 2, 1], training=False):
|
69 |
+
if training == False:
|
70 |
+
channel = x.shape[1] // 2
|
71 |
+
img0 = x[:, :channel]
|
72 |
+
img1 = x[:, channel:]
|
73 |
+
flow_list = []
|
74 |
+
merged = []
|
75 |
+
mask_list = []
|
76 |
+
warped_img0 = img0
|
77 |
+
warped_img1 = img1
|
78 |
+
flow = (x[:, :4]).detach() * 0
|
79 |
+
mask = (x[:, :1]).detach() * 0
|
80 |
+
block = [self.block0, self.block1, self.block2]
|
81 |
+
for i in range(3):
|
82 |
+
f0, m0 = block[i](torch.cat((warped_img0[:, :3], warped_img1[:, :3], mask), 1), flow, scale=scale_list[i])
|
83 |
+
f1, m1 = block[i](torch.cat((warped_img1[:, :3], warped_img0[:, :3], -mask), 1), torch.cat((flow[:, 2:4], flow[:, :2]), 1), scale=scale_list[i])
|
84 |
+
flow = flow + (f0 + torch.cat((f1[:, 2:4], f1[:, :2]), 1)) / 2
|
85 |
+
mask = mask + (m0 + (-m1)) / 2
|
86 |
+
mask_list.append(mask)
|
87 |
+
flow_list.append(flow)
|
88 |
+
warped_img0 = warp(img0, flow[:, :2], device=x.device)
|
89 |
+
warped_img1 = warp(img1, flow[:, 2:4], device=x.device)
|
90 |
+
merged.append((warped_img0, warped_img1))
|
91 |
+
'''
|
92 |
+
c0 = self.contextnet(img0, flow[:, :2])
|
93 |
+
c1 = self.contextnet(img1, flow[:, 2:4])
|
94 |
+
tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1)
|
95 |
+
res = tmp[:, 1:4] * 2 - 1
|
96 |
+
'''
|
97 |
+
for i in range(3):
|
98 |
+
mask_list[i] = torch.sigmoid(mask_list[i])
|
99 |
+
merged[i] = merged[i][0] * mask_list[i] + merged[i][1] * (1 - mask_list[i])
|
100 |
+
return flow_list, mask_list[2], merged
|
101 |
+
|
102 |
+
def state_dict_converter(self):
|
103 |
+
return IFNetStateDictConverter()
|
104 |
+
|
105 |
+
|
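IFNet's forward above takes the two frames concatenated along the channel axis and returns the per-scale flows, the final blending mask, and the merged predictions; merged[2] is the interpolated middle frame. A rough shape-level sketch (illustrative only, with randomly initialized weights, so the output is not a meaningful image; 256x256 is an arbitrary resolution divisible by 32):

    net = IFNet().eval()
    img0 = torch.rand(1, 3, 256, 256)
    img1 = torch.rand(1, 3, 256, 256)
    with torch.no_grad():
        flow_list, mask, merged = net(torch.cat((img0, img1), dim=1), scale_list=[4, 2, 1])
    print(merged[2].shape)  # torch.Size([1, 3, 256, 256])
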
106 |
+
class IFNetStateDictConverter:
|
107 |
+
def __init__(self):
|
108 |
+
pass
|
109 |
+
|
110 |
+
def from_diffusers(self, state_dict):
|
111 |
+
state_dict_ = {k.replace("module.", ""): v for k, v in state_dict.items()}
|
112 |
+
return state_dict_
|
113 |
+
|
114 |
+
def from_civitai(self, state_dict):
|
115 |
+
return self.from_diffusers(state_dict)
|
116 |
+
|
117 |
+
|
118 |
+
class RIFEInterpolater:
|
119 |
+
def __init__(self, model, device="cuda"):
|
120 |
+
self.model = model
|
121 |
+
self.device = device
|
122 |
+
# Unlike the other models, IFNet does not support float16
|
123 |
+
self.torch_dtype = torch.float32
|
124 |
+
|
125 |
+
@staticmethod
|
126 |
+
def from_model_manager(model_manager):
|
127 |
+
return RIFEInterpolater(model_manager.RIFE, device=model_manager.device)
|
128 |
+
|
129 |
+
def process_image(self, image):
|
130 |
+
width, height = image.size
|
131 |
+
if width % 32 != 0 or height % 32 != 0:
|
132 |
+
width = (width + 31) // 32 * 32
|
133 |
+
height = (height + 31) // 32 * 32
|
134 |
+
image = image.resize((width, height))
|
135 |
+
image = torch.Tensor(np.array(image, dtype=np.float32)[:, :, [2,1,0]] / 255).permute(2, 0, 1)
|
136 |
+
return image
|
137 |
+
|
138 |
+
def process_images(self, images):
|
139 |
+
images = [self.process_image(image) for image in images]
|
140 |
+
images = torch.stack(images)
|
141 |
+
return images
|
142 |
+
|
143 |
+
def decode_images(self, images):
|
144 |
+
images = (images[:, [2,1,0]].permute(0, 2, 3, 1) * 255).clip(0, 255).numpy().astype(np.uint8)
|
145 |
+
images = [Image.fromarray(image) for image in images]
|
146 |
+
return images
|
147 |
+
|
148 |
+
def add_interpolated_images(self, images, interpolated_images):
|
149 |
+
output_images = []
|
150 |
+
for image, interpolated_image in zip(images, interpolated_images):
|
151 |
+
output_images.append(image)
|
152 |
+
output_images.append(interpolated_image)
|
153 |
+
output_images.append(images[-1])
|
154 |
+
return output_images
|
155 |
+
|
156 |
+
|
157 |
+
@torch.no_grad()
|
158 |
+
def interpolate_(self, images, scale=1.0):
|
159 |
+
input_tensor = self.process_images(images)
|
160 |
+
input_tensor = torch.cat((input_tensor[:-1], input_tensor[1:]), dim=1)
|
161 |
+
input_tensor = input_tensor.to(device=self.device, dtype=self.torch_dtype)
|
162 |
+
flow, mask, merged = self.model(input_tensor, [4/scale, 2/scale, 1/scale])
|
163 |
+
output_images = self.decode_images(merged[2].cpu())
|
164 |
+
if output_images[0].size != images[0].size:
|
165 |
+
output_images = [image.resize(images[0].size) for image in output_images]
|
166 |
+
return output_images
|
167 |
+
|
168 |
+
|
169 |
+
@torch.no_grad()
|
170 |
+
def interpolate(self, images, scale=1.0, batch_size=4, num_iter=1, progress_bar=lambda x:x):
|
171 |
+
# Preprocess
|
172 |
+
processed_images = self.process_images(images)
|
173 |
+
|
174 |
+
for iter in range(num_iter):
|
175 |
+
# Input
|
176 |
+
input_tensor = torch.cat((processed_images[:-1], processed_images[1:]), dim=1)
|
177 |
+
|
178 |
+
# Interpolate
|
179 |
+
output_tensor = []
|
180 |
+
for batch_id in progress_bar(range(0, input_tensor.shape[0], batch_size)):
|
181 |
+
batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
|
182 |
+
batch_input_tensor = input_tensor[batch_id: batch_id_]
|
183 |
+
batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
|
184 |
+
flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale])
|
185 |
+
output_tensor.append(merged[2].cpu())
|
186 |
+
|
187 |
+
# Output
|
188 |
+
output_tensor = torch.concat(output_tensor, dim=0).clip(0, 1)
|
189 |
+
processed_images = self.add_interpolated_images(processed_images, output_tensor)
|
190 |
+
processed_images = torch.stack(processed_images)
|
191 |
+
|
192 |
+
# To images
|
193 |
+
output_images = self.decode_images(processed_images)
|
194 |
+
if output_images[0].size != images[0].size:
|
195 |
+
output_images = [image.resize(images[0].size) for image in output_images]
|
196 |
+
return output_images
|
197 |
+
|
198 |
+
|
199 |
+
class RIFESmoother(RIFEInterpolater):
|
200 |
+
def __init__(self, model, device="cuda"):
|
201 |
+
super(RIFESmoother, self).__init__(model, device=device)
|
202 |
+
|
203 |
+
@staticmethod
|
204 |
+
def from_model_manager(model_manager):
|
205 |
+
return RIFESmoother(model_manager.RIFE, device=model_manager.device)
|
206 |
+
|
207 |
+
def process_tensors(self, input_tensor, scale=1.0, batch_size=4):
|
208 |
+
output_tensor = []
|
209 |
+
for batch_id in range(0, input_tensor.shape[0], batch_size):
|
210 |
+
batch_id_ = min(batch_id + batch_size, input_tensor.shape[0])
|
211 |
+
batch_input_tensor = input_tensor[batch_id: batch_id_]
|
212 |
+
batch_input_tensor = batch_input_tensor.to(device=self.device, dtype=self.torch_dtype)
|
213 |
+
flow, mask, merged = self.model(batch_input_tensor, [4/scale, 2/scale, 1/scale])
|
214 |
+
output_tensor.append(merged[2].cpu())
|
215 |
+
output_tensor = torch.concat(output_tensor, dim=0)
|
216 |
+
return output_tensor
|
217 |
+
|
218 |
+
@torch.no_grad()
|
219 |
+
def __call__(self, rendered_frames, scale=1.0, batch_size=4, num_iter=1, **kwargs):
|
220 |
+
# Preprocess
|
221 |
+
processed_images = self.process_images(rendered_frames)
|
222 |
+
|
223 |
+
for iter in range(num_iter):
|
224 |
+
# Input
|
225 |
+
input_tensor = torch.cat((processed_images[:-2], processed_images[2:]), dim=1)
|
226 |
+
|
227 |
+
# Interpolate
|
228 |
+
output_tensor = self.process_tensors(input_tensor, scale=scale, batch_size=batch_size)
|
229 |
+
|
230 |
+
# Blend
|
231 |
+
input_tensor = torch.cat((processed_images[1:-1], output_tensor), dim=1)
|
232 |
+
output_tensor = self.process_tensors(input_tensor, scale=scale, batch_size=batch_size)
|
233 |
+
|
234 |
+
# Add to frames
|
235 |
+
processed_images[1:-1] = output_tensor
|
236 |
+
|
237 |
+
# To images
|
238 |
+
output_images = self.decode_images(processed_images)
|
239 |
+
if output_images[0].size != rendered_frames[0].size:
|
240 |
+
output_images = [image.resize(rendered_frames[0].size) for image in output_images]
|
241 |
+
return output_images
|
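Putting the pieces of this file together, a minimal frame-interpolation sketch (illustrative only: it assumes a CUDA device, that the RIFE checkpoint has already been downloaded to models/RIFE/flownet.pkl as in the preset tables of models/__init__.py below, and frame_0.png / frame_1.png are placeholder file names):

    import torch
    from PIL import Image
    from diffsynth.extensions.RIFE import IFNet, RIFEInterpolater
    from diffsynth.models import load_state_dict

    model = IFNet().eval()
    model.load_state_dict(model.state_dict_converter().from_civitai(load_state_dict("models/RIFE/flownet.pkl")))
    model.to(torch.float32).to("cuda")

    interpolater = RIFEInterpolater(model, device="cuda")
    frames = [Image.open("frame_0.png").convert("RGB"), Image.open("frame_1.png").convert("RGB")]
    frames = interpolater.interpolate(frames, num_iter=1)  # inserts one frame between each neighbouring pair
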
diffsynth/models/__init__.py
ADDED
@@ -0,0 +1,814 @@
1 |
+
import torch, os, json
|
2 |
+
from safetensors import safe_open
|
3 |
+
from typing_extensions import Literal, TypeAlias
|
4 |
+
from typing import List
|
5 |
+
|
6 |
+
from .downloader import download_from_huggingface, download_from_modelscope
|
7 |
+
|
8 |
+
from .sd_text_encoder import SDTextEncoder
|
9 |
+
from .sd_unet import SDUNet
|
10 |
+
from .sd_vae_encoder import SDVAEEncoder
|
11 |
+
from .sd_vae_decoder import SDVAEDecoder
|
12 |
+
from .sd_lora import SDLoRA
|
13 |
+
|
14 |
+
from .sdxl_text_encoder import SDXLTextEncoder, SDXLTextEncoder2
|
15 |
+
from .sdxl_unet import SDXLUNet
|
16 |
+
from .sdxl_vae_decoder import SDXLVAEDecoder
|
17 |
+
from .sdxl_vae_encoder import SDXLVAEEncoder
|
18 |
+
|
19 |
+
from .sd3_text_encoder import SD3TextEncoder1, SD3TextEncoder2, SD3TextEncoder3
|
20 |
+
from .sd3_dit import SD3DiT
|
21 |
+
from .sd3_vae_decoder import SD3VAEDecoder
|
22 |
+
from .sd3_vae_encoder import SD3VAEEncoder
|
23 |
+
|
24 |
+
from .sd_controlnet import SDControlNet
|
25 |
+
|
26 |
+
from .sd_motion import SDMotionModel
|
27 |
+
from .sdxl_motion import SDXLMotionModel
|
28 |
+
|
29 |
+
from .svd_image_encoder import SVDImageEncoder
|
30 |
+
from .svd_unet import SVDUNet
|
31 |
+
from .svd_vae_decoder import SVDVAEDecoder
|
32 |
+
from .svd_vae_encoder import SVDVAEEncoder
|
33 |
+
|
34 |
+
from .sd_ipadapter import SDIpAdapter, IpAdapterCLIPImageEmbedder
|
35 |
+
from .sdxl_ipadapter import SDXLIpAdapter, IpAdapterXLCLIPImageEmbedder
|
36 |
+
|
37 |
+
from .hunyuan_dit_text_encoder import HunyuanDiTCLIPTextEncoder, HunyuanDiTT5TextEncoder
|
38 |
+
from .hunyuan_dit import HunyuanDiT
|
39 |
+
from .kolors_text_encoder import ChatGLMModel
|
40 |
+
|
41 |
+
|
42 |
+
preset_models_on_huggingface = {
|
43 |
+
"HunyuanDiT": [
|
44 |
+
("Tencent-Hunyuan/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
|
45 |
+
("Tencent-Hunyuan/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
|
46 |
+
("Tencent-Hunyuan/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
|
47 |
+
("Tencent-Hunyuan/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
|
48 |
+
],
|
49 |
+
"stable-video-diffusion-img2vid-xt": [
|
50 |
+
("stabilityai/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
|
51 |
+
],
|
52 |
+
"ExVideo-SVD-128f-v1": [
|
53 |
+
("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
|
54 |
+
],
|
55 |
+
}
|
56 |
+
preset_models_on_modelscope = {
|
57 |
+
# Hunyuan DiT
|
58 |
+
"HunyuanDiT": [
|
59 |
+
("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
|
60 |
+
("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
|
61 |
+
("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
|
62 |
+
("modelscope/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
|
63 |
+
],
|
64 |
+
# Stable Video Diffusion
|
65 |
+
"stable-video-diffusion-img2vid-xt": [
|
66 |
+
("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
|
67 |
+
],
|
68 |
+
# ExVideo
|
69 |
+
"ExVideo-SVD-128f-v1": [
|
70 |
+
("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
|
71 |
+
],
|
72 |
+
# Stable Diffusion
|
73 |
+
"StableDiffusion_v15": [
|
74 |
+
("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
|
75 |
+
],
|
76 |
+
"DreamShaper_8": [
|
77 |
+
("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
|
78 |
+
],
|
79 |
+
"AingDiffusion_v12": [
|
80 |
+
("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"),
|
81 |
+
],
|
82 |
+
"Flat2DAnimerge_v45Sharp": [
|
83 |
+
("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"),
|
84 |
+
],
|
85 |
+
# Textual Inversion
|
86 |
+
"TextualInversion_VeryBadImageNegative_v1.3": [
|
87 |
+
("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
|
88 |
+
],
|
89 |
+
# Stable Diffusion XL
|
90 |
+
"StableDiffusionXL_v1": [
|
91 |
+
("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
|
92 |
+
],
|
93 |
+
"BluePencilXL_v200": [
|
94 |
+
("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
|
95 |
+
],
|
96 |
+
"StableDiffusionXL_Turbo": [
|
97 |
+
("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
|
98 |
+
],
|
99 |
+
# Stable Diffusion 3
|
100 |
+
"StableDiffusion3": [
|
101 |
+
("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips_t5xxlfp16.safetensors", "models/stable_diffusion_3"),
|
102 |
+
],
|
103 |
+
"StableDiffusion3_without_T5": [
|
104 |
+
("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips.safetensors", "models/stable_diffusion_3"),
|
105 |
+
],
|
106 |
+
# ControlNet
|
107 |
+
"ControlNet_v11f1p_sd15_depth": [
|
108 |
+
("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
|
109 |
+
("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
|
110 |
+
],
|
111 |
+
"ControlNet_v11p_sd15_softedge": [
|
112 |
+
("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
|
113 |
+
("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators")
|
114 |
+
],
|
115 |
+
"ControlNet_v11f1e_sd15_tile": [
|
116 |
+
("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
|
117 |
+
],
|
118 |
+
"ControlNet_v11p_sd15_lineart": [
|
119 |
+
("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
|
120 |
+
("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
|
121 |
+
("sd_lora/Annotators", "sk_model2.pth", "models/Annotators")
|
122 |
+
],
|
123 |
+
# AnimateDiff
|
124 |
+
"AnimateDiff_v2": [
|
125 |
+
("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
|
126 |
+
],
|
127 |
+
"AnimateDiff_xl_beta": [
|
128 |
+
("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
|
129 |
+
],
|
130 |
+
# RIFE
|
131 |
+
"RIFE": [
|
132 |
+
("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
|
133 |
+
],
|
134 |
+
# Beautiful Prompt
|
135 |
+
"BeautifulPrompt": [
|
136 |
+
("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
|
137 |
+
("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
|
138 |
+
("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
|
139 |
+
("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
|
140 |
+
("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
|
141 |
+
("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
|
142 |
+
],
|
143 |
+
# Translator
|
144 |
+
"opus-mt-zh-en": [
|
145 |
+
("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
|
146 |
+
("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
|
147 |
+
("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
|
148 |
+
("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
|
149 |
+
("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
|
150 |
+
("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
|
151 |
+
("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
|
152 |
+
("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
|
153 |
+
],
|
154 |
+
# IP-Adapter
|
155 |
+
"IP-Adapter-SD": [
|
156 |
+
("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
|
157 |
+
("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
|
158 |
+
],
|
159 |
+
"IP-Adapter-SDXL": [
|
160 |
+
("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
|
161 |
+
("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
|
162 |
+
],
|
163 |
+
# Kolors
|
164 |
+
"Kolors": [
|
165 |
+
("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
|
166 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
|
167 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
168 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
169 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
170 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
171 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
172 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
173 |
+
("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
|
174 |
+
("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
|
175 |
+
("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
|
176 |
+
],
|
177 |
+
"SDXL-vae-fp16-fix": [
|
178 |
+
("AI-ModelScope/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors", "models/sdxl-vae-fp16-fix")
|
179 |
+
],
|
180 |
+
}
|
181 |
+
Preset_model_id: TypeAlias = Literal[
|
182 |
+
"HunyuanDiT",
|
183 |
+
"stable-video-diffusion-img2vid-xt",
|
184 |
+
"ExVideo-SVD-128f-v1",
|
185 |
+
"StableDiffusion_v15",
|
186 |
+
"DreamShaper_8",
|
187 |
+
"AingDiffusion_v12",
|
188 |
+
"Flat2DAnimerge_v45Sharp",
|
189 |
+
"TextualInversion_VeryBadImageNegative_v1.3",
|
190 |
+
"StableDiffusionXL_v1",
|
191 |
+
"BluePencilXL_v200",
|
192 |
+
"StableDiffusionXL_Turbo",
|
193 |
+
"ControlNet_v11f1p_sd15_depth",
|
194 |
+
"ControlNet_v11p_sd15_softedge",
|
195 |
+
"ControlNet_v11f1e_sd15_tile",
|
196 |
+
"ControlNet_v11p_sd15_lineart",
|
197 |
+
"AnimateDiff_v2",
|
198 |
+
"AnimateDiff_xl_beta",
|
199 |
+
"RIFE",
|
200 |
+
"BeautifulPrompt",
|
201 |
+
"opus-mt-zh-en",
|
202 |
+
"IP-Adapter-SD",
|
203 |
+
"IP-Adapter-SDXL",
|
204 |
+
"StableDiffusion3",
|
205 |
+
"StableDiffusion3_without_T5",
|
206 |
+
"Kolors",
|
207 |
+
"SDXL-vae-fp16-fix",
|
208 |
+
]
|
209 |
+
Preset_model_website: TypeAlias = Literal[
|
210 |
+
"HuggingFace",
|
211 |
+
"ModelScope",
|
212 |
+
]
|
213 |
+
website_to_preset_models = {
|
214 |
+
"HuggingFace": preset_models_on_huggingface,
|
215 |
+
"ModelScope": preset_models_on_modelscope,
|
216 |
+
}
|
217 |
+
website_to_download_fn = {
|
218 |
+
"HuggingFace": download_from_huggingface,
|
219 |
+
"ModelScope": download_from_modelscope,
|
220 |
+
}
|
221 |
+
|
222 |
+
|
223 |
+
def download_models(
|
224 |
+
model_id_list: List[Preset_model_id] = [],
|
225 |
+
downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
|
226 |
+
):
|
227 |
+
downloaded_files = []
|
228 |
+
for model_id in model_id_list:
|
229 |
+
for website in downloading_priority:
|
230 |
+
if model_id in website_to_preset_models[website]:
|
231 |
+
for model_id, origin_file_path, local_dir in website_to_preset_models[website][model_id]:
|
232 |
+
# Check if the file is downloaded.
|
233 |
+
file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
|
234 |
+
if file_to_download in downloaded_files:
|
235 |
+
continue
|
236 |
+
# Download
|
237 |
+
website_to_download_fn[website](model_id, origin_file_path, local_dir)
|
238 |
+
if os.path.basename(origin_file_path) in os.listdir(local_dir):
|
239 |
+
downloaded_files.append(file_to_download)
|
240 |
+
return downloaded_files
|
241 |
+
|
242 |
+
|
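As a usage sketch (not part of the file), download_models resolves each preset ID against the registries above, tries the sites in downloading_priority order, and returns the local paths it verified on disk:

    files = download_models(model_id_list=["StableDiffusion_v15", "AnimateDiff_v2"])
    # Expected result if both downloads succeed:
    # ["models/stable_diffusion/v1-5-pruned-emaonly.safetensors",
    #  "models/AnimateDiff/mm_sd_v15_v2.ckpt"]
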
243 |
+
class ModelManager:
|
244 |
+
def __init__(
|
245 |
+
self,
|
246 |
+
torch_dtype=torch.float16,
|
247 |
+
device="cuda",
|
248 |
+
model_id_list: List[Preset_model_id] = [],
|
249 |
+
downloading_priority: List[Preset_model_website] = ["ModelScope", "HuggingFace"],
|
250 |
+
file_path_list: List[str] = [],
|
251 |
+
):
|
252 |
+
self.torch_dtype = torch_dtype
|
253 |
+
self.device = device
|
254 |
+
self.model = {}
|
255 |
+
self.model_path = {}
|
256 |
+
self.textual_inversion_dict = {}
|
257 |
+
downloaded_files = download_models(model_id_list, downloading_priority)
|
258 |
+
self.load_models(downloaded_files + file_path_list)
|
259 |
+
|
260 |
+
def load_model_from_origin(
|
261 |
+
self,
|
262 |
+
download_from: Preset_model_website = "ModelScope",
|
263 |
+
model_id = "",
|
264 |
+
origin_file_path = "",
|
265 |
+
local_dir = ""
|
266 |
+
):
|
267 |
+
website_to_download_fn[download_from](model_id, origin_file_path, local_dir)
|
268 |
+
file_to_download = os.path.join(local_dir, os.path.basename(origin_file_path))
|
269 |
+
self.load_model(file_to_download)
|
270 |
+
|
271 |
+
def is_stable_video_diffusion(self, state_dict):
|
272 |
+
param_name = "model.diffusion_model.output_blocks.9.1.time_stack.0.norm_in.weight"
|
273 |
+
return param_name in state_dict
|
274 |
+
|
275 |
+
def is_RIFE(self, state_dict):
|
276 |
+
param_name = "block_tea.convblock3.0.1.weight"
|
277 |
+
return param_name in state_dict or ("module." + param_name) in state_dict
|
278 |
+
|
279 |
+
def is_beautiful_prompt(self, state_dict):
|
280 |
+
param_name = "transformer.h.9.self_attention.query_key_value.weight"
|
281 |
+
return param_name in state_dict
|
282 |
+
|
283 |
+
def is_stabe_diffusion_xl(self, state_dict):
|
284 |
+
param_name = "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight"
|
285 |
+
return param_name in state_dict
|
286 |
+
|
287 |
+
def is_stable_diffusion(self, state_dict):
|
288 |
+
if self.is_stabe_diffusion_xl(state_dict):
|
289 |
+
return False
|
290 |
+
param_name = "model.diffusion_model.output_blocks.9.1.transformer_blocks.0.norm3.weight"
|
291 |
+
return param_name in state_dict
|
292 |
+
|
293 |
+
def is_controlnet(self, state_dict):
|
294 |
+
param_name = "control_model.time_embed.0.weight"
|
295 |
+
return param_name in state_dict
|
296 |
+
|
297 |
+
def is_animatediff(self, state_dict):
|
298 |
+
param_name = "mid_block.motion_modules.0.temporal_transformer.proj_out.weight"
|
299 |
+
return param_name in state_dict
|
300 |
+
|
301 |
+
def is_animatediff_xl(self, state_dict):
|
302 |
+
param_name = "up_blocks.2.motion_modules.2.temporal_transformer.transformer_blocks.0.ff_norm.weight"
|
303 |
+
return param_name in state_dict
|
304 |
+
|
305 |
+
def is_sd_lora(self, state_dict):
|
306 |
+
param_name = "lora_unet_up_blocks_3_attentions_2_transformer_blocks_0_ff_net_2.lora_up.weight"
|
307 |
+
return param_name in state_dict
|
308 |
+
|
309 |
+
def is_translator(self, state_dict):
|
310 |
+
param_name = "model.encoder.layers.5.self_attn_layer_norm.weight"
|
311 |
+
return param_name in state_dict and len(state_dict) == 258
|
312 |
+
|
313 |
+
def is_ipadapter(self, state_dict):
|
314 |
+
return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([3072, 1024])
|
315 |
+
|
316 |
+
def is_ipadapter_image_encoder(self, state_dict):
|
317 |
+
param_name = "vision_model.encoder.layers.31.self_attn.v_proj.weight"
|
318 |
+
return param_name in state_dict and len(state_dict) == 521
|
319 |
+
|
320 |
+
def is_ipadapter_xl(self, state_dict):
|
321 |
+
return "image_proj" in state_dict and "ip_adapter" in state_dict and state_dict["image_proj"]["proj.weight"].shape == torch.Size([8192, 1280])
|
322 |
+
|
323 |
+
def is_ipadapter_xl_image_encoder(self, state_dict):
|
324 |
+
param_name = "vision_model.encoder.layers.47.self_attn.v_proj.weight"
|
325 |
+
return param_name in state_dict and len(state_dict) == 777
|
326 |
+
|
327 |
+
def is_hunyuan_dit_clip_text_encoder(self, state_dict):
|
328 |
+
param_name = "bert.encoder.layer.23.attention.output.dense.weight"
|
329 |
+
return param_name in state_dict
|
330 |
+
|
331 |
+
def is_hunyuan_dit_t5_text_encoder(self, state_dict):
|
332 |
+
param_name = "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
|
333 |
+
param_name_ = "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
|
334 |
+
return param_name in state_dict and param_name_ in state_dict
|
335 |
+
|
336 |
+
def is_hunyuan_dit(self, state_dict):
|
337 |
+
param_name = "final_layer.adaLN_modulation.1.weight"
|
338 |
+
return param_name in state_dict
|
339 |
+
|
340 |
+
def is_diffusers_vae(self, state_dict):
|
341 |
+
param_name = "quant_conv.weight"
|
342 |
+
return param_name in state_dict
|
343 |
+
|
344 |
+
def is_ExVideo_StableVideoDiffusion(self, state_dict):
|
345 |
+
param_name = "blocks.185.positional_embedding.embeddings"
|
346 |
+
return param_name in state_dict
|
347 |
+
|
348 |
+
def is_stable_diffusion_3(self, state_dict):
|
349 |
+
param_names = [
|
350 |
+
"text_encoders.clip_l.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight",
|
351 |
+
"text_encoders.clip_g.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight",
|
352 |
+
"model.diffusion_model.joint_blocks.9.x_block.mlp.fc2.weight",
|
353 |
+
"first_stage_model.encoder.mid.block_2.norm2.weight",
|
354 |
+
"first_stage_model.decoder.mid.block_2.norm2.weight",
|
355 |
+
]
|
356 |
+
for param_name in param_names:
|
357 |
+
if param_name not in state_dict:
|
358 |
+
return False
|
359 |
+
return True
|
360 |
+
|
361 |
+
def is_stable_diffusion_3_t5(self, state_dict):
|
362 |
+
param_name = "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight"
|
363 |
+
return param_name in state_dict
|
364 |
+
|
365 |
+
def is_kolors_text_encoder(self, file_path):
|
366 |
+
file_list = os.listdir(file_path)
|
367 |
+
if "config.json" in file_list:
|
368 |
+
try:
|
369 |
+
with open(os.path.join(file_path, "config.json"), "r") as f:
|
370 |
+
config = json.load(f)
|
371 |
+
if config.get("model_type") == "chatglm":
|
372 |
+
return True
|
373 |
+
except:
|
374 |
+
pass
|
375 |
+
return False
|
376 |
+
|
377 |
+
def is_kolors_unet(self, state_dict):
|
378 |
+
return "up_blocks.2.resnets.2.time_emb_proj.weight" in state_dict and "encoder_hid_proj.weight" in state_dict
|
379 |
+
|
380 |
+
def load_stable_video_diffusion(self, state_dict, components=None, file_path="", add_positional_conv=None):
|
381 |
+
component_dict = {
|
382 |
+
"image_encoder": SVDImageEncoder,
|
383 |
+
"unet": SVDUNet,
|
384 |
+
"vae_decoder": SVDVAEDecoder,
|
385 |
+
"vae_encoder": SVDVAEEncoder,
|
386 |
+
}
|
387 |
+
if components is None:
|
388 |
+
components = ["image_encoder", "unet", "vae_decoder", "vae_encoder"]
|
389 |
+
for component in components:
|
390 |
+
if component == "unet":
|
391 |
+
self.model[component] = component_dict[component](add_positional_conv=add_positional_conv)
|
392 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict, add_positional_conv=add_positional_conv), strict=False)
|
393 |
+
else:
|
394 |
+
self.model[component] = component_dict[component]()
|
395 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
|
396 |
+
self.model[component].to(self.torch_dtype).to(self.device)
|
397 |
+
self.model_path[component] = file_path
|
398 |
+
|
399 |
+
def load_stable_diffusion(self, state_dict, components=None, file_path=""):
|
400 |
+
component_dict = {
|
401 |
+
"text_encoder": SDTextEncoder,
|
402 |
+
"unet": SDUNet,
|
403 |
+
"vae_decoder": SDVAEDecoder,
|
404 |
+
"vae_encoder": SDVAEEncoder,
|
405 |
+
"refiner": SDXLUNet,
|
406 |
+
}
|
407 |
+
if components is None:
|
408 |
+
components = ["text_encoder", "unet", "vae_decoder", "vae_encoder"]
|
409 |
+
for component in components:
|
410 |
+
if component == "text_encoder":
|
411 |
+
# Add additional token embeddings to text encoder
|
412 |
+
token_embeddings = [state_dict["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"]]
|
413 |
+
for keyword in self.textual_inversion_dict:
|
414 |
+
_, embeddings = self.textual_inversion_dict[keyword]
|
415 |
+
token_embeddings.append(embeddings.to(dtype=token_embeddings[0].dtype))
|
416 |
+
token_embeddings = torch.concat(token_embeddings, dim=0)
|
417 |
+
state_dict["cond_stage_model.transformer.text_model.embeddings.token_embedding.weight"] = token_embeddings
|
418 |
+
self.model[component] = component_dict[component](vocab_size=token_embeddings.shape[0])
|
419 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
|
420 |
+
self.model[component].to(self.torch_dtype).to(self.device)
|
421 |
+
else:
|
422 |
+
self.model[component] = component_dict[component]()
|
423 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
|
424 |
+
self.model[component].to(self.torch_dtype).to(self.device)
|
425 |
+
self.model_path[component] = file_path
|
426 |
+
|
427 |
+
def load_stable_diffusion_xl(self, state_dict, components=None, file_path=""):
|
428 |
+
component_dict = {
|
429 |
+
"text_encoder": SDXLTextEncoder,
|
430 |
+
"text_encoder_2": SDXLTextEncoder2,
|
431 |
+
"unet": SDXLUNet,
|
432 |
+
"vae_decoder": SDXLVAEDecoder,
|
433 |
+
"vae_encoder": SDXLVAEEncoder,
|
434 |
+
}
|
435 |
+
if components is None:
|
436 |
+
components = ["text_encoder", "text_encoder_2", "unet", "vae_decoder", "vae_encoder"]
|
437 |
+
for component in components:
|
438 |
+
self.model[component] = component_dict[component]()
|
439 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
|
440 |
+
if component in ["vae_decoder", "vae_encoder"]:
|
441 |
+
# These two models output NaN when float16 is enabled.
|
442 |
+
# The precision problem happens in the last three resnet blocks.
|
443 |
+
# I do not know how to solve this problem.
|
444 |
+
self.model[component].to(torch.float32).to(self.device)
|
445 |
+
else:
|
446 |
+
self.model[component].to(self.torch_dtype).to(self.device)
|
447 |
+
self.model_path[component] = file_path
|
448 |
+
|
449 |
+
def load_controlnet(self, state_dict, file_path=""):
|
450 |
+
component = "controlnet"
|
451 |
+
if component not in self.model:
|
452 |
+
self.model[component] = []
|
453 |
+
self.model_path[component] = []
|
454 |
+
model = SDControlNet()
|
455 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
456 |
+
model.to(self.torch_dtype).to(self.device)
|
457 |
+
self.model[component].append(model)
|
458 |
+
self.model_path[component].append(file_path)
|
459 |
+
|
460 |
+
def load_animatediff(self, state_dict, file_path=""):
|
461 |
+
component = "motion_modules"
|
462 |
+
model = SDMotionModel()
|
463 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
464 |
+
model.to(self.torch_dtype).to(self.device)
|
465 |
+
self.model[component] = model
|
466 |
+
self.model_path[component] = file_path
|
467 |
+
|
468 |
+
def load_animatediff_xl(self, state_dict, file_path=""):
|
469 |
+
component = "motion_modules_xl"
|
470 |
+
model = SDXLMotionModel()
|
471 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
472 |
+
model.to(self.torch_dtype).to(self.device)
|
473 |
+
self.model[component] = model
|
474 |
+
self.model_path[component] = file_path
|
475 |
+
|
476 |
+
def load_beautiful_prompt(self, state_dict, file_path=""):
|
477 |
+
component = "beautiful_prompt"
|
478 |
+
from transformers import AutoModelForCausalLM
|
479 |
+
model_folder = os.path.dirname(file_path)
|
480 |
+
model = AutoModelForCausalLM.from_pretrained(
|
481 |
+
model_folder, state_dict=state_dict, local_files_only=True, torch_dtype=self.torch_dtype
|
482 |
+
).to(self.device).eval()
|
483 |
+
self.model[component] = model
|
484 |
+
self.model_path[component] = file_path
|
485 |
+
|
486 |
+
def load_RIFE(self, state_dict, file_path=""):
|
487 |
+
component = "RIFE"
|
488 |
+
from ..extensions.RIFE import IFNet
|
489 |
+
model = IFNet().eval()
|
490 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
491 |
+
model.to(torch.float32).to(self.device)
|
492 |
+
self.model[component] = model
|
493 |
+
self.model_path[component] = file_path
|
494 |
+
|
495 |
+
def load_sd_lora(self, state_dict, alpha):
|
496 |
+
SDLoRA().add_lora_to_text_encoder(self.model["text_encoder"], state_dict, alpha=alpha, device=self.device)
|
497 |
+
SDLoRA().add_lora_to_unet(self.model["unet"], state_dict, alpha=alpha, device=self.device)
|
498 |
+
|
499 |
+
def load_translator(self, state_dict, file_path=""):
|
500 |
+
# This model is lightweight, so we do not place it on the GPU.
|
501 |
+
component = "translator"
|
502 |
+
from transformers import AutoModelForSeq2SeqLM
|
503 |
+
model_folder = os.path.dirname(file_path)
|
504 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_folder).eval()
|
505 |
+
self.model[component] = model
|
506 |
+
self.model_path[component] = file_path
|
507 |
+
|
508 |
+
def load_ipadapter(self, state_dict, file_path=""):
|
509 |
+
component = "ipadapter"
|
510 |
+
model = SDIpAdapter()
|
511 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
512 |
+
model.to(self.torch_dtype).to(self.device)
|
513 |
+
self.model[component] = model
|
514 |
+
self.model_path[component] = file_path
|
515 |
+
|
516 |
+
def load_ipadapter_image_encoder(self, state_dict, file_path=""):
|
517 |
+
component = "ipadapter_image_encoder"
|
518 |
+
model = IpAdapterCLIPImageEmbedder()
|
519 |
+
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
|
520 |
+
model.to(self.torch_dtype).to(self.device)
|
521 |
+
self.model[component] = model
|
522 |
+
self.model_path[component] = file_path
|
523 |
+
|
524 |
+
def load_ipadapter_xl(self, state_dict, file_path=""):
|
525 |
+
component = "ipadapter_xl"
|
526 |
+
model = SDXLIpAdapter()
|
527 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
528 |
+
model.to(self.torch_dtype).to(self.device)
|
529 |
+
self.model[component] = model
|
530 |
+
self.model_path[component] = file_path
|
531 |
+
|
532 |
+
def load_ipadapter_xl_image_encoder(self, state_dict, file_path=""):
|
533 |
+
component = "ipadapter_xl_image_encoder"
|
534 |
+
model = IpAdapterXLCLIPImageEmbedder()
|
535 |
+
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
|
536 |
+
model.to(self.torch_dtype).to(self.device)
|
537 |
+
self.model[component] = model
|
538 |
+
self.model_path[component] = file_path
|
539 |
+
|
540 |
+
def load_hunyuan_dit_clip_text_encoder(self, state_dict, file_path=""):
|
541 |
+
component = "hunyuan_dit_clip_text_encoder"
|
542 |
+
model = HunyuanDiTCLIPTextEncoder()
|
543 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
544 |
+
model.to(self.torch_dtype).to(self.device)
|
545 |
+
self.model[component] = model
|
546 |
+
self.model_path[component] = file_path
|
547 |
+
|
548 |
+
def load_hunyuan_dit_t5_text_encoder(self, state_dict, file_path=""):
|
549 |
+
component = "hunyuan_dit_t5_text_encoder"
|
550 |
+
model = HunyuanDiTT5TextEncoder()
|
551 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
552 |
+
model.to(self.torch_dtype).to(self.device)
|
553 |
+
self.model[component] = model
|
554 |
+
self.model_path[component] = file_path
|
555 |
+
|
556 |
+
def load_hunyuan_dit(self, state_dict, file_path=""):
|
557 |
+
component = "hunyuan_dit"
|
558 |
+
model = HunyuanDiT()
|
559 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
560 |
+
model.to(self.torch_dtype).to(self.device)
|
561 |
+
self.model[component] = model
|
562 |
+
self.model_path[component] = file_path
|
563 |
+
|
564 |
+
def load_diffusers_vae(self, state_dict, file_path=""):
|
565 |
+
# TODO: detect SD and SDXL
|
566 |
+
component = "vae_encoder"
|
567 |
+
model = SDXLVAEEncoder()
|
568 |
+
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
|
569 |
+
model.to(torch.float32).to(self.device)
|
570 |
+
self.model[component] = model
|
571 |
+
self.model_path[component] = file_path
|
572 |
+
component = "vae_decoder"
|
573 |
+
model = SDXLVAEDecoder()
|
574 |
+
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
|
575 |
+
model.to(torch.float32).to(self.device)
|
576 |
+
self.model[component] = model
|
577 |
+
self.model_path[component] = file_path
|
578 |
+
|
579 |
+
def load_ExVideo_StableVideoDiffusion(self, state_dict, file_path=""):
|
580 |
+
unet_state_dict = self.model["unet"].state_dict()
|
581 |
+
self.model["unet"].to("cpu")
|
582 |
+
del self.model["unet"]
|
583 |
+
add_positional_conv = state_dict["blocks.185.positional_embedding.embeddings"].shape[0]
|
584 |
+
self.model["unet"] = SVDUNet(add_positional_conv=add_positional_conv)
|
585 |
+
self.model["unet"].load_state_dict(unet_state_dict, strict=False)
|
586 |
+
self.model["unet"].load_state_dict(state_dict, strict=False)
|
587 |
+
self.model["unet"].to(self.torch_dtype).to(self.device)
|
588 |
+
|
589 |
+
def load_stable_diffusion_3(self, state_dict, components=None, file_path=""):
|
590 |
+
component_dict = {
|
591 |
+
"sd3_text_encoder_1": SD3TextEncoder1,
|
592 |
+
"sd3_text_encoder_2": SD3TextEncoder2,
|
593 |
+
"sd3_text_encoder_3": SD3TextEncoder3,
|
594 |
+
"sd3_dit": SD3DiT,
|
595 |
+
"sd3_vae_decoder": SD3VAEDecoder,
|
596 |
+
"sd3_vae_encoder": SD3VAEEncoder,
|
597 |
+
}
|
598 |
+
if components is None:
|
599 |
+
components = ["sd3_text_encoder_1", "sd3_text_encoder_2", "sd3_text_encoder_3", "sd3_dit", "sd3_vae_decoder", "sd3_vae_encoder"]
|
600 |
+
for component in components:
|
601 |
+
if component == "sd3_text_encoder_3":
|
602 |
+
if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight" not in state_dict:
|
603 |
+
continue
|
604 |
+
if component == "sd3_text_encoder_1":
|
605 |
+
# Add additional token embeddings to text encoder
|
606 |
+
token_embeddings = [state_dict["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"]]
|
607 |
+
for keyword in self.textual_inversion_dict:
|
608 |
+
_, embeddings = self.textual_inversion_dict[keyword]
|
609 |
+
token_embeddings.append(embeddings.to(dtype=token_embeddings[0].dtype))
|
610 |
+
token_embeddings = torch.concat(token_embeddings, dim=0)
|
611 |
+
state_dict["text_encoders.clip_l.transformer.text_model.embeddings.token_embedding.weight"] = token_embeddings
|
612 |
+
self.model[component] = component_dict[component](vocab_size=token_embeddings.shape[0])
|
613 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
|
614 |
+
self.model[component].to(self.torch_dtype).to(self.device)
|
615 |
+
else:
|
616 |
+
self.model[component] = component_dict[component]()
|
617 |
+
self.model[component].load_state_dict(self.model[component].state_dict_converter().from_civitai(state_dict))
|
618 |
+
self.model[component].to(self.torch_dtype).to(self.device)
|
619 |
+
self.model_path[component] = file_path
|
620 |
+
|
621 |
+
def load_stable_diffusion_3_t5(self, state_dict, file_path=""):
|
622 |
+
component = "sd3_text_encoder_3"
|
623 |
+
model = SD3TextEncoder3()
|
624 |
+
model.load_state_dict(model.state_dict_converter().from_civitai(state_dict))
|
625 |
+
model.to(self.torch_dtype).to(self.device)
|
626 |
+
self.model[component] = model
|
627 |
+
self.model_path[component] = file_path
|
628 |
+
|
629 |
+
def load_kolors_text_encoder(self, state_dict=None, file_path=""):
|
630 |
+
component = "kolors_text_encoder"
|
631 |
+
model = ChatGLMModel.from_pretrained(file_path, torch_dtype=self.torch_dtype)
|
632 |
+
model = model.to(dtype=self.torch_dtype, device=self.device)
|
633 |
+
self.model[component] = model
|
634 |
+
self.model_path[component] = file_path
|
635 |
+
|
636 |
+
def load_kolors_unet(self, state_dict, file_path=""):
|
637 |
+
component = "kolors_unet"
|
638 |
+
model = SDXLUNet(is_kolors=True)
|
639 |
+
model.load_state_dict(model.state_dict_converter().from_diffusers(state_dict))
|
640 |
+
model.to(self.torch_dtype).to(self.device)
|
641 |
+
self.model[component] = model
|
642 |
+
self.model_path[component] = file_path
|
643 |
+
|
644 |
+
def search_for_embeddings(self, state_dict):
|
645 |
+
embeddings = []
|
646 |
+
for k in state_dict:
|
647 |
+
if isinstance(state_dict[k], torch.Tensor):
|
648 |
+
embeddings.append(state_dict[k])
|
649 |
+
elif isinstance(state_dict[k], dict):
|
650 |
+
embeddings += self.search_for_embeddings(state_dict[k])
|
651 |
+
return embeddings
|
652 |
+
|
653 |
+
def load_textual_inversions(self, folder):
|
654 |
+
# Store additional tokens here
|
655 |
+
self.textual_inversion_dict = {}
|
656 |
+
|
657 |
+
# Load every textual inversion file
|
658 |
+
for file_name in os.listdir(folder):
|
659 |
+
if os.path.isdir(os.path.join(folder, file_name)) or \
|
660 |
+
not (file_name.endswith(".bin") or \
|
661 |
+
file_name.endswith(".safetensors") or \
|
662 |
+
file_name.endswith(".pth") or \
|
663 |
+
file_name.endswith(".pt")):
|
664 |
+
continue
|
665 |
+
keyword = os.path.splitext(file_name)[0]
|
666 |
+
state_dict = load_state_dict(os.path.join(folder, file_name))
|
667 |
+
|
668 |
+
# Search for embeddings
|
669 |
+
for embeddings in self.search_for_embeddings(state_dict):
|
670 |
+
if len(embeddings.shape) == 2 and embeddings.shape[1] == 768:
|
671 |
+
tokens = [f"{keyword}_{i}" for i in range(embeddings.shape[0])]
|
672 |
+
self.textual_inversion_dict[keyword] = (tokens, embeddings)
|
673 |
+
break
|
674 |
+
|
675 |
+
def load_model(self, file_path, components=None, lora_alphas=[]):
|
676 |
+
if os.path.isdir(file_path):
|
677 |
+
if self.is_kolors_text_encoder(file_path):
|
678 |
+
self.load_kolors_text_encoder(file_path=file_path)
|
679 |
+
return
|
680 |
+
state_dict = load_state_dict(file_path, torch_dtype=self.torch_dtype)
|
681 |
+
if self.is_stable_video_diffusion(state_dict):
|
682 |
+
self.load_stable_video_diffusion(state_dict, file_path=file_path)
|
683 |
+
elif self.is_animatediff(state_dict):
|
684 |
+
self.load_animatediff(state_dict, file_path=file_path)
|
685 |
+
elif self.is_animatediff_xl(state_dict):
|
686 |
+
self.load_animatediff_xl(state_dict, file_path=file_path)
|
687 |
+
elif self.is_controlnet(state_dict):
|
688 |
+
self.load_controlnet(state_dict, file_path=file_path)
|
689 |
+
elif self.is_stabe_diffusion_xl(state_dict):
|
690 |
+
self.load_stable_diffusion_xl(state_dict, components=components, file_path=file_path)
|
691 |
+
elif self.is_stable_diffusion(state_dict):
|
692 |
+
self.load_stable_diffusion(state_dict, components=components, file_path=file_path)
|
693 |
+
elif self.is_sd_lora(state_dict):
|
694 |
+
self.load_sd_lora(state_dict, alpha=lora_alphas.pop(0))
|
695 |
+
elif self.is_beautiful_prompt(state_dict):
|
696 |
+
self.load_beautiful_prompt(state_dict, file_path=file_path)
|
697 |
+
elif self.is_RIFE(state_dict):
|
698 |
+
self.load_RIFE(state_dict, file_path=file_path)
|
699 |
+
elif self.is_translator(state_dict):
|
700 |
+
self.load_translator(state_dict, file_path=file_path)
|
701 |
+
elif self.is_ipadapter(state_dict):
|
702 |
+
self.load_ipadapter(state_dict, file_path=file_path)
|
703 |
+
elif self.is_ipadapter_image_encoder(state_dict):
|
704 |
+
self.load_ipadapter_image_encoder(state_dict, file_path=file_path)
|
705 |
+
elif self.is_ipadapter_xl(state_dict):
|
706 |
+
self.load_ipadapter_xl(state_dict, file_path=file_path)
|
707 |
+
elif self.is_ipadapter_xl_image_encoder(state_dict):
|
708 |
+
self.load_ipadapter_xl_image_encoder(state_dict, file_path=file_path)
|
709 |
+
elif self.is_hunyuan_dit_clip_text_encoder(state_dict):
|
710 |
+
self.load_hunyuan_dit_clip_text_encoder(state_dict, file_path=file_path)
|
711 |
+
elif self.is_hunyuan_dit_t5_text_encoder(state_dict):
|
712 |
+
self.load_hunyuan_dit_t5_text_encoder(state_dict, file_path=file_path)
|
713 |
+
elif self.is_hunyuan_dit(state_dict):
|
714 |
+
self.load_hunyuan_dit(state_dict, file_path=file_path)
|
715 |
+
elif self.is_diffusers_vae(state_dict):
|
716 |
+
self.load_diffusers_vae(state_dict, file_path=file_path)
|
717 |
+
elif self.is_ExVideo_StableVideoDiffusion(state_dict):
|
718 |
+
self.load_ExVideo_StableVideoDiffusion(state_dict, file_path=file_path)
|
719 |
+
elif self.is_stable_diffusion_3(state_dict):
|
720 |
+
self.load_stable_diffusion_3(state_dict, components=components, file_path=file_path)
|
721 |
+
elif self.is_stable_diffusion_3_t5(state_dict):
|
722 |
+
self.load_stable_diffusion_3_t5(state_dict, file_path=file_path)
|
723 |
+
elif self.is_kolors_unet(state_dict):
|
724 |
+
self.load_kolors_unet(state_dict, file_path=file_path)
|
725 |
+
|
726 |
+
def load_models(self, file_path_list, lora_alphas=[]):
|
727 |
+
for file_path in file_path_list:
|
728 |
+
self.load_model(file_path, lora_alphas=lora_alphas)
|
729 |
+
|
730 |
+
def to(self, device):
|
731 |
+
for component in self.model:
|
732 |
+
if isinstance(self.model[component], list):
|
733 |
+
for model in self.model[component]:
|
734 |
+
model.to(device)
|
735 |
+
else:
|
736 |
+
self.model[component].to(device)
|
737 |
+
torch.cuda.empty_cache()
|
738 |
+
|
739 |
+
def get_model_with_model_path(self, model_path):
|
740 |
+
for component in self.model_path:
|
741 |
+
if isinstance(self.model_path[component], str):
|
742 |
+
if os.path.samefile(self.model_path[component], model_path):
|
743 |
+
return self.model[component]
|
744 |
+
elif isinstance(self.model_path[component], list):
|
745 |
+
for i, model_path_ in enumerate(self.model_path[component]):
|
746 |
+
if os.path.samefile(model_path_, model_path):
|
747 |
+
return self.model[component][i]
|
748 |
+
raise ValueError(f"Please load model {model_path} before you use it.")
|
749 |
+
|
750 |
+
def __getattr__(self, __name):
|
751 |
+
if __name in self.model:
|
752 |
+
return self.model[__name]
|
753 |
+
else:
|
754 |
+
return super.__getattribute__(__name)
|
755 |
+
|
756 |
+
|
757 |
+
def load_state_dict(file_path, torch_dtype=None):
|
758 |
+
if file_path.endswith(".safetensors"):
|
759 |
+
return load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype)
|
760 |
+
else:
|
761 |
+
return load_state_dict_from_bin(file_path, torch_dtype=torch_dtype)
|
762 |
+
|
763 |
+
|
764 |
+
def load_state_dict_from_safetensors(file_path, torch_dtype=None):
|
765 |
+
state_dict = {}
|
766 |
+
with safe_open(file_path, framework="pt", device="cpu") as f:
|
767 |
+
for k in f.keys():
|
768 |
+
state_dict[k] = f.get_tensor(k)
|
769 |
+
if torch_dtype is not None:
|
770 |
+
state_dict[k] = state_dict[k].to(torch_dtype)
|
771 |
+
return state_dict
|
772 |
+
|
773 |
+
|
774 |
+
def load_state_dict_from_bin(file_path, torch_dtype=None):
|
775 |
+
state_dict = torch.load(file_path, map_location="cpu")
|
776 |
+
if torch_dtype is not None:
|
777 |
+
for i in state_dict:
|
778 |
+
if isinstance(state_dict[i], torch.Tensor):
|
779 |
+
state_dict[i] = state_dict[i].to(torch_dtype)
|
780 |
+
return state_dict
|
781 |
+
|
782 |
+
|
783 |
+
def search_parameter(param, state_dict):
|
784 |
+
for name, param_ in state_dict.items():
|
785 |
+
if param.numel() == param_.numel():
|
786 |
+
if param.shape == param_.shape:
|
787 |
+
if torch.dist(param, param_) < 1e-6:
|
788 |
+
return name
|
789 |
+
else:
|
790 |
+
if torch.dist(param.flatten(), param_.flatten()) < 1e-6:
|
791 |
+
return name
|
792 |
+
return None
|
793 |
+
|
794 |
+
|
795 |
+
def build_rename_dict(source_state_dict, target_state_dict, split_qkv=False):
|
796 |
+
matched_keys = set()
|
797 |
+
with torch.no_grad():
|
798 |
+
for name in source_state_dict:
|
799 |
+
rename = search_parameter(source_state_dict[name], target_state_dict)
|
800 |
+
if rename is not None:
|
801 |
+
print(f'"{name}": "{rename}",')
|
802 |
+
matched_keys.add(rename)
|
803 |
+
elif split_qkv and len(source_state_dict[name].shape)>=1 and source_state_dict[name].shape[0]%3==0:
|
804 |
+
length = source_state_dict[name].shape[0] // 3
|
805 |
+
rename = []
|
806 |
+
for i in range(3):
|
807 |
+
rename.append(search_parameter(source_state_dict[name][i*length: i*length+length], target_state_dict))
|
808 |
+
if None not in rename:
|
809 |
+
print(f'"{name}": {rename},')
|
810 |
+
for rename_ in rename:
|
811 |
+
matched_keys.add(rename_)
|
812 |
+
for name in target_state_dict:
|
813 |
+
if name not in matched_keys:
|
814 |
+
print("Cannot find", name, target_state_dict[name].shape)
|
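For reference, a typical way to drive ModelManager from user code is to let it download and load a preset in one step; this is a minimal sketch (the preset, dtype, and CUDA device are arbitrary example choices, and the loaded components would normally be consumed by a pipeline elsewhere in the repository):

    import torch
    from diffsynth.models import ModelManager

    model_manager = ModelManager(
        torch_dtype=torch.float16,
        device="cuda",
        model_id_list=["StableDiffusion_v15"],  # fetched via the preset tables above
    )
    unet = model_manager.unet  # loaded components are exposed through __getattr__
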
diffsynth/models/attention.py
ADDED
@@ -0,0 +1,89 @@
import torch
from einops import rearrange


def low_version_attention(query, key, value, attn_bias=None):
    scale = 1 / query.shape[-1] ** 0.5
    query = query * scale
    attn = torch.matmul(query, key.transpose(-2, -1))
    if attn_bias is not None:
        attn = attn + attn_bias
    attn = attn.softmax(-1)
    return attn @ value


class Attention(torch.nn.Module):

    def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
        super().__init__()
        dim_inner = head_dim * num_heads
        kv_dim = kv_dim if kv_dim is not None else q_dim
        self.num_heads = num_heads
        self.head_dim = head_dim

        self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
        self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
        self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
        self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)

    def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0):
        batch_size = q.shape[0]
        ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
        hidden_states = hidden_states + scale * ip_hidden_states
        return hidden_states

    def torch_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        batch_size = encoder_hidden_states.shape[0]

        q = self.to_q(hidden_states)
        k = self.to_k(encoder_hidden_states)
        v = self.to_v(encoder_hidden_states)

        q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        if qkv_preprocessor is not None:
            q, k, v = qkv_preprocessor(q, k, v)

        hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
        if ipadapter_kwargs is not None:
            hidden_states = self.interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs)
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
        hidden_states = hidden_states.to(q.dtype)

        hidden_states = self.to_out(hidden_states)

        return hidden_states

    def xformers_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        q = self.to_q(hidden_states)
        k = self.to_k(encoder_hidden_states)
        v = self.to_v(encoder_hidden_states)

        q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads)
        k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads)
        v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads)

        if attn_mask is not None:
            hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask)
        else:
            import xformers.ops as xops
            hidden_states = xops.memory_efficient_attention(q, k, v)
        hidden_states = rearrange(hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads)

        hidden_states = hidden_states.to(q.dtype)
        hidden_states = self.to_out(hidden_states)

        return hidden_states

    def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
        return self.torch_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask, ipadapter_kwargs=ipadapter_kwargs, qkv_preprocessor=qkv_preprocessor)
diffsynth/models/downloader.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from huggingface_hub import hf_hub_download
|
2 |
+
from modelscope import snapshot_download
|
3 |
+
import os, shutil
|
4 |
+
|
5 |
+
|
6 |
+
def download_from_modelscope(model_id, origin_file_path, local_dir):
|
7 |
+
os.makedirs(local_dir, exist_ok=True)
|
8 |
+
if os.path.basename(origin_file_path) in os.listdir(local_dir):
|
9 |
+
print(f"{os.path.basename(origin_file_path)} has been already in {local_dir}.")
|
10 |
+
return
|
11 |
+
else:
|
12 |
+
print(f"Start downloading {os.path.join(local_dir, os.path.basename(origin_file_path))}")
|
13 |
+
snapshot_download(model_id, allow_file_pattern=origin_file_path, local_dir=local_dir)
|
14 |
+
downloaded_file_path = os.path.join(local_dir, origin_file_path)
|
15 |
+
target_file_path = os.path.join(local_dir, os.path.split(origin_file_path)[-1])
|
16 |
+
if downloaded_file_path != target_file_path:
|
17 |
+
shutil.move(downloaded_file_path, target_file_path)
|
18 |
+
shutil.rmtree(os.path.join(local_dir, origin_file_path.split("/")[0]))
|
19 |
+
|
20 |
+
|
21 |
+
def download_from_huggingface(model_id, origin_file_path, local_dir):
|
22 |
+
os.makedirs(local_dir, exist_ok=True)
|
23 |
+
if os.path.basename(origin_file_path) in os.listdir(local_dir):
|
24 |
+
print(f"{os.path.basename(origin_file_path)} has been already in {local_dir}.")
|
25 |
+
return
|
26 |
+
else:
|
27 |
+
print(f"Start downloading {os.path.join(local_dir, os.path.basename(origin_file_path))}")
|
28 |
+
hf_hub_download(model_id, origin_file_path, local_dir=local_dir)
|
diffsynth/models/hunyuan_dit.py
ADDED
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .attention import Attention
|
2 |
+
from .tiler import TileWorker
|
3 |
+
from einops import repeat, rearrange
|
4 |
+
import math
|
5 |
+
import torch
|
6 |
+
|
7 |
+
|
8 |
+
class HunyuanDiTRotaryEmbedding(torch.nn.Module):
|
9 |
+
|
10 |
+
def __init__(self, q_norm_shape=88, k_norm_shape=88, rotary_emb_on_k=True):
|
11 |
+
super().__init__()
|
12 |
+
self.q_norm = torch.nn.LayerNorm((q_norm_shape,), elementwise_affine=True, eps=1e-06)
|
13 |
+
self.k_norm = torch.nn.LayerNorm((k_norm_shape,), elementwise_affine=True, eps=1e-06)
|
14 |
+
self.rotary_emb_on_k = rotary_emb_on_k
|
15 |
+
self.k_cache, self.v_cache = [], []
|
16 |
+
|
17 |
+
def reshape_for_broadcast(self, freqs_cis, x):
|
18 |
+
ndim = x.ndim
|
19 |
+
shape = [d if i == ndim - 2 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
|
20 |
+
return freqs_cis[0].view(*shape), freqs_cis[1].view(*shape)
|
21 |
+
|
22 |
+
def rotate_half(self, x):
|
23 |
+
x_real, x_imag = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
|
24 |
+
return torch.stack([-x_imag, x_real], dim=-1).flatten(3)
|
25 |
+
|
26 |
+
def apply_rotary_emb(self, xq, xk, freqs_cis):
|
27 |
+
xk_out = None
|
28 |
+
cos, sin = self.reshape_for_broadcast(freqs_cis, xq)
|
29 |
+
cos, sin = cos.to(xq.device), sin.to(xq.device)
|
30 |
+
xq_out = (xq.float() * cos + self.rotate_half(xq.float()) * sin).type_as(xq)
|
31 |
+
if xk is not None:
|
32 |
+
xk_out = (xk.float() * cos + self.rotate_half(xk.float()) * sin).type_as(xk)
|
33 |
+
return xq_out, xk_out
|
34 |
+
|
35 |
+
def forward(self, q, k, v, freqs_cis_img, to_cache=False):
|
36 |
+
# norm
|
37 |
+
q = self.q_norm(q)
|
38 |
+
k = self.k_norm(k)
|
39 |
+
|
40 |
+
# RoPE
|
41 |
+
if self.rotary_emb_on_k:
|
42 |
+
q, k = self.apply_rotary_emb(q, k, freqs_cis_img)
|
43 |
+
else:
|
44 |
+
q, _ = self.apply_rotary_emb(q, None, freqs_cis_img)
|
45 |
+
|
46 |
+
if to_cache:
|
47 |
+
self.k_cache.append(k)
|
48 |
+
self.v_cache.append(v)
|
49 |
+
elif len(self.k_cache) > 0 and len(self.v_cache) > 0:
|
50 |
+
k = torch.concat([k] + self.k_cache, dim=2)
|
51 |
+
v = torch.concat([v] + self.v_cache, dim=2)
|
52 |
+
self.k_cache, self.v_cache = [], []
|
53 |
+
return q, k, v
|
54 |
+
|
55 |
+
|
56 |
+
class FP32_Layernorm(torch.nn.LayerNorm):
|
57 |
+
def forward(self, inputs):
|
58 |
+
origin_dtype = inputs.dtype
|
59 |
+
return torch.nn.functional.layer_norm(inputs.float(), self.normalized_shape, self.weight.float(), self.bias.float(), self.eps).to(origin_dtype)
|
60 |
+
|
61 |
+
|
62 |
+
class FP32_SiLU(torch.nn.SiLU):
|
63 |
+
def forward(self, inputs):
|
64 |
+
origin_dtype = inputs.dtype
|
65 |
+
return torch.nn.functional.silu(inputs.float(), inplace=False).to(origin_dtype)
|
66 |
+
|
67 |
+
|
68 |
+
class HunyuanDiTFinalLayer(torch.nn.Module):
|
69 |
+
def __init__(self, final_hidden_size=1408, condition_dim=1408, patch_size=2, out_channels=8):
|
70 |
+
super().__init__()
|
71 |
+
self.norm_final = torch.nn.LayerNorm(final_hidden_size, elementwise_affine=False, eps=1e-6)
|
72 |
+
self.linear = torch.nn.Linear(final_hidden_size, patch_size * patch_size * out_channels, bias=True)
|
73 |
+
self.adaLN_modulation = torch.nn.Sequential(
|
74 |
+
FP32_SiLU(),
|
75 |
+
torch.nn.Linear(condition_dim, 2 * final_hidden_size, bias=True)
|
76 |
+
)
|
77 |
+
|
78 |
+
def modulate(self, x, shift, scale):
|
79 |
+
return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
|
80 |
+
|
81 |
+
def forward(self, hidden_states, condition_emb):
|
82 |
+
shift, scale = self.adaLN_modulation(condition_emb).chunk(2, dim=1)
|
83 |
+
hidden_states = self.modulate(self.norm_final(hidden_states), shift, scale)
|
84 |
+
hidden_states = self.linear(hidden_states)
|
85 |
+
return hidden_states
|
86 |
+
|
87 |
+
|
88 |
+
class HunyuanDiTBlock(torch.nn.Module):
|
89 |
+
|
90 |
+
def __init__(
|
91 |
+
self,
|
92 |
+
hidden_dim=1408,
|
93 |
+
condition_dim=1408,
|
94 |
+
num_heads=16,
|
95 |
+
mlp_ratio=4.3637,
|
96 |
+
text_dim=1024,
|
97 |
+
skip_connection=False
|
98 |
+
):
|
99 |
+
super().__init__()
|
100 |
+
self.norm1 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
|
101 |
+
self.rota1 = HunyuanDiTRotaryEmbedding(hidden_dim//num_heads, hidden_dim//num_heads)
|
102 |
+
self.attn1 = Attention(hidden_dim, num_heads, hidden_dim//num_heads, bias_q=True, bias_kv=True, bias_out=True)
|
103 |
+
self.norm2 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
|
104 |
+
self.rota2 = HunyuanDiTRotaryEmbedding(hidden_dim//num_heads, hidden_dim//num_heads, rotary_emb_on_k=False)
|
105 |
+
self.attn2 = Attention(hidden_dim, num_heads, hidden_dim//num_heads, kv_dim=text_dim, bias_q=True, bias_kv=True, bias_out=True)
|
106 |
+
self.norm3 = FP32_Layernorm((hidden_dim,), eps=1e-6, elementwise_affine=True)
|
107 |
+
self.modulation = torch.nn.Sequential(FP32_SiLU(), torch.nn.Linear(condition_dim, hidden_dim, bias=True))
|
108 |
+
self.mlp = torch.nn.Sequential(
|
109 |
+
torch.nn.Linear(hidden_dim, int(hidden_dim*mlp_ratio), bias=True),
|
110 |
+
torch.nn.GELU(approximate="tanh"),
|
111 |
+
torch.nn.Linear(int(hidden_dim*mlp_ratio), hidden_dim, bias=True)
|
112 |
+
)
|
113 |
+
if skip_connection:
|
114 |
+
self.skip_norm = FP32_Layernorm((hidden_dim * 2,), eps=1e-6, elementwise_affine=True)
|
115 |
+
self.skip_linear = torch.nn.Linear(hidden_dim * 2, hidden_dim, bias=True)
|
116 |
+
else:
|
117 |
+
self.skip_norm, self.skip_linear = None, None
|
118 |
+
|
119 |
+
def forward(self, hidden_states, condition_emb, text_emb, freq_cis_img, residual=None, to_cache=False):
|
120 |
+
# Long Skip Connection
|
121 |
+
if self.skip_norm is not None and self.skip_linear is not None:
|
122 |
+
hidden_states = torch.cat([hidden_states, residual], dim=-1)
|
123 |
+
hidden_states = self.skip_norm(hidden_states)
|
124 |
+
hidden_states = self.skip_linear(hidden_states)
|
125 |
+
|
126 |
+
# Self-Attention
|
127 |
+
shift_msa = self.modulation(condition_emb).unsqueeze(dim=1)
|
128 |
+
attn_input = self.norm1(hidden_states) + shift_msa
|
129 |
+
hidden_states = hidden_states + self.attn1(attn_input, qkv_preprocessor=lambda q, k, v: self.rota1(q, k, v, freq_cis_img, to_cache=to_cache))
|
130 |
+
|
131 |
+
# Cross-Attention
|
132 |
+
attn_input = self.norm3(hidden_states)
|
133 |
+
hidden_states = hidden_states + self.attn2(attn_input, text_emb, qkv_preprocessor=lambda q, k, v: self.rota2(q, k, v, freq_cis_img))
|
134 |
+
|
135 |
+
# FFN Layer
|
136 |
+
mlp_input = self.norm2(hidden_states)
|
137 |
+
hidden_states = hidden_states + self.mlp(mlp_input)
|
138 |
+
return hidden_states
|
139 |
+
|
140 |
+
|
141 |
+
class AttentionPool(torch.nn.Module):
|
142 |
+
def __init__(self, spacial_dim, embed_dim, num_heads, output_dim = None):
|
143 |
+
super().__init__()
|
144 |
+
self.positional_embedding = torch.nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
|
145 |
+
self.k_proj = torch.nn.Linear(embed_dim, embed_dim)
|
146 |
+
self.q_proj = torch.nn.Linear(embed_dim, embed_dim)
|
147 |
+
self.v_proj = torch.nn.Linear(embed_dim, embed_dim)
|
148 |
+
self.c_proj = torch.nn.Linear(embed_dim, output_dim or embed_dim)
|
149 |
+
self.num_heads = num_heads
|
150 |
+
|
151 |
+
def forward(self, x):
|
152 |
+
x = x.permute(1, 0, 2) # NLC -> LNC
|
153 |
+
x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (L+1)NC
|
154 |
+
x = x + self.positional_embedding[:, None, :].to(x.dtype) # (L+1)NC
|
155 |
+
x, _ = torch.nn.functional.multi_head_attention_forward(
|
156 |
+
query=x[:1], key=x, value=x,
|
157 |
+
embed_dim_to_check=x.shape[-1],
|
158 |
+
num_heads=self.num_heads,
|
159 |
+
q_proj_weight=self.q_proj.weight,
|
160 |
+
k_proj_weight=self.k_proj.weight,
|
161 |
+
v_proj_weight=self.v_proj.weight,
|
162 |
+
in_proj_weight=None,
|
163 |
+
in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
|
164 |
+
bias_k=None,
|
165 |
+
bias_v=None,
|
166 |
+
add_zero_attn=False,
|
167 |
+
dropout_p=0,
|
168 |
+
out_proj_weight=self.c_proj.weight,
|
169 |
+
out_proj_bias=self.c_proj.bias,
|
170 |
+
use_separate_proj_weight=True,
|
171 |
+
training=self.training,
|
172 |
+
need_weights=False
|
173 |
+
)
|
174 |
+
return x.squeeze(0)
|
175 |
+
|
176 |
+
|
177 |
+
class PatchEmbed(torch.nn.Module):
|
178 |
+
def __init__(
|
179 |
+
self,
|
180 |
+
patch_size=(2, 2),
|
181 |
+
in_chans=4,
|
182 |
+
embed_dim=1408,
|
183 |
+
bias=True,
|
184 |
+
):
|
185 |
+
super().__init__()
|
186 |
+
self.proj = torch.nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
|
187 |
+
|
188 |
+
def forward(self, x):
|
189 |
+
x = self.proj(x)
|
190 |
+
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
|
191 |
+
return x
|
192 |
+
|
193 |
+
|
194 |
+
def timestep_embedding(t, dim, max_period=10000, repeat_only=False):
|
195 |
+
# https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
|
196 |
+
if not repeat_only:
|
197 |
+
half = dim // 2
|
198 |
+
freqs = torch.exp(
|
199 |
+
-math.log(max_period)
|
200 |
+
* torch.arange(start=0, end=half, dtype=torch.float32)
|
201 |
+
/ half
|
202 |
+
).to(device=t.device) # size: [dim/2], 一个指数衰减的曲线
|
203 |
+
args = t[:, None].float() * freqs[None]
|
204 |
+
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
205 |
+
if dim % 2:
|
206 |
+
embedding = torch.cat(
|
207 |
+
[embedding, torch.zeros_like(embedding[:, :1])], dim=-1
|
208 |
+
)
|
209 |
+
else:
|
210 |
+
embedding = repeat(t, "b -> b d", d=dim)
|
211 |
+
return embedding
|
212 |
+
|
213 |
+
|
214 |
+
class TimestepEmbedder(torch.nn.Module):
|
215 |
+
def __init__(self, hidden_size=1408, frequency_embedding_size=256):
|
216 |
+
super().__init__()
|
217 |
+
self.mlp = torch.nn.Sequential(
|
218 |
+
torch.nn.Linear(frequency_embedding_size, hidden_size, bias=True),
|
219 |
+
torch.nn.SiLU(),
|
220 |
+
torch.nn.Linear(hidden_size, hidden_size, bias=True),
|
221 |
+
)
|
222 |
+
self.frequency_embedding_size = frequency_embedding_size
|
223 |
+
|
224 |
+
def forward(self, t):
|
225 |
+
t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype)
|
226 |
+
t_emb = self.mlp(t_freq)
|
227 |
+
return t_emb
|
228 |
+
|
229 |
+
|
230 |
+
class HunyuanDiT(torch.nn.Module):
|
231 |
+
def __init__(self, num_layers_down=21, num_layers_up=19, in_channels=4, out_channels=8, hidden_dim=1408, text_dim=1024, t5_dim=2048, text_length=77, t5_length=256):
|
232 |
+
super().__init__()
|
233 |
+
|
234 |
+
# Embedders
|
235 |
+
self.text_emb_padding = torch.nn.Parameter(torch.randn(text_length + t5_length, text_dim, dtype=torch.float32))
|
236 |
+
self.t5_embedder = torch.nn.Sequential(
|
237 |
+
torch.nn.Linear(t5_dim, t5_dim * 4, bias=True),
|
238 |
+
FP32_SiLU(),
|
239 |
+
torch.nn.Linear(t5_dim * 4, text_dim, bias=True),
|
240 |
+
)
|
241 |
+
self.t5_pooler = AttentionPool(t5_length, t5_dim, num_heads=8, output_dim=1024)
|
242 |
+
self.style_embedder = torch.nn.Parameter(torch.randn(hidden_dim))
|
243 |
+
self.patch_embedder = PatchEmbed(in_chans=in_channels)
|
244 |
+
self.timestep_embedder = TimestepEmbedder()
|
245 |
+
self.extra_embedder = torch.nn.Sequential(
|
246 |
+
torch.nn.Linear(256 * 6 + 1024 + hidden_dim, hidden_dim * 4),
|
247 |
+
FP32_SiLU(),
|
248 |
+
torch.nn.Linear(hidden_dim * 4, hidden_dim),
|
249 |
+
)
|
250 |
+
|
251 |
+
# Transformer blocks
|
252 |
+
self.num_layers_down = num_layers_down
|
253 |
+
self.num_layers_up = num_layers_up
|
254 |
+
self.blocks = torch.nn.ModuleList(
|
255 |
+
[HunyuanDiTBlock(skip_connection=False) for _ in range(num_layers_down)] + \
|
256 |
+
[HunyuanDiTBlock(skip_connection=True) for _ in range(num_layers_up)]
|
257 |
+
)
|
258 |
+
|
259 |
+
# Output layers
|
260 |
+
self.final_layer = HunyuanDiTFinalLayer()
|
261 |
+
self.out_channels = out_channels
|
262 |
+
|
263 |
+
def prepare_text_emb(self, text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5):
|
264 |
+
text_emb_mask = text_emb_mask.bool()
|
265 |
+
text_emb_mask_t5 = text_emb_mask_t5.bool()
|
266 |
+
text_emb_t5 = self.t5_embedder(text_emb_t5)
|
267 |
+
text_emb = torch.cat([text_emb, text_emb_t5], dim=1)
|
268 |
+
text_emb_mask = torch.cat([text_emb_mask, text_emb_mask_t5], dim=-1)
|
269 |
+
text_emb = torch.where(text_emb_mask.unsqueeze(2), text_emb, self.text_emb_padding.to(text_emb))
|
270 |
+
return text_emb
|
271 |
+
|
272 |
+
def prepare_extra_emb(self, text_emb_t5, timestep, size_emb, dtype, batch_size):
|
273 |
+
# Text embedding
|
274 |
+
pooled_text_emb_t5 = self.t5_pooler(text_emb_t5)
|
275 |
+
|
276 |
+
# Timestep embedding
|
277 |
+
timestep_emb = self.timestep_embedder(timestep)
|
278 |
+
|
279 |
+
# Size embedding
|
280 |
+
size_emb = timestep_embedding(size_emb.view(-1), 256).to(dtype)
|
281 |
+
size_emb = size_emb.view(-1, 6 * 256)
|
282 |
+
|
283 |
+
# Style embedding
|
284 |
+
style_emb = repeat(self.style_embedder, "D -> B D", B=batch_size)
|
285 |
+
|
286 |
+
# Concatenate all extra vectors
|
287 |
+
extra_emb = torch.cat([pooled_text_emb_t5, size_emb, style_emb], dim=1)
|
288 |
+
condition_emb = timestep_emb + self.extra_embedder(extra_emb)
|
289 |
+
|
290 |
+
return condition_emb
|
291 |
+
|
292 |
+
def unpatchify(self, x, h, w):
|
293 |
+
return rearrange(x, "B (H W) (P Q C) -> B C (H P) (W Q)", H=h, W=w, P=2, Q=2)
|
294 |
+
|
295 |
+
def build_mask(self, data, is_bound):
|
296 |
+
_, _, H, W = data.shape
|
297 |
+
h = repeat(torch.arange(H), "H -> H W", H=H, W=W)
|
298 |
+
w = repeat(torch.arange(W), "W -> H W", H=H, W=W)
|
299 |
+
border_width = (H + W) // 4
|
300 |
+
pad = torch.ones_like(h) * border_width
|
301 |
+
mask = torch.stack([
|
302 |
+
pad if is_bound[0] else h + 1,
|
303 |
+
pad if is_bound[1] else H - h,
|
304 |
+
pad if is_bound[2] else w + 1,
|
305 |
+
pad if is_bound[3] else W - w
|
306 |
+
]).min(dim=0).values
|
307 |
+
mask = mask.clip(1, border_width)
|
308 |
+
mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
|
309 |
+
mask = rearrange(mask, "H W -> 1 H W")
|
310 |
+
return mask
|
311 |
+
|
312 |
+
def tiled_block_forward(self, block, hidden_states, condition_emb, text_emb, freq_cis_img, residual, torch_dtype, data_device, computation_device, tile_size, tile_stride):
|
313 |
+
B, C, H, W = hidden_states.shape
|
314 |
+
|
315 |
+
weight = torch.zeros((1, 1, H, W), dtype=torch_dtype, device=data_device)
|
316 |
+
values = torch.zeros((B, C, H, W), dtype=torch_dtype, device=data_device)
|
317 |
+
|
318 |
+
# Split tasks
|
319 |
+
tasks = []
|
320 |
+
for h in range(0, H, tile_stride):
|
321 |
+
for w in range(0, W, tile_stride):
|
322 |
+
if (h-tile_stride >= 0 and h-tile_stride+tile_size >= H) or (w-tile_stride >= 0 and w-tile_stride+tile_size >= W):
|
323 |
+
continue
|
324 |
+
h_, w_ = h + tile_size, w + tile_size
|
325 |
+
if h_ > H: h, h_ = H - tile_size, H
|
326 |
+
if w_ > W: w, w_ = W - tile_size, W
|
327 |
+
tasks.append((h, h_, w, w_))
|
328 |
+
|
329 |
+
# Run
|
330 |
+
for hl, hr, wl, wr in tasks:
|
331 |
+
hidden_states_batch = hidden_states[:, :, hl:hr, wl:wr].to(computation_device)
|
332 |
+
hidden_states_batch = rearrange(hidden_states_batch, "B C H W -> B (H W) C")
|
333 |
+
if residual is not None:
|
334 |
+
residual_batch = residual[:, :, hl:hr, wl:wr].to(computation_device)
|
335 |
+
residual_batch = rearrange(residual_batch, "B C H W -> B (H W) C")
|
336 |
+
else:
|
337 |
+
residual_batch = None
|
338 |
+
|
339 |
+
# Forward
|
340 |
+
hidden_states_batch = block(hidden_states_batch, condition_emb, text_emb, freq_cis_img, residual_batch).to(data_device)
|
341 |
+
hidden_states_batch = rearrange(hidden_states_batch, "B (H W) C -> B C H W", H=hr-hl)
|
342 |
+
|
343 |
+
mask = self.build_mask(hidden_states_batch, is_bound=(hl==0, hr>=H, wl==0, wr>=W))
|
344 |
+
values[:, :, hl:hr, wl:wr] += hidden_states_batch * mask
|
345 |
+
weight[:, :, hl:hr, wl:wr] += mask
|
346 |
+
values /= weight
|
347 |
+
return values
|
348 |
+
|
349 |
+
def forward(
|
350 |
+
self, hidden_states, text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5, timestep, size_emb, freq_cis_img,
|
351 |
+
tiled=False, tile_size=64, tile_stride=32,
|
352 |
+
to_cache=False,
|
353 |
+
use_gradient_checkpointing=False,
|
354 |
+
):
|
355 |
+
# Embeddings
|
356 |
+
text_emb = self.prepare_text_emb(text_emb, text_emb_t5, text_emb_mask, text_emb_mask_t5)
|
357 |
+
condition_emb = self.prepare_extra_emb(text_emb_t5, timestep, size_emb, hidden_states.dtype, hidden_states.shape[0])
|
358 |
+
|
359 |
+
# Input
|
360 |
+
height, width = hidden_states.shape[-2], hidden_states.shape[-1]
|
361 |
+
hidden_states = self.patch_embedder(hidden_states)
|
362 |
+
|
363 |
+
# Blocks
|
364 |
+
def create_custom_forward(module):
|
365 |
+
def custom_forward(*inputs):
|
366 |
+
return module(*inputs)
|
367 |
+
return custom_forward
|
368 |
+
if tiled:
|
369 |
+
hidden_states = rearrange(hidden_states, "B (H W) C -> B C H W", H=height//2)
|
370 |
+
residuals = []
|
371 |
+
for block_id, block in enumerate(self.blocks):
|
372 |
+
residual = residuals.pop() if block_id >= self.num_layers_down else None
|
373 |
+
hidden_states = self.tiled_block_forward(
|
374 |
+
block, hidden_states, condition_emb, text_emb, freq_cis_img, residual,
|
375 |
+
torch_dtype=hidden_states.dtype, data_device=hidden_states.device, computation_device=hidden_states.device,
|
376 |
+
tile_size=tile_size, tile_stride=tile_stride
|
377 |
+
)
|
378 |
+
if block_id < self.num_layers_down - 2:
|
379 |
+
residuals.append(hidden_states)
|
380 |
+
hidden_states = rearrange(hidden_states, "B C H W -> B (H W) C")
|
381 |
+
else:
|
382 |
+
residuals = []
|
383 |
+
for block_id, block in enumerate(self.blocks):
|
384 |
+
residual = residuals.pop() if block_id >= self.num_layers_down else None
|
385 |
+
if self.training and use_gradient_checkpointing:
|
386 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
387 |
+
create_custom_forward(block),
|
388 |
+
hidden_states, condition_emb, text_emb, freq_cis_img, residual,
|
389 |
+
use_reentrant=False,
|
390 |
+
)
|
391 |
+
else:
|
392 |
+
hidden_states = block(hidden_states, condition_emb, text_emb, freq_cis_img, residual, to_cache=to_cache)
|
393 |
+
if block_id < self.num_layers_down - 2:
|
394 |
+
residuals.append(hidden_states)
|
395 |
+
|
396 |
+
# Output
|
397 |
+
hidden_states = self.final_layer(hidden_states, condition_emb)
|
398 |
+
hidden_states = self.unpatchify(hidden_states, height//2, width//2)
|
399 |
+
hidden_states, _ = hidden_states.chunk(2, dim=1)
|
400 |
+
return hidden_states
|
401 |
+
|
402 |
+
def state_dict_converter(self):
|
403 |
+
return HunyuanDiTStateDictConverter()
|
404 |
+
|
405 |
+
|
406 |
+
|
407 |
+
class HunyuanDiTStateDictConverter():
|
408 |
+
def __init__(self):
|
409 |
+
pass
|
410 |
+
|
411 |
+
def from_diffusers(self, state_dict):
|
412 |
+
state_dict_ = {}
|
413 |
+
for name, param in state_dict.items():
|
414 |
+
name_ = name
|
415 |
+
name_ = name_.replace(".default_modulation.", ".modulation.")
|
416 |
+
name_ = name_.replace(".mlp.fc1.", ".mlp.0.")
|
417 |
+
name_ = name_.replace(".mlp.fc2.", ".mlp.2.")
|
418 |
+
name_ = name_.replace(".attn1.q_norm.", ".rota1.q_norm.")
|
419 |
+
name_ = name_.replace(".attn2.q_norm.", ".rota2.q_norm.")
|
420 |
+
name_ = name_.replace(".attn1.k_norm.", ".rota1.k_norm.")
|
421 |
+
name_ = name_.replace(".attn2.k_norm.", ".rota2.k_norm.")
|
422 |
+
name_ = name_.replace(".q_proj.", ".to_q.")
|
423 |
+
name_ = name_.replace(".out_proj.", ".to_out.")
|
424 |
+
name_ = name_.replace("text_embedding_padding", "text_emb_padding")
|
425 |
+
name_ = name_.replace("mlp_t5.0.", "t5_embedder.0.")
|
426 |
+
name_ = name_.replace("mlp_t5.2.", "t5_embedder.2.")
|
427 |
+
name_ = name_.replace("pooler.", "t5_pooler.")
|
428 |
+
name_ = name_.replace("x_embedder.", "patch_embedder.")
|
429 |
+
name_ = name_.replace("t_embedder.", "timestep_embedder.")
|
430 |
+
name_ = name_.replace("t5_pooler.to_q.", "t5_pooler.q_proj.")
|
431 |
+
name_ = name_.replace("style_embedder.weight", "style_embedder")
|
432 |
+
if ".kv_proj." in name_:
|
433 |
+
param_k = param[:param.shape[0]//2]
|
434 |
+
param_v = param[param.shape[0]//2:]
|
435 |
+
state_dict_[name_.replace(".kv_proj.", ".to_k.")] = param_k
|
436 |
+
state_dict_[name_.replace(".kv_proj.", ".to_v.")] = param_v
|
437 |
+
elif ".Wqkv." in name_:
|
438 |
+
param_q = param[:param.shape[0]//3]
|
439 |
+
param_k = param[param.shape[0]//3:param.shape[0]//3*2]
|
440 |
+
param_v = param[param.shape[0]//3*2:]
|
441 |
+
state_dict_[name_.replace(".Wqkv.", ".to_q.")] = param_q
|
442 |
+
state_dict_[name_.replace(".Wqkv.", ".to_k.")] = param_k
|
443 |
+
state_dict_[name_.replace(".Wqkv.", ".to_v.")] = param_v
|
444 |
+
elif "style_embedder" in name_:
|
445 |
+
state_dict_[name_] = param.squeeze()
|
446 |
+
else:
|
447 |
+
state_dict_[name_] = param
|
448 |
+
return state_dict_
|
449 |
+
|
450 |
+
def from_civitai(self, state_dict):
|
451 |
+
return self.from_diffusers(state_dict)
|
diffsynth/models/hunyuan_dit_text_encoder.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import BertModel, BertConfig, T5EncoderModel, T5Config
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
|
6 |
+
class HunyuanDiTCLIPTextEncoder(BertModel):
|
7 |
+
def __init__(self):
|
8 |
+
config = BertConfig(
|
9 |
+
_name_or_path = "",
|
10 |
+
architectures = ["BertModel"],
|
11 |
+
attention_probs_dropout_prob = 0.1,
|
12 |
+
bos_token_id = 0,
|
13 |
+
classifier_dropout = None,
|
14 |
+
directionality = "bidi",
|
15 |
+
eos_token_id = 2,
|
16 |
+
hidden_act = "gelu",
|
17 |
+
hidden_dropout_prob = 0.1,
|
18 |
+
hidden_size = 1024,
|
19 |
+
initializer_range = 0.02,
|
20 |
+
intermediate_size = 4096,
|
21 |
+
layer_norm_eps = 1e-12,
|
22 |
+
max_position_embeddings = 512,
|
23 |
+
model_type = "bert",
|
24 |
+
num_attention_heads = 16,
|
25 |
+
num_hidden_layers = 24,
|
26 |
+
output_past = True,
|
27 |
+
pad_token_id = 0,
|
28 |
+
pooler_fc_size = 768,
|
29 |
+
pooler_num_attention_heads = 12,
|
30 |
+
pooler_num_fc_layers = 3,
|
31 |
+
pooler_size_per_head = 128,
|
32 |
+
pooler_type = "first_token_transform",
|
33 |
+
position_embedding_type = "absolute",
|
34 |
+
torch_dtype = "float32",
|
35 |
+
transformers_version = "4.37.2",
|
36 |
+
type_vocab_size = 2,
|
37 |
+
use_cache = True,
|
38 |
+
vocab_size = 47020
|
39 |
+
)
|
40 |
+
super().__init__(config, add_pooling_layer=False)
|
41 |
+
self.eval()
|
42 |
+
|
43 |
+
def forward(self, input_ids, attention_mask, clip_skip=1):
|
44 |
+
input_shape = input_ids.size()
|
45 |
+
|
46 |
+
batch_size, seq_length = input_shape
|
47 |
+
device = input_ids.device
|
48 |
+
|
49 |
+
past_key_values_length = 0
|
50 |
+
|
51 |
+
if attention_mask is None:
|
52 |
+
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
|
53 |
+
|
54 |
+
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
|
55 |
+
|
56 |
+
embedding_output = self.embeddings(
|
57 |
+
input_ids=input_ids,
|
58 |
+
position_ids=None,
|
59 |
+
token_type_ids=None,
|
60 |
+
inputs_embeds=None,
|
61 |
+
past_key_values_length=0,
|
62 |
+
)
|
63 |
+
encoder_outputs = self.encoder(
|
64 |
+
embedding_output,
|
65 |
+
attention_mask=extended_attention_mask,
|
66 |
+
head_mask=None,
|
67 |
+
encoder_hidden_states=None,
|
68 |
+
encoder_attention_mask=None,
|
69 |
+
past_key_values=None,
|
70 |
+
use_cache=False,
|
71 |
+
output_attentions=False,
|
72 |
+
output_hidden_states=True,
|
73 |
+
return_dict=True,
|
74 |
+
)
|
75 |
+
all_hidden_states = encoder_outputs.hidden_states
|
76 |
+
prompt_emb = all_hidden_states[-clip_skip]
|
77 |
+
if clip_skip > 1:
|
78 |
+
mean, std = all_hidden_states[-1].mean(), all_hidden_states[-1].std()
|
79 |
+
prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean
|
80 |
+
return prompt_emb
|
81 |
+
|
82 |
+
def state_dict_converter(self):
|
83 |
+
return HunyuanDiTCLIPTextEncoderStateDictConverter()
|
84 |
+
|
85 |
+
|
86 |
+
|
87 |
+
class HunyuanDiTT5TextEncoder(T5EncoderModel):
|
88 |
+
def __init__(self):
|
89 |
+
config = T5Config(
|
90 |
+
_name_or_path = "../HunyuanDiT/t2i/mt5",
|
91 |
+
architectures = ["MT5ForConditionalGeneration"],
|
92 |
+
classifier_dropout = 0.0,
|
93 |
+
d_ff = 5120,
|
94 |
+
d_kv = 64,
|
95 |
+
d_model = 2048,
|
96 |
+
decoder_start_token_id = 0,
|
97 |
+
dense_act_fn = "gelu_new",
|
98 |
+
dropout_rate = 0.1,
|
99 |
+
eos_token_id = 1,
|
100 |
+
feed_forward_proj = "gated-gelu",
|
101 |
+
initializer_factor = 1.0,
|
102 |
+
is_encoder_decoder = True,
|
103 |
+
is_gated_act = True,
|
104 |
+
layer_norm_epsilon = 1e-06,
|
105 |
+
model_type = "t5",
|
106 |
+
num_decoder_layers = 24,
|
107 |
+
num_heads = 32,
|
108 |
+
num_layers = 24,
|
109 |
+
output_past = True,
|
110 |
+
pad_token_id = 0,
|
111 |
+
relative_attention_max_distance = 128,
|
112 |
+
relative_attention_num_buckets = 32,
|
113 |
+
tie_word_embeddings = False,
|
114 |
+
tokenizer_class = "T5Tokenizer",
|
115 |
+
transformers_version = "4.37.2",
|
116 |
+
use_cache = True,
|
117 |
+
vocab_size = 250112
|
118 |
+
)
|
119 |
+
super().__init__(config)
|
120 |
+
self.eval()
|
121 |
+
|
122 |
+
def forward(self, input_ids, attention_mask, clip_skip=1):
|
123 |
+
outputs = super().forward(
|
124 |
+
input_ids=input_ids,
|
125 |
+
attention_mask=attention_mask,
|
126 |
+
output_hidden_states=True,
|
127 |
+
)
|
128 |
+
prompt_emb = outputs.hidden_states[-clip_skip]
|
129 |
+
if clip_skip > 1:
|
130 |
+
mean, std = outputs.hidden_states[-1].mean(), outputs.hidden_states[-1].std()
|
131 |
+
prompt_emb = (prompt_emb - prompt_emb.mean()) / prompt_emb.std() * std + mean
|
132 |
+
return prompt_emb
|
133 |
+
|
134 |
+
def state_dict_converter(self):
|
135 |
+
return HunyuanDiTT5TextEncoderStateDictConverter()
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
class HunyuanDiTCLIPTextEncoderStateDictConverter():
|
140 |
+
def __init__(self):
|
141 |
+
pass
|
142 |
+
|
143 |
+
def from_diffusers(self, state_dict):
|
144 |
+
state_dict_ = {name[5:]: param for name, param in state_dict.items() if name.startswith("bert.")}
|
145 |
+
return state_dict_
|
146 |
+
|
147 |
+
def from_civitai(self, state_dict):
|
148 |
+
return self.from_diffusers(state_dict)
|
149 |
+
|
150 |
+
|
151 |
+
class HunyuanDiTT5TextEncoderStateDictConverter():
|
152 |
+
def __init__(self):
|
153 |
+
pass
|
154 |
+
|
155 |
+
def from_diffusers(self, state_dict):
|
156 |
+
state_dict_ = {name: param for name, param in state_dict.items() if name.startswith("encoder.")}
|
157 |
+
state_dict_["shared.weight"] = state_dict["shared.weight"]
|
158 |
+
return state_dict_
|
159 |
+
|
160 |
+
def from_civitai(self, state_dict):
|
161 |
+
return self.from_diffusers(state_dict)
|
diffsynth/models/kolors_text_encoder.py
ADDED
@@ -0,0 +1,1363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
This model is copied from https://github.com/Kwai-Kolors/Kolors/tree/master/kolors/models.
|
3 |
+
We didn't modify this model.
|
4 |
+
The tensor operation is performed in the prompter.
|
5 |
+
"""
|
6 |
+
|
7 |
+
|
8 |
+
""" PyTorch ChatGLM model. """
|
9 |
+
|
10 |
+
import math
|
11 |
+
import copy
|
12 |
+
import warnings
|
13 |
+
import re
|
14 |
+
import sys
|
15 |
+
|
16 |
+
import torch
|
17 |
+
import torch.utils.checkpoint
|
18 |
+
import torch.nn.functional as F
|
19 |
+
from torch import nn
|
20 |
+
from torch.nn import CrossEntropyLoss, LayerNorm
|
21 |
+
from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss
|
22 |
+
from torch.nn.utils import skip_init
|
23 |
+
from typing import Optional, Tuple, Union, List, Callable, Dict, Any
|
24 |
+
from copy import deepcopy
|
25 |
+
|
26 |
+
from transformers.modeling_outputs import (
|
27 |
+
BaseModelOutputWithPast,
|
28 |
+
CausalLMOutputWithPast,
|
29 |
+
SequenceClassifierOutputWithPast,
|
30 |
+
)
|
31 |
+
from transformers.modeling_utils import PreTrainedModel
|
32 |
+
from transformers.utils import logging
|
33 |
+
from transformers.generation.logits_process import LogitsProcessor
|
34 |
+
from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput
|
35 |
+
from transformers import PretrainedConfig
|
36 |
+
|
37 |
+
|
38 |
+
|
39 |
+
class ChatGLMConfig(PretrainedConfig):
|
40 |
+
model_type = "chatglm"
|
41 |
+
def __init__(
|
42 |
+
self,
|
43 |
+
num_layers=28,
|
44 |
+
padded_vocab_size=65024,
|
45 |
+
hidden_size=4096,
|
46 |
+
ffn_hidden_size=13696,
|
47 |
+
kv_channels=128,
|
48 |
+
num_attention_heads=32,
|
49 |
+
seq_length=2048,
|
50 |
+
hidden_dropout=0.0,
|
51 |
+
classifier_dropout=None,
|
52 |
+
attention_dropout=0.0,
|
53 |
+
layernorm_epsilon=1e-5,
|
54 |
+
rmsnorm=True,
|
55 |
+
apply_residual_connection_post_layernorm=False,
|
56 |
+
post_layer_norm=True,
|
57 |
+
add_bias_linear=False,
|
58 |
+
add_qkv_bias=False,
|
59 |
+
bias_dropout_fusion=True,
|
60 |
+
multi_query_attention=False,
|
61 |
+
multi_query_group_num=1,
|
62 |
+
apply_query_key_layer_scaling=True,
|
63 |
+
attention_softmax_in_fp32=True,
|
64 |
+
fp32_residual_connection=False,
|
65 |
+
quantization_bit=0,
|
66 |
+
pre_seq_len=None,
|
67 |
+
prefix_projection=False,
|
68 |
+
**kwargs
|
69 |
+
):
|
70 |
+
self.num_layers = num_layers
|
71 |
+
self.vocab_size = padded_vocab_size
|
72 |
+
self.padded_vocab_size = padded_vocab_size
|
73 |
+
self.hidden_size = hidden_size
|
74 |
+
self.ffn_hidden_size = ffn_hidden_size
|
75 |
+
self.kv_channels = kv_channels
|
76 |
+
self.num_attention_heads = num_attention_heads
|
77 |
+
self.seq_length = seq_length
|
78 |
+
self.hidden_dropout = hidden_dropout
|
79 |
+
self.classifier_dropout = classifier_dropout
|
80 |
+
self.attention_dropout = attention_dropout
|
81 |
+
self.layernorm_epsilon = layernorm_epsilon
|
82 |
+
self.rmsnorm = rmsnorm
|
83 |
+
self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm
|
84 |
+
self.post_layer_norm = post_layer_norm
|
85 |
+
self.add_bias_linear = add_bias_linear
|
86 |
+
self.add_qkv_bias = add_qkv_bias
|
87 |
+
self.bias_dropout_fusion = bias_dropout_fusion
|
88 |
+
self.multi_query_attention = multi_query_attention
|
89 |
+
self.multi_query_group_num = multi_query_group_num
|
90 |
+
self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
|
91 |
+
self.attention_softmax_in_fp32 = attention_softmax_in_fp32
|
92 |
+
self.fp32_residual_connection = fp32_residual_connection
|
93 |
+
self.quantization_bit = quantization_bit
|
94 |
+
self.pre_seq_len = pre_seq_len
|
95 |
+
self.prefix_projection = prefix_projection
|
96 |
+
super().__init__(**kwargs)
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
# flags required to enable jit fusion kernels
|
101 |
+
|
102 |
+
if sys.platform != 'darwin':
|
103 |
+
torch._C._jit_set_profiling_mode(False)
|
104 |
+
torch._C._jit_set_profiling_executor(False)
|
105 |
+
torch._C._jit_override_can_fuse_on_cpu(True)
|
106 |
+
torch._C._jit_override_can_fuse_on_gpu(True)
|
107 |
+
|
108 |
+
logger = logging.get_logger(__name__)
|
109 |
+
|
110 |
+
_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM"
|
111 |
+
_CONFIG_FOR_DOC = "ChatGLM6BConfig"
|
112 |
+
|
113 |
+
CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
114 |
+
"THUDM/chatglm3-6b-base",
|
115 |
+
# See all ChatGLM models at https://huggingface.co/models?filter=chatglm
|
116 |
+
]
|
117 |
+
|
118 |
+
|
119 |
+
def default_init(cls, *args, **kwargs):
|
120 |
+
return cls(*args, **kwargs)
|
121 |
+
|
122 |
+
|
123 |
+
class InvalidScoreLogitsProcessor(LogitsProcessor):
|
124 |
+
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
|
125 |
+
if torch.isnan(scores).any() or torch.isinf(scores).any():
|
126 |
+
scores.zero_()
|
127 |
+
scores[..., 5] = 5e4
|
128 |
+
return scores
|
129 |
+
|
130 |
+
|
131 |
+
class PrefixEncoder(torch.nn.Module):
|
132 |
+
"""
|
133 |
+
The torch.nn model to encode the prefix
|
134 |
+
Input shape: (batch-size, prefix-length)
|
135 |
+
Output shape: (batch-size, prefix-length, 2*layers*hidden)
|
136 |
+
"""
|
137 |
+
|
138 |
+
def __init__(self, config: ChatGLMConfig):
|
139 |
+
super().__init__()
|
140 |
+
self.prefix_projection = config.prefix_projection
|
141 |
+
if self.prefix_projection:
|
142 |
+
# Use a two-layer MLP to encode the prefix
|
143 |
+
kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2
|
144 |
+
self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size)
|
145 |
+
self.trans = torch.nn.Sequential(
|
146 |
+
torch.nn.Linear(kv_size, config.hidden_size),
|
147 |
+
torch.nn.Tanh(),
|
148 |
+
torch.nn.Linear(config.hidden_size, kv_size)
|
149 |
+
)
|
150 |
+
else:
|
151 |
+
self.embedding = torch.nn.Embedding(config.pre_seq_len,
|
152 |
+
config.num_layers * config.kv_channels * config.multi_query_group_num * 2)
|
153 |
+
|
154 |
+
def forward(self, prefix: torch.Tensor):
|
155 |
+
if self.prefix_projection:
|
156 |
+
prefix_tokens = self.embedding(prefix)
|
157 |
+
past_key_values = self.trans(prefix_tokens)
|
158 |
+
else:
|
159 |
+
past_key_values = self.embedding(prefix)
|
160 |
+
return past_key_values
|
161 |
+
|
162 |
+
|
163 |
+
def split_tensor_along_last_dim(
|
164 |
+
tensor: torch.Tensor,
|
165 |
+
num_partitions: int,
|
166 |
+
contiguous_split_chunks: bool = False,
|
167 |
+
) -> List[torch.Tensor]:
|
168 |
+
"""Split a tensor along its last dimension.
|
169 |
+
|
170 |
+
Arguments:
|
171 |
+
tensor: input tensor.
|
172 |
+
num_partitions: number of partitions to split the tensor
|
173 |
+
contiguous_split_chunks: If True, make each chunk contiguous
|
174 |
+
in memory.
|
175 |
+
|
176 |
+
Returns:
|
177 |
+
A list of Tensors
|
178 |
+
"""
|
179 |
+
# Get the size and dimension.
|
180 |
+
last_dim = tensor.dim() - 1
|
181 |
+
last_dim_size = tensor.size()[last_dim] // num_partitions
|
182 |
+
# Split.
|
183 |
+
tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
|
184 |
+
# Note: torch.split does not create contiguous tensors by default.
|
185 |
+
if contiguous_split_chunks:
|
186 |
+
return tuple(chunk.contiguous() for chunk in tensor_list)
|
187 |
+
|
188 |
+
return tensor_list
|
189 |
+
|
190 |
+
|
191 |
+
class RotaryEmbedding(nn.Module):
|
192 |
+
def __init__(self, dim, original_impl=False, device=None, dtype=None):
|
193 |
+
super().__init__()
|
194 |
+
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim))
|
195 |
+
self.register_buffer("inv_freq", inv_freq)
|
196 |
+
self.dim = dim
|
197 |
+
self.original_impl = original_impl
|
198 |
+
|
199 |
+
def forward_impl(
|
200 |
+
self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000
|
201 |
+
):
|
202 |
+
"""Enhanced Transformer with Rotary Position Embedding.
|
203 |
+
|
204 |
+
Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
|
205 |
+
transformers/rope/__init__.py. MIT License:
|
206 |
+
https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
|
207 |
+
"""
|
208 |
+
# $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
|
209 |
+
theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem))
|
210 |
+
|
211 |
+
# Create position indexes `[0, 1, ..., seq_len - 1]`
|
212 |
+
seq_idx = torch.arange(seq_len, dtype=torch.float, device=device)
|
213 |
+
|
214 |
+
# Calculate the product of position index and $\theta_i$
|
215 |
+
idx_theta = torch.outer(seq_idx, theta).float()
|
216 |
+
|
217 |
+
cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
|
218 |
+
|
219 |
+
# this is to mimic the behaviour of complex32, else we will get different results
|
220 |
+
if dtype in (torch.float16, torch.bfloat16, torch.int8):
|
221 |
+
cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
|
222 |
+
return cache
|
223 |
+
|
224 |
+
def forward(self, max_seq_len, offset=0):
|
225 |
+
return self.forward_impl(
|
226 |
+
max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device
|
227 |
+
)
|
228 |
+
|
229 |
+
|
230 |
+
@torch.jit.script
|
231 |
+
def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor:
|
232 |
+
# x: [sq, b, np, hn]
|
233 |
+
sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3)
|
234 |
+
rot_dim = rope_cache.shape[-2] * 2
|
235 |
+
x, x_pass = x[..., :rot_dim], x[..., rot_dim:]
|
236 |
+
# truncate to support variable sizes
|
237 |
+
rope_cache = rope_cache[:sq]
|
238 |
+
xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2)
|
239 |
+
rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2)
|
240 |
+
x_out2 = torch.stack(
|
241 |
+
[
|
242 |
+
xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
|
243 |
+
xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1],
|
244 |
+
],
|
245 |
+
-1,
|
246 |
+
)
|
247 |
+
x_out2 = x_out2.flatten(3)
|
248 |
+
return torch.cat((x_out2, x_pass), dim=-1)
|
249 |
+
|
250 |
+
|
251 |
+
class RMSNorm(torch.nn.Module):
|
252 |
+
def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs):
|
253 |
+
super().__init__()
|
254 |
+
self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype))
|
255 |
+
self.eps = eps
|
256 |
+
|
257 |
+
def forward(self, hidden_states: torch.Tensor):
|
258 |
+
input_dtype = hidden_states.dtype
|
259 |
+
variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
|
260 |
+
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
|
261 |
+
|
262 |
+
return (self.weight * hidden_states).to(input_dtype)
|
263 |
+
|
264 |
+
|
265 |
+
class CoreAttention(torch.nn.Module):
|
266 |
+
def __init__(self, config: ChatGLMConfig, layer_number):
|
267 |
+
super(CoreAttention, self).__init__()
|
268 |
+
|
269 |
+
self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling
|
270 |
+
self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32
|
271 |
+
if self.apply_query_key_layer_scaling:
|
272 |
+
self.attention_softmax_in_fp32 = True
|
273 |
+
self.layer_number = max(1, layer_number)
|
274 |
+
|
275 |
+
projection_size = config.kv_channels * config.num_attention_heads
|
276 |
+
|
277 |
+
# Per attention head and per partition values.
|
278 |
+
self.hidden_size_per_partition = projection_size
|
279 |
+
self.hidden_size_per_attention_head = projection_size // config.num_attention_heads
|
280 |
+
self.num_attention_heads_per_partition = config.num_attention_heads
|
281 |
+
|
282 |
+
coeff = None
|
283 |
+
self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
|
284 |
+
if self.apply_query_key_layer_scaling:
|
285 |
+
coeff = self.layer_number
|
286 |
+
self.norm_factor *= coeff
|
287 |
+
self.coeff = coeff
|
288 |
+
|
289 |
+
self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
|
290 |
+
|
291 |
+
def forward(self, query_layer, key_layer, value_layer, attention_mask):
|
292 |
+
pytorch_major_version = int(torch.__version__.split('.')[0])
|
293 |
+
if pytorch_major_version >= 2:
|
294 |
+
query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]]
|
295 |
+
if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]:
|
296 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
|
297 |
+
is_causal=True)
|
298 |
+
else:
|
299 |
+
if attention_mask is not None:
|
300 |
+
attention_mask = ~attention_mask
|
301 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer,
|
302 |
+
attention_mask)
|
303 |
+
context_layer = context_layer.permute(2, 0, 1, 3)
|
304 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
305 |
+
context_layer = context_layer.reshape(*new_context_layer_shape)
|
306 |
+
else:
|
307 |
+
# Raw attention scores
|
308 |
+
|
309 |
+
# [b, np, sq, sk]
|
310 |
+
output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0))
|
311 |
+
|
312 |
+
# [sq, b, np, hn] -> [sq, b * np, hn]
|
313 |
+
query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1)
|
314 |
+
# [sk, b, np, hn] -> [sk, b * np, hn]
|
315 |
+
key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
|
316 |
+
|
317 |
+
# preallocting input tensor: [b * np, sq, sk]
|
318 |
+
matmul_input_buffer = torch.empty(
|
319 |
+
output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype,
|
320 |
+
device=query_layer.device
|
321 |
+
)
|
322 |
+
|
323 |
+
# Raw attention scores. [b * np, sq, sk]
|
324 |
+
matmul_result = torch.baddbmm(
|
325 |
+
matmul_input_buffer,
|
326 |
+
query_layer.transpose(0, 1), # [b * np, sq, hn]
|
327 |
+
key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk]
|
328 |
+
beta=0.0,
|
329 |
+
alpha=(1.0 / self.norm_factor),
|
330 |
+
)
|
331 |
+
|
332 |
+
# change view to [b, np, sq, sk]
|
333 |
+
attention_scores = matmul_result.view(*output_size)
|
334 |
+
|
335 |
+
# ===========================
|
336 |
+
# Attention probs and dropout
|
337 |
+
# ===========================
|
338 |
+
|
339 |
+
# attention scores and attention mask [b, np, sq, sk]
|
340 |
+
if self.attention_softmax_in_fp32:
|
341 |
+
attention_scores = attention_scores.float()
|
342 |
+
if self.coeff is not None:
|
343 |
+
attention_scores = attention_scores * self.coeff
|
344 |
+
if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]:
|
345 |
+
attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3],
|
346 |
+
device=attention_scores.device, dtype=torch.bool)
|
347 |
+
attention_mask.tril_()
|
348 |
+
attention_mask = ~attention_mask
|
349 |
+
if attention_mask is not None:
|
350 |
+
attention_scores = attention_scores.masked_fill(attention_mask, float("-inf"))
|
351 |
+
attention_probs = F.softmax(attention_scores, dim=-1)
|
352 |
+
attention_probs = attention_probs.type_as(value_layer)
|
353 |
+
|
354 |
+
# This is actually dropping out entire tokens to attend to, which might
|
355 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
356 |
+
attention_probs = self.attention_dropout(attention_probs)
|
357 |
+
# =========================
|
358 |
+
# Context layer. [sq, b, hp]
|
359 |
+
# =========================
|
360 |
+
|
361 |
+
# value_layer -> context layer.
|
362 |
+
# [sk, b, np, hn] --> [b, np, sq, hn]
|
363 |
+
|
364 |
+
# context layer shape: [b, np, sq, hn]
|
365 |
+
output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3))
|
366 |
+
# change view [sk, b * np, hn]
|
367 |
+
value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
|
368 |
+
# change view [b * np, sq, sk]
|
369 |
+
attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
|
370 |
+
# matmul: [b * np, sq, hn]
|
371 |
+
context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
|
372 |
+
# change view [b, np, sq, hn]
|
373 |
+
context_layer = context_layer.view(*output_size)
|
374 |
+
# [b, np, sq, hn] --> [sq, b, np, hn]
|
375 |
+
context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
|
376 |
+
# [sq, b, np, hn] --> [sq, b, hp]
|
377 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
|
378 |
+
context_layer = context_layer.view(*new_context_layer_shape)
|
379 |
+
|
380 |
+
return context_layer
|
381 |
+
|
382 |
+
|
383 |
+
class SelfAttention(torch.nn.Module):
|
384 |
+
"""Parallel self-attention layer abstract class.
|
385 |
+
|
386 |
+
Self-attention layer takes input with size [s, b, h]
|
387 |
+
and returns output of the same size.
|
388 |
+
"""
|
389 |
+
|
390 |
+
def __init__(self, config: ChatGLMConfig, layer_number, device=None):
|
391 |
+
super(SelfAttention, self).__init__()
|
392 |
+
self.layer_number = max(1, layer_number)
|
393 |
+
|
394 |
+
self.projection_size = config.kv_channels * config.num_attention_heads
|
395 |
+
|
396 |
+
# Per attention head and per partition values.
|
397 |
+
self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads
|
398 |
+
self.num_attention_heads_per_partition = config.num_attention_heads
|
399 |
+
|
400 |
+
self.multi_query_attention = config.multi_query_attention
|
401 |
+
self.qkv_hidden_size = 3 * self.projection_size
|
402 |
+
if self.multi_query_attention:
|
403 |
+
self.num_multi_query_groups_per_partition = config.multi_query_group_num
|
404 |
+
self.qkv_hidden_size = (
|
405 |
+
self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num
|
406 |
+
)
|
407 |
+
self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size,
|
408 |
+
bias=config.add_bias_linear or config.add_qkv_bias,
|
409 |
+
device=device, **_config_to_kwargs(config)
|
410 |
+
)
|
411 |
+
|
412 |
+
self.core_attention = CoreAttention(config, self.layer_number)
|
413 |
+
|
414 |
+
# Output.
|
415 |
+
self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear,
|
416 |
+
device=device, **_config_to_kwargs(config)
|
417 |
+
)
|
418 |
+
|
419 |
+
def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None):
|
420 |
+
if self.multi_query_attention:
|
421 |
+
num_attention_heads = self.num_multi_query_groups_per_partition
|
422 |
+
else:
|
423 |
+
num_attention_heads = self.num_attention_heads_per_partition
|
424 |
+
return torch.empty(
|
425 |
+
inference_max_sequence_len,
|
426 |
+
batch_size,
|
427 |
+
num_attention_heads,
|
428 |
+
self.hidden_size_per_attention_head,
|
429 |
+
dtype=dtype,
|
430 |
+
device=device,
|
431 |
+
)
|
432 |
+
|
433 |
+
def forward(
|
434 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True
|
435 |
+
):
|
436 |
+
# hidden_states: [sq, b, h]
|
437 |
+
|
438 |
+
# =================================================
|
439 |
+
# Pre-allocate memory for key-values for inference.
|
440 |
+
# =================================================
|
441 |
+
# =====================
|
442 |
+
# Query, Key, and Value
|
443 |
+
# =====================
|
444 |
+
|
445 |
+
# Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)]
|
446 |
+
mixed_x_layer = self.query_key_value(hidden_states)
|
447 |
+
|
448 |
+
if self.multi_query_attention:
|
449 |
+
(query_layer, key_layer, value_layer) = mixed_x_layer.split(
|
450 |
+
[
|
451 |
+
self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
|
452 |
+
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
453 |
+
self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
|
454 |
+
],
|
455 |
+
dim=-1,
|
456 |
+
)
|
457 |
+
query_layer = query_layer.view(
|
458 |
+
query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
459 |
+
)
|
460 |
+
key_layer = key_layer.view(
|
461 |
+
key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
|
462 |
+
)
|
463 |
+
value_layer = value_layer.view(
|
464 |
+
value_layer.size()[:-1]
|
465 |
+
+ (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head)
|
466 |
+
)
|
467 |
+
else:
|
468 |
+
new_tensor_shape = mixed_x_layer.size()[:-1] + \
|
469 |
+
(self.num_attention_heads_per_partition,
|
470 |
+
3 * self.hidden_size_per_attention_head)
|
471 |
+
mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
|
472 |
+
|
473 |
+
# [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
|
474 |
+
(query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
|
475 |
+
|
476 |
+
# apply relative positional encoding (rotary embedding)
|
477 |
+
if rotary_pos_emb is not None:
|
478 |
+
query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
|
479 |
+
key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
|
480 |
+
|
481 |
+
# adjust key and value for inference
|
482 |
+
if kv_cache is not None:
|
483 |
+
cache_k, cache_v = kv_cache
|
484 |
+
key_layer = torch.cat((cache_k, key_layer), dim=0)
|
485 |
+
value_layer = torch.cat((cache_v, value_layer), dim=0)
|
486 |
+
if use_cache:
|
487 |
+
kv_cache = (key_layer, value_layer)
|
488 |
+
else:
|
489 |
+
kv_cache = None
|
490 |
+
|
491 |
+
if self.multi_query_attention:
|
492 |
+
key_layer = key_layer.unsqueeze(-2)
|
493 |
+
key_layer = key_layer.expand(
|
494 |
+
-1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
|
495 |
+
)
|
496 |
+
key_layer = key_layer.contiguous().view(
|
497 |
+
key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
498 |
+
)
|
499 |
+
value_layer = value_layer.unsqueeze(-2)
|
500 |
+
value_layer = value_layer.expand(
|
501 |
+
-1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1
|
502 |
+
)
|
503 |
+
value_layer = value_layer.contiguous().view(
|
504 |
+
value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head)
|
505 |
+
)
|
506 |
+
|
507 |
+
# ==================================
|
508 |
+
# core attention computation
|
509 |
+
# ==================================
|
510 |
+
|
511 |
+
context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
|
512 |
+
|
513 |
+
# =================
|
514 |
+
# Output. [sq, b, h]
|
515 |
+
# =================
|
516 |
+
|
517 |
+
output = self.dense(context_layer)
|
518 |
+
|
519 |
+
return output, kv_cache
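The multi-query branch above broadcasts a small number of key/value groups so that every query head gets a matching key/value head. A standalone sketch (not part of the repository diff; toy sizes, illustrative names) of that expansion:

# Minimal sketch of the multi-query key expansion, assuming toy sizes.
import torch

sq, b = 6, 2                                  # sequence length, batch
num_q_heads, num_kv_groups, hn = 8, 2, 16     # query heads, kv groups, head dim
key_layer = torch.randn(sq, b, num_kv_groups, hn)

key_layer = key_layer.unsqueeze(-2)                                    # [sq, b, groups, 1, hn]
key_layer = key_layer.expand(-1, -1, -1, num_q_heads // num_kv_groups, -1)
key_layer = key_layer.contiguous().view(sq, b, num_q_heads, hn)        # [sq, b, q_heads, hn]
print(key_layer.shape)                                                 # torch.Size([6, 2, 8, 16])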
|
520 |
+
|
521 |
+
|
522 |
+
def _config_to_kwargs(args):
|
523 |
+
common_kwargs = {
|
524 |
+
"dtype": args.torch_dtype,
|
525 |
+
}
|
526 |
+
return common_kwargs
|
527 |
+
|
528 |
+
|
529 |
+
class MLP(torch.nn.Module):
|
530 |
+
"""MLP.
|
531 |
+
|
532 |
+
MLP will take the input with h hidden state, project it to 4*h
|
533 |
+
hidden dimension, perform nonlinear transformation, and project the
|
534 |
+
state back into h hidden dimension.
|
535 |
+
"""
|
536 |
+
|
537 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
538 |
+
super(MLP, self).__init__()
|
539 |
+
|
540 |
+
self.add_bias = config.add_bias_linear
|
541 |
+
|
542 |
+
# Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
|
543 |
+
self.dense_h_to_4h = nn.Linear(
|
544 |
+
config.hidden_size,
|
545 |
+
config.ffn_hidden_size * 2,
|
546 |
+
bias=self.add_bias,
|
547 |
+
device=device,
|
548 |
+
**_config_to_kwargs(config)
|
549 |
+
)
|
550 |
+
|
551 |
+
def swiglu(x):
|
552 |
+
x = torch.chunk(x, 2, dim=-1)
|
553 |
+
return F.silu(x[0]) * x[1]
|
554 |
+
|
555 |
+
self.activation_func = swiglu
|
556 |
+
|
557 |
+
# Project back to h.
|
558 |
+
self.dense_4h_to_h = nn.Linear(
|
559 |
+
config.ffn_hidden_size,
|
560 |
+
config.hidden_size,
|
561 |
+
bias=self.add_bias,
|
562 |
+
device=device,
|
563 |
+
**_config_to_kwargs(config)
|
564 |
+
)
|
565 |
+
|
566 |
+
def forward(self, hidden_states):
|
567 |
+
# [s, b, 4hp]
|
568 |
+
intermediate_parallel = self.dense_h_to_4h(hidden_states)
|
569 |
+
intermediate_parallel = self.activation_func(intermediate_parallel)
|
570 |
+
# [s, b, h]
|
571 |
+
output = self.dense_4h_to_h(intermediate_parallel)
|
572 |
+
return output
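The MLP above relies on the SwiGLU activation: dense_h_to_4h produces twice the FFN width, the result is split into two halves, and one half gates the other through SiLU (see the linked paper in the comment above). A standalone sketch, not part of the repository diff, with toy sizes:

# Minimal SwiGLU sketch, assuming toy sizes.
import torch
import torch.nn.functional as F

hidden_size, ffn_hidden_size = 32, 64
x = torch.randn(5, 2, hidden_size)                       # [s, b, h]
dense_h_to_4h = torch.nn.Linear(hidden_size, ffn_hidden_size * 2)
dense_4h_to_h = torch.nn.Linear(ffn_hidden_size, hidden_size)

a, gate = torch.chunk(dense_h_to_4h(x), 2, dim=-1)       # two [s, b, ffn] halves
y = dense_4h_to_h(F.silu(a) * gate)                      # [s, b, h]
print(y.shape)                                           # torch.Size([5, 2, 32])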
|
573 |
+
|
574 |
+
|
575 |
+
class GLMBlock(torch.nn.Module):
|
576 |
+
"""A single transformer layer.
|
577 |
+
|
578 |
+
Transformer layer takes input with size [s, b, h] and returns an
|
579 |
+
output of the same size.
|
580 |
+
"""
|
581 |
+
|
582 |
+
def __init__(self, config: ChatGLMConfig, layer_number, device=None):
|
583 |
+
super(GLMBlock, self).__init__()
|
584 |
+
self.layer_number = layer_number
|
585 |
+
|
586 |
+
self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm
|
587 |
+
|
588 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
589 |
+
|
590 |
+
LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
|
591 |
+
# Layernorm on the input data.
|
592 |
+
self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
593 |
+
dtype=config.torch_dtype)
|
594 |
+
|
595 |
+
# Self attention.
|
596 |
+
self.self_attention = SelfAttention(config, layer_number, device=device)
|
597 |
+
self.hidden_dropout = config.hidden_dropout
|
598 |
+
|
599 |
+
# Layernorm on the attention output
|
600 |
+
self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
601 |
+
dtype=config.torch_dtype)
|
602 |
+
|
603 |
+
# MLP
|
604 |
+
self.mlp = MLP(config, device=device)
|
605 |
+
|
606 |
+
def forward(
|
607 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True,
|
608 |
+
):
|
609 |
+
# hidden_states: [s, b, h]
|
610 |
+
|
611 |
+
# Layer norm at the beginning of the transformer layer.
|
612 |
+
layernorm_output = self.input_layernorm(hidden_states)
|
613 |
+
# Self attention.
|
614 |
+
attention_output, kv_cache = self.self_attention(
|
615 |
+
layernorm_output,
|
616 |
+
attention_mask,
|
617 |
+
rotary_pos_emb,
|
618 |
+
kv_cache=kv_cache,
|
619 |
+
use_cache=use_cache
|
620 |
+
)
|
621 |
+
|
622 |
+
# Residual connection.
|
623 |
+
if self.apply_residual_connection_post_layernorm:
|
624 |
+
residual = layernorm_output
|
625 |
+
else:
|
626 |
+
residual = hidden_states
|
627 |
+
|
628 |
+
layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training)
|
629 |
+
layernorm_input = residual + layernorm_input
|
630 |
+
|
631 |
+
# Layer norm post the self attention.
|
632 |
+
layernorm_output = self.post_attention_layernorm(layernorm_input)
|
633 |
+
|
634 |
+
# MLP.
|
635 |
+
mlp_output = self.mlp(layernorm_output)
|
636 |
+
|
637 |
+
# Second residual connection.
|
638 |
+
if self.apply_residual_connection_post_layernorm:
|
639 |
+
residual = layernorm_output
|
640 |
+
else:
|
641 |
+
residual = layernorm_input
|
642 |
+
|
643 |
+
output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training)
|
644 |
+
output = residual + output
|
645 |
+
|
646 |
+
return output, kv_cache
|
647 |
+
|
648 |
+
|
649 |
+
class GLMTransformer(torch.nn.Module):
|
650 |
+
"""Transformer class."""
|
651 |
+
|
652 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
653 |
+
super(GLMTransformer, self).__init__()
|
654 |
+
|
655 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
656 |
+
self.post_layer_norm = config.post_layer_norm
|
657 |
+
|
658 |
+
# Number of layers.
|
659 |
+
self.num_layers = config.num_layers
|
660 |
+
|
661 |
+
# Transformer layers.
|
662 |
+
def build_layer(layer_number):
|
663 |
+
return GLMBlock(config, layer_number, device=device)
|
664 |
+
|
665 |
+
self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)])
|
666 |
+
|
667 |
+
if self.post_layer_norm:
|
668 |
+
LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm
|
669 |
+
# Final layer norm before output.
|
670 |
+
self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device,
|
671 |
+
dtype=config.torch_dtype)
|
672 |
+
|
673 |
+
self.gradient_checkpointing = False
|
674 |
+
|
675 |
+
def _get_layer(self, layer_number):
|
676 |
+
return self.layers[layer_number]
|
677 |
+
|
678 |
+
def forward(
|
679 |
+
self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None,
|
680 |
+
use_cache: Optional[bool] = True,
|
681 |
+
output_hidden_states: Optional[bool] = False,
|
682 |
+
):
|
683 |
+
if not kv_caches:
|
684 |
+
kv_caches = [None for _ in range(self.num_layers)]
|
685 |
+
presents = () if use_cache else None
|
686 |
+
if self.gradient_checkpointing and self.training:
|
687 |
+
if use_cache:
|
688 |
+
logger.warning_once(
|
689 |
+
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
690 |
+
)
|
691 |
+
use_cache = False
|
692 |
+
|
693 |
+
all_self_attentions = None
|
694 |
+
all_hidden_states = () if output_hidden_states else None
|
695 |
+
for index in range(self.num_layers):
|
696 |
+
if output_hidden_states:
|
697 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
698 |
+
|
699 |
+
layer = self._get_layer(index)
|
700 |
+
if self.gradient_checkpointing and self.training:
|
701 |
+
layer_ret = torch.utils.checkpoint.checkpoint(
|
702 |
+
layer,
|
703 |
+
hidden_states,
|
704 |
+
attention_mask,
|
705 |
+
rotary_pos_emb,
|
706 |
+
kv_caches[index],
|
707 |
+
use_cache
|
708 |
+
)
|
709 |
+
else:
|
710 |
+
layer_ret = layer(
|
711 |
+
hidden_states,
|
712 |
+
attention_mask,
|
713 |
+
rotary_pos_emb,
|
714 |
+
kv_cache=kv_caches[index],
|
715 |
+
use_cache=use_cache
|
716 |
+
)
|
717 |
+
hidden_states, kv_cache = layer_ret
|
718 |
+
if use_cache:
|
719 |
+
presents = presents + (kv_cache,)
|
720 |
+
|
721 |
+
if output_hidden_states:
|
722 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
723 |
+
|
724 |
+
# Final layer norm.
|
725 |
+
if self.post_layer_norm:
|
726 |
+
hidden_states = self.final_layernorm(hidden_states)
|
727 |
+
|
728 |
+
return hidden_states, presents, all_hidden_states, all_self_attentions
|
729 |
+
|
730 |
+
|
731 |
+
class ChatGLMPreTrainedModel(PreTrainedModel):
|
732 |
+
"""
|
733 |
+
An abstract class to handle weights initialization and
|
734 |
+
a simple interface for downloading and loading pretrained models.
|
735 |
+
"""
|
736 |
+
|
737 |
+
is_parallelizable = False
|
738 |
+
supports_gradient_checkpointing = True
|
739 |
+
config_class = ChatGLMConfig
|
740 |
+
base_model_prefix = "transformer"
|
741 |
+
_no_split_modules = ["GLMBlock"]
|
742 |
+
|
743 |
+
def _init_weights(self, module: nn.Module):
|
744 |
+
"""Initialize the weights."""
|
745 |
+
return
|
746 |
+
|
747 |
+
def get_masks(self, input_ids, past_key_values, padding_mask=None):
|
748 |
+
batch_size, seq_length = input_ids.shape
|
749 |
+
full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device)
|
750 |
+
full_attention_mask.tril_()
|
751 |
+
past_length = 0
|
752 |
+
if past_key_values:
|
753 |
+
past_length = past_key_values[0][0].shape[0]
|
754 |
+
if past_length:
|
755 |
+
full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length,
|
756 |
+
device=input_ids.device), full_attention_mask), dim=-1)
|
757 |
+
if padding_mask is not None:
|
758 |
+
full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1)
|
759 |
+
if not past_length and padding_mask is not None:
|
760 |
+
full_attention_mask -= padding_mask.unsqueeze(-1) - 1
|
761 |
+
full_attention_mask = (full_attention_mask < 0.5).bool()
|
762 |
+
full_attention_mask.unsqueeze_(1)
|
763 |
+
return full_attention_mask
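get_masks above builds a lower-triangular causal mask, prepends fully visible columns for any cached past positions, and converts the result into a boolean "block this position" mask (True means the position is not attended). A standalone sketch, not part of the repository diff, with padding handling omitted for brevity:

# Minimal sketch of the causal mask with a KV-cache prefix, assuming toy sizes.
import torch

batch_size, seq_length, past_length = 1, 3, 2
causal = torch.ones(batch_size, seq_length, seq_length).tril_()
full = torch.cat((torch.ones(batch_size, seq_length, past_length), causal), dim=-1)
block = (full < 0.5)            # [b, s, past + s]; True where attention is disallowed
print(block[0].int())
# tensor([[0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]])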
|
764 |
+
|
765 |
+
def get_position_ids(self, input_ids, device):
|
766 |
+
batch_size, seq_length = input_ids.shape
|
767 |
+
position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1)
|
768 |
+
return position_ids
|
769 |
+
|
770 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
771 |
+
if isinstance(module, GLMTransformer):
|
772 |
+
module.gradient_checkpointing = value
|
773 |
+
|
774 |
+
|
775 |
+
class Embedding(torch.nn.Module):
|
776 |
+
"""Language model embeddings."""
|
777 |
+
|
778 |
+
def __init__(self, config: ChatGLMConfig, device=None):
|
779 |
+
super(Embedding, self).__init__()
|
780 |
+
|
781 |
+
self.hidden_size = config.hidden_size
|
782 |
+
# Word embeddings (parallel).
|
783 |
+
self.word_embeddings = nn.Embedding(
|
784 |
+
config.padded_vocab_size,
|
785 |
+
self.hidden_size,
|
786 |
+
dtype=config.torch_dtype,
|
787 |
+
device=device
|
788 |
+
)
|
789 |
+
self.fp32_residual_connection = config.fp32_residual_connection
|
790 |
+
|
791 |
+
def forward(self, input_ids):
|
792 |
+
# Embeddings.
|
793 |
+
words_embeddings = self.word_embeddings(input_ids)
|
794 |
+
embeddings = words_embeddings
|
795 |
+
# Data format change to avoid explicit transposes: [b s h] --> [s b h].
|
796 |
+
embeddings = embeddings.transpose(0, 1).contiguous()
|
797 |
+
# If the fp32 residual connection flag is set, convert to float.
|
798 |
+
if self.fp32_residual_connection:
|
799 |
+
embeddings = embeddings.float()
|
800 |
+
return embeddings
|
801 |
+
|
802 |
+
|
803 |
+
class ChatGLMModel(ChatGLMPreTrainedModel):
|
804 |
+
def __init__(self, config: ChatGLMConfig, device=None, empty_init=True):
|
805 |
+
super().__init__(config)
|
806 |
+
if empty_init:
|
807 |
+
init_method = skip_init
|
808 |
+
else:
|
809 |
+
init_method = default_init
|
810 |
+
init_kwargs = {}
|
811 |
+
if device is not None:
|
812 |
+
init_kwargs["device"] = device
|
813 |
+
self.embedding = init_method(Embedding, config, **init_kwargs)
|
814 |
+
self.num_layers = config.num_layers
|
815 |
+
self.multi_query_group_num = config.multi_query_group_num
|
816 |
+
self.kv_channels = config.kv_channels
|
817 |
+
|
818 |
+
# Rotary positional embeddings
|
819 |
+
self.seq_length = config.seq_length
|
820 |
+
rotary_dim = (
|
821 |
+
config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels
|
822 |
+
)
|
823 |
+
|
824 |
+
self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device,
|
825 |
+
dtype=config.torch_dtype)
|
826 |
+
self.encoder = init_method(GLMTransformer, config, **init_kwargs)
|
827 |
+
self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False,
|
828 |
+
dtype=config.torch_dtype, **init_kwargs)
|
829 |
+
self.pre_seq_len = config.pre_seq_len
|
830 |
+
self.prefix_projection = config.prefix_projection
|
831 |
+
if self.pre_seq_len is not None:
|
832 |
+
for param in self.parameters():
|
833 |
+
param.requires_grad = False
|
834 |
+
self.prefix_tokens = torch.arange(self.pre_seq_len).long()
|
835 |
+
self.prefix_encoder = PrefixEncoder(config)
|
836 |
+
self.dropout = torch.nn.Dropout(0.1)
|
837 |
+
|
838 |
+
def get_input_embeddings(self):
|
839 |
+
return self.embedding.word_embeddings
|
840 |
+
|
841 |
+
def get_prompt(self, batch_size, device, dtype=torch.half):
|
842 |
+
prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device)
|
843 |
+
past_key_values = self.prefix_encoder(prefix_tokens).type(dtype)
|
844 |
+
past_key_values = past_key_values.view(
|
845 |
+
batch_size,
|
846 |
+
self.pre_seq_len,
|
847 |
+
self.num_layers * 2,
|
848 |
+
self.multi_query_group_num,
|
849 |
+
self.kv_channels
|
850 |
+
)
|
851 |
+
# seq_len, b, nh, hidden_size
|
852 |
+
past_key_values = self.dropout(past_key_values)
|
853 |
+
past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2)
|
854 |
+
return past_key_values
|
855 |
+
|
856 |
+
def forward(
|
857 |
+
self,
|
858 |
+
input_ids,
|
859 |
+
position_ids: Optional[torch.Tensor] = None,
|
860 |
+
attention_mask: Optional[torch.BoolTensor] = None,
|
861 |
+
full_attention_mask: Optional[torch.BoolTensor] = None,
|
862 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
863 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
864 |
+
use_cache: Optional[bool] = None,
|
865 |
+
output_hidden_states: Optional[bool] = None,
|
866 |
+
return_dict: Optional[bool] = None,
|
867 |
+
):
|
868 |
+
output_hidden_states = (
|
869 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
870 |
+
)
|
871 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
872 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
873 |
+
|
874 |
+
batch_size, seq_length = input_ids.shape
|
875 |
+
|
876 |
+
if inputs_embeds is None:
|
877 |
+
inputs_embeds = self.embedding(input_ids)
|
878 |
+
|
879 |
+
if self.pre_seq_len is not None:
|
880 |
+
if past_key_values is None:
|
881 |
+
past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device,
|
882 |
+
dtype=inputs_embeds.dtype)
|
883 |
+
if attention_mask is not None:
|
884 |
+
attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)),
|
885 |
+
attention_mask], dim=-1)
|
886 |
+
|
887 |
+
if full_attention_mask is None:
|
888 |
+
if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
|
889 |
+
full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
|
890 |
+
|
891 |
+
# Rotary positional embeddings
|
892 |
+
rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
|
893 |
+
if position_ids is not None:
|
894 |
+
rotary_pos_emb = rotary_pos_emb[position_ids]
|
895 |
+
else:
|
896 |
+
rotary_pos_emb = rotary_pos_emb[None, :seq_length]
|
897 |
+
rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
|
898 |
+
|
899 |
+
# Run encoder.
|
900 |
+
hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
|
901 |
+
inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,
|
902 |
+
kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states
|
903 |
+
)
|
904 |
+
|
905 |
+
if not return_dict:
|
906 |
+
return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
|
907 |
+
|
908 |
+
return BaseModelOutputWithPast(
|
909 |
+
last_hidden_state=hidden_states,
|
910 |
+
past_key_values=presents,
|
911 |
+
hidden_states=all_hidden_states,
|
912 |
+
attentions=all_self_attentions,
|
913 |
+
)
|
914 |
+
|
915 |
+
def quantize(self, weight_bit_width: int):
|
916 |
+
from .quantization import quantize
|
917 |
+
quantize(self.encoder, weight_bit_width)
|
918 |
+
return self
|
919 |
+
|
920 |
+
|
921 |
+
class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
|
922 |
+
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
|
923 |
+
super().__init__(config)
|
924 |
+
|
925 |
+
self.max_sequence_length = config.max_length
|
926 |
+
self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
|
927 |
+
self.config = config
|
928 |
+
self.quantized = False
|
929 |
+
|
930 |
+
if self.config.quantization_bit:
|
931 |
+
self.quantize(self.config.quantization_bit, empty_init=True)
|
932 |
+
|
933 |
+
def _update_model_kwargs_for_generation(
|
934 |
+
self,
|
935 |
+
outputs: ModelOutput,
|
936 |
+
model_kwargs: Dict[str, Any],
|
937 |
+
is_encoder_decoder: bool = False,
|
938 |
+
standardize_cache_format: bool = False,
|
939 |
+
) -> Dict[str, Any]:
|
940 |
+
# update past_key_values
|
941 |
+
model_kwargs["past_key_values"] = self._extract_past_from_model_output(
|
942 |
+
outputs, standardize_cache_format=standardize_cache_format
|
943 |
+
)
|
944 |
+
|
945 |
+
# update attention mask
|
946 |
+
if "attention_mask" in model_kwargs:
|
947 |
+
attention_mask = model_kwargs["attention_mask"]
|
948 |
+
model_kwargs["attention_mask"] = torch.cat(
|
949 |
+
[attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
|
950 |
+
)
|
951 |
+
|
952 |
+
# update position ids
|
953 |
+
if "position_ids" in model_kwargs:
|
954 |
+
position_ids = model_kwargs["position_ids"]
|
955 |
+
new_position_id = position_ids[..., -1:].clone()
|
956 |
+
new_position_id += 1
|
957 |
+
model_kwargs["position_ids"] = torch.cat(
|
958 |
+
[position_ids, new_position_id], dim=-1
|
959 |
+
)
|
960 |
+
|
961 |
+
model_kwargs["is_first_forward"] = False
|
962 |
+
return model_kwargs
|
963 |
+
|
964 |
+
def prepare_inputs_for_generation(
|
965 |
+
self,
|
966 |
+
input_ids: torch.LongTensor,
|
967 |
+
past_key_values: Optional[torch.Tensor] = None,
|
968 |
+
attention_mask: Optional[torch.Tensor] = None,
|
969 |
+
position_ids: Optional[torch.Tensor] = None,
|
970 |
+
use_cache: Optional[bool] = None,
|
971 |
+
is_first_forward: bool = True,
|
972 |
+
**kwargs
|
973 |
+
) -> dict:
|
974 |
+
# only last token for input_ids if past is not None
|
975 |
+
if position_ids is None:
|
976 |
+
position_ids = self.get_position_ids(input_ids, device=input_ids.device)
|
977 |
+
if not is_first_forward:
|
978 |
+
if past_key_values is not None:
|
979 |
+
position_ids = position_ids[..., -1:]
|
980 |
+
input_ids = input_ids[:, -1:]
|
981 |
+
return {
|
982 |
+
"input_ids": input_ids,
|
983 |
+
"past_key_values": past_key_values,
|
984 |
+
"position_ids": position_ids,
|
985 |
+
"attention_mask": attention_mask,
|
986 |
+
"return_last_logit": True,
|
987 |
+
"use_cache": use_cache
|
988 |
+
}
|
989 |
+
|
990 |
+
def forward(
|
991 |
+
self,
|
992 |
+
input_ids: Optional[torch.Tensor] = None,
|
993 |
+
position_ids: Optional[torch.Tensor] = None,
|
994 |
+
attention_mask: Optional[torch.Tensor] = None,
|
995 |
+
past_key_values: Optional[Tuple[torch.FloatTensor]] = None,
|
996 |
+
inputs_embeds: Optional[torch.Tensor] = None,
|
997 |
+
labels: Optional[torch.Tensor] = None,
|
998 |
+
use_cache: Optional[bool] = None,
|
999 |
+
output_attentions: Optional[bool] = None,
|
1000 |
+
output_hidden_states: Optional[bool] = None,
|
1001 |
+
return_dict: Optional[bool] = None,
|
1002 |
+
return_last_logit: Optional[bool] = False,
|
1003 |
+
):
|
1004 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
1005 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1006 |
+
|
1007 |
+
transformer_outputs = self.transformer(
|
1008 |
+
input_ids=input_ids,
|
1009 |
+
position_ids=position_ids,
|
1010 |
+
attention_mask=attention_mask,
|
1011 |
+
past_key_values=past_key_values,
|
1012 |
+
inputs_embeds=inputs_embeds,
|
1013 |
+
use_cache=use_cache,
|
1014 |
+
output_hidden_states=output_hidden_states,
|
1015 |
+
return_dict=return_dict,
|
1016 |
+
)
|
1017 |
+
|
1018 |
+
hidden_states = transformer_outputs[0]
|
1019 |
+
if return_last_logit:
|
1020 |
+
hidden_states = hidden_states[-1:]
|
1021 |
+
lm_logits = self.transformer.output_layer(hidden_states)
|
1022 |
+
lm_logits = lm_logits.transpose(0, 1).contiguous()
|
1023 |
+
|
1024 |
+
loss = None
|
1025 |
+
if labels is not None:
|
1026 |
+
lm_logits = lm_logits.to(torch.float32)
|
1027 |
+
|
1028 |
+
# Shift so that tokens < n predict n
|
1029 |
+
shift_logits = lm_logits[..., :-1, :].contiguous()
|
1030 |
+
shift_labels = labels[..., 1:].contiguous()
|
1031 |
+
# Flatten the tokens
|
1032 |
+
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
1033 |
+
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
1034 |
+
|
1035 |
+
lm_logits = lm_logits.to(hidden_states.dtype)
|
1036 |
+
loss = loss.to(hidden_states.dtype)
|
1037 |
+
|
1038 |
+
if not return_dict:
|
1039 |
+
output = (lm_logits,) + transformer_outputs[1:]
|
1040 |
+
return ((loss,) + output) if loss is not None else output
|
1041 |
+
|
1042 |
+
return CausalLMOutputWithPast(
|
1043 |
+
loss=loss,
|
1044 |
+
logits=lm_logits,
|
1045 |
+
past_key_values=transformer_outputs.past_key_values,
|
1046 |
+
hidden_states=transformer_outputs.hidden_states,
|
1047 |
+
attentions=transformer_outputs.attentions,
|
1048 |
+
)
|
1049 |
+
|
1050 |
+
@staticmethod
|
1051 |
+
def _reorder_cache(
|
1052 |
+
past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor
|
1053 |
+
) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]:
|
1054 |
+
"""
|
1055 |
+
This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
|
1056 |
+
[`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
|
1057 |
+
beam_idx at every generation step.
|
1058 |
+
|
1059 |
+
Output shares the same memory storage as `past`.
|
1060 |
+
"""
|
1061 |
+
return tuple(
|
1062 |
+
(
|
1063 |
+
layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)),
|
1064 |
+
layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)),
|
1065 |
+
)
|
1066 |
+
for layer_past in past
|
1067 |
+
)
|
1068 |
+
|
1069 |
+
def process_response(self, output, history):
|
1070 |
+
content = ""
|
1071 |
+
history = deepcopy(history)
|
1072 |
+
for response in output.split("<|assistant|>"):
|
1073 |
+
metadata, content = response.split("\n", maxsplit=1)
|
1074 |
+
if not metadata.strip():
|
1075 |
+
content = content.strip()
|
1076 |
+
history.append({"role": "assistant", "metadata": metadata, "content": content})
|
1077 |
+
content = content.replace("[[训练时间]]", "2023年")
|
1078 |
+
else:
|
1079 |
+
history.append({"role": "assistant", "metadata": metadata, "content": content})
|
1080 |
+
if history[0]["role"] == "system" and "tools" in history[0]:
|
1081 |
+
content = "\n".join(content.split("\n")[1:-1])
|
1082 |
+
def tool_call(**kwargs):
|
1083 |
+
return kwargs
|
1084 |
+
parameters = eval(content)
|
1085 |
+
content = {"name": metadata.strip(), "parameters": parameters}
|
1086 |
+
else:
|
1087 |
+
content = {"name": metadata.strip(), "content": content}
|
1088 |
+
return content, history
|
1089 |
+
|
1090 |
+
@torch.inference_mode()
|
1091 |
+
def chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
|
1092 |
+
max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None,
|
1093 |
+
**kwargs):
|
1094 |
+
if history is None:
|
1095 |
+
history = []
|
1096 |
+
if logits_processor is None:
|
1097 |
+
logits_processor = LogitsProcessorList()
|
1098 |
+
logits_processor.append(InvalidScoreLogitsProcessor())
|
1099 |
+
gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p,
|
1100 |
+
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
|
1101 |
+
inputs = tokenizer.build_chat_input(query, history=history, role=role)
|
1102 |
+
inputs = inputs.to(self.device)
|
1103 |
+
eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
|
1104 |
+
tokenizer.get_command("<|observation|>")]
|
1105 |
+
outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id)
|
1106 |
+
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
|
1107 |
+
response = tokenizer.decode(outputs)
|
1108 |
+
history.append({"role": role, "content": query})
|
1109 |
+
response, history = self.process_response(response, history)
|
1110 |
+
return response, history
|
1111 |
+
|
1112 |
+
@torch.inference_mode()
|
1113 |
+
def stream_chat(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, role: str = "user",
|
1114 |
+
past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8,
|
1115 |
+
logits_processor=None, return_past_key_values=False, **kwargs):
|
1116 |
+
if history is None:
|
1117 |
+
history = []
|
1118 |
+
if logits_processor is None:
|
1119 |
+
logits_processor = LogitsProcessorList()
|
1120 |
+
logits_processor.append(InvalidScoreLogitsProcessor())
|
1121 |
+
eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"),
|
1122 |
+
tokenizer.get_command("<|observation|>")]
|
1123 |
+
gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p,
|
1124 |
+
"temperature": temperature, "logits_processor": logits_processor, **kwargs}
|
1125 |
+
if past_key_values is None:
|
1126 |
+
inputs = tokenizer.build_chat_input(query, history=history, role=role)
|
1127 |
+
else:
|
1128 |
+
inputs = tokenizer.build_chat_input(query, role=role)
|
1129 |
+
inputs = inputs.to(self.device)
|
1130 |
+
if past_key_values is not None:
|
1131 |
+
past_length = past_key_values[0][0].shape[0]
|
1132 |
+
if self.transformer.pre_seq_len is not None:
|
1133 |
+
past_length -= self.transformer.pre_seq_len
|
1134 |
+
inputs.position_ids += past_length
|
1135 |
+
attention_mask = inputs.attention_mask
|
1136 |
+
attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1)
|
1137 |
+
inputs['attention_mask'] = attention_mask
|
1138 |
+
history.append({"role": role, "content": query})
|
1139 |
+
for outputs in self.stream_generate(**inputs, past_key_values=past_key_values,
|
1140 |
+
eos_token_id=eos_token_id, return_past_key_values=return_past_key_values,
|
1141 |
+
**gen_kwargs):
|
1142 |
+
if return_past_key_values:
|
1143 |
+
outputs, past_key_values = outputs
|
1144 |
+
outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
|
1145 |
+
response = tokenizer.decode(outputs)
|
1146 |
+
if response and response[-1] != "�":
|
1147 |
+
response, new_history = self.process_response(response, history)
|
1148 |
+
if return_past_key_values:
|
1149 |
+
yield response, new_history, past_key_values
|
1150 |
+
else:
|
1151 |
+
yield response, new_history
|
1152 |
+
|
1153 |
+
@torch.inference_mode()
|
1154 |
+
def stream_generate(
|
1155 |
+
self,
|
1156 |
+
input_ids,
|
1157 |
+
generation_config: Optional[GenerationConfig] = None,
|
1158 |
+
logits_processor: Optional[LogitsProcessorList] = None,
|
1159 |
+
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
1160 |
+
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
|
1161 |
+
return_past_key_values=False,
|
1162 |
+
**kwargs,
|
1163 |
+
):
|
1164 |
+
batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1]
|
1165 |
+
|
1166 |
+
if generation_config is None:
|
1167 |
+
generation_config = self.generation_config
|
1168 |
+
generation_config = copy.deepcopy(generation_config)
|
1169 |
+
model_kwargs = generation_config.update(**kwargs)
|
1170 |
+
model_kwargs["use_cache"] = generation_config.use_cache
|
1171 |
+
bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id
|
1172 |
+
|
1173 |
+
if isinstance(eos_token_id, int):
|
1174 |
+
eos_token_id = [eos_token_id]
|
1175 |
+
eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
|
1176 |
+
|
1177 |
+
has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
|
1178 |
+
if has_default_max_length and generation_config.max_new_tokens is None:
|
1179 |
+
warnings.warn(
|
1180 |
+
f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
|
1181 |
+
"This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
|
1182 |
+
" recommend using `max_new_tokens` to control the maximum length of the generation.",
|
1183 |
+
UserWarning,
|
1184 |
+
)
|
1185 |
+
elif generation_config.max_new_tokens is not None:
|
1186 |
+
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
|
1187 |
+
if not has_default_max_length:
|
1188 |
+
logger.warn(
|
1189 |
+
f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
|
1190 |
+
f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
|
1191 |
+
"Please refer to the documentation for more information. "
|
1192 |
+
"(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)",
|
1193 |
+
UserWarning,
|
1194 |
+
)
|
1195 |
+
|
1196 |
+
if input_ids_seq_length >= generation_config.max_length:
|
1197 |
+
input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
|
1198 |
+
logger.warning(
|
1199 |
+
f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to"
|
1200 |
+
f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider"
|
1201 |
+
" increasing `max_new_tokens`."
|
1202 |
+
)
|
1203 |
+
|
1204 |
+
# 2. Set generation parameters if not already defined
|
1205 |
+
logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
|
1206 |
+
stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
|
1207 |
+
|
1208 |
+
logits_processor = self._get_logits_processor(
|
1209 |
+
generation_config=generation_config,
|
1210 |
+
input_ids_seq_length=input_ids_seq_length,
|
1211 |
+
encoder_input_ids=input_ids,
|
1212 |
+
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
|
1213 |
+
logits_processor=logits_processor,
|
1214 |
+
)
|
1215 |
+
|
1216 |
+
stopping_criteria = self._get_stopping_criteria(
|
1217 |
+
generation_config=generation_config, stopping_criteria=stopping_criteria
|
1218 |
+
)
|
1219 |
+
logits_warper = self._get_logits_warper(generation_config)
|
1220 |
+
|
1221 |
+
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
|
1222 |
+
scores = None
|
1223 |
+
while True:
|
1224 |
+
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
|
1225 |
+
# forward pass to get next token
|
1226 |
+
outputs = self(
|
1227 |
+
**model_inputs,
|
1228 |
+
return_dict=True,
|
1229 |
+
output_attentions=False,
|
1230 |
+
output_hidden_states=False,
|
1231 |
+
)
|
1232 |
+
|
1233 |
+
next_token_logits = outputs.logits[:, -1, :]
|
1234 |
+
|
1235 |
+
# pre-process distribution
|
1236 |
+
next_token_scores = logits_processor(input_ids, next_token_logits)
|
1237 |
+
next_token_scores = logits_warper(input_ids, next_token_scores)
|
1238 |
+
|
1239 |
+
# sample
|
1240 |
+
probs = nn.functional.softmax(next_token_scores, dim=-1)
|
1241 |
+
if generation_config.do_sample:
|
1242 |
+
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
|
1243 |
+
else:
|
1244 |
+
next_tokens = torch.argmax(probs, dim=-1)
|
1245 |
+
# update generated ids, model inputs, and length for next step
|
1246 |
+
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
|
1247 |
+
model_kwargs = self._update_model_kwargs_for_generation(
|
1248 |
+
outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
|
1249 |
+
)
|
1250 |
+
unfinished_sequences = unfinished_sequences.mul(
|
1251 |
+
next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
|
1252 |
+
)
|
1253 |
+
if return_past_key_values:
|
1254 |
+
yield input_ids, outputs.past_key_values
|
1255 |
+
else:
|
1256 |
+
yield input_ids
|
1257 |
+
# stop when each sentence is finished, or if we exceed the maximum length
|
1258 |
+
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
|
1259 |
+
break
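The loop above stops a sequence as soon as its newly sampled token matches any of the EOS ids; the bookkeeping is done with the tile/ne/prod expression a few lines earlier. A standalone sketch, not part of the repository diff, showing that update on small tensors:

# Minimal sketch of the unfinished_sequences update, assuming toy values.
import torch

eos_token_id_tensor = torch.tensor([2, 7])               # several possible stop tokens
unfinished_sequences = torch.ones(3, dtype=torch.long)   # 3 sequences, all still running
next_tokens = torch.tensor([5, 7, 9])                    # sequence 1 just emitted an EOS

still_running = (
    next_tokens.tile(eos_token_id_tensor.shape[0], 1)
    .ne(eos_token_id_tensor.unsqueeze(1))
    .prod(dim=0)                                         # 1 only if the token matches no EOS id
)
unfinished_sequences = unfinished_sequences.mul(still_running)
print(unfinished_sequences)                              # tensor([1, 0, 1])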
|
1260 |
+
|
1261 |
+
def quantize(self, bits: int, empty_init=False, device=None, **kwargs):
|
1262 |
+
if bits == 0:
|
1263 |
+
return
|
1264 |
+
|
1265 |
+
from .quantization import quantize
|
1266 |
+
|
1267 |
+
if self.quantized:
|
1268 |
+
logger.info("Already quantized.")
|
1269 |
+
return self
|
1270 |
+
|
1271 |
+
self.quantized = True
|
1272 |
+
|
1273 |
+
self.config.quantization_bit = bits
|
1274 |
+
|
1275 |
+
self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device,
|
1276 |
+
**kwargs)
|
1277 |
+
return self
|
1278 |
+
|
1279 |
+
|
1280 |
+
class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel):
|
1281 |
+
def __init__(self, config: ChatGLMConfig, empty_init=True, device=None):
|
1282 |
+
super().__init__(config)
|
1283 |
+
|
1284 |
+
self.num_labels = config.num_labels
|
1285 |
+
self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device)
|
1286 |
+
|
1287 |
+
self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half)
|
1288 |
+
if config.classifier_dropout is not None:
|
1289 |
+
self.dropout = nn.Dropout(config.classifier_dropout)
|
1290 |
+
else:
|
1291 |
+
self.dropout = None
|
1292 |
+
self.config = config
|
1293 |
+
|
1294 |
+
if self.config.quantization_bit:
|
1295 |
+
self.quantize(self.config.quantization_bit, empty_init=True)
|
1296 |
+
|
1297 |
+
def forward(
|
1298 |
+
self,
|
1299 |
+
input_ids: Optional[torch.LongTensor] = None,
|
1300 |
+
position_ids: Optional[torch.LongTensor] = None,
|
1301 |
+
attention_mask: Optional[torch.Tensor] = None,
|
1302 |
+
full_attention_mask: Optional[torch.Tensor] = None,
|
1303 |
+
past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
|
1304 |
+
inputs_embeds: Optional[torch.LongTensor] = None,
|
1305 |
+
labels: Optional[torch.LongTensor] = None,
|
1306 |
+
use_cache: Optional[bool] = None,
|
1307 |
+
output_hidden_states: Optional[bool] = None,
|
1308 |
+
return_dict: Optional[bool] = None,
|
1309 |
+
) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]:
|
1310 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
1311 |
+
|
1312 |
+
transformer_outputs = self.transformer(
|
1313 |
+
input_ids=input_ids,
|
1314 |
+
position_ids=position_ids,
|
1315 |
+
attention_mask=attention_mask,
|
1316 |
+
full_attention_mask=full_attention_mask,
|
1317 |
+
past_key_values=past_key_values,
|
1318 |
+
inputs_embeds=inputs_embeds,
|
1319 |
+
use_cache=use_cache,
|
1320 |
+
output_hidden_states=output_hidden_states,
|
1321 |
+
return_dict=return_dict,
|
1322 |
+
)
|
1323 |
+
|
1324 |
+
hidden_states = transformer_outputs[0]
|
1325 |
+
pooled_hidden_states = hidden_states[-1]
|
1326 |
+
if self.dropout is not None:
|
1327 |
+
pooled_hidden_states = self.dropout(pooled_hidden_states)
|
1328 |
+
logits = self.classifier_head(pooled_hidden_states)
|
1329 |
+
|
1330 |
+
loss = None
|
1331 |
+
if labels is not None:
|
1332 |
+
if self.config.problem_type is None:
|
1333 |
+
if self.num_labels == 1:
|
1334 |
+
self.config.problem_type = "regression"
|
1335 |
+
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
|
1336 |
+
self.config.problem_type = "single_label_classification"
|
1337 |
+
else:
|
1338 |
+
self.config.problem_type = "multi_label_classification"
|
1339 |
+
|
1340 |
+
if self.config.problem_type == "regression":
|
1341 |
+
loss_fct = MSELoss()
|
1342 |
+
if self.num_labels == 1:
|
1343 |
+
loss = loss_fct(logits.squeeze().float(), labels.squeeze())
|
1344 |
+
else:
|
1345 |
+
loss = loss_fct(logits.float(), labels)
|
1346 |
+
elif self.config.problem_type == "single_label_classification":
|
1347 |
+
loss_fct = CrossEntropyLoss()
|
1348 |
+
loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1))
|
1349 |
+
elif self.config.problem_type == "multi_label_classification":
|
1350 |
+
loss_fct = BCEWithLogitsLoss()
|
1351 |
+
loss = loss_fct(logits.float(), labels.view(-1, self.num_labels))
|
1352 |
+
|
1353 |
+
if not return_dict:
|
1354 |
+
output = (logits,) + transformer_outputs[1:]
|
1355 |
+
return ((loss,) + output) if loss is not None else output
|
1356 |
+
|
1357 |
+
return SequenceClassifierOutputWithPast(
|
1358 |
+
loss=loss,
|
1359 |
+
logits=logits,
|
1360 |
+
past_key_values=transformer_outputs.past_key_values,
|
1361 |
+
hidden_states=transformer_outputs.hidden_states,
|
1362 |
+
attentions=transformer_outputs.attentions,
|
1363 |
+
)
|
diffsynth/models/sd3_dit.py
ADDED
@@ -0,0 +1,797 @@
1 |
+
import torch
|
2 |
+
from einops import rearrange
|
3 |
+
from .svd_unet import TemporalTimesteps
|
4 |
+
from .tiler import TileWorker
|
5 |
+
|
6 |
+
|
7 |
+
|
8 |
+
class PatchEmbed(torch.nn.Module):
|
9 |
+
def __init__(self, patch_size=2, in_channels=16, embed_dim=1536, pos_embed_max_size=192):
|
10 |
+
super().__init__()
|
11 |
+
self.pos_embed_max_size = pos_embed_max_size
|
12 |
+
self.patch_size = patch_size
|
13 |
+
|
14 |
+
self.proj = torch.nn.Conv2d(in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size)
|
15 |
+
self.pos_embed = torch.nn.Parameter(torch.zeros(1, self.pos_embed_max_size, self.pos_embed_max_size, 1536))
|
16 |
+
|
17 |
+
def cropped_pos_embed(self, height, width):
|
18 |
+
height = height // self.patch_size
|
19 |
+
width = width // self.patch_size
|
20 |
+
top = (self.pos_embed_max_size - height) // 2
|
21 |
+
left = (self.pos_embed_max_size - width) // 2
|
22 |
+
spatial_pos_embed = self.pos_embed[:, top : top + height, left : left + width, :].flatten(1, 2)
|
23 |
+
return spatial_pos_embed
|
24 |
+
|
25 |
+
def forward(self, latent):
|
26 |
+
height, width = latent.shape[-2:]
|
27 |
+
latent = self.proj(latent)
|
28 |
+
latent = latent.flatten(2).transpose(1, 2)
|
29 |
+
pos_embed = self.cropped_pos_embed(height, width)
|
30 |
+
return latent + pos_embed
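PatchEmbed above splits the latent into 2x2 patches with a strided Conv2d, flattens the result into a token sequence, and adds a center crop of the learned positional-embedding grid. A standalone sketch, not part of the repository diff, with a small embed_dim instead of 1536:

# Minimal sketch of patchify + cropped positional embedding, assuming toy sizes.
import torch

patch_size, in_channels, embed_dim, pos_embed_max_size = 2, 16, 8, 12
proj = torch.nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
pos_embed = torch.zeros(1, pos_embed_max_size, pos_embed_max_size, embed_dim)

latent = torch.randn(1, in_channels, 8, 8)                    # [B, C, H, W]
tokens = proj(latent).flatten(2).transpose(1, 2)              # [B, (H/2)*(W/2), embed_dim]

h, w = 8 // patch_size, 8 // patch_size
top, left = (pos_embed_max_size - h) // 2, (pos_embed_max_size - w) // 2
cropped = pos_embed[:, top:top + h, left:left + w, :].flatten(1, 2)   # [1, h*w, embed_dim]
print((tokens + cropped).shape)                               # torch.Size([1, 16, 8])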
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
class TimestepEmbeddings(torch.nn.Module):
|
35 |
+
def __init__(self, dim_in, dim_out):
|
36 |
+
super().__init__()
|
37 |
+
self.time_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0)
|
38 |
+
self.timestep_embedder = torch.nn.Sequential(
|
39 |
+
torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
|
40 |
+
)
|
41 |
+
|
42 |
+
def forward(self, timestep, dtype):
|
43 |
+
time_emb = self.time_proj(timestep).to(dtype)
|
44 |
+
time_emb = self.timestep_embedder(time_emb)
|
45 |
+
return time_emb
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
class AdaLayerNorm(torch.nn.Module):
|
50 |
+
def __init__(self, dim, single=False):
|
51 |
+
super().__init__()
|
52 |
+
self.single = single
|
53 |
+
self.linear = torch.nn.Linear(dim, dim * (2 if single else 6))
|
54 |
+
self.norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
55 |
+
|
56 |
+
def forward(self, x, emb):
|
57 |
+
emb = self.linear(torch.nn.functional.silu(emb))
|
58 |
+
if self.single:
|
59 |
+
scale, shift = emb.unsqueeze(1).chunk(2, dim=2)
|
60 |
+
x = self.norm(x) * (1 + scale) + shift
|
61 |
+
return x
|
62 |
+
else:
|
63 |
+
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.unsqueeze(1).chunk(6, dim=2)
|
64 |
+
x = self.norm(x) * (1 + scale_msa) + shift_msa
|
65 |
+
return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
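AdaLayerNorm above projects the conditioning embedding into per-channel shift and scale values (plus gates in the non-"single" case) and modulates the normalized activations with them. A standalone sketch of the "single" variant, not part of the repository diff, with toy sizes:

# Minimal sketch of adaptive LayerNorm modulation, assuming toy sizes.
import torch

dim = 8
norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
linear = torch.nn.Linear(dim, dim * 2)                       # "single" variant: scale + shift

x = torch.randn(2, 5, dim)                                   # [batch, tokens, dim]
emb = torch.randn(2, dim)                                    # conditioning (timestep + pooled text)

scale, shift = linear(torch.nn.functional.silu(emb)).unsqueeze(1).chunk(2, dim=2)
x_mod = norm(x) * (1 + scale) + shift                        # broadcast over the token axis
print(x_mod.shape)                                           # torch.Size([2, 5, 8])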
|
66 |
+
|
67 |
+
|
68 |
+
|
69 |
+
class JointAttention(torch.nn.Module):
|
70 |
+
def __init__(self, dim_a, dim_b, num_heads, head_dim, only_out_a=False):
|
71 |
+
super().__init__()
|
72 |
+
self.num_heads = num_heads
|
73 |
+
self.head_dim = head_dim
|
74 |
+
self.only_out_a = only_out_a
|
75 |
+
|
76 |
+
self.a_to_qkv = torch.nn.Linear(dim_a, dim_a * 3)
|
77 |
+
self.b_to_qkv = torch.nn.Linear(dim_b, dim_b * 3)
|
78 |
+
|
79 |
+
self.a_to_out = torch.nn.Linear(dim_a, dim_a)
|
80 |
+
if not only_out_a:
|
81 |
+
self.b_to_out = torch.nn.Linear(dim_b, dim_b)
|
82 |
+
|
83 |
+
def forward(self, hidden_states_a, hidden_states_b):
|
84 |
+
batch_size = hidden_states_a.shape[0]
|
85 |
+
|
86 |
+
qkv = torch.concat([self.a_to_qkv(hidden_states_a), self.b_to_qkv(hidden_states_b)], dim=1)
|
87 |
+
qkv = qkv.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
|
88 |
+
q, k, v = qkv.chunk(3, dim=1)
|
89 |
+
|
90 |
+
hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v)
|
91 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
|
92 |
+
hidden_states = hidden_states.to(q.dtype)
|
93 |
+
hidden_states_a, hidden_states_b = hidden_states[:, :hidden_states_a.shape[1]], hidden_states[:, hidden_states_a.shape[1]:]
|
94 |
+
hidden_states_a = self.a_to_out(hidden_states_a)
|
95 |
+
if self.only_out_a:
|
96 |
+
return hidden_states_a
|
97 |
+
else:
|
98 |
+
hidden_states_b = self.b_to_out(hidden_states_b)
|
99 |
+
return hidden_states_a, hidden_states_b
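JointAttention above projects the image stream (a) and the text stream (b) to q/k/v separately, concatenates them along the sequence axis, attends over the joint sequence, and splits the result back per stream. A standalone sketch, not part of the repository diff, with toy sizes:

# Minimal sketch of joint attention over concatenated token streams, assuming toy sizes.
import torch

batch, len_a, len_b, num_heads, head_dim = 2, 16, 7, 4, 8
dim = num_heads * head_dim
a_to_qkv = torch.nn.Linear(dim, dim * 3)
b_to_qkv = torch.nn.Linear(dim, dim * 3)

hidden_a = torch.randn(batch, len_a, dim)
hidden_b = torch.randn(batch, len_b, dim)

qkv = torch.concat([a_to_qkv(hidden_a), b_to_qkv(hidden_b)], dim=1)      # [B, La+Lb, 3*dim]
qkv = qkv.view(batch, -1, 3 * num_heads, head_dim).transpose(1, 2)       # [B, 3*heads, La+Lb, hd]
q, k, v = qkv.chunk(3, dim=1)                                            # each [B, heads, La+Lb, hd]
out = torch.nn.functional.scaled_dot_product_attention(q, k, v)
out = out.transpose(1, 2).reshape(batch, -1, dim)                        # [B, La+Lb, dim]
out_a, out_b = out[:, :len_a], out[:, len_a:]
print(out_a.shape, out_b.shape)          # torch.Size([2, 16, 32]) torch.Size([2, 7, 32])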
|
100 |
+
|
101 |
+
|
102 |
+
|
103 |
+
class JointTransformerBlock(torch.nn.Module):
|
104 |
+
def __init__(self, dim, num_attention_heads):
|
105 |
+
super().__init__()
|
106 |
+
self.norm1_a = AdaLayerNorm(dim)
|
107 |
+
self.norm1_b = AdaLayerNorm(dim)
|
108 |
+
|
109 |
+
self.attn = JointAttention(dim, dim, num_attention_heads, dim // num_attention_heads)
|
110 |
+
|
111 |
+
self.norm2_a = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
112 |
+
self.ff_a = torch.nn.Sequential(
|
113 |
+
torch.nn.Linear(dim, dim*4),
|
114 |
+
torch.nn.GELU(approximate="tanh"),
|
115 |
+
torch.nn.Linear(dim*4, dim)
|
116 |
+
)
|
117 |
+
|
118 |
+
self.norm2_b = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
119 |
+
self.ff_b = torch.nn.Sequential(
|
120 |
+
torch.nn.Linear(dim, dim*4),
|
121 |
+
torch.nn.GELU(approximate="tanh"),
|
122 |
+
torch.nn.Linear(dim*4, dim)
|
123 |
+
)
|
124 |
+
|
125 |
+
|
126 |
+
def forward(self, hidden_states_a, hidden_states_b, temb):
|
127 |
+
norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
|
128 |
+
norm_hidden_states_b, gate_msa_b, shift_mlp_b, scale_mlp_b, gate_mlp_b = self.norm1_b(hidden_states_b, emb=temb)
|
129 |
+
|
130 |
+
# Attention
|
131 |
+
attn_output_a, attn_output_b = self.attn(norm_hidden_states_a, norm_hidden_states_b)
|
132 |
+
|
133 |
+
# Part A
|
134 |
+
hidden_states_a = hidden_states_a + gate_msa_a * attn_output_a
|
135 |
+
norm_hidden_states_a = self.norm2_a(hidden_states_a) * (1 + scale_mlp_a) + shift_mlp_a
|
136 |
+
hidden_states_a = hidden_states_a + gate_mlp_a * self.ff_a(norm_hidden_states_a)
|
137 |
+
|
138 |
+
# Part B
|
139 |
+
hidden_states_b = hidden_states_b + gate_msa_b * attn_output_b
|
140 |
+
norm_hidden_states_b = self.norm2_b(hidden_states_b) * (1 + scale_mlp_b) + shift_mlp_b
|
141 |
+
hidden_states_b = hidden_states_b + gate_mlp_b * self.ff_b(norm_hidden_states_b)
|
142 |
+
|
143 |
+
return hidden_states_a, hidden_states_b
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
class JointTransformerFinalBlock(torch.nn.Module):
|
148 |
+
def __init__(self, dim, num_attention_heads):
|
149 |
+
super().__init__()
|
150 |
+
self.norm1_a = AdaLayerNorm(dim)
|
151 |
+
self.norm1_b = AdaLayerNorm(dim, single=True)
|
152 |
+
|
153 |
+
self.attn = JointAttention(dim, dim, num_attention_heads, dim // num_attention_heads, only_out_a=True)
|
154 |
+
|
155 |
+
self.norm2_a = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
156 |
+
self.ff_a = torch.nn.Sequential(
|
157 |
+
torch.nn.Linear(dim, dim*4),
|
158 |
+
torch.nn.GELU(approximate="tanh"),
|
159 |
+
torch.nn.Linear(dim*4, dim)
|
160 |
+
)
|
161 |
+
|
162 |
+
|
163 |
+
def forward(self, hidden_states_a, hidden_states_b, temb):
|
164 |
+
norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
|
165 |
+
norm_hidden_states_b = self.norm1_b(hidden_states_b, emb=temb)
|
166 |
+
|
167 |
+
# Attention
|
168 |
+
attn_output_a = self.attn(norm_hidden_states_a, norm_hidden_states_b)
|
169 |
+
|
170 |
+
# Part A
|
171 |
+
hidden_states_a = hidden_states_a + gate_msa_a * attn_output_a
|
172 |
+
norm_hidden_states_a = self.norm2_a(hidden_states_a) * (1 + scale_mlp_a) + shift_mlp_a
|
173 |
+
hidden_states_a = hidden_states_a + gate_mlp_a * self.ff_a(norm_hidden_states_a)
|
174 |
+
|
175 |
+
return hidden_states_a, hidden_states_b
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
class SD3DiT(torch.nn.Module):
|
180 |
+
def __init__(self):
|
181 |
+
super().__init__()
|
182 |
+
self.pos_embedder = PatchEmbed(patch_size=2, in_channels=16, embed_dim=1536, pos_embed_max_size=192)
|
183 |
+
self.time_embedder = TimestepEmbeddings(256, 1536)
|
184 |
+
self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(2048, 1536), torch.nn.SiLU(), torch.nn.Linear(1536, 1536))
|
185 |
+
self.context_embedder = torch.nn.Linear(4096, 1536)
|
186 |
+
self.blocks = torch.nn.ModuleList([JointTransformerBlock(1536, 24) for _ in range(23)] + [JointTransformerFinalBlock(1536, 24)])
|
187 |
+
self.norm_out = AdaLayerNorm(1536, single=True)
|
188 |
+
self.proj_out = torch.nn.Linear(1536, 64)
|
189 |
+
|
190 |
+
def tiled_forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tile_size=128, tile_stride=64):
|
191 |
+
# Due to the global positional embedding, we cannot implement layer-wise tiled forward.
|
192 |
+
hidden_states = TileWorker().tiled_forward(
|
193 |
+
lambda x: self.forward(x, timestep, prompt_emb, pooled_prompt_emb),
|
194 |
+
hidden_states,
|
195 |
+
tile_size,
|
196 |
+
tile_stride,
|
197 |
+
tile_device=hidden_states.device,
|
198 |
+
tile_dtype=hidden_states.dtype
|
199 |
+
)
|
200 |
+
return hidden_states
|
201 |
+
|
202 |
+
def forward(self, hidden_states, timestep, prompt_emb, pooled_prompt_emb, tiled=False, tile_size=128, tile_stride=64, use_gradient_checkpointing=False):
|
203 |
+
if tiled:
|
204 |
+
return self.tiled_forward(hidden_states, timestep, prompt_emb, pooled_prompt_emb, tile_size, tile_stride)
|
205 |
+
conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
|
206 |
+
prompt_emb = self.context_embedder(prompt_emb)
|
207 |
+
|
208 |
+
height, width = hidden_states.shape[-2:]
|
209 |
+
hidden_states = self.pos_embedder(hidden_states)
|
210 |
+
|
211 |
+
def create_custom_forward(module):
|
212 |
+
def custom_forward(*inputs):
|
213 |
+
return module(*inputs)
|
214 |
+
return custom_forward
|
215 |
+
|
216 |
+
for block in self.blocks:
|
217 |
+
if self.training and use_gradient_checkpointing:
|
218 |
+
hidden_states, prompt_emb = torch.utils.checkpoint.checkpoint(
|
219 |
+
create_custom_forward(block),
|
220 |
+
hidden_states, prompt_emb, conditioning,
|
221 |
+
use_reentrant=False,
|
222 |
+
)
|
223 |
+
else:
|
224 |
+
hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning)
|
225 |
+
|
226 |
+
hidden_states = self.norm_out(hidden_states, conditioning)
|
227 |
+
hidden_states = self.proj_out(hidden_states)
|
228 |
+
hidden_states = rearrange(hidden_states, "B (H W) (P Q C) -> B C (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
|
229 |
+
return hidden_states
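The final rearrange above is the unpatchify step: each token carries a 2x2 patch of output channels, and einops folds the token sequence back into a [B, C, H, W] latent. A standalone sketch, not part of the repository diff, with a small channel count instead of 16:

# Minimal sketch of the unpatchify rearrange, assuming toy sizes.
import torch
from einops import rearrange

B, C, P, Q = 1, 4, 2, 2
H, W = 8, 8                                                  # latent height/width
tokens = torch.randn(B, (H // P) * (W // Q), P * Q * C)      # [B, (H/2)*(W/2), 2*2*C]

latent = rearrange(tokens, "B (H W) (P Q C) -> B C (H P) (W Q)",
                   P=P, Q=Q, H=H // P, W=W // Q)
print(latent.shape)                                          # torch.Size([1, 4, 8, 8])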
|
230 |
+
|
231 |
+
def state_dict_converter(self):
|
232 |
+
return SD3DiTStateDictConverter()
|
233 |
+
|
234 |
+
|
235 |
+
|
236 |
+
class SD3DiTStateDictConverter:
|
237 |
+
def __init__(self):
|
238 |
+
pass
|
239 |
+
|
240 |
+
def from_diffusers(self, state_dict):
|
241 |
+
rename_dict = {
|
242 |
+
"context_embedder": "context_embedder",
|
243 |
+
"pos_embed.pos_embed": "pos_embedder.pos_embed",
|
244 |
+
"pos_embed.proj": "pos_embedder.proj",
|
245 |
+
"time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
|
246 |
+
"time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
|
247 |
+
"time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
|
248 |
+
"time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
|
249 |
+
"norm_out.linear": "norm_out.linear",
|
250 |
+
"proj_out": "proj_out",
|
251 |
+
|
252 |
+
"norm1.linear": "norm1_a.linear",
|
253 |
+
"norm1_context.linear": "norm1_b.linear",
|
254 |
+
"attn.to_q": "attn.a_to_q",
|
255 |
+
"attn.to_k": "attn.a_to_k",
|
256 |
+
"attn.to_v": "attn.a_to_v",
|
257 |
+
"attn.to_out.0": "attn.a_to_out",
|
258 |
+
"attn.add_q_proj": "attn.b_to_q",
|
259 |
+
"attn.add_k_proj": "attn.b_to_k",
|
260 |
+
"attn.add_v_proj": "attn.b_to_v",
|
261 |
+
"attn.to_add_out": "attn.b_to_out",
|
262 |
+
"ff.net.0.proj": "ff_a.0",
|
263 |
+
"ff.net.2": "ff_a.2",
|
264 |
+
"ff_context.net.0.proj": "ff_b.0",
|
265 |
+
"ff_context.net.2": "ff_b.2",
|
266 |
+
}
|
267 |
+
state_dict_ = {}
|
268 |
+
for name, param in state_dict.items():
|
269 |
+
if name in rename_dict:
|
270 |
+
if name == "pos_embed.pos_embed":
|
271 |
+
param = param.reshape((1, 192, 192, 1536))
|
272 |
+
state_dict_[rename_dict[name]] = param
|
273 |
+
elif name.endswith(".weight") or name.endswith(".bias"):
|
274 |
+
suffix = ".weight" if name.endswith(".weight") else ".bias"
|
275 |
+
prefix = name[:-len(suffix)]
|
276 |
+
if prefix in rename_dict:
|
277 |
+
state_dict_[rename_dict[prefix] + suffix] = param
|
278 |
+
elif prefix.startswith("transformer_blocks."):
|
279 |
+
names = prefix.split(".")
|
280 |
+
names[0] = "blocks"
|
281 |
+
middle = ".".join(names[2:])
|
282 |
+
if middle in rename_dict:
|
283 |
+
name_ = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
|
284 |
+
state_dict_[name_] = param
|
285 |
+
return state_dict_
|
286 |
+
|
287 |
+
def from_civitai(self, state_dict):
|
288 |
+
rename_dict = {
|
289 |
+
"model.diffusion_model.context_embedder.bias": "context_embedder.bias",
|
290 |
+
"model.diffusion_model.context_embedder.weight": "context_embedder.weight",
|
291 |
+
"model.diffusion_model.final_layer.linear.bias": "proj_out.bias",
|
292 |
+
"model.diffusion_model.final_layer.linear.weight": "proj_out.weight",
|
293 |
+
"model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.bias": "blocks.0.norm1_b.linear.bias",
|
294 |
+
"model.diffusion_model.joint_blocks.0.context_block.adaLN_modulation.1.weight": "blocks.0.norm1_b.linear.weight",
|
295 |
+
"model.diffusion_model.joint_blocks.0.context_block.attn.proj.bias": "blocks.0.attn.b_to_out.bias",
|
296 |
+
"model.diffusion_model.joint_blocks.0.context_block.attn.proj.weight": "blocks.0.attn.b_to_out.weight",
|
297 |
+
"model.diffusion_model.joint_blocks.0.context_block.attn.qkv.bias": ['blocks.0.attn.b_to_q.bias', 'blocks.0.attn.b_to_k.bias', 'blocks.0.attn.b_to_v.bias'],
|
298 |
+
"model.diffusion_model.joint_blocks.0.context_block.attn.qkv.weight": ['blocks.0.attn.b_to_q.weight', 'blocks.0.attn.b_to_k.weight', 'blocks.0.attn.b_to_v.weight'],
|
299 |
+
"model.diffusion_model.joint_blocks.0.context_block.mlp.fc1.bias": "blocks.0.ff_b.0.bias",
|
300 |
+
"model.diffusion_model.joint_blocks.0.context_block.mlp.fc1.weight": "blocks.0.ff_b.0.weight",
|
301 |
+
"model.diffusion_model.joint_blocks.0.context_block.mlp.fc2.bias": "blocks.0.ff_b.2.bias",
|
302 |
+
"model.diffusion_model.joint_blocks.0.context_block.mlp.fc2.weight": "blocks.0.ff_b.2.weight",
|
303 |
+
"model.diffusion_model.joint_blocks.0.x_block.adaLN_modulation.1.bias": "blocks.0.norm1_a.linear.bias",
|
304 |
+
"model.diffusion_model.joint_blocks.0.x_block.adaLN_modulation.1.weight": "blocks.0.norm1_a.linear.weight",
|
305 |
+
"model.diffusion_model.joint_blocks.0.x_block.attn.proj.bias": "blocks.0.attn.a_to_out.bias",
|
306 |
+
"model.diffusion_model.joint_blocks.0.x_block.attn.proj.weight": "blocks.0.attn.a_to_out.weight",
|
307 |
+
"model.diffusion_model.joint_blocks.0.x_block.attn.qkv.bias": ['blocks.0.attn.a_to_q.bias', 'blocks.0.attn.a_to_k.bias', 'blocks.0.attn.a_to_v.bias'],
|
308 |
+
"model.diffusion_model.joint_blocks.0.x_block.attn.qkv.weight": ['blocks.0.attn.a_to_q.weight', 'blocks.0.attn.a_to_k.weight', 'blocks.0.attn.a_to_v.weight'],
|
309 |
+
"model.diffusion_model.joint_blocks.0.x_block.mlp.fc1.bias": "blocks.0.ff_a.0.bias",
|
310 |
+
"model.diffusion_model.joint_blocks.0.x_block.mlp.fc1.weight": "blocks.0.ff_a.0.weight",
|
311 |
+
"model.diffusion_model.joint_blocks.0.x_block.mlp.fc2.bias": "blocks.0.ff_a.2.bias",
|
312 |
+
"model.diffusion_model.joint_blocks.0.x_block.mlp.fc2.weight": "blocks.0.ff_a.2.weight",
|
313 |
+
"model.diffusion_model.joint_blocks.1.context_block.adaLN_modulation.1.bias": "blocks.1.norm1_b.linear.bias",
|
314 |
+
"model.diffusion_model.joint_blocks.1.context_block.adaLN_modulation.1.weight": "blocks.1.norm1_b.linear.weight",
|
315 |
+
"model.diffusion_model.joint_blocks.1.context_block.attn.proj.bias": "blocks.1.attn.b_to_out.bias",
|
316 |
+
"model.diffusion_model.joint_blocks.1.context_block.attn.proj.weight": "blocks.1.attn.b_to_out.weight",
|
317 |
+
"model.diffusion_model.joint_blocks.1.context_block.attn.qkv.bias": ['blocks.1.attn.b_to_q.bias', 'blocks.1.attn.b_to_k.bias', 'blocks.1.attn.b_to_v.bias'],
|
318 |
+
"model.diffusion_model.joint_blocks.1.context_block.attn.qkv.weight": ['blocks.1.attn.b_to_q.weight', 'blocks.1.attn.b_to_k.weight', 'blocks.1.attn.b_to_v.weight'],
|
319 |
+
"model.diffusion_model.joint_blocks.1.context_block.mlp.fc1.bias": "blocks.1.ff_b.0.bias",
|
320 |
+
"model.diffusion_model.joint_blocks.1.context_block.mlp.fc1.weight": "blocks.1.ff_b.0.weight",
|
321 |
+
"model.diffusion_model.joint_blocks.1.context_block.mlp.fc2.bias": "blocks.1.ff_b.2.bias",
|
322 |
+
"model.diffusion_model.joint_blocks.1.context_block.mlp.fc2.weight": "blocks.1.ff_b.2.weight",
|
323 |
+
"model.diffusion_model.joint_blocks.1.x_block.adaLN_modulation.1.bias": "blocks.1.norm1_a.linear.bias",
|
324 |
+
"model.diffusion_model.joint_blocks.1.x_block.adaLN_modulation.1.weight": "blocks.1.norm1_a.linear.weight",
|
325 |
+
"model.diffusion_model.joint_blocks.1.x_block.attn.proj.bias": "blocks.1.attn.a_to_out.bias",
|
326 |
+
"model.diffusion_model.joint_blocks.1.x_block.attn.proj.weight": "blocks.1.attn.a_to_out.weight",
|
327 |
+
"model.diffusion_model.joint_blocks.1.x_block.attn.qkv.bias": ['blocks.1.attn.a_to_q.bias', 'blocks.1.attn.a_to_k.bias', 'blocks.1.attn.a_to_v.bias'],
|
328 |
+
"model.diffusion_model.joint_blocks.1.x_block.attn.qkv.weight": ['blocks.1.attn.a_to_q.weight', 'blocks.1.attn.a_to_k.weight', 'blocks.1.attn.a_to_v.weight'],
|
329 |
+
"model.diffusion_model.joint_blocks.1.x_block.mlp.fc1.bias": "blocks.1.ff_a.0.bias",
|
330 |
+
"model.diffusion_model.joint_blocks.1.x_block.mlp.fc1.weight": "blocks.1.ff_a.0.weight",
|
331 |
+
"model.diffusion_model.joint_blocks.1.x_block.mlp.fc2.bias": "blocks.1.ff_a.2.bias",
|
332 |
+
"model.diffusion_model.joint_blocks.1.x_block.mlp.fc2.weight": "blocks.1.ff_a.2.weight",
|
333 |
+
"model.diffusion_model.joint_blocks.10.context_block.adaLN_modulation.1.bias": "blocks.10.norm1_b.linear.bias",
|
334 |
+
"model.diffusion_model.joint_blocks.10.context_block.adaLN_modulation.1.weight": "blocks.10.norm1_b.linear.weight",
|
335 |
+
"model.diffusion_model.joint_blocks.10.context_block.attn.proj.bias": "blocks.10.attn.b_to_out.bias",
|
336 |
+
"model.diffusion_model.joint_blocks.10.context_block.attn.proj.weight": "blocks.10.attn.b_to_out.weight",
|
337 |
+
"model.diffusion_model.joint_blocks.10.context_block.attn.qkv.bias": ['blocks.10.attn.b_to_q.bias', 'blocks.10.attn.b_to_k.bias', 'blocks.10.attn.b_to_v.bias'],
|
338 |
+
"model.diffusion_model.joint_blocks.10.context_block.attn.qkv.weight": ['blocks.10.attn.b_to_q.weight', 'blocks.10.attn.b_to_k.weight', 'blocks.10.attn.b_to_v.weight'],
|
339 |
+
"model.diffusion_model.joint_blocks.10.context_block.mlp.fc1.bias": "blocks.10.ff_b.0.bias",
|
340 |
+
"model.diffusion_model.joint_blocks.10.context_block.mlp.fc1.weight": "blocks.10.ff_b.0.weight",
|
341 |
+
"model.diffusion_model.joint_blocks.10.context_block.mlp.fc2.bias": "blocks.10.ff_b.2.bias",
|
342 |
+
"model.diffusion_model.joint_blocks.10.context_block.mlp.fc2.weight": "blocks.10.ff_b.2.weight",
|
343 |
+
"model.diffusion_model.joint_blocks.10.x_block.adaLN_modulation.1.bias": "blocks.10.norm1_a.linear.bias",
|
344 |
+
"model.diffusion_model.joint_blocks.10.x_block.adaLN_modulation.1.weight": "blocks.10.norm1_a.linear.weight",
|
345 |
+
"model.diffusion_model.joint_blocks.10.x_block.attn.proj.bias": "blocks.10.attn.a_to_out.bias",
|
346 |
+
"model.diffusion_model.joint_blocks.10.x_block.attn.proj.weight": "blocks.10.attn.a_to_out.weight",
|
347 |
+
"model.diffusion_model.joint_blocks.10.x_block.attn.qkv.bias": ['blocks.10.attn.a_to_q.bias', 'blocks.10.attn.a_to_k.bias', 'blocks.10.attn.a_to_v.bias'],
|
348 |
+
"model.diffusion_model.joint_blocks.10.x_block.attn.qkv.weight": ['blocks.10.attn.a_to_q.weight', 'blocks.10.attn.a_to_k.weight', 'blocks.10.attn.a_to_v.weight'],
|
349 |
+
"model.diffusion_model.joint_blocks.10.x_block.mlp.fc1.bias": "blocks.10.ff_a.0.bias",
|
350 |
+
"model.diffusion_model.joint_blocks.10.x_block.mlp.fc1.weight": "blocks.10.ff_a.0.weight",
|
351 |
+
"model.diffusion_model.joint_blocks.10.x_block.mlp.fc2.bias": "blocks.10.ff_a.2.bias",
|
352 |
+
"model.diffusion_model.joint_blocks.10.x_block.mlp.fc2.weight": "blocks.10.ff_a.2.weight",
|
353 |
+
"model.diffusion_model.joint_blocks.11.context_block.adaLN_modulation.1.bias": "blocks.11.norm1_b.linear.bias",
|
354 |
+
"model.diffusion_model.joint_blocks.11.context_block.adaLN_modulation.1.weight": "blocks.11.norm1_b.linear.weight",
|
355 |
+
"model.diffusion_model.joint_blocks.11.context_block.attn.proj.bias": "blocks.11.attn.b_to_out.bias",
|
356 |
+
"model.diffusion_model.joint_blocks.11.context_block.attn.proj.weight": "blocks.11.attn.b_to_out.weight",
|
357 |
+
"model.diffusion_model.joint_blocks.11.context_block.attn.qkv.bias": ['blocks.11.attn.b_to_q.bias', 'blocks.11.attn.b_to_k.bias', 'blocks.11.attn.b_to_v.bias'],
|
358 |
+
"model.diffusion_model.joint_blocks.11.context_block.attn.qkv.weight": ['blocks.11.attn.b_to_q.weight', 'blocks.11.attn.b_to_k.weight', 'blocks.11.attn.b_to_v.weight'],
|
359 |
+
"model.diffusion_model.joint_blocks.11.context_block.mlp.fc1.bias": "blocks.11.ff_b.0.bias",
|
360 |
+
"model.diffusion_model.joint_blocks.11.context_block.mlp.fc1.weight": "blocks.11.ff_b.0.weight",
|
361 |
+
"model.diffusion_model.joint_blocks.11.context_block.mlp.fc2.bias": "blocks.11.ff_b.2.bias",
|
362 |
+
"model.diffusion_model.joint_blocks.11.context_block.mlp.fc2.weight": "blocks.11.ff_b.2.weight",
|
363 |
+
"model.diffusion_model.joint_blocks.11.x_block.adaLN_modulation.1.bias": "blocks.11.norm1_a.linear.bias",
|
364 |
+
"model.diffusion_model.joint_blocks.11.x_block.adaLN_modulation.1.weight": "blocks.11.norm1_a.linear.weight",
|
365 |
+
"model.diffusion_model.joint_blocks.11.x_block.attn.proj.bias": "blocks.11.attn.a_to_out.bias",
|
366 |
+
"model.diffusion_model.joint_blocks.11.x_block.attn.proj.weight": "blocks.11.attn.a_to_out.weight",
|
367 |
+
"model.diffusion_model.joint_blocks.11.x_block.attn.qkv.bias": ['blocks.11.attn.a_to_q.bias', 'blocks.11.attn.a_to_k.bias', 'blocks.11.attn.a_to_v.bias'],
|
368 |
+
"model.diffusion_model.joint_blocks.11.x_block.attn.qkv.weight": ['blocks.11.attn.a_to_q.weight', 'blocks.11.attn.a_to_k.weight', 'blocks.11.attn.a_to_v.weight'],
|
369 |
+
"model.diffusion_model.joint_blocks.11.x_block.mlp.fc1.bias": "blocks.11.ff_a.0.bias",
|
370 |
+
"model.diffusion_model.joint_blocks.11.x_block.mlp.fc1.weight": "blocks.11.ff_a.0.weight",
|
371 |
+
"model.diffusion_model.joint_blocks.11.x_block.mlp.fc2.bias": "blocks.11.ff_a.2.bias",
|
372 |
+
"model.diffusion_model.joint_blocks.11.x_block.mlp.fc2.weight": "blocks.11.ff_a.2.weight",
|
373 |
+
"model.diffusion_model.joint_blocks.12.context_block.adaLN_modulation.1.bias": "blocks.12.norm1_b.linear.bias",
|
374 |
+
"model.diffusion_model.joint_blocks.12.context_block.adaLN_modulation.1.weight": "blocks.12.norm1_b.linear.weight",
|
375 |
+
"model.diffusion_model.joint_blocks.12.context_block.attn.proj.bias": "blocks.12.attn.b_to_out.bias",
|
376 |
+
"model.diffusion_model.joint_blocks.12.context_block.attn.proj.weight": "blocks.12.attn.b_to_out.weight",
|
377 |
+
"model.diffusion_model.joint_blocks.12.context_block.attn.qkv.bias": ['blocks.12.attn.b_to_q.bias', 'blocks.12.attn.b_to_k.bias', 'blocks.12.attn.b_to_v.bias'],
|
378 |
+
"model.diffusion_model.joint_blocks.12.context_block.attn.qkv.weight": ['blocks.12.attn.b_to_q.weight', 'blocks.12.attn.b_to_k.weight', 'blocks.12.attn.b_to_v.weight'],
|
379 |
+
"model.diffusion_model.joint_blocks.12.context_block.mlp.fc1.bias": "blocks.12.ff_b.0.bias",
|
380 |
+
"model.diffusion_model.joint_blocks.12.context_block.mlp.fc1.weight": "blocks.12.ff_b.0.weight",
|
381 |
+
"model.diffusion_model.joint_blocks.12.context_block.mlp.fc2.bias": "blocks.12.ff_b.2.bias",
|
382 |
+
"model.diffusion_model.joint_blocks.12.context_block.mlp.fc2.weight": "blocks.12.ff_b.2.weight",
|
383 |
+
"model.diffusion_model.joint_blocks.12.x_block.adaLN_modulation.1.bias": "blocks.12.norm1_a.linear.bias",
|
384 |
+
"model.diffusion_model.joint_blocks.12.x_block.adaLN_modulation.1.weight": "blocks.12.norm1_a.linear.weight",
|
385 |
+
"model.diffusion_model.joint_blocks.12.x_block.attn.proj.bias": "blocks.12.attn.a_to_out.bias",
|
386 |
+
"model.diffusion_model.joint_blocks.12.x_block.attn.proj.weight": "blocks.12.attn.a_to_out.weight",
|
387 |
+
"model.diffusion_model.joint_blocks.12.x_block.attn.qkv.bias": ['blocks.12.attn.a_to_q.bias', 'blocks.12.attn.a_to_k.bias', 'blocks.12.attn.a_to_v.bias'],
|
388 |
+
"model.diffusion_model.joint_blocks.12.x_block.attn.qkv.weight": ['blocks.12.attn.a_to_q.weight', 'blocks.12.attn.a_to_k.weight', 'blocks.12.attn.a_to_v.weight'],
|
389 |
+
"model.diffusion_model.joint_blocks.12.x_block.mlp.fc1.bias": "blocks.12.ff_a.0.bias",
|
390 |
+
"model.diffusion_model.joint_blocks.12.x_block.mlp.fc1.weight": "blocks.12.ff_a.0.weight",
|
391 |
+
"model.diffusion_model.joint_blocks.12.x_block.mlp.fc2.bias": "blocks.12.ff_a.2.bias",
|
392 |
+
"model.diffusion_model.joint_blocks.12.x_block.mlp.fc2.weight": "blocks.12.ff_a.2.weight",
|
393 |
+
"model.diffusion_model.joint_blocks.13.context_block.adaLN_modulation.1.bias": "blocks.13.norm1_b.linear.bias",
|
394 |
+
"model.diffusion_model.joint_blocks.13.context_block.adaLN_modulation.1.weight": "blocks.13.norm1_b.linear.weight",
|
395 |
+
"model.diffusion_model.joint_blocks.13.context_block.attn.proj.bias": "blocks.13.attn.b_to_out.bias",
|
396 |
+
"model.diffusion_model.joint_blocks.13.context_block.attn.proj.weight": "blocks.13.attn.b_to_out.weight",
|
397 |
+
"model.diffusion_model.joint_blocks.13.context_block.attn.qkv.bias": ['blocks.13.attn.b_to_q.bias', 'blocks.13.attn.b_to_k.bias', 'blocks.13.attn.b_to_v.bias'],
|
398 |
+
"model.diffusion_model.joint_blocks.13.context_block.attn.qkv.weight": ['blocks.13.attn.b_to_q.weight', 'blocks.13.attn.b_to_k.weight', 'blocks.13.attn.b_to_v.weight'],
|
399 |
+
"model.diffusion_model.joint_blocks.13.context_block.mlp.fc1.bias": "blocks.13.ff_b.0.bias",
|
400 |
+
"model.diffusion_model.joint_blocks.13.context_block.mlp.fc1.weight": "blocks.13.ff_b.0.weight",
|
401 |
+
"model.diffusion_model.joint_blocks.13.context_block.mlp.fc2.bias": "blocks.13.ff_b.2.bias",
|
402 |
+
"model.diffusion_model.joint_blocks.13.context_block.mlp.fc2.weight": "blocks.13.ff_b.2.weight",
|
403 |
+
"model.diffusion_model.joint_blocks.13.x_block.adaLN_modulation.1.bias": "blocks.13.norm1_a.linear.bias",
|
404 |
+
"model.diffusion_model.joint_blocks.13.x_block.adaLN_modulation.1.weight": "blocks.13.norm1_a.linear.weight",
|
405 |
+
"model.diffusion_model.joint_blocks.13.x_block.attn.proj.bias": "blocks.13.attn.a_to_out.bias",
|
406 |
+
"model.diffusion_model.joint_blocks.13.x_block.attn.proj.weight": "blocks.13.attn.a_to_out.weight",
|
407 |
+
"model.diffusion_model.joint_blocks.13.x_block.attn.qkv.bias": ['blocks.13.attn.a_to_q.bias', 'blocks.13.attn.a_to_k.bias', 'blocks.13.attn.a_to_v.bias'],
|
408 |
+
"model.diffusion_model.joint_blocks.13.x_block.attn.qkv.weight": ['blocks.13.attn.a_to_q.weight', 'blocks.13.attn.a_to_k.weight', 'blocks.13.attn.a_to_v.weight'],
|
409 |
+
"model.diffusion_model.joint_blocks.13.x_block.mlp.fc1.bias": "blocks.13.ff_a.0.bias",
|
410 |
+
"model.diffusion_model.joint_blocks.13.x_block.mlp.fc1.weight": "blocks.13.ff_a.0.weight",
|
411 |
+
"model.diffusion_model.joint_blocks.13.x_block.mlp.fc2.bias": "blocks.13.ff_a.2.bias",
|
412 |
+
"model.diffusion_model.joint_blocks.13.x_block.mlp.fc2.weight": "blocks.13.ff_a.2.weight",
|
413 |
+
"model.diffusion_model.joint_blocks.14.context_block.adaLN_modulation.1.bias": "blocks.14.norm1_b.linear.bias",
|
414 |
+
"model.diffusion_model.joint_blocks.14.context_block.adaLN_modulation.1.weight": "blocks.14.norm1_b.linear.weight",
|
415 |
+
"model.diffusion_model.joint_blocks.14.context_block.attn.proj.bias": "blocks.14.attn.b_to_out.bias",
|
416 |
+
"model.diffusion_model.joint_blocks.14.context_block.attn.proj.weight": "blocks.14.attn.b_to_out.weight",
|
417 |
+
"model.diffusion_model.joint_blocks.14.context_block.attn.qkv.bias": ['blocks.14.attn.b_to_q.bias', 'blocks.14.attn.b_to_k.bias', 'blocks.14.attn.b_to_v.bias'],
|
418 |
+
"model.diffusion_model.joint_blocks.14.context_block.attn.qkv.weight": ['blocks.14.attn.b_to_q.weight', 'blocks.14.attn.b_to_k.weight', 'blocks.14.attn.b_to_v.weight'],
|
419 |
+
"model.diffusion_model.joint_blocks.14.context_block.mlp.fc1.bias": "blocks.14.ff_b.0.bias",
|
420 |
+
"model.diffusion_model.joint_blocks.14.context_block.mlp.fc1.weight": "blocks.14.ff_b.0.weight",
|
421 |
+
"model.diffusion_model.joint_blocks.14.context_block.mlp.fc2.bias": "blocks.14.ff_b.2.bias",
|
422 |
+
"model.diffusion_model.joint_blocks.14.context_block.mlp.fc2.weight": "blocks.14.ff_b.2.weight",
|
423 |
+
"model.diffusion_model.joint_blocks.14.x_block.adaLN_modulation.1.bias": "blocks.14.norm1_a.linear.bias",
|
424 |
+
"model.diffusion_model.joint_blocks.14.x_block.adaLN_modulation.1.weight": "blocks.14.norm1_a.linear.weight",
|
425 |
+
"model.diffusion_model.joint_blocks.14.x_block.attn.proj.bias": "blocks.14.attn.a_to_out.bias",
|
426 |
+
"model.diffusion_model.joint_blocks.14.x_block.attn.proj.weight": "blocks.14.attn.a_to_out.weight",
|
427 |
+
"model.diffusion_model.joint_blocks.14.x_block.attn.qkv.bias": ['blocks.14.attn.a_to_q.bias', 'blocks.14.attn.a_to_k.bias', 'blocks.14.attn.a_to_v.bias'],
|
428 |
+
"model.diffusion_model.joint_blocks.14.x_block.attn.qkv.weight": ['blocks.14.attn.a_to_q.weight', 'blocks.14.attn.a_to_k.weight', 'blocks.14.attn.a_to_v.weight'],
|
429 |
+
"model.diffusion_model.joint_blocks.14.x_block.mlp.fc1.bias": "blocks.14.ff_a.0.bias",
|
430 |
+
"model.diffusion_model.joint_blocks.14.x_block.mlp.fc1.weight": "blocks.14.ff_a.0.weight",
|
431 |
+
"model.diffusion_model.joint_blocks.14.x_block.mlp.fc2.bias": "blocks.14.ff_a.2.bias",
|
432 |
+
"model.diffusion_model.joint_blocks.14.x_block.mlp.fc2.weight": "blocks.14.ff_a.2.weight",
|
433 |
+
"model.diffusion_model.joint_blocks.15.context_block.adaLN_modulation.1.bias": "blocks.15.norm1_b.linear.bias",
|
434 |
+
"model.diffusion_model.joint_blocks.15.context_block.adaLN_modulation.1.weight": "blocks.15.norm1_b.linear.weight",
|
435 |
+
"model.diffusion_model.joint_blocks.15.context_block.attn.proj.bias": "blocks.15.attn.b_to_out.bias",
|
436 |
+
"model.diffusion_model.joint_blocks.15.context_block.attn.proj.weight": "blocks.15.attn.b_to_out.weight",
|
437 |
+
"model.diffusion_model.joint_blocks.15.context_block.attn.qkv.bias": ['blocks.15.attn.b_to_q.bias', 'blocks.15.attn.b_to_k.bias', 'blocks.15.attn.b_to_v.bias'],
|
438 |
+
"model.diffusion_model.joint_blocks.15.context_block.attn.qkv.weight": ['blocks.15.attn.b_to_q.weight', 'blocks.15.attn.b_to_k.weight', 'blocks.15.attn.b_to_v.weight'],
|
439 |
+
"model.diffusion_model.joint_blocks.15.context_block.mlp.fc1.bias": "blocks.15.ff_b.0.bias",
|
440 |
+
"model.diffusion_model.joint_blocks.15.context_block.mlp.fc1.weight": "blocks.15.ff_b.0.weight",
|
441 |
+
"model.diffusion_model.joint_blocks.15.context_block.mlp.fc2.bias": "blocks.15.ff_b.2.bias",
|
442 |
+
"model.diffusion_model.joint_blocks.15.context_block.mlp.fc2.weight": "blocks.15.ff_b.2.weight",
|
443 |
+
"model.diffusion_model.joint_blocks.15.x_block.adaLN_modulation.1.bias": "blocks.15.norm1_a.linear.bias",
|
444 |
+
"model.diffusion_model.joint_blocks.15.x_block.adaLN_modulation.1.weight": "blocks.15.norm1_a.linear.weight",
|
445 |
+
"model.diffusion_model.joint_blocks.15.x_block.attn.proj.bias": "blocks.15.attn.a_to_out.bias",
|
446 |
+
"model.diffusion_model.joint_blocks.15.x_block.attn.proj.weight": "blocks.15.attn.a_to_out.weight",
|
447 |
+
"model.diffusion_model.joint_blocks.15.x_block.attn.qkv.bias": ['blocks.15.attn.a_to_q.bias', 'blocks.15.attn.a_to_k.bias', 'blocks.15.attn.a_to_v.bias'],
|
448 |
+
"model.diffusion_model.joint_blocks.15.x_block.attn.qkv.weight": ['blocks.15.attn.a_to_q.weight', 'blocks.15.attn.a_to_k.weight', 'blocks.15.attn.a_to_v.weight'],
|
449 |
+
"model.diffusion_model.joint_blocks.15.x_block.mlp.fc1.bias": "blocks.15.ff_a.0.bias",
|
450 |
+
"model.diffusion_model.joint_blocks.15.x_block.mlp.fc1.weight": "blocks.15.ff_a.0.weight",
|
451 |
+
"model.diffusion_model.joint_blocks.15.x_block.mlp.fc2.bias": "blocks.15.ff_a.2.bias",
|
452 |
+
"model.diffusion_model.joint_blocks.15.x_block.mlp.fc2.weight": "blocks.15.ff_a.2.weight",
|
453 |
+
"model.diffusion_model.joint_blocks.16.context_block.adaLN_modulation.1.bias": "blocks.16.norm1_b.linear.bias",
|
454 |
+
"model.diffusion_model.joint_blocks.16.context_block.adaLN_modulation.1.weight": "blocks.16.norm1_b.linear.weight",
|
455 |
+
"model.diffusion_model.joint_blocks.16.context_block.attn.proj.bias": "blocks.16.attn.b_to_out.bias",
|
456 |
+
"model.diffusion_model.joint_blocks.16.context_block.attn.proj.weight": "blocks.16.attn.b_to_out.weight",
|
457 |
+
"model.diffusion_model.joint_blocks.16.context_block.attn.qkv.bias": ['blocks.16.attn.b_to_q.bias', 'blocks.16.attn.b_to_k.bias', 'blocks.16.attn.b_to_v.bias'],
|
458 |
+
"model.diffusion_model.joint_blocks.16.context_block.attn.qkv.weight": ['blocks.16.attn.b_to_q.weight', 'blocks.16.attn.b_to_k.weight', 'blocks.16.attn.b_to_v.weight'],
|
459 |
+
"model.diffusion_model.joint_blocks.16.context_block.mlp.fc1.bias": "blocks.16.ff_b.0.bias",
|
460 |
+
"model.diffusion_model.joint_blocks.16.context_block.mlp.fc1.weight": "blocks.16.ff_b.0.weight",
|
461 |
+
"model.diffusion_model.joint_blocks.16.context_block.mlp.fc2.bias": "blocks.16.ff_b.2.bias",
|
462 |
+
"model.diffusion_model.joint_blocks.16.context_block.mlp.fc2.weight": "blocks.16.ff_b.2.weight",
|
463 |
+
"model.diffusion_model.joint_blocks.16.x_block.adaLN_modulation.1.bias": "blocks.16.norm1_a.linear.bias",
|
464 |
+
"model.diffusion_model.joint_blocks.16.x_block.adaLN_modulation.1.weight": "blocks.16.norm1_a.linear.weight",
|
465 |
+
"model.diffusion_model.joint_blocks.16.x_block.attn.proj.bias": "blocks.16.attn.a_to_out.bias",
|
466 |
+
"model.diffusion_model.joint_blocks.16.x_block.attn.proj.weight": "blocks.16.attn.a_to_out.weight",
|
467 |
+
"model.diffusion_model.joint_blocks.16.x_block.attn.qkv.bias": ['blocks.16.attn.a_to_q.bias', 'blocks.16.attn.a_to_k.bias', 'blocks.16.attn.a_to_v.bias'],
|
468 |
+
"model.diffusion_model.joint_blocks.16.x_block.attn.qkv.weight": ['blocks.16.attn.a_to_q.weight', 'blocks.16.attn.a_to_k.weight', 'blocks.16.attn.a_to_v.weight'],
|
469 |
+
"model.diffusion_model.joint_blocks.16.x_block.mlp.fc1.bias": "blocks.16.ff_a.0.bias",
|
470 |
+
"model.diffusion_model.joint_blocks.16.x_block.mlp.fc1.weight": "blocks.16.ff_a.0.weight",
|
471 |
+
"model.diffusion_model.joint_blocks.16.x_block.mlp.fc2.bias": "blocks.16.ff_a.2.bias",
|
472 |
+
"model.diffusion_model.joint_blocks.16.x_block.mlp.fc2.weight": "blocks.16.ff_a.2.weight",
|
473 |
+
"model.diffusion_model.joint_blocks.17.context_block.adaLN_modulation.1.bias": "blocks.17.norm1_b.linear.bias",
|
474 |
+
"model.diffusion_model.joint_blocks.17.context_block.adaLN_modulation.1.weight": "blocks.17.norm1_b.linear.weight",
|
475 |
+
"model.diffusion_model.joint_blocks.17.context_block.attn.proj.bias": "blocks.17.attn.b_to_out.bias",
|
476 |
+
"model.diffusion_model.joint_blocks.17.context_block.attn.proj.weight": "blocks.17.attn.b_to_out.weight",
|
477 |
+
"model.diffusion_model.joint_blocks.17.context_block.attn.qkv.bias": ['blocks.17.attn.b_to_q.bias', 'blocks.17.attn.b_to_k.bias', 'blocks.17.attn.b_to_v.bias'],
|
478 |
+
"model.diffusion_model.joint_blocks.17.context_block.attn.qkv.weight": ['blocks.17.attn.b_to_q.weight', 'blocks.17.attn.b_to_k.weight', 'blocks.17.attn.b_to_v.weight'],
|
479 |
+
"model.diffusion_model.joint_blocks.17.context_block.mlp.fc1.bias": "blocks.17.ff_b.0.bias",
|
480 |
+
"model.diffusion_model.joint_blocks.17.context_block.mlp.fc1.weight": "blocks.17.ff_b.0.weight",
|
481 |
+
"model.diffusion_model.joint_blocks.17.context_block.mlp.fc2.bias": "blocks.17.ff_b.2.bias",
|
482 |
+
"model.diffusion_model.joint_blocks.17.context_block.mlp.fc2.weight": "blocks.17.ff_b.2.weight",
|
483 |
+
"model.diffusion_model.joint_blocks.17.x_block.adaLN_modulation.1.bias": "blocks.17.norm1_a.linear.bias",
|
484 |
+
"model.diffusion_model.joint_blocks.17.x_block.adaLN_modulation.1.weight": "blocks.17.norm1_a.linear.weight",
|
485 |
+
"model.diffusion_model.joint_blocks.17.x_block.attn.proj.bias": "blocks.17.attn.a_to_out.bias",
|
486 |
+
"model.diffusion_model.joint_blocks.17.x_block.attn.proj.weight": "blocks.17.attn.a_to_out.weight",
|
487 |
+
"model.diffusion_model.joint_blocks.17.x_block.attn.qkv.bias": ['blocks.17.attn.a_to_q.bias', 'blocks.17.attn.a_to_k.bias', 'blocks.17.attn.a_to_v.bias'],
|
488 |
+
"model.diffusion_model.joint_blocks.17.x_block.attn.qkv.weight": ['blocks.17.attn.a_to_q.weight', 'blocks.17.attn.a_to_k.weight', 'blocks.17.attn.a_to_v.weight'],
|
489 |
+
"model.diffusion_model.joint_blocks.17.x_block.mlp.fc1.bias": "blocks.17.ff_a.0.bias",
|
490 |
+
"model.diffusion_model.joint_blocks.17.x_block.mlp.fc1.weight": "blocks.17.ff_a.0.weight",
|
491 |
+
"model.diffusion_model.joint_blocks.17.x_block.mlp.fc2.bias": "blocks.17.ff_a.2.bias",
|
492 |
+
"model.diffusion_model.joint_blocks.17.x_block.mlp.fc2.weight": "blocks.17.ff_a.2.weight",
|
493 |
+
"model.diffusion_model.joint_blocks.18.context_block.adaLN_modulation.1.bias": "blocks.18.norm1_b.linear.bias",
|
494 |
+
"model.diffusion_model.joint_blocks.18.context_block.adaLN_modulation.1.weight": "blocks.18.norm1_b.linear.weight",
|
495 |
+
"model.diffusion_model.joint_blocks.18.context_block.attn.proj.bias": "blocks.18.attn.b_to_out.bias",
|
496 |
+
"model.diffusion_model.joint_blocks.18.context_block.attn.proj.weight": "blocks.18.attn.b_to_out.weight",
|
497 |
+
"model.diffusion_model.joint_blocks.18.context_block.attn.qkv.bias": ['blocks.18.attn.b_to_q.bias', 'blocks.18.attn.b_to_k.bias', 'blocks.18.attn.b_to_v.bias'],
|
498 |
+
"model.diffusion_model.joint_blocks.18.context_block.attn.qkv.weight": ['blocks.18.attn.b_to_q.weight', 'blocks.18.attn.b_to_k.weight', 'blocks.18.attn.b_to_v.weight'],
|
499 |
+
"model.diffusion_model.joint_blocks.18.context_block.mlp.fc1.bias": "blocks.18.ff_b.0.bias",
|
500 |
+
"model.diffusion_model.joint_blocks.18.context_block.mlp.fc1.weight": "blocks.18.ff_b.0.weight",
|
501 |
+
"model.diffusion_model.joint_blocks.18.context_block.mlp.fc2.bias": "blocks.18.ff_b.2.bias",
|
502 |
+
"model.diffusion_model.joint_blocks.18.context_block.mlp.fc2.weight": "blocks.18.ff_b.2.weight",
|
503 |
+
"model.diffusion_model.joint_blocks.18.x_block.adaLN_modulation.1.bias": "blocks.18.norm1_a.linear.bias",
|
504 |
+
"model.diffusion_model.joint_blocks.18.x_block.adaLN_modulation.1.weight": "blocks.18.norm1_a.linear.weight",
|
505 |
+
"model.diffusion_model.joint_blocks.18.x_block.attn.proj.bias": "blocks.18.attn.a_to_out.bias",
|
506 |
+
"model.diffusion_model.joint_blocks.18.x_block.attn.proj.weight": "blocks.18.attn.a_to_out.weight",
|
507 |
+
"model.diffusion_model.joint_blocks.18.x_block.attn.qkv.bias": ['blocks.18.attn.a_to_q.bias', 'blocks.18.attn.a_to_k.bias', 'blocks.18.attn.a_to_v.bias'],
|
508 |
+
"model.diffusion_model.joint_blocks.18.x_block.attn.qkv.weight": ['blocks.18.attn.a_to_q.weight', 'blocks.18.attn.a_to_k.weight', 'blocks.18.attn.a_to_v.weight'],
|
509 |
+
"model.diffusion_model.joint_blocks.18.x_block.mlp.fc1.bias": "blocks.18.ff_a.0.bias",
|
510 |
+
"model.diffusion_model.joint_blocks.18.x_block.mlp.fc1.weight": "blocks.18.ff_a.0.weight",
|
511 |
+
"model.diffusion_model.joint_blocks.18.x_block.mlp.fc2.bias": "blocks.18.ff_a.2.bias",
|
512 |
+
"model.diffusion_model.joint_blocks.18.x_block.mlp.fc2.weight": "blocks.18.ff_a.2.weight",
|
513 |
+
"model.diffusion_model.joint_blocks.19.context_block.adaLN_modulation.1.bias": "blocks.19.norm1_b.linear.bias",
|
514 |
+
"model.diffusion_model.joint_blocks.19.context_block.adaLN_modulation.1.weight": "blocks.19.norm1_b.linear.weight",
|
515 |
+
"model.diffusion_model.joint_blocks.19.context_block.attn.proj.bias": "blocks.19.attn.b_to_out.bias",
|
516 |
+
"model.diffusion_model.joint_blocks.19.context_block.attn.proj.weight": "blocks.19.attn.b_to_out.weight",
|
517 |
+
"model.diffusion_model.joint_blocks.19.context_block.attn.qkv.bias": ['blocks.19.attn.b_to_q.bias', 'blocks.19.attn.b_to_k.bias', 'blocks.19.attn.b_to_v.bias'],
|
518 |
+
"model.diffusion_model.joint_blocks.19.context_block.attn.qkv.weight": ['blocks.19.attn.b_to_q.weight', 'blocks.19.attn.b_to_k.weight', 'blocks.19.attn.b_to_v.weight'],
|
519 |
+
"model.diffusion_model.joint_blocks.19.context_block.mlp.fc1.bias": "blocks.19.ff_b.0.bias",
|
520 |
+
"model.diffusion_model.joint_blocks.19.context_block.mlp.fc1.weight": "blocks.19.ff_b.0.weight",
|
521 |
+
"model.diffusion_model.joint_blocks.19.context_block.mlp.fc2.bias": "blocks.19.ff_b.2.bias",
|
522 |
+
"model.diffusion_model.joint_blocks.19.context_block.mlp.fc2.weight": "blocks.19.ff_b.2.weight",
|
523 |
+
"model.diffusion_model.joint_blocks.19.x_block.adaLN_modulation.1.bias": "blocks.19.norm1_a.linear.bias",
|
524 |
+
"model.diffusion_model.joint_blocks.19.x_block.adaLN_modulation.1.weight": "blocks.19.norm1_a.linear.weight",
|
525 |
+
"model.diffusion_model.joint_blocks.19.x_block.attn.proj.bias": "blocks.19.attn.a_to_out.bias",
|
526 |
+
"model.diffusion_model.joint_blocks.19.x_block.attn.proj.weight": "blocks.19.attn.a_to_out.weight",
|
527 |
+
"model.diffusion_model.joint_blocks.19.x_block.attn.qkv.bias": ['blocks.19.attn.a_to_q.bias', 'blocks.19.attn.a_to_k.bias', 'blocks.19.attn.a_to_v.bias'],
|
528 |
+
"model.diffusion_model.joint_blocks.19.x_block.attn.qkv.weight": ['blocks.19.attn.a_to_q.weight', 'blocks.19.attn.a_to_k.weight', 'blocks.19.attn.a_to_v.weight'],
|
529 |
+
"model.diffusion_model.joint_blocks.19.x_block.mlp.fc1.bias": "blocks.19.ff_a.0.bias",
|
530 |
+
"model.diffusion_model.joint_blocks.19.x_block.mlp.fc1.weight": "blocks.19.ff_a.0.weight",
|
531 |
+
"model.diffusion_model.joint_blocks.19.x_block.mlp.fc2.bias": "blocks.19.ff_a.2.bias",
|
532 |
+
"model.diffusion_model.joint_blocks.19.x_block.mlp.fc2.weight": "blocks.19.ff_a.2.weight",
|
533 |
+
"model.diffusion_model.joint_blocks.2.context_block.adaLN_modulation.1.bias": "blocks.2.norm1_b.linear.bias",
|
534 |
+
"model.diffusion_model.joint_blocks.2.context_block.adaLN_modulation.1.weight": "blocks.2.norm1_b.linear.weight",
|
535 |
+
"model.diffusion_model.joint_blocks.2.context_block.attn.proj.bias": "blocks.2.attn.b_to_out.bias",
|
536 |
+
"model.diffusion_model.joint_blocks.2.context_block.attn.proj.weight": "blocks.2.attn.b_to_out.weight",
|
537 |
+
"model.diffusion_model.joint_blocks.2.context_block.attn.qkv.bias": ['blocks.2.attn.b_to_q.bias', 'blocks.2.attn.b_to_k.bias', 'blocks.2.attn.b_to_v.bias'],
|
538 |
+
"model.diffusion_model.joint_blocks.2.context_block.attn.qkv.weight": ['blocks.2.attn.b_to_q.weight', 'blocks.2.attn.b_to_k.weight', 'blocks.2.attn.b_to_v.weight'],
|
539 |
+
"model.diffusion_model.joint_blocks.2.context_block.mlp.fc1.bias": "blocks.2.ff_b.0.bias",
|
540 |
+
"model.diffusion_model.joint_blocks.2.context_block.mlp.fc1.weight": "blocks.2.ff_b.0.weight",
|
541 |
+
"model.diffusion_model.joint_blocks.2.context_block.mlp.fc2.bias": "blocks.2.ff_b.2.bias",
|
542 |
+
"model.diffusion_model.joint_blocks.2.context_block.mlp.fc2.weight": "blocks.2.ff_b.2.weight",
|
543 |
+
"model.diffusion_model.joint_blocks.2.x_block.adaLN_modulation.1.bias": "blocks.2.norm1_a.linear.bias",
|
544 |
+
"model.diffusion_model.joint_blocks.2.x_block.adaLN_modulation.1.weight": "blocks.2.norm1_a.linear.weight",
|
545 |
+
"model.diffusion_model.joint_blocks.2.x_block.attn.proj.bias": "blocks.2.attn.a_to_out.bias",
|
546 |
+
"model.diffusion_model.joint_blocks.2.x_block.attn.proj.weight": "blocks.2.attn.a_to_out.weight",
|
547 |
+
"model.diffusion_model.joint_blocks.2.x_block.attn.qkv.bias": ['blocks.2.attn.a_to_q.bias', 'blocks.2.attn.a_to_k.bias', 'blocks.2.attn.a_to_v.bias'],
|
548 |
+
"model.diffusion_model.joint_blocks.2.x_block.attn.qkv.weight": ['blocks.2.attn.a_to_q.weight', 'blocks.2.attn.a_to_k.weight', 'blocks.2.attn.a_to_v.weight'],
|
549 |
+
"model.diffusion_model.joint_blocks.2.x_block.mlp.fc1.bias": "blocks.2.ff_a.0.bias",
|
550 |
+
"model.diffusion_model.joint_blocks.2.x_block.mlp.fc1.weight": "blocks.2.ff_a.0.weight",
|
551 |
+
"model.diffusion_model.joint_blocks.2.x_block.mlp.fc2.bias": "blocks.2.ff_a.2.bias",
|
552 |
+
"model.diffusion_model.joint_blocks.2.x_block.mlp.fc2.weight": "blocks.2.ff_a.2.weight",
|
553 |
+
"model.diffusion_model.joint_blocks.20.context_block.adaLN_modulation.1.bias": "blocks.20.norm1_b.linear.bias",
|
554 |
+
"model.diffusion_model.joint_blocks.20.context_block.adaLN_modulation.1.weight": "blocks.20.norm1_b.linear.weight",
|
555 |
+
"model.diffusion_model.joint_blocks.20.context_block.attn.proj.bias": "blocks.20.attn.b_to_out.bias",
|
556 |
+
"model.diffusion_model.joint_blocks.20.context_block.attn.proj.weight": "blocks.20.attn.b_to_out.weight",
|
557 |
+
"model.diffusion_model.joint_blocks.20.context_block.attn.qkv.bias": ['blocks.20.attn.b_to_q.bias', 'blocks.20.attn.b_to_k.bias', 'blocks.20.attn.b_to_v.bias'],
|
558 |
+
"model.diffusion_model.joint_blocks.20.context_block.attn.qkv.weight": ['blocks.20.attn.b_to_q.weight', 'blocks.20.attn.b_to_k.weight', 'blocks.20.attn.b_to_v.weight'],
|
559 |
+
"model.diffusion_model.joint_blocks.20.context_block.mlp.fc1.bias": "blocks.20.ff_b.0.bias",
|
560 |
+
"model.diffusion_model.joint_blocks.20.context_block.mlp.fc1.weight": "blocks.20.ff_b.0.weight",
|
561 |
+
"model.diffusion_model.joint_blocks.20.context_block.mlp.fc2.bias": "blocks.20.ff_b.2.bias",
|
562 |
+
"model.diffusion_model.joint_blocks.20.context_block.mlp.fc2.weight": "blocks.20.ff_b.2.weight",
|
563 |
+
"model.diffusion_model.joint_blocks.20.x_block.adaLN_modulation.1.bias": "blocks.20.norm1_a.linear.bias",
|
564 |
+
"model.diffusion_model.joint_blocks.20.x_block.adaLN_modulation.1.weight": "blocks.20.norm1_a.linear.weight",
|
565 |
+
"model.diffusion_model.joint_blocks.20.x_block.attn.proj.bias": "blocks.20.attn.a_to_out.bias",
|
566 |
+
"model.diffusion_model.joint_blocks.20.x_block.attn.proj.weight": "blocks.20.attn.a_to_out.weight",
|
567 |
+
"model.diffusion_model.joint_blocks.20.x_block.attn.qkv.bias": ['blocks.20.attn.a_to_q.bias', 'blocks.20.attn.a_to_k.bias', 'blocks.20.attn.a_to_v.bias'],
|
568 |
+
"model.diffusion_model.joint_blocks.20.x_block.attn.qkv.weight": ['blocks.20.attn.a_to_q.weight', 'blocks.20.attn.a_to_k.weight', 'blocks.20.attn.a_to_v.weight'],
|
569 |
+
"model.diffusion_model.joint_blocks.20.x_block.mlp.fc1.bias": "blocks.20.ff_a.0.bias",
|
570 |
+
"model.diffusion_model.joint_blocks.20.x_block.mlp.fc1.weight": "blocks.20.ff_a.0.weight",
|
571 |
+
"model.diffusion_model.joint_blocks.20.x_block.mlp.fc2.bias": "blocks.20.ff_a.2.bias",
|
572 |
+
"model.diffusion_model.joint_blocks.20.x_block.mlp.fc2.weight": "blocks.20.ff_a.2.weight",
|
573 |
+
"model.diffusion_model.joint_blocks.21.context_block.adaLN_modulation.1.bias": "blocks.21.norm1_b.linear.bias",
|
574 |
+
"model.diffusion_model.joint_blocks.21.context_block.adaLN_modulation.1.weight": "blocks.21.norm1_b.linear.weight",
|
575 |
+
"model.diffusion_model.joint_blocks.21.context_block.attn.proj.bias": "blocks.21.attn.b_to_out.bias",
|
576 |
+
"model.diffusion_model.joint_blocks.21.context_block.attn.proj.weight": "blocks.21.attn.b_to_out.weight",
|
577 |
+
"model.diffusion_model.joint_blocks.21.context_block.attn.qkv.bias": ['blocks.21.attn.b_to_q.bias', 'blocks.21.attn.b_to_k.bias', 'blocks.21.attn.b_to_v.bias'],
|
578 |
+
"model.diffusion_model.joint_blocks.21.context_block.attn.qkv.weight": ['blocks.21.attn.b_to_q.weight', 'blocks.21.attn.b_to_k.weight', 'blocks.21.attn.b_to_v.weight'],
|
579 |
+
"model.diffusion_model.joint_blocks.21.context_block.mlp.fc1.bias": "blocks.21.ff_b.0.bias",
|
580 |
+
"model.diffusion_model.joint_blocks.21.context_block.mlp.fc1.weight": "blocks.21.ff_b.0.weight",
|
581 |
+
"model.diffusion_model.joint_blocks.21.context_block.mlp.fc2.bias": "blocks.21.ff_b.2.bias",
|
582 |
+
"model.diffusion_model.joint_blocks.21.context_block.mlp.fc2.weight": "blocks.21.ff_b.2.weight",
|
583 |
+
"model.diffusion_model.joint_blocks.21.x_block.adaLN_modulation.1.bias": "blocks.21.norm1_a.linear.bias",
|
584 |
+
"model.diffusion_model.joint_blocks.21.x_block.adaLN_modulation.1.weight": "blocks.21.norm1_a.linear.weight",
|
585 |
+
"model.diffusion_model.joint_blocks.21.x_block.attn.proj.bias": "blocks.21.attn.a_to_out.bias",
|
586 |
+
"model.diffusion_model.joint_blocks.21.x_block.attn.proj.weight": "blocks.21.attn.a_to_out.weight",
|
587 |
+
"model.diffusion_model.joint_blocks.21.x_block.attn.qkv.bias": ['blocks.21.attn.a_to_q.bias', 'blocks.21.attn.a_to_k.bias', 'blocks.21.attn.a_to_v.bias'],
|
588 |
+
"model.diffusion_model.joint_blocks.21.x_block.attn.qkv.weight": ['blocks.21.attn.a_to_q.weight', 'blocks.21.attn.a_to_k.weight', 'blocks.21.attn.a_to_v.weight'],
|
589 |
+
"model.diffusion_model.joint_blocks.21.x_block.mlp.fc1.bias": "blocks.21.ff_a.0.bias",
|
590 |
+
"model.diffusion_model.joint_blocks.21.x_block.mlp.fc1.weight": "blocks.21.ff_a.0.weight",
|
591 |
+
"model.diffusion_model.joint_blocks.21.x_block.mlp.fc2.bias": "blocks.21.ff_a.2.bias",
|
592 |
+
"model.diffusion_model.joint_blocks.21.x_block.mlp.fc2.weight": "blocks.21.ff_a.2.weight",
|
593 |
+
"model.diffusion_model.joint_blocks.22.context_block.adaLN_modulation.1.bias": "blocks.22.norm1_b.linear.bias",
|
594 |
+
"model.diffusion_model.joint_blocks.22.context_block.adaLN_modulation.1.weight": "blocks.22.norm1_b.linear.weight",
|
595 |
+
"model.diffusion_model.joint_blocks.22.context_block.attn.proj.bias": "blocks.22.attn.b_to_out.bias",
|
596 |
+
"model.diffusion_model.joint_blocks.22.context_block.attn.proj.weight": "blocks.22.attn.b_to_out.weight",
|
597 |
+
"model.diffusion_model.joint_blocks.22.context_block.attn.qkv.bias": ['blocks.22.attn.b_to_q.bias', 'blocks.22.attn.b_to_k.bias', 'blocks.22.attn.b_to_v.bias'],
|
598 |
+
"model.diffusion_model.joint_blocks.22.context_block.attn.qkv.weight": ['blocks.22.attn.b_to_q.weight', 'blocks.22.attn.b_to_k.weight', 'blocks.22.attn.b_to_v.weight'],
|
599 |
+
"model.diffusion_model.joint_blocks.22.context_block.mlp.fc1.bias": "blocks.22.ff_b.0.bias",
|
600 |
+
"model.diffusion_model.joint_blocks.22.context_block.mlp.fc1.weight": "blocks.22.ff_b.0.weight",
|
601 |
+
"model.diffusion_model.joint_blocks.22.context_block.mlp.fc2.bias": "blocks.22.ff_b.2.bias",
|
602 |
+
"model.diffusion_model.joint_blocks.22.context_block.mlp.fc2.weight": "blocks.22.ff_b.2.weight",
|
603 |
+
"model.diffusion_model.joint_blocks.22.x_block.adaLN_modulation.1.bias": "blocks.22.norm1_a.linear.bias",
|
604 |
+
"model.diffusion_model.joint_blocks.22.x_block.adaLN_modulation.1.weight": "blocks.22.norm1_a.linear.weight",
|
605 |
+
"model.diffusion_model.joint_blocks.22.x_block.attn.proj.bias": "blocks.22.attn.a_to_out.bias",
|
606 |
+
"model.diffusion_model.joint_blocks.22.x_block.attn.proj.weight": "blocks.22.attn.a_to_out.weight",
|
607 |
+
"model.diffusion_model.joint_blocks.22.x_block.attn.qkv.bias": ['blocks.22.attn.a_to_q.bias', 'blocks.22.attn.a_to_k.bias', 'blocks.22.attn.a_to_v.bias'],
|
608 |
+
"model.diffusion_model.joint_blocks.22.x_block.attn.qkv.weight": ['blocks.22.attn.a_to_q.weight', 'blocks.22.attn.a_to_k.weight', 'blocks.22.attn.a_to_v.weight'],
|
609 |
+
"model.diffusion_model.joint_blocks.22.x_block.mlp.fc1.bias": "blocks.22.ff_a.0.bias",
|
610 |
+
"model.diffusion_model.joint_blocks.22.x_block.mlp.fc1.weight": "blocks.22.ff_a.0.weight",
|
611 |
+
"model.diffusion_model.joint_blocks.22.x_block.mlp.fc2.bias": "blocks.22.ff_a.2.bias",
|
612 |
+
"model.diffusion_model.joint_blocks.22.x_block.mlp.fc2.weight": "blocks.22.ff_a.2.weight",
|
613 |
+
"model.diffusion_model.joint_blocks.23.context_block.attn.qkv.bias": ['blocks.23.attn.b_to_q.bias', 'blocks.23.attn.b_to_k.bias', 'blocks.23.attn.b_to_v.bias'],
|
614 |
+
"model.diffusion_model.joint_blocks.23.context_block.attn.qkv.weight": ['blocks.23.attn.b_to_q.weight', 'blocks.23.attn.b_to_k.weight', 'blocks.23.attn.b_to_v.weight'],
|
615 |
+
"model.diffusion_model.joint_blocks.23.x_block.adaLN_modulation.1.bias": "blocks.23.norm1_a.linear.bias",
|
616 |
+
"model.diffusion_model.joint_blocks.23.x_block.adaLN_modulation.1.weight": "blocks.23.norm1_a.linear.weight",
|
617 |
+
"model.diffusion_model.joint_blocks.23.x_block.attn.proj.bias": "blocks.23.attn.a_to_out.bias",
|
618 |
+
"model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight": "blocks.23.attn.a_to_out.weight",
|
619 |
+
"model.diffusion_model.joint_blocks.23.x_block.attn.qkv.bias": ['blocks.23.attn.a_to_q.bias', 'blocks.23.attn.a_to_k.bias', 'blocks.23.attn.a_to_v.bias'],
|
620 |
+
"model.diffusion_model.joint_blocks.23.x_block.attn.qkv.weight": ['blocks.23.attn.a_to_q.weight', 'blocks.23.attn.a_to_k.weight', 'blocks.23.attn.a_to_v.weight'],
|
621 |
+
"model.diffusion_model.joint_blocks.23.x_block.mlp.fc1.bias": "blocks.23.ff_a.0.bias",
|
622 |
+
"model.diffusion_model.joint_blocks.23.x_block.mlp.fc1.weight": "blocks.23.ff_a.0.weight",
|
623 |
+
"model.diffusion_model.joint_blocks.23.x_block.mlp.fc2.bias": "blocks.23.ff_a.2.bias",
|
624 |
+
"model.diffusion_model.joint_blocks.23.x_block.mlp.fc2.weight": "blocks.23.ff_a.2.weight",
|
625 |
+
"model.diffusion_model.joint_blocks.3.context_block.adaLN_modulation.1.bias": "blocks.3.norm1_b.linear.bias",
|
626 |
+
"model.diffusion_model.joint_blocks.3.context_block.adaLN_modulation.1.weight": "blocks.3.norm1_b.linear.weight",
|
627 |
+
"model.diffusion_model.joint_blocks.3.context_block.attn.proj.bias": "blocks.3.attn.b_to_out.bias",
|
628 |
+
"model.diffusion_model.joint_blocks.3.context_block.attn.proj.weight": "blocks.3.attn.b_to_out.weight",
|
629 |
+
"model.diffusion_model.joint_blocks.3.context_block.attn.qkv.bias": ['blocks.3.attn.b_to_q.bias', 'blocks.3.attn.b_to_k.bias', 'blocks.3.attn.b_to_v.bias'],
|
630 |
+
"model.diffusion_model.joint_blocks.3.context_block.attn.qkv.weight": ['blocks.3.attn.b_to_q.weight', 'blocks.3.attn.b_to_k.weight', 'blocks.3.attn.b_to_v.weight'],
|
631 |
+
"model.diffusion_model.joint_blocks.3.context_block.mlp.fc1.bias": "blocks.3.ff_b.0.bias",
|
632 |
+
"model.diffusion_model.joint_blocks.3.context_block.mlp.fc1.weight": "blocks.3.ff_b.0.weight",
|
633 |
+
"model.diffusion_model.joint_blocks.3.context_block.mlp.fc2.bias": "blocks.3.ff_b.2.bias",
|
634 |
+
"model.diffusion_model.joint_blocks.3.context_block.mlp.fc2.weight": "blocks.3.ff_b.2.weight",
|
635 |
+
"model.diffusion_model.joint_blocks.3.x_block.adaLN_modulation.1.bias": "blocks.3.norm1_a.linear.bias",
|
636 |
+
"model.diffusion_model.joint_blocks.3.x_block.adaLN_modulation.1.weight": "blocks.3.norm1_a.linear.weight",
|
637 |
+
"model.diffusion_model.joint_blocks.3.x_block.attn.proj.bias": "blocks.3.attn.a_to_out.bias",
|
638 |
+
"model.diffusion_model.joint_blocks.3.x_block.attn.proj.weight": "blocks.3.attn.a_to_out.weight",
|
639 |
+
"model.diffusion_model.joint_blocks.3.x_block.attn.qkv.bias": ['blocks.3.attn.a_to_q.bias', 'blocks.3.attn.a_to_k.bias', 'blocks.3.attn.a_to_v.bias'],
|
640 |
+
"model.diffusion_model.joint_blocks.3.x_block.attn.qkv.weight": ['blocks.3.attn.a_to_q.weight', 'blocks.3.attn.a_to_k.weight', 'blocks.3.attn.a_to_v.weight'],
|
641 |
+
"model.diffusion_model.joint_blocks.3.x_block.mlp.fc1.bias": "blocks.3.ff_a.0.bias",
|
642 |
+
"model.diffusion_model.joint_blocks.3.x_block.mlp.fc1.weight": "blocks.3.ff_a.0.weight",
|
643 |
+
"model.diffusion_model.joint_blocks.3.x_block.mlp.fc2.bias": "blocks.3.ff_a.2.bias",
|
644 |
+
"model.diffusion_model.joint_blocks.3.x_block.mlp.fc2.weight": "blocks.3.ff_a.2.weight",
|
645 |
+
"model.diffusion_model.joint_blocks.4.context_block.adaLN_modulation.1.bias": "blocks.4.norm1_b.linear.bias",
|
646 |
+
"model.diffusion_model.joint_blocks.4.context_block.adaLN_modulation.1.weight": "blocks.4.norm1_b.linear.weight",
|
647 |
+
"model.diffusion_model.joint_blocks.4.context_block.attn.proj.bias": "blocks.4.attn.b_to_out.bias",
|
648 |
+
"model.diffusion_model.joint_blocks.4.context_block.attn.proj.weight": "blocks.4.attn.b_to_out.weight",
|
649 |
+
"model.diffusion_model.joint_blocks.4.context_block.attn.qkv.bias": ['blocks.4.attn.b_to_q.bias', 'blocks.4.attn.b_to_k.bias', 'blocks.4.attn.b_to_v.bias'],
|
650 |
+
"model.diffusion_model.joint_blocks.4.context_block.attn.qkv.weight": ['blocks.4.attn.b_to_q.weight', 'blocks.4.attn.b_to_k.weight', 'blocks.4.attn.b_to_v.weight'],
|
651 |
+
"model.diffusion_model.joint_blocks.4.context_block.mlp.fc1.bias": "blocks.4.ff_b.0.bias",
|
652 |
+
"model.diffusion_model.joint_blocks.4.context_block.mlp.fc1.weight": "blocks.4.ff_b.0.weight",
|
653 |
+
"model.diffusion_model.joint_blocks.4.context_block.mlp.fc2.bias": "blocks.4.ff_b.2.bias",
|
654 |
+
"model.diffusion_model.joint_blocks.4.context_block.mlp.fc2.weight": "blocks.4.ff_b.2.weight",
|
655 |
+
"model.diffusion_model.joint_blocks.4.x_block.adaLN_modulation.1.bias": "blocks.4.norm1_a.linear.bias",
|
656 |
+
"model.diffusion_model.joint_blocks.4.x_block.adaLN_modulation.1.weight": "blocks.4.norm1_a.linear.weight",
|
657 |
+
"model.diffusion_model.joint_blocks.4.x_block.attn.proj.bias": "blocks.4.attn.a_to_out.bias",
|
658 |
+
"model.diffusion_model.joint_blocks.4.x_block.attn.proj.weight": "blocks.4.attn.a_to_out.weight",
|
659 |
+
"model.diffusion_model.joint_blocks.4.x_block.attn.qkv.bias": ['blocks.4.attn.a_to_q.bias', 'blocks.4.attn.a_to_k.bias', 'blocks.4.attn.a_to_v.bias'],
|
660 |
+
"model.diffusion_model.joint_blocks.4.x_block.attn.qkv.weight": ['blocks.4.attn.a_to_q.weight', 'blocks.4.attn.a_to_k.weight', 'blocks.4.attn.a_to_v.weight'],
|
661 |
+
"model.diffusion_model.joint_blocks.4.x_block.mlp.fc1.bias": "blocks.4.ff_a.0.bias",
|
662 |
+
"model.diffusion_model.joint_blocks.4.x_block.mlp.fc1.weight": "blocks.4.ff_a.0.weight",
|
663 |
+
"model.diffusion_model.joint_blocks.4.x_block.mlp.fc2.bias": "blocks.4.ff_a.2.bias",
|
664 |
+
"model.diffusion_model.joint_blocks.4.x_block.mlp.fc2.weight": "blocks.4.ff_a.2.weight",
|
665 |
+
"model.diffusion_model.joint_blocks.5.context_block.adaLN_modulation.1.bias": "blocks.5.norm1_b.linear.bias",
|
666 |
+
"model.diffusion_model.joint_blocks.5.context_block.adaLN_modulation.1.weight": "blocks.5.norm1_b.linear.weight",
|
667 |
+
"model.diffusion_model.joint_blocks.5.context_block.attn.proj.bias": "blocks.5.attn.b_to_out.bias",
|
668 |
+
"model.diffusion_model.joint_blocks.5.context_block.attn.proj.weight": "blocks.5.attn.b_to_out.weight",
|
669 |
+
"model.diffusion_model.joint_blocks.5.context_block.attn.qkv.bias": ['blocks.5.attn.b_to_q.bias', 'blocks.5.attn.b_to_k.bias', 'blocks.5.attn.b_to_v.bias'],
|
670 |
+
"model.diffusion_model.joint_blocks.5.context_block.attn.qkv.weight": ['blocks.5.attn.b_to_q.weight', 'blocks.5.attn.b_to_k.weight', 'blocks.5.attn.b_to_v.weight'],
|
671 |
+
"model.diffusion_model.joint_blocks.5.context_block.mlp.fc1.bias": "blocks.5.ff_b.0.bias",
|
672 |
+
"model.diffusion_model.joint_blocks.5.context_block.mlp.fc1.weight": "blocks.5.ff_b.0.weight",
|
673 |
+
"model.diffusion_model.joint_blocks.5.context_block.mlp.fc2.bias": "blocks.5.ff_b.2.bias",
|
674 |
+
"model.diffusion_model.joint_blocks.5.context_block.mlp.fc2.weight": "blocks.5.ff_b.2.weight",
|
675 |
+
"model.diffusion_model.joint_blocks.5.x_block.adaLN_modulation.1.bias": "blocks.5.norm1_a.linear.bias",
|
676 |
+
"model.diffusion_model.joint_blocks.5.x_block.adaLN_modulation.1.weight": "blocks.5.norm1_a.linear.weight",
|
677 |
+
"model.diffusion_model.joint_blocks.5.x_block.attn.proj.bias": "blocks.5.attn.a_to_out.bias",
|
678 |
+
"model.diffusion_model.joint_blocks.5.x_block.attn.proj.weight": "blocks.5.attn.a_to_out.weight",
|
679 |
+
"model.diffusion_model.joint_blocks.5.x_block.attn.qkv.bias": ['blocks.5.attn.a_to_q.bias', 'blocks.5.attn.a_to_k.bias', 'blocks.5.attn.a_to_v.bias'],
|
680 |
+
"model.diffusion_model.joint_blocks.5.x_block.attn.qkv.weight": ['blocks.5.attn.a_to_q.weight', 'blocks.5.attn.a_to_k.weight', 'blocks.5.attn.a_to_v.weight'],
|
681 |
+
"model.diffusion_model.joint_blocks.5.x_block.mlp.fc1.bias": "blocks.5.ff_a.0.bias",
|
682 |
+
"model.diffusion_model.joint_blocks.5.x_block.mlp.fc1.weight": "blocks.5.ff_a.0.weight",
|
683 |
+
"model.diffusion_model.joint_blocks.5.x_block.mlp.fc2.bias": "blocks.5.ff_a.2.bias",
|
684 |
+
"model.diffusion_model.joint_blocks.5.x_block.mlp.fc2.weight": "blocks.5.ff_a.2.weight",
|
685 |
+
"model.diffusion_model.joint_blocks.6.context_block.adaLN_modulation.1.bias": "blocks.6.norm1_b.linear.bias",
|
686 |
+
"model.diffusion_model.joint_blocks.6.context_block.adaLN_modulation.1.weight": "blocks.6.norm1_b.linear.weight",
|
687 |
+
"model.diffusion_model.joint_blocks.6.context_block.attn.proj.bias": "blocks.6.attn.b_to_out.bias",
|
688 |
+
"model.diffusion_model.joint_blocks.6.context_block.attn.proj.weight": "blocks.6.attn.b_to_out.weight",
|
689 |
+
"model.diffusion_model.joint_blocks.6.context_block.attn.qkv.bias": ['blocks.6.attn.b_to_q.bias', 'blocks.6.attn.b_to_k.bias', 'blocks.6.attn.b_to_v.bias'],
|
690 |
+
"model.diffusion_model.joint_blocks.6.context_block.attn.qkv.weight": ['blocks.6.attn.b_to_q.weight', 'blocks.6.attn.b_to_k.weight', 'blocks.6.attn.b_to_v.weight'],
|
691 |
+
"model.diffusion_model.joint_blocks.6.context_block.mlp.fc1.bias": "blocks.6.ff_b.0.bias",
|
692 |
+
"model.diffusion_model.joint_blocks.6.context_block.mlp.fc1.weight": "blocks.6.ff_b.0.weight",
|
693 |
+
"model.diffusion_model.joint_blocks.6.context_block.mlp.fc2.bias": "blocks.6.ff_b.2.bias",
|
694 |
+
"model.diffusion_model.joint_blocks.6.context_block.mlp.fc2.weight": "blocks.6.ff_b.2.weight",
|
695 |
+
"model.diffusion_model.joint_blocks.6.x_block.adaLN_modulation.1.bias": "blocks.6.norm1_a.linear.bias",
|
696 |
+
"model.diffusion_model.joint_blocks.6.x_block.adaLN_modulation.1.weight": "blocks.6.norm1_a.linear.weight",
|
697 |
+
"model.diffusion_model.joint_blocks.6.x_block.attn.proj.bias": "blocks.6.attn.a_to_out.bias",
|
698 |
+
"model.diffusion_model.joint_blocks.6.x_block.attn.proj.weight": "blocks.6.attn.a_to_out.weight",
|
699 |
+
"model.diffusion_model.joint_blocks.6.x_block.attn.qkv.bias": ['blocks.6.attn.a_to_q.bias', 'blocks.6.attn.a_to_k.bias', 'blocks.6.attn.a_to_v.bias'],
|
700 |
+
"model.diffusion_model.joint_blocks.6.x_block.attn.qkv.weight": ['blocks.6.attn.a_to_q.weight', 'blocks.6.attn.a_to_k.weight', 'blocks.6.attn.a_to_v.weight'],
|
701 |
+
"model.diffusion_model.joint_blocks.6.x_block.mlp.fc1.bias": "blocks.6.ff_a.0.bias",
|
702 |
+
"model.diffusion_model.joint_blocks.6.x_block.mlp.fc1.weight": "blocks.6.ff_a.0.weight",
|
703 |
+
"model.diffusion_model.joint_blocks.6.x_block.mlp.fc2.bias": "blocks.6.ff_a.2.bias",
|
704 |
+
"model.diffusion_model.joint_blocks.6.x_block.mlp.fc2.weight": "blocks.6.ff_a.2.weight",
|
705 |
+
"model.diffusion_model.joint_blocks.7.context_block.adaLN_modulation.1.bias": "blocks.7.norm1_b.linear.bias",
|
706 |
+
"model.diffusion_model.joint_blocks.7.context_block.adaLN_modulation.1.weight": "blocks.7.norm1_b.linear.weight",
|
707 |
+
"model.diffusion_model.joint_blocks.7.context_block.attn.proj.bias": "blocks.7.attn.b_to_out.bias",
|
708 |
+
"model.diffusion_model.joint_blocks.7.context_block.attn.proj.weight": "blocks.7.attn.b_to_out.weight",
|
709 |
+
"model.diffusion_model.joint_blocks.7.context_block.attn.qkv.bias": ['blocks.7.attn.b_to_q.bias', 'blocks.7.attn.b_to_k.bias', 'blocks.7.attn.b_to_v.bias'],
|
710 |
+
"model.diffusion_model.joint_blocks.7.context_block.attn.qkv.weight": ['blocks.7.attn.b_to_q.weight', 'blocks.7.attn.b_to_k.weight', 'blocks.7.attn.b_to_v.weight'],
|
711 |
+
"model.diffusion_model.joint_blocks.7.context_block.mlp.fc1.bias": "blocks.7.ff_b.0.bias",
|
712 |
+
"model.diffusion_model.joint_blocks.7.context_block.mlp.fc1.weight": "blocks.7.ff_b.0.weight",
|
713 |
+
"model.diffusion_model.joint_blocks.7.context_block.mlp.fc2.bias": "blocks.7.ff_b.2.bias",
|
714 |
+
"model.diffusion_model.joint_blocks.7.context_block.mlp.fc2.weight": "blocks.7.ff_b.2.weight",
|
715 |
+
"model.diffusion_model.joint_blocks.7.x_block.adaLN_modulation.1.bias": "blocks.7.norm1_a.linear.bias",
|
716 |
+
"model.diffusion_model.joint_blocks.7.x_block.adaLN_modulation.1.weight": "blocks.7.norm1_a.linear.weight",
|
717 |
+
"model.diffusion_model.joint_blocks.7.x_block.attn.proj.bias": "blocks.7.attn.a_to_out.bias",
|
718 |
+
"model.diffusion_model.joint_blocks.7.x_block.attn.proj.weight": "blocks.7.attn.a_to_out.weight",
|
719 |
+
"model.diffusion_model.joint_blocks.7.x_block.attn.qkv.bias": ['blocks.7.attn.a_to_q.bias', 'blocks.7.attn.a_to_k.bias', 'blocks.7.attn.a_to_v.bias'],
|
720 |
+
"model.diffusion_model.joint_blocks.7.x_block.attn.qkv.weight": ['blocks.7.attn.a_to_q.weight', 'blocks.7.attn.a_to_k.weight', 'blocks.7.attn.a_to_v.weight'],
|
721 |
+
"model.diffusion_model.joint_blocks.7.x_block.mlp.fc1.bias": "blocks.7.ff_a.0.bias",
|
722 |
+
"model.diffusion_model.joint_blocks.7.x_block.mlp.fc1.weight": "blocks.7.ff_a.0.weight",
|
723 |
+
"model.diffusion_model.joint_blocks.7.x_block.mlp.fc2.bias": "blocks.7.ff_a.2.bias",
|
724 |
+
"model.diffusion_model.joint_blocks.7.x_block.mlp.fc2.weight": "blocks.7.ff_a.2.weight",
|
725 |
+
"model.diffusion_model.joint_blocks.8.context_block.adaLN_modulation.1.bias": "blocks.8.norm1_b.linear.bias",
|
726 |
+
"model.diffusion_model.joint_blocks.8.context_block.adaLN_modulation.1.weight": "blocks.8.norm1_b.linear.weight",
|
727 |
+
"model.diffusion_model.joint_blocks.8.context_block.attn.proj.bias": "blocks.8.attn.b_to_out.bias",
|
728 |
+
"model.diffusion_model.joint_blocks.8.context_block.attn.proj.weight": "blocks.8.attn.b_to_out.weight",
|
729 |
+
"model.diffusion_model.joint_blocks.8.context_block.attn.qkv.bias": ['blocks.8.attn.b_to_q.bias', 'blocks.8.attn.b_to_k.bias', 'blocks.8.attn.b_to_v.bias'],
|
730 |
+
"model.diffusion_model.joint_blocks.8.context_block.attn.qkv.weight": ['blocks.8.attn.b_to_q.weight', 'blocks.8.attn.b_to_k.weight', 'blocks.8.attn.b_to_v.weight'],
|
731 |
+
"model.diffusion_model.joint_blocks.8.context_block.mlp.fc1.bias": "blocks.8.ff_b.0.bias",
|
732 |
+
"model.diffusion_model.joint_blocks.8.context_block.mlp.fc1.weight": "blocks.8.ff_b.0.weight",
|
733 |
+
"model.diffusion_model.joint_blocks.8.context_block.mlp.fc2.bias": "blocks.8.ff_b.2.bias",
|
734 |
+
"model.diffusion_model.joint_blocks.8.context_block.mlp.fc2.weight": "blocks.8.ff_b.2.weight",
|
735 |
+
"model.diffusion_model.joint_blocks.8.x_block.adaLN_modulation.1.bias": "blocks.8.norm1_a.linear.bias",
|
736 |
+
"model.diffusion_model.joint_blocks.8.x_block.adaLN_modulation.1.weight": "blocks.8.norm1_a.linear.weight",
|
737 |
+
"model.diffusion_model.joint_blocks.8.x_block.attn.proj.bias": "blocks.8.attn.a_to_out.bias",
|
738 |
+
"model.diffusion_model.joint_blocks.8.x_block.attn.proj.weight": "blocks.8.attn.a_to_out.weight",
|
739 |
+
"model.diffusion_model.joint_blocks.8.x_block.attn.qkv.bias": ['blocks.8.attn.a_to_q.bias', 'blocks.8.attn.a_to_k.bias', 'blocks.8.attn.a_to_v.bias'],
|
740 |
+
"model.diffusion_model.joint_blocks.8.x_block.attn.qkv.weight": ['blocks.8.attn.a_to_q.weight', 'blocks.8.attn.a_to_k.weight', 'blocks.8.attn.a_to_v.weight'],
|
741 |
+
"model.diffusion_model.joint_blocks.8.x_block.mlp.fc1.bias": "blocks.8.ff_a.0.bias",
|
742 |
+
"model.diffusion_model.joint_blocks.8.x_block.mlp.fc1.weight": "blocks.8.ff_a.0.weight",
|
743 |
+
"model.diffusion_model.joint_blocks.8.x_block.mlp.fc2.bias": "blocks.8.ff_a.2.bias",
|
744 |
+
"model.diffusion_model.joint_blocks.8.x_block.mlp.fc2.weight": "blocks.8.ff_a.2.weight",
|
745 |
+
"model.diffusion_model.joint_blocks.9.context_block.adaLN_modulation.1.bias": "blocks.9.norm1_b.linear.bias",
|
746 |
+
"model.diffusion_model.joint_blocks.9.context_block.adaLN_modulation.1.weight": "blocks.9.norm1_b.linear.weight",
|
747 |
+
"model.diffusion_model.joint_blocks.9.context_block.attn.proj.bias": "blocks.9.attn.b_to_out.bias",
|
748 |
+
"model.diffusion_model.joint_blocks.9.context_block.attn.proj.weight": "blocks.9.attn.b_to_out.weight",
|
749 |
+
"model.diffusion_model.joint_blocks.9.context_block.attn.qkv.bias": ['blocks.9.attn.b_to_q.bias', 'blocks.9.attn.b_to_k.bias', 'blocks.9.attn.b_to_v.bias'],
|
750 |
+
"model.diffusion_model.joint_blocks.9.context_block.attn.qkv.weight": ['blocks.9.attn.b_to_q.weight', 'blocks.9.attn.b_to_k.weight', 'blocks.9.attn.b_to_v.weight'],
|
751 |
+
"model.diffusion_model.joint_blocks.9.context_block.mlp.fc1.bias": "blocks.9.ff_b.0.bias",
|
752 |
+
"model.diffusion_model.joint_blocks.9.context_block.mlp.fc1.weight": "blocks.9.ff_b.0.weight",
|
753 |
+
"model.diffusion_model.joint_blocks.9.context_block.mlp.fc2.bias": "blocks.9.ff_b.2.bias",
|
754 |
+
"model.diffusion_model.joint_blocks.9.context_block.mlp.fc2.weight": "blocks.9.ff_b.2.weight",
|
755 |
+
"model.diffusion_model.joint_blocks.9.x_block.adaLN_modulation.1.bias": "blocks.9.norm1_a.linear.bias",
|
756 |
+
"model.diffusion_model.joint_blocks.9.x_block.adaLN_modulation.1.weight": "blocks.9.norm1_a.linear.weight",
|
757 |
+
"model.diffusion_model.joint_blocks.9.x_block.attn.proj.bias": "blocks.9.attn.a_to_out.bias",
|
758 |
+
"model.diffusion_model.joint_blocks.9.x_block.attn.proj.weight": "blocks.9.attn.a_to_out.weight",
|
759 |
+
"model.diffusion_model.joint_blocks.9.x_block.attn.qkv.bias": ['blocks.9.attn.a_to_q.bias', 'blocks.9.attn.a_to_k.bias', 'blocks.9.attn.a_to_v.bias'],
|
760 |
+
"model.diffusion_model.joint_blocks.9.x_block.attn.qkv.weight": ['blocks.9.attn.a_to_q.weight', 'blocks.9.attn.a_to_k.weight', 'blocks.9.attn.a_to_v.weight'],
|
761 |
+
"model.diffusion_model.joint_blocks.9.x_block.mlp.fc1.bias": "blocks.9.ff_a.0.bias",
|
762 |
+
"model.diffusion_model.joint_blocks.9.x_block.mlp.fc1.weight": "blocks.9.ff_a.0.weight",
|
763 |
+
"model.diffusion_model.joint_blocks.9.x_block.mlp.fc2.bias": "blocks.9.ff_a.2.bias",
|
764 |
+
"model.diffusion_model.joint_blocks.9.x_block.mlp.fc2.weight": "blocks.9.ff_a.2.weight",
|
765 |
+
"model.diffusion_model.pos_embed": "pos_embedder.pos_embed",
|
766 |
+
"model.diffusion_model.t_embedder.mlp.0.bias": "time_embedder.timestep_embedder.0.bias",
|
767 |
+
"model.diffusion_model.t_embedder.mlp.0.weight": "time_embedder.timestep_embedder.0.weight",
|
768 |
+
"model.diffusion_model.t_embedder.mlp.2.bias": "time_embedder.timestep_embedder.2.bias",
|
769 |
+
"model.diffusion_model.t_embedder.mlp.2.weight": "time_embedder.timestep_embedder.2.weight",
|
770 |
+
"model.diffusion_model.x_embedder.proj.bias": "pos_embedder.proj.bias",
|
771 |
+
"model.diffusion_model.x_embedder.proj.weight": "pos_embedder.proj.weight",
|
772 |
+
"model.diffusion_model.y_embedder.mlp.0.bias": "pooled_text_embedder.0.bias",
|
773 |
+
"model.diffusion_model.y_embedder.mlp.0.weight": "pooled_text_embedder.0.weight",
|
774 |
+
"model.diffusion_model.y_embedder.mlp.2.bias": "pooled_text_embedder.2.bias",
|
775 |
+
"model.diffusion_model.y_embedder.mlp.2.weight": "pooled_text_embedder.2.weight",
|
776 |
+
|
777 |
+
"model.diffusion_model.joint_blocks.23.context_block.adaLN_modulation.1.weight": "blocks.23.norm1_b.linear.weight",
|
778 |
+
"model.diffusion_model.joint_blocks.23.context_block.adaLN_modulation.1.bias": "blocks.23.norm1_b.linear.bias",
|
779 |
+
"model.diffusion_model.final_layer.adaLN_modulation.1.weight": "norm_out.linear.weight",
|
780 |
+
"model.diffusion_model.final_layer.adaLN_modulation.1.bias": "norm_out.linear.bias",
|
781 |
+
}
|
782 |
+
state_dict_ = {}
|
783 |
+
for name in state_dict:
|
784 |
+
if name in rename_dict:
|
785 |
+
param = state_dict[name]
|
786 |
+
if name.startswith("model.diffusion_model.joint_blocks.23.context_block.adaLN_modulation.1."):
|
787 |
+
param = torch.concat([param[1536:], param[:1536]], axis=0)
|
788 |
+
elif name.startswith("model.diffusion_model.final_layer.adaLN_modulation.1."):
|
789 |
+
param = torch.concat([param[1536:], param[:1536]], axis=0)
|
790 |
+
elif name == "model.diffusion_model.pos_embed":
|
791 |
+
param = param.reshape((1, 192, 192, 1536))
|
792 |
+
if isinstance(rename_dict[name], str):
|
793 |
+
state_dict_[rename_dict[name]] = param
|
794 |
+
else:
|
795 |
+
name_ = rename_dict[name][0].replace(".a_to_q.", ".a_to_qkv.").replace(".b_to_q.", ".b_to_qkv.")
|
796 |
+
state_dict_[name_] = param
|
797 |
+
return state_dict_
|
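For orientation, a minimal sketch of how a converter like this is typically applied (illustrative only, not part of the commit; the checkpoint path, the SD3DiT class name and the from_civitai entry point are assumptions based on the surrounding files):

# Illustrative only -- not shipped with this commit. Checkpoint path and the
# SD3DiT / from_civitai names are assumptions.
import torch
from safetensors.torch import load_file
from diffsynth.models.sd3_dit import SD3DiT

state_dict = load_file("models/sd3_medium.safetensors")  # assumed local path
dit = SD3DiT()
# The converter remaps the original parameter names, swaps the two halves of
# the final adaLN parameters, and reshapes pos_embed for SD3DiT's layout.
converted = dit.state_dict_converter().from_civitai(state_dict)
dit.load_state_dict(converted, strict=False)  # strict=False only to keep the sketch forgiving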
diffsynth/models/sd3_text_encoder.py
ADDED
The diff for this file is too large to render.
See raw diff
diffsynth/models/sd3_vae_decoder.py
ADDED
@@ -0,0 +1,80 @@
import torch
from .sd_vae_decoder import VAEAttentionBlock, SDVAEDecoderStateDictConverter
from .sd_unet import ResnetBlock, UpSampler
from .tiler import TileWorker


class SD3VAEDecoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.scaling_factor = 1.5305  # Different from SD 1.x
        self.shift_factor = 0.0609  # Different from SD 1.x
        self.conv_in = torch.nn.Conv2d(16, 512, kernel_size=3, padding=1)  # Different from SD 1.x

        self.blocks = torch.nn.ModuleList([
            # UNetMidBlock2D
            ResnetBlock(512, 512, eps=1e-6),
            VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            # UpDecoderBlock2D
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            UpSampler(512),
            # UpDecoderBlock2D
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            UpSampler(512),
            # UpDecoderBlock2D
            ResnetBlock(512, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            UpSampler(256),
            # UpDecoderBlock2D
            ResnetBlock(256, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
        ])

        self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6)
        self.conv_act = torch.nn.SiLU()
        self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)

    def tiled_forward(self, sample, tile_size=64, tile_stride=32):
        hidden_states = TileWorker().tiled_forward(
            lambda x: self.forward(x),
            sample,
            tile_size,
            tile_stride,
            tile_device=sample.device,
            tile_dtype=sample.dtype
        )
        return hidden_states

    def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
        # For VAE Decoder, we do not need to apply the tiler on each layer.
        if tiled:
            return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)

        # 1. pre-process
        hidden_states = sample / self.scaling_factor + self.shift_factor
        hidden_states = self.conv_in(hidden_states)
        time_emb = None
        text_emb = None
        res_stack = None

        # 2. blocks
        for i, block in enumerate(self.blocks):
            hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)

        # 3. output
        hidden_states = self.conv_norm_out(hidden_states)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)

        return hidden_states

    def state_dict_converter(self):
        return SDVAEDecoderStateDictConverter()
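A minimal usage sketch for the decoder above (illustrative only, not part of the commit; latent shape and size are assumptions):

# Illustrative only -- decode a batch of SD3 latents back to image space.
import torch
from diffsynth.models.sd3_vae_decoder import SD3VAEDecoder

decoder = SD3VAEDecoder().eval()
latents = torch.randn(1, 16, 128, 128)  # SD3 latents have 16 channels

with torch.no_grad():
    # forward() first undoes the encode-time scale/shift, then runs the
    # ResNet / attention / upsampler stack to produce a 3-channel image.
    image = decoder(latents, tiled=True, tile_size=64, tile_stride=32)

print(image.shape)  # (1, 3, 1024, 1024) for 128x128 latents (3 upsampling stages)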
diffsynth/models/sd3_vae_encoder.py
ADDED
@@ -0,0 +1,94 @@
import torch
from .sd_unet import ResnetBlock, DownSampler
from .sd_vae_encoder import VAEAttentionBlock, SDVAEEncoderStateDictConverter
from .tiler import TileWorker
from einops import rearrange


class SD3VAEEncoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.scaling_factor = 1.5305  # Different from SD 1.x
        self.shift_factor = 0.0609  # Different from SD 1.x
        self.conv_in = torch.nn.Conv2d(3, 128, kernel_size=3, padding=1)

        self.blocks = torch.nn.ModuleList([
            # DownEncoderBlock2D
            ResnetBlock(128, 128, eps=1e-6),
            ResnetBlock(128, 128, eps=1e-6),
            DownSampler(128, padding=0, extra_padding=True),
            # DownEncoderBlock2D
            ResnetBlock(128, 256, eps=1e-6),
            ResnetBlock(256, 256, eps=1e-6),
            DownSampler(256, padding=0, extra_padding=True),
            # DownEncoderBlock2D
            ResnetBlock(256, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            DownSampler(512, padding=0, extra_padding=True),
            # DownEncoderBlock2D
            ResnetBlock(512, 512, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
            # UNetMidBlock2D
            ResnetBlock(512, 512, eps=1e-6),
            VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
            ResnetBlock(512, 512, eps=1e-6),
        ])

        self.conv_norm_out = torch.nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6)
        self.conv_act = torch.nn.SiLU()
        self.conv_out = torch.nn.Conv2d(512, 32, kernel_size=3, padding=1)

    def tiled_forward(self, sample, tile_size=64, tile_stride=32):
        hidden_states = TileWorker().tiled_forward(
            lambda x: self.forward(x),
            sample,
            tile_size,
            tile_stride,
            tile_device=sample.device,
            tile_dtype=sample.dtype
        )
        return hidden_states

    def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
        # For the VAE Encoder, we do not need to apply the tiler on each layer.
        if tiled:
            return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)

        # 1. pre-process
        hidden_states = self.conv_in(sample)
        time_emb = None
        text_emb = None
        res_stack = None

        # 2. blocks
        for i, block in enumerate(self.blocks):
            hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)

        # 3. output
        hidden_states = self.conv_norm_out(hidden_states)
        hidden_states = self.conv_act(hidden_states)
        hidden_states = self.conv_out(hidden_states)
        hidden_states = hidden_states[:, :16]
        hidden_states = (hidden_states - self.shift_factor) * self.scaling_factor

        return hidden_states

    def encode_video(self, sample, batch_size=8):
        B = sample.shape[0]
        hidden_states = []

        for i in range(0, sample.shape[2], batch_size):
            j = min(i + batch_size, sample.shape[2])
            sample_batch = rearrange(sample[:, :, i:j], "B C T H W -> (B T) C H W")

            hidden_states_batch = self(sample_batch)
            hidden_states_batch = rearrange(hidden_states_batch, "(B T) C H W -> B C T H W", B=B)

            hidden_states.append(hidden_states_batch)

        hidden_states = torch.concat(hidden_states, dim=2)
        return hidden_states

    def state_dict_converter(self):
        return SDVAEEncoderStateDictConverter()
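A minimal usage sketch for the encoder above (illustrative only, not part of the commit; input ranges and shapes are assumptions):

# Illustrative only -- encode an image (and a short clip) into SD3 latents.
import torch
from diffsynth.models.sd3_vae_encoder import SD3VAEEncoder

encoder = SD3VAEEncoder().eval()
image = torch.rand(1, 3, 1024, 1024) * 2 - 1  # assumed to be normalized to [-1, 1]

with torch.no_grad():
    # conv_out produces 32 channels; the first 16 (the mean half) are kept,
    # then shifted and scaled -> (1, 16, 128, 128) after 3 downsampling stages.
    latents = encoder(image)

    # encode_video pushes a (B, C, T, H, W) clip through the same 2D encoder
    # in temporal chunks of batch_size frames.
    video = torch.rand(1, 3, 16, 512, 512) * 2 - 1
    video_latents = encoder.encode_video(video, batch_size=8)

print(latents.shape, video_latents.shape)  # (1, 16, 128, 128) and (1, 16, 16, 64, 64)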
diffsynth/models/sd_controlnet.py
ADDED
@@ -0,0 +1,587 @@
import torch
from .sd_unet import Timesteps, ResnetBlock, AttentionBlock, PushBlock, DownSampler
from .tiler import TileWorker


class ControlNetConditioningLayer(torch.nn.Module):
    def __init__(self, channels = (3, 16, 32, 96, 256, 320)):
        super().__init__()
        self.blocks = torch.nn.ModuleList([])
        self.blocks.append(torch.nn.Conv2d(channels[0], channels[1], kernel_size=3, padding=1))
        self.blocks.append(torch.nn.SiLU())
        for i in range(1, len(channels) - 2):
            self.blocks.append(torch.nn.Conv2d(channels[i], channels[i], kernel_size=3, padding=1))
            self.blocks.append(torch.nn.SiLU())
            self.blocks.append(torch.nn.Conv2d(channels[i], channels[i+1], kernel_size=3, padding=1, stride=2))
            self.blocks.append(torch.nn.SiLU())
        self.blocks.append(torch.nn.Conv2d(channels[-2], channels[-1], kernel_size=3, padding=1))

    def forward(self, conditioning):
        for block in self.blocks:
            conditioning = block(conditioning)
        return conditioning


class SDControlNet(torch.nn.Module):
    def __init__(self, global_pool=False):
        super().__init__()
        self.time_proj = Timesteps(320)
        self.time_embedding = torch.nn.Sequential(
            torch.nn.Linear(320, 1280),
            torch.nn.SiLU(),
            torch.nn.Linear(1280, 1280)
        )
        self.conv_in = torch.nn.Conv2d(4, 320, kernel_size=3, padding=1)

        self.controlnet_conv_in = ControlNetConditioningLayer(channels=(3, 16, 32, 96, 256, 320))

        self.blocks = torch.nn.ModuleList([
            # CrossAttnDownBlock2D
            ResnetBlock(320, 320, 1280),
            AttentionBlock(8, 40, 320, 1, 768),
            PushBlock(),
            ResnetBlock(320, 320, 1280),
            AttentionBlock(8, 40, 320, 1, 768),
            PushBlock(),
            DownSampler(320),
            PushBlock(),
            # CrossAttnDownBlock2D
            ResnetBlock(320, 640, 1280),
            AttentionBlock(8, 80, 640, 1, 768),
            PushBlock(),
            ResnetBlock(640, 640, 1280),
            AttentionBlock(8, 80, 640, 1, 768),
            PushBlock(),
            DownSampler(640),
            PushBlock(),
            # CrossAttnDownBlock2D
            ResnetBlock(640, 1280, 1280),
            AttentionBlock(8, 160, 1280, 1, 768),
            PushBlock(),
            ResnetBlock(1280, 1280, 1280),
            AttentionBlock(8, 160, 1280, 1, 768),
            PushBlock(),
            DownSampler(1280),
            PushBlock(),
            # DownBlock2D
            ResnetBlock(1280, 1280, 1280),
            PushBlock(),
            ResnetBlock(1280, 1280, 1280),
            PushBlock(),
            # UNetMidBlock2DCrossAttn
            ResnetBlock(1280, 1280, 1280),
            AttentionBlock(8, 160, 1280, 1, 768),
            ResnetBlock(1280, 1280, 1280),
            PushBlock()
        ])

        self.controlnet_blocks = torch.nn.ModuleList([
            torch.nn.Conv2d(320, 320, kernel_size=(1, 1)),
            torch.nn.Conv2d(320, 320, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(320, 320, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(320, 320, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(640, 640, kernel_size=(1, 1)),
            torch.nn.Conv2d(640, 640, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(640, 640, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1)),
            torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
            torch.nn.Conv2d(1280, 1280, kernel_size=(1, 1), bias=False),
        ])

        self.global_pool = global_pool

    def forward(
        self,
        sample, timestep, encoder_hidden_states, conditioning,
        tiled=False, tile_size=64, tile_stride=32,
    ):
        # 1. time
        time_emb = self.time_proj(timestep[None]).to(sample.dtype)
        time_emb = self.time_embedding(time_emb)
        time_emb = time_emb.repeat(sample.shape[0], 1)

        # 2. pre-process
        height, width = sample.shape[2], sample.shape[3]
        hidden_states = self.conv_in(sample) + self.controlnet_conv_in(conditioning)
        text_emb = encoder_hidden_states
        res_stack = [hidden_states]

        # 3. blocks
        for i, block in enumerate(self.blocks):
            if tiled and not isinstance(block, PushBlock):
                _, _, inter_height, _ = hidden_states.shape
                resize_scale = inter_height / height
                hidden_states = TileWorker().tiled_forward(
                    lambda x: block(x, time_emb, text_emb, res_stack)[0],
                    hidden_states,
                    int(tile_size * resize_scale),
                    int(tile_stride * resize_scale),
                    tile_device=hidden_states.device,
                    tile_dtype=hidden_states.dtype
                )
            else:
                hidden_states, _, _, _ = block(hidden_states, time_emb, text_emb, res_stack)

        # 4. ControlNet blocks
        controlnet_res_stack = [block(res) for block, res in zip(self.controlnet_blocks, res_stack)]

        # pool
        if self.global_pool:
            controlnet_res_stack = [res.mean(dim=(2, 3), keepdim=True) for res in controlnet_res_stack]

        return controlnet_res_stack

    def state_dict_converter(self):
        return SDControlNetStateDictConverter()


class SDControlNetStateDictConverter:
    def __init__(self):
        pass

    def from_diffusers(self, state_dict):
        # architecture
        block_types = [
            'ResnetBlock', 'AttentionBlock', 'PushBlock', 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'DownSampler', 'PushBlock',
            'ResnetBlock', 'AttentionBlock', 'PushBlock', 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'DownSampler', 'PushBlock',
            'ResnetBlock', 'AttentionBlock', 'PushBlock', 'ResnetBlock', 'AttentionBlock', 'PushBlock', 'DownSampler', 'PushBlock',
            'ResnetBlock', 'PushBlock', 'ResnetBlock', 'PushBlock',
            'ResnetBlock', 'AttentionBlock', 'ResnetBlock',
            'PopBlock', 'ResnetBlock', 'PopBlock', 'ResnetBlock', 'PopBlock', 'ResnetBlock', 'UpSampler',
            'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'UpSampler',
            'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'UpSampler',
            'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock', 'PopBlock', 'ResnetBlock', 'AttentionBlock'
        ]

        # controlnet_rename_dict
        controlnet_rename_dict = {
            "controlnet_cond_embedding.conv_in.weight": "controlnet_conv_in.blocks.0.weight",
            "controlnet_cond_embedding.conv_in.bias": "controlnet_conv_in.blocks.0.bias",
            "controlnet_cond_embedding.blocks.0.weight": "controlnet_conv_in.blocks.2.weight",
            "controlnet_cond_embedding.blocks.0.bias": "controlnet_conv_in.blocks.2.bias",
            "controlnet_cond_embedding.blocks.1.weight": "controlnet_conv_in.blocks.4.weight",
            "controlnet_cond_embedding.blocks.1.bias": "controlnet_conv_in.blocks.4.bias",
            "controlnet_cond_embedding.blocks.2.weight": "controlnet_conv_in.blocks.6.weight",
            "controlnet_cond_embedding.blocks.2.bias": "controlnet_conv_in.blocks.6.bias",
            "controlnet_cond_embedding.blocks.3.weight": "controlnet_conv_in.blocks.8.weight",
            "controlnet_cond_embedding.blocks.3.bias": "controlnet_conv_in.blocks.8.bias",
            "controlnet_cond_embedding.blocks.4.weight": "controlnet_conv_in.blocks.10.weight",
            "controlnet_cond_embedding.blocks.4.bias": "controlnet_conv_in.blocks.10.bias",
            "controlnet_cond_embedding.blocks.5.weight": "controlnet_conv_in.blocks.12.weight",
            "controlnet_cond_embedding.blocks.5.bias": "controlnet_conv_in.blocks.12.bias",
            "controlnet_cond_embedding.conv_out.weight": "controlnet_conv_in.blocks.14.weight",
            "controlnet_cond_embedding.conv_out.bias": "controlnet_conv_in.blocks.14.bias",
        }

        # Rename each parameter
        name_list = sorted([name for name in state_dict])
        rename_dict = {}
        block_id = {"ResnetBlock": -1, "AttentionBlock": -1, "DownSampler": -1, "UpSampler": -1}
        last_block_type_with_id = {"ResnetBlock": "", "AttentionBlock": "", "DownSampler": "", "UpSampler": ""}
        for name in name_list:
            names = name.split(".")
            if names[0] in ["conv_in", "conv_norm_out", "conv_out"]:
                pass
            elif name in controlnet_rename_dict:
                names = controlnet_rename_dict[name].split(".")
            elif names[0] == "controlnet_down_blocks":
                names[0] = "controlnet_blocks"
            elif names[0] == "controlnet_mid_block":
                names = ["controlnet_blocks", "12", names[-1]]
            elif names[0] in ["time_embedding", "add_embedding"]:
                if names[0] == "add_embedding":
                    names[0] = "add_time_embedding"
                names[1] = {"linear_1": "0", "linear_2": "2"}[names[1]]
            elif names[0] in ["down_blocks", "mid_block", "up_blocks"]:
                if names[0] == "mid_block":
                    names.insert(1, "0")
                block_type = {"resnets": "ResnetBlock", "attentions": "AttentionBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}[names[2]]
                block_type_with_id = ".".join(names[:4])
                if block_type_with_id != last_block_type_with_id[block_type]:
                    block_id[block_type] += 1
                    last_block_type_with_id[block_type] = block_type_with_id
                while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
                    block_id[block_type] += 1
                block_type_with_id = ".".join(names[:4])
                names = ["blocks", str(block_id[block_type])] + names[4:]
                if "ff" in names:
                    ff_index = names.index("ff")
                    component = ".".join(names[ff_index:ff_index+3])
                    component = {"ff.net.0": "act_fn", "ff.net.2": "ff"}[component]
                    names = names[:ff_index] + [component] + names[ff_index+3:]
                if "to_out" in names:
                    names.pop(names.index("to_out") + 1)
            else:
                raise ValueError(f"Unknown parameters: {name}")
            rename_dict[name] = ".".join(names)

        # Convert state_dict
        state_dict_ = {}
        for name, param in state_dict.items():
            if ".proj_in." in name or ".proj_out." in name:
                param = param.squeeze()
            if rename_dict[name] in [
                "controlnet_blocks.1.bias", "controlnet_blocks.2.bias", "controlnet_blocks.3.bias", "controlnet_blocks.5.bias", "controlnet_blocks.6.bias",
                "controlnet_blocks.8.bias", "controlnet_blocks.9.bias", "controlnet_blocks.10.bias", "controlnet_blocks.11.bias", "controlnet_blocks.12.bias"
            ]:
                continue
            state_dict_[rename_dict[name]] = param
        return state_dict_

234 |
+
def from_civitai(self, state_dict):
|
235 |
+
if "mid_block.resnets.1.time_emb_proj.weight" in state_dict:
|
236 |
+
# For controlnets in diffusers format
|
237 |
+
return self.from_diffusers(state_dict)
|
238 |
+
rename_dict = {
|
239 |
+
"control_model.time_embed.0.weight": "time_embedding.0.weight",
|
240 |
+
"control_model.time_embed.0.bias": "time_embedding.0.bias",
|
241 |
+
"control_model.time_embed.2.weight": "time_embedding.2.weight",
|
242 |
+
"control_model.time_embed.2.bias": "time_embedding.2.bias",
|
243 |
+
"control_model.input_blocks.0.0.weight": "conv_in.weight",
|
244 |
+
"control_model.input_blocks.0.0.bias": "conv_in.bias",
|
245 |
+
"control_model.input_blocks.1.0.in_layers.0.weight": "blocks.0.norm1.weight",
|
246 |
+
"control_model.input_blocks.1.0.in_layers.0.bias": "blocks.0.norm1.bias",
|
247 |
+
"control_model.input_blocks.1.0.in_layers.2.weight": "blocks.0.conv1.weight",
|
248 |
+
"control_model.input_blocks.1.0.in_layers.2.bias": "blocks.0.conv1.bias",
|
249 |
+
"control_model.input_blocks.1.0.emb_layers.1.weight": "blocks.0.time_emb_proj.weight",
|
250 |
+
"control_model.input_blocks.1.0.emb_layers.1.bias": "blocks.0.time_emb_proj.bias",
|
251 |
+
"control_model.input_blocks.1.0.out_layers.0.weight": "blocks.0.norm2.weight",
|
252 |
+
"control_model.input_blocks.1.0.out_layers.0.bias": "blocks.0.norm2.bias",
|
253 |
+
"control_model.input_blocks.1.0.out_layers.3.weight": "blocks.0.conv2.weight",
|
254 |
+
"control_model.input_blocks.1.0.out_layers.3.bias": "blocks.0.conv2.bias",
|
255 |
+
"control_model.input_blocks.1.1.norm.weight": "blocks.1.norm.weight",
|
256 |
+
"control_model.input_blocks.1.1.norm.bias": "blocks.1.norm.bias",
|
257 |
+
"control_model.input_blocks.1.1.proj_in.weight": "blocks.1.proj_in.weight",
|
258 |
+
"control_model.input_blocks.1.1.proj_in.bias": "blocks.1.proj_in.bias",
|
259 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_q.weight": "blocks.1.transformer_blocks.0.attn1.to_q.weight",
|
260 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_k.weight": "blocks.1.transformer_blocks.0.attn1.to_k.weight",
|
261 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_v.weight": "blocks.1.transformer_blocks.0.attn1.to_v.weight",
|
262 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.1.transformer_blocks.0.attn1.to_out.weight",
|
263 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.1.transformer_blocks.0.attn1.to_out.bias",
|
264 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.1.transformer_blocks.0.act_fn.proj.weight",
|
265 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.1.transformer_blocks.0.act_fn.proj.bias",
|
266 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.weight": "blocks.1.transformer_blocks.0.ff.weight",
|
267 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.ff.net.2.bias": "blocks.1.transformer_blocks.0.ff.bias",
|
268 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_q.weight": "blocks.1.transformer_blocks.0.attn2.to_q.weight",
|
269 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_k.weight": "blocks.1.transformer_blocks.0.attn2.to_k.weight",
|
270 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_v.weight": "blocks.1.transformer_blocks.0.attn2.to_v.weight",
|
271 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.1.transformer_blocks.0.attn2.to_out.weight",
|
272 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.1.transformer_blocks.0.attn2.to_out.bias",
|
273 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.norm1.weight": "blocks.1.transformer_blocks.0.norm1.weight",
|
274 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.norm1.bias": "blocks.1.transformer_blocks.0.norm1.bias",
|
275 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.norm2.weight": "blocks.1.transformer_blocks.0.norm2.weight",
|
276 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.norm2.bias": "blocks.1.transformer_blocks.0.norm2.bias",
|
277 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.norm3.weight": "blocks.1.transformer_blocks.0.norm3.weight",
|
278 |
+
"control_model.input_blocks.1.1.transformer_blocks.0.norm3.bias": "blocks.1.transformer_blocks.0.norm3.bias",
|
279 |
+
"control_model.input_blocks.1.1.proj_out.weight": "blocks.1.proj_out.weight",
|
280 |
+
"control_model.input_blocks.1.1.proj_out.bias": "blocks.1.proj_out.bias",
|
281 |
+
"control_model.input_blocks.2.0.in_layers.0.weight": "blocks.3.norm1.weight",
|
282 |
+
"control_model.input_blocks.2.0.in_layers.0.bias": "blocks.3.norm1.bias",
|
283 |
+
"control_model.input_blocks.2.0.in_layers.2.weight": "blocks.3.conv1.weight",
|
284 |
+
"control_model.input_blocks.2.0.in_layers.2.bias": "blocks.3.conv1.bias",
|
285 |
+
"control_model.input_blocks.2.0.emb_layers.1.weight": "blocks.3.time_emb_proj.weight",
|
286 |
+
"control_model.input_blocks.2.0.emb_layers.1.bias": "blocks.3.time_emb_proj.bias",
|
287 |
+
"control_model.input_blocks.2.0.out_layers.0.weight": "blocks.3.norm2.weight",
|
288 |
+
"control_model.input_blocks.2.0.out_layers.0.bias": "blocks.3.norm2.bias",
|
289 |
+
"control_model.input_blocks.2.0.out_layers.3.weight": "blocks.3.conv2.weight",
|
290 |
+
"control_model.input_blocks.2.0.out_layers.3.bias": "blocks.3.conv2.bias",
|
291 |
+
"control_model.input_blocks.2.1.norm.weight": "blocks.4.norm.weight",
|
292 |
+
"control_model.input_blocks.2.1.norm.bias": "blocks.4.norm.bias",
|
293 |
+
"control_model.input_blocks.2.1.proj_in.weight": "blocks.4.proj_in.weight",
|
294 |
+
"control_model.input_blocks.2.1.proj_in.bias": "blocks.4.proj_in.bias",
|
295 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_q.weight": "blocks.4.transformer_blocks.0.attn1.to_q.weight",
|
296 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_k.weight": "blocks.4.transformer_blocks.0.attn1.to_k.weight",
|
297 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_v.weight": "blocks.4.transformer_blocks.0.attn1.to_v.weight",
|
298 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.4.transformer_blocks.0.attn1.to_out.weight",
|
299 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.4.transformer_blocks.0.attn1.to_out.bias",
|
300 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.4.transformer_blocks.0.act_fn.proj.weight",
|
301 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.4.transformer_blocks.0.act_fn.proj.bias",
|
302 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.weight": "blocks.4.transformer_blocks.0.ff.weight",
|
303 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.ff.net.2.bias": "blocks.4.transformer_blocks.0.ff.bias",
|
304 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_q.weight": "blocks.4.transformer_blocks.0.attn2.to_q.weight",
|
305 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight": "blocks.4.transformer_blocks.0.attn2.to_k.weight",
|
306 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_v.weight": "blocks.4.transformer_blocks.0.attn2.to_v.weight",
|
307 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.4.transformer_blocks.0.attn2.to_out.weight",
|
308 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.4.transformer_blocks.0.attn2.to_out.bias",
|
309 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.norm1.weight": "blocks.4.transformer_blocks.0.norm1.weight",
|
310 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.norm1.bias": "blocks.4.transformer_blocks.0.norm1.bias",
|
311 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.norm2.weight": "blocks.4.transformer_blocks.0.norm2.weight",
|
312 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.norm2.bias": "blocks.4.transformer_blocks.0.norm2.bias",
|
313 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.norm3.weight": "blocks.4.transformer_blocks.0.norm3.weight",
|
314 |
+
"control_model.input_blocks.2.1.transformer_blocks.0.norm3.bias": "blocks.4.transformer_blocks.0.norm3.bias",
|
315 |
+
"control_model.input_blocks.2.1.proj_out.weight": "blocks.4.proj_out.weight",
|
316 |
+
"control_model.input_blocks.2.1.proj_out.bias": "blocks.4.proj_out.bias",
|
317 |
+
"control_model.input_blocks.3.0.op.weight": "blocks.6.conv.weight",
|
318 |
+
"control_model.input_blocks.3.0.op.bias": "blocks.6.conv.bias",
|
319 |
+
"control_model.input_blocks.4.0.in_layers.0.weight": "blocks.8.norm1.weight",
|
320 |
+
"control_model.input_blocks.4.0.in_layers.0.bias": "blocks.8.norm1.bias",
|
321 |
+
"control_model.input_blocks.4.0.in_layers.2.weight": "blocks.8.conv1.weight",
|
322 |
+
"control_model.input_blocks.4.0.in_layers.2.bias": "blocks.8.conv1.bias",
|
323 |
+
"control_model.input_blocks.4.0.emb_layers.1.weight": "blocks.8.time_emb_proj.weight",
|
324 |
+
"control_model.input_blocks.4.0.emb_layers.1.bias": "blocks.8.time_emb_proj.bias",
|
325 |
+
"control_model.input_blocks.4.0.out_layers.0.weight": "blocks.8.norm2.weight",
|
326 |
+
"control_model.input_blocks.4.0.out_layers.0.bias": "blocks.8.norm2.bias",
|
327 |
+
"control_model.input_blocks.4.0.out_layers.3.weight": "blocks.8.conv2.weight",
|
328 |
+
"control_model.input_blocks.4.0.out_layers.3.bias": "blocks.8.conv2.bias",
|
329 |
+
"control_model.input_blocks.4.0.skip_connection.weight": "blocks.8.conv_shortcut.weight",
|
330 |
+
"control_model.input_blocks.4.0.skip_connection.bias": "blocks.8.conv_shortcut.bias",
|
331 |
+
"control_model.input_blocks.4.1.norm.weight": "blocks.9.norm.weight",
|
332 |
+
"control_model.input_blocks.4.1.norm.bias": "blocks.9.norm.bias",
|
333 |
+
"control_model.input_blocks.4.1.proj_in.weight": "blocks.9.proj_in.weight",
|
334 |
+
"control_model.input_blocks.4.1.proj_in.bias": "blocks.9.proj_in.bias",
|
335 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_q.weight": "blocks.9.transformer_blocks.0.attn1.to_q.weight",
|
336 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_k.weight": "blocks.9.transformer_blocks.0.attn1.to_k.weight",
|
337 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_v.weight": "blocks.9.transformer_blocks.0.attn1.to_v.weight",
|
338 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.9.transformer_blocks.0.attn1.to_out.weight",
|
339 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.9.transformer_blocks.0.attn1.to_out.bias",
|
340 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.9.transformer_blocks.0.act_fn.proj.weight",
|
341 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.9.transformer_blocks.0.act_fn.proj.bias",
|
342 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.weight": "blocks.9.transformer_blocks.0.ff.weight",
|
343 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.ff.net.2.bias": "blocks.9.transformer_blocks.0.ff.bias",
|
344 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_q.weight": "blocks.9.transformer_blocks.0.attn2.to_q.weight",
|
345 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_k.weight": "blocks.9.transformer_blocks.0.attn2.to_k.weight",
|
346 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_v.weight": "blocks.9.transformer_blocks.0.attn2.to_v.weight",
|
347 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.9.transformer_blocks.0.attn2.to_out.weight",
|
348 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.9.transformer_blocks.0.attn2.to_out.bias",
|
349 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.norm1.weight": "blocks.9.transformer_blocks.0.norm1.weight",
|
350 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.norm1.bias": "blocks.9.transformer_blocks.0.norm1.bias",
|
351 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.norm2.weight": "blocks.9.transformer_blocks.0.norm2.weight",
|
352 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.norm2.bias": "blocks.9.transformer_blocks.0.norm2.bias",
|
353 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.norm3.weight": "blocks.9.transformer_blocks.0.norm3.weight",
|
354 |
+
"control_model.input_blocks.4.1.transformer_blocks.0.norm3.bias": "blocks.9.transformer_blocks.0.norm3.bias",
|
355 |
+
"control_model.input_blocks.4.1.proj_out.weight": "blocks.9.proj_out.weight",
|
356 |
+
"control_model.input_blocks.4.1.proj_out.bias": "blocks.9.proj_out.bias",
|
357 |
+
"control_model.input_blocks.5.0.in_layers.0.weight": "blocks.11.norm1.weight",
|
358 |
+
"control_model.input_blocks.5.0.in_layers.0.bias": "blocks.11.norm1.bias",
|
359 |
+
"control_model.input_blocks.5.0.in_layers.2.weight": "blocks.11.conv1.weight",
|
360 |
+
"control_model.input_blocks.5.0.in_layers.2.bias": "blocks.11.conv1.bias",
|
361 |
+
"control_model.input_blocks.5.0.emb_layers.1.weight": "blocks.11.time_emb_proj.weight",
|
362 |
+
"control_model.input_blocks.5.0.emb_layers.1.bias": "blocks.11.time_emb_proj.bias",
|
363 |
+
"control_model.input_blocks.5.0.out_layers.0.weight": "blocks.11.norm2.weight",
|
364 |
+
"control_model.input_blocks.5.0.out_layers.0.bias": "blocks.11.norm2.bias",
|
365 |
+
"control_model.input_blocks.5.0.out_layers.3.weight": "blocks.11.conv2.weight",
|
366 |
+
"control_model.input_blocks.5.0.out_layers.3.bias": "blocks.11.conv2.bias",
|
367 |
+
"control_model.input_blocks.5.1.norm.weight": "blocks.12.norm.weight",
|
368 |
+
"control_model.input_blocks.5.1.norm.bias": "blocks.12.norm.bias",
|
369 |
+
"control_model.input_blocks.5.1.proj_in.weight": "blocks.12.proj_in.weight",
|
370 |
+
"control_model.input_blocks.5.1.proj_in.bias": "blocks.12.proj_in.bias",
|
371 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_q.weight": "blocks.12.transformer_blocks.0.attn1.to_q.weight",
|
372 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_k.weight": "blocks.12.transformer_blocks.0.attn1.to_k.weight",
|
373 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_v.weight": "blocks.12.transformer_blocks.0.attn1.to_v.weight",
|
374 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.12.transformer_blocks.0.attn1.to_out.weight",
|
375 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.12.transformer_blocks.0.attn1.to_out.bias",
|
376 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.12.transformer_blocks.0.act_fn.proj.weight",
|
377 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.12.transformer_blocks.0.act_fn.proj.bias",
|
378 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.weight": "blocks.12.transformer_blocks.0.ff.weight",
|
379 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.ff.net.2.bias": "blocks.12.transformer_blocks.0.ff.bias",
|
380 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_q.weight": "blocks.12.transformer_blocks.0.attn2.to_q.weight",
|
381 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight": "blocks.12.transformer_blocks.0.attn2.to_k.weight",
|
382 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_v.weight": "blocks.12.transformer_blocks.0.attn2.to_v.weight",
|
383 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.12.transformer_blocks.0.attn2.to_out.weight",
|
384 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.12.transformer_blocks.0.attn2.to_out.bias",
|
385 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.norm1.weight": "blocks.12.transformer_blocks.0.norm1.weight",
|
386 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.norm1.bias": "blocks.12.transformer_blocks.0.norm1.bias",
|
387 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.norm2.weight": "blocks.12.transformer_blocks.0.norm2.weight",
|
388 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.norm2.bias": "blocks.12.transformer_blocks.0.norm2.bias",
|
389 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.norm3.weight": "blocks.12.transformer_blocks.0.norm3.weight",
|
390 |
+
"control_model.input_blocks.5.1.transformer_blocks.0.norm3.bias": "blocks.12.transformer_blocks.0.norm3.bias",
|
391 |
+
"control_model.input_blocks.5.1.proj_out.weight": "blocks.12.proj_out.weight",
|
392 |
+
"control_model.input_blocks.5.1.proj_out.bias": "blocks.12.proj_out.bias",
|
393 |
+
"control_model.input_blocks.6.0.op.weight": "blocks.14.conv.weight",
|
394 |
+
"control_model.input_blocks.6.0.op.bias": "blocks.14.conv.bias",
|
395 |
+
"control_model.input_blocks.7.0.in_layers.0.weight": "blocks.16.norm1.weight",
|
396 |
+
"control_model.input_blocks.7.0.in_layers.0.bias": "blocks.16.norm1.bias",
|
397 |
+
"control_model.input_blocks.7.0.in_layers.2.weight": "blocks.16.conv1.weight",
|
398 |
+
"control_model.input_blocks.7.0.in_layers.2.bias": "blocks.16.conv1.bias",
|
399 |
+
"control_model.input_blocks.7.0.emb_layers.1.weight": "blocks.16.time_emb_proj.weight",
|
400 |
+
"control_model.input_blocks.7.0.emb_layers.1.bias": "blocks.16.time_emb_proj.bias",
|
401 |
+
"control_model.input_blocks.7.0.out_layers.0.weight": "blocks.16.norm2.weight",
|
402 |
+
"control_model.input_blocks.7.0.out_layers.0.bias": "blocks.16.norm2.bias",
|
403 |
+
"control_model.input_blocks.7.0.out_layers.3.weight": "blocks.16.conv2.weight",
|
404 |
+
"control_model.input_blocks.7.0.out_layers.3.bias": "blocks.16.conv2.bias",
|
405 |
+
"control_model.input_blocks.7.0.skip_connection.weight": "blocks.16.conv_shortcut.weight",
|
406 |
+
"control_model.input_blocks.7.0.skip_connection.bias": "blocks.16.conv_shortcut.bias",
|
407 |
+
"control_model.input_blocks.7.1.norm.weight": "blocks.17.norm.weight",
|
408 |
+
"control_model.input_blocks.7.1.norm.bias": "blocks.17.norm.bias",
|
409 |
+
"control_model.input_blocks.7.1.proj_in.weight": "blocks.17.proj_in.weight",
|
410 |
+
"control_model.input_blocks.7.1.proj_in.bias": "blocks.17.proj_in.bias",
|
411 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_q.weight": "blocks.17.transformer_blocks.0.attn1.to_q.weight",
|
412 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_k.weight": "blocks.17.transformer_blocks.0.attn1.to_k.weight",
|
413 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_v.weight": "blocks.17.transformer_blocks.0.attn1.to_v.weight",
|
414 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.17.transformer_blocks.0.attn1.to_out.weight",
|
415 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.17.transformer_blocks.0.attn1.to_out.bias",
|
416 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.17.transformer_blocks.0.act_fn.proj.weight",
|
417 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.17.transformer_blocks.0.act_fn.proj.bias",
|
418 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.weight": "blocks.17.transformer_blocks.0.ff.weight",
|
419 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.ff.net.2.bias": "blocks.17.transformer_blocks.0.ff.bias",
|
420 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_q.weight": "blocks.17.transformer_blocks.0.attn2.to_q.weight",
|
421 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_k.weight": "blocks.17.transformer_blocks.0.attn2.to_k.weight",
|
422 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_v.weight": "blocks.17.transformer_blocks.0.attn2.to_v.weight",
|
423 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.17.transformer_blocks.0.attn2.to_out.weight",
|
424 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.17.transformer_blocks.0.attn2.to_out.bias",
|
425 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.norm1.weight": "blocks.17.transformer_blocks.0.norm1.weight",
|
426 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.norm1.bias": "blocks.17.transformer_blocks.0.norm1.bias",
|
427 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.norm2.weight": "blocks.17.transformer_blocks.0.norm2.weight",
|
428 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.norm2.bias": "blocks.17.transformer_blocks.0.norm2.bias",
|
429 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.norm3.weight": "blocks.17.transformer_blocks.0.norm3.weight",
|
430 |
+
"control_model.input_blocks.7.1.transformer_blocks.0.norm3.bias": "blocks.17.transformer_blocks.0.norm3.bias",
|
431 |
+
"control_model.input_blocks.7.1.proj_out.weight": "blocks.17.proj_out.weight",
|
432 |
+
"control_model.input_blocks.7.1.proj_out.bias": "blocks.17.proj_out.bias",
|
433 |
+
"control_model.input_blocks.8.0.in_layers.0.weight": "blocks.19.norm1.weight",
|
434 |
+
"control_model.input_blocks.8.0.in_layers.0.bias": "blocks.19.norm1.bias",
|
435 |
+
"control_model.input_blocks.8.0.in_layers.2.weight": "blocks.19.conv1.weight",
|
436 |
+
"control_model.input_blocks.8.0.in_layers.2.bias": "blocks.19.conv1.bias",
|
437 |
+
"control_model.input_blocks.8.0.emb_layers.1.weight": "blocks.19.time_emb_proj.weight",
|
438 |
+
"control_model.input_blocks.8.0.emb_layers.1.bias": "blocks.19.time_emb_proj.bias",
|
439 |
+
"control_model.input_blocks.8.0.out_layers.0.weight": "blocks.19.norm2.weight",
|
440 |
+
"control_model.input_blocks.8.0.out_layers.0.bias": "blocks.19.norm2.bias",
|
441 |
+
"control_model.input_blocks.8.0.out_layers.3.weight": "blocks.19.conv2.weight",
|
442 |
+
"control_model.input_blocks.8.0.out_layers.3.bias": "blocks.19.conv2.bias",
|
443 |
+
"control_model.input_blocks.8.1.norm.weight": "blocks.20.norm.weight",
|
444 |
+
"control_model.input_blocks.8.1.norm.bias": "blocks.20.norm.bias",
|
445 |
+
"control_model.input_blocks.8.1.proj_in.weight": "blocks.20.proj_in.weight",
|
446 |
+
"control_model.input_blocks.8.1.proj_in.bias": "blocks.20.proj_in.bias",
|
447 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_q.weight": "blocks.20.transformer_blocks.0.attn1.to_q.weight",
|
448 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_k.weight": "blocks.20.transformer_blocks.0.attn1.to_k.weight",
|
449 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_v.weight": "blocks.20.transformer_blocks.0.attn1.to_v.weight",
|
450 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.20.transformer_blocks.0.attn1.to_out.weight",
|
451 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.20.transformer_blocks.0.attn1.to_out.bias",
|
452 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.20.transformer_blocks.0.act_fn.proj.weight",
|
453 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.20.transformer_blocks.0.act_fn.proj.bias",
|
454 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.weight": "blocks.20.transformer_blocks.0.ff.weight",
|
455 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.ff.net.2.bias": "blocks.20.transformer_blocks.0.ff.bias",
|
456 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_q.weight": "blocks.20.transformer_blocks.0.attn2.to_q.weight",
|
457 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_k.weight": "blocks.20.transformer_blocks.0.attn2.to_k.weight",
|
458 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_v.weight": "blocks.20.transformer_blocks.0.attn2.to_v.weight",
|
459 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.20.transformer_blocks.0.attn2.to_out.weight",
|
460 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.20.transformer_blocks.0.attn2.to_out.bias",
|
461 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.norm1.weight": "blocks.20.transformer_blocks.0.norm1.weight",
|
462 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.norm1.bias": "blocks.20.transformer_blocks.0.norm1.bias",
|
463 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.norm2.weight": "blocks.20.transformer_blocks.0.norm2.weight",
|
464 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.norm2.bias": "blocks.20.transformer_blocks.0.norm2.bias",
|
465 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.norm3.weight": "blocks.20.transformer_blocks.0.norm3.weight",
|
466 |
+
"control_model.input_blocks.8.1.transformer_blocks.0.norm3.bias": "blocks.20.transformer_blocks.0.norm3.bias",
|
467 |
+
"control_model.input_blocks.8.1.proj_out.weight": "blocks.20.proj_out.weight",
|
468 |
+
"control_model.input_blocks.8.1.proj_out.bias": "blocks.20.proj_out.bias",
|
469 |
+
"control_model.input_blocks.9.0.op.weight": "blocks.22.conv.weight",
|
470 |
+
"control_model.input_blocks.9.0.op.bias": "blocks.22.conv.bias",
|
471 |
+
"control_model.input_blocks.10.0.in_layers.0.weight": "blocks.24.norm1.weight",
|
472 |
+
"control_model.input_blocks.10.0.in_layers.0.bias": "blocks.24.norm1.bias",
|
473 |
+
"control_model.input_blocks.10.0.in_layers.2.weight": "blocks.24.conv1.weight",
|
474 |
+
"control_model.input_blocks.10.0.in_layers.2.bias": "blocks.24.conv1.bias",
|
475 |
+
"control_model.input_blocks.10.0.emb_layers.1.weight": "blocks.24.time_emb_proj.weight",
|
476 |
+
"control_model.input_blocks.10.0.emb_layers.1.bias": "blocks.24.time_emb_proj.bias",
|
477 |
+
"control_model.input_blocks.10.0.out_layers.0.weight": "blocks.24.norm2.weight",
|
478 |
+
"control_model.input_blocks.10.0.out_layers.0.bias": "blocks.24.norm2.bias",
|
479 |
+
"control_model.input_blocks.10.0.out_layers.3.weight": "blocks.24.conv2.weight",
|
480 |
+
"control_model.input_blocks.10.0.out_layers.3.bias": "blocks.24.conv2.bias",
|
481 |
+
"control_model.input_blocks.11.0.in_layers.0.weight": "blocks.26.norm1.weight",
|
482 |
+
"control_model.input_blocks.11.0.in_layers.0.bias": "blocks.26.norm1.bias",
|
483 |
+
"control_model.input_blocks.11.0.in_layers.2.weight": "blocks.26.conv1.weight",
|
484 |
+
"control_model.input_blocks.11.0.in_layers.2.bias": "blocks.26.conv1.bias",
|
485 |
+
"control_model.input_blocks.11.0.emb_layers.1.weight": "blocks.26.time_emb_proj.weight",
|
486 |
+
"control_model.input_blocks.11.0.emb_layers.1.bias": "blocks.26.time_emb_proj.bias",
|
487 |
+
"control_model.input_blocks.11.0.out_layers.0.weight": "blocks.26.norm2.weight",
|
488 |
+
"control_model.input_blocks.11.0.out_layers.0.bias": "blocks.26.norm2.bias",
|
489 |
+
"control_model.input_blocks.11.0.out_layers.3.weight": "blocks.26.conv2.weight",
|
490 |
+
"control_model.input_blocks.11.0.out_layers.3.bias": "blocks.26.conv2.bias",
|
491 |
+
"control_model.zero_convs.0.0.weight": "controlnet_blocks.0.weight",
|
492 |
+
"control_model.zero_convs.0.0.bias": "controlnet_blocks.0.bias",
|
493 |
+
"control_model.zero_convs.1.0.weight": "controlnet_blocks.1.weight",
|
494 |
+
"control_model.zero_convs.1.0.bias": "controlnet_blocks.0.bias",
|
495 |
+
"control_model.zero_convs.2.0.weight": "controlnet_blocks.2.weight",
|
496 |
+
"control_model.zero_convs.2.0.bias": "controlnet_blocks.0.bias",
|
497 |
+
"control_model.zero_convs.3.0.weight": "controlnet_blocks.3.weight",
|
498 |
+
"control_model.zero_convs.3.0.bias": "controlnet_blocks.0.bias",
|
499 |
+
"control_model.zero_convs.4.0.weight": "controlnet_blocks.4.weight",
|
500 |
+
"control_model.zero_convs.4.0.bias": "controlnet_blocks.4.bias",
|
501 |
+
"control_model.zero_convs.5.0.weight": "controlnet_blocks.5.weight",
|
502 |
+
"control_model.zero_convs.5.0.bias": "controlnet_blocks.4.bias",
|
503 |
+
"control_model.zero_convs.6.0.weight": "controlnet_blocks.6.weight",
|
504 |
+
"control_model.zero_convs.6.0.bias": "controlnet_blocks.4.bias",
|
505 |
+
"control_model.zero_convs.7.0.weight": "controlnet_blocks.7.weight",
|
506 |
+
"control_model.zero_convs.7.0.bias": "controlnet_blocks.7.bias",
|
507 |
+
"control_model.zero_convs.8.0.weight": "controlnet_blocks.8.weight",
|
508 |
+
"control_model.zero_convs.8.0.bias": "controlnet_blocks.7.bias",
|
509 |
+
"control_model.zero_convs.9.0.weight": "controlnet_blocks.9.weight",
|
510 |
+
"control_model.zero_convs.9.0.bias": "controlnet_blocks.7.bias",
|
511 |
+
"control_model.zero_convs.10.0.weight": "controlnet_blocks.10.weight",
|
512 |
+
"control_model.zero_convs.10.0.bias": "controlnet_blocks.7.bias",
|
513 |
+
"control_model.zero_convs.11.0.weight": "controlnet_blocks.11.weight",
|
514 |
+
"control_model.zero_convs.11.0.bias": "controlnet_blocks.7.bias",
|
515 |
+
"control_model.input_hint_block.0.weight": "controlnet_conv_in.blocks.0.weight",
|
516 |
+
"control_model.input_hint_block.0.bias": "controlnet_conv_in.blocks.0.bias",
|
517 |
+
"control_model.input_hint_block.2.weight": "controlnet_conv_in.blocks.2.weight",
|
518 |
+
"control_model.input_hint_block.2.bias": "controlnet_conv_in.blocks.2.bias",
|
519 |
+
"control_model.input_hint_block.4.weight": "controlnet_conv_in.blocks.4.weight",
|
520 |
+
"control_model.input_hint_block.4.bias": "controlnet_conv_in.blocks.4.bias",
|
521 |
+
"control_model.input_hint_block.6.weight": "controlnet_conv_in.blocks.6.weight",
|
522 |
+
"control_model.input_hint_block.6.bias": "controlnet_conv_in.blocks.6.bias",
|
523 |
+
"control_model.input_hint_block.8.weight": "controlnet_conv_in.blocks.8.weight",
|
524 |
+
"control_model.input_hint_block.8.bias": "controlnet_conv_in.blocks.8.bias",
|
525 |
+
"control_model.input_hint_block.10.weight": "controlnet_conv_in.blocks.10.weight",
|
526 |
+
"control_model.input_hint_block.10.bias": "controlnet_conv_in.blocks.10.bias",
|
527 |
+
"control_model.input_hint_block.12.weight": "controlnet_conv_in.blocks.12.weight",
|
528 |
+
"control_model.input_hint_block.12.bias": "controlnet_conv_in.blocks.12.bias",
|
529 |
+
"control_model.input_hint_block.14.weight": "controlnet_conv_in.blocks.14.weight",
|
530 |
+
"control_model.input_hint_block.14.bias": "controlnet_conv_in.blocks.14.bias",
|
531 |
+
"control_model.middle_block.0.in_layers.0.weight": "blocks.28.norm1.weight",
|
532 |
+
"control_model.middle_block.0.in_layers.0.bias": "blocks.28.norm1.bias",
|
533 |
+
"control_model.middle_block.0.in_layers.2.weight": "blocks.28.conv1.weight",
|
534 |
+
"control_model.middle_block.0.in_layers.2.bias": "blocks.28.conv1.bias",
|
535 |
+
"control_model.middle_block.0.emb_layers.1.weight": "blocks.28.time_emb_proj.weight",
|
536 |
+
"control_model.middle_block.0.emb_layers.1.bias": "blocks.28.time_emb_proj.bias",
|
537 |
+
"control_model.middle_block.0.out_layers.0.weight": "blocks.28.norm2.weight",
|
538 |
+
"control_model.middle_block.0.out_layers.0.bias": "blocks.28.norm2.bias",
|
539 |
+
"control_model.middle_block.0.out_layers.3.weight": "blocks.28.conv2.weight",
|
540 |
+
"control_model.middle_block.0.out_layers.3.bias": "blocks.28.conv2.bias",
|
541 |
+
"control_model.middle_block.1.norm.weight": "blocks.29.norm.weight",
|
542 |
+
"control_model.middle_block.1.norm.bias": "blocks.29.norm.bias",
|
543 |
+
"control_model.middle_block.1.proj_in.weight": "blocks.29.proj_in.weight",
|
544 |
+
"control_model.middle_block.1.proj_in.bias": "blocks.29.proj_in.bias",
|
545 |
+
"control_model.middle_block.1.transformer_blocks.0.attn1.to_q.weight": "blocks.29.transformer_blocks.0.attn1.to_q.weight",
|
546 |
+
"control_model.middle_block.1.transformer_blocks.0.attn1.to_k.weight": "blocks.29.transformer_blocks.0.attn1.to_k.weight",
|
547 |
+
"control_model.middle_block.1.transformer_blocks.0.attn1.to_v.weight": "blocks.29.transformer_blocks.0.attn1.to_v.weight",
|
548 |
+
"control_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.weight": "blocks.29.transformer_blocks.0.attn1.to_out.weight",
|
549 |
+
"control_model.middle_block.1.transformer_blocks.0.attn1.to_out.0.bias": "blocks.29.transformer_blocks.0.attn1.to_out.bias",
|
550 |
+
"control_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.weight": "blocks.29.transformer_blocks.0.act_fn.proj.weight",
|
551 |
+
"control_model.middle_block.1.transformer_blocks.0.ff.net.0.proj.bias": "blocks.29.transformer_blocks.0.act_fn.proj.bias",
|
552 |
+
"control_model.middle_block.1.transformer_blocks.0.ff.net.2.weight": "blocks.29.transformer_blocks.0.ff.weight",
|
553 |
+
"control_model.middle_block.1.transformer_blocks.0.ff.net.2.bias": "blocks.29.transformer_blocks.0.ff.bias",
|
554 |
+
"control_model.middle_block.1.transformer_blocks.0.attn2.to_q.weight": "blocks.29.transformer_blocks.0.attn2.to_q.weight",
|
555 |
+
"control_model.middle_block.1.transformer_blocks.0.attn2.to_k.weight": "blocks.29.transformer_blocks.0.attn2.to_k.weight",
|
556 |
+
"control_model.middle_block.1.transformer_blocks.0.attn2.to_v.weight": "blocks.29.transformer_blocks.0.attn2.to_v.weight",
|
557 |
+
"control_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.weight": "blocks.29.transformer_blocks.0.attn2.to_out.weight",
|
558 |
+
"control_model.middle_block.1.transformer_blocks.0.attn2.to_out.0.bias": "blocks.29.transformer_blocks.0.attn2.to_out.bias",
|
559 |
+
"control_model.middle_block.1.transformer_blocks.0.norm1.weight": "blocks.29.transformer_blocks.0.norm1.weight",
|
560 |
+
"control_model.middle_block.1.transformer_blocks.0.norm1.bias": "blocks.29.transformer_blocks.0.norm1.bias",
|
561 |
+
"control_model.middle_block.1.transformer_blocks.0.norm2.weight": "blocks.29.transformer_blocks.0.norm2.weight",
|
562 |
+
"control_model.middle_block.1.transformer_blocks.0.norm2.bias": "blocks.29.transformer_blocks.0.norm2.bias",
|
563 |
+
"control_model.middle_block.1.transformer_blocks.0.norm3.weight": "blocks.29.transformer_blocks.0.norm3.weight",
|
564 |
+
"control_model.middle_block.1.transformer_blocks.0.norm3.bias": "blocks.29.transformer_blocks.0.norm3.bias",
|
565 |
+
"control_model.middle_block.1.proj_out.weight": "blocks.29.proj_out.weight",
|
566 |
+
"control_model.middle_block.1.proj_out.bias": "blocks.29.proj_out.bias",
|
567 |
+
"control_model.middle_block.2.in_layers.0.weight": "blocks.30.norm1.weight",
|
568 |
+
"control_model.middle_block.2.in_layers.0.bias": "blocks.30.norm1.bias",
|
569 |
+
"control_model.middle_block.2.in_layers.2.weight": "blocks.30.conv1.weight",
|
570 |
+
"control_model.middle_block.2.in_layers.2.bias": "blocks.30.conv1.bias",
|
571 |
+
"control_model.middle_block.2.emb_layers.1.weight": "blocks.30.time_emb_proj.weight",
|
572 |
+
"control_model.middle_block.2.emb_layers.1.bias": "blocks.30.time_emb_proj.bias",
|
573 |
+
"control_model.middle_block.2.out_layers.0.weight": "blocks.30.norm2.weight",
|
574 |
+
"control_model.middle_block.2.out_layers.0.bias": "blocks.30.norm2.bias",
|
575 |
+
"control_model.middle_block.2.out_layers.3.weight": "blocks.30.conv2.weight",
|
576 |
+
"control_model.middle_block.2.out_layers.3.bias": "blocks.30.conv2.bias",
|
577 |
+
"control_model.middle_block_out.0.weight": "controlnet_blocks.12.weight",
|
578 |
+
"control_model.middle_block_out.0.bias": "controlnet_blocks.7.bias",
|
579 |
+
}
|
580 |
+
state_dict_ = {}
|
581 |
+
for name in state_dict:
|
582 |
+
if name in rename_dict:
|
583 |
+
param = state_dict[name]
|
584 |
+
if ".proj_in." in name or ".proj_out." in name:
|
585 |
+
param = param.squeeze()
|
586 |
+
state_dict_[rename_dict[name]] = param
|
587 |
+
return state_dict_
|
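Note on the converter above: the only reshaping it performs is the squeeze() applied to ".proj_in." and ".proj_out." parameters, because civitai/LDM ControlNet checkpoints store those projections as 1x1 convolutions while the transformer blocks here expect linear-layer weights. A minimal standalone sketch of that step, using a hypothetical tensor rather than a real checkpoint:

import torch

# hypothetical 1x1-conv projection weight as stored in an LDM-style checkpoint: (out_ch, in_ch, 1, 1)
conv_weight = torch.randn(320, 320, 1, 1)

# squeeze() drops the trailing singleton spatial dims, giving a (320, 320) matrix usable by torch.nn.Linear
linear_weight = conv_weight.squeeze()
print(linear_weight.shape)  # torch.Size([320, 320])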
diffsynth/models/sd_ipadapter.py
ADDED
@@ -0,0 +1,56 @@
1 |
+
from .svd_image_encoder import SVDImageEncoder
|
2 |
+
from .sdxl_ipadapter import IpAdapterImageProjModel, IpAdapterModule, SDXLIpAdapterStateDictConverter
|
3 |
+
from transformers import CLIPImageProcessor
|
4 |
+
import torch
|
5 |
+
|
6 |
+
|
7 |
+
class IpAdapterCLIPImageEmbedder(SVDImageEncoder):
|
8 |
+
def __init__(self):
|
9 |
+
super().__init__()
|
10 |
+
self.image_processor = CLIPImageProcessor()
|
11 |
+
|
12 |
+
def forward(self, image):
|
13 |
+
pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values
|
14 |
+
pixel_values = pixel_values.to(device=self.embeddings.class_embedding.device, dtype=self.embeddings.class_embedding.dtype)
|
15 |
+
return super().forward(pixel_values)
|
16 |
+
|
17 |
+
|
18 |
+
class SDIpAdapter(torch.nn.Module):
|
19 |
+
def __init__(self):
|
20 |
+
super().__init__()
|
21 |
+
shape_list = [(768, 320)] * 2 + [(768, 640)] * 2 + [(768, 1280)] * 5 + [(768, 640)] * 3 + [(768, 320)] * 3 + [(768, 1280)] * 1
|
22 |
+
self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(*shape) for shape in shape_list])
|
23 |
+
self.image_proj = IpAdapterImageProjModel(cross_attention_dim=768, clip_embeddings_dim=1024, clip_extra_context_tokens=4)
|
24 |
+
self.set_full_adapter()
|
25 |
+
|
26 |
+
def set_full_adapter(self):
|
27 |
+
block_ids = [1, 4, 9, 12, 17, 20, 40, 43, 46, 50, 53, 56, 60, 63, 66, 29]
|
28 |
+
self.call_block_id = {(i, 0): j for j, i in enumerate(block_ids)}
|
29 |
+
|
30 |
+
def set_less_adapter(self):
|
31 |
+
# IP-Adapter for SD v1.5 doesn't support this feature.
|
32 |
+
self.set_full_adapter()
|
33 |
+
|
34 |
+
def forward(self, hidden_states, scale=1.0):
|
35 |
+
hidden_states = self.image_proj(hidden_states)
|
36 |
+
hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1])
|
37 |
+
ip_kv_dict = {}
|
38 |
+
for (block_id, transformer_id) in self.call_block_id:
|
39 |
+
ipadapter_id = self.call_block_id[(block_id, transformer_id)]
|
40 |
+
ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states)
|
41 |
+
if block_id not in ip_kv_dict:
|
42 |
+
ip_kv_dict[block_id] = {}
|
43 |
+
ip_kv_dict[block_id][transformer_id] = {
|
44 |
+
"ip_k": ip_k,
|
45 |
+
"ip_v": ip_v,
|
46 |
+
"scale": scale
|
47 |
+
}
|
48 |
+
return ip_kv_dict
|
49 |
+
|
50 |
+
def state_dict_converter(self):
|
51 |
+
return SDIpAdapterStateDictConverter()
|
52 |
+
|
53 |
+
|
54 |
+
class SDIpAdapterStateDictConverter(SDXLIpAdapterStateDictConverter):
|
55 |
+
def __init__(self):
|
56 |
+
pass
|
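A note on how the dictionary returned by SDIpAdapter.forward is intended to be used: for each UNet block listed in call_block_id it supplies extra key/value projections computed from the CLIP image embedding, and the consuming cross-attention layer adds that image branch on top of its text branch, weighted by scale. The sketch below illustrates that decoupled cross-attention idea with hypothetical shapes; it is not the repo's actual attention implementation:

import torch
import torch.nn.functional as F

def decoupled_cross_attention(q, k_text, v_text, ip_k, ip_v, scale=1.0):
    # text-conditioned attention plus a scaled image-conditioned branch (IP-Adapter style)
    text_out = F.scaled_dot_product_attention(q, k_text, v_text)
    image_out = F.scaled_dot_product_attention(q, ip_k, ip_v)
    return text_out + scale * image_out

q = torch.randn(1, 8, 4096, 40)     # hypothetical (batch, heads, latent tokens, head_dim)
k_text = torch.randn(1, 8, 77, 40)  # 77 text tokens
v_text = torch.randn(1, 8, 77, 40)
ip_k = torch.randn(1, 8, 4, 40)     # 4 image tokens from IpAdapterImageProjModel
ip_v = torch.randn(1, 8, 4, 40)
out = decoupled_cross_attention(q, k_text, v_text, ip_k, ip_v, scale=1.0)
print(out.shape)  # torch.Size([1, 8, 4096, 40])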
diffsynth/models/sd_lora.py
ADDED
@@ -0,0 +1,60 @@
1 |
+
import torch
|
2 |
+
from .sd_unet import SDUNetStateDictConverter, SDUNet
|
3 |
+
from .sd_text_encoder import SDTextEncoderStateDictConverter, SDTextEncoder
|
4 |
+
|
5 |
+
|
6 |
+
class SDLoRA:
|
7 |
+
def __init__(self):
|
8 |
+
pass
|
9 |
+
|
10 |
+
def convert_state_dict(self, state_dict, lora_prefix="lora_unet_", alpha=1.0, device="cuda"):
|
11 |
+
special_keys = {
|
12 |
+
"down.blocks": "down_blocks",
|
13 |
+
"up.blocks": "up_blocks",
|
14 |
+
"mid.block": "mid_block",
|
15 |
+
"proj.in": "proj_in",
|
16 |
+
"proj.out": "proj_out",
|
17 |
+
"transformer.blocks": "transformer_blocks",
|
18 |
+
"to.q": "to_q",
|
19 |
+
"to.k": "to_k",
|
20 |
+
"to.v": "to_v",
|
21 |
+
"to.out": "to_out",
|
22 |
+
}
|
23 |
+
state_dict_ = {}
|
24 |
+
for key in state_dict:
|
25 |
+
if ".lora_up" not in key:
|
26 |
+
continue
|
27 |
+
if not key.startswith(lora_prefix):
|
28 |
+
continue
|
29 |
+
weight_up = state_dict[key].to(device=device, dtype=torch.float16)
|
30 |
+
weight_down = state_dict[key.replace(".lora_up", ".lora_down")].to(device=device, dtype=torch.float16)
|
31 |
+
if len(weight_up.shape) == 4:
|
32 |
+
weight_up = weight_up.squeeze(3).squeeze(2).to(torch.float32)
|
33 |
+
weight_down = weight_down.squeeze(3).squeeze(2).to(torch.float32)
|
34 |
+
lora_weight = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
|
35 |
+
else:
|
36 |
+
lora_weight = alpha * torch.mm(weight_up, weight_down)
|
37 |
+
target_name = key.split(".")[0].replace("_", ".")[len(lora_prefix):] + ".weight"
|
38 |
+
for special_key in special_keys:
|
39 |
+
target_name = target_name.replace(special_key, special_keys[special_key])
|
40 |
+
state_dict_[target_name] = lora_weight.cpu()
|
41 |
+
return state_dict_
|
42 |
+
|
43 |
+
def add_lora_to_unet(self, unet: SDUNet, state_dict_lora, alpha=1.0, device="cuda"):
|
44 |
+
state_dict_unet = unet.state_dict()
|
45 |
+
state_dict_lora = self.convert_state_dict(state_dict_lora, lora_prefix="lora_unet_", alpha=alpha, device=device)
|
46 |
+
state_dict_lora = SDUNetStateDictConverter().from_diffusers(state_dict_lora)
|
47 |
+
if len(state_dict_lora) > 0:
|
48 |
+
for name in state_dict_lora:
|
49 |
+
state_dict_unet[name] += state_dict_lora[name].to(device=device)
|
50 |
+
unet.load_state_dict(state_dict_unet)
|
51 |
+
|
52 |
+
def add_lora_to_text_encoder(self, text_encoder: SDTextEncoder, state_dict_lora, alpha=1.0, device="cuda"):
|
53 |
+
state_dict_text_encoder = text_encoder.state_dict()
|
54 |
+
state_dict_lora = self.convert_state_dict(state_dict_lora, lora_prefix="lora_te_", alpha=alpha, device=device)
|
55 |
+
state_dict_lora = SDTextEncoderStateDictConverter().from_diffusers(state_dict_lora)
|
56 |
+
if len(state_dict_lora) > 0:
|
57 |
+
for name in state_dict_lora:
|
58 |
+
state_dict_text_encoder[name] += state_dict_lora[name].to(device=device)
|
59 |
+
text_encoder.load_state_dict(state_dict_text_encoder)
|
60 |
+
|
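The merge performed by SDLoRA.convert_state_dict is the standard LoRA update: each targeted weight receives W + alpha * (lora_up @ lora_down), with convolution weights temporarily flattened to 2D and unsqueezed back to 4D afterwards. A self-contained sketch of the linear case, with a hypothetical rank and layer shape:

import torch

rank, d_out, d_in = 4, 320, 768          # hypothetical LoRA rank and target layer shape
weight = torch.randn(d_out, d_in)         # original weight W
lora_down = torch.randn(rank, d_in)       # "lora_down" factor
lora_up = torch.randn(d_out, rank)        # "lora_up" factor
alpha = 1.0

merged = weight + alpha * torch.mm(lora_up, lora_down)   # W' = W + alpha * (up @ down)
print(merged.shape)  # torch.Size([320, 768])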
diffsynth/models/sd_motion.py
ADDED
@@ -0,0 +1,198 @@
1 |
+
from .sd_unet import SDUNet, Attention, GEGLU
|
2 |
+
import torch
|
3 |
+
from einops import rearrange, repeat
|
4 |
+
|
5 |
+
|
6 |
+
class TemporalTransformerBlock(torch.nn.Module):
|
7 |
+
|
8 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, max_position_embeddings=32):
|
9 |
+
super().__init__()
|
10 |
+
|
11 |
+
# 1. Self-Attn
|
12 |
+
self.pe1 = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, dim))
|
13 |
+
self.norm1 = torch.nn.LayerNorm(dim, elementwise_affine=True)
|
14 |
+
self.attn1 = Attention(q_dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, bias_out=True)
|
15 |
+
|
16 |
+
# 2. Cross-Attn
|
17 |
+
self.pe2 = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, dim))
|
18 |
+
self.norm2 = torch.nn.LayerNorm(dim, elementwise_affine=True)
|
19 |
+
self.attn2 = Attention(q_dim=dim, num_heads=num_attention_heads, head_dim=attention_head_dim, bias_out=True)
|
20 |
+
|
21 |
+
# 3. Feed-forward
|
22 |
+
self.norm3 = torch.nn.LayerNorm(dim, elementwise_affine=True)
|
23 |
+
self.act_fn = GEGLU(dim, dim * 4)
|
24 |
+
self.ff = torch.nn.Linear(dim * 4, dim)
|
25 |
+
|
26 |
+
|
27 |
+
def forward(self, hidden_states, batch_size=1):
|
28 |
+
|
29 |
+
# 1. Self-Attention
|
30 |
+
norm_hidden_states = self.norm1(hidden_states)
|
31 |
+
norm_hidden_states = rearrange(norm_hidden_states, "(b f) h c -> (b h) f c", b=batch_size)
|
32 |
+
attn_output = self.attn1(norm_hidden_states + self.pe1[:, :norm_hidden_states.shape[1]])
|
33 |
+
attn_output = rearrange(attn_output, "(b h) f c -> (b f) h c", b=batch_size)
|
34 |
+
hidden_states = attn_output + hidden_states
|
35 |
+
|
36 |
+
# 2. Cross-Attention
|
37 |
+
norm_hidden_states = self.norm2(hidden_states)
|
38 |
+
norm_hidden_states = rearrange(norm_hidden_states, "(b f) h c -> (b h) f c", b=batch_size)
|
39 |
+
attn_output = self.attn2(norm_hidden_states + self.pe2[:, :norm_hidden_states.shape[1]])
|
40 |
+
attn_output = rearrange(attn_output, "(b h) f c -> (b f) h c", b=batch_size)
|
41 |
+
hidden_states = attn_output + hidden_states
|
42 |
+
|
43 |
+
# 3. Feed-forward
|
44 |
+
norm_hidden_states = self.norm3(hidden_states)
|
45 |
+
ff_output = self.act_fn(norm_hidden_states)
|
46 |
+
ff_output = self.ff(ff_output)
|
47 |
+
hidden_states = ff_output + hidden_states
|
48 |
+
|
49 |
+
return hidden_states
|
50 |
+
|
51 |
+
|
52 |
+
class TemporalBlock(torch.nn.Module):
|
53 |
+
|
54 |
+
def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5):
|
55 |
+
super().__init__()
|
56 |
+
inner_dim = num_attention_heads * attention_head_dim
|
57 |
+
|
58 |
+
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
|
59 |
+
self.proj_in = torch.nn.Linear(in_channels, inner_dim)
|
60 |
+
|
61 |
+
self.transformer_blocks = torch.nn.ModuleList([
|
62 |
+
TemporalTransformerBlock(
|
63 |
+
inner_dim,
|
64 |
+
num_attention_heads,
|
65 |
+
attention_head_dim
|
66 |
+
)
|
67 |
+
for d in range(num_layers)
|
68 |
+
])
|
69 |
+
|
70 |
+
self.proj_out = torch.nn.Linear(inner_dim, in_channels)
|
71 |
+
|
72 |
+
def forward(self, hidden_states, time_emb, text_emb, res_stack, batch_size=1):
|
73 |
+
batch, _, height, width = hidden_states.shape
|
74 |
+
residual = hidden_states
|
75 |
+
|
76 |
+
hidden_states = self.norm(hidden_states)
|
77 |
+
inner_dim = hidden_states.shape[1]
|
78 |
+
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
|
79 |
+
hidden_states = self.proj_in(hidden_states)
|
80 |
+
|
81 |
+
for block in self.transformer_blocks:
|
82 |
+
hidden_states = block(
|
83 |
+
hidden_states,
|
84 |
+
batch_size=batch_size
|
85 |
+
)
|
86 |
+
|
87 |
+
hidden_states = self.proj_out(hidden_states)
|
88 |
+
hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
|
89 |
+
hidden_states = hidden_states + residual
|
90 |
+
|
91 |
+
return hidden_states, time_emb, text_emb, res_stack
|
92 |
+
|
93 |
+
|
94 |
+
class SDMotionModel(torch.nn.Module):
|
95 |
+
def __init__(self):
|
96 |
+
super().__init__()
|
97 |
+
self.motion_modules = torch.nn.ModuleList([
|
98 |
+
TemporalBlock(8, 40, 320, eps=1e-6),
|
99 |
+
TemporalBlock(8, 40, 320, eps=1e-6),
|
100 |
+
TemporalBlock(8, 80, 640, eps=1e-6),
|
101 |
+
TemporalBlock(8, 80, 640, eps=1e-6),
|
102 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
103 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
104 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
105 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
106 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
107 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
108 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
109 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
110 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
111 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
112 |
+
TemporalBlock(8, 160, 1280, eps=1e-6),
|
113 |
+
TemporalBlock(8, 80, 640, eps=1e-6),
|
114 |
+
TemporalBlock(8, 80, 640, eps=1e-6),
|
115 |
+
TemporalBlock(8, 80, 640, eps=1e-6),
|
116 |
+
TemporalBlock(8, 40, 320, eps=1e-6),
|
117 |
+
TemporalBlock(8, 40, 320, eps=1e-6),
|
118 |
+
TemporalBlock(8, 40, 320, eps=1e-6),
|
119 |
+
])
|
120 |
+
self.call_block_id = {
|
121 |
+
1: 0,
|
122 |
+
4: 1,
|
123 |
+
9: 2,
|
124 |
+
12: 3,
|
125 |
+
17: 4,
|
126 |
+
20: 5,
|
127 |
+
24: 6,
|
128 |
+
26: 7,
|
129 |
+
29: 8,
|
130 |
+
32: 9,
|
131 |
+
34: 10,
|
132 |
+
36: 11,
|
133 |
+
40: 12,
|
134 |
+
43: 13,
|
135 |
+
46: 14,
|
136 |
+
50: 15,
|
137 |
+
53: 16,
|
138 |
+
56: 17,
|
139 |
+
60: 18,
|
140 |
+
63: 19,
|
141 |
+
66: 20
|
142 |
+
}
|
143 |
+
|
144 |
+
def forward(self):
|
145 |
+
pass
|
146 |
+
|
147 |
+
def state_dict_converter(self):
|
148 |
+
return SDMotionModelStateDictConverter()
|
149 |
+
|
150 |
+
|
151 |
+
class SDMotionModelStateDictConverter:
|
152 |
+
def __init__(self):
|
153 |
+
pass
|
154 |
+
|
155 |
+
def from_diffusers(self, state_dict):
|
156 |
+
rename_dict = {
|
157 |
+
"norm": "norm",
|
158 |
+
"proj_in": "proj_in",
|
159 |
+
"transformer_blocks.0.attention_blocks.0.to_q": "transformer_blocks.0.attn1.to_q",
|
160 |
+
"transformer_blocks.0.attention_blocks.0.to_k": "transformer_blocks.0.attn1.to_k",
|
161 |
+
"transformer_blocks.0.attention_blocks.0.to_v": "transformer_blocks.0.attn1.to_v",
|
162 |
+
"transformer_blocks.0.attention_blocks.0.to_out.0": "transformer_blocks.0.attn1.to_out",
|
163 |
+
"transformer_blocks.0.attention_blocks.0.pos_encoder": "transformer_blocks.0.pe1",
|
164 |
+
"transformer_blocks.0.attention_blocks.1.to_q": "transformer_blocks.0.attn2.to_q",
|
165 |
+
"transformer_blocks.0.attention_blocks.1.to_k": "transformer_blocks.0.attn2.to_k",
|
166 |
+
"transformer_blocks.0.attention_blocks.1.to_v": "transformer_blocks.0.attn2.to_v",
|
167 |
+
"transformer_blocks.0.attention_blocks.1.to_out.0": "transformer_blocks.0.attn2.to_out",
|
168 |
+
"transformer_blocks.0.attention_blocks.1.pos_encoder": "transformer_blocks.0.pe2",
|
169 |
+
"transformer_blocks.0.norms.0": "transformer_blocks.0.norm1",
|
170 |
+
"transformer_blocks.0.norms.1": "transformer_blocks.0.norm2",
|
171 |
+
"transformer_blocks.0.ff.net.0.proj": "transformer_blocks.0.act_fn.proj",
|
172 |
+
"transformer_blocks.0.ff.net.2": "transformer_blocks.0.ff",
|
173 |
+
"transformer_blocks.0.ff_norm": "transformer_blocks.0.norm3",
|
174 |
+
"proj_out": "proj_out",
|
175 |
+
}
|
176 |
+
name_list = sorted([i for i in state_dict if i.startswith("down_blocks.")])
|
177 |
+
name_list += sorted([i for i in state_dict if i.startswith("mid_block.")])
|
178 |
+
name_list += sorted([i for i in state_dict if i.startswith("up_blocks.")])
|
179 |
+
state_dict_ = {}
|
180 |
+
last_prefix, module_id = "", -1
|
181 |
+
for name in name_list:
|
182 |
+
names = name.split(".")
|
183 |
+
prefix_index = names.index("temporal_transformer") + 1
|
184 |
+
prefix = ".".join(names[:prefix_index])
|
185 |
+
if prefix != last_prefix:
|
186 |
+
last_prefix = prefix
|
187 |
+
module_id += 1
|
188 |
+
middle_name = ".".join(names[prefix_index:-1])
|
189 |
+
suffix = names[-1]
|
190 |
+
if "pos_encoder" in names:
|
191 |
+
rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name]])
|
192 |
+
else:
|
193 |
+
rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name], suffix])
|
194 |
+
state_dict_[rename] = state_dict[name]
|
195 |
+
return state_dict_
|
196 |
+
|
197 |
+
def from_civitai(self, state_dict):
|
198 |
+
return self.from_diffusers(state_dict)
|
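The central trick in TemporalTransformerBlock is the rearrange around each attention call: spatial positions are folded into the batch dimension so that attention mixes information across frames only, then the original layout is restored. A small round-trip sketch with hypothetical sizes, independent of the modules above:

import torch
from einops import rearrange

batch, frames, tokens, channels = 1, 16, 64, 320     # hypothetical video batch
x = torch.randn(batch * frames, tokens, channels)     # "(b f) h c" layout used inside the UNet

temporal = rearrange(x, "(b f) h c -> (b h) f c", b=batch)   # attention now runs over the 16 frames per spatial token
assert temporal.shape == (batch * tokens, frames, channels)

x_back = rearrange(temporal, "(b h) f c -> (b f) h c", b=batch)
assert torch.equal(x, x_back)   # the reshape is a pure permutation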
diffsynth/models/sd_text_encoder.py
ADDED
@@ -0,0 +1,320 @@
1 |
+
import torch
|
2 |
+
from .attention import Attention
|
3 |
+
|
4 |
+
|
5 |
+
class CLIPEncoderLayer(torch.nn.Module):
|
6 |
+
def __init__(self, embed_dim, intermediate_size, num_heads=12, head_dim=64, use_quick_gelu=True):
|
7 |
+
super().__init__()
|
8 |
+
self.attn = Attention(q_dim=embed_dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
|
9 |
+
self.layer_norm1 = torch.nn.LayerNorm(embed_dim)
|
10 |
+
self.layer_norm2 = torch.nn.LayerNorm(embed_dim)
|
11 |
+
self.fc1 = torch.nn.Linear(embed_dim, intermediate_size)
|
12 |
+
self.fc2 = torch.nn.Linear(intermediate_size, embed_dim)
|
13 |
+
|
14 |
+
self.use_quick_gelu = use_quick_gelu
|
15 |
+
|
16 |
+
def quickGELU(self, x):
|
17 |
+
return x * torch.sigmoid(1.702 * x)
|
18 |
+
|
19 |
+
def forward(self, hidden_states, attn_mask=None):
|
20 |
+
residual = hidden_states
|
21 |
+
|
22 |
+
hidden_states = self.layer_norm1(hidden_states)
|
23 |
+
hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
|
24 |
+
hidden_states = residual + hidden_states
|
25 |
+
|
26 |
+
residual = hidden_states
|
27 |
+
hidden_states = self.layer_norm2(hidden_states)
|
28 |
+
hidden_states = self.fc1(hidden_states)
|
29 |
+
if self.use_quick_gelu:
|
30 |
+
hidden_states = self.quickGELU(hidden_states)
|
31 |
+
else:
|
32 |
+
hidden_states = torch.nn.functional.gelu(hidden_states)
|
33 |
+
hidden_states = self.fc2(hidden_states)
|
34 |
+
hidden_states = residual + hidden_states
|
35 |
+
|
36 |
+
return hidden_states
|
37 |
+
|
38 |
+
|
39 |
+
class SDTextEncoder(torch.nn.Module):
|
40 |
+
def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=12, encoder_intermediate_size=3072):
|
41 |
+
super().__init__()
|
42 |
+
|
43 |
+
# token_embedding
|
44 |
+
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
|
45 |
+
|
46 |
+
# position_embeds (This is a fixed tensor)
|
47 |
+
self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
|
48 |
+
|
49 |
+
# encoders
|
50 |
+
self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
|
51 |
+
|
52 |
+
# attn_mask
|
53 |
+
self.attn_mask = self.attention_mask(max_position_embeddings)
|
54 |
+
|
55 |
+
# final_layer_norm
|
56 |
+
self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
|
57 |
+
|
58 |
+
def attention_mask(self, length):
|
59 |
+
mask = torch.empty(length, length)
|
60 |
+
mask.fill_(float("-inf"))
|
61 |
+
mask.triu_(1)
|
62 |
+
return mask
|
63 |
+
|
64 |
+
def forward(self, input_ids, clip_skip=1):
|
65 |
+
embeds = self.token_embedding(input_ids) + self.position_embeds
|
66 |
+
attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
|
67 |
+
for encoder_id, encoder in enumerate(self.encoders):
|
68 |
+
embeds = encoder(embeds, attn_mask=attn_mask)
|
69 |
+
if encoder_id + clip_skip == len(self.encoders):
|
70 |
+
break
|
71 |
+
embeds = self.final_layer_norm(embeds)
|
72 |
+
return embeds
|
73 |
+
|
74 |
+
def state_dict_converter(self):
|
75 |
+
return SDTextEncoderStateDictConverter()
|
76 |
+
|
77 |
+
|
78 |
+
class SDTextEncoderStateDictConverter:
|
79 |
+
def __init__(self):
|
80 |
+
pass
|
81 |
+
|
82 |
+
def from_diffusers(self, state_dict):
|
83 |
+
rename_dict = {
|
84 |
+
"text_model.embeddings.token_embedding.weight": "token_embedding.weight",
|
85 |
+
"text_model.embeddings.position_embedding.weight": "position_embeds",
|
86 |
+
"text_model.final_layer_norm.weight": "final_layer_norm.weight",
|
87 |
+
"text_model.final_layer_norm.bias": "final_layer_norm.bias"
|
88 |
+
}
|
89 |
+
attn_rename_dict = {
|
90 |
+
"self_attn.q_proj": "attn.to_q",
|
91 |
+
"self_attn.k_proj": "attn.to_k",
|
92 |
+
"self_attn.v_proj": "attn.to_v",
|
93 |
+
"self_attn.out_proj": "attn.to_out",
|
94 |
+
"layer_norm1": "layer_norm1",
|
95 |
+
"layer_norm2": "layer_norm2",
|
96 |
+
"mlp.fc1": "fc1",
|
97 |
+
"mlp.fc2": "fc2",
|
98 |
+
}
|
99 |
+
state_dict_ = {}
|
100 |
+
for name in state_dict:
|
101 |
+
if name in rename_dict:
|
102 |
+
param = state_dict[name]
|
103 |
+
if name == "text_model.embeddings.position_embedding.weight":
|
104 |
+
param = param.reshape((1, param.shape[0], param.shape[1]))
|
105 |
+
state_dict_[rename_dict[name]] = param
|
106 |
+
elif name.startswith("text_model.encoder.layers."):
|
107 |
+
param = state_dict[name]
|
108 |
+
names = name.split(".")
|
109 |
+
layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
|
110 |
+
name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
|
111 |
+
state_dict_[name_] = param
|
112 |
+
return state_dict_
|
113 |
+
|
114 |
+
def from_civitai(self, state_dict):
|
115 |
+
rename_dict = {
|
116 |
+
"cond_stage_model.transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
|
117 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias": "encoders.0.layer_norm1.bias",
|
118 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight": "encoders.0.layer_norm1.weight",
|
119 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias": "encoders.0.layer_norm2.bias",
|
120 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight": "encoders.0.layer_norm2.weight",
|
121 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias": "encoders.0.fc1.bias",
|
122 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight": "encoders.0.fc1.weight",
|
123 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias": "encoders.0.fc2.bias",
|
124 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight": "encoders.0.fc2.weight",
|
125 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias": "encoders.0.attn.to_k.bias",
|
126 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight": "encoders.0.attn.to_k.weight",
|
127 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias": "encoders.0.attn.to_out.bias",
|
128 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight": "encoders.0.attn.to_out.weight",
|
129 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias": "encoders.0.attn.to_q.bias",
|
130 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight": "encoders.0.attn.to_q.weight",
|
131 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias": "encoders.0.attn.to_v.bias",
|
132 |
+
"cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight": "encoders.0.attn.to_v.weight",
|
133 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias": "encoders.1.layer_norm1.bias",
|
134 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight": "encoders.1.layer_norm1.weight",
|
135 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias": "encoders.1.layer_norm2.bias",
|
136 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight": "encoders.1.layer_norm2.weight",
|
137 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias": "encoders.1.fc1.bias",
|
138 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight": "encoders.1.fc1.weight",
|
139 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias": "encoders.1.fc2.bias",
|
140 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight": "encoders.1.fc2.weight",
|
141 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias": "encoders.1.attn.to_k.bias",
|
142 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight": "encoders.1.attn.to_k.weight",
|
143 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias": "encoders.1.attn.to_out.bias",
|
144 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight": "encoders.1.attn.to_out.weight",
|
145 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias": "encoders.1.attn.to_q.bias",
|
146 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight": "encoders.1.attn.to_q.weight",
|
147 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias": "encoders.1.attn.to_v.bias",
|
148 |
+
"cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight": "encoders.1.attn.to_v.weight",
|
149 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias": "encoders.10.layer_norm1.bias",
|
150 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight": "encoders.10.layer_norm1.weight",
|
151 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias": "encoders.10.layer_norm2.bias",
|
152 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight": "encoders.10.layer_norm2.weight",
|
153 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias": "encoders.10.fc1.bias",
|
154 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight": "encoders.10.fc1.weight",
|
155 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias": "encoders.10.fc2.bias",
|
156 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight": "encoders.10.fc2.weight",
|
157 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias": "encoders.10.attn.to_k.bias",
|
158 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight": "encoders.10.attn.to_k.weight",
|
159 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias": "encoders.10.attn.to_out.bias",
|
160 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight": "encoders.10.attn.to_out.weight",
|
161 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias": "encoders.10.attn.to_q.bias",
|
162 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight": "encoders.10.attn.to_q.weight",
|
163 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias": "encoders.10.attn.to_v.bias",
|
164 |
+
"cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight": "encoders.10.attn.to_v.weight",
|
165 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias": "encoders.11.layer_norm1.bias",
|
166 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight": "encoders.11.layer_norm1.weight",
|
167 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias": "encoders.11.layer_norm2.bias",
|
168 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight": "encoders.11.layer_norm2.weight",
|
169 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias": "encoders.11.fc1.bias",
|
170 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight": "encoders.11.fc1.weight",
|
171 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias": "encoders.11.fc2.bias",
|
172 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight": "encoders.11.fc2.weight",
|
173 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias": "encoders.11.attn.to_k.bias",
|
174 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight": "encoders.11.attn.to_k.weight",
|
175 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias": "encoders.11.attn.to_out.bias",
|
176 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight": "encoders.11.attn.to_out.weight",
|
177 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias": "encoders.11.attn.to_q.bias",
|
178 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight": "encoders.11.attn.to_q.weight",
|
179 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias": "encoders.11.attn.to_v.bias",
|
180 |
+
"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight": "encoders.11.attn.to_v.weight",
|
181 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias": "encoders.2.layer_norm1.bias",
|
182 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight": "encoders.2.layer_norm1.weight",
|
183 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias": "encoders.2.layer_norm2.bias",
|
184 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight": "encoders.2.layer_norm2.weight",
|
185 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias": "encoders.2.fc1.bias",
|
186 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight": "encoders.2.fc1.weight",
|
187 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias": "encoders.2.fc2.bias",
|
188 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight": "encoders.2.fc2.weight",
|
189 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias": "encoders.2.attn.to_k.bias",
|
190 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight": "encoders.2.attn.to_k.weight",
|
191 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias": "encoders.2.attn.to_out.bias",
|
192 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight": "encoders.2.attn.to_out.weight",
|
193 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias": "encoders.2.attn.to_q.bias",
|
194 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight": "encoders.2.attn.to_q.weight",
|
195 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias": "encoders.2.attn.to_v.bias",
|
196 |
+
"cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight": "encoders.2.attn.to_v.weight",
|
197 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias": "encoders.3.layer_norm1.bias",
|
198 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight": "encoders.3.layer_norm1.weight",
|
199 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias": "encoders.3.layer_norm2.bias",
|
200 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight": "encoders.3.layer_norm2.weight",
|
201 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias": "encoders.3.fc1.bias",
|
202 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight": "encoders.3.fc1.weight",
|
203 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias": "encoders.3.fc2.bias",
|
204 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight": "encoders.3.fc2.weight",
|
205 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias": "encoders.3.attn.to_k.bias",
|
206 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight": "encoders.3.attn.to_k.weight",
|
207 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias": "encoders.3.attn.to_out.bias",
|
208 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight": "encoders.3.attn.to_out.weight",
|
209 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias": "encoders.3.attn.to_q.bias",
|
210 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight": "encoders.3.attn.to_q.weight",
|
211 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias": "encoders.3.attn.to_v.bias",
|
212 |
+
"cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight": "encoders.3.attn.to_v.weight",
|
213 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias": "encoders.4.layer_norm1.bias",
|
214 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight": "encoders.4.layer_norm1.weight",
|
215 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias": "encoders.4.layer_norm2.bias",
|
216 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight": "encoders.4.layer_norm2.weight",
|
217 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias": "encoders.4.fc1.bias",
|
218 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight": "encoders.4.fc1.weight",
|
219 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias": "encoders.4.fc2.bias",
|
220 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight": "encoders.4.fc2.weight",
|
221 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias": "encoders.4.attn.to_k.bias",
|
222 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight": "encoders.4.attn.to_k.weight",
|
223 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias": "encoders.4.attn.to_out.bias",
|
224 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight": "encoders.4.attn.to_out.weight",
|
225 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias": "encoders.4.attn.to_q.bias",
|
226 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight": "encoders.4.attn.to_q.weight",
|
227 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias": "encoders.4.attn.to_v.bias",
|
228 |
+
"cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight": "encoders.4.attn.to_v.weight",
|
229 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias": "encoders.5.layer_norm1.bias",
|
230 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight": "encoders.5.layer_norm1.weight",
|
231 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias": "encoders.5.layer_norm2.bias",
|
232 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight": "encoders.5.layer_norm2.weight",
|
233 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias": "encoders.5.fc1.bias",
|
234 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight": "encoders.5.fc1.weight",
|
235 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias": "encoders.5.fc2.bias",
|
236 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight": "encoders.5.fc2.weight",
|
237 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias": "encoders.5.attn.to_k.bias",
|
238 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight": "encoders.5.attn.to_k.weight",
|
239 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias": "encoders.5.attn.to_out.bias",
|
240 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight": "encoders.5.attn.to_out.weight",
|
241 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias": "encoders.5.attn.to_q.bias",
|
242 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight": "encoders.5.attn.to_q.weight",
|
243 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias": "encoders.5.attn.to_v.bias",
|
244 |
+
"cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight": "encoders.5.attn.to_v.weight",
|
245 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias": "encoders.6.layer_norm1.bias",
|
246 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight": "encoders.6.layer_norm1.weight",
|
247 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias": "encoders.6.layer_norm2.bias",
|
248 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight": "encoders.6.layer_norm2.weight",
|
249 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias": "encoders.6.fc1.bias",
|
250 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight": "encoders.6.fc1.weight",
|
251 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias": "encoders.6.fc2.bias",
|
252 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight": "encoders.6.fc2.weight",
|
253 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias": "encoders.6.attn.to_k.bias",
|
254 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight": "encoders.6.attn.to_k.weight",
|
255 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias": "encoders.6.attn.to_out.bias",
|
256 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight": "encoders.6.attn.to_out.weight",
|
257 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias": "encoders.6.attn.to_q.bias",
|
258 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight": "encoders.6.attn.to_q.weight",
|
259 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias": "encoders.6.attn.to_v.bias",
|
260 |
+
"cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight": "encoders.6.attn.to_v.weight",
|
261 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias": "encoders.7.layer_norm1.bias",
|
262 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight": "encoders.7.layer_norm1.weight",
|
263 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias": "encoders.7.layer_norm2.bias",
|
264 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight": "encoders.7.layer_norm2.weight",
|
265 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias": "encoders.7.fc1.bias",
|
266 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight": "encoders.7.fc1.weight",
|
267 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias": "encoders.7.fc2.bias",
|
268 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight": "encoders.7.fc2.weight",
|
269 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias": "encoders.7.attn.to_k.bias",
|
270 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight": "encoders.7.attn.to_k.weight",
|
271 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias": "encoders.7.attn.to_out.bias",
|
272 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight": "encoders.7.attn.to_out.weight",
|
273 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias": "encoders.7.attn.to_q.bias",
|
274 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight": "encoders.7.attn.to_q.weight",
|
275 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias": "encoders.7.attn.to_v.bias",
|
276 |
+
"cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight": "encoders.7.attn.to_v.weight",
|
277 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias": "encoders.8.layer_norm1.bias",
|
278 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight": "encoders.8.layer_norm1.weight",
|
279 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias": "encoders.8.layer_norm2.bias",
|
280 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight": "encoders.8.layer_norm2.weight",
|
281 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias": "encoders.8.fc1.bias",
|
282 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight": "encoders.8.fc1.weight",
|
283 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias": "encoders.8.fc2.bias",
|
284 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight": "encoders.8.fc2.weight",
|
285 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias": "encoders.8.attn.to_k.bias",
|
286 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight": "encoders.8.attn.to_k.weight",
|
287 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias": "encoders.8.attn.to_out.bias",
|
288 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight": "encoders.8.attn.to_out.weight",
|
289 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias": "encoders.8.attn.to_q.bias",
|
290 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight": "encoders.8.attn.to_q.weight",
|
291 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias": "encoders.8.attn.to_v.bias",
|
292 |
+
"cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight": "encoders.8.attn.to_v.weight",
|
293 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias": "encoders.9.layer_norm1.bias",
|
294 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight": "encoders.9.layer_norm1.weight",
|
295 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias": "encoders.9.layer_norm2.bias",
|
296 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight": "encoders.9.layer_norm2.weight",
|
297 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias": "encoders.9.fc1.bias",
|
298 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight": "encoders.9.fc1.weight",
|
299 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias": "encoders.9.fc2.bias",
|
300 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight": "encoders.9.fc2.weight",
|
301 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias": "encoders.9.attn.to_k.bias",
|
302 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight": "encoders.9.attn.to_k.weight",
|
303 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias": "encoders.9.attn.to_out.bias",
|
304 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight": "encoders.9.attn.to_out.weight",
|
305 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias": "encoders.9.attn.to_q.bias",
|
306 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight": "encoders.9.attn.to_q.weight",
|
307 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias": "encoders.9.attn.to_v.bias",
|
308 |
+
"cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight": "encoders.9.attn.to_v.weight",
|
309 |
+
"cond_stage_model.transformer.text_model.final_layer_norm.bias": "final_layer_norm.bias",
|
310 |
+
"cond_stage_model.transformer.text_model.final_layer_norm.weight": "final_layer_norm.weight",
|
311 |
+
"cond_stage_model.transformer.text_model.embeddings.position_embedding.weight": "position_embeds"
|
312 |
+
}
|
313 |
+
state_dict_ = {}
|
314 |
+
for name in state_dict:
|
315 |
+
if name in rename_dict:
|
316 |
+
param = state_dict[name]
|
317 |
+
if name == "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight":
|
318 |
+
param = param.reshape((1, param.shape[0], param.shape[1]))
|
319 |
+
state_dict_[rename_dict[name]] = param
|
320 |
+
return state_dict_
|
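Two details of SDTextEncoder worth noting: attention_mask builds a standard causal mask (strict upper triangle set to -inf), and clip_skip ends the encoder loop early, so the last clip_skip - 1 layers are skipped before final_layer_norm is applied. A quick standalone check of both behaviours, mirroring the logic above:

import torch

length, clip_skip, num_layers = 77, 2, 12

# causal mask: each token attends only to itself and earlier tokens
mask = torch.empty(length, length).fill_(float("-inf")).triu_(1)
print(mask[0, :3])   # tensor([0., -inf, -inf])

# the loop breaks once encoder_id + clip_skip == num_layers, so with clip_skip=2 only 11 of 12 layers run
layers_run = [i for i in range(num_layers) if i + clip_skip <= num_layers]
print(len(layers_run))  # 11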
diffsynth/models/sd_unet.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
diffsynth/models/sd_vae_decoder.py
ADDED
@@ -0,0 +1,332 @@
1 |
+
import torch
|
2 |
+
from .attention import Attention
|
3 |
+
from .sd_unet import ResnetBlock, UpSampler
|
4 |
+
from .tiler import TileWorker
|
5 |
+
|
6 |
+
|
7 |
+
class VAEAttentionBlock(torch.nn.Module):
|
8 |
+
|
9 |
+
def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5):
|
10 |
+
super().__init__()
|
11 |
+
inner_dim = num_attention_heads * attention_head_dim
|
12 |
+
|
13 |
+
self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
|
14 |
+
|
15 |
+
self.transformer_blocks = torch.nn.ModuleList([
|
16 |
+
Attention(
|
17 |
+
inner_dim,
|
18 |
+
num_attention_heads,
|
19 |
+
attention_head_dim,
|
20 |
+
bias_q=True,
|
21 |
+
bias_kv=True,
|
22 |
+
bias_out=True
|
23 |
+
)
|
24 |
+
for d in range(num_layers)
|
25 |
+
])
|
26 |
+
|
27 |
+
def forward(self, hidden_states, time_emb, text_emb, res_stack):
|
28 |
+
batch, _, height, width = hidden_states.shape
|
29 |
+
residual = hidden_states
|
30 |
+
|
31 |
+
hidden_states = self.norm(hidden_states)
|
32 |
+
inner_dim = hidden_states.shape[1]
|
33 |
+
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
|
34 |
+
|
35 |
+
for block in self.transformer_blocks:
|
36 |
+
hidden_states = block(hidden_states)
|
37 |
+
|
38 |
+
hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
|
39 |
+
hidden_states = hidden_states + residual
|
40 |
+
|
41 |
+
return hidden_states, time_emb, text_emb, res_stack
|
42 |
+
|
43 |
+
|
44 |
+
class SDVAEDecoder(torch.nn.Module):
|
45 |
+
def __init__(self):
|
46 |
+
super().__init__()
|
47 |
+
self.scaling_factor = 0.18215
|
48 |
+
self.post_quant_conv = torch.nn.Conv2d(4, 4, kernel_size=1)
|
49 |
+
self.conv_in = torch.nn.Conv2d(4, 512, kernel_size=3, padding=1)
|
50 |
+
|
51 |
+
self.blocks = torch.nn.ModuleList([
|
52 |
+
# UNetMidBlock2D
|
53 |
+
ResnetBlock(512, 512, eps=1e-6),
|
54 |
+
VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
|
55 |
+
ResnetBlock(512, 512, eps=1e-6),
|
56 |
+
# UpDecoderBlock2D
|
57 |
+
ResnetBlock(512, 512, eps=1e-6),
|
58 |
+
ResnetBlock(512, 512, eps=1e-6),
|
59 |
+
ResnetBlock(512, 512, eps=1e-6),
|
60 |
+
UpSampler(512),
|
61 |
+
# UpDecoderBlock2D
|
62 |
+
ResnetBlock(512, 512, eps=1e-6),
|
63 |
+
ResnetBlock(512, 512, eps=1e-6),
|
64 |
+
ResnetBlock(512, 512, eps=1e-6),
|
65 |
+
UpSampler(512),
|
66 |
+
# UpDecoderBlock2D
|
67 |
+
ResnetBlock(512, 256, eps=1e-6),
|
68 |
+
ResnetBlock(256, 256, eps=1e-6),
|
69 |
+
ResnetBlock(256, 256, eps=1e-6),
|
70 |
+
UpSampler(256),
|
71 |
+
# UpDecoderBlock2D
|
72 |
+
ResnetBlock(256, 128, eps=1e-6),
|
73 |
+
ResnetBlock(128, 128, eps=1e-6),
|
74 |
+
ResnetBlock(128, 128, eps=1e-6),
|
75 |
+
])
|
76 |
+
|
77 |
+
self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-5)
|
78 |
+
self.conv_act = torch.nn.SiLU()
|
79 |
+
self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
|
80 |
+
|
81 |
+
def tiled_forward(self, sample, tile_size=64, tile_stride=32):
|
82 |
+
hidden_states = TileWorker().tiled_forward(
|
83 |
+
lambda x: self.forward(x),
|
84 |
+
sample,
|
85 |
+
tile_size,
|
86 |
+
tile_stride,
|
87 |
+
tile_device=sample.device,
|
88 |
+
tile_dtype=sample.dtype
|
89 |
+
)
|
90 |
+
return hidden_states
|
91 |
+
|
92 |
+
def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
|
93 |
+
# For VAE Decoder, we do not need to apply the tiler on each layer.
|
94 |
+
if tiled:
|
95 |
+
return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
|
96 |
+
|
97 |
+
# 1. pre-process
|
98 |
+
sample = sample / self.scaling_factor
|
99 |
+
hidden_states = self.post_quant_conv(sample)
|
100 |
+
hidden_states = self.conv_in(hidden_states)
|
101 |
+
time_emb = None
|
102 |
+
text_emb = None
|
103 |
+
res_stack = None
|
104 |
+
|
105 |
+
# 2. blocks
|
106 |
+
for i, block in enumerate(self.blocks):
|
107 |
+
hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
|
108 |
+
|
109 |
+
# 3. output
|
110 |
+
hidden_states = self.conv_norm_out(hidden_states)
|
111 |
+
hidden_states = self.conv_act(hidden_states)
|
112 |
+
hidden_states = self.conv_out(hidden_states)
|
113 |
+
|
114 |
+
return hidden_states
|
115 |
+
|
116 |
+
def state_dict_converter(self):
|
117 |
+
return SDVAEDecoderStateDictConverter()
|
118 |
+
|
119 |
+
|
120 |
+
class SDVAEDecoderStateDictConverter:
|
121 |
+
def __init__(self):
|
122 |
+
pass
|
123 |
+
|
124 |
+
def from_diffusers(self, state_dict):
|
125 |
+
# architecture
|
126 |
+
block_types = [
|
127 |
+
'ResnetBlock', 'VAEAttentionBlock', 'ResnetBlock',
|
128 |
+
'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
|
129 |
+
'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
|
130 |
+
'ResnetBlock', 'ResnetBlock', 'ResnetBlock', 'UpSampler',
|
131 |
+
'ResnetBlock', 'ResnetBlock', 'ResnetBlock'
|
132 |
+
]
|
133 |
+
|
134 |
+
# Rename each parameter
|
135 |
+
local_rename_dict = {
|
136 |
+
"post_quant_conv": "post_quant_conv",
|
137 |
+
"decoder.conv_in": "conv_in",
|
138 |
+
"decoder.mid_block.attentions.0.group_norm": "blocks.1.norm",
|
139 |
+
"decoder.mid_block.attentions.0.to_q": "blocks.1.transformer_blocks.0.to_q",
|
140 |
+
"decoder.mid_block.attentions.0.to_k": "blocks.1.transformer_blocks.0.to_k",
|
141 |
+
"decoder.mid_block.attentions.0.to_v": "blocks.1.transformer_blocks.0.to_v",
|
142 |
+
"decoder.mid_block.attentions.0.to_out.0": "blocks.1.transformer_blocks.0.to_out",
|
143 |
+
"decoder.mid_block.resnets.0.norm1": "blocks.0.norm1",
|
144 |
+
"decoder.mid_block.resnets.0.conv1": "blocks.0.conv1",
|
145 |
+
"decoder.mid_block.resnets.0.norm2": "blocks.0.norm2",
|
146 |
+
"decoder.mid_block.resnets.0.conv2": "blocks.0.conv2",
|
147 |
+
"decoder.mid_block.resnets.1.norm1": "blocks.2.norm1",
|
148 |
+
"decoder.mid_block.resnets.1.conv1": "blocks.2.conv1",
|
149 |
+
"decoder.mid_block.resnets.1.norm2": "blocks.2.norm2",
|
150 |
+
"decoder.mid_block.resnets.1.conv2": "blocks.2.conv2",
|
151 |
+
"decoder.conv_norm_out": "conv_norm_out",
|
152 |
+
"decoder.conv_out": "conv_out",
|
153 |
+
}
|
154 |
+
name_list = sorted([name for name in state_dict])
|
155 |
+
rename_dict = {}
|
156 |
+
block_id = {"ResnetBlock": 2, "DownSampler": 2, "UpSampler": 2}
|
157 |
+
last_block_type_with_id = {"ResnetBlock": "", "DownSampler": "", "UpSampler": ""}
|
158 |
+
for name in name_list:
|
159 |
+
names = name.split(".")
|
160 |
+
name_prefix = ".".join(names[:-1])
|
161 |
+
if name_prefix in local_rename_dict:
|
162 |
+
rename_dict[name] = local_rename_dict[name_prefix] + "." + names[-1]
|
163 |
+
elif name.startswith("decoder.up_blocks"):
|
164 |
+
block_type = {"resnets": "ResnetBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}[names[3]]
|
165 |
+
block_type_with_id = ".".join(names[:5])
|
166 |
+
if block_type_with_id != last_block_type_with_id[block_type]:
|
167 |
+
block_id[block_type] += 1
|
168 |
+
last_block_type_with_id[block_type] = block_type_with_id
|
169 |
+
while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
|
170 |
+
block_id[block_type] += 1
|
171 |
+
block_type_with_id = ".".join(names[:5])
|
172 |
+
names = ["blocks", str(block_id[block_type])] + names[5:]
|
173 |
+
rename_dict[name] = ".".join(names)
|
174 |
+
|
175 |
+
# Convert state_dict
|
176 |
+
state_dict_ = {}
|
177 |
+
for name, param in state_dict.items():
|
178 |
+
if name in rename_dict:
|
179 |
+
state_dict_[rename_dict[name]] = param
|
180 |
+
return state_dict_
|
181 |
+
|
182 |
+
def from_civitai(self, state_dict):
|
183 |
+
rename_dict = {
|
184 |
+
"first_stage_model.decoder.conv_in.bias": "conv_in.bias",
|
185 |
+
"first_stage_model.decoder.conv_in.weight": "conv_in.weight",
|
186 |
+
"first_stage_model.decoder.conv_out.bias": "conv_out.bias",
|
187 |
+
"first_stage_model.decoder.conv_out.weight": "conv_out.weight",
|
188 |
+
"first_stage_model.decoder.mid.attn_1.k.bias": "blocks.1.transformer_blocks.0.to_k.bias",
|
189 |
+
"first_stage_model.decoder.mid.attn_1.k.weight": "blocks.1.transformer_blocks.0.to_k.weight",
|
190 |
+
"first_stage_model.decoder.mid.attn_1.norm.bias": "blocks.1.norm.bias",
|
191 |
+
"first_stage_model.decoder.mid.attn_1.norm.weight": "blocks.1.norm.weight",
|
192 |
+
"first_stage_model.decoder.mid.attn_1.proj_out.bias": "blocks.1.transformer_blocks.0.to_out.bias",
|
193 |
+
"first_stage_model.decoder.mid.attn_1.proj_out.weight": "blocks.1.transformer_blocks.0.to_out.weight",
|
194 |
+
"first_stage_model.decoder.mid.attn_1.q.bias": "blocks.1.transformer_blocks.0.to_q.bias",
|
195 |
+
"first_stage_model.decoder.mid.attn_1.q.weight": "blocks.1.transformer_blocks.0.to_q.weight",
|
196 |
+
"first_stage_model.decoder.mid.attn_1.v.bias": "blocks.1.transformer_blocks.0.to_v.bias",
|
197 |
+
"first_stage_model.decoder.mid.attn_1.v.weight": "blocks.1.transformer_blocks.0.to_v.weight",
|
198 |
+
"first_stage_model.decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
|
199 |
+
"first_stage_model.decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
|
200 |
+
"first_stage_model.decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
|
201 |
+
"first_stage_model.decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
|
202 |
+
"first_stage_model.decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
|
203 |
+
"first_stage_model.decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
|
204 |
+
"first_stage_model.decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
|
205 |
+
"first_stage_model.decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
|
206 |
+
"first_stage_model.decoder.mid.block_2.conv1.bias": "blocks.2.conv1.bias",
|
207 |
+
"first_stage_model.decoder.mid.block_2.conv1.weight": "blocks.2.conv1.weight",
|
208 |
+
"first_stage_model.decoder.mid.block_2.conv2.bias": "blocks.2.conv2.bias",
|
209 |
+
"first_stage_model.decoder.mid.block_2.conv2.weight": "blocks.2.conv2.weight",
|
210 |
+
"first_stage_model.decoder.mid.block_2.norm1.bias": "blocks.2.norm1.bias",
|
211 |
+
"first_stage_model.decoder.mid.block_2.norm1.weight": "blocks.2.norm1.weight",
|
212 |
+
"first_stage_model.decoder.mid.block_2.norm2.bias": "blocks.2.norm2.bias",
|
213 |
+
"first_stage_model.decoder.mid.block_2.norm2.weight": "blocks.2.norm2.weight",
|
214 |
+
"first_stage_model.decoder.norm_out.bias": "conv_norm_out.bias",
|
215 |
+
"first_stage_model.decoder.norm_out.weight": "conv_norm_out.weight",
|
216 |
+
"first_stage_model.decoder.up.0.block.0.conv1.bias": "blocks.15.conv1.bias",
|
217 |
+
"first_stage_model.decoder.up.0.block.0.conv1.weight": "blocks.15.conv1.weight",
|
218 |
+
"first_stage_model.decoder.up.0.block.0.conv2.bias": "blocks.15.conv2.bias",
|
219 |
+
"first_stage_model.decoder.up.0.block.0.conv2.weight": "blocks.15.conv2.weight",
|
220 |
+
"first_stage_model.decoder.up.0.block.0.nin_shortcut.bias": "blocks.15.conv_shortcut.bias",
|
221 |
+
"first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": "blocks.15.conv_shortcut.weight",
|
222 |
+
"first_stage_model.decoder.up.0.block.0.norm1.bias": "blocks.15.norm1.bias",
|
223 |
+
"first_stage_model.decoder.up.0.block.0.norm1.weight": "blocks.15.norm1.weight",
|
224 |
+
"first_stage_model.decoder.up.0.block.0.norm2.bias": "blocks.15.norm2.bias",
|
225 |
+
"first_stage_model.decoder.up.0.block.0.norm2.weight": "blocks.15.norm2.weight",
|
226 |
+
"first_stage_model.decoder.up.0.block.1.conv1.bias": "blocks.16.conv1.bias",
|
227 |
+
"first_stage_model.decoder.up.0.block.1.conv1.weight": "blocks.16.conv1.weight",
|
228 |
+
"first_stage_model.decoder.up.0.block.1.conv2.bias": "blocks.16.conv2.bias",
|
229 |
+
"first_stage_model.decoder.up.0.block.1.conv2.weight": "blocks.16.conv2.weight",
|
230 |
+
"first_stage_model.decoder.up.0.block.1.norm1.bias": "blocks.16.norm1.bias",
|
231 |
+
"first_stage_model.decoder.up.0.block.1.norm1.weight": "blocks.16.norm1.weight",
|
232 |
+
"first_stage_model.decoder.up.0.block.1.norm2.bias": "blocks.16.norm2.bias",
|
233 |
+
"first_stage_model.decoder.up.0.block.1.norm2.weight": "blocks.16.norm2.weight",
|
234 |
+
"first_stage_model.decoder.up.0.block.2.conv1.bias": "blocks.17.conv1.bias",
|
235 |
+
"first_stage_model.decoder.up.0.block.2.conv1.weight": "blocks.17.conv1.weight",
|
236 |
+
"first_stage_model.decoder.up.0.block.2.conv2.bias": "blocks.17.conv2.bias",
|
237 |
+
"first_stage_model.decoder.up.0.block.2.conv2.weight": "blocks.17.conv2.weight",
|
238 |
+
"first_stage_model.decoder.up.0.block.2.norm1.bias": "blocks.17.norm1.bias",
|
239 |
+
"first_stage_model.decoder.up.0.block.2.norm1.weight": "blocks.17.norm1.weight",
|
240 |
+
"first_stage_model.decoder.up.0.block.2.norm2.bias": "blocks.17.norm2.bias",
|
241 |
+
"first_stage_model.decoder.up.0.block.2.norm2.weight": "blocks.17.norm2.weight",
|
242 |
+
"first_stage_model.decoder.up.1.block.0.conv1.bias": "blocks.11.conv1.bias",
|
243 |
+
"first_stage_model.decoder.up.1.block.0.conv1.weight": "blocks.11.conv1.weight",
|
244 |
+
"first_stage_model.decoder.up.1.block.0.conv2.bias": "blocks.11.conv2.bias",
|
245 |
+
"first_stage_model.decoder.up.1.block.0.conv2.weight": "blocks.11.conv2.weight",
|
246 |
+
"first_stage_model.decoder.up.1.block.0.nin_shortcut.bias": "blocks.11.conv_shortcut.bias",
|
247 |
+
"first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": "blocks.11.conv_shortcut.weight",
|
248 |
+
"first_stage_model.decoder.up.1.block.0.norm1.bias": "blocks.11.norm1.bias",
|
249 |
+
"first_stage_model.decoder.up.1.block.0.norm1.weight": "blocks.11.norm1.weight",
|
250 |
+
"first_stage_model.decoder.up.1.block.0.norm2.bias": "blocks.11.norm2.bias",
|
251 |
+
"first_stage_model.decoder.up.1.block.0.norm2.weight": "blocks.11.norm2.weight",
|
252 |
+
"first_stage_model.decoder.up.1.block.1.conv1.bias": "blocks.12.conv1.bias",
|
253 |
+
"first_stage_model.decoder.up.1.block.1.conv1.weight": "blocks.12.conv1.weight",
|
254 |
+
"first_stage_model.decoder.up.1.block.1.conv2.bias": "blocks.12.conv2.bias",
|
255 |
+
"first_stage_model.decoder.up.1.block.1.conv2.weight": "blocks.12.conv2.weight",
|
256 |
+
"first_stage_model.decoder.up.1.block.1.norm1.bias": "blocks.12.norm1.bias",
|
257 |
+
"first_stage_model.decoder.up.1.block.1.norm1.weight": "blocks.12.norm1.weight",
|
258 |
+
"first_stage_model.decoder.up.1.block.1.norm2.bias": "blocks.12.norm2.bias",
|
259 |
+
"first_stage_model.decoder.up.1.block.1.norm2.weight": "blocks.12.norm2.weight",
|
260 |
+
"first_stage_model.decoder.up.1.block.2.conv1.bias": "blocks.13.conv1.bias",
|
261 |
+
"first_stage_model.decoder.up.1.block.2.conv1.weight": "blocks.13.conv1.weight",
|
262 |
+
"first_stage_model.decoder.up.1.block.2.conv2.bias": "blocks.13.conv2.bias",
|
263 |
+
"first_stage_model.decoder.up.1.block.2.conv2.weight": "blocks.13.conv2.weight",
|
264 |
+
"first_stage_model.decoder.up.1.block.2.norm1.bias": "blocks.13.norm1.bias",
|
265 |
+
"first_stage_model.decoder.up.1.block.2.norm1.weight": "blocks.13.norm1.weight",
|
266 |
+
"first_stage_model.decoder.up.1.block.2.norm2.bias": "blocks.13.norm2.bias",
|
267 |
+
"first_stage_model.decoder.up.1.block.2.norm2.weight": "blocks.13.norm2.weight",
|
268 |
+
"first_stage_model.decoder.up.1.upsample.conv.bias": "blocks.14.conv.bias",
|
269 |
+
"first_stage_model.decoder.up.1.upsample.conv.weight": "blocks.14.conv.weight",
|
270 |
+
"first_stage_model.decoder.up.2.block.0.conv1.bias": "blocks.7.conv1.bias",
|
271 |
+
"first_stage_model.decoder.up.2.block.0.conv1.weight": "blocks.7.conv1.weight",
|
272 |
+
"first_stage_model.decoder.up.2.block.0.conv2.bias": "blocks.7.conv2.bias",
|
273 |
+
"first_stage_model.decoder.up.2.block.0.conv2.weight": "blocks.7.conv2.weight",
|
274 |
+
"first_stage_model.decoder.up.2.block.0.norm1.bias": "blocks.7.norm1.bias",
|
275 |
+
"first_stage_model.decoder.up.2.block.0.norm1.weight": "blocks.7.norm1.weight",
|
276 |
+
"first_stage_model.decoder.up.2.block.0.norm2.bias": "blocks.7.norm2.bias",
|
277 |
+
"first_stage_model.decoder.up.2.block.0.norm2.weight": "blocks.7.norm2.weight",
|
278 |
+
"first_stage_model.decoder.up.2.block.1.conv1.bias": "blocks.8.conv1.bias",
|
279 |
+
"first_stage_model.decoder.up.2.block.1.conv1.weight": "blocks.8.conv1.weight",
|
280 |
+
"first_stage_model.decoder.up.2.block.1.conv2.bias": "blocks.8.conv2.bias",
|
281 |
+
"first_stage_model.decoder.up.2.block.1.conv2.weight": "blocks.8.conv2.weight",
|
282 |
+
"first_stage_model.decoder.up.2.block.1.norm1.bias": "blocks.8.norm1.bias",
|
283 |
+
"first_stage_model.decoder.up.2.block.1.norm1.weight": "blocks.8.norm1.weight",
|
284 |
+
"first_stage_model.decoder.up.2.block.1.norm2.bias": "blocks.8.norm2.bias",
|
285 |
+
"first_stage_model.decoder.up.2.block.1.norm2.weight": "blocks.8.norm2.weight",
|
286 |
+
"first_stage_model.decoder.up.2.block.2.conv1.bias": "blocks.9.conv1.bias",
|
287 |
+
"first_stage_model.decoder.up.2.block.2.conv1.weight": "blocks.9.conv1.weight",
|
288 |
+
"first_stage_model.decoder.up.2.block.2.conv2.bias": "blocks.9.conv2.bias",
|
289 |
+
"first_stage_model.decoder.up.2.block.2.conv2.weight": "blocks.9.conv2.weight",
|
290 |
+
"first_stage_model.decoder.up.2.block.2.norm1.bias": "blocks.9.norm1.bias",
|
291 |
+
"first_stage_model.decoder.up.2.block.2.norm1.weight": "blocks.9.norm1.weight",
|
292 |
+
"first_stage_model.decoder.up.2.block.2.norm2.bias": "blocks.9.norm2.bias",
|
293 |
+
"first_stage_model.decoder.up.2.block.2.norm2.weight": "blocks.9.norm2.weight",
|
294 |
+
"first_stage_model.decoder.up.2.upsample.conv.bias": "blocks.10.conv.bias",
|
295 |
+
"first_stage_model.decoder.up.2.upsample.conv.weight": "blocks.10.conv.weight",
|
296 |
+
"first_stage_model.decoder.up.3.block.0.conv1.bias": "blocks.3.conv1.bias",
|
297 |
+
"first_stage_model.decoder.up.3.block.0.conv1.weight": "blocks.3.conv1.weight",
|
298 |
+
"first_stage_model.decoder.up.3.block.0.conv2.bias": "blocks.3.conv2.bias",
|
299 |
+
"first_stage_model.decoder.up.3.block.0.conv2.weight": "blocks.3.conv2.weight",
|
300 |
+
"first_stage_model.decoder.up.3.block.0.norm1.bias": "blocks.3.norm1.bias",
|
301 |
+
"first_stage_model.decoder.up.3.block.0.norm1.weight": "blocks.3.norm1.weight",
|
302 |
+
"first_stage_model.decoder.up.3.block.0.norm2.bias": "blocks.3.norm2.bias",
|
303 |
+
"first_stage_model.decoder.up.3.block.0.norm2.weight": "blocks.3.norm2.weight",
|
304 |
+
"first_stage_model.decoder.up.3.block.1.conv1.bias": "blocks.4.conv1.bias",
|
305 |
+
"first_stage_model.decoder.up.3.block.1.conv1.weight": "blocks.4.conv1.weight",
|
306 |
+
"first_stage_model.decoder.up.3.block.1.conv2.bias": "blocks.4.conv2.bias",
|
307 |
+
"first_stage_model.decoder.up.3.block.1.conv2.weight": "blocks.4.conv2.weight",
|
308 |
+
"first_stage_model.decoder.up.3.block.1.norm1.bias": "blocks.4.norm1.bias",
|
309 |
+
"first_stage_model.decoder.up.3.block.1.norm1.weight": "blocks.4.norm1.weight",
|
310 |
+
"first_stage_model.decoder.up.3.block.1.norm2.bias": "blocks.4.norm2.bias",
|
311 |
+
"first_stage_model.decoder.up.3.block.1.norm2.weight": "blocks.4.norm2.weight",
|
312 |
+
"first_stage_model.decoder.up.3.block.2.conv1.bias": "blocks.5.conv1.bias",
|
313 |
+
"first_stage_model.decoder.up.3.block.2.conv1.weight": "blocks.5.conv1.weight",
|
314 |
+
"first_stage_model.decoder.up.3.block.2.conv2.bias": "blocks.5.conv2.bias",
|
315 |
+
"first_stage_model.decoder.up.3.block.2.conv2.weight": "blocks.5.conv2.weight",
|
316 |
+
"first_stage_model.decoder.up.3.block.2.norm1.bias": "blocks.5.norm1.bias",
|
317 |
+
"first_stage_model.decoder.up.3.block.2.norm1.weight": "blocks.5.norm1.weight",
|
318 |
+
"first_stage_model.decoder.up.3.block.2.norm2.bias": "blocks.5.norm2.bias",
|
319 |
+
"first_stage_model.decoder.up.3.block.2.norm2.weight": "blocks.5.norm2.weight",
|
320 |
+
"first_stage_model.decoder.up.3.upsample.conv.bias": "blocks.6.conv.bias",
|
321 |
+
"first_stage_model.decoder.up.3.upsample.conv.weight": "blocks.6.conv.weight",
|
322 |
+
"first_stage_model.post_quant_conv.bias": "post_quant_conv.bias",
|
323 |
+
"first_stage_model.post_quant_conv.weight": "post_quant_conv.weight",
|
324 |
+
}
|
325 |
+
state_dict_ = {}
|
326 |
+
for name in state_dict:
|
327 |
+
if name in rename_dict:
|
328 |
+
param = state_dict[name]
|
329 |
+
if "transformer_blocks" in rename_dict[name]:
|
330 |
+
param = param.squeeze()
|
331 |
+
state_dict_[rename_dict[name]] = param
|
332 |
+
return state_dict_
|
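For orientation, a minimal sketch of how this converter is typically applied (the checkpoint filename and the surrounding loading code are assumptions for illustration, not part of this commit): the converter renames the original checkpoint keys to this module's parameter names, squeezing the 1x1 attention projections, before load_state_dict.

import torch
from diffsynth.models.sd_vae_decoder import SDVAEDecoder

# Hypothetical SD 1.x checkpoint in civitai layout (keys prefixed with "first_stage_model.")
raw = torch.load("sd15_checkpoint.ckpt", map_location="cpu")
raw = raw.get("state_dict", raw)

decoder = SDVAEDecoder()
converted = decoder.state_dict_converter().from_civitai(raw)  # rename + squeeze attention projections
decoder.load_state_dict(converted)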
diffsynth/models/sd_vae_encoder.py
ADDED
@@ -0,0 +1,278 @@
1 |
+
import torch
|
2 |
+
from .sd_unet import ResnetBlock, DownSampler
|
3 |
+
from .sd_vae_decoder import VAEAttentionBlock
|
4 |
+
from .tiler import TileWorker
|
5 |
+
from einops import rearrange
|
6 |
+
|
7 |
+
|
8 |
+
class SDVAEEncoder(torch.nn.Module):
|
9 |
+
def __init__(self):
|
10 |
+
super().__init__()
|
11 |
+
self.scaling_factor = 0.18215
|
12 |
+
self.quant_conv = torch.nn.Conv2d(8, 8, kernel_size=1)
|
13 |
+
self.conv_in = torch.nn.Conv2d(3, 128, kernel_size=3, padding=1)
|
14 |
+
|
15 |
+
self.blocks = torch.nn.ModuleList([
|
16 |
+
# DownEncoderBlock2D
|
17 |
+
ResnetBlock(128, 128, eps=1e-6),
|
18 |
+
ResnetBlock(128, 128, eps=1e-6),
|
19 |
+
DownSampler(128, padding=0, extra_padding=True),
|
20 |
+
# DownEncoderBlock2D
|
21 |
+
ResnetBlock(128, 256, eps=1e-6),
|
22 |
+
ResnetBlock(256, 256, eps=1e-6),
|
23 |
+
DownSampler(256, padding=0, extra_padding=True),
|
24 |
+
# DownEncoderBlock2D
|
25 |
+
ResnetBlock(256, 512, eps=1e-6),
|
26 |
+
ResnetBlock(512, 512, eps=1e-6),
|
27 |
+
DownSampler(512, padding=0, extra_padding=True),
|
28 |
+
# DownEncoderBlock2D
|
29 |
+
ResnetBlock(512, 512, eps=1e-6),
|
30 |
+
ResnetBlock(512, 512, eps=1e-6),
|
31 |
+
# UNetMidBlock2D
|
32 |
+
ResnetBlock(512, 512, eps=1e-6),
|
33 |
+
VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
|
34 |
+
ResnetBlock(512, 512, eps=1e-6),
|
35 |
+
])
|
36 |
+
|
37 |
+
self.conv_norm_out = torch.nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6)
|
38 |
+
self.conv_act = torch.nn.SiLU()
|
39 |
+
self.conv_out = torch.nn.Conv2d(512, 8, kernel_size=3, padding=1)
|
40 |
+
|
41 |
+
def tiled_forward(self, sample, tile_size=64, tile_stride=32):
|
42 |
+
hidden_states = TileWorker().tiled_forward(
|
43 |
+
lambda x: self.forward(x),
|
44 |
+
sample,
|
45 |
+
tile_size,
|
46 |
+
tile_stride,
|
47 |
+
tile_device=sample.device,
|
48 |
+
tile_dtype=sample.dtype
|
49 |
+
)
|
50 |
+
return hidden_states
|
51 |
+
|
52 |
+
def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
|
53 |
+
# For the VAE Encoder, we do not need to apply the tiler on each layer.
|
54 |
+
if tiled:
|
55 |
+
return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
|
56 |
+
|
57 |
+
# 1. pre-process
|
58 |
+
hidden_states = self.conv_in(sample)
|
59 |
+
time_emb = None
|
60 |
+
text_emb = None
|
61 |
+
res_stack = None
|
62 |
+
|
63 |
+
# 2. blocks
|
64 |
+
for i, block in enumerate(self.blocks):
|
65 |
+
hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
|
66 |
+
|
67 |
+
# 3. output
|
68 |
+
hidden_states = self.conv_norm_out(hidden_states)
|
69 |
+
hidden_states = self.conv_act(hidden_states)
|
70 |
+
hidden_states = self.conv_out(hidden_states)
|
71 |
+
hidden_states = self.quant_conv(hidden_states)
|
72 |
+
hidden_states = hidden_states[:, :4]
|
73 |
+
hidden_states *= self.scaling_factor
|
74 |
+
|
75 |
+
return hidden_states
|
76 |
+
|
77 |
+
def encode_video(self, sample, batch_size=8):
|
78 |
+
B = sample.shape[0]
|
79 |
+
hidden_states = []
|
80 |
+
|
81 |
+
for i in range(0, sample.shape[2], batch_size):
|
82 |
+
|
83 |
+
j = min(i + batch_size, sample.shape[2])
|
84 |
+
sample_batch = rearrange(sample[:,:,i:j], "B C T H W -> (B T) C H W")
|
85 |
+
|
86 |
+
hidden_states_batch = self(sample_batch)
|
87 |
+
hidden_states_batch = rearrange(hidden_states_batch, "(B T) C H W -> B C T H W", B=B)
|
88 |
+
|
89 |
+
hidden_states.append(hidden_states_batch)
|
90 |
+
|
91 |
+
hidden_states = torch.concat(hidden_states, dim=2)
|
92 |
+
return hidden_states
|
93 |
+
|
94 |
+
def state_dict_converter(self):
|
95 |
+
return SDVAEEncoderStateDictConverter()
|
96 |
+
|
97 |
+
|
98 |
+
class SDVAEEncoderStateDictConverter:
|
99 |
+
def __init__(self):
|
100 |
+
pass
|
101 |
+
|
102 |
+
def from_diffusers(self, state_dict):
|
103 |
+
# architecture
|
104 |
+
block_types = [
|
105 |
+
'ResnetBlock', 'ResnetBlock', 'DownSampler',
|
106 |
+
'ResnetBlock', 'ResnetBlock', 'DownSampler',
|
107 |
+
'ResnetBlock', 'ResnetBlock', 'DownSampler',
|
108 |
+
'ResnetBlock', 'ResnetBlock',
|
109 |
+
'ResnetBlock', 'VAEAttentionBlock', 'ResnetBlock'
|
110 |
+
]
|
111 |
+
|
112 |
+
# Rename each parameter
|
113 |
+
local_rename_dict = {
|
114 |
+
"quant_conv": "quant_conv",
|
115 |
+
"encoder.conv_in": "conv_in",
|
116 |
+
"encoder.mid_block.attentions.0.group_norm": "blocks.12.norm",
|
117 |
+
"encoder.mid_block.attentions.0.to_q": "blocks.12.transformer_blocks.0.to_q",
|
118 |
+
"encoder.mid_block.attentions.0.to_k": "blocks.12.transformer_blocks.0.to_k",
|
119 |
+
"encoder.mid_block.attentions.0.to_v": "blocks.12.transformer_blocks.0.to_v",
|
120 |
+
"encoder.mid_block.attentions.0.to_out.0": "blocks.12.transformer_blocks.0.to_out",
|
121 |
+
"encoder.mid_block.resnets.0.norm1": "blocks.11.norm1",
|
122 |
+
"encoder.mid_block.resnets.0.conv1": "blocks.11.conv1",
|
123 |
+
"encoder.mid_block.resnets.0.norm2": "blocks.11.norm2",
|
124 |
+
"encoder.mid_block.resnets.0.conv2": "blocks.11.conv2",
|
125 |
+
"encoder.mid_block.resnets.1.norm1": "blocks.13.norm1",
|
126 |
+
"encoder.mid_block.resnets.1.conv1": "blocks.13.conv1",
|
127 |
+
"encoder.mid_block.resnets.1.norm2": "blocks.13.norm2",
|
128 |
+
"encoder.mid_block.resnets.1.conv2": "blocks.13.conv2",
|
129 |
+
"encoder.conv_norm_out": "conv_norm_out",
|
130 |
+
"encoder.conv_out": "conv_out",
|
131 |
+
}
|
132 |
+
name_list = sorted([name for name in state_dict])
|
133 |
+
rename_dict = {}
|
134 |
+
block_id = {"ResnetBlock": -1, "DownSampler": -1, "UpSampler": -1}
|
135 |
+
last_block_type_with_id = {"ResnetBlock": "", "DownSampler": "", "UpSampler": ""}
|
136 |
+
for name in name_list:
|
137 |
+
names = name.split(".")
|
138 |
+
name_prefix = ".".join(names[:-1])
|
139 |
+
if name_prefix in local_rename_dict:
|
140 |
+
rename_dict[name] = local_rename_dict[name_prefix] + "." + names[-1]
|
141 |
+
elif name.startswith("encoder.down_blocks"):
|
142 |
+
block_type = {"resnets": "ResnetBlock", "downsamplers": "DownSampler", "upsamplers": "UpSampler"}[names[3]]
|
143 |
+
block_type_with_id = ".".join(names[:5])
|
144 |
+
if block_type_with_id != last_block_type_with_id[block_type]:
|
145 |
+
block_id[block_type] += 1
|
146 |
+
last_block_type_with_id[block_type] = block_type_with_id
|
147 |
+
while block_id[block_type] < len(block_types) and block_types[block_id[block_type]] != block_type:
|
148 |
+
block_id[block_type] += 1
|
149 |
+
block_type_with_id = ".".join(names[:5])
|
150 |
+
names = ["blocks", str(block_id[block_type])] + names[5:]
|
151 |
+
rename_dict[name] = ".".join(names)
|
152 |
+
|
153 |
+
# Convert state_dict
|
154 |
+
state_dict_ = {}
|
155 |
+
for name, param in state_dict.items():
|
156 |
+
if name in rename_dict:
|
157 |
+
state_dict_[rename_dict[name]] = param
|
158 |
+
return state_dict_
|
159 |
+
|
160 |
+
def from_civitai(self, state_dict):
|
161 |
+
rename_dict = {
|
162 |
+
"first_stage_model.encoder.conv_in.bias": "conv_in.bias",
|
163 |
+
"first_stage_model.encoder.conv_in.weight": "conv_in.weight",
|
164 |
+
"first_stage_model.encoder.conv_out.bias": "conv_out.bias",
|
165 |
+
"first_stage_model.encoder.conv_out.weight": "conv_out.weight",
|
166 |
+
"first_stage_model.encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias",
|
167 |
+
"first_stage_model.encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight",
|
168 |
+
"first_stage_model.encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias",
|
169 |
+
"first_stage_model.encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight",
|
170 |
+
"first_stage_model.encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias",
|
171 |
+
"first_stage_model.encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight",
|
172 |
+
"first_stage_model.encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias",
|
173 |
+
"first_stage_model.encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight",
|
174 |
+
"first_stage_model.encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias",
|
175 |
+
"first_stage_model.encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight",
|
176 |
+
"first_stage_model.encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias",
|
177 |
+
"first_stage_model.encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight",
|
178 |
+
"first_stage_model.encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias",
|
179 |
+
"first_stage_model.encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight",
|
180 |
+
"first_stage_model.encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias",
|
181 |
+
"first_stage_model.encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight",
|
182 |
+
"first_stage_model.encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias",
|
183 |
+
"first_stage_model.encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight",
|
184 |
+
"first_stage_model.encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias",
|
185 |
+
"first_stage_model.encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight",
|
186 |
+
"first_stage_model.encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias",
|
187 |
+
"first_stage_model.encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight",
|
188 |
+
"first_stage_model.encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias",
|
189 |
+
"first_stage_model.encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight",
|
190 |
+
"first_stage_model.encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias",
|
191 |
+
"first_stage_model.encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight",
|
192 |
+
"first_stage_model.encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias",
|
193 |
+
"first_stage_model.encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight",
|
194 |
+
"first_stage_model.encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias",
|
195 |
+
"first_stage_model.encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight",
|
196 |
+
"first_stage_model.encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias",
|
197 |
+
"first_stage_model.encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight",
|
198 |
+
"first_stage_model.encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias",
|
199 |
+
"first_stage_model.encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight",
|
200 |
+
"first_stage_model.encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias",
|
201 |
+
"first_stage_model.encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight",
|
202 |
+
"first_stage_model.encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias",
|
203 |
+
"first_stage_model.encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight",
|
204 |
+
"first_stage_model.encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias",
|
205 |
+
"first_stage_model.encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight",
|
206 |
+
"first_stage_model.encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias",
|
207 |
+
"first_stage_model.encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight",
|
208 |
+
"first_stage_model.encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias",
|
209 |
+
"first_stage_model.encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight",
|
210 |
+
"first_stage_model.encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias",
|
211 |
+
"first_stage_model.encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight",
|
212 |
+
"first_stage_model.encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias",
|
213 |
+
"first_stage_model.encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight",
|
214 |
+
"first_stage_model.encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias",
|
215 |
+
"first_stage_model.encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight",
|
216 |
+
"first_stage_model.encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias",
|
217 |
+
"first_stage_model.encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight",
|
218 |
+
"first_stage_model.encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias",
|
219 |
+
"first_stage_model.encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight",
|
220 |
+
"first_stage_model.encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias",
|
221 |
+
"first_stage_model.encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight",
|
222 |
+
"first_stage_model.encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias",
|
223 |
+
"first_stage_model.encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight",
|
224 |
+
"first_stage_model.encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias",
|
225 |
+
"first_stage_model.encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight",
|
226 |
+
"first_stage_model.encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias",
|
227 |
+
"first_stage_model.encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight",
|
228 |
+
"first_stage_model.encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias",
|
229 |
+
"first_stage_model.encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight",
|
230 |
+
"first_stage_model.encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias",
|
231 |
+
"first_stage_model.encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight",
|
232 |
+
"first_stage_model.encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias",
|
233 |
+
"first_stage_model.encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight",
|
234 |
+
"first_stage_model.encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias",
|
235 |
+
"first_stage_model.encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight",
|
236 |
+
"first_stage_model.encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias",
|
237 |
+
"first_stage_model.encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight",
|
238 |
+
"first_stage_model.encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias",
|
239 |
+
"first_stage_model.encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight",
|
240 |
+
"first_stage_model.encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias",
|
241 |
+
"first_stage_model.encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight",
|
242 |
+
"first_stage_model.encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias",
|
243 |
+
"first_stage_model.encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight",
|
244 |
+
"first_stage_model.encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias",
|
245 |
+
"first_stage_model.encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight",
|
246 |
+
"first_stage_model.encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias",
|
247 |
+
"first_stage_model.encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight",
|
248 |
+
"first_stage_model.encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias",
|
249 |
+
"first_stage_model.encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight",
|
250 |
+
"first_stage_model.encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias",
|
251 |
+
"first_stage_model.encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight",
|
252 |
+
"first_stage_model.encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias",
|
253 |
+
"first_stage_model.encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight",
|
254 |
+
"first_stage_model.encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias",
|
255 |
+
"first_stage_model.encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight",
|
256 |
+
"first_stage_model.encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias",
|
257 |
+
"first_stage_model.encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight",
|
258 |
+
"first_stage_model.encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias",
|
259 |
+
"first_stage_model.encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight",
|
260 |
+
"first_stage_model.encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias",
|
261 |
+
"first_stage_model.encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight",
|
262 |
+
"first_stage_model.encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias",
|
263 |
+
"first_stage_model.encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight",
|
264 |
+
"first_stage_model.encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias",
|
265 |
+
"first_stage_model.encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight",
|
266 |
+
"first_stage_model.encoder.norm_out.bias": "conv_norm_out.bias",
|
267 |
+
"first_stage_model.encoder.norm_out.weight": "conv_norm_out.weight",
|
268 |
+
"first_stage_model.quant_conv.bias": "quant_conv.bias",
|
269 |
+
"first_stage_model.quant_conv.weight": "quant_conv.weight",
|
270 |
+
}
|
271 |
+
state_dict_ = {}
|
272 |
+
for name in state_dict:
|
273 |
+
if name in rename_dict:
|
274 |
+
param = state_dict[name]
|
275 |
+
if "transformer_blocks" in rename_dict[name]:
|
276 |
+
param = param.squeeze()
|
277 |
+
state_dict_[rename_dict[name]] = param
|
278 |
+
return state_dict_
|
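As a quick sanity check on the encoder's forward pass above: three DownSampler stages reduce each spatial dimension by a factor of 8, the 8-channel quant_conv output is truncated to its 4 mean channels, and the result is multiplied by scaling_factor = 0.18215. A shape-check sketch under that reading (the 512x512 input is an arbitrary assumption):

import torch
from diffsynth.models.sd_vae_encoder import SDVAEEncoder

encoder = SDVAEEncoder().eval()
image = torch.randn(1, 3, 512, 512)   # hypothetical normalized image batch
with torch.no_grad():
    latents = encoder(image)
print(latents.shape)                  # expected: torch.Size([1, 4, 64, 64])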
diffsynth/models/sdxl_ipadapter.py
ADDED
@@ -0,0 +1,121 @@
1 |
+
from .svd_image_encoder import SVDImageEncoder
|
2 |
+
from transformers import CLIPImageProcessor
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
class IpAdapterXLCLIPImageEmbedder(SVDImageEncoder):
|
7 |
+
def __init__(self):
|
8 |
+
super().__init__(embed_dim=1664, encoder_intermediate_size=8192, projection_dim=1280, num_encoder_layers=48, num_heads=16, head_dim=104)
|
9 |
+
self.image_processor = CLIPImageProcessor()
|
10 |
+
|
11 |
+
def forward(self, image):
|
12 |
+
pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values
|
13 |
+
pixel_values = pixel_values.to(device=self.embeddings.class_embedding.device, dtype=self.embeddings.class_embedding.dtype)
|
14 |
+
return super().forward(pixel_values)
|
15 |
+
|
16 |
+
|
17 |
+
class IpAdapterImageProjModel(torch.nn.Module):
|
18 |
+
def __init__(self, cross_attention_dim=2048, clip_embeddings_dim=1280, clip_extra_context_tokens=4):
|
19 |
+
super().__init__()
|
20 |
+
self.cross_attention_dim = cross_attention_dim
|
21 |
+
self.clip_extra_context_tokens = clip_extra_context_tokens
|
22 |
+
self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
|
23 |
+
self.norm = torch.nn.LayerNorm(cross_attention_dim)
|
24 |
+
|
25 |
+
def forward(self, image_embeds):
|
26 |
+
clip_extra_context_tokens = self.proj(image_embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
|
27 |
+
clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
|
28 |
+
return clip_extra_context_tokens
|
29 |
+
|
30 |
+
|
31 |
+
class IpAdapterModule(torch.nn.Module):
|
32 |
+
def __init__(self, input_dim, output_dim):
|
33 |
+
super().__init__()
|
34 |
+
self.to_k_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
|
35 |
+
self.to_v_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
|
36 |
+
|
37 |
+
def forward(self, hidden_states):
|
38 |
+
ip_k = self.to_k_ip(hidden_states)
|
39 |
+
ip_v = self.to_v_ip(hidden_states)
|
40 |
+
return ip_k, ip_v
|
41 |
+
|
42 |
+
|
43 |
+
class SDXLIpAdapter(torch.nn.Module):
|
44 |
+
def __init__(self):
|
45 |
+
super().__init__()
|
46 |
+
shape_list = [(2048, 640)] * 4 + [(2048, 1280)] * 50 + [(2048, 640)] * 6 + [(2048, 1280)] * 10
|
47 |
+
self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(*shape) for shape in shape_list])
|
48 |
+
self.image_proj = IpAdapterImageProjModel()
|
49 |
+
self.set_full_adapter()
|
50 |
+
|
51 |
+
def set_full_adapter(self):
|
52 |
+
map_list = sum([
|
53 |
+
[(7, i) for i in range(2)],
|
54 |
+
[(10, i) for i in range(2)],
|
55 |
+
[(15, i) for i in range(10)],
|
56 |
+
[(18, i) for i in range(10)],
|
57 |
+
[(25, i) for i in range(10)],
|
58 |
+
[(28, i) for i in range(10)],
|
59 |
+
[(31, i) for i in range(10)],
|
60 |
+
[(35, i) for i in range(2)],
|
61 |
+
[(38, i) for i in range(2)],
|
62 |
+
[(41, i) for i in range(2)],
|
63 |
+
[(21, i) for i in range(10)],
|
64 |
+
], [])
|
65 |
+
self.call_block_id = {i: j for j, i in enumerate(map_list)}
|
66 |
+
|
67 |
+
def set_less_adapter(self):
|
68 |
+
map_list = sum([
|
69 |
+
[(7, i) for i in range(2)],
|
70 |
+
[(10, i) for i in range(2)],
|
71 |
+
[(15, i) for i in range(10)],
|
72 |
+
[(18, i) for i in range(10)],
|
73 |
+
[(25, i) for i in range(10)],
|
74 |
+
[(28, i) for i in range(10)],
|
75 |
+
[(31, i) for i in range(10)],
|
76 |
+
[(35, i) for i in range(2)],
|
77 |
+
[(38, i) for i in range(2)],
|
78 |
+
[(41, i) for i in range(2)],
|
79 |
+
[(21, i) for i in range(10)],
|
80 |
+
], [])
|
81 |
+
self.call_block_id = {i: j for j, i in enumerate(map_list) if j>=34 and j<44}
|
82 |
+
|
83 |
+
def forward(self, hidden_states, scale=1.0):
|
84 |
+
hidden_states = self.image_proj(hidden_states)
|
85 |
+
hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1])
|
86 |
+
ip_kv_dict = {}
|
87 |
+
for (block_id, transformer_id) in self.call_block_id:
|
88 |
+
ipadapter_id = self.call_block_id[(block_id, transformer_id)]
|
89 |
+
ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states)
|
90 |
+
if block_id not in ip_kv_dict:
|
91 |
+
ip_kv_dict[block_id] = {}
|
92 |
+
ip_kv_dict[block_id][transformer_id] = {
|
93 |
+
"ip_k": ip_k,
|
94 |
+
"ip_v": ip_v,
|
95 |
+
"scale": scale
|
96 |
+
}
|
97 |
+
return ip_kv_dict
|
98 |
+
|
99 |
+
def state_dict_converter(self):
|
100 |
+
return SDXLIpAdapterStateDictConverter()
|
101 |
+
|
102 |
+
|
103 |
+
class SDXLIpAdapterStateDictConverter:
|
104 |
+
def __init__(self):
|
105 |
+
pass
|
106 |
+
|
107 |
+
def from_diffusers(self, state_dict):
|
108 |
+
state_dict_ = {}
|
109 |
+
for name in state_dict["ip_adapter"]:
|
110 |
+
names = name.split(".")
|
111 |
+
layer_id = str(int(names[0]) // 2)
|
112 |
+
name_ = ".".join(["ipadapter_modules"] + [layer_id] + names[1:])
|
113 |
+
state_dict_[name_] = state_dict["ip_adapter"][name]
|
114 |
+
for name in state_dict["image_proj"]:
|
115 |
+
name_ = "image_proj." + name
|
116 |
+
state_dict_[name_] = state_dict["image_proj"][name]
|
117 |
+
return state_dict_
|
118 |
+
|
119 |
+
def from_civitai(self, state_dict):
|
120 |
+
return self.from_diffusers(state_dict)
|
121 |
+
|
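The forward method above turns one pooled image embedding into per-attention-layer key/value tensors, keyed first by UNet block id and then by transformer id, with the guidance scale attached alongside. A consumption sketch (the embedding shape and the lookup loop are assumptions about how the UNet side reads this dictionary):

import torch
from diffsynth.models.sdxl_ipadapter import SDXLIpAdapter

ipadapter = SDXLIpAdapter()
image_emb = torch.randn(1, 1280)      # hypothetical pooled CLIP image embedding (ViT-bigG, 1280-dim)
ip_kv = ipadapter(image_emb, scale=0.6)

for block_id, per_transformer in ip_kv.items():
    for transformer_id, kv in per_transformer.items():
        ip_k, ip_v, scale = kv["ip_k"], kv["ip_v"], kv["scale"]
        # the UNet attention presumably adds scale * Attention(q, ip_k, ip_v) on top of the text branch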
diffsynth/models/sdxl_motion.py
ADDED
@@ -0,0 +1,103 @@
1 |
+
from .sd_motion import TemporalBlock
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
|
6 |
+
class SDXLMotionModel(torch.nn.Module):
|
7 |
+
def __init__(self):
|
8 |
+
super().__init__()
|
9 |
+
self.motion_modules = torch.nn.ModuleList([
|
10 |
+
TemporalBlock(8, 320//8, 320, eps=1e-6),
|
11 |
+
TemporalBlock(8, 320//8, 320, eps=1e-6),
|
12 |
+
|
13 |
+
TemporalBlock(8, 640//8, 640, eps=1e-6),
|
14 |
+
TemporalBlock(8, 640//8, 640, eps=1e-6),
|
15 |
+
|
16 |
+
TemporalBlock(8, 1280//8, 1280, eps=1e-6),
|
17 |
+
TemporalBlock(8, 1280//8, 1280, eps=1e-6),
|
18 |
+
|
19 |
+
TemporalBlock(8, 1280//8, 1280, eps=1e-6),
|
20 |
+
TemporalBlock(8, 1280//8, 1280, eps=1e-6),
|
21 |
+
TemporalBlock(8, 1280//8, 1280, eps=1e-6),
|
22 |
+
|
23 |
+
TemporalBlock(8, 640//8, 640, eps=1e-6),
|
24 |
+
TemporalBlock(8, 640//8, 640, eps=1e-6),
|
25 |
+
TemporalBlock(8, 640//8, 640, eps=1e-6),
|
26 |
+
|
27 |
+
TemporalBlock(8, 320//8, 320, eps=1e-6),
|
28 |
+
TemporalBlock(8, 320//8, 320, eps=1e-6),
|
29 |
+
TemporalBlock(8, 320//8, 320, eps=1e-6),
|
30 |
+
])
|
31 |
+
self.call_block_id = {
|
32 |
+
0: 0,
|
33 |
+
2: 1,
|
34 |
+
7: 2,
|
35 |
+
10: 3,
|
36 |
+
15: 4,
|
37 |
+
18: 5,
|
38 |
+
25: 6,
|
39 |
+
28: 7,
|
40 |
+
31: 8,
|
41 |
+
35: 9,
|
42 |
+
38: 10,
|
43 |
+
41: 11,
|
44 |
+
44: 12,
|
45 |
+
46: 13,
|
46 |
+
48: 14,
|
47 |
+
}
|
48 |
+
|
49 |
+
def forward(self):
|
50 |
+
pass
|
51 |
+
|
52 |
+
def state_dict_converter(self):
|
53 |
+
return SDMotionModelStateDictConverter()
|
54 |
+
|
55 |
+
|
56 |
+
class SDMotionModelStateDictConverter:
|
57 |
+
def __init__(self):
|
58 |
+
pass
|
59 |
+
|
60 |
+
def from_diffusers(self, state_dict):
|
61 |
+
rename_dict = {
|
62 |
+
"norm": "norm",
|
63 |
+
"proj_in": "proj_in",
|
64 |
+
"transformer_blocks.0.attention_blocks.0.to_q": "transformer_blocks.0.attn1.to_q",
|
65 |
+
"transformer_blocks.0.attention_blocks.0.to_k": "transformer_blocks.0.attn1.to_k",
|
66 |
+
"transformer_blocks.0.attention_blocks.0.to_v": "transformer_blocks.0.attn1.to_v",
|
67 |
+
"transformer_blocks.0.attention_blocks.0.to_out.0": "transformer_blocks.0.attn1.to_out",
|
68 |
+
"transformer_blocks.0.attention_blocks.0.pos_encoder": "transformer_blocks.0.pe1",
|
69 |
+
"transformer_blocks.0.attention_blocks.1.to_q": "transformer_blocks.0.attn2.to_q",
|
70 |
+
"transformer_blocks.0.attention_blocks.1.to_k": "transformer_blocks.0.attn2.to_k",
|
71 |
+
"transformer_blocks.0.attention_blocks.1.to_v": "transformer_blocks.0.attn2.to_v",
|
72 |
+
"transformer_blocks.0.attention_blocks.1.to_out.0": "transformer_blocks.0.attn2.to_out",
|
73 |
+
"transformer_blocks.0.attention_blocks.1.pos_encoder": "transformer_blocks.0.pe2",
|
74 |
+
"transformer_blocks.0.norms.0": "transformer_blocks.0.norm1",
|
75 |
+
"transformer_blocks.0.norms.1": "transformer_blocks.0.norm2",
|
76 |
+
"transformer_blocks.0.ff.net.0.proj": "transformer_blocks.0.act_fn.proj",
|
77 |
+
"transformer_blocks.0.ff.net.2": "transformer_blocks.0.ff",
|
78 |
+
"transformer_blocks.0.ff_norm": "transformer_blocks.0.norm3",
|
79 |
+
"proj_out": "proj_out",
|
80 |
+
}
|
81 |
+
name_list = sorted([i for i in state_dict if i.startswith("down_blocks.")])
|
82 |
+
name_list += sorted([i for i in state_dict if i.startswith("mid_block.")])
|
83 |
+
name_list += sorted([i for i in state_dict if i.startswith("up_blocks.")])
|
84 |
+
state_dict_ = {}
|
85 |
+
last_prefix, module_id = "", -1
|
86 |
+
for name in name_list:
|
87 |
+
names = name.split(".")
|
88 |
+
prefix_index = names.index("temporal_transformer") + 1
|
89 |
+
prefix = ".".join(names[:prefix_index])
|
90 |
+
if prefix != last_prefix:
|
91 |
+
last_prefix = prefix
|
92 |
+
module_id += 1
|
93 |
+
middle_name = ".".join(names[prefix_index:-1])
|
94 |
+
suffix = names[-1]
|
95 |
+
if "pos_encoder" in names:
|
96 |
+
rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name]])
|
97 |
+
else:
|
98 |
+
rename = ".".join(["motion_modules", str(module_id), rename_dict[middle_name], suffix])
|
99 |
+
state_dict_[rename] = state_dict[name]
|
100 |
+
return state_dict_
|
101 |
+
|
102 |
+
def from_civitai(self, state_dict):
|
103 |
+
return self.from_diffusers(state_dict)
|
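The SDMotionModelStateDictConverter above reconstructs the module index from the order in which down/mid/up temporal_transformer prefixes appear, then maps the inner names through rename_dict. A tiny worked example of that renaming (the key below is a hypothetical AnimateDiff-style checkpoint entry; the tensor shape is arbitrary, since only the key is rewritten):

import torch
from diffsynth.models.sdxl_motion import SDMotionModelStateDictConverter

converter = SDMotionModelStateDictConverter()
example = {"down_blocks.0.motion_modules.0.temporal_transformer.proj_in.weight": torch.zeros(320, 320)}
print(list(converter.from_diffusers(example).keys()))
# ['motion_modules.0.proj_in.weight']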
diffsynth/models/sdxl_text_encoder.py
ADDED
@@ -0,0 +1,757 @@
1 |
+
import torch
|
2 |
+
from .sd_text_encoder import CLIPEncoderLayer
|
3 |
+
|
4 |
+
|
5 |
+
class SDXLTextEncoder(torch.nn.Module):
|
6 |
+
def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=11, encoder_intermediate_size=3072):
|
7 |
+
super().__init__()
|
8 |
+
|
9 |
+
# token_embedding
|
10 |
+
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
|
11 |
+
|
12 |
+
# position_embeds (This is a fixed tensor)
|
13 |
+
self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
|
14 |
+
|
15 |
+
# encoders
|
16 |
+
self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
|
17 |
+
|
18 |
+
# attn_mask
|
19 |
+
self.attn_mask = self.attention_mask(max_position_embeddings)
|
20 |
+
|
21 |
+
# The text encoder differs from the one in Stable Diffusion 1.x.
|
22 |
+
# It does not include final_layer_norm.
|
23 |
+
|
24 |
+
def attention_mask(self, length):
|
25 |
+
mask = torch.empty(length, length)
|
26 |
+
mask.fill_(float("-inf"))
|
27 |
+
mask.triu_(1)
|
28 |
+
return mask
|
29 |
+
|
30 |
+
def forward(self, input_ids, clip_skip=1):
|
31 |
+
embeds = self.token_embedding(input_ids) + self.position_embeds
|
32 |
+
attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
|
33 |
+
for encoder_id, encoder in enumerate(self.encoders):
|
34 |
+
embeds = encoder(embeds, attn_mask=attn_mask)
|
35 |
+
if encoder_id + clip_skip == len(self.encoders):
|
36 |
+
break
|
37 |
+
return embeds
|
38 |
+
|
39 |
+
def state_dict_converter(self):
|
40 |
+
return SDXLTextEncoderStateDictConverter()
|
41 |
+
|
42 |
+
|
43 |
+
class SDXLTextEncoder2(torch.nn.Module):
|
44 |
+
def __init__(self, embed_dim=1280, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=32, encoder_intermediate_size=5120):
|
45 |
+
super().__init__()
|
46 |
+
|
47 |
+
# token_embedding
|
48 |
+
self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
|
49 |
+
|
50 |
+
# position_embeds (This is a fixed tensor)
|
51 |
+
self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
|
52 |
+
|
53 |
+
# encoders
|
54 |
+
self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=20, head_dim=64, use_quick_gelu=False) for _ in range(num_encoder_layers)])
|
55 |
+
|
56 |
+
# attn_mask
|
57 |
+
self.attn_mask = self.attention_mask(max_position_embeddings)
|
58 |
+
|
59 |
+
# final_layer_norm
|
60 |
+
self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
|
61 |
+
|
62 |
+
# text_projection
|
63 |
+
self.text_projection = torch.nn.Linear(embed_dim, embed_dim, bias=False)
|
64 |
+
|
65 |
+
def attention_mask(self, length):
|
66 |
+
mask = torch.empty(length, length)
|
67 |
+
mask.fill_(float("-inf"))
|
68 |
+
mask.triu_(1)
|
69 |
+
return mask
|
70 |
+
|
71 |
+
def forward(self, input_ids, clip_skip=2):
|
72 |
+
embeds = self.token_embedding(input_ids) + self.position_embeds
|
73 |
+
attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
|
74 |
+
for encoder_id, encoder in enumerate(self.encoders):
|
75 |
+
embeds = encoder(embeds, attn_mask=attn_mask)
|
76 |
+
if encoder_id + clip_skip == len(self.encoders):
|
77 |
+
hidden_states = embeds
|
78 |
+
embeds = self.final_layer_norm(embeds)
|
79 |
+
pooled_embeds = embeds[torch.arange(embeds.shape[0]), input_ids.to(dtype=torch.int).argmax(dim=-1)]
|
80 |
+
pooled_embeds = self.text_projection(pooled_embeds)
|
81 |
+
return pooled_embeds, hidden_states
|
82 |
+
|
83 |
+
def state_dict_converter(self):
|
84 |
+
return SDXLTextEncoder2StateDictConverter()
|
85 |
+
|
86 |
+
|
87 |
+
class SDXLTextEncoderStateDictConverter:
|
88 |
+
def __init__(self):
|
89 |
+
pass
|
90 |
+
|
91 |
+
def from_diffusers(self, state_dict):
|
92 |
+
rename_dict = {
|
93 |
+
"text_model.embeddings.token_embedding.weight": "token_embedding.weight",
|
94 |
+
"text_model.embeddings.position_embedding.weight": "position_embeds",
|
95 |
+
"text_model.final_layer_norm.weight": "final_layer_norm.weight",
|
96 |
+
"text_model.final_layer_norm.bias": "final_layer_norm.bias"
|
97 |
+
}
|
98 |
+
attn_rename_dict = {
|
99 |
+
"self_attn.q_proj": "attn.to_q",
|
100 |
+
"self_attn.k_proj": "attn.to_k",
|
101 |
+
"self_attn.v_proj": "attn.to_v",
|
102 |
+
"self_attn.out_proj": "attn.to_out",
|
103 |
+
"layer_norm1": "layer_norm1",
|
104 |
+
"layer_norm2": "layer_norm2",
|
105 |
+
"mlp.fc1": "fc1",
|
106 |
+
"mlp.fc2": "fc2",
|
107 |
+
}
|
108 |
+
state_dict_ = {}
|
109 |
+
for name in state_dict:
|
110 |
+
if name in rename_dict:
|
111 |
+
param = state_dict[name]
|
112 |
+
if name == "text_model.embeddings.position_embedding.weight":
|
113 |
+
param = param.reshape((1, param.shape[0], param.shape[1]))
|
114 |
+
state_dict_[rename_dict[name]] = param
|
115 |
+
elif name.startswith("text_model.encoder.layers."):
|
116 |
+
param = state_dict[name]
|
117 |
+
names = name.split(".")
|
118 |
+
layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
|
119 |
+
name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
|
120 |
+
state_dict_[name_] = param
|
121 |
+
return state_dict_
|
122 |
+
|
123 |
+
def from_civitai(self, state_dict):
|
124 |
+
rename_dict = {
|
125 |
+
"conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight": "position_embeds",
|
126 |
+
"conditioner.embedders.0.transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
|
127 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm1.bias": "encoders.0.layer_norm1.bias",
|
128 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm1.weight": "encoders.0.layer_norm1.weight",
|
129 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm2.bias": "encoders.0.layer_norm2.bias",
|
130 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.layer_norm2.weight": "encoders.0.layer_norm2.weight",
|
131 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc1.bias": "encoders.0.fc1.bias",
|
132 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc1.weight": "encoders.0.fc1.weight",
|
133 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc2.bias": "encoders.0.fc2.bias",
|
134 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.mlp.fc2.weight": "encoders.0.fc2.weight",
|
135 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias": "encoders.0.attn.to_k.bias",
|
136 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight": "encoders.0.attn.to_k.weight",
|
137 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias": "encoders.0.attn.to_out.bias",
|
138 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight": "encoders.0.attn.to_out.weight",
|
139 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias": "encoders.0.attn.to_q.bias",
|
140 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight": "encoders.0.attn.to_q.weight",
|
141 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias": "encoders.0.attn.to_v.bias",
|
142 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight": "encoders.0.attn.to_v.weight",
|
143 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm1.bias": "encoders.1.layer_norm1.bias",
|
144 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm1.weight": "encoders.1.layer_norm1.weight",
|
145 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm2.bias": "encoders.1.layer_norm2.bias",
|
146 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.layer_norm2.weight": "encoders.1.layer_norm2.weight",
|
147 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc1.bias": "encoders.1.fc1.bias",
|
148 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc1.weight": "encoders.1.fc1.weight",
|
149 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc2.bias": "encoders.1.fc2.bias",
|
150 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.mlp.fc2.weight": "encoders.1.fc2.weight",
|
151 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias": "encoders.1.attn.to_k.bias",
|
152 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight": "encoders.1.attn.to_k.weight",
|
153 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias": "encoders.1.attn.to_out.bias",
|
154 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight": "encoders.1.attn.to_out.weight",
|
155 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias": "encoders.1.attn.to_q.bias",
|
156 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight": "encoders.1.attn.to_q.weight",
|
157 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias": "encoders.1.attn.to_v.bias",
|
158 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight": "encoders.1.attn.to_v.weight",
|
159 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm1.bias": "encoders.10.layer_norm1.bias",
|
160 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm1.weight": "encoders.10.layer_norm1.weight",
|
161 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm2.bias": "encoders.10.layer_norm2.bias",
|
162 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.layer_norm2.weight": "encoders.10.layer_norm2.weight",
|
163 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc1.bias": "encoders.10.fc1.bias",
|
164 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc1.weight": "encoders.10.fc1.weight",
|
165 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc2.bias": "encoders.10.fc2.bias",
|
166 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.mlp.fc2.weight": "encoders.10.fc2.weight",
|
167 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias": "encoders.10.attn.to_k.bias",
|
168 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight": "encoders.10.attn.to_k.weight",
|
169 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias": "encoders.10.attn.to_out.bias",
|
170 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight": "encoders.10.attn.to_out.weight",
|
171 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias": "encoders.10.attn.to_q.bias",
|
172 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight": "encoders.10.attn.to_q.weight",
|
173 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias": "encoders.10.attn.to_v.bias",
|
174 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight": "encoders.10.attn.to_v.weight",
|
175 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm1.bias": "encoders.2.layer_norm1.bias",
|
176 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm1.weight": "encoders.2.layer_norm1.weight",
|
177 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm2.bias": "encoders.2.layer_norm2.bias",
|
178 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.layer_norm2.weight": "encoders.2.layer_norm2.weight",
|
179 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc1.bias": "encoders.2.fc1.bias",
|
180 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc1.weight": "encoders.2.fc1.weight",
|
181 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc2.bias": "encoders.2.fc2.bias",
|
182 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.mlp.fc2.weight": "encoders.2.fc2.weight",
|
183 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias": "encoders.2.attn.to_k.bias",
|
184 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight": "encoders.2.attn.to_k.weight",
|
185 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias": "encoders.2.attn.to_out.bias",
|
186 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight": "encoders.2.attn.to_out.weight",
|
187 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias": "encoders.2.attn.to_q.bias",
|
188 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight": "encoders.2.attn.to_q.weight",
|
189 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias": "encoders.2.attn.to_v.bias",
|
190 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight": "encoders.2.attn.to_v.weight",
|
191 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm1.bias": "encoders.3.layer_norm1.bias",
|
192 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm1.weight": "encoders.3.layer_norm1.weight",
|
193 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm2.bias": "encoders.3.layer_norm2.bias",
|
194 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.layer_norm2.weight": "encoders.3.layer_norm2.weight",
|
195 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc1.bias": "encoders.3.fc1.bias",
|
196 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc1.weight": "encoders.3.fc1.weight",
|
197 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc2.bias": "encoders.3.fc2.bias",
|
198 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.mlp.fc2.weight": "encoders.3.fc2.weight",
|
199 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias": "encoders.3.attn.to_k.bias",
|
200 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight": "encoders.3.attn.to_k.weight",
|
201 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias": "encoders.3.attn.to_out.bias",
|
202 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight": "encoders.3.attn.to_out.weight",
|
203 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias": "encoders.3.attn.to_q.bias",
|
204 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight": "encoders.3.attn.to_q.weight",
|
205 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias": "encoders.3.attn.to_v.bias",
|
206 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight": "encoders.3.attn.to_v.weight",
|
207 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm1.bias": "encoders.4.layer_norm1.bias",
|
208 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm1.weight": "encoders.4.layer_norm1.weight",
|
209 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm2.bias": "encoders.4.layer_norm2.bias",
|
210 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.layer_norm2.weight": "encoders.4.layer_norm2.weight",
|
211 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc1.bias": "encoders.4.fc1.bias",
|
212 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc1.weight": "encoders.4.fc1.weight",
|
213 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc2.bias": "encoders.4.fc2.bias",
|
214 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.mlp.fc2.weight": "encoders.4.fc2.weight",
|
215 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias": "encoders.4.attn.to_k.bias",
|
216 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight": "encoders.4.attn.to_k.weight",
|
217 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias": "encoders.4.attn.to_out.bias",
|
218 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight": "encoders.4.attn.to_out.weight",
|
219 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias": "encoders.4.attn.to_q.bias",
|
220 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight": "encoders.4.attn.to_q.weight",
|
221 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias": "encoders.4.attn.to_v.bias",
|
222 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight": "encoders.4.attn.to_v.weight",
|
223 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm1.bias": "encoders.5.layer_norm1.bias",
|
224 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm1.weight": "encoders.5.layer_norm1.weight",
|
225 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm2.bias": "encoders.5.layer_norm2.bias",
|
226 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.layer_norm2.weight": "encoders.5.layer_norm2.weight",
|
227 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc1.bias": "encoders.5.fc1.bias",
|
228 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc1.weight": "encoders.5.fc1.weight",
|
229 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc2.bias": "encoders.5.fc2.bias",
|
230 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.mlp.fc2.weight": "encoders.5.fc2.weight",
|
231 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias": "encoders.5.attn.to_k.bias",
|
232 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight": "encoders.5.attn.to_k.weight",
|
233 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias": "encoders.5.attn.to_out.bias",
|
234 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight": "encoders.5.attn.to_out.weight",
|
235 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias": "encoders.5.attn.to_q.bias",
|
236 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight": "encoders.5.attn.to_q.weight",
|
237 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias": "encoders.5.attn.to_v.bias",
|
238 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight": "encoders.5.attn.to_v.weight",
|
239 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm1.bias": "encoders.6.layer_norm1.bias",
|
240 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm1.weight": "encoders.6.layer_norm1.weight",
|
241 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm2.bias": "encoders.6.layer_norm2.bias",
|
242 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.layer_norm2.weight": "encoders.6.layer_norm2.weight",
|
243 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc1.bias": "encoders.6.fc1.bias",
|
244 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc1.weight": "encoders.6.fc1.weight",
|
245 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc2.bias": "encoders.6.fc2.bias",
|
246 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.mlp.fc2.weight": "encoders.6.fc2.weight",
|
247 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias": "encoders.6.attn.to_k.bias",
|
248 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight": "encoders.6.attn.to_k.weight",
|
249 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias": "encoders.6.attn.to_out.bias",
|
250 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight": "encoders.6.attn.to_out.weight",
|
251 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias": "encoders.6.attn.to_q.bias",
|
252 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight": "encoders.6.attn.to_q.weight",
|
253 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias": "encoders.6.attn.to_v.bias",
|
254 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight": "encoders.6.attn.to_v.weight",
|
255 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm1.bias": "encoders.7.layer_norm1.bias",
|
256 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm1.weight": "encoders.7.layer_norm1.weight",
|
257 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm2.bias": "encoders.7.layer_norm2.bias",
|
258 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.layer_norm2.weight": "encoders.7.layer_norm2.weight",
|
259 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc1.bias": "encoders.7.fc1.bias",
|
260 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc1.weight": "encoders.7.fc1.weight",
|
261 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc2.bias": "encoders.7.fc2.bias",
|
262 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.mlp.fc2.weight": "encoders.7.fc2.weight",
|
263 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias": "encoders.7.attn.to_k.bias",
|
264 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight": "encoders.7.attn.to_k.weight",
|
265 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias": "encoders.7.attn.to_out.bias",
|
266 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight": "encoders.7.attn.to_out.weight",
|
267 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias": "encoders.7.attn.to_q.bias",
|
268 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight": "encoders.7.attn.to_q.weight",
|
269 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias": "encoders.7.attn.to_v.bias",
|
270 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight": "encoders.7.attn.to_v.weight",
|
271 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm1.bias": "encoders.8.layer_norm1.bias",
|
272 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm1.weight": "encoders.8.layer_norm1.weight",
|
273 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm2.bias": "encoders.8.layer_norm2.bias",
|
274 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.layer_norm2.weight": "encoders.8.layer_norm2.weight",
|
275 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc1.bias": "encoders.8.fc1.bias",
|
276 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc1.weight": "encoders.8.fc1.weight",
|
277 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc2.bias": "encoders.8.fc2.bias",
|
278 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.mlp.fc2.weight": "encoders.8.fc2.weight",
|
279 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias": "encoders.8.attn.to_k.bias",
|
280 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight": "encoders.8.attn.to_k.weight",
|
281 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias": "encoders.8.attn.to_out.bias",
|
282 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight": "encoders.8.attn.to_out.weight",
|
283 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias": "encoders.8.attn.to_q.bias",
|
284 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight": "encoders.8.attn.to_q.weight",
|
285 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias": "encoders.8.attn.to_v.bias",
|
286 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight": "encoders.8.attn.to_v.weight",
|
287 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm1.bias": "encoders.9.layer_norm1.bias",
|
288 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm1.weight": "encoders.9.layer_norm1.weight",
|
289 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm2.bias": "encoders.9.layer_norm2.bias",
|
290 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.layer_norm2.weight": "encoders.9.layer_norm2.weight",
|
291 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc1.bias": "encoders.9.fc1.bias",
|
292 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc1.weight": "encoders.9.fc1.weight",
|
293 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc2.bias": "encoders.9.fc2.bias",
|
294 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.mlp.fc2.weight": "encoders.9.fc2.weight",
|
295 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias": "encoders.9.attn.to_k.bias",
|
296 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight": "encoders.9.attn.to_k.weight",
|
297 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias": "encoders.9.attn.to_out.bias",
|
298 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight": "encoders.9.attn.to_out.weight",
|
299 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias": "encoders.9.attn.to_q.bias",
|
300 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight": "encoders.9.attn.to_q.weight",
|
301 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias": "encoders.9.attn.to_v.bias",
|
302 |
+
"conditioner.embedders.0.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight": "encoders.9.attn.to_v.weight",
|
303 |
+
}
|
304 |
+
state_dict_ = {}
|
305 |
+
for name in state_dict:
|
306 |
+
if name in rename_dict:
|
307 |
+
param = state_dict[name]
|
308 |
+
if name == "conditioner.embedders.0.transformer.text_model.embeddings.position_embedding.weight":
|
309 |
+
param = param.reshape((1, param.shape[0], param.shape[1]))
|
310 |
+
state_dict_[rename_dict[name]] = param
|
311 |
+
return state_dict_
+
+
+class SDXLTextEncoder2StateDictConverter:
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        rename_dict = {
+            "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "text_model.embeddings.position_embedding.weight": "position_embeds",
+            "text_model.final_layer_norm.weight": "final_layer_norm.weight",
+            "text_model.final_layer_norm.bias": "final_layer_norm.bias",
+            "text_projection.weight": "text_projection.weight"
+        }
+        attn_rename_dict = {
+            "self_attn.q_proj": "attn.to_q",
+            "self_attn.k_proj": "attn.to_k",
+            "self_attn.v_proj": "attn.to_v",
+            "self_attn.out_proj": "attn.to_out",
+            "layer_norm1": "layer_norm1",
+            "layer_norm2": "layer_norm2",
+            "mlp.fc1": "fc1",
+            "mlp.fc2": "fc2",
+        }
+        state_dict_ = {}
+        for name in state_dict:
+            if name in rename_dict:
+                param = state_dict[name]
+                if name == "text_model.embeddings.position_embedding.weight":
+                    param = param.reshape((1, param.shape[0], param.shape[1]))
+                state_dict_[rename_dict[name]] = param
+            elif name.startswith("text_model.encoder.layers."):
+                param = state_dict[name]
+                names = name.split(".")
+                layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
+                name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
+                state_dict_[name_] = param
+        return state_dict_
+
+    def from_civitai(self, state_dict):
+        rename_dict = {
+            "conditioner.embedders.1.model.ln_final.bias": "final_layer_norm.bias",
+            "conditioner.embedders.1.model.ln_final.weight": "final_layer_norm.weight",
+            "conditioner.embedders.1.model.positional_embedding": "position_embeds",
+            "conditioner.embedders.1.model.token_embedding.weight": "token_embedding.weight",
+        }
+        # Transformer resblocks 0-31 all share the same per-block key pattern; the fused
+        # attention in_proj tensors map to the three separate q/k/v target names.
+        block_rename_dict = {
+            "attn.in_proj_bias": ["attn.to_q.bias", "attn.to_k.bias", "attn.to_v.bias"],
+            "attn.in_proj_weight": ["attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight"],
+            "attn.out_proj.bias": "attn.to_out.bias",
+            "attn.out_proj.weight": "attn.to_out.weight",
+            "ln_1.bias": "layer_norm1.bias",
+            "ln_1.weight": "layer_norm1.weight",
+            "ln_2.bias": "layer_norm2.bias",
+            "ln_2.weight": "layer_norm2.weight",
+            "mlp.c_fc.bias": "fc1.bias",
+            "mlp.c_fc.weight": "fc1.weight",
+            "mlp.c_proj.bias": "fc2.bias",
+            "mlp.c_proj.weight": "fc2.weight",
+        }
+        for block_id in range(32):
+            for src, dst in block_rename_dict.items():
+                src_name = f"conditioner.embedders.1.model.transformer.resblocks.{block_id}.{src}"
+                if isinstance(dst, list):
+                    rename_dict[src_name] = [f"encoders.{block_id}.{target}" for target in dst]
+                else:
+                    rename_dict[src_name] = f"encoders.{block_id}.{dst}"
|
702 |
+
"conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_fc.weight": "encoders.6.fc1.weight",
|
703 |
+
"conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_proj.bias": "encoders.6.fc2.bias",
|
704 |
+
"conditioner.embedders.1.model.transformer.resblocks.6.mlp.c_proj.weight": "encoders.6.fc2.weight",
|
705 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.attn.in_proj_bias": ['encoders.7.attn.to_q.bias', 'encoders.7.attn.to_k.bias', 'encoders.7.attn.to_v.bias'],
|
706 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.attn.in_proj_weight": ['encoders.7.attn.to_q.weight', 'encoders.7.attn.to_k.weight', 'encoders.7.attn.to_v.weight'],
|
707 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.attn.out_proj.bias": "encoders.7.attn.to_out.bias",
|
708 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.attn.out_proj.weight": "encoders.7.attn.to_out.weight",
|
709 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.ln_1.bias": "encoders.7.layer_norm1.bias",
|
710 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.ln_1.weight": "encoders.7.layer_norm1.weight",
|
711 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.ln_2.bias": "encoders.7.layer_norm2.bias",
|
712 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.ln_2.weight": "encoders.7.layer_norm2.weight",
|
713 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_fc.bias": "encoders.7.fc1.bias",
|
714 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_fc.weight": "encoders.7.fc1.weight",
|
715 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_proj.bias": "encoders.7.fc2.bias",
|
716 |
+
"conditioner.embedders.1.model.transformer.resblocks.7.mlp.c_proj.weight": "encoders.7.fc2.weight",
|
717 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.attn.in_proj_bias": ['encoders.8.attn.to_q.bias', 'encoders.8.attn.to_k.bias', 'encoders.8.attn.to_v.bias'],
|
718 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.attn.in_proj_weight": ['encoders.8.attn.to_q.weight', 'encoders.8.attn.to_k.weight', 'encoders.8.attn.to_v.weight'],
|
719 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.attn.out_proj.bias": "encoders.8.attn.to_out.bias",
|
720 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.attn.out_proj.weight": "encoders.8.attn.to_out.weight",
|
721 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.ln_1.bias": "encoders.8.layer_norm1.bias",
|
722 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.ln_1.weight": "encoders.8.layer_norm1.weight",
|
723 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.ln_2.bias": "encoders.8.layer_norm2.bias",
|
724 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.ln_2.weight": "encoders.8.layer_norm2.weight",
|
725 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_fc.bias": "encoders.8.fc1.bias",
|
726 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_fc.weight": "encoders.8.fc1.weight",
|
727 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_proj.bias": "encoders.8.fc2.bias",
|
728 |
+
"conditioner.embedders.1.model.transformer.resblocks.8.mlp.c_proj.weight": "encoders.8.fc2.weight",
|
729 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.attn.in_proj_bias": ['encoders.9.attn.to_q.bias', 'encoders.9.attn.to_k.bias', 'encoders.9.attn.to_v.bias'],
|
730 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.attn.in_proj_weight": ['encoders.9.attn.to_q.weight', 'encoders.9.attn.to_k.weight', 'encoders.9.attn.to_v.weight'],
|
731 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.attn.out_proj.bias": "encoders.9.attn.to_out.bias",
|
732 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.attn.out_proj.weight": "encoders.9.attn.to_out.weight",
|
733 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.ln_1.bias": "encoders.9.layer_norm1.bias",
|
734 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.ln_1.weight": "encoders.9.layer_norm1.weight",
|
735 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.ln_2.bias": "encoders.9.layer_norm2.bias",
|
736 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.ln_2.weight": "encoders.9.layer_norm2.weight",
|
737 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_fc.bias": "encoders.9.fc1.bias",
|
738 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_fc.weight": "encoders.9.fc1.weight",
|
739 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias": "encoders.9.fc2.bias",
|
740 |
+
"conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.weight": "encoders.9.fc2.weight",
|
741 |
+
"conditioner.embedders.1.model.text_projection": "text_projection.weight",
|
742 |
+
}
|
743 |
+
state_dict_ = {}
|
744 |
+
for name in state_dict:
|
745 |
+
if name in rename_dict:
|
746 |
+
param = state_dict[name]
|
747 |
+
if name == "conditioner.embedders.1.model.positional_embedding":
|
748 |
+
param = param.reshape((1, param.shape[0], param.shape[1]))
|
749 |
+
elif name == "conditioner.embedders.1.model.text_projection":
|
750 |
+
param = param.T
|
751 |
+
if isinstance(rename_dict[name], str):
|
752 |
+
state_dict_[rename_dict[name]] = param
|
753 |
+
else:
|
754 |
+
length = param.shape[0] // 3
|
755 |
+
for i, rename in enumerate(rename_dict[name]):
|
756 |
+
state_dict_[rename] = param[i*length: i*length+length]
|
757 |
+
return state_dict_
|
diffsynth/models/sdxl_unet.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
diffsynth/models/sdxl_vae_decoder.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .sd_vae_decoder import SDVAEDecoder, SDVAEDecoderStateDictConverter
|
2 |
+
|
3 |
+
|
4 |
+
class SDXLVAEDecoder(SDVAEDecoder):
|
5 |
+
def __init__(self):
|
6 |
+
super().__init__()
|
7 |
+
self.scaling_factor = 0.13025
|
8 |
+
|
9 |
+
def state_dict_converter(self):
|
10 |
+
return SDXLVAEDecoderStateDictConverter()
|
11 |
+
|
12 |
+
|
13 |
+
class SDXLVAEDecoderStateDictConverter(SDVAEDecoderStateDictConverter):
|
14 |
+
def __init__(self):
|
15 |
+
super().__init__()
|
diffsynth/models/sdxl_vae_encoder.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .sd_vae_encoder import SDVAEEncoderStateDictConverter, SDVAEEncoder
|
2 |
+
|
3 |
+
|
4 |
+
class SDXLVAEEncoder(SDVAEEncoder):
|
5 |
+
def __init__(self):
|
6 |
+
super().__init__()
|
7 |
+
self.scaling_factor = 0.13025
|
8 |
+
|
9 |
+
def state_dict_converter(self):
|
10 |
+
return SDXLVAEEncoderStateDictConverter()
|
11 |
+
|
12 |
+
|
13 |
+
class SDXLVAEEncoderStateDictConverter(SDVAEEncoderStateDictConverter):
|
14 |
+
def __init__(self):
|
15 |
+
super().__init__()
|
diffsynth/models/svd_image_encoder.py
ADDED
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from .sd_text_encoder import CLIPEncoderLayer
|
3 |
+
|
4 |
+
|
5 |
+
class CLIPVisionEmbeddings(torch.nn.Module):
|
6 |
+
def __init__(self, embed_dim=1280, image_size=224, patch_size=14, num_channels=3):
|
7 |
+
super().__init__()
|
8 |
+
|
9 |
+
# class_embeds (This is a fixed tensor)
|
10 |
+
self.class_embedding = torch.nn.Parameter(torch.randn(1, 1, embed_dim))
|
11 |
+
|
12 |
+
# position_embeds
|
13 |
+
self.patch_embedding = torch.nn.Conv2d(in_channels=num_channels, out_channels=embed_dim, kernel_size=patch_size, stride=patch_size, bias=False)
|
14 |
+
|
15 |
+
# position_embeds (This is a fixed tensor)
|
16 |
+
self.position_embeds = torch.nn.Parameter(torch.zeros(1, (image_size // patch_size) ** 2 + 1, embed_dim))
|
17 |
+
|
18 |
+
def forward(self, pixel_values):
|
19 |
+
batch_size = pixel_values.shape[0]
|
20 |
+
patch_embeds = self.patch_embedding(pixel_values)
|
21 |
+
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
22 |
+
class_embeds = self.class_embedding.repeat(batch_size, 1, 1)
|
23 |
+
embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + self.position_embeds
|
24 |
+
return embeddings
|
25 |
+
|
26 |
+
|
27 |
+
class SVDImageEncoder(torch.nn.Module):
|
28 |
+
def __init__(self, embed_dim=1280, layer_norm_eps=1e-5, num_encoder_layers=32, encoder_intermediate_size=5120, projection_dim=1024, num_heads=16, head_dim=80):
|
29 |
+
super().__init__()
|
30 |
+
self.embeddings = CLIPVisionEmbeddings(embed_dim=embed_dim)
|
31 |
+
self.pre_layernorm = torch.nn.LayerNorm(embed_dim, eps=layer_norm_eps)
|
32 |
+
self.encoders = torch.nn.ModuleList([
|
33 |
+
CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=num_heads, head_dim=head_dim, use_quick_gelu=False)
|
34 |
+
for _ in range(num_encoder_layers)])
|
35 |
+
self.post_layernorm = torch.nn.LayerNorm(embed_dim, eps=layer_norm_eps)
|
36 |
+
self.visual_projection = torch.nn.Linear(embed_dim, projection_dim, bias=False)
|
37 |
+
|
38 |
+
def forward(self, pixel_values):
|
39 |
+
embeds = self.embeddings(pixel_values)
|
40 |
+
embeds = self.pre_layernorm(embeds)
|
41 |
+
for encoder_id, encoder in enumerate(self.encoders):
|
42 |
+
embeds = encoder(embeds)
|
43 |
+
embeds = self.post_layernorm(embeds[:, 0, :])
|
44 |
+
embeds = self.visual_projection(embeds)
|
45 |
+
return embeds
|
46 |
+
|
47 |
+
def state_dict_converter(self):
|
48 |
+
return SVDImageEncoderStateDictConverter()
|
49 |
+
|
50 |
+
|
51 |
+
class SVDImageEncoderStateDictConverter:
|
52 |
+
def __init__(self):
|
53 |
+
pass
|
54 |
+
|
55 |
+
def from_diffusers(self, state_dict):
|
56 |
+
rename_dict = {
|
57 |
+
"vision_model.embeddings.patch_embedding.weight": "embeddings.patch_embedding.weight",
|
58 |
+
"vision_model.embeddings.class_embedding": "embeddings.class_embedding",
|
59 |
+
"vision_model.embeddings.position_embedding.weight": "embeddings.position_embeds",
|
60 |
+
"vision_model.pre_layrnorm.weight": "pre_layernorm.weight",
|
61 |
+
"vision_model.pre_layrnorm.bias": "pre_layernorm.bias",
|
62 |
+
"vision_model.post_layernorm.weight": "post_layernorm.weight",
|
63 |
+
"vision_model.post_layernorm.bias": "post_layernorm.bias",
|
64 |
+
"visual_projection.weight": "visual_projection.weight"
|
65 |
+
}
|
66 |
+
attn_rename_dict = {
|
67 |
+
"self_attn.q_proj": "attn.to_q",
|
68 |
+
"self_attn.k_proj": "attn.to_k",
|
69 |
+
"self_attn.v_proj": "attn.to_v",
|
70 |
+
"self_attn.out_proj": "attn.to_out",
|
71 |
+
"layer_norm1": "layer_norm1",
|
72 |
+
"layer_norm2": "layer_norm2",
|
73 |
+
"mlp.fc1": "fc1",
|
74 |
+
"mlp.fc2": "fc2",
|
75 |
+
}
|
76 |
+
state_dict_ = {}
|
77 |
+
for name in state_dict:
|
78 |
+
if name in rename_dict:
|
79 |
+
param = state_dict[name]
|
80 |
+
if name == "vision_model.embeddings.class_embedding":
|
81 |
+
param = state_dict[name].view(1, 1, -1)
|
82 |
+
elif name == "vision_model.embeddings.position_embedding.weight":
|
83 |
+
param = state_dict[name].unsqueeze(0)
|
84 |
+
state_dict_[rename_dict[name]] = param
|
85 |
+
elif name.startswith("vision_model.encoder.layers."):
|
86 |
+
param = state_dict[name]
|
87 |
+
names = name.split(".")
|
88 |
+
layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
|
89 |
+
name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
|
90 |
+
state_dict_[name_] = param
|
91 |
+
return state_dict_
|
92 |
+
|
93 |
+
def from_civitai(self, state_dict):
|
94 |
+
rename_dict = {
|
95 |
+
"conditioner.embedders.0.open_clip.model.visual.class_embedding": "embeddings.class_embedding",
|
96 |
+
"conditioner.embedders.0.open_clip.model.visual.conv1.weight": "embeddings.patch_embedding.weight",
|
97 |
+
"conditioner.embedders.0.open_clip.model.visual.ln_post.bias": "post_layernorm.bias",
|
98 |
+
"conditioner.embedders.0.open_clip.model.visual.ln_post.weight": "post_layernorm.weight",
|
99 |
+
"conditioner.embedders.0.open_clip.model.visual.ln_pre.bias": "pre_layernorm.bias",
|
100 |
+
"conditioner.embedders.0.open_clip.model.visual.ln_pre.weight": "pre_layernorm.weight",
|
101 |
+
"conditioner.embedders.0.open_clip.model.visual.positional_embedding": "embeddings.position_embeds",
|
102 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.in_proj_bias": ['encoders.0.attn.to_q.bias', 'encoders.0.attn.to_k.bias', 'encoders.0.attn.to_v.bias'],
|
103 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.in_proj_weight": ['encoders.0.attn.to_q.weight', 'encoders.0.attn.to_k.weight', 'encoders.0.attn.to_v.weight'],
|
104 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.out_proj.bias": "encoders.0.attn.to_out.bias",
|
105 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.attn.out_proj.weight": "encoders.0.attn.to_out.weight",
|
106 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_1.bias": "encoders.0.layer_norm1.bias",
|
107 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_1.weight": "encoders.0.layer_norm1.weight",
|
108 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_2.bias": "encoders.0.layer_norm2.bias",
|
109 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.ln_2.weight": "encoders.0.layer_norm2.weight",
|
110 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_fc.bias": "encoders.0.fc1.bias",
|
111 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_fc.weight": "encoders.0.fc1.weight",
|
112 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_proj.bias": "encoders.0.fc2.bias",
|
113 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.0.mlp.c_proj.weight": "encoders.0.fc2.weight",
|
114 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.in_proj_bias": ['encoders.1.attn.to_q.bias', 'encoders.1.attn.to_k.bias', 'encoders.1.attn.to_v.bias'],
|
115 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.in_proj_weight": ['encoders.1.attn.to_q.weight', 'encoders.1.attn.to_k.weight', 'encoders.1.attn.to_v.weight'],
|
116 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.out_proj.bias": "encoders.1.attn.to_out.bias",
|
117 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.attn.out_proj.weight": "encoders.1.attn.to_out.weight",
|
118 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_1.bias": "encoders.1.layer_norm1.bias",
|
119 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_1.weight": "encoders.1.layer_norm1.weight",
|
120 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_2.bias": "encoders.1.layer_norm2.bias",
|
121 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.ln_2.weight": "encoders.1.layer_norm2.weight",
|
122 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_fc.bias": "encoders.1.fc1.bias",
|
123 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_fc.weight": "encoders.1.fc1.weight",
|
124 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_proj.bias": "encoders.1.fc2.bias",
|
125 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.1.mlp.c_proj.weight": "encoders.1.fc2.weight",
|
126 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.in_proj_bias": ['encoders.10.attn.to_q.bias', 'encoders.10.attn.to_k.bias', 'encoders.10.attn.to_v.bias'],
|
127 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.in_proj_weight": ['encoders.10.attn.to_q.weight', 'encoders.10.attn.to_k.weight', 'encoders.10.attn.to_v.weight'],
|
128 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.out_proj.bias": "encoders.10.attn.to_out.bias",
|
129 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.attn.out_proj.weight": "encoders.10.attn.to_out.weight",
|
130 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_1.bias": "encoders.10.layer_norm1.bias",
|
131 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_1.weight": "encoders.10.layer_norm1.weight",
|
132 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_2.bias": "encoders.10.layer_norm2.bias",
|
133 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.ln_2.weight": "encoders.10.layer_norm2.weight",
|
134 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_fc.bias": "encoders.10.fc1.bias",
|
135 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_fc.weight": "encoders.10.fc1.weight",
|
136 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_proj.bias": "encoders.10.fc2.bias",
|
137 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.10.mlp.c_proj.weight": "encoders.10.fc2.weight",
|
138 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.in_proj_bias": ['encoders.11.attn.to_q.bias', 'encoders.11.attn.to_k.bias', 'encoders.11.attn.to_v.bias'],
|
139 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.in_proj_weight": ['encoders.11.attn.to_q.weight', 'encoders.11.attn.to_k.weight', 'encoders.11.attn.to_v.weight'],
|
140 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.out_proj.bias": "encoders.11.attn.to_out.bias",
|
141 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.attn.out_proj.weight": "encoders.11.attn.to_out.weight",
|
142 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_1.bias": "encoders.11.layer_norm1.bias",
|
143 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_1.weight": "encoders.11.layer_norm1.weight",
|
144 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_2.bias": "encoders.11.layer_norm2.bias",
|
145 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.ln_2.weight": "encoders.11.layer_norm2.weight",
|
146 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_fc.bias": "encoders.11.fc1.bias",
|
147 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_fc.weight": "encoders.11.fc1.weight",
|
148 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_proj.bias": "encoders.11.fc2.bias",
|
149 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.11.mlp.c_proj.weight": "encoders.11.fc2.weight",
|
150 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.in_proj_bias": ['encoders.12.attn.to_q.bias', 'encoders.12.attn.to_k.bias', 'encoders.12.attn.to_v.bias'],
|
151 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.in_proj_weight": ['encoders.12.attn.to_q.weight', 'encoders.12.attn.to_k.weight', 'encoders.12.attn.to_v.weight'],
|
152 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.out_proj.bias": "encoders.12.attn.to_out.bias",
|
153 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.attn.out_proj.weight": "encoders.12.attn.to_out.weight",
|
154 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_1.bias": "encoders.12.layer_norm1.bias",
|
155 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_1.weight": "encoders.12.layer_norm1.weight",
|
156 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_2.bias": "encoders.12.layer_norm2.bias",
|
157 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.ln_2.weight": "encoders.12.layer_norm2.weight",
|
158 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_fc.bias": "encoders.12.fc1.bias",
|
159 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_fc.weight": "encoders.12.fc1.weight",
|
160 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_proj.bias": "encoders.12.fc2.bias",
|
161 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.12.mlp.c_proj.weight": "encoders.12.fc2.weight",
|
162 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.in_proj_bias": ['encoders.13.attn.to_q.bias', 'encoders.13.attn.to_k.bias', 'encoders.13.attn.to_v.bias'],
|
163 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.in_proj_weight": ['encoders.13.attn.to_q.weight', 'encoders.13.attn.to_k.weight', 'encoders.13.attn.to_v.weight'],
|
164 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.out_proj.bias": "encoders.13.attn.to_out.bias",
|
165 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.attn.out_proj.weight": "encoders.13.attn.to_out.weight",
|
166 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_1.bias": "encoders.13.layer_norm1.bias",
|
167 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_1.weight": "encoders.13.layer_norm1.weight",
|
168 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_2.bias": "encoders.13.layer_norm2.bias",
|
169 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.ln_2.weight": "encoders.13.layer_norm2.weight",
|
170 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_fc.bias": "encoders.13.fc1.bias",
|
171 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_fc.weight": "encoders.13.fc1.weight",
|
172 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_proj.bias": "encoders.13.fc2.bias",
|
173 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.13.mlp.c_proj.weight": "encoders.13.fc2.weight",
|
174 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.in_proj_bias": ['encoders.14.attn.to_q.bias', 'encoders.14.attn.to_k.bias', 'encoders.14.attn.to_v.bias'],
|
175 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.in_proj_weight": ['encoders.14.attn.to_q.weight', 'encoders.14.attn.to_k.weight', 'encoders.14.attn.to_v.weight'],
|
176 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.out_proj.bias": "encoders.14.attn.to_out.bias",
|
177 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.attn.out_proj.weight": "encoders.14.attn.to_out.weight",
|
178 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_1.bias": "encoders.14.layer_norm1.bias",
|
179 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_1.weight": "encoders.14.layer_norm1.weight",
|
180 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_2.bias": "encoders.14.layer_norm2.bias",
|
181 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.ln_2.weight": "encoders.14.layer_norm2.weight",
|
182 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_fc.bias": "encoders.14.fc1.bias",
|
183 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_fc.weight": "encoders.14.fc1.weight",
|
184 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_proj.bias": "encoders.14.fc2.bias",
|
185 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.14.mlp.c_proj.weight": "encoders.14.fc2.weight",
|
186 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.in_proj_bias": ['encoders.15.attn.to_q.bias', 'encoders.15.attn.to_k.bias', 'encoders.15.attn.to_v.bias'],
|
187 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.in_proj_weight": ['encoders.15.attn.to_q.weight', 'encoders.15.attn.to_k.weight', 'encoders.15.attn.to_v.weight'],
|
188 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.out_proj.bias": "encoders.15.attn.to_out.bias",
|
189 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.attn.out_proj.weight": "encoders.15.attn.to_out.weight",
|
190 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_1.bias": "encoders.15.layer_norm1.bias",
|
191 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_1.weight": "encoders.15.layer_norm1.weight",
|
192 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_2.bias": "encoders.15.layer_norm2.bias",
|
193 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.ln_2.weight": "encoders.15.layer_norm2.weight",
|
194 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_fc.bias": "encoders.15.fc1.bias",
|
195 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_fc.weight": "encoders.15.fc1.weight",
|
196 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_proj.bias": "encoders.15.fc2.bias",
|
197 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.15.mlp.c_proj.weight": "encoders.15.fc2.weight",
|
198 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.in_proj_bias": ['encoders.16.attn.to_q.bias', 'encoders.16.attn.to_k.bias', 'encoders.16.attn.to_v.bias'],
|
199 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.in_proj_weight": ['encoders.16.attn.to_q.weight', 'encoders.16.attn.to_k.weight', 'encoders.16.attn.to_v.weight'],
|
200 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.out_proj.bias": "encoders.16.attn.to_out.bias",
|
201 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.attn.out_proj.weight": "encoders.16.attn.to_out.weight",
|
202 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_1.bias": "encoders.16.layer_norm1.bias",
|
203 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_1.weight": "encoders.16.layer_norm1.weight",
|
204 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_2.bias": "encoders.16.layer_norm2.bias",
|
205 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.ln_2.weight": "encoders.16.layer_norm2.weight",
|
206 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_fc.bias": "encoders.16.fc1.bias",
|
207 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_fc.weight": "encoders.16.fc1.weight",
|
208 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_proj.bias": "encoders.16.fc2.bias",
|
209 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.16.mlp.c_proj.weight": "encoders.16.fc2.weight",
|
210 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.in_proj_bias": ['encoders.17.attn.to_q.bias', 'encoders.17.attn.to_k.bias', 'encoders.17.attn.to_v.bias'],
|
211 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.in_proj_weight": ['encoders.17.attn.to_q.weight', 'encoders.17.attn.to_k.weight', 'encoders.17.attn.to_v.weight'],
|
212 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.out_proj.bias": "encoders.17.attn.to_out.bias",
|
213 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.attn.out_proj.weight": "encoders.17.attn.to_out.weight",
|
214 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_1.bias": "encoders.17.layer_norm1.bias",
|
215 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_1.weight": "encoders.17.layer_norm1.weight",
|
216 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_2.bias": "encoders.17.layer_norm2.bias",
|
217 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.ln_2.weight": "encoders.17.layer_norm2.weight",
|
218 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_fc.bias": "encoders.17.fc1.bias",
|
219 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_fc.weight": "encoders.17.fc1.weight",
|
220 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_proj.bias": "encoders.17.fc2.bias",
|
221 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.17.mlp.c_proj.weight": "encoders.17.fc2.weight",
|
222 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.in_proj_bias": ['encoders.18.attn.to_q.bias', 'encoders.18.attn.to_k.bias', 'encoders.18.attn.to_v.bias'],
|
223 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.in_proj_weight": ['encoders.18.attn.to_q.weight', 'encoders.18.attn.to_k.weight', 'encoders.18.attn.to_v.weight'],
|
224 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.out_proj.bias": "encoders.18.attn.to_out.bias",
|
225 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.attn.out_proj.weight": "encoders.18.attn.to_out.weight",
|
226 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_1.bias": "encoders.18.layer_norm1.bias",
|
227 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_1.weight": "encoders.18.layer_norm1.weight",
|
228 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_2.bias": "encoders.18.layer_norm2.bias",
|
229 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.ln_2.weight": "encoders.18.layer_norm2.weight",
|
230 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_fc.bias": "encoders.18.fc1.bias",
|
231 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_fc.weight": "encoders.18.fc1.weight",
|
232 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_proj.bias": "encoders.18.fc2.bias",
|
233 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.18.mlp.c_proj.weight": "encoders.18.fc2.weight",
|
234 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.in_proj_bias": ['encoders.19.attn.to_q.bias', 'encoders.19.attn.to_k.bias', 'encoders.19.attn.to_v.bias'],
|
235 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.in_proj_weight": ['encoders.19.attn.to_q.weight', 'encoders.19.attn.to_k.weight', 'encoders.19.attn.to_v.weight'],
|
236 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.out_proj.bias": "encoders.19.attn.to_out.bias",
|
237 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.attn.out_proj.weight": "encoders.19.attn.to_out.weight",
|
238 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_1.bias": "encoders.19.layer_norm1.bias",
|
239 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_1.weight": "encoders.19.layer_norm1.weight",
|
240 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_2.bias": "encoders.19.layer_norm2.bias",
|
241 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.ln_2.weight": "encoders.19.layer_norm2.weight",
|
242 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_fc.bias": "encoders.19.fc1.bias",
|
243 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_fc.weight": "encoders.19.fc1.weight",
|
244 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_proj.bias": "encoders.19.fc2.bias",
|
245 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.19.mlp.c_proj.weight": "encoders.19.fc2.weight",
|
246 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.in_proj_bias": ['encoders.2.attn.to_q.bias', 'encoders.2.attn.to_k.bias', 'encoders.2.attn.to_v.bias'],
|
247 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.in_proj_weight": ['encoders.2.attn.to_q.weight', 'encoders.2.attn.to_k.weight', 'encoders.2.attn.to_v.weight'],
|
248 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.out_proj.bias": "encoders.2.attn.to_out.bias",
|
249 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.attn.out_proj.weight": "encoders.2.attn.to_out.weight",
|
250 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_1.bias": "encoders.2.layer_norm1.bias",
|
251 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_1.weight": "encoders.2.layer_norm1.weight",
|
252 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_2.bias": "encoders.2.layer_norm2.bias",
|
253 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.ln_2.weight": "encoders.2.layer_norm2.weight",
|
254 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_fc.bias": "encoders.2.fc1.bias",
|
255 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_fc.weight": "encoders.2.fc1.weight",
|
256 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_proj.bias": "encoders.2.fc2.bias",
|
257 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.2.mlp.c_proj.weight": "encoders.2.fc2.weight",
|
258 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.in_proj_bias": ['encoders.20.attn.to_q.bias', 'encoders.20.attn.to_k.bias', 'encoders.20.attn.to_v.bias'],
|
259 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.in_proj_weight": ['encoders.20.attn.to_q.weight', 'encoders.20.attn.to_k.weight', 'encoders.20.attn.to_v.weight'],
|
260 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.out_proj.bias": "encoders.20.attn.to_out.bias",
|
261 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.attn.out_proj.weight": "encoders.20.attn.to_out.weight",
|
262 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_1.bias": "encoders.20.layer_norm1.bias",
|
263 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_1.weight": "encoders.20.layer_norm1.weight",
|
264 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_2.bias": "encoders.20.layer_norm2.bias",
|
265 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.ln_2.weight": "encoders.20.layer_norm2.weight",
|
266 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_fc.bias": "encoders.20.fc1.bias",
|
267 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_fc.weight": "encoders.20.fc1.weight",
|
268 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_proj.bias": "encoders.20.fc2.bias",
|
269 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.20.mlp.c_proj.weight": "encoders.20.fc2.weight",
|
270 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.in_proj_bias": ['encoders.21.attn.to_q.bias', 'encoders.21.attn.to_k.bias', 'encoders.21.attn.to_v.bias'],
|
271 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.in_proj_weight": ['encoders.21.attn.to_q.weight', 'encoders.21.attn.to_k.weight', 'encoders.21.attn.to_v.weight'],
|
272 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.out_proj.bias": "encoders.21.attn.to_out.bias",
|
273 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.attn.out_proj.weight": "encoders.21.attn.to_out.weight",
|
274 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_1.bias": "encoders.21.layer_norm1.bias",
|
275 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_1.weight": "encoders.21.layer_norm1.weight",
|
276 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_2.bias": "encoders.21.layer_norm2.bias",
|
277 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.ln_2.weight": "encoders.21.layer_norm2.weight",
|
278 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_fc.bias": "encoders.21.fc1.bias",
|
279 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_fc.weight": "encoders.21.fc1.weight",
|
280 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_proj.bias": "encoders.21.fc2.bias",
|
281 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.21.mlp.c_proj.weight": "encoders.21.fc2.weight",
|
282 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.in_proj_bias": ['encoders.22.attn.to_q.bias', 'encoders.22.attn.to_k.bias', 'encoders.22.attn.to_v.bias'],
|
283 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.in_proj_weight": ['encoders.22.attn.to_q.weight', 'encoders.22.attn.to_k.weight', 'encoders.22.attn.to_v.weight'],
|
284 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.out_proj.bias": "encoders.22.attn.to_out.bias",
|
285 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.attn.out_proj.weight": "encoders.22.attn.to_out.weight",
|
286 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_1.bias": "encoders.22.layer_norm1.bias",
|
287 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_1.weight": "encoders.22.layer_norm1.weight",
|
288 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_2.bias": "encoders.22.layer_norm2.bias",
|
289 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.ln_2.weight": "encoders.22.layer_norm2.weight",
|
290 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_fc.bias": "encoders.22.fc1.bias",
|
291 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_fc.weight": "encoders.22.fc1.weight",
|
292 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_proj.bias": "encoders.22.fc2.bias",
|
293 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.22.mlp.c_proj.weight": "encoders.22.fc2.weight",
|
294 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.in_proj_bias": ['encoders.23.attn.to_q.bias', 'encoders.23.attn.to_k.bias', 'encoders.23.attn.to_v.bias'],
|
295 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.in_proj_weight": ['encoders.23.attn.to_q.weight', 'encoders.23.attn.to_k.weight', 'encoders.23.attn.to_v.weight'],
|
296 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.out_proj.bias": "encoders.23.attn.to_out.bias",
|
297 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.attn.out_proj.weight": "encoders.23.attn.to_out.weight",
|
298 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_1.bias": "encoders.23.layer_norm1.bias",
|
299 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_1.weight": "encoders.23.layer_norm1.weight",
|
300 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_2.bias": "encoders.23.layer_norm2.bias",
|
301 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.ln_2.weight": "encoders.23.layer_norm2.weight",
|
302 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_fc.bias": "encoders.23.fc1.bias",
|
303 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_fc.weight": "encoders.23.fc1.weight",
|
304 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_proj.bias": "encoders.23.fc2.bias",
|
305 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.23.mlp.c_proj.weight": "encoders.23.fc2.weight",
|
306 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.in_proj_bias": ['encoders.24.attn.to_q.bias', 'encoders.24.attn.to_k.bias', 'encoders.24.attn.to_v.bias'],
|
307 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.in_proj_weight": ['encoders.24.attn.to_q.weight', 'encoders.24.attn.to_k.weight', 'encoders.24.attn.to_v.weight'],
|
308 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.out_proj.bias": "encoders.24.attn.to_out.bias",
|
309 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.attn.out_proj.weight": "encoders.24.attn.to_out.weight",
|
310 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_1.bias": "encoders.24.layer_norm1.bias",
|
311 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_1.weight": "encoders.24.layer_norm1.weight",
|
312 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_2.bias": "encoders.24.layer_norm2.bias",
|
313 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.ln_2.weight": "encoders.24.layer_norm2.weight",
|
314 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_fc.bias": "encoders.24.fc1.bias",
|
315 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_fc.weight": "encoders.24.fc1.weight",
|
316 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_proj.bias": "encoders.24.fc2.bias",
|
317 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.24.mlp.c_proj.weight": "encoders.24.fc2.weight",
|
318 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.in_proj_bias": ['encoders.25.attn.to_q.bias', 'encoders.25.attn.to_k.bias', 'encoders.25.attn.to_v.bias'],
|
319 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.in_proj_weight": ['encoders.25.attn.to_q.weight', 'encoders.25.attn.to_k.weight', 'encoders.25.attn.to_v.weight'],
|
320 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.out_proj.bias": "encoders.25.attn.to_out.bias",
|
321 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.attn.out_proj.weight": "encoders.25.attn.to_out.weight",
|
322 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_1.bias": "encoders.25.layer_norm1.bias",
|
323 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_1.weight": "encoders.25.layer_norm1.weight",
|
324 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_2.bias": "encoders.25.layer_norm2.bias",
|
325 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.ln_2.weight": "encoders.25.layer_norm2.weight",
|
326 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_fc.bias": "encoders.25.fc1.bias",
|
327 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_fc.weight": "encoders.25.fc1.weight",
|
328 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_proj.bias": "encoders.25.fc2.bias",
|
329 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.25.mlp.c_proj.weight": "encoders.25.fc2.weight",
|
330 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.in_proj_bias": ['encoders.26.attn.to_q.bias', 'encoders.26.attn.to_k.bias', 'encoders.26.attn.to_v.bias'],
|
331 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.in_proj_weight": ['encoders.26.attn.to_q.weight', 'encoders.26.attn.to_k.weight', 'encoders.26.attn.to_v.weight'],
|
332 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.out_proj.bias": "encoders.26.attn.to_out.bias",
|
333 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.attn.out_proj.weight": "encoders.26.attn.to_out.weight",
|
334 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_1.bias": "encoders.26.layer_norm1.bias",
|
335 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_1.weight": "encoders.26.layer_norm1.weight",
|
336 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_2.bias": "encoders.26.layer_norm2.bias",
|
337 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.ln_2.weight": "encoders.26.layer_norm2.weight",
|
338 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_fc.bias": "encoders.26.fc1.bias",
|
339 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_fc.weight": "encoders.26.fc1.weight",
|
340 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_proj.bias": "encoders.26.fc2.bias",
|
341 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.26.mlp.c_proj.weight": "encoders.26.fc2.weight",
|
342 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.in_proj_bias": ['encoders.27.attn.to_q.bias', 'encoders.27.attn.to_k.bias', 'encoders.27.attn.to_v.bias'],
|
343 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.in_proj_weight": ['encoders.27.attn.to_q.weight', 'encoders.27.attn.to_k.weight', 'encoders.27.attn.to_v.weight'],
|
344 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.out_proj.bias": "encoders.27.attn.to_out.bias",
|
345 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.attn.out_proj.weight": "encoders.27.attn.to_out.weight",
|
346 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_1.bias": "encoders.27.layer_norm1.bias",
|
347 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_1.weight": "encoders.27.layer_norm1.weight",
|
348 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_2.bias": "encoders.27.layer_norm2.bias",
|
349 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.ln_2.weight": "encoders.27.layer_norm2.weight",
|
350 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_fc.bias": "encoders.27.fc1.bias",
|
351 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_fc.weight": "encoders.27.fc1.weight",
|
352 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_proj.bias": "encoders.27.fc2.bias",
|
353 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.27.mlp.c_proj.weight": "encoders.27.fc2.weight",
|
354 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.in_proj_bias": ['encoders.28.attn.to_q.bias', 'encoders.28.attn.to_k.bias', 'encoders.28.attn.to_v.bias'],
|
355 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.in_proj_weight": ['encoders.28.attn.to_q.weight', 'encoders.28.attn.to_k.weight', 'encoders.28.attn.to_v.weight'],
|
356 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.out_proj.bias": "encoders.28.attn.to_out.bias",
|
357 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.attn.out_proj.weight": "encoders.28.attn.to_out.weight",
|
358 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_1.bias": "encoders.28.layer_norm1.bias",
|
359 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_1.weight": "encoders.28.layer_norm1.weight",
|
360 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_2.bias": "encoders.28.layer_norm2.bias",
|
361 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.ln_2.weight": "encoders.28.layer_norm2.weight",
|
362 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_fc.bias": "encoders.28.fc1.bias",
|
363 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_fc.weight": "encoders.28.fc1.weight",
|
364 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_proj.bias": "encoders.28.fc2.bias",
|
365 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.28.mlp.c_proj.weight": "encoders.28.fc2.weight",
|
366 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.in_proj_bias": ['encoders.29.attn.to_q.bias', 'encoders.29.attn.to_k.bias', 'encoders.29.attn.to_v.bias'],
|
367 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.in_proj_weight": ['encoders.29.attn.to_q.weight', 'encoders.29.attn.to_k.weight', 'encoders.29.attn.to_v.weight'],
|
368 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.out_proj.bias": "encoders.29.attn.to_out.bias",
|
369 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.attn.out_proj.weight": "encoders.29.attn.to_out.weight",
|
370 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_1.bias": "encoders.29.layer_norm1.bias",
|
371 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_1.weight": "encoders.29.layer_norm1.weight",
|
372 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_2.bias": "encoders.29.layer_norm2.bias",
|
373 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.ln_2.weight": "encoders.29.layer_norm2.weight",
|
374 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_fc.bias": "encoders.29.fc1.bias",
|
375 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_fc.weight": "encoders.29.fc1.weight",
|
376 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_proj.bias": "encoders.29.fc2.bias",
|
377 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.29.mlp.c_proj.weight": "encoders.29.fc2.weight",
|
378 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.in_proj_bias": ['encoders.3.attn.to_q.bias', 'encoders.3.attn.to_k.bias', 'encoders.3.attn.to_v.bias'],
|
379 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.in_proj_weight": ['encoders.3.attn.to_q.weight', 'encoders.3.attn.to_k.weight', 'encoders.3.attn.to_v.weight'],
|
380 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.out_proj.bias": "encoders.3.attn.to_out.bias",
|
381 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.attn.out_proj.weight": "encoders.3.attn.to_out.weight",
|
382 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_1.bias": "encoders.3.layer_norm1.bias",
|
383 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_1.weight": "encoders.3.layer_norm1.weight",
|
384 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_2.bias": "encoders.3.layer_norm2.bias",
|
385 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.ln_2.weight": "encoders.3.layer_norm2.weight",
|
386 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_fc.bias": "encoders.3.fc1.bias",
|
387 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_fc.weight": "encoders.3.fc1.weight",
|
388 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_proj.bias": "encoders.3.fc2.bias",
|
389 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.3.mlp.c_proj.weight": "encoders.3.fc2.weight",
|
390 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.in_proj_bias": ['encoders.30.attn.to_q.bias', 'encoders.30.attn.to_k.bias', 'encoders.30.attn.to_v.bias'],
|
391 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.in_proj_weight": ['encoders.30.attn.to_q.weight', 'encoders.30.attn.to_k.weight', 'encoders.30.attn.to_v.weight'],
|
392 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.out_proj.bias": "encoders.30.attn.to_out.bias",
|
393 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.attn.out_proj.weight": "encoders.30.attn.to_out.weight",
|
394 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_1.bias": "encoders.30.layer_norm1.bias",
|
395 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_1.weight": "encoders.30.layer_norm1.weight",
|
396 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_2.bias": "encoders.30.layer_norm2.bias",
|
397 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.ln_2.weight": "encoders.30.layer_norm2.weight",
|
398 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_fc.bias": "encoders.30.fc1.bias",
|
399 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_fc.weight": "encoders.30.fc1.weight",
|
400 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_proj.bias": "encoders.30.fc2.bias",
|
401 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.30.mlp.c_proj.weight": "encoders.30.fc2.weight",
|
402 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.in_proj_bias": ['encoders.31.attn.to_q.bias', 'encoders.31.attn.to_k.bias', 'encoders.31.attn.to_v.bias'],
|
403 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.in_proj_weight": ['encoders.31.attn.to_q.weight', 'encoders.31.attn.to_k.weight', 'encoders.31.attn.to_v.weight'],
|
404 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.out_proj.bias": "encoders.31.attn.to_out.bias",
|
405 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.attn.out_proj.weight": "encoders.31.attn.to_out.weight",
|
406 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_1.bias": "encoders.31.layer_norm1.bias",
|
407 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_1.weight": "encoders.31.layer_norm1.weight",
|
408 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_2.bias": "encoders.31.layer_norm2.bias",
|
409 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.ln_2.weight": "encoders.31.layer_norm2.weight",
|
410 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_fc.bias": "encoders.31.fc1.bias",
|
411 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_fc.weight": "encoders.31.fc1.weight",
|
412 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_proj.bias": "encoders.31.fc2.bias",
|
413 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.31.mlp.c_proj.weight": "encoders.31.fc2.weight",
|
414 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.in_proj_bias": ['encoders.4.attn.to_q.bias', 'encoders.4.attn.to_k.bias', 'encoders.4.attn.to_v.bias'],
|
415 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.in_proj_weight": ['encoders.4.attn.to_q.weight', 'encoders.4.attn.to_k.weight', 'encoders.4.attn.to_v.weight'],
|
416 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.out_proj.bias": "encoders.4.attn.to_out.bias",
|
417 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.attn.out_proj.weight": "encoders.4.attn.to_out.weight",
|
418 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_1.bias": "encoders.4.layer_norm1.bias",
|
419 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_1.weight": "encoders.4.layer_norm1.weight",
|
420 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_2.bias": "encoders.4.layer_norm2.bias",
|
421 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.ln_2.weight": "encoders.4.layer_norm2.weight",
|
422 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_fc.bias": "encoders.4.fc1.bias",
|
423 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_fc.weight": "encoders.4.fc1.weight",
|
424 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_proj.bias": "encoders.4.fc2.bias",
|
425 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.4.mlp.c_proj.weight": "encoders.4.fc2.weight",
|
426 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.in_proj_bias": ['encoders.5.attn.to_q.bias', 'encoders.5.attn.to_k.bias', 'encoders.5.attn.to_v.bias'],
|
427 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.in_proj_weight": ['encoders.5.attn.to_q.weight', 'encoders.5.attn.to_k.weight', 'encoders.5.attn.to_v.weight'],
|
428 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.out_proj.bias": "encoders.5.attn.to_out.bias",
|
429 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.attn.out_proj.weight": "encoders.5.attn.to_out.weight",
|
430 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_1.bias": "encoders.5.layer_norm1.bias",
|
431 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_1.weight": "encoders.5.layer_norm1.weight",
|
432 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_2.bias": "encoders.5.layer_norm2.bias",
|
433 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.ln_2.weight": "encoders.5.layer_norm2.weight",
|
434 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_fc.bias": "encoders.5.fc1.bias",
|
435 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_fc.weight": "encoders.5.fc1.weight",
|
436 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_proj.bias": "encoders.5.fc2.bias",
|
437 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.5.mlp.c_proj.weight": "encoders.5.fc2.weight",
|
438 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.in_proj_bias": ['encoders.6.attn.to_q.bias', 'encoders.6.attn.to_k.bias', 'encoders.6.attn.to_v.bias'],
|
439 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.in_proj_weight": ['encoders.6.attn.to_q.weight', 'encoders.6.attn.to_k.weight', 'encoders.6.attn.to_v.weight'],
|
440 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.out_proj.bias": "encoders.6.attn.to_out.bias",
|
441 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.attn.out_proj.weight": "encoders.6.attn.to_out.weight",
|
442 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_1.bias": "encoders.6.layer_norm1.bias",
|
443 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_1.weight": "encoders.6.layer_norm1.weight",
|
444 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_2.bias": "encoders.6.layer_norm2.bias",
|
445 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.ln_2.weight": "encoders.6.layer_norm2.weight",
|
446 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_fc.bias": "encoders.6.fc1.bias",
|
447 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_fc.weight": "encoders.6.fc1.weight",
|
448 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_proj.bias": "encoders.6.fc2.bias",
|
449 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.6.mlp.c_proj.weight": "encoders.6.fc2.weight",
|
450 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.in_proj_bias": ['encoders.7.attn.to_q.bias', 'encoders.7.attn.to_k.bias', 'encoders.7.attn.to_v.bias'],
|
451 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.in_proj_weight": ['encoders.7.attn.to_q.weight', 'encoders.7.attn.to_k.weight', 'encoders.7.attn.to_v.weight'],
|
452 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.out_proj.bias": "encoders.7.attn.to_out.bias",
|
453 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.attn.out_proj.weight": "encoders.7.attn.to_out.weight",
|
454 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_1.bias": "encoders.7.layer_norm1.bias",
|
455 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_1.weight": "encoders.7.layer_norm1.weight",
|
456 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_2.bias": "encoders.7.layer_norm2.bias",
|
457 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.ln_2.weight": "encoders.7.layer_norm2.weight",
|
458 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_fc.bias": "encoders.7.fc1.bias",
|
459 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_fc.weight": "encoders.7.fc1.weight",
|
460 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_proj.bias": "encoders.7.fc2.bias",
|
461 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.7.mlp.c_proj.weight": "encoders.7.fc2.weight",
|
462 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.in_proj_bias": ['encoders.8.attn.to_q.bias', 'encoders.8.attn.to_k.bias', 'encoders.8.attn.to_v.bias'],
|
463 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.in_proj_weight": ['encoders.8.attn.to_q.weight', 'encoders.8.attn.to_k.weight', 'encoders.8.attn.to_v.weight'],
|
464 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.out_proj.bias": "encoders.8.attn.to_out.bias",
|
465 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.attn.out_proj.weight": "encoders.8.attn.to_out.weight",
|
466 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_1.bias": "encoders.8.layer_norm1.bias",
|
467 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_1.weight": "encoders.8.layer_norm1.weight",
|
468 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_2.bias": "encoders.8.layer_norm2.bias",
|
469 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.ln_2.weight": "encoders.8.layer_norm2.weight",
|
470 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_fc.bias": "encoders.8.fc1.bias",
|
471 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_fc.weight": "encoders.8.fc1.weight",
|
472 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_proj.bias": "encoders.8.fc2.bias",
|
473 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.8.mlp.c_proj.weight": "encoders.8.fc2.weight",
|
474 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.in_proj_bias": ['encoders.9.attn.to_q.bias', 'encoders.9.attn.to_k.bias', 'encoders.9.attn.to_v.bias'],
|
475 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.in_proj_weight": ['encoders.9.attn.to_q.weight', 'encoders.9.attn.to_k.weight', 'encoders.9.attn.to_v.weight'],
|
476 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.out_proj.bias": "encoders.9.attn.to_out.bias",
|
477 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.attn.out_proj.weight": "encoders.9.attn.to_out.weight",
|
478 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_1.bias": "encoders.9.layer_norm1.bias",
|
479 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_1.weight": "encoders.9.layer_norm1.weight",
|
480 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_2.bias": "encoders.9.layer_norm2.bias",
|
481 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.ln_2.weight": "encoders.9.layer_norm2.weight",
|
482 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_fc.bias": "encoders.9.fc1.bias",
|
483 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_fc.weight": "encoders.9.fc1.weight",
|
484 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_proj.bias": "encoders.9.fc2.bias",
|
485 |
+
"conditioner.embedders.0.open_clip.model.visual.transformer.resblocks.9.mlp.c_proj.weight": "encoders.9.fc2.weight",
|
486 |
+
"conditioner.embedders.0.open_clip.model.visual.proj": "visual_projection.weight",
+        }
+        state_dict_ = {}
+        for name in state_dict:
+            if name in rename_dict:
+                param = state_dict[name]
+                if name == "conditioner.embedders.0.open_clip.model.visual.class_embedding":
+                    param = param.reshape((1, 1, param.shape[0]))
+                elif name == "conditioner.embedders.0.open_clip.model.visual.positional_embedding":
+                    param = param.reshape((1, param.shape[0], param.shape[1]))
+                elif name == "conditioner.embedders.0.open_clip.model.visual.proj":
+                    param = param.T
+                if isinstance(rename_dict[name], str):
+                    state_dict_[rename_dict[name]] = param
+                else:
+                    length = param.shape[0] // 3
+                    for i, rename in enumerate(rename_dict[name]):
+                        state_dict_[rename] = param[i*length: i*length+length]
+        return state_dict_
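The conversion above splits OpenCLIP's fused in_proj_weight / in_proj_bias into separate to_q / to_k / to_v tensors by slicing the first dimension into three equal chunks. A minimal standalone sketch of that split, with an illustrative 1280-dim hidden size rather than a value read from the checkpoint:

import torch

# Hypothetical fused QKV projection: rows [0, d) -> Q, [d, 2d) -> K, [2d, 3d) -> V.
d = 1280
in_proj_weight = torch.randn(3 * d, d)
length = in_proj_weight.shape[0] // 3
to_q, to_k, to_v = (in_proj_weight[i * length: (i + 1) * length] for i in range(3))
# Equivalent to torch.chunk(in_proj_weight, 3, dim=0); each slice keeps shape (d, d).
assert to_q.shape == to_k.shape == to_v.shape == (d, d)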
diffsynth/models/svd_unet.py
ADDED
The diff for this file is too large to render.
See raw diff
diffsynth/models/svd_vae_decoder.py
ADDED
@@ -0,0 +1,577 @@
+import torch
+from .attention import Attention
+from .sd_unet import ResnetBlock, UpSampler
+from .tiler import TileWorker
+from einops import rearrange, repeat
+
+
+class VAEAttentionBlock(torch.nn.Module):
+
+    def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5):
+        super().__init__()
+        inner_dim = num_attention_heads * attention_head_dim
+
+        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
+
+        self.transformer_blocks = torch.nn.ModuleList([
+            Attention(
+                inner_dim,
+                num_attention_heads,
+                attention_head_dim,
+                bias_q=True,
+                bias_kv=True,
+                bias_out=True
+            )
+            for d in range(num_layers)
+        ])
+
+    def forward(self, hidden_states, time_emb, text_emb, res_stack):
+        batch, _, height, width = hidden_states.shape
+        residual = hidden_states
+
+        hidden_states = self.norm(hidden_states)
+        inner_dim = hidden_states.shape[1]
+        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+
+        for block in self.transformer_blocks:
+            hidden_states = block(hidden_states)
+
+        hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+        hidden_states = hidden_states + residual
+
+        return hidden_states, time_emb, text_emb, res_stack
+
+
+class TemporalResnetBlock(torch.nn.Module):
+
+    def __init__(self, in_channels, out_channels, groups=32, eps=1e-5):
+        super().__init__()
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = torch.nn.Conv3d(in_channels, out_channels, kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0))
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
+        self.conv2 = torch.nn.Conv3d(out_channels, out_channels, kernel_size=(3, 1, 1), stride=1, padding=(1, 0, 0))
+        self.nonlinearity = torch.nn.SiLU()
+        self.mix_factor = torch.nn.Parameter(torch.Tensor([0.5]))
+
+    def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
+        x_spatial = hidden_states
+        x = rearrange(hidden_states, "T C H W -> 1 C T H W")
+        x = self.norm1(x)
+        x = self.nonlinearity(x)
+        x = self.conv1(x)
+        x = self.norm2(x)
+        x = self.nonlinearity(x)
+        x = self.conv2(x)
+        x_temporal = hidden_states + x[0].permute(1, 0, 2, 3)
+        alpha = torch.sigmoid(self.mix_factor)
+        hidden_states = alpha * x_temporal + (1 - alpha) * x_spatial
+        return hidden_states, time_emb, text_emb, res_stack
+
+
+class SVDVAEDecoder(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.scaling_factor = 0.18215
+        self.conv_in = torch.nn.Conv2d(4, 512, kernel_size=3, padding=1)
+
+        self.blocks = torch.nn.ModuleList([
+            # UNetMidBlock
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            VAEAttentionBlock(1, 512, 512, 1, eps=1e-6),
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            # UpDecoderBlock
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            UpSampler(512),
+            # UpDecoderBlock
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            ResnetBlock(512, 512, eps=1e-6),
+            TemporalResnetBlock(512, 512, eps=1e-6),
+            UpSampler(512),
+            # UpDecoderBlock
+            ResnetBlock(512, 256, eps=1e-6),
+            TemporalResnetBlock(256, 256, eps=1e-6),
+            ResnetBlock(256, 256, eps=1e-6),
+            TemporalResnetBlock(256, 256, eps=1e-6),
+            ResnetBlock(256, 256, eps=1e-6),
+            TemporalResnetBlock(256, 256, eps=1e-6),
+            UpSampler(256),
+            # UpDecoderBlock
+            ResnetBlock(256, 128, eps=1e-6),
+            TemporalResnetBlock(128, 128, eps=1e-6),
+            ResnetBlock(128, 128, eps=1e-6),
+            TemporalResnetBlock(128, 128, eps=1e-6),
+            ResnetBlock(128, 128, eps=1e-6),
+            TemporalResnetBlock(128, 128, eps=1e-6),
+        ])
+
+        self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-5)
+        self.conv_act = torch.nn.SiLU()
+        self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
+        self.time_conv_out = torch.nn.Conv3d(3, 3, kernel_size=(3, 1, 1), padding=(1, 0, 0))
+
+
+    def forward(self, sample):
+        # 1. pre-process
+        hidden_states = rearrange(sample, "C T H W -> T C H W")
+        hidden_states = hidden_states / self.scaling_factor
+        hidden_states = self.conv_in(hidden_states)
+        time_emb, text_emb, res_stack = None, None, None
+
+        # 2. blocks
+        for i, block in enumerate(self.blocks):
+            hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
+
+        # 3. output
+        hidden_states = self.conv_norm_out(hidden_states)
+        hidden_states = self.conv_act(hidden_states)
+        hidden_states = self.conv_out(hidden_states)
+        hidden_states = rearrange(hidden_states, "T C H W -> C T H W")
+        hidden_states = self.time_conv_out(hidden_states)
+
+        return hidden_states
+
+
+    def build_mask(self, data, is_bound):
+        _, T, H, W = data.shape
+        t = repeat(torch.arange(T), "T -> T H W", T=T, H=H, W=W)
+        h = repeat(torch.arange(H), "H -> T H W", T=T, H=H, W=W)
+        w = repeat(torch.arange(W), "W -> T H W", T=T, H=H, W=W)
+        border_width = (T + H + W) // 6
+        pad = torch.ones_like(t) * border_width
+        mask = torch.stack([
+            pad if is_bound[0] else t + 1,
+            pad if is_bound[1] else T - t,
+            pad if is_bound[2] else h + 1,
+            pad if is_bound[3] else H - h,
+            pad if is_bound[4] else w + 1,
+            pad if is_bound[5] else W - w
+        ]).min(dim=0).values
+        mask = mask.clip(1, border_width)
+        mask = (mask / border_width).to(dtype=data.dtype, device=data.device)
+        mask = rearrange(mask, "T H W -> 1 T H W")
+        return mask
+
+
+    def decode_video(
+        self, sample,
+        batch_time=8, batch_height=128, batch_width=128,
+        stride_time=4, stride_height=32, stride_width=32,
+        progress_bar=lambda x:x
+    ):
+        sample = sample.permute(1, 0, 2, 3)
+        data_device = sample.device
+        computation_device = self.conv_in.weight.device
+        torch_dtype = sample.dtype
+        _, T, H, W = sample.shape
+
+        weight = torch.zeros((1, T, H*8, W*8), dtype=torch_dtype, device=data_device)
+        values = torch.zeros((3, T, H*8, W*8), dtype=torch_dtype, device=data_device)
+
+        # Split tasks
+        tasks = []
+        for t in range(0, T, stride_time):
+            for h in range(0, H, stride_height):
+                for w in range(0, W, stride_width):
+                    if (t-stride_time >= 0 and t-stride_time+batch_time >= T)\
+                            or (h-stride_height >= 0 and h-stride_height+batch_height >= H)\
+                            or (w-stride_width >= 0 and w-stride_width+batch_width >= W):
+                        continue
+                    tasks.append((t, t+batch_time, h, h+batch_height, w, w+batch_width))
+
+        # Run
+        for tl, tr, hl, hr, wl, wr in progress_bar(tasks):
+            sample_batch = sample[:, tl:tr, hl:hr, wl:wr].to(computation_device)
+            sample_batch = self.forward(sample_batch).to(data_device)
+            mask = self.build_mask(sample_batch, is_bound=(tl==0, tr>=T, hl==0, hr>=H, wl==0, wr>=W))
+            values[:, tl:tr, hl*8:hr*8, wl*8:wr*8] += sample_batch * mask
+            weight[:, tl:tr, hl*8:hr*8, wl*8:wr*8] += mask
+        values /= weight
+        return values
+
+
+    def state_dict_converter(self):
+        return SVDVAEDecoderStateDictConverter()
+
+
+class SVDVAEDecoderStateDictConverter:
+    def __init__(self):
+        pass
+
210 |
+
def from_diffusers(self, state_dict):
|
211 |
+
static_rename_dict = {
|
212 |
+
"decoder.conv_in": "conv_in",
|
213 |
+
"decoder.mid_block.attentions.0.group_norm": "blocks.2.norm",
|
214 |
+
"decoder.mid_block.attentions.0.to_q": "blocks.2.transformer_blocks.0.to_q",
|
215 |
+
"decoder.mid_block.attentions.0.to_k": "blocks.2.transformer_blocks.0.to_k",
|
216 |
+
"decoder.mid_block.attentions.0.to_v": "blocks.2.transformer_blocks.0.to_v",
|
217 |
+
"decoder.mid_block.attentions.0.to_out.0": "blocks.2.transformer_blocks.0.to_out",
|
218 |
+
"decoder.up_blocks.0.upsamplers.0.conv": "blocks.11.conv",
|
219 |
+
"decoder.up_blocks.1.upsamplers.0.conv": "blocks.18.conv",
|
220 |
+
"decoder.up_blocks.2.upsamplers.0.conv": "blocks.25.conv",
|
221 |
+
"decoder.conv_norm_out": "conv_norm_out",
|
222 |
+
"decoder.conv_out": "conv_out",
|
223 |
+
"decoder.time_conv_out": "time_conv_out"
|
224 |
+
}
|
225 |
+
prefix_rename_dict = {
|
226 |
+
"decoder.mid_block.resnets.0.spatial_res_block": "blocks.0",
|
227 |
+
"decoder.mid_block.resnets.0.temporal_res_block": "blocks.1",
|
228 |
+
"decoder.mid_block.resnets.0.time_mixer": "blocks.1",
|
229 |
+
"decoder.mid_block.resnets.1.spatial_res_block": "blocks.3",
|
230 |
+
"decoder.mid_block.resnets.1.temporal_res_block": "blocks.4",
|
231 |
+
"decoder.mid_block.resnets.1.time_mixer": "blocks.4",
|
232 |
+
|
233 |
+
"decoder.up_blocks.0.resnets.0.spatial_res_block": "blocks.5",
|
234 |
+
"decoder.up_blocks.0.resnets.0.temporal_res_block": "blocks.6",
|
235 |
+
"decoder.up_blocks.0.resnets.0.time_mixer": "blocks.6",
|
236 |
+
"decoder.up_blocks.0.resnets.1.spatial_res_block": "blocks.7",
|
237 |
+
"decoder.up_blocks.0.resnets.1.temporal_res_block": "blocks.8",
|
238 |
+
"decoder.up_blocks.0.resnets.1.time_mixer": "blocks.8",
|
239 |
+
"decoder.up_blocks.0.resnets.2.spatial_res_block": "blocks.9",
|
240 |
+
"decoder.up_blocks.0.resnets.2.temporal_res_block": "blocks.10",
|
241 |
+
"decoder.up_blocks.0.resnets.2.time_mixer": "blocks.10",
|
242 |
+
|
243 |
+
"decoder.up_blocks.1.resnets.0.spatial_res_block": "blocks.12",
|
244 |
+
"decoder.up_blocks.1.resnets.0.temporal_res_block": "blocks.13",
|
245 |
+
"decoder.up_blocks.1.resnets.0.time_mixer": "blocks.13",
|
246 |
+
"decoder.up_blocks.1.resnets.1.spatial_res_block": "blocks.14",
|
247 |
+
"decoder.up_blocks.1.resnets.1.temporal_res_block": "blocks.15",
|
248 |
+
"decoder.up_blocks.1.resnets.1.time_mixer": "blocks.15",
|
249 |
+
"decoder.up_blocks.1.resnets.2.spatial_res_block": "blocks.16",
|
250 |
+
"decoder.up_blocks.1.resnets.2.temporal_res_block": "blocks.17",
|
251 |
+
"decoder.up_blocks.1.resnets.2.time_mixer": "blocks.17",
|
252 |
+
|
253 |
+
"decoder.up_blocks.2.resnets.0.spatial_res_block": "blocks.19",
|
254 |
+
"decoder.up_blocks.2.resnets.0.temporal_res_block": "blocks.20",
|
255 |
+
"decoder.up_blocks.2.resnets.0.time_mixer": "blocks.20",
|
256 |
+
"decoder.up_blocks.2.resnets.1.spatial_res_block": "blocks.21",
|
257 |
+
"decoder.up_blocks.2.resnets.1.temporal_res_block": "blocks.22",
|
258 |
+
"decoder.up_blocks.2.resnets.1.time_mixer": "blocks.22",
|
259 |
+
"decoder.up_blocks.2.resnets.2.spatial_res_block": "blocks.23",
|
260 |
+
"decoder.up_blocks.2.resnets.2.temporal_res_block": "blocks.24",
|
261 |
+
"decoder.up_blocks.2.resnets.2.time_mixer": "blocks.24",
|
262 |
+
|
263 |
+
"decoder.up_blocks.3.resnets.0.spatial_res_block": "blocks.26",
|
264 |
+
"decoder.up_blocks.3.resnets.0.temporal_res_block": "blocks.27",
|
265 |
+
"decoder.up_blocks.3.resnets.0.time_mixer": "blocks.27",
|
266 |
+
"decoder.up_blocks.3.resnets.1.spatial_res_block": "blocks.28",
|
267 |
+
"decoder.up_blocks.3.resnets.1.temporal_res_block": "blocks.29",
|
268 |
+
"decoder.up_blocks.3.resnets.1.time_mixer": "blocks.29",
|
269 |
+
"decoder.up_blocks.3.resnets.2.spatial_res_block": "blocks.30",
|
270 |
+
"decoder.up_blocks.3.resnets.2.temporal_res_block": "blocks.31",
|
271 |
+
"decoder.up_blocks.3.resnets.2.time_mixer": "blocks.31",
|
272 |
+
}
|
273 |
+
suffix_rename_dict = {
|
274 |
+
"norm1.weight": "norm1.weight",
|
275 |
+
"conv1.weight": "conv1.weight",
|
276 |
+
"norm2.weight": "norm2.weight",
|
277 |
+
"conv2.weight": "conv2.weight",
|
278 |
+
"conv_shortcut.weight": "conv_shortcut.weight",
|
279 |
+
"norm1.bias": "norm1.bias",
|
280 |
+
"conv1.bias": "conv1.bias",
|
281 |
+
"norm2.bias": "norm2.bias",
|
282 |
+
"conv2.bias": "conv2.bias",
|
283 |
+
"conv_shortcut.bias": "conv_shortcut.bias",
|
284 |
+
"mix_factor": "mix_factor",
|
285 |
+
}
|
286 |
+
|
287 |
+
state_dict_ = {}
|
288 |
+
for name in static_rename_dict:
|
289 |
+
state_dict_[static_rename_dict[name] + ".weight"] = state_dict[name + ".weight"]
|
290 |
+
state_dict_[static_rename_dict[name] + ".bias"] = state_dict[name + ".bias"]
|
291 |
+
for prefix_name in prefix_rename_dict:
|
292 |
+
for suffix_name in suffix_rename_dict:
|
293 |
+
name = prefix_name + "." + suffix_name
|
294 |
+
name_ = prefix_rename_dict[prefix_name] + "." + suffix_rename_dict[suffix_name]
|
295 |
+
if name in state_dict:
|
296 |
+
state_dict_[name_] = state_dict[name]
|
297 |
+
|
298 |
+
return state_dict_
|
299 |
+
|
300 |
+
|
301 |
+
def from_civitai(self, state_dict):
|
302 |
+
rename_dict = {
|
303 |
+
"first_stage_model.decoder.conv_in.bias": "conv_in.bias",
|
304 |
+
"first_stage_model.decoder.conv_in.weight": "conv_in.weight",
|
305 |
+
"first_stage_model.decoder.conv_out.bias": "conv_out.bias",
|
306 |
+
"first_stage_model.decoder.conv_out.time_mix_conv.bias": "time_conv_out.bias",
|
307 |
+
"first_stage_model.decoder.conv_out.time_mix_conv.weight": "time_conv_out.weight",
|
308 |
+
"first_stage_model.decoder.conv_out.weight": "conv_out.weight",
|
309 |
+
"first_stage_model.decoder.mid.attn_1.k.bias": "blocks.2.transformer_blocks.0.to_k.bias",
|
310 |
+
"first_stage_model.decoder.mid.attn_1.k.weight": "blocks.2.transformer_blocks.0.to_k.weight",
|
311 |
+
"first_stage_model.decoder.mid.attn_1.norm.bias": "blocks.2.norm.bias",
|
312 |
+
"first_stage_model.decoder.mid.attn_1.norm.weight": "blocks.2.norm.weight",
|
313 |
+
"first_stage_model.decoder.mid.attn_1.proj_out.bias": "blocks.2.transformer_blocks.0.to_out.bias",
|
314 |
+
"first_stage_model.decoder.mid.attn_1.proj_out.weight": "blocks.2.transformer_blocks.0.to_out.weight",
|
315 |
+
"first_stage_model.decoder.mid.attn_1.q.bias": "blocks.2.transformer_blocks.0.to_q.bias",
|
316 |
+
"first_stage_model.decoder.mid.attn_1.q.weight": "blocks.2.transformer_blocks.0.to_q.weight",
|
317 |
+
"first_stage_model.decoder.mid.attn_1.v.bias": "blocks.2.transformer_blocks.0.to_v.bias",
|
318 |
+
"first_stage_model.decoder.mid.attn_1.v.weight": "blocks.2.transformer_blocks.0.to_v.weight",
|
319 |
+
"first_stage_model.decoder.mid.block_1.conv1.bias": "blocks.0.conv1.bias",
|
320 |
+
"first_stage_model.decoder.mid.block_1.conv1.weight": "blocks.0.conv1.weight",
|
321 |
+
"first_stage_model.decoder.mid.block_1.conv2.bias": "blocks.0.conv2.bias",
|
322 |
+
"first_stage_model.decoder.mid.block_1.conv2.weight": "blocks.0.conv2.weight",
|
323 |
+
"first_stage_model.decoder.mid.block_1.mix_factor": "blocks.1.mix_factor",
|
324 |
+
"first_stage_model.decoder.mid.block_1.norm1.bias": "blocks.0.norm1.bias",
|
325 |
+
"first_stage_model.decoder.mid.block_1.norm1.weight": "blocks.0.norm1.weight",
|
326 |
+
"first_stage_model.decoder.mid.block_1.norm2.bias": "blocks.0.norm2.bias",
|
327 |
+
"first_stage_model.decoder.mid.block_1.norm2.weight": "blocks.0.norm2.weight",
|
328 |
+
"first_stage_model.decoder.mid.block_1.time_stack.in_layers.0.bias": "blocks.1.norm1.bias",
|
329 |
+
"first_stage_model.decoder.mid.block_1.time_stack.in_layers.0.weight": "blocks.1.norm1.weight",
|
330 |
+
"first_stage_model.decoder.mid.block_1.time_stack.in_layers.2.bias": "blocks.1.conv1.bias",
|
331 |
+
"first_stage_model.decoder.mid.block_1.time_stack.in_layers.2.weight": "blocks.1.conv1.weight",
|
332 |
+
"first_stage_model.decoder.mid.block_1.time_stack.out_layers.0.bias": "blocks.1.norm2.bias",
|
333 |
+
"first_stage_model.decoder.mid.block_1.time_stack.out_layers.0.weight": "blocks.1.norm2.weight",
|
334 |
+
"first_stage_model.decoder.mid.block_1.time_stack.out_layers.3.bias": "blocks.1.conv2.bias",
|
335 |
+
"first_stage_model.decoder.mid.block_1.time_stack.out_layers.3.weight": "blocks.1.conv2.weight",
|
336 |
+
"first_stage_model.decoder.mid.block_2.conv1.bias": "blocks.3.conv1.bias",
|
337 |
+
"first_stage_model.decoder.mid.block_2.conv1.weight": "blocks.3.conv1.weight",
|
338 |
+
"first_stage_model.decoder.mid.block_2.conv2.bias": "blocks.3.conv2.bias",
|
339 |
+
"first_stage_model.decoder.mid.block_2.conv2.weight": "blocks.3.conv2.weight",
|
340 |
+
"first_stage_model.decoder.mid.block_2.mix_factor": "blocks.4.mix_factor",
|
341 |
+
"first_stage_model.decoder.mid.block_2.norm1.bias": "blocks.3.norm1.bias",
|
342 |
+
"first_stage_model.decoder.mid.block_2.norm1.weight": "blocks.3.norm1.weight",
|
343 |
+
"first_stage_model.decoder.mid.block_2.norm2.bias": "blocks.3.norm2.bias",
|
344 |
+
"first_stage_model.decoder.mid.block_2.norm2.weight": "blocks.3.norm2.weight",
|
345 |
+
"first_stage_model.decoder.mid.block_2.time_stack.in_layers.0.bias": "blocks.4.norm1.bias",
|
346 |
+
"first_stage_model.decoder.mid.block_2.time_stack.in_layers.0.weight": "blocks.4.norm1.weight",
|
347 |
+
"first_stage_model.decoder.mid.block_2.time_stack.in_layers.2.bias": "blocks.4.conv1.bias",
|
348 |
+
"first_stage_model.decoder.mid.block_2.time_stack.in_layers.2.weight": "blocks.4.conv1.weight",
|
349 |
+
"first_stage_model.decoder.mid.block_2.time_stack.out_layers.0.bias": "blocks.4.norm2.bias",
|
350 |
+
"first_stage_model.decoder.mid.block_2.time_stack.out_layers.0.weight": "blocks.4.norm2.weight",
|
351 |
+
"first_stage_model.decoder.mid.block_2.time_stack.out_layers.3.bias": "blocks.4.conv2.bias",
|
352 |
+
"first_stage_model.decoder.mid.block_2.time_stack.out_layers.3.weight": "blocks.4.conv2.weight",
|
353 |
+
"first_stage_model.decoder.norm_out.bias": "conv_norm_out.bias",
|
354 |
+
"first_stage_model.decoder.norm_out.weight": "conv_norm_out.weight",
|
355 |
+
"first_stage_model.decoder.up.0.block.0.conv1.bias": "blocks.26.conv1.bias",
|
356 |
+
"first_stage_model.decoder.up.0.block.0.conv1.weight": "blocks.26.conv1.weight",
|
357 |
+
"first_stage_model.decoder.up.0.block.0.conv2.bias": "blocks.26.conv2.bias",
|
358 |
+
"first_stage_model.decoder.up.0.block.0.conv2.weight": "blocks.26.conv2.weight",
|
359 |
+
"first_stage_model.decoder.up.0.block.0.mix_factor": "blocks.27.mix_factor",
|
360 |
+
"first_stage_model.decoder.up.0.block.0.nin_shortcut.bias": "blocks.26.conv_shortcut.bias",
|
361 |
+
"first_stage_model.decoder.up.0.block.0.nin_shortcut.weight": "blocks.26.conv_shortcut.weight",
|
362 |
+
"first_stage_model.decoder.up.0.block.0.norm1.bias": "blocks.26.norm1.bias",
|
363 |
+
"first_stage_model.decoder.up.0.block.0.norm1.weight": "blocks.26.norm1.weight",
|
364 |
+
"first_stage_model.decoder.up.0.block.0.norm2.bias": "blocks.26.norm2.bias",
|
365 |
+
"first_stage_model.decoder.up.0.block.0.norm2.weight": "blocks.26.norm2.weight",
|
366 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.in_layers.0.bias": "blocks.27.norm1.bias",
|
367 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.in_layers.0.weight": "blocks.27.norm1.weight",
|
368 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.in_layers.2.bias": "blocks.27.conv1.bias",
|
369 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.in_layers.2.weight": "blocks.27.conv1.weight",
|
370 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.out_layers.0.bias": "blocks.27.norm2.bias",
|
371 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.out_layers.0.weight": "blocks.27.norm2.weight",
|
372 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.out_layers.3.bias": "blocks.27.conv2.bias",
|
373 |
+
"first_stage_model.decoder.up.0.block.0.time_stack.out_layers.3.weight": "blocks.27.conv2.weight",
|
374 |
+
"first_stage_model.decoder.up.0.block.1.conv1.bias": "blocks.28.conv1.bias",
|
375 |
+
"first_stage_model.decoder.up.0.block.1.conv1.weight": "blocks.28.conv1.weight",
|
376 |
+
"first_stage_model.decoder.up.0.block.1.conv2.bias": "blocks.28.conv2.bias",
|
377 |
+
"first_stage_model.decoder.up.0.block.1.conv2.weight": "blocks.28.conv2.weight",
|
378 |
+
"first_stage_model.decoder.up.0.block.1.mix_factor": "blocks.29.mix_factor",
|
379 |
+
"first_stage_model.decoder.up.0.block.1.norm1.bias": "blocks.28.norm1.bias",
|
380 |
+
"first_stage_model.decoder.up.0.block.1.norm1.weight": "blocks.28.norm1.weight",
|
381 |
+
"first_stage_model.decoder.up.0.block.1.norm2.bias": "blocks.28.norm2.bias",
|
382 |
+
"first_stage_model.decoder.up.0.block.1.norm2.weight": "blocks.28.norm2.weight",
|
383 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.in_layers.0.bias": "blocks.29.norm1.bias",
|
384 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.in_layers.0.weight": "blocks.29.norm1.weight",
|
385 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.in_layers.2.bias": "blocks.29.conv1.bias",
|
386 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.in_layers.2.weight": "blocks.29.conv1.weight",
|
387 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.out_layers.0.bias": "blocks.29.norm2.bias",
|
388 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.out_layers.0.weight": "blocks.29.norm2.weight",
|
389 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.out_layers.3.bias": "blocks.29.conv2.bias",
|
390 |
+
"first_stage_model.decoder.up.0.block.1.time_stack.out_layers.3.weight": "blocks.29.conv2.weight",
|
391 |
+
"first_stage_model.decoder.up.0.block.2.conv1.bias": "blocks.30.conv1.bias",
|
392 |
+
"first_stage_model.decoder.up.0.block.2.conv1.weight": "blocks.30.conv1.weight",
|
393 |
+
"first_stage_model.decoder.up.0.block.2.conv2.bias": "blocks.30.conv2.bias",
|
394 |
+
"first_stage_model.decoder.up.0.block.2.conv2.weight": "blocks.30.conv2.weight",
|
395 |
+
"first_stage_model.decoder.up.0.block.2.mix_factor": "blocks.31.mix_factor",
|
396 |
+
"first_stage_model.decoder.up.0.block.2.norm1.bias": "blocks.30.norm1.bias",
|
397 |
+
"first_stage_model.decoder.up.0.block.2.norm1.weight": "blocks.30.norm1.weight",
|
398 |
+
"first_stage_model.decoder.up.0.block.2.norm2.bias": "blocks.30.norm2.bias",
|
399 |
+
"first_stage_model.decoder.up.0.block.2.norm2.weight": "blocks.30.norm2.weight",
|
400 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.in_layers.0.bias": "blocks.31.norm1.bias",
|
401 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.in_layers.0.weight": "blocks.31.norm1.weight",
|
402 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.in_layers.2.bias": "blocks.31.conv1.bias",
|
403 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.in_layers.2.weight": "blocks.31.conv1.weight",
|
404 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.out_layers.0.bias": "blocks.31.norm2.bias",
|
405 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.out_layers.0.weight": "blocks.31.norm2.weight",
|
406 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.out_layers.3.bias": "blocks.31.conv2.bias",
|
407 |
+
"first_stage_model.decoder.up.0.block.2.time_stack.out_layers.3.weight": "blocks.31.conv2.weight",
|
408 |
+
"first_stage_model.decoder.up.1.block.0.conv1.bias": "blocks.19.conv1.bias",
|
409 |
+
"first_stage_model.decoder.up.1.block.0.conv1.weight": "blocks.19.conv1.weight",
|
410 |
+
"first_stage_model.decoder.up.1.block.0.conv2.bias": "blocks.19.conv2.bias",
|
411 |
+
"first_stage_model.decoder.up.1.block.0.conv2.weight": "blocks.19.conv2.weight",
|
412 |
+
"first_stage_model.decoder.up.1.block.0.mix_factor": "blocks.20.mix_factor",
|
413 |
+
"first_stage_model.decoder.up.1.block.0.nin_shortcut.bias": "blocks.19.conv_shortcut.bias",
|
414 |
+
"first_stage_model.decoder.up.1.block.0.nin_shortcut.weight": "blocks.19.conv_shortcut.weight",
|
415 |
+
"first_stage_model.decoder.up.1.block.0.norm1.bias": "blocks.19.norm1.bias",
|
416 |
+
"first_stage_model.decoder.up.1.block.0.norm1.weight": "blocks.19.norm1.weight",
|
417 |
+
"first_stage_model.decoder.up.1.block.0.norm2.bias": "blocks.19.norm2.bias",
|
418 |
+
"first_stage_model.decoder.up.1.block.0.norm2.weight": "blocks.19.norm2.weight",
|
419 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.in_layers.0.bias": "blocks.20.norm1.bias",
|
420 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.in_layers.0.weight": "blocks.20.norm1.weight",
|
421 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.in_layers.2.bias": "blocks.20.conv1.bias",
|
422 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.in_layers.2.weight": "blocks.20.conv1.weight",
|
423 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.out_layers.0.bias": "blocks.20.norm2.bias",
|
424 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.out_layers.0.weight": "blocks.20.norm2.weight",
|
425 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.out_layers.3.bias": "blocks.20.conv2.bias",
|
426 |
+
"first_stage_model.decoder.up.1.block.0.time_stack.out_layers.3.weight": "blocks.20.conv2.weight",
|
427 |
+
"first_stage_model.decoder.up.1.block.1.conv1.bias": "blocks.21.conv1.bias",
|
428 |
+
"first_stage_model.decoder.up.1.block.1.conv1.weight": "blocks.21.conv1.weight",
|
429 |
+
"first_stage_model.decoder.up.1.block.1.conv2.bias": "blocks.21.conv2.bias",
|
430 |
+
"first_stage_model.decoder.up.1.block.1.conv2.weight": "blocks.21.conv2.weight",
|
431 |
+
"first_stage_model.decoder.up.1.block.1.mix_factor": "blocks.22.mix_factor",
|
432 |
+
"first_stage_model.decoder.up.1.block.1.norm1.bias": "blocks.21.norm1.bias",
|
433 |
+
"first_stage_model.decoder.up.1.block.1.norm1.weight": "blocks.21.norm1.weight",
|
434 |
+
"first_stage_model.decoder.up.1.block.1.norm2.bias": "blocks.21.norm2.bias",
|
435 |
+
"first_stage_model.decoder.up.1.block.1.norm2.weight": "blocks.21.norm2.weight",
|
436 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.in_layers.0.bias": "blocks.22.norm1.bias",
|
437 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.in_layers.0.weight": "blocks.22.norm1.weight",
|
438 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.in_layers.2.bias": "blocks.22.conv1.bias",
|
439 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.in_layers.2.weight": "blocks.22.conv1.weight",
|
440 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.out_layers.0.bias": "blocks.22.norm2.bias",
|
441 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.out_layers.0.weight": "blocks.22.norm2.weight",
|
442 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.out_layers.3.bias": "blocks.22.conv2.bias",
|
443 |
+
"first_stage_model.decoder.up.1.block.1.time_stack.out_layers.3.weight": "blocks.22.conv2.weight",
|
444 |
+
"first_stage_model.decoder.up.1.block.2.conv1.bias": "blocks.23.conv1.bias",
|
445 |
+
"first_stage_model.decoder.up.1.block.2.conv1.weight": "blocks.23.conv1.weight",
|
446 |
+
"first_stage_model.decoder.up.1.block.2.conv2.bias": "blocks.23.conv2.bias",
|
447 |
+
"first_stage_model.decoder.up.1.block.2.conv2.weight": "blocks.23.conv2.weight",
|
448 |
+
"first_stage_model.decoder.up.1.block.2.mix_factor": "blocks.24.mix_factor",
|
449 |
+
"first_stage_model.decoder.up.1.block.2.norm1.bias": "blocks.23.norm1.bias",
|
450 |
+
"first_stage_model.decoder.up.1.block.2.norm1.weight": "blocks.23.norm1.weight",
|
451 |
+
"first_stage_model.decoder.up.1.block.2.norm2.bias": "blocks.23.norm2.bias",
|
452 |
+
"first_stage_model.decoder.up.1.block.2.norm2.weight": "blocks.23.norm2.weight",
|
453 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.in_layers.0.bias": "blocks.24.norm1.bias",
|
454 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.in_layers.0.weight": "blocks.24.norm1.weight",
|
455 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.in_layers.2.bias": "blocks.24.conv1.bias",
|
456 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.in_layers.2.weight": "blocks.24.conv1.weight",
|
457 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.out_layers.0.bias": "blocks.24.norm2.bias",
|
458 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.out_layers.0.weight": "blocks.24.norm2.weight",
|
459 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.out_layers.3.bias": "blocks.24.conv2.bias",
|
460 |
+
"first_stage_model.decoder.up.1.block.2.time_stack.out_layers.3.weight": "blocks.24.conv2.weight",
|
461 |
+
"first_stage_model.decoder.up.1.upsample.conv.bias": "blocks.25.conv.bias",
|
462 |
+
"first_stage_model.decoder.up.1.upsample.conv.weight": "blocks.25.conv.weight",
|
463 |
+
"first_stage_model.decoder.up.2.block.0.conv1.bias": "blocks.12.conv1.bias",
|
464 |
+
"first_stage_model.decoder.up.2.block.0.conv1.weight": "blocks.12.conv1.weight",
|
465 |
+
"first_stage_model.decoder.up.2.block.0.conv2.bias": "blocks.12.conv2.bias",
|
466 |
+
"first_stage_model.decoder.up.2.block.0.conv2.weight": "blocks.12.conv2.weight",
|
467 |
+
"first_stage_model.decoder.up.2.block.0.mix_factor": "blocks.13.mix_factor",
|
468 |
+
"first_stage_model.decoder.up.2.block.0.norm1.bias": "blocks.12.norm1.bias",
|
469 |
+
"first_stage_model.decoder.up.2.block.0.norm1.weight": "blocks.12.norm1.weight",
|
470 |
+
"first_stage_model.decoder.up.2.block.0.norm2.bias": "blocks.12.norm2.bias",
|
471 |
+
"first_stage_model.decoder.up.2.block.0.norm2.weight": "blocks.12.norm2.weight",
|
472 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.in_layers.0.bias": "blocks.13.norm1.bias",
|
473 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.in_layers.0.weight": "blocks.13.norm1.weight",
|
474 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.in_layers.2.bias": "blocks.13.conv1.bias",
|
475 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.in_layers.2.weight": "blocks.13.conv1.weight",
|
476 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.out_layers.0.bias": "blocks.13.norm2.bias",
|
477 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.out_layers.0.weight": "blocks.13.norm2.weight",
|
478 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.out_layers.3.bias": "blocks.13.conv2.bias",
|
479 |
+
"first_stage_model.decoder.up.2.block.0.time_stack.out_layers.3.weight": "blocks.13.conv2.weight",
|
480 |
+
"first_stage_model.decoder.up.2.block.1.conv1.bias": "blocks.14.conv1.bias",
|
481 |
+
"first_stage_model.decoder.up.2.block.1.conv1.weight": "blocks.14.conv1.weight",
|
482 |
+
"first_stage_model.decoder.up.2.block.1.conv2.bias": "blocks.14.conv2.bias",
|
483 |
+
"first_stage_model.decoder.up.2.block.1.conv2.weight": "blocks.14.conv2.weight",
|
484 |
+
"first_stage_model.decoder.up.2.block.1.mix_factor": "blocks.15.mix_factor",
|
485 |
+
"first_stage_model.decoder.up.2.block.1.norm1.bias": "blocks.14.norm1.bias",
|
486 |
+
"first_stage_model.decoder.up.2.block.1.norm1.weight": "blocks.14.norm1.weight",
|
487 |
+
"first_stage_model.decoder.up.2.block.1.norm2.bias": "blocks.14.norm2.bias",
|
488 |
+
"first_stage_model.decoder.up.2.block.1.norm2.weight": "blocks.14.norm2.weight",
|
489 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.in_layers.0.bias": "blocks.15.norm1.bias",
|
490 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.in_layers.0.weight": "blocks.15.norm1.weight",
|
491 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.in_layers.2.bias": "blocks.15.conv1.bias",
|
492 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.in_layers.2.weight": "blocks.15.conv1.weight",
|
493 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.out_layers.0.bias": "blocks.15.norm2.bias",
|
494 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.out_layers.0.weight": "blocks.15.norm2.weight",
|
495 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.out_layers.3.bias": "blocks.15.conv2.bias",
|
496 |
+
"first_stage_model.decoder.up.2.block.1.time_stack.out_layers.3.weight": "blocks.15.conv2.weight",
|
497 |
+
"first_stage_model.decoder.up.2.block.2.conv1.bias": "blocks.16.conv1.bias",
|
498 |
+
"first_stage_model.decoder.up.2.block.2.conv1.weight": "blocks.16.conv1.weight",
|
499 |
+
"first_stage_model.decoder.up.2.block.2.conv2.bias": "blocks.16.conv2.bias",
|
500 |
+
"first_stage_model.decoder.up.2.block.2.conv2.weight": "blocks.16.conv2.weight",
|
501 |
+
"first_stage_model.decoder.up.2.block.2.mix_factor": "blocks.17.mix_factor",
|
502 |
+
"first_stage_model.decoder.up.2.block.2.norm1.bias": "blocks.16.norm1.bias",
|
503 |
+
"first_stage_model.decoder.up.2.block.2.norm1.weight": "blocks.16.norm1.weight",
|
504 |
+
"first_stage_model.decoder.up.2.block.2.norm2.bias": "blocks.16.norm2.bias",
|
505 |
+
"first_stage_model.decoder.up.2.block.2.norm2.weight": "blocks.16.norm2.weight",
|
506 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.in_layers.0.bias": "blocks.17.norm1.bias",
|
507 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.in_layers.0.weight": "blocks.17.norm1.weight",
|
508 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.in_layers.2.bias": "blocks.17.conv1.bias",
|
509 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.in_layers.2.weight": "blocks.17.conv1.weight",
|
510 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.out_layers.0.bias": "blocks.17.norm2.bias",
|
511 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.out_layers.0.weight": "blocks.17.norm2.weight",
|
512 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.out_layers.3.bias": "blocks.17.conv2.bias",
|
513 |
+
"first_stage_model.decoder.up.2.block.2.time_stack.out_layers.3.weight": "blocks.17.conv2.weight",
|
514 |
+
"first_stage_model.decoder.up.2.upsample.conv.bias": "blocks.18.conv.bias",
|
515 |
+
"first_stage_model.decoder.up.2.upsample.conv.weight": "blocks.18.conv.weight",
|
516 |
+
"first_stage_model.decoder.up.3.block.0.conv1.bias": "blocks.5.conv1.bias",
|
517 |
+
"first_stage_model.decoder.up.3.block.0.conv1.weight": "blocks.5.conv1.weight",
|
518 |
+
"first_stage_model.decoder.up.3.block.0.conv2.bias": "blocks.5.conv2.bias",
|
519 |
+
"first_stage_model.decoder.up.3.block.0.conv2.weight": "blocks.5.conv2.weight",
|
520 |
+
"first_stage_model.decoder.up.3.block.0.mix_factor": "blocks.6.mix_factor",
|
521 |
+
"first_stage_model.decoder.up.3.block.0.norm1.bias": "blocks.5.norm1.bias",
|
522 |
+
"first_stage_model.decoder.up.3.block.0.norm1.weight": "blocks.5.norm1.weight",
|
523 |
+
"first_stage_model.decoder.up.3.block.0.norm2.bias": "blocks.5.norm2.bias",
|
524 |
+
"first_stage_model.decoder.up.3.block.0.norm2.weight": "blocks.5.norm2.weight",
|
525 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.in_layers.0.bias": "blocks.6.norm1.bias",
|
526 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.in_layers.0.weight": "blocks.6.norm1.weight",
|
527 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.in_layers.2.bias": "blocks.6.conv1.bias",
|
528 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.in_layers.2.weight": "blocks.6.conv1.weight",
|
529 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.out_layers.0.bias": "blocks.6.norm2.bias",
|
530 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.out_layers.0.weight": "blocks.6.norm2.weight",
|
531 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.out_layers.3.bias": "blocks.6.conv2.bias",
|
532 |
+
"first_stage_model.decoder.up.3.block.0.time_stack.out_layers.3.weight": "blocks.6.conv2.weight",
|
533 |
+
"first_stage_model.decoder.up.3.block.1.conv1.bias": "blocks.7.conv1.bias",
|
534 |
+
"first_stage_model.decoder.up.3.block.1.conv1.weight": "blocks.7.conv1.weight",
|
535 |
+
"first_stage_model.decoder.up.3.block.1.conv2.bias": "blocks.7.conv2.bias",
|
536 |
+
"first_stage_model.decoder.up.3.block.1.conv2.weight": "blocks.7.conv2.weight",
|
537 |
+
"first_stage_model.decoder.up.3.block.1.mix_factor": "blocks.8.mix_factor",
|
538 |
+
"first_stage_model.decoder.up.3.block.1.norm1.bias": "blocks.7.norm1.bias",
|
539 |
+
"first_stage_model.decoder.up.3.block.1.norm1.weight": "blocks.7.norm1.weight",
|
540 |
+
"first_stage_model.decoder.up.3.block.1.norm2.bias": "blocks.7.norm2.bias",
|
541 |
+
"first_stage_model.decoder.up.3.block.1.norm2.weight": "blocks.7.norm2.weight",
|
542 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.in_layers.0.bias": "blocks.8.norm1.bias",
|
543 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.in_layers.0.weight": "blocks.8.norm1.weight",
|
544 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.in_layers.2.bias": "blocks.8.conv1.bias",
|
545 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.in_layers.2.weight": "blocks.8.conv1.weight",
|
546 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.out_layers.0.bias": "blocks.8.norm2.bias",
|
547 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.out_layers.0.weight": "blocks.8.norm2.weight",
|
548 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.out_layers.3.bias": "blocks.8.conv2.bias",
|
549 |
+
"first_stage_model.decoder.up.3.block.1.time_stack.out_layers.3.weight": "blocks.8.conv2.weight",
|
550 |
+
"first_stage_model.decoder.up.3.block.2.conv1.bias": "blocks.9.conv1.bias",
|
551 |
+
"first_stage_model.decoder.up.3.block.2.conv1.weight": "blocks.9.conv1.weight",
|
552 |
+
"first_stage_model.decoder.up.3.block.2.conv2.bias": "blocks.9.conv2.bias",
|
553 |
+
"first_stage_model.decoder.up.3.block.2.conv2.weight": "blocks.9.conv2.weight",
|
554 |
+
"first_stage_model.decoder.up.3.block.2.mix_factor": "blocks.10.mix_factor",
|
555 |
+
"first_stage_model.decoder.up.3.block.2.norm1.bias": "blocks.9.norm1.bias",
|
556 |
+
"first_stage_model.decoder.up.3.block.2.norm1.weight": "blocks.9.norm1.weight",
|
557 |
+
"first_stage_model.decoder.up.3.block.2.norm2.bias": "blocks.9.norm2.bias",
|
558 |
+
"first_stage_model.decoder.up.3.block.2.norm2.weight": "blocks.9.norm2.weight",
|
559 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.in_layers.0.bias": "blocks.10.norm1.bias",
|
560 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.in_layers.0.weight": "blocks.10.norm1.weight",
|
561 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.in_layers.2.bias": "blocks.10.conv1.bias",
|
562 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.in_layers.2.weight": "blocks.10.conv1.weight",
|
563 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.out_layers.0.bias": "blocks.10.norm2.bias",
|
564 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.out_layers.0.weight": "blocks.10.norm2.weight",
|
565 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.out_layers.3.bias": "blocks.10.conv2.bias",
|
566 |
+
"first_stage_model.decoder.up.3.block.2.time_stack.out_layers.3.weight": "blocks.10.conv2.weight",
|
567 |
+
"first_stage_model.decoder.up.3.upsample.conv.bias": "blocks.11.conv.bias",
|
568 |
+
"first_stage_model.decoder.up.3.upsample.conv.weight": "blocks.11.conv.weight",
|
569 |
+
}
|
570 |
+
state_dict_ = {}
|
571 |
+
for name in state_dict:
|
572 |
+
if name in rename_dict:
|
573 |
+
param = state_dict[name]
|
574 |
+
if "blocks.2.transformer_blocks.0" in rename_dict[name]:
|
575 |
+
param = param.squeeze()
|
576 |
+
state_dict_[rename_dict[name]] = param
|
577 |
+
return state_dict_
|
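For readers skimming the diff: the tail of this `from_civitai` converter is simply a key-rename pass plus a `squeeze()` on the mid-block attention weights, which the original checkpoint stores as 1x1 convolutions. Below is a minimal, self-contained sketch of that loop; the tensor shapes are toy values chosen for illustration, not the real checkpoint shapes.

# Illustrative only: a tiny rename_dict and toy tensors run through the same logic
# as the converter above. Shapes are made up for demonstration.
import torch

rename_dict = {
    "first_stage_model.decoder.mid.attn_1.q.weight": "blocks.2.transformer_blocks.0.to_q.weight",
    "first_stage_model.decoder.up.3.upsample.conv.weight": "blocks.11.conv.weight",
}
state_dict = {
    "first_stage_model.decoder.mid.attn_1.q.weight": torch.randn(4, 4, 1, 1),   # 1x1 conv weight
    "first_stage_model.decoder.up.3.upsample.conv.weight": torch.randn(4, 4, 3, 3),
}
state_dict_ = {}
for name in state_dict:
    if name in rename_dict:
        param = state_dict[name]
        # Attention projections become linear layers, so the trailing 1x1 dims are dropped.
        if "blocks.2.transformer_blocks.0" in rename_dict[name]:
            param = param.squeeze()
        state_dict_[rename_dict[name]] = param
print({k: tuple(v.shape) for k, v in state_dict_.items()})
# {'blocks.2.transformer_blocks.0.to_q.weight': (4, 4), 'blocks.11.conv.weight': (4, 4, 3, 3)}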
diffsynth/models/svd_vae_encoder.py
ADDED
@@ -0,0 +1,138 @@
from .sd_vae_encoder import SDVAEEncoderStateDictConverter, SDVAEEncoder


class SVDVAEEncoder(SDVAEEncoder):
    def __init__(self):
        super().__init__()
        self.scaling_factor = 0.13025

    def state_dict_converter(self):
        return SVDVAEEncoderStateDictConverter()


class SVDVAEEncoderStateDictConverter(SDVAEEncoderStateDictConverter):
    def __init__(self):
        super().__init__()

    def from_diffusers(self, state_dict):
        return super().from_diffusers(state_dict)

    def from_civitai(self, state_dict):
        rename_dict = {
            "conditioner.embedders.3.encoder.encoder.conv_in.bias": "conv_in.bias",
            "conditioner.embedders.3.encoder.encoder.conv_in.weight": "conv_in.weight",
            "conditioner.embedders.3.encoder.encoder.conv_out.bias": "conv_out.bias",
            "conditioner.embedders.3.encoder.encoder.conv_out.weight": "conv_out.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv1.bias": "blocks.0.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv1.weight": "blocks.0.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv2.bias": "blocks.0.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.conv2.weight": "blocks.0.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm1.bias": "blocks.0.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm1.weight": "blocks.0.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm2.bias": "blocks.0.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.0.norm2.weight": "blocks.0.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv1.bias": "blocks.1.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv1.weight": "blocks.1.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv2.bias": "blocks.1.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.conv2.weight": "blocks.1.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm1.bias": "blocks.1.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm1.weight": "blocks.1.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm2.bias": "blocks.1.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.block.1.norm2.weight": "blocks.1.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.0.downsample.conv.bias": "blocks.2.conv.bias",
            "conditioner.embedders.3.encoder.encoder.down.0.downsample.conv.weight": "blocks.2.conv.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv1.bias": "blocks.3.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv1.weight": "blocks.3.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv2.bias": "blocks.3.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.conv2.weight": "blocks.3.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.nin_shortcut.bias": "blocks.3.conv_shortcut.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.nin_shortcut.weight": "blocks.3.conv_shortcut.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm1.bias": "blocks.3.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm1.weight": "blocks.3.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm2.bias": "blocks.3.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.0.norm2.weight": "blocks.3.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv1.bias": "blocks.4.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv1.weight": "blocks.4.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv2.bias": "blocks.4.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.conv2.weight": "blocks.4.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm1.bias": "blocks.4.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm1.weight": "blocks.4.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm2.bias": "blocks.4.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.block.1.norm2.weight": "blocks.4.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.1.downsample.conv.bias": "blocks.5.conv.bias",
            "conditioner.embedders.3.encoder.encoder.down.1.downsample.conv.weight": "blocks.5.conv.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv1.bias": "blocks.6.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv1.weight": "blocks.6.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv2.bias": "blocks.6.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.conv2.weight": "blocks.6.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.nin_shortcut.bias": "blocks.6.conv_shortcut.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.nin_shortcut.weight": "blocks.6.conv_shortcut.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm1.bias": "blocks.6.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm1.weight": "blocks.6.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm2.bias": "blocks.6.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.0.norm2.weight": "blocks.6.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv1.bias": "blocks.7.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv1.weight": "blocks.7.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv2.bias": "blocks.7.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.conv2.weight": "blocks.7.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm1.bias": "blocks.7.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm1.weight": "blocks.7.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm2.bias": "blocks.7.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.block.1.norm2.weight": "blocks.7.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.2.downsample.conv.bias": "blocks.8.conv.bias",
            "conditioner.embedders.3.encoder.encoder.down.2.downsample.conv.weight": "blocks.8.conv.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv1.bias": "blocks.9.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv1.weight": "blocks.9.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv2.bias": "blocks.9.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.conv2.weight": "blocks.9.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm1.bias": "blocks.9.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm1.weight": "blocks.9.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm2.bias": "blocks.9.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.0.norm2.weight": "blocks.9.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv1.bias": "blocks.10.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv1.weight": "blocks.10.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv2.bias": "blocks.10.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.conv2.weight": "blocks.10.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm1.bias": "blocks.10.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm1.weight": "blocks.10.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm2.bias": "blocks.10.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.down.3.block.1.norm2.weight": "blocks.10.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.k.bias": "blocks.12.transformer_blocks.0.to_k.bias",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.k.weight": "blocks.12.transformer_blocks.0.to_k.weight",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.norm.bias": "blocks.12.norm.bias",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.norm.weight": "blocks.12.norm.weight",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.proj_out.bias": "blocks.12.transformer_blocks.0.to_out.bias",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.proj_out.weight": "blocks.12.transformer_blocks.0.to_out.weight",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.q.bias": "blocks.12.transformer_blocks.0.to_q.bias",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.q.weight": "blocks.12.transformer_blocks.0.to_q.weight",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.v.bias": "blocks.12.transformer_blocks.0.to_v.bias",
            "conditioner.embedders.3.encoder.encoder.mid.attn_1.v.weight": "blocks.12.transformer_blocks.0.to_v.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.conv1.bias": "blocks.11.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.conv1.weight": "blocks.11.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.conv2.bias": "blocks.11.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.conv2.weight": "blocks.11.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.norm1.bias": "blocks.11.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.norm1.weight": "blocks.11.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.norm2.bias": "blocks.11.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_1.norm2.weight": "blocks.11.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.conv1.bias": "blocks.13.conv1.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.conv1.weight": "blocks.13.conv1.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.conv2.bias": "blocks.13.conv2.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.conv2.weight": "blocks.13.conv2.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.norm1.bias": "blocks.13.norm1.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.norm1.weight": "blocks.13.norm1.weight",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.norm2.bias": "blocks.13.norm2.bias",
            "conditioner.embedders.3.encoder.encoder.mid.block_2.norm2.weight": "blocks.13.norm2.weight",
            "conditioner.embedders.3.encoder.encoder.norm_out.bias": "conv_norm_out.bias",
            "conditioner.embedders.3.encoder.encoder.norm_out.weight": "conv_norm_out.weight",
            "conditioner.embedders.3.encoder.quant_conv.bias": "quant_conv.bias",
            "conditioner.embedders.3.encoder.quant_conv.weight": "quant_conv.weight",
        }
        state_dict_ = {}
        for name in state_dict:
            if name in rename_dict:
                param = state_dict[name]
                if "transformer_blocks" in rename_dict[name]:
                    param = param.squeeze()
                state_dict_[rename_dict[name]] = param
        return state_dict_
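A minimal usage sketch for the encoder converter above, assuming a civitai-style SVD checkpoint saved as safetensors. The path "svd.safetensors" is a placeholder, loading via the safetensors package is an assumption (this repository normally loads checkpoints through its own model-management code), and the only calls taken from this file are SVDVAEEncoder(), state_dict_converter(), and from_civitai().

# Sketch only; "svd.safetensors" is a hypothetical path to a civitai-format SVD checkpoint.
import torch
from safetensors.torch import load_file
from diffsynth.models.svd_vae_encoder import SVDVAEEncoder

raw_state_dict = load_file("svd.safetensors")            # keys like "conditioner.embedders.3.encoder..."
encoder = SVDVAEEncoder()
converted = encoder.state_dict_converter().from_civitai(raw_state_dict)
encoder.load_state_dict(converted)                        # converted keys match the module names (conv_in, blocks.N, ...)
encoder = encoder.eval().to(dtype=torch.float16, device="cuda")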