ViTeX-Bench committed
Commit bc8c4af · verified · 1 Parent(s): 38bf857

Bundle diffsynth library (no external repo dependency)

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. diffsynth/__init__.py +1 -0
  2. diffsynth/configs/__init__.py +2 -0
  3. diffsynth/configs/model_configs.py +888 -0
  4. diffsynth/configs/vram_management_module_maps.py +284 -0
  5. diffsynth/core/__init__.py +6 -0
  6. diffsynth/core/attention/__init__.py +1 -0
  7. diffsynth/core/attention/attention.py +121 -0
  8. diffsynth/core/data/__init__.py +1 -0
  9. diffsynth/core/data/operators.py +280 -0
  10. diffsynth/core/data/unified_dataset.py +118 -0
  11. diffsynth/core/device/__init__.py +2 -0
  12. diffsynth/core/device/npu_compatible_device.py +107 -0
  13. diffsynth/core/gradient/__init__.py +1 -0
  14. diffsynth/core/gradient/gradient_checkpoint.py +37 -0
  15. diffsynth/core/loader/__init__.py +3 -0
  16. diffsynth/core/loader/config.py +119 -0
  17. diffsynth/core/loader/file.py +130 -0
  18. diffsynth/core/loader/model.py +115 -0
  19. diffsynth/core/npu_patch/npu_fused_operator.py +30 -0
  20. diffsynth/core/vram/__init__.py +2 -0
  21. diffsynth/core/vram/disk_map.py +93 -0
  22. diffsynth/core/vram/initialization.py +21 -0
  23. diffsynth/core/vram/layers.py +479 -0
  24. diffsynth/diffusion/__init__.py +6 -0
  25. diffsynth/diffusion/base_pipeline.py +500 -0
  26. diffsynth/diffusion/flow_match.py +236 -0
  27. diffsynth/diffusion/logger.py +43 -0
  28. diffsynth/diffusion/loss.py +158 -0
  29. diffsynth/diffusion/parsers.py +71 -0
  30. diffsynth/diffusion/runner.py +135 -0
  31. diffsynth/diffusion/training_module.py +302 -0
  32. diffsynth/models/anima_dit.py +1307 -0
  33. diffsynth/models/dinov3_image_encoder.py +96 -0
  34. diffsynth/models/flux2_dit.py +1053 -0
  35. diffsynth/models/flux2_text_encoder.py +58 -0
  36. diffsynth/models/flux2_vae.py +0 -0
  37. diffsynth/models/flux_controlnet.py +384 -0
  38. diffsynth/models/flux_dit.py +398 -0
  39. diffsynth/models/flux_infiniteyou.py +129 -0
  40. diffsynth/models/flux_ipadapter.py +110 -0
  41. diffsynth/models/flux_lora_encoder.py +521 -0
  42. diffsynth/models/flux_lora_patcher.py +306 -0
  43. diffsynth/models/flux_text_encoder_clip.py +112 -0
  44. diffsynth/models/flux_text_encoder_t5.py +43 -0
  45. diffsynth/models/flux_vae.py +451 -0
  46. diffsynth/models/flux_value_control.py +56 -0
  47. diffsynth/models/general_modules.py +146 -0
  48. diffsynth/models/longcat_video_dit.py +902 -0
  49. diffsynth/models/ltx2_audio_vae.py +1872 -0
  50. diffsynth/models/ltx2_common.py +388 -0
diffsynth/__init__.py ADDED
@@ -0,0 +1 @@
+ from .core import *
diffsynth/configs/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .model_configs import MODEL_CONFIGS
+ from .vram_management_module_maps import VRAM_MANAGEMENT_MODULE_MAPS, VERSION_CHECKER_MAPS
diffsynth/configs/model_configs.py ADDED
@@ -0,0 +1,888 @@
+ qwen_image_series = [
+     {
+         # Example: ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
+         "model_hash": "0319a1cb19835fb510907dd3367c95ff",
+         "model_name": "qwen_image_dit",
+         "model_class": "diffsynth.models.qwen_image_dit.QwenImageDiT",
+     },
+     {
+         # Example: ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors")
+         "model_hash": "8004730443f55db63092006dd9f7110e",
+         "model_name": "qwen_image_text_encoder",
+         "model_class": "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.qwen_image_text_encoder.QwenImageTextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "ed4ea5824d55ec3107b09815e318123a",
+         "model_name": "qwen_image_vae",
+         "model_class": "diffsynth.models.qwen_image_vae.QwenImageVAE",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Depth", origin_file_pattern="model.safetensors")
+         "model_hash": "073bce9cf969e317e5662cd570c3e79c",
+         "model_name": "qwen_image_blockwise_controlnet",
+         "model_class": "diffsynth.models.qwen_image_controlnet.QwenImageBlockWiseControlNet",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-Blockwise-ControlNet-Inpaint", origin_file_pattern="model.safetensors")
+         "model_hash": "a9e54e480a628f0b956a688a81c33bab",
+         "model_name": "qwen_image_blockwise_controlnet",
+         "model_class": "diffsynth.models.qwen_image_controlnet.QwenImageBlockWiseControlNet",
+         "extra_kwargs": {"additional_in_dim": 4},
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors")
+         "model_hash": "469c78b61e3e31bc9eec0d0af3d3f2f8",
+         "model_name": "siglip2_image_encoder",
+         "model_class": "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors")
+         "model_hash": "5722b5c873720009de96422993b15682",
+         "model_name": "dinov3_image_encoder",
+         "model_class": "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder",
+     },
+     {
+         # Example:
+         "model_hash": "a166c33455cdbd89c0888a3645ca5c0f",
+         "model_name": "qwen_image_image2lora_coarse",
+         "model_class": "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel",
+     },
+     {
+         # Example:
+         "model_hash": "a5476e691767a4da6d3a6634a10f7408",
+         "model_name": "qwen_image_image2lora_fine",
+         "model_class": "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel",
+         "extra_kwargs": {"residual_length": 37*37+7, "residual_mid_dim": 64}
+     },
+     {
+         # Example:
+         "model_hash": "0aad514690602ecaff932c701cb4b0bb",
+         "model_name": "qwen_image_image2lora_style",
+         "model_class": "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel",
+         "extra_kwargs": {"compress_dim": 64, "use_residual": False}
+     },
+     {
+         # Example: ModelConfig(model_id="Qwen/Qwen-Image-Layered", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
+         "model_hash": "8dc8cda05de16c73afa755e2c1ce2839",
+         "model_name": "qwen_image_dit",
+         "model_class": "diffsynth.models.qwen_image_dit.QwenImageDiT",
+         "extra_kwargs": {"use_layer3d_rope": True, "use_additional_t_cond": True}
+     },
+     {
+         # Example: ModelConfig(model_id="Qwen/Qwen-Image-Layered", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "44b39ddc499e027cfb24f7878d7416b9",
+         "model_name": "qwen_image_vae",
+         "model_class": "diffsynth.models.qwen_image_vae.QwenImageVAE",
+         "extra_kwargs": {"image_channels": 4}
+     },
+ ]
+
+ wan_series = [
+     {
+         # Example: ModelConfig(model_id="krea/krea-realtime-video", origin_file_pattern="krea-realtime-video-14b.safetensors")
+         "model_hash": "5ec04e02b42d2580483ad69f4e76346a",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="models_t5_umt5-xxl-enc-bf16.pth")
+         "model_hash": "9c8818c2cbea55eca56c7b447df170da",
+         "model_name": "wan_video_text_encoder",
+         "model_class": "diffsynth.models.wan_video_text_encoder.WanTextEncoder",
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="Wan2.1_VAE.pth")
+         "model_hash": "ccc42284ea13e1ad04693284c7a09be6",
+         "model_name": "wan_video_vae",
+         "model_class": "diffsynth.models.wan_video_vae.WanVideoVAE",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vae.WanVideoVAEStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="meituan-longcat/LongCat-Video", origin_file_pattern="dit/diffusion_pytorch_model*.safetensors")
+         "model_hash": "8b27900f680d7251ce44e2dc8ae1ffef",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.longcat_video_dit.LongCatVideoTransformer3DModel",
+     },
+     {
+         # Example: ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
+         "model_hash": "5f90e66a0672219f12d9a626c8c21f61",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTFromDiffusers"
+     },
+     {
+         # Example: ModelConfig(model_id="ByteDance/Video-As-Prompt-Wan2.1-14B", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors")
+         "model_hash": "5f90e66a0672219f12d9a626c8c21f61",
+         "model_name": "wan_video_vap",
+         "model_class": "diffsynth.models.wan_video_mot.MotWanModel",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_mot.WanVideoMotStateDictConverter"
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-I2V-14B-480P", origin_file_pattern="models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth")
+         "model_hash": "5941c53e207d62f20f9025686193c40b",
+         "model_name": "wan_video_image_encoder",
+         "model_class": "diffsynth.models.wan_video_image_encoder.WanImageEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_image_encoder.WanImageEncoderStateDictConverter"
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Wan2.1-1.3b-speedcontrol-v1", origin_file_pattern="model.safetensors")
+         "model_hash": "dbd5ec76bbf977983f972c151d545389",
+         "model_name": "wan_video_motion_controller",
+         "model_class": "diffsynth.models.wan_video_motion_controller.WanMotionControllerModel",
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-1.3B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "9269f8db9040a9d860eaca435be61814",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-FLF2V-14B-720P", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "3ef3b1f8e1dab83d5b71fd7b617f859f",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_image_pos_emb': True}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "349723183fc063b2bfc10bb2835cf677",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-1.3B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "6d6ccde6845b95ad9114ab993d917893",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "efa44cddf936c70abd0ea28b6cbe946c",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-14B-InP", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "6bfcfb3b342cb286ce886889d519a77e",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "ac6a5aa74f4a0aab6f64eb9a72f19901",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 32, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06, 'has_ref_conv': False, 'add_control_adapter': True, 'in_dim_control_adapter': 24}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-1.3B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "70ddad9d3a133785da5ea371aae09504",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06, 'has_ref_conv': True}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control-Camera", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "b61c605c2adbd23124d152ed28e049ae",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 32, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': False, 'add_control_adapter': True, 'in_dim_control_adapter': 24}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.1-Fun-V1.1-14B-Control", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "26bde73488a92e64cc20b0a7485b9e5b",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': True}
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-T2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "aafcfd9672c3a2456dc46e1cb6e52c70",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06}
+     },
+     {
+         # Example: ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "a61453409b67cd3246cf0c3bebad47ba",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="iic/VACE-Wan2.1-1.3B-Preview", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "a61453409b67cd3246cf0c3bebad47ba",
+         "model_name": "wan_video_vace",
+         "model_class": "diffsynth.models.wan_video_vace.VaceWanModel",
+         "extra_kwargs": {"use_target_text_encoder": True},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vace.VaceWanModelDictConverter"
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "7a513e1f257a861512b1afd387a8ecd9",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.1-VACE-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "7a513e1f257a861512b1afd387a8ecd9",
+         "model_name": "wan_video_vace",
+         "model_class": "diffsynth.models.wan_video_vace.VaceWanModel",
+         "extra_kwargs": {'vace_layers': (0, 5, 10, 15, 20, 25, 30, 35), 'vace_in_dim': 96, 'glyph_channels': 16, 'patch_size': (1, 2, 2), 'has_image_input': False, 'dim': 5120, 'num_heads': 40, 'ffn_dim': 13824, 'eps': 1e-06},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vace.VaceWanModelDictConverter"
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "31fa352acb8a1b1d33cd8764273d80a2",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_dit.WanVideoDiTStateDictConverter"
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-Animate-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "31fa352acb8a1b1d33cd8764273d80a2",
+         "model_name": "wan_video_animate_adapter",
+         "model_class": "diffsynth.models.wan_video_animate_adapter.WanAnimateAdapter",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_animate_adapter.WanAnimateAdapterStateDictConverter"
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control-Camera", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors")
+         "model_hash": "47dbeab5e560db3180adf51dc0232fb1",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': False, 'add_control_adapter': True, 'in_dim_control_adapter': 24, 'require_clip_embedding': False}
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Wan2.2-Fun-A14B-Control", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors")
+         "model_hash": "2267d489f0ceb9f21836532952852ee5",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 52, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'has_ref_conv': True, 'require_clip_embedding': False},
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-I2V-A14B", origin_file_pattern="high_noise_model/diffusion_pytorch_model*.safetensors")
+         "model_hash": "5b013604280dd715f8457c6ed6d6a626",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'require_clip_embedding': False}
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "966cffdcc52f9c46c391768b27637614",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit_s2v.WanS2VModel",
+         "extra_kwargs": {'dim': 5120, 'in_dim': 16, 'ffn_dim': 13824, 'out_dim': 16, 'text_dim': 4096, 'freq_dim': 256, 'eps': 1e-06, 'patch_size': (1, 2, 2), 'num_heads': 40, 'num_layers': 40, 'cond_dim': 16, 'audio_dim': 1024, 'num_audio_token': 4}
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="diffusion_pytorch_model*.safetensors")
+         "model_hash": "1f5ab7703c6fc803fdded85ff040c316",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 48, 'dim': 3072, 'ffn_dim': 14336, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 48, 'num_heads': 24, 'num_layers': 30, 'eps': 1e-06, 'seperated_timestep': True, 'require_clip_embedding': False, 'require_vae_embedding': False, 'fuse_vae_embedding_in_latents': True}
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-TI2V-5B", origin_file_pattern="Wan2.2_VAE.pth")
+         "model_hash": "e1de6c02cdac79f8b739f4d3698cd216",
+         "model_name": "wan_video_vae",
+         "model_class": "diffsynth.models.wan_video_vae.WanVideoVAE38",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wan_video_vae.WanVideoVAEStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/Wan2.2-S2V-14B", origin_file_pattern="wav2vec2-large-xlsr-53-english/model.safetensors")
+         "model_hash": "06be60f3a4526586d8431cd038a71486",
+         "model_name": "wans2v_audio_encoder",
+         "model_class": "diffsynth.models.wav2vec.WanS2VAudioEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.wans2v_audio_encoder.WanS2VAudioEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Wan-AI/WanToDance-14B", origin_file_pattern="global_model.safetensors")
+         "model_hash": "eb18873fc0ba77b541eb7b62dbcd2059",
+         "model_name": "wan_video_dit",
+         "model_class": "diffsynth.models.wan_video_dit.WanModel",
+         "extra_kwargs": {'has_image_input': True, 'patch_size': [1, 2, 2], 'in_dim': 36, 'dim': 5120, 'ffn_dim': 13824, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 40, 'num_layers': 40, 'eps': 1e-06, 'wantodance_enable_music_inject': True, 'wantodance_music_inject_layers': [0, 4, 8, 12, 16, 20, 24, 27], 'wantodance_enable_refimage': True, 'has_ref_conv': True, 'wantodance_enable_refface': False, 'wantodance_enable_global': True, 'wantodance_enable_dynamicfps': True, 'wantodance_enable_unimodel': True}
+     },
+ ]
+
+ flux_series = [
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="flux1-dev.safetensors")
+         "model_hash": "a29710fea6dddb0314663ee823598e50",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+     },
+     {
+         # Supported due to historical reasons.
+         "model_hash": "605c56eab23e9e2af863ad8f0813a25d",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverterFromDiffusers",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder/model.safetensors")
+         "model_hash": "94eefa3dac9cec93cb1ebaf1747d7b78",
+         "model_name": "flux_text_encoder_clip",
+         "model_class": "diffsynth.models.flux_text_encoder_clip.FluxTextEncoderClip",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_text_encoder_clip.FluxTextEncoderClipStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="text_encoder_2/*.safetensors")
+         "model_hash": "22540b49eaedbc2f2784b2091a234c7c",
+         "model_name": "flux_text_encoder_t5",
+         "model_class": "diffsynth.models.flux_text_encoder_t5.FluxTextEncoderT5",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_text_encoder_t5.FluxTextEncoderT5StateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors")
+         "model_hash": "21ea55f476dfc4fd135587abb59dfe5d",
+         "model_name": "flux_vae_encoder",
+         "model_class": "diffsynth.models.flux_vae.FluxVAEEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.1-dev", origin_file_pattern="ae.safetensors")
+         "model_hash": "21ea55f476dfc4fd135587abb59dfe5d",
+         "model_name": "flux_vae_decoder",
+         "model_class": "diffsynth.models.flux_vae.FluxVAEDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ostris/Flex.2-preview", origin_file_pattern="Flex.2-preview.safetensors")
+         "model_hash": "d02f41c13549fa5093d3521f62a5570a",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "extra_kwargs": {'input_dim': 196, 'num_blocks': 8},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/AttriCtrl-FLUX.1-Dev", origin_file_pattern="models/brightness.safetensors")
+         "model_hash": "0629116fce1472503a66992f96f3eb1a",
+         "model_name": "flux_value_controller",
+         "model_class": "diffsynth.models.flux_value_control.SingleValueEncoder",
+     },
+     {
+         # Example: ModelConfig(model_id="alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", origin_file_pattern="diffusion_pytorch_model.safetensors")
+         "model_hash": "52357cb26250681367488a8954c271e8",
+         "model_name": "flux_controlnet",
+         "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
+         "extra_kwargs": {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4},
+     },
+     {
+         # Example: ModelConfig(model_id="InstantX/FLUX.1-dev-Controlnet-Union-alpha", origin_file_pattern="diffusion_pytorch_model.safetensors")
+         "model_hash": "78d18b9101345ff695f312e7e62538c0",
+         "model_name": "flux_controlnet",
+         "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
+         "extra_kwargs": {"num_mode": 10, "mode_dict": {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "pose": 4, "gray": 5, "lq": 6}},
+     },
+     {
+         # Example: ModelConfig(model_id="jasperai/Flux.1-dev-Controlnet-Upscaler", origin_file_pattern="diffusion_pytorch_model.safetensors")
+         "model_hash": "b001c89139b5f053c715fe772362dd2a",
+         "model_name": "flux_controlnet",
+         "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
+         "extra_kwargs": {"num_single_blocks": 0},
+     },
+     {
+         # Example: ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/image_proj_model.bin")
+         "model_hash": "c07c0f04f5ff55e86b4e937c7a40d481",
+         "model_name": "infiniteyou_image_projector",
+         "model_class": "diffsynth.models.flux_infiniteyou.InfiniteYouImageProjector",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_infiniteyou.FluxInfiniteYouImageProjectorStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="ByteDance/InfiniteYou", origin_file_pattern="infu_flux_v1.0/aes_stage2/InfuseNetModel/*.safetensors")
+         "model_hash": "7f9583eb8ba86642abb9a21a4b2c9e16",
+         "model_name": "flux_controlnet",
+         "model_class": "diffsynth.models.flux_controlnet.FluxControlNet",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_controlnet.FluxControlNetStateDictConverter",
+         "extra_kwargs": {"num_joint_blocks": 4, "num_single_blocks": 10},
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LoRA-Encoder-FLUX.1-Dev", origin_file_pattern="model.safetensors")
+         "model_hash": "77c2e4dd2440269eb33bfaa0d004f6ab",
+         "model_name": "flux_lora_encoder",
+         "model_class": "diffsynth.models.flux_lora_encoder.FluxLoRAEncoder",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LoRAFusion-preview-FLUX.1-dev", origin_file_pattern="model.safetensors")
+         "model_hash": "30143afb2dea73d1ac580e0787628f8c",
+         "model_name": "flux_lora_patcher",
+         "model_class": "diffsynth.models.flux_lora_patcher.FluxLoraPatcher",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="model*.safetensors")
+         "model_hash": "2bd19e845116e4f875a0a048e27fc219",
+         "model_name": "nexus_gen_llm",
+         "model_class": "diffsynth.models.nexus_gen.NexusGenAutoregressiveModel",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.nexus_gen.NexusGenAutoregressiveModelStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin")
+         "model_hash": "63c969fd37cce769a90aa781fbff5f81",
+         "model_name": "nexus_gen_editing_adapter",
+         "model_class": "diffsynth.models.nexus_gen_projector.NexusGenImageEmbeddingMerger",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.nexus_gen_projector.NexusGenMergerStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="edit_decoder.bin")
+         "model_hash": "63c969fd37cce769a90aa781fbff5f81",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="generation_decoder.bin")
+         "model_hash": "3e6c61b0f9471135fc9c6d6a98e98b6d",
+         "model_name": "nexus_gen_generation_adapter",
+         "model_class": "diffsynth.models.nexus_gen_projector.NexusGenAdapter",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.nexus_gen_projector.NexusGenAdapterStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/Nexus-GenV2", origin_file_pattern="generation_decoder.bin")
+         "model_hash": "3e6c61b0f9471135fc9c6d6a98e98b6d",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="InstantX/FLUX.1-dev-IP-Adapter", origin_file_pattern="ip-adapter.bin")
+         "model_hash": "4daaa66cc656a8fe369908693dad0a35",
+         "model_name": "flux_ipadapter",
+         "model_class": "diffsynth.models.flux_ipadapter.FluxIpAdapter",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_ipadapter.FluxIpAdapterStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="google/siglip-so400m-patch14-384", origin_file_pattern="model.safetensors")
+         "model_hash": "04d8c1e20a1f1b25f7434f111992a33f",
+         "model_name": "siglip_vision_model",
+         "model_class": "diffsynth.models.flux_ipadapter.SiglipVisionModelSO400M",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_ipadapter.SiglipStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="step1x-edit-i1258.safetensors"),
+         "model_hash": "d30fb9e02b1dbf4e509142f05cf7dd50",
+         "model_name": "step1x_connector",
+         "model_class": "diffsynth.models.step1x_connector.Qwen2Connector",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.step1x_connector.Qwen2ConnectorStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="stepfun-ai/Step1X-Edit", origin_file_pattern="step1x-edit-i1258.safetensors"),
+         "model_hash": "d30fb9e02b1dbf4e509142f05cf7dd50",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+         "extra_kwargs": {"disable_guidance_embedder": True},
+     },
+     {
+         # Example: ModelConfig(model_id="MAILAND/majicflus_v1", origin_file_pattern="majicflus_v134.safetensors")
+         "model_hash": "3394f306c4cbf04334b712bf5aaed95f",
+         "model_name": "flux_dit",
+         "model_class": "diffsynth.models.flux_dit.FluxDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_dit.FluxDiTStateDictConverter",
+     },
+ ]
+
+ flux2_series = [
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="text_encoder/*.safetensors")
+         "model_hash": "28fca3d8e5bf2a2d1271748a773f6757",
+         "model_name": "flux2_text_encoder",
+         "model_class": "diffsynth.models.flux2_text_encoder.Flux2TextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux2_text_encoder.Flux2TextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="transformer/*.safetensors")
+         "model_hash": "d38e1d5c5aec3b0a11e79327ac6e3b0f",
+         "model_name": "flux2_dit",
+         "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-dev", origin_file_pattern="vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "c54288e3ee12ca215898840682337b95",
+         "model_name": "flux2_vae",
+         "model_class": "diffsynth.models.flux2_vae.Flux2VAE",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="transformer/*.safetensors")
+         "model_hash": "3bde7b817fec8143028b6825a63180df",
+         "model_name": "flux2_dit",
+         "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
+         "extra_kwargs": {"guidance_embeds": False, "joint_attention_dim": 7680, "num_attention_heads": 24, "num_layers": 5, "num_single_layers": 20}
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="text_encoder/*.safetensors")
+         "model_hash": "9195f3ea256fcd0ae6d929c203470754",
+         "model_name": "z_image_text_encoder",
+         "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
+         "extra_kwargs": {"model_size": "8B"},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="black-forest-labs/FLUX.2-klein-9B", origin_file_pattern="transformer/*.safetensors")
+         "model_hash": "39c6fc48f07bebecedbbaa971ff466c8",
+         "model_name": "flux2_dit",
+         "model_class": "diffsynth.models.flux2_dit.Flux2DiT",
+         "extra_kwargs": {"guidance_embeds": False, "joint_attention_dim": 12288, "num_attention_heads": 32, "num_layers": 8, "num_single_layers": 24}
+     },
+ ]
+
+ z_image_series = [
+     {
+         # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="transformer/*.safetensors")
+         "model_hash": "fc3a8a1247fe185ce116ccbe0e426c28",
+         "model_name": "z_image_dit",
+         "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
+     },
+     {
+         # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="text_encoder/*.safetensors")
+         "model_hash": "0f050f62a88876fea6eae0a18dac5a2e",
+         "model_name": "z_image_text_encoder",
+         "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
+     },
+     {
+         # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "1aafa3cc91716fb6b300cc1cd51b85a3",
+         "model_name": "flux_vae_encoder",
+         "model_class": "diffsynth.models.flux_vae.FluxVAEEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEEncoderStateDictConverterDiffusers",
+         "extra_kwargs": {"use_conv_attention": False},
+     },
+     {
+         # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Turbo", origin_file_pattern="vae/vae/diffusion_pytorch_model.safetensors")
+         "model_hash": "1aafa3cc91716fb6b300cc1cd51b85a3",
+         "model_name": "flux_vae_decoder",
+         "model_class": "diffsynth.models.flux_vae.FluxVAEDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.flux_vae.FluxVAEDecoderStateDictConverterDiffusers",
+         "extra_kwargs": {"use_conv_attention": False},
+     },
+     {
+         # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="transformer/*.safetensors")
+         "model_hash": "aa3563718e5c3ecde3dfbb020ca61180",
+         "model_name": "z_image_dit",
+         "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
+         "extra_kwargs": {"siglip_feat_dim": 1152},
+     },
+     {
+         # Example: ModelConfig(model_id="Tongyi-MAI/Z-Image-Omni-Base", origin_file_pattern="siglip/model.safetensors")
+         "model_hash": "89d48e420f45cff95115a9f3e698d44a",
+         "model_name": "siglip_vision_model_428m",
+         "model_class": "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M",
+     },
+     {
+         # Example: ModelConfig(model_id="PAI/Z-Image-Turbo-Fun-Controlnet-Union-2.1", origin_file_pattern="Z-Image-Turbo-Fun-Controlnet-Union-2.1-8steps.safetensors")
+         "model_hash": "1677708d40029ab380a95f6c731a57d7",
+         "model_name": "z_image_controlnet",
+         "model_class": "diffsynth.models.z_image_controlnet.ZImageControlNet",
+     },
+     {
+         # Example: ???
+         "model_hash": "9510cb8cd1dd34ee0e4f111c24905510",
+         "model_name": "z_image_image2lora_style",
+         "model_class": "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel",
+         "extra_kwargs": {"compress_dim": 128},
+     },
+     {
+         # Example: ModelConfig(model_id="Qwen/Qwen3-0.6B", origin_file_pattern="model.safetensors")
+         "model_hash": "1392adecee344136041e70553f875f31",
+         "model_name": "z_image_text_encoder",
+         "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
+         "extra_kwargs": {"model_size": "0.6B"},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_text_encoder.ZImageTextEncoderStateDictConverter",
+     },
+     {
+         # To ensure compatibility with the `model.diffusion_model` prefix introduced by other frameworks.
+         "model_hash": "8cf241a0d32f93d5de368502a086852f",
+         "model_name": "z_image_dit",
+         "model_class": "diffsynth.models.z_image_dit.ZImageDiT",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.z_image_dit.ZImageDiTStateDictConverter",
+     },
+ ]
+ """
+ Official model repo: https://www.modelscope.cn/models/Lightricks/LTX-2
+ Repackaged model repo: https://www.modelscope.cn/models/DiffSynth-Studio/LTX-2-Repackage
+ For the LTX-2 base models, both the official checkpoint (with model config ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors"))
+ and the repackaged checkpoints (with model config ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="*.safetensors")) are supported.
+ We have repackaged the official checkpoints in the DiffSynth-Studio/LTX-2-Repackage repo to support loading each submodule separately,
+ avoiding redundant memory usage when users only need part of the model.
+ """
+ ltx2_series = [
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_dit",
+         "model_class": "diffsynth.models.ltx2_dit.LTXModel",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="transformer.safetensors")
+         "model_hash": "c567aaa37d5ed7454c73aa6024458661",
+         "model_name": "ltx2_dit",
+         "model_class": "diffsynth.models.ltx2_dit.LTXModel",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_video_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_encoder.safetensors")
+         "model_hash": "7f7e904a53260ec0351b05f32153754b",
+         "model_name": "ltx2_video_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_video_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="video_vae_decoder.safetensors")
+         "model_hash": "dc6029ca2825147872b45e35a2dc3a97",
+         "model_name": "ltx2_video_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_audio_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_decoder.safetensors")
+         "model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb",
+         "model_name": "ltx2_audio_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_audio_vocoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2Vocoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vocoder.safetensors")
+         "model_hash": "f471360f6b24bef702ab73133d9f8bb9",
+         "model_name": "ltx2_audio_vocoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2Vocoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_audio_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
+         "model_hash": "29338f3b95e7e312a3460a482e4f4554",
+         "model_name": "ltx2_audio_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "aca7b0bbf8415e9c98360750268915fc",
+         "model_name": "ltx2_text_encoder_post_modules",
+         "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors")
+         "model_hash": "981629689c8be92a712ab3c5eb4fc3f6",
+         "model_name": "ltx2_text_encoder_post_modules",
+         "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="google/gemma-3-12b-it-qat-q4_0-unquantized", origin_file_pattern="model-*.safetensors")
+         "model_hash": "33917f31c4a79196171154cca39f165e",
+         "model_name": "ltx2_text_encoder",
+         "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2", origin_file_pattern="ltx-2-19b-dev.safetensors")
+         "model_hash": "c79c458c6e99e0e14d47e676761732d2",
+         "model_name": "ltx2_latent_upsampler",
+         "model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_dit",
+         "model_class": "diffsynth.models.ltx2_dit.LTXModel",
+         "extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_video_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
+         "extra_kwargs": {"encoder_version": "ltx-2.3"},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_video_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
+         "extra_kwargs": {"decoder_version": "ltx-2.3"},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_audio_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_audio_vocoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_audio_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-22b-dev.safetensors")
+         "model_hash": "f3a83ecf3995dcc4fae2d27e08ad5767",
+         "model_name": "ltx2_text_encoder_post_modules",
+         "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
+         "extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="Lightricks/LTX-2.3", origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
+         "model_hash": "aed408774d694a2452f69936c32febb5",
+         "model_name": "ltx2_latent_upsampler",
+         "model_class": "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler",
+         "extra_kwargs": {"rational_resampler": False},
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="transformer.safetensors")
+         "model_hash": "1c55afad76ed33c112a2978550b524d1",
+         "model_name": "ltx2_dit",
+         "model_class": "diffsynth.models.ltx2_dit.LTXModel",
+         "extra_kwargs": {"apply_gated_attention": True, "cross_attention_adaln": True, "caption_channels": None},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_dit.LTXModelStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_encoder.safetensors")
+         "model_hash": "eecdc07c2ec30863b8a2b8b2134036cf",
+         "model_name": "ltx2_video_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder",
+         "extra_kwargs": {"encoder_version": "ltx-2.3"},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="video_vae_decoder.safetensors")
+         "model_hash": "deda2f542e17ee25bc8c38fd605316ea",
+         "model_name": "ltx2_video_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder",
+         "extra_kwargs": {"decoder_version": "ltx-2.3"},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_video_vae.LTX2VideoDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vae_decoder.safetensors")
+         "model_hash": "7d7823dde8f1ea0b50fb07ac329dd4cb",
+         "model_name": "ltx2_audio_vae_decoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioDecoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vae_encoder.safetensors")
+         "model_hash": "29338f3b95e7e312a3460a482e4f4554",
+         "model_name": "ltx2_audio_vae_encoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2AudioEncoder",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2AudioEncoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="audio_vocoder.safetensors")
+         "model_hash": "cd436c99e69ec5c80f050f0944f02a15",
+         "model_name": "ltx2_audio_vocoder",
+         "model_class": "diffsynth.models.ltx2_audio_vae.LTX2VocoderWithBWE",
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_audio_vae.LTX2VocoderStateDictConverter",
+     },
+     {
+         # Example: ModelConfig(model_id="DiffSynth-Studio/LTX-2.3-Repackage", origin_file_pattern="text_encoder_post_modules.safetensors")
+         "model_hash": "05da2aab1c4b061f72c426311c165a43",
+         "model_name": "ltx2_text_encoder_post_modules",
+         "model_class": "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules",
+         "extra_kwargs": {"separated_audio_video": True, "embedding_dim_gemma": 3840, "num_layers_gemma": 49, "video_attention_heads": 32, "video_attention_head_dim": 128, "audio_attention_heads": 32, "audio_attention_head_dim": 64, "num_connector_layers": 8, "apply_gated_attention": True},
+         "state_dict_converter": "diffsynth.utils.state_dict_converters.ltx2_text_encoder.LTX2TextEncoderPostModulesStateDictConverter",
+     },
+ ]
850
+ anima_series = [
851
+ {
852
+ # Example: ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/vae/qwen_image_vae.safetensors")
853
+ "model_hash": "a9995952c2d8e63cf82e115005eb61b9",
854
+ "model_name": "z_image_text_encoder",
855
+ "model_class": "diffsynth.models.z_image_text_encoder.ZImageTextEncoder",
856
+ "extra_kwargs": {"model_size": "0.6B"},
857
+ },
858
+ {
859
+ # Example: ModelConfig(model_id="circlestone-labs/Anima", origin_file_pattern="split_files/diffusion_models/anima-preview.safetensors")
860
+ "model_hash": "417673936471e79e31ed4d186d7a3f4a",
861
+ "model_name": "anima_dit",
862
+ "model_class": "diffsynth.models.anima_dit.AnimaDiT",
863
+ "state_dict_converter": "diffsynth.utils.state_dict_converters.anima_dit.AnimaDiTStateDictConverter",
864
+ }
865
+ ]
866
+
867
+ mova_series = [
868
+ # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_dit/diffusion_pytorch_model.safetensors")
869
+ {
870
+ "model_hash": "8c57e12790e2c45a64817e0ce28cde2f",
871
+ "model_name": "mova_audio_dit",
872
+ "model_class": "diffsynth.models.mova_audio_dit.MovaAudioDit",
873
+ "extra_kwargs": {'has_image_input': False, 'patch_size': [1], 'in_dim': 128, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 128, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}
874
+ },
875
+ # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="audio_vae/diffusion_pytorch_model.safetensors")
876
+ {
877
+ "model_hash": "418517fb2b4e919d2cac8f314fcf82ac",
878
+ "model_name": "mova_audio_vae",
879
+ "model_class": "diffsynth.models.mova_audio_vae.DacVAE",
880
+ },
881
+ # Example: ModelConfig(model_id="openmoss/MOVA-720p", origin_file_pattern="dual_tower_bridge/diffusion_pytorch_model.safetensors")
882
+ {
883
+ "model_hash": "d1139dbbc8b4ab53cf4b4243d57bbceb",
884
+ "model_name": "mova_dual_tower_bridge",
885
+ "model_class": "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge",
886
+ },
887
+ ]
888
+ MODEL_CONFIGS = qwen_image_series + wan_series + flux_series + flux2_series + z_image_series + ltx2_series + anima_series + mova_series
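A minimal lookup sketch (not part of the commit): each config is keyed by a hash of the checkpoint's state dict keys, so a loader can identify the right model class for an arbitrary file. `hash_state_dict_keys` and `load_state_dict` are exported from diffsynth.core.loader; the file path below is hypothetical.

from diffsynth.core.loader import load_state_dict, hash_state_dict_keys

state_dict = load_state_dict("models/transformer.safetensors")  # hypothetical path
model_hash = hash_state_dict_keys(state_dict)
matches = [c for c in MODEL_CONFIGS if c["model_hash"] == model_hash]
if matches:
    print(matches[0]["model_name"], matches[0]["model_class"])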
diffsynth/configs/vram_management_module_maps.py ADDED
@@ -0,0 +1,284 @@
1
+ flux_general_vram_config = {
2
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
3
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
4
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
5
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
6
+ "torch.nn.GroupNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
7
+ "diffsynth.models.general_modules.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
8
+ "diffsynth.models.flux_lora_encoder.LoRALayerBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
9
+ "diffsynth.models.flux_lora_patcher.LoraMerger": "diffsynth.core.vram.layers.AutoWrappedModule",
10
+ }
11
+
12
+ VRAM_MANAGEMENT_MODULE_MAPS = {
13
+ "diffsynth.models.qwen_image_dit.QwenImageDiT": {
14
+ "diffsynth.models.qwen_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
15
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
16
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
17
+ },
18
+ "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": {
19
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
20
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
21
+ "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
22
+ "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
23
+ "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionPatchEmbed": "diffsynth.core.vram.layers.AutoWrappedModule",
24
+ "transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionRotaryEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
25
+ },
26
+ "diffsynth.models.qwen_image_vae.QwenImageVAE": {
27
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
28
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
29
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
30
+ "diffsynth.models.qwen_image_vae.QwenImageRMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
31
+ },
32
+ "diffsynth.models.qwen_image_controlnet.BlockWiseControlBlock": {
33
+ "diffsynth.models.qwen_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
34
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
35
+ },
36
+ "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder": {
37
+ "transformers.models.siglip.modeling_siglip.SiglipVisionEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
38
+ "transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead": "diffsynth.core.vram.layers.AutoWrappedModule",
39
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
40
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
41
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
42
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
43
+ },
44
+ "diffsynth.models.dinov3_image_encoder.DINOv3ImageEncoder": {
45
+ "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTLayerScale": "diffsynth.core.vram.layers.AutoWrappedModule",
46
+ "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTRopePositionEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
47
+ "transformers.models.dinov3_vit.modeling_dinov3_vit.DINOv3ViTEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
48
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
49
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
50
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
51
+ },
52
+ "diffsynth.models.qwen_image_image2lora.QwenImageImage2LoRAModel": {
53
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
54
+ },
55
+ "diffsynth.models.wan_video_animate_adapter.WanAnimateAdapter": {
56
+ "diffsynth.models.wan_video_animate_adapter.FaceEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
57
+ "diffsynth.models.wan_video_animate_adapter.EqualLinear": "diffsynth.core.vram.layers.AutoWrappedModule",
58
+ "diffsynth.models.wan_video_animate_adapter.ConvLayer": "diffsynth.core.vram.layers.AutoWrappedModule",
59
+ "diffsynth.models.wan_video_animate_adapter.FusedLeakyReLU": "diffsynth.core.vram.layers.AutoWrappedModule",
60
+ "diffsynth.models.wan_video_animate_adapter.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
61
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
62
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
63
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
64
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
65
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
66
+ },
67
+ "diffsynth.models.wan_video_dit_s2v.WanS2VModel": {
68
+ "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
69
+ "diffsynth.models.wan_video_dit_s2v.WanS2VDiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
70
+ "diffsynth.models.wan_video_dit_s2v.CausalAudioEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
71
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
72
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
73
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
74
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
75
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
76
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
77
+ },
78
+ "diffsynth.models.wan_video_dit.WanModel": {
79
+ "diffsynth.models.wan_video_dit.MLP": "diffsynth.core.vram.layers.AutoWrappedModule",
80
+ "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
81
+ "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
82
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
83
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
84
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
85
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
86
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
87
+ },
88
+ "diffsynth.models.wan_video_image_encoder.WanImageEncoder": {
89
+ "diffsynth.models.wan_video_image_encoder.VisionTransformer": "diffsynth.core.vram.layers.AutoWrappedModule",
90
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
91
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
92
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
93
+ },
94
+ "diffsynth.models.wan_video_mot.MotWanModel": {
95
+ "diffsynth.models.wan_video_mot.MotWanAttentionBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
96
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
97
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
98
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
99
+ },
100
+ "diffsynth.models.wan_video_motion_controller.WanMotionControllerModel": {
101
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
102
+ },
103
+ "diffsynth.models.wan_video_text_encoder.WanTextEncoder": {
104
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
105
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
106
+ "diffsynth.models.wan_video_text_encoder.T5RelativeEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
107
+ "diffsynth.models.wan_video_text_encoder.T5LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
108
+ },
109
+ "diffsynth.models.wan_video_vace.VaceWanModel": {
110
+ "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedModule",
111
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
112
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
113
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
114
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
115
+ },
116
+ "diffsynth.models.wan_video_vae.WanVideoVAE": {
117
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
118
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
119
+ "diffsynth.models.wan_video_vae.RMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
120
+ "diffsynth.models.wan_video_vae.CausalConv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
121
+ "diffsynth.models.wan_video_vae.Upsample": "diffsynth.core.vram.layers.AutoWrappedModule",
122
+ "torch.nn.SiLU": "diffsynth.core.vram.layers.AutoWrappedModule",
123
+ "torch.nn.Dropout": "diffsynth.core.vram.layers.AutoWrappedModule",
124
+ },
125
+ "diffsynth.models.wan_video_vae.WanVideoVAE38": {
126
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
127
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
128
+ "diffsynth.models.wan_video_vae.RMS_norm": "diffsynth.core.vram.layers.AutoWrappedModule",
129
+ "diffsynth.models.wan_video_vae.CausalConv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
130
+ "diffsynth.models.wan_video_vae.Upsample": "diffsynth.core.vram.layers.AutoWrappedModule",
131
+ "torch.nn.SiLU": "diffsynth.core.vram.layers.AutoWrappedModule",
132
+ "torch.nn.Dropout": "diffsynth.core.vram.layers.AutoWrappedModule",
133
+ },
134
+ "diffsynth.models.wav2vec.WanS2VAudioEncoder": {
135
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
136
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
137
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
138
+ },
139
+ "diffsynth.models.longcat_video_dit.LongCatVideoTransformer3DModel": {
140
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
141
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
142
+ "diffsynth.models.longcat_video_dit.RMSNorm_FP32": "diffsynth.core.vram.layers.AutoWrappedModule",
143
+ "diffsynth.models.longcat_video_dit.LayerNorm_FP32": "diffsynth.core.vram.layers.AutoWrappedModule",
144
+ },
145
+ "diffsynth.models.flux_dit.FluxDiT": {
146
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
147
+ "diffsynth.models.flux_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
148
+ },
149
+ "diffsynth.models.flux_text_encoder_clip.FluxTextEncoderClip": flux_general_vram_config,
150
+ "diffsynth.models.flux_vae.FluxVAEEncoder": flux_general_vram_config,
151
+ "diffsynth.models.flux_vae.FluxVAEDecoder": flux_general_vram_config,
152
+ "diffsynth.models.flux_controlnet.FluxControlNet": flux_general_vram_config,
153
+ "diffsynth.models.flux_infiniteyou.InfiniteYouImageProjector": flux_general_vram_config,
154
+ "diffsynth.models.flux_ipadapter.FluxIpAdapter": flux_general_vram_config,
155
+ "diffsynth.models.flux_lora_patcher.FluxLoraPatcher": flux_general_vram_config,
156
+ "diffsynth.models.step1x_connector.Qwen2Connector": flux_general_vram_config,
157
+ "diffsynth.models.flux_lora_encoder.FluxLoRAEncoder": flux_general_vram_config,
158
+ "diffsynth.models.flux_text_encoder_t5.FluxTextEncoderT5": {
159
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
160
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
161
+ "transformers.models.t5.modeling_t5.T5LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
162
+ "transformers.models.t5.modeling_t5.T5DenseActDense": "diffsynth.core.vram.layers.AutoWrappedModule",
163
+ "transformers.models.t5.modeling_t5.T5DenseGatedActDense": "diffsynth.core.vram.layers.AutoWrappedModule",
164
+ },
165
+ "diffsynth.models.flux_ipadapter.SiglipVisionModelSO400M": {
166
+ "transformers.models.siglip.modeling_siglip.SiglipVisionEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
167
+ "transformers.models.siglip.modeling_siglip.SiglipEncoder": "diffsynth.core.vram.layers.AutoWrappedModule",
168
+ "transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead": "diffsynth.core.vram.layers.AutoWrappedModule",
169
+ "torch.nn.MultiheadAttention": "diffsynth.core.vram.layers.AutoWrappedModule",
170
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
171
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
172
+ },
173
+ "diffsynth.models.flux2_dit.Flux2DiT": {
174
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
175
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
176
+ "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
177
+ },
178
+ "diffsynth.models.flux2_text_encoder.Flux2TextEncoder": {
179
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
180
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
181
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
182
+ "transformers.models.mistral.modeling_mistral.MistralRMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
183
+ },
184
+ "diffsynth.models.flux2_vae.Flux2VAE": {
185
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
186
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
187
+ "torch.nn.GroupNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
188
+ },
189
+ "diffsynth.models.z_image_text_encoder.ZImageTextEncoder": {
190
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
191
+ "transformers.models.qwen3.modeling_qwen3.Qwen3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
192
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
193
+ },
194
+ "diffsynth.models.z_image_dit.ZImageDiT": {
195
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
196
+ "diffsynth.models.z_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
197
+ },
198
+ "diffsynth.models.z_image_controlnet.ZImageControlNet": {
199
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
200
+ "diffsynth.models.z_image_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
201
+ },
202
+ "diffsynth.models.z_image_image2lora.ZImageImage2LoRAModel": {
203
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
204
+ },
205
+ "diffsynth.models.siglip2_image_encoder.Siglip2ImageEncoder428M": {
206
+ "transformers.models.siglip2.modeling_siglip2.Siglip2VisionEmbeddings": "diffsynth.core.vram.layers.AutoWrappedModule",
207
+ "transformers.models.siglip2.modeling_siglip2.Siglip2MultiheadAttentionPoolingHead": "diffsynth.core.vram.layers.AutoWrappedModule",
208
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
209
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
210
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
211
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
212
+ },
213
+ "diffsynth.models.ltx2_dit.LTXModel": {
214
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
215
+ "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
216
+ },
217
+ "diffsynth.models.ltx2_upsampler.LTX2LatentUpsampler": {
218
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
219
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
220
+ "torch.nn.GroupNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
221
+ },
222
+ "diffsynth.models.ltx2_video_vae.LTX2VideoEncoder": {
223
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
224
+ },
225
+ "diffsynth.models.ltx2_video_vae.LTX2VideoDecoder": {
226
+ "torch.nn.Conv3d": "diffsynth.core.vram.layers.AutoWrappedModule",
227
+ },
228
+ "diffsynth.models.ltx2_audio_vae.LTX2AudioDecoder": {
229
+ "torch.nn.Conv2d": "diffsynth.core.vram.layers.AutoWrappedModule",
230
+ },
231
+ "diffsynth.models.ltx2_audio_vae.LTX2Vocoder": {
232
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
233
+ "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
234
+ },
235
+ "diffsynth.models.ltx2_text_encoder.LTX2TextEncoderPostModules": {
236
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
237
+ "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
238
+ "diffsynth.models.ltx2_text_encoder.Embeddings1DConnector": "diffsynth.core.vram.layers.AutoWrappedModule",
239
+ },
240
+ "diffsynth.models.ltx2_text_encoder.LTX2TextEncoder": {
241
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
242
+ "transformers.models.gemma3.modeling_gemma3.Gemma3MultiModalProjector": "diffsynth.core.vram.layers.AutoWrappedModule",
243
+ "transformers.models.gemma3.modeling_gemma3.Gemma3RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
244
+ "transformers.models.gemma3.modeling_gemma3.Gemma3TextScaledWordEmbedding": "diffsynth.core.vram.layers.AutoWrappedModule",
245
+ },
246
+ "diffsynth.models.anima_dit.AnimaDiT": {
247
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
248
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
249
+ "torch.nn.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
250
+ "torch.nn.Embedding": "diffsynth.core.vram.layers.AutoWrappedModule",
251
+ },
252
+ "diffsynth.models.mova_audio_dit.MovaAudioDit": {
253
+ "diffsynth.models.wan_video_dit.DiTBlock": "diffsynth.core.vram.layers.AutoWrappedNonRecurseModule",
254
+ "diffsynth.models.wan_video_dit.Head": "diffsynth.core.vram.layers.AutoWrappedModule",
255
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
256
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
257
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
258
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
259
+ },
260
+ "diffsynth.models.mova_dual_tower_bridge.DualTowerConditionalBridge": {
261
+ "torch.nn.Linear": "diffsynth.core.vram.layers.AutoWrappedLinear",
262
+ "torch.nn.LayerNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
263
+ "diffsynth.models.wan_video_dit.RMSNorm": "diffsynth.core.vram.layers.AutoWrappedModule",
264
+ },
265
+ "diffsynth.models.mova_audio_vae.DacVAE": {
266
+ "diffsynth.models.mova_audio_vae.Snake1d": "diffsynth.core.vram.layers.AutoWrappedModule",
267
+ "torch.nn.Conv1d": "diffsynth.core.vram.layers.AutoWrappedModule",
268
+ "torch.nn.ConvTranspose1d": "diffsynth.core.vram.layers.AutoWrappedModule",
269
+ },
270
+ }
271
+
272
+ def QwenImageTextEncoder_Module_Map_Updater():
273
+ current = VRAM_MANAGEMENT_MODULE_MAPS["diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder"]
274
+ from packaging import version
275
+ import transformers
276
+ if version.parse(transformers.__version__) >= version.parse("5.2.0"):
277
+ # The Qwen2RMSNorm in transformers 5.2.0+ has been renamed to Qwen2_5_VLRMSNorm, so we need to update the module map accordingly
278
+ current.pop("transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2RMSNorm", None)
279
+ current["transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VLRMSNorm"] = "diffsynth.core.vram.layers.AutoWrappedModule"
280
+ return current
281
+
282
+ VERSION_CHECKER_MAPS = {
283
+ "diffsynth.models.qwen_image_text_encoder.QwenImageTextEncoder": QwenImageTextEncoder_Module_Map_Updater,
284
+ }
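A hedged sketch of how these maps are meant to be consumed: resolve the module map for a model class, routing through VERSION_CHECKER_MAPS when a version-dependent updater is registered. `resolve_module_map` is a hypothetical helper, not part of this file.

def resolve_module_map(model_class_name: str) -> dict:
    # Version checkers return an (updated) map; otherwise use the static one.
    if model_class_name in VERSION_CHECKER_MAPS:
        return VERSION_CHECKER_MAPS[model_class_name]()
    return VRAM_MANAGEMENT_MODULE_MAPS[model_class_name]

module_map = resolve_module_map("diffsynth.models.flux_dit.FluxDiT")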
diffsynth/core/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from .attention import *
2
+ from .data import *
3
+ from .gradient import *
4
+ from .loader import *
5
+ from .vram import *
6
+ from .device import *
diffsynth/core/attention/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .attention import attention_forward
diffsynth/core/attention/attention.py ADDED
@@ -0,0 +1,121 @@
1
+ import torch, os
2
+ from einops import rearrange
3
+
4
+
5
+ try:
6
+ import flash_attn_interface
7
+ FLASH_ATTN_3_AVAILABLE = True
8
+ except ModuleNotFoundError:
9
+ FLASH_ATTN_3_AVAILABLE = False
10
+
11
+ try:
12
+ import flash_attn
13
+ FLASH_ATTN_2_AVAILABLE = True
14
+ except ModuleNotFoundError:
15
+ FLASH_ATTN_2_AVAILABLE = False
16
+
17
+ try:
18
+ from sageattention import sageattn
19
+ SAGE_ATTN_AVAILABLE = True
20
+ except ModuleNotFoundError:
21
+ SAGE_ATTN_AVAILABLE = False
22
+
23
+ try:
24
+ import xformers.ops as xops
25
+ XFORMERS_AVAILABLE = True
26
+ except ModuleNotFoundError:
27
+ XFORMERS_AVAILABLE = False
28
+
29
+
30
+ def initialize_attention_priority():
31
+ if os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION') is not None:
32
+ return os.environ.get('DIFFSYNTH_ATTENTION_IMPLEMENTATION').lower()
33
+ elif FLASH_ATTN_3_AVAILABLE:
34
+ return "flash_attention_3"
35
+ elif FLASH_ATTN_2_AVAILABLE:
36
+ return "flash_attention_2"
37
+ elif SAGE_ATTN_AVAILABLE:
38
+ return "sage_attention"
39
+ elif XFORMERS_AVAILABLE:
40
+ return "xformers"
41
+ else:
42
+ return "torch"
43
+
44
+
45
+ ATTENTION_IMPLEMENTATION = initialize_attention_priority()
46
+
47
+
48
+ def rearrange_qkv(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", required_in_pattern="b n s d", dims=None):
49
+ dims = {} if dims is None else dims
50
+ if q_pattern != required_in_pattern:
51
+ q = rearrange(q, f"{q_pattern} -> {required_in_pattern}", **dims)
52
+ if k_pattern != required_in_pattern:
53
+ k = rearrange(k, f"{k_pattern} -> {required_in_pattern}", **dims)
54
+ if v_pattern != required_in_pattern:
55
+ v = rearrange(v, f"{v_pattern} -> {required_in_pattern}", **dims)
56
+ return q, k, v
57
+
58
+
59
+ def rearrange_out(out: torch.Tensor, out_pattern="b n s d", required_out_pattern="b n s d", dims=None):
60
+ dims = {} if dims is None else dims
61
+ if out_pattern != required_out_pattern:
62
+ out = rearrange(out, f"{required_out_pattern} -> {out_pattern}", **dims)
63
+ return out
64
+
65
+
66
+ def torch_sdpa(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, attn_mask=None, scale=None):
67
+ required_in_pattern, required_out_pattern= "b n s d", "b n s d"
68
+ q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
69
+ out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask, scale=scale)
70
+ out = rearrange_out(out, out_pattern, required_out_pattern, dims)
71
+ return out
72
+
73
+
74
+ def flash_attention_3(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
75
+ required_in_pattern, required_out_pattern= "b s n d", "b s n d"
76
+ q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
77
+ out = flash_attn_interface.flash_attn_func(q, k, v, softmax_scale=scale)
78
+ if isinstance(out, tuple):
79
+ out = out[0]
80
+ out = rearrange_out(out, out_pattern, required_out_pattern, dims)
81
+ return out
82
+
83
+
84
+ def flash_attention_2(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
85
+ required_in_pattern, required_out_pattern= "b s n d", "b s n d"
86
+ q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
87
+ out = flash_attn.flash_attn_func(q, k, v, softmax_scale=scale)
88
+ out = rearrange_out(out, out_pattern, required_out_pattern, dims)
89
+ return out
90
+
91
+
92
+ def sage_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
93
+ required_in_pattern, required_out_pattern= "b n s d", "b n s d"
94
+ q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
95
+ out = sageattn(q, k, v, sm_scale=scale)
96
+ out = rearrange_out(out, out_pattern, required_out_pattern, dims)
97
+ return out
98
+
99
+
100
+ def xformers_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, scale=None):
101
+ required_in_pattern, required_out_pattern= "b s n d", "b s n d"
102
+ q, k, v = rearrange_qkv(q, k, v, q_pattern, k_pattern, v_pattern, required_in_pattern, dims)
103
+ out = xops.memory_efficient_attention(q, k, v, scale=scale)
104
+ out = rearrange_out(out, out_pattern, required_out_pattern, dims)
105
+ return out
106
+
107
+
108
+ def attention_forward(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, q_pattern="b n s d", k_pattern="b n s d", v_pattern="b n s d", out_pattern="b n s d", dims=None, attn_mask=None, scale=None, compatibility_mode=False):
109
+ if compatibility_mode or (attn_mask is not None):
110
+ return torch_sdpa(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, attn_mask=attn_mask, scale=scale)
111
+ else:
112
+ if ATTENTION_IMPLEMENTATION == "flash_attention_3":
113
+ return flash_attention_3(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
114
+ elif ATTENTION_IMPLEMENTATION == "flash_attention_2":
115
+ return flash_attention_2(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
116
+ elif ATTENTION_IMPLEMENTATION == "sage_attention":
117
+ return sage_attention(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
118
+ elif ATTENTION_IMPLEMENTATION == "xformers":
119
+ return xformers_attention(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
120
+ else:
121
+ return torch_sdpa(q, k, v, q_pattern, k_pattern, v_pattern, out_pattern, dims, scale=scale)
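Usage sketch for attention_forward (tensor shapes are illustrative). The backend is selected once at import time; setting DIFFSYNTH_ATTENTION_IMPLEMENTATION (e.g. "torch" or "flash_attention_2") before import pins it explicitly, and any call with an attn_mask always falls back to torch SDPA.

import torch

q = torch.randn(2, 8, 128, 64)  # "b n s d": batch, heads, seq, head_dim
k = torch.randn(2, 8, 128, 64)
v = torch.randn(2, 8, 128, 64)
out = attention_forward(q, k, v)                          # "b n s d" in and out
out2 = attention_forward(q, k, v, out_pattern="b s n d")  # output rearranged to (2, 128, 8, 64)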
diffsynth/core/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .unified_dataset import UnifiedDataset
diffsynth/core/data/operators.py ADDED
@@ -0,0 +1,280 @@
1
+ import math
2
+ import torch, torchvision, imageio, os
3
+ import imageio.v3 as iio
4
+ from PIL import Image
5
+ import torchaudio
6
+
7
+
8
+ class DataProcessingPipeline:
9
+ def __init__(self, operators=None):
10
+ self.operators: list[DataProcessingOperator] = [] if operators is None else operators
11
+
12
+ def __call__(self, data):
13
+ for operator in self.operators:
14
+ data = operator(data)
15
+ return data
16
+
17
+ def __rshift__(self, pipe):
18
+ if isinstance(pipe, DataProcessingOperator):
19
+ pipe = DataProcessingPipeline([pipe])
20
+ return DataProcessingPipeline(self.operators + pipe.operators)
21
+
22
+
23
+ class DataProcessingOperator:
24
+ def __call__(self, data):
25
+ raise NotImplementedError("DataProcessingOperator cannot be called directly.")
26
+
27
+ def __rshift__(self, pipe):
28
+ if isinstance(pipe, DataProcessingOperator):
29
+ pipe = DataProcessingPipeline([pipe])
30
+ return DataProcessingPipeline([self]).__rshift__(pipe)
31
+
32
+
33
+ class DataProcessingOperatorRaw(DataProcessingOperator):
34
+ def __call__(self, data):
35
+ return data
36
+
37
+
38
+ class ToInt(DataProcessingOperator):
39
+ def __call__(self, data):
40
+ return int(data)
41
+
42
+
43
+ class ToFloat(DataProcessingOperator):
44
+ def __call__(self, data):
45
+ return float(data)
46
+
47
+
48
+ class ToStr(DataProcessingOperator):
49
+ def __init__(self, none_value=""):
50
+ self.none_value = none_value
51
+
52
+ def __call__(self, data):
53
+ if data is None: data = self.none_value
54
+ return str(data)
55
+
56
+
57
+ class LoadImage(DataProcessingOperator):
58
+ def __init__(self, convert_RGB=True, convert_RGBA=False):
59
+ self.convert_RGB = convert_RGB
60
+ self.convert_RGBA = convert_RGBA
61
+
62
+ def __call__(self, data: str):
63
+ image = Image.open(data)
64
+ if self.convert_RGB: image = image.convert("RGB")
65
+ if self.convert_RGBA: image = image.convert("RGBA")
66
+ return image
67
+
68
+
69
+ class ImageCropAndResize(DataProcessingOperator):
70
+ def __init__(self, height=None, width=None, max_pixels=None, height_division_factor=1, width_division_factor=1):
71
+ self.height = height
72
+ self.width = width
73
+ self.max_pixels = max_pixels
74
+ self.height_division_factor = height_division_factor
75
+ self.width_division_factor = width_division_factor
76
+
77
+ def crop_and_resize(self, image, target_height, target_width):
78
+ width, height = image.size
79
+ scale = max(target_width / width, target_height / height)
80
+ image = torchvision.transforms.functional.resize(
81
+ image,
82
+ (round(height*scale), round(width*scale)),
83
+ interpolation=torchvision.transforms.InterpolationMode.BILINEAR
84
+ )
85
+ image = torchvision.transforms.functional.center_crop(image, (target_height, target_width))
86
+ return image
87
+
88
+ def get_height_width(self, image):
89
+ if self.height is None or self.width is None:
90
+ width, height = image.size
91
+ if width * height > self.max_pixels:
92
+ scale = (width * height / self.max_pixels) ** 0.5
93
+ height, width = int(height / scale), int(width / scale)
94
+ height = height // self.height_division_factor * self.height_division_factor
95
+ width = width // self.width_division_factor * self.width_division_factor
96
+ else:
97
+ height, width = self.height, self.width
98
+ return height, width
99
+
100
+ def __call__(self, data: Image.Image):
101
+ image = self.crop_and_resize(data, *self.get_height_width(data))
102
+ return image
103
+
104
+
105
+ class ToList(DataProcessingOperator):
106
+ def __call__(self, data):
107
+ return [data]
108
+
109
+
110
+ class FrameSamplerByRateMixin:
111
+ def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_rate=24, fix_frame_rate=False):
112
+ self.num_frames = num_frames
113
+ self.time_division_factor = time_division_factor
114
+ self.time_division_remainder = time_division_remainder
115
+ self.frame_rate = frame_rate
116
+ self.fix_frame_rate = fix_frame_rate
117
+
118
+ def get_reader(self, data: str):
119
+ return imageio.get_reader(data)
120
+
121
+ def get_available_num_frames(self, reader):
122
+ if not self.fix_frame_rate:
123
+ return reader.count_frames()
124
+ meta_data = reader.get_meta_data()
125
+ total_original_frames = int(reader.count_frames())
126
+ duration = meta_data["duration"] if "duration" in meta_data else total_original_frames / meta_data['fps']
127
+ total_available_frames = math.floor(duration * self.frame_rate)
128
+ return int(total_available_frames)
129
+
130
+ def get_num_frames(self, reader):
131
+ num_frames = self.num_frames
132
+ total_frames = self.get_available_num_frames(reader)
133
+ if int(total_frames) < num_frames:
134
+ num_frames = total_frames
135
+ while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
136
+ num_frames -= 1
137
+ return num_frames
138
+
139
+ def map_single_frame_id(self, new_sequence_id: int, raw_frame_rate: float, total_raw_frames: int) -> int:
140
+ if not self.fix_frame_rate:
141
+ return new_sequence_id
142
+ target_time_in_seconds = new_sequence_id / self.frame_rate
143
+ raw_frame_index_float = target_time_in_seconds * raw_frame_rate
144
+ frame_id = int(round(raw_frame_index_float))
145
+ frame_id = min(frame_id, total_raw_frames - 1)
146
+ return frame_id
147
+
148
+
149
+ class LoadVideo(DataProcessingOperator, FrameSamplerByRateMixin):
150
+ def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x, frame_rate=24, fix_frame_rate=False):
151
+ FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
152
+ # frame_processor is build in the video loader for high efficiency.
153
+ self.frame_processor = frame_processor
154
+
155
+ def __call__(self, data: str):
156
+ reader = self.get_reader(data)
157
+ raw_frame_rate = reader.get_meta_data()['fps']
158
+ total_raw_frames = reader.count_frames()
159
+ total_available = self.get_available_num_frames(reader)
160
+ # Pad short videos with the last frame instead of reducing num_frames
161
+ num_frames = self.num_frames
162
+ frames = []
163
+ for frame_id in range(num_frames):
164
+ if frame_id < total_available:
165
+ raw_id = self.map_single_frame_id(frame_id, raw_frame_rate, total_raw_frames)
166
+ frame = reader.get_data(raw_id)
167
+ frame = Image.fromarray(frame)
168
+ frame = self.frame_processor(frame)
169
+ frames.append(frame)
170
+ else:
171
+ # Pad with the last frame
172
+ frames.append(frames[-1])
173
+ reader.close()
174
+ return frames
175
+
176
+
177
+ class SequencialProcess(DataProcessingOperator):
178
+ def __init__(self, operator=lambda x: x):
179
+ self.operator = operator
180
+
181
+ def __call__(self, data):
182
+ return [self.operator(i) for i in data]
183
+
184
+
185
+ class LoadGIF(DataProcessingOperator):
186
+ def __init__(self, num_frames=81, time_division_factor=4, time_division_remainder=1, frame_processor=lambda x: x):
187
+ self.num_frames = num_frames
188
+ self.time_division_factor = time_division_factor
189
+ self.time_division_remainder = time_division_remainder
190
+ # frame_processor is built into the video loader for efficiency.
191
+ self.frame_processor = frame_processor
192
+
193
+ def get_num_frames(self, path):
194
+ num_frames = self.num_frames
195
+ images = iio.imread(path, mode="RGB")
196
+ if len(images) < num_frames:
197
+ num_frames = len(images)
198
+ while num_frames > 1 and num_frames % self.time_division_factor != self.time_division_remainder:
199
+ num_frames -= 1
200
+ return num_frames
201
+
202
+ def __call__(self, data: str):
203
+ num_frames = self.get_num_frames(data)
204
+ frames = []
205
+ images = iio.imread(data, mode="RGB")
206
+ for img in images:
207
+ frame = Image.fromarray(img)
208
+ frame = self.frame_processor(frame)
209
+ frames.append(frame)
210
+ if len(frames) >= num_frames:
211
+ break
212
+ return frames
213
+
214
+
215
+ class RouteByExtensionName(DataProcessingOperator):
216
+ def __init__(self, operator_map):
217
+ self.operator_map = operator_map
218
+
219
+ def __call__(self, data: str):
220
+ file_ext_name = data.split(".")[-1].lower()
221
+ for ext_names, operator in self.operator_map:
222
+ if ext_names is None or file_ext_name in ext_names:
223
+ return operator(data)
224
+ raise ValueError(f"Unsupported file: {data}")
225
+
226
+
227
+ class RouteByType(DataProcessingOperator):
228
+ def __init__(self, operator_map):
229
+ self.operator_map = operator_map
230
+
231
+ def __call__(self, data):
232
+ for dtype, operator in self.operator_map:
233
+ if dtype is None or isinstance(data, dtype):
234
+ return operator(data)
235
+ raise ValueError(f"Unsupported data: {data}")
236
+
237
+
238
+ class LoadTorchPickle(DataProcessingOperator):
239
+ def __init__(self, map_location="cpu"):
240
+ self.map_location = map_location
241
+
242
+ def __call__(self, data):
243
+ return torch.load(data, map_location=self.map_location, weights_only=False)
244
+
245
+
246
+ class ToAbsolutePath(DataProcessingOperator):
247
+ def __init__(self, base_path=""):
248
+ self.base_path = base_path
249
+
250
+ def __call__(self, data):
251
+ return os.path.join(self.base_path, data)
252
+
253
+
254
+ class LoadAudio(DataProcessingOperator):
255
+ def __init__(self, sr=16000):
256
+ self.sr = sr
257
+ def __call__(self, data: str):
258
+ import librosa
259
+ input_audio, sample_rate = librosa.load(data, sr=self.sr)
260
+ return input_audio
261
+
262
+
263
+ class LoadAudioWithTorchaudio(DataProcessingOperator, FrameSamplerByRateMixin):
264
+
265
+ def __init__(self, num_frames=121, time_division_factor=8, time_division_remainder=1, frame_rate=24, fix_frame_rate=True):
266
+ FrameSamplerByRateMixin.__init__(self, num_frames, time_division_factor, time_division_remainder, frame_rate, fix_frame_rate)
267
+
268
+ def __call__(self, data: str):
269
+ reader = self.get_reader(data)
270
+ num_frames = self.get_num_frames(reader)
271
+ duration = num_frames / self.frame_rate
272
+ waveform, sample_rate = torchaudio.load(data)
273
+ target_samples = int(duration * sample_rate)
274
+ current_samples = waveform.shape[-1]
275
+ if current_samples > target_samples:
276
+ waveform = waveform[..., :target_samples]
277
+ elif current_samples < target_samples:
278
+ padding = target_samples - current_samples
279
+ waveform = torch.nn.functional.pad(waveform, (0, padding))
280
+ return waveform, sample_rate
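A hedged sketch of operator composition: `>>` chains operators into a DataProcessingPipeline, so calling the pipeline feeds each operator's output into the next. The directory and file name below are hypothetical.

pipeline = (
    ToAbsolutePath("/data/images")
    >> LoadImage()
    >> ImageCropAndResize(height=512, width=512)
)
image = pipeline("cat.jpg")  # loads /data/images/cat.jpg, then crops and resizes to 512x512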
diffsynth/core/data/unified_dataset.py ADDED
@@ -0,0 +1,118 @@
1
+ from .operators import *
2
+ import torch, json, pandas
3
+
4
+
5
+ class UnifiedDataset(torch.utils.data.Dataset):
6
+ def __init__(
7
+ self,
8
+ base_path=None, metadata_path=None,
9
+ repeat=1,
10
+ data_file_keys=tuple(),
11
+ main_data_operator=lambda x: x,
12
+ special_operator_map=None,
13
+ max_data_items=None,
14
+ ):
15
+ self.base_path = base_path
16
+ self.metadata_path = metadata_path
17
+ self.repeat = repeat
18
+ self.data_file_keys = data_file_keys
19
+ self.main_data_operator = main_data_operator
20
+ self.cached_data_operator = LoadTorchPickle()
21
+ self.special_operator_map = {} if special_operator_map is None else special_operator_map
22
+ self.max_data_items = max_data_items
23
+ self.data = []
24
+ self.cached_data = []
25
+ self.load_from_cache = metadata_path is None
26
+ self.load_metadata(metadata_path)
27
+
28
+ @staticmethod
29
+ def default_image_operator(
30
+ base_path="",
31
+ max_pixels=1920*1080, height=None, width=None,
32
+ height_division_factor=16, width_division_factor=16,
33
+ ):
34
+ return RouteByType(operator_map=[
35
+ (str, ToAbsolutePath(base_path) >> LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor)),
36
+ (list, SequencialProcess(ToAbsolutePath(base_path) >> LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor))),
37
+ ])
38
+
39
+ @staticmethod
40
+ def default_video_operator(
41
+ base_path="",
42
+ max_pixels=1920*1080, height=None, width=None,
43
+ height_division_factor=16, width_division_factor=16,
44
+ num_frames=81, time_division_factor=4, time_division_remainder=1,
45
+ frame_rate=24, fix_frame_rate=False,
46
+ ):
47
+ return RouteByType(operator_map=[
48
+ (str, ToAbsolutePath(base_path) >> RouteByExtensionName(operator_map=[
49
+ (("jpg", "jpeg", "png", "webp"), LoadImage() >> ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor) >> ToList()),
50
+ (("gif",), LoadGIF(
51
+ num_frames, time_division_factor, time_division_remainder,
52
+ frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor),
53
+ )),
54
+ (("mp4", "avi", "mov", "wmv", "mkv", "flv", "webm"), LoadVideo(
55
+ num_frames, time_division_factor, time_division_remainder,
56
+ frame_processor=ImageCropAndResize(height, width, max_pixels, height_division_factor, width_division_factor),
57
+ frame_rate=frame_rate, fix_frame_rate=fix_frame_rate,
58
+ )),
59
+ ])),
60
+ ])
61
+
62
+ def search_for_cached_data_files(self, path):
63
+ for file_name in os.listdir(path):
64
+ subpath = os.path.join(path, file_name)
65
+ if os.path.isdir(subpath):
66
+ self.search_for_cached_data_files(subpath)
67
+ elif subpath.endswith(".pth"):
68
+ self.cached_data.append(subpath)
69
+
70
+ def load_metadata(self, metadata_path):
71
+ if metadata_path is None:
72
+ print("No metadata_path. Searching for cached data files.")
73
+ self.search_for_cached_data_files(self.base_path)
74
+ print(f"{len(self.cached_data)} cached data files found.")
75
+ elif metadata_path.endswith(".json"):
76
+ with open(metadata_path, "r") as f:
77
+ metadata = json.load(f)
78
+ self.data = metadata
79
+ elif metadata_path.endswith(".jsonl"):
80
+ metadata = []
81
+ with open(metadata_path, 'r') as f:
82
+ for line in f:
83
+ metadata.append(json.loads(line.strip()))
84
+ self.data = metadata
85
+ else:
86
+ metadata = pandas.read_csv(metadata_path)
87
+ self.data = [metadata.iloc[i].to_dict() for i in range(len(metadata))]
88
+
89
+ def __getitem__(self, data_id):
90
+ if self.load_from_cache:
91
+ data = self.cached_data[data_id % len(self.cached_data)]
92
+ data = self.cached_data_operator(data)
93
+ else:
94
+ data = self.data[data_id % len(self.data)].copy()
95
+ for key in self.data_file_keys:
96
+ if key in data:
97
+ if key in self.special_operator_map:
98
+ data[key] = self.special_operator_map[key](data[key])
99
+ elif key in self.data_file_keys:
100
+ data[key] = self.main_data_operator(data[key])
101
+ return data
102
+
103
+ def __len__(self):
104
+ if self.max_data_items is not None:
105
+ return self.max_data_items
106
+ elif self.load_from_cache:
107
+ return len(self.cached_data) * self.repeat
108
+ else:
109
+ return len(self.data) * self.repeat
110
+
111
+ def check_data_equal(self, data1, data2):
112
+ # Debug only
113
+ if len(data1) != len(data2):
114
+ return False
115
+ for k in data1:
116
+ if data1[k] != data2[k]:
117
+ return False
118
+ return True
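A minimal construction sketch (file layout and column names are hypothetical): metadata rows are loaded from CSV/JSON/JSONL, and every key listed in data_file_keys is routed through main_data_operator unless a special operator is registered for it.

dataset = UnifiedDataset(
    base_path="/data",
    metadata_path="/data/metadata.csv",  # rows with e.g. "image" and "prompt" columns
    data_file_keys=("image",),
    main_data_operator=UnifiedDataset.default_image_operator(base_path="/data"),
)
sample = dataset[0]  # {"image": <PIL.Image>, "prompt": "..."}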
diffsynth/core/device/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .npu_compatible_device import parse_device_type, parse_nccl_backend, get_available_device_type, get_device_name
2
+ from .npu_compatible_device import IS_NPU_AVAILABLE, IS_CUDA_AVAILABLE
diffsynth/core/device/npu_compatible_device.py ADDED
@@ -0,0 +1,107 @@
1
+ import importlib
2
+ import torch
3
+ from typing import Any
4
+
5
+
6
+ def is_torch_npu_available():
7
+ return importlib.util.find_spec("torch_npu") is not None
8
+
9
+
10
+ IS_CUDA_AVAILABLE = torch.cuda.is_available()
11
+ IS_NPU_AVAILABLE = is_torch_npu_available() and torch.npu.is_available()
12
+
13
+ if IS_NPU_AVAILABLE:
14
+ import torch_npu
15
+
16
+ torch.npu.config.allow_internal_format = False
17
+
18
+
19
+ def get_device_type() -> str:
20
+ """Get device type based on current machine, currently only support CPU, CUDA, NPU."""
21
+ if IS_CUDA_AVAILABLE:
22
+ device = "cuda"
23
+ elif IS_NPU_AVAILABLE:
24
+ device = "npu"
25
+ else:
26
+ device = "cpu"
27
+
28
+ return device
29
+
30
+
31
+ def get_torch_device() -> Any:
32
+ """Get torch attribute based on device type, e.g. torch.cuda or torch.npu"""
33
+ device_name = get_device_type()
34
+
35
+ try:
36
+ return getattr(torch, device_name)
37
+ except AttributeError:
38
+ print(f"Device namespace '{device_name}' not found in torch, try to load 'torch.cuda'.")
39
+ return torch.cuda
40
+
41
+
42
+ def get_device_id() -> int:
43
+ """Get current device id based on device type."""
44
+ return get_torch_device().current_device()
45
+
46
+
47
+ def get_device_name() -> str:
48
+ """Get current device name based on device type."""
49
+ return f"{get_device_type()}:{get_device_id()}"
50
+
51
+
52
+ def synchronize() -> None:
53
+ """Execute torch synchronize operation."""
54
+ get_torch_device().synchronize()
55
+
56
+
57
+ def empty_cache() -> None:
58
+ """Execute torch empty cache operation."""
59
+ get_torch_device().empty_cache()
60
+
61
+
62
+ def get_nccl_backend() -> str:
63
+ """Return distributed communication backend type based on device type."""
64
+ if IS_CUDA_AVAILABLE:
65
+ return "nccl"
66
+ elif IS_NPU_AVAILABLE:
67
+ return "hccl"
68
+ else:
69
+ raise RuntimeError(f"No available distributed communication backend found on device type {get_device_type()}.")
70
+
71
+
72
+ def enable_high_precision_for_bf16():
73
+ """
74
+ Set high accumulation dtype for matmul and reduction.
75
+ """
76
+ if IS_CUDA_AVAILABLE:
77
+ torch.backends.cuda.matmul.allow_tf32 = False
78
+ torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
79
+
80
+ if IS_NPU_AVAILABLE:
81
+ torch.npu.matmul.allow_tf32 = False
82
+ torch.npu.matmul.allow_bf16_reduced_precision_reduction = False
83
+
84
+
85
+ def parse_device_type(device):
86
+ if isinstance(device, str):
87
+ if device.startswith("cuda"):
88
+ return "cuda"
89
+ elif device.startswith("npu"):
90
+ return "npu"
91
+ else:
92
+ return "cpu"
93
+ elif isinstance(device, torch.device):
94
+ return device.type
95
+
96
+
97
+ def parse_nccl_backend(device_type):
98
+ if device_type == "cuda":
99
+ return "nccl"
100
+ elif device_type == "npu":
101
+ return "hccl"
102
+ else:
103
+ raise RuntimeError(f"No available distributed communication backend found on device type {device_type}.")
104
+
105
+
106
+ def get_available_device_type():
107
+ return get_device_type()
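Usage sketch for the device helpers; the values shown assume a single-GPU CUDA machine.

device_type = get_available_device_type()  # "cuda", "npu", or "cpu"
backend = parse_nccl_backend(device_type)  # "nccl" on CUDA, "hccl" on NPU
print(get_device_name())                   # e.g. "cuda:0"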
diffsynth/core/gradient/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .gradient_checkpoint import gradient_checkpoint_forward
diffsynth/core/gradient/gradient_checkpoint.py ADDED
@@ -0,0 +1,37 @@
1
+ import torch
2
+ import warnings
3
+ # Suppress checkpoint requires_grad warning - gradients flow through model params, not inputs
4
+ warnings.filterwarnings("ignore", message=".*None of the inputs have requires_grad.*")
5
+
6
+
7
+ def create_custom_forward(module):
8
+ def custom_forward(*inputs, **kwargs):
9
+ return module(*inputs, **kwargs)
10
+ return custom_forward
11
+
12
+
13
+ def gradient_checkpoint_forward(
14
+ model,
15
+ use_gradient_checkpointing,
16
+ use_gradient_checkpointing_offload,
17
+ *args,
18
+ **kwargs,
19
+ ):
20
+ if use_gradient_checkpointing_offload:
21
+ with torch.autograd.graph.save_on_cpu():
22
+ model_output = torch.utils.checkpoint.checkpoint(
23
+ create_custom_forward(model),
24
+ *args,
25
+ **kwargs,
26
+ use_reentrant=True,
27
+ )
28
+ elif use_gradient_checkpointing:
29
+ model_output = torch.utils.checkpoint.checkpoint(
30
+ create_custom_forward(model),
31
+ *args,
32
+ **kwargs,
33
+ use_reentrant=True,
34
+ )
35
+ else:
36
+ model_output = model(*args, **kwargs)
37
+ return model_output
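A hedged sketch of the intended call pattern: the two checkpointing flags are passed positionally, followed by the wrapped module's own arguments. The toy linear blocks below stand in for transformer blocks.

import torch

blocks = torch.nn.ModuleList(torch.nn.Linear(64, 64) for _ in range(4))
x = torch.randn(2, 64)
for block in blocks:
    # (model, use_gradient_checkpointing, use_gradient_checkpointing_offload, *args)
    x = gradient_checkpoint_forward(block, True, False, x)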
diffsynth/core/loader/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .file import load_state_dict, hash_state_dict_keys, hash_model_file
2
+ from .model import load_model, load_model_with_disk_offload
3
+ from .config import ModelConfig
diffsynth/core/loader/config.py ADDED
@@ -0,0 +1,119 @@
1
+ import torch, glob, os
2
+ from typing import Optional, Union, Dict
3
+ from dataclasses import dataclass
4
+ from modelscope import snapshot_download
5
+ from huggingface_hub import snapshot_download as hf_snapshot_download
7
+
8
+
9
+ @dataclass
10
+ class ModelConfig:
11
+ path: Union[str, list[str]] = None
12
+ model_id: str = None
13
+ origin_file_pattern: Union[str, list[str]] = None
14
+ download_source: str = None
15
+ local_model_path: str = None
16
+ skip_download: bool = None
17
+ offload_device: Optional[Union[str, torch.device]] = None
18
+ offload_dtype: Optional[torch.dtype] = None
19
+ onload_device: Optional[Union[str, torch.device]] = None
20
+ onload_dtype: Optional[torch.dtype] = None
21
+ preparing_device: Optional[Union[str, torch.device]] = None
22
+ preparing_dtype: Optional[torch.dtype] = None
23
+ computation_device: Optional[Union[str, torch.device]] = None
24
+ computation_dtype: Optional[torch.dtype] = None
25
+ clear_parameters: bool = False
26
+ state_dict: Dict[str, torch.Tensor] = None
27
+
28
+ def check_input(self):
29
+ if self.path is None and self.model_id is None:
30
+ raise ValueError(f"""No valid model files. Please use `ModelConfig(path="xxx")` or `ModelConfig(model_id="xxx/yyy", origin_file_pattern="zzz")`. `skip_download=True` only supports the first one.""")
31
+
32
+ def parse_original_file_pattern(self):
33
+ if self.origin_file_pattern in [None, "", "./"]:
34
+ return "*"
35
+ elif self.origin_file_pattern.endswith("/"):
36
+ return self.origin_file_pattern + "*"
37
+ else:
38
+ return self.origin_file_pattern
39
+
40
+ def parse_download_source(self):
41
+ if self.download_source is None:
42
+ if os.environ.get('DIFFSYNTH_DOWNLOAD_SOURCE') is not None:
43
+ return os.environ.get('DIFFSYNTH_DOWNLOAD_SOURCE')
44
+ else:
45
+ return "modelscope"
46
+ else:
47
+ return self.download_source
48
+
49
+ def parse_skip_download(self):
50
+ if self.skip_download is None:
51
+ if os.environ.get('DIFFSYNTH_SKIP_DOWNLOAD') is not None:
52
+ if os.environ.get('DIFFSYNTH_SKIP_DOWNLOAD').lower() == "true":
53
+ return True
54
+ elif os.environ.get('DIFFSYNTH_SKIP_DOWNLOAD').lower() == "false":
55
+ return False
56
+ else:
57
+ return False
58
+ else:
59
+ return self.skip_download
60
+
61
+ def download(self):
62
+ origin_file_pattern = self.parse_original_file_pattern()
63
+ downloaded_files = glob.glob(origin_file_pattern, root_dir=os.path.join(self.local_model_path, self.model_id))
64
+ download_source = self.parse_download_source()
65
+ if download_source.lower() == "modelscope":
66
+ snapshot_download(
67
+ self.model_id,
68
+ local_dir=os.path.join(self.local_model_path, self.model_id),
69
+ allow_file_pattern=origin_file_pattern,
70
+ ignore_file_pattern=downloaded_files,
71
+ local_files_only=False
72
+ )
73
+ elif download_source.lower() == "huggingface":
74
+ hf_snapshot_download(
75
+ self.model_id,
76
+ local_dir=os.path.join(self.local_model_path, self.model_id),
77
+ allow_patterns=origin_file_pattern,
78
+ ignore_patterns=downloaded_files,
79
+ local_files_only=False
80
+ )
81
+ else:
82
+ raise ValueError("`download_source` should be `modelscope` or `huggingface`.")
83
+
84
+ def require_downloading(self):
85
+ if self.path is not None:
86
+ return False
87
+ skip_download = self.parse_skip_download()
88
+ return not skip_download
89
+
90
+ def reset_local_model_path(self):
91
+ if os.environ.get('DIFFSYNTH_MODEL_BASE_PATH') is not None:
92
+ self.local_model_path = os.environ.get('DIFFSYNTH_MODEL_BASE_PATH')
93
+ elif self.local_model_path is None:
94
+ self.local_model_path = "./models"
95
+
96
+ def download_if_necessary(self):
97
+ self.check_input()
98
+ self.reset_local_model_path()
99
+ if self.require_downloading():
100
+ self.download()
101
+ if self.path is None:
102
+ if self.origin_file_pattern in [None, "", "./"]:
103
+ self.path = os.path.join(self.local_model_path, self.model_id)
104
+ else:
105
+ self.path = glob.glob(os.path.join(self.local_model_path, self.model_id, self.origin_file_pattern))
106
+ if isinstance(self.path, list) and len(self.path) == 1:
107
+ self.path = self.path[0]
108
+
109
+ def vram_config(self):
110
+ return {
111
+ "offload_device": self.offload_device,
112
+ "offload_dtype": self.offload_dtype,
113
+ "onload_device": self.onload_device,
114
+ "onload_dtype": self.onload_dtype,
115
+ "preparing_device": self.preparing_device,
116
+ "preparing_dtype": self.preparing_dtype,
117
+ "computation_device": self.computation_device,
118
+ "computation_dtype": self.computation_dtype,
119
+ }
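Usage sketch: download_if_necessary resolves config.path, downloading from ModelScope by default (or Hugging Face via download_source / DIFFSYNTH_DOWNLOAD_SOURCE) under ./models unless local_model_path or DIFFSYNTH_MODEL_BASE_PATH overrides it. The model id and file pattern are taken from the example comments in model_configs.py above.

config = ModelConfig(
    model_id="Lightricks/LTX-2.3",
    origin_file_pattern="ltx-2.3-spatial-upscaler-x2-1.0.safetensors",
)
config.download_if_necessary()
print(config.path)  # local path(s) matching the pattern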
diffsynth/core/loader/file.py ADDED
@@ -0,0 +1,130 @@
+ from safetensors import safe_open
+ import torch, hashlib
+
+
+ def load_state_dict(file_path, torch_dtype=None, device="cpu", pin_memory=False, verbose=0):
+     if isinstance(file_path, list):
+         state_dict = {}
+         for file_path_ in file_path:
+             state_dict.update(load_state_dict(file_path_, torch_dtype, device, pin_memory=pin_memory, verbose=verbose))
+     else:
+         if verbose >= 1:
+             print(f"Loading file [started]: {file_path}")
+         if file_path.endswith(".safetensors"):
+             state_dict = load_state_dict_from_safetensors(file_path, torch_dtype=torch_dtype, device=device)
+         else:
+             state_dict = load_state_dict_from_bin(file_path, torch_dtype=torch_dtype, device=device)
+         # If the state dict is loaded into CPU memory, `pin_memory=True` makes a subsequent `model.to("cuda")` faster.
+         if pin_memory:
+             for i in state_dict:
+                 state_dict[i] = state_dict[i].pin_memory()
+         if verbose >= 1:
+             print(f"Loading file [done]: {file_path}")
+     return state_dict
+
+
+ def load_state_dict_from_safetensors(file_path, torch_dtype=None, device="cpu"):
+     state_dict = {}
+     with safe_open(file_path, framework="pt", device=str(device)) as f:
+         for k in f.keys():
+             state_dict[k] = f.get_tensor(k)
+             if torch_dtype is not None:
+                 state_dict[k] = state_dict[k].to(torch_dtype)
+     return state_dict
+
+
+ def load_state_dict_from_bin(file_path, torch_dtype=None, device="cpu"):
+     state_dict = torch.load(file_path, map_location=device, weights_only=True)
+     if len(state_dict) == 1:
+         if "state_dict" in state_dict:
+             state_dict = state_dict["state_dict"]
+         elif "module" in state_dict:
+             state_dict = state_dict["module"]
+         elif "model_state" in state_dict:
+             state_dict = state_dict["model_state"]
+     if torch_dtype is not None:
+         for i in state_dict:
+             if isinstance(state_dict[i], torch.Tensor):
+                 state_dict[i] = state_dict[i].to(torch_dtype)
+     return state_dict
+
+
+ def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
+     keys = []
+     for key, value in state_dict.items():
+         if isinstance(key, str):
+             if isinstance(value, torch.Tensor):
+                 if with_shape:
+                     shape = "_".join(map(str, list(value.shape)))
+                     keys.append(key + ":" + shape)
+                 keys.append(key)
+             elif isinstance(value, dict):
+                 keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
+     keys.sort()
+     keys_str = ",".join(keys)
+     return keys_str
+
+
+ def hash_state_dict_keys(state_dict, with_shape=True):
+     keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
+     keys_str = keys_str.encode(encoding="UTF-8")
+     return hashlib.md5(keys_str).hexdigest()
+
+
+ def load_keys_dict(file_path):
+     if isinstance(file_path, list):
+         state_dict = {}
+         for file_path_ in file_path:
+             state_dict.update(load_keys_dict(file_path_))
+         return state_dict
+     if file_path.endswith(".safetensors"):
+         return load_keys_dict_from_safetensors(file_path)
+     else:
+         return load_keys_dict_from_bin(file_path)
+
+
+ def load_keys_dict_from_safetensors(file_path):
+     keys_dict = {}
+     with safe_open(file_path, framework="pt", device="cpu") as f:
+         for k in f.keys():
+             keys_dict[k] = f.get_slice(k).get_shape()
+     return keys_dict
+
+
+ def convert_state_dict_to_keys_dict(state_dict):
+     keys_dict = {}
+     for k, v in state_dict.items():
+         if isinstance(v, torch.Tensor):
+             keys_dict[k] = list(v.shape)
+         else:
+             keys_dict[k] = convert_state_dict_to_keys_dict(v)
+     return keys_dict
+
+
+ def load_keys_dict_from_bin(file_path):
+     state_dict = load_state_dict_from_bin(file_path)
+     keys_dict = convert_state_dict_to_keys_dict(state_dict)
+     return keys_dict
+
+
+ def convert_keys_dict_to_single_str(state_dict, with_shape=True):
+     keys = []
+     for key, value in state_dict.items():
+         if isinstance(key, str):
+             if isinstance(value, dict):
+                 keys.append(key + "|" + convert_keys_dict_to_single_str(value, with_shape=with_shape))
+             else:
+                 if with_shape:
+                     shape = "_".join(map(str, list(value)))
+                     keys.append(key + ":" + shape)
+                 keys.append(key)
+     keys.sort()
+     keys_str = ",".join(keys)
+     return keys_str
+
+
+ def hash_model_file(path, with_shape=True):
+     keys_dict = load_keys_dict(path)
+     keys_str = convert_keys_dict_to_single_str(keys_dict, with_shape=with_shape)
+     keys_str = keys_str.encode(encoding="UTF-8")
+     return hashlib.md5(keys_str).hexdigest()
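`hash_model_file` fingerprints a checkpoint by its tensor names and shapes without reading tensor data (safetensors slices expose shapes only), so two checkpoints with the same architecture hash identically even if their weights differ. A minimal sketch, assuming a local safetensors file:

digest = hash_model_file("model.safetensors")                             # hypothetical path
digest_no_shape = hash_model_file("model.safetensors", with_shape=False)
# The digest can serve as a key that maps a checkpoint to a model architecture.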
diffsynth/core/loader/model.py ADDED
@@ -0,0 +1,115 @@
+ from ..vram.initialization import skip_model_initialization
+ from ..vram.disk_map import DiskMap
+ from ..vram.layers import enable_vram_management
+ from .file import load_state_dict
+ import torch
+ from transformers.integrations import is_deepspeed_zero3_enabled
+ from transformers.utils import ContextManagers
+
+
+ def load_model(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, use_disk_map=False, module_map=None, vram_config=None, vram_limit=None, state_dict=None):
+     config = {} if config is None else config
+     # Skip ZeRO-3 initialization for VAE to avoid compatibility issues.
+     skip_zero3 = 'vae' in model_class.__name__.lower() if hasattr(model_class, '__name__') else False
+     with ContextManagers(get_init_context(torch_dtype=torch_dtype, device=device, skip_zero3=skip_zero3)):
+         model = model_class(**config)
+     # What is `module_map`?
+     # It is a module mapping table for VRAM management.
+     if module_map is not None:
+         devices = [vram_config["offload_device"], vram_config["onload_device"], vram_config["preparing_device"], vram_config["computation_device"]]
+         device = [d for d in devices if d != "disk"][0]
+         dtypes = [vram_config["offload_dtype"], vram_config["onload_dtype"], vram_config["preparing_dtype"], vram_config["computation_dtype"]]
+         dtype = [d for d in dtypes if d != "disk"][0]
+         if vram_config["offload_device"] != "disk":
+             if state_dict is None: state_dict = DiskMap(path, device, torch_dtype=dtype)
+             if state_dict_converter is not None:
+                 state_dict = state_dict_converter(state_dict)
+             else:
+                 state_dict = {i: state_dict[i] for i in state_dict}
+             if is_deepspeed_zero3_enabled():
+                 from transformers.integrations.deepspeed import _load_state_dict_into_zero3_model
+                 _load_state_dict_into_zero3_model(model, state_dict)
+             else:
+                 model.load_state_dict(state_dict, assign=True)
+             model = enable_vram_management(model, module_map, vram_config=vram_config, disk_map=None, vram_limit=vram_limit)
+         else:
+             disk_map = DiskMap(path, device, state_dict_converter=state_dict_converter)
+             model = enable_vram_management(model, module_map, vram_config=vram_config, disk_map=disk_map, vram_limit=vram_limit)
+     else:
+         # Why do we use `DiskMap`?
+         # Sometimes a model file contains multiple models,
+         # and DiskMap can load only the parameters of a single model,
+         # avoiding the need to load all parameters in the file.
+         if state_dict is not None:
+             pass
+         elif use_disk_map:
+             state_dict = DiskMap(path, device, torch_dtype=torch_dtype)
+         else:
+             state_dict = load_state_dict(path, torch_dtype, device)
+         # Why do we use `state_dict_converter`?
+         # Some models are saved in complex formats,
+         # and we need to convert the state dict into the appropriate format.
+         if state_dict_converter is not None:
+             state_dict = state_dict_converter(state_dict)
+         else:
+             state_dict = {i: state_dict[i] for i in state_dict}
+         # Why does DeepSpeed ZeRO Stage 3 need to be handled separately?
+         # Because at this stage, model parameters are partitioned across multiple GPUs.
+         # Loading them directly could lead to excessive GPU memory consumption.
+         if is_deepspeed_zero3_enabled():
+             from transformers.integrations.deepspeed import _load_state_dict_into_zero3_model
+             _load_state_dict_into_zero3_model(model, state_dict)
+         else:
+             model.load_state_dict(state_dict, assign=True)
+         # Why do we call `to()`?
+         # Because some models override the behavior of `to()`,
+         # especially those from libraries like Transformers.
+         model = model.to(dtype=torch_dtype, device=device)
+     if hasattr(model, "eval"):
+         model = model.eval()
+     return model
+
+
+ def load_model_with_disk_offload(model_class, path, config=None, torch_dtype=torch.bfloat16, device="cpu", state_dict_converter=None, module_map=None):
+     if isinstance(path, str):
+         path = [path]
+     config = {} if config is None else config
+     with skip_model_initialization():
+         model = model_class(**config)
+     if hasattr(model, "eval"):
+         model = model.eval()
+     disk_map = DiskMap(path, device, state_dict_converter=state_dict_converter)
+     vram_config = {
+         "offload_dtype": "disk",
+         "offload_device": "disk",
+         "onload_dtype": "disk",
+         "onload_device": "disk",
+         "preparing_dtype": torch.float8_e4m3fn,
+         "preparing_device": device,
+         "computation_dtype": torch_dtype,
+         "computation_device": device,
+     }
+     enable_vram_management(model, module_map, vram_config=vram_config, disk_map=disk_map, vram_limit=80)
+     return model
+
+
+ def get_init_context(torch_dtype, device, skip_zero3=False):
+     if is_deepspeed_zero3_enabled() and not skip_zero3:
+         from transformers.modeling_utils import set_zero3_state
+         import deepspeed
+         # Why do we use `deepspeed.zero.Init`?
+         # It partitions the model weights on the CPU side and then loads
+         # the partitioned weights onto the accelerator.
+         init_contexts = [deepspeed.zero.Init(remote_device=device, dtype=torch_dtype), set_zero3_state()]
+     elif skip_zero3:
+         # For models excluded from ZeRO-3 (e.g. VAE), use normal initialization
+         # instead of `skip_model_initialization` to avoid meta tensor issues.
+         init_contexts = []
+     else:
+         # Why do we use `skip_model_initialization`?
+         # It skips the random initialization of model parameters,
+         # thereby speeding up model loading and avoiding excessive memory usage.
+         init_contexts = [skip_model_initialization()]
+
+     return init_contexts
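A hedged usage sketch for `load_model_with_disk_offload`; `MyDiT` is a placeholder model class, and the module map wraps linear layers with `AutoWrappedLinear` from `diffsynth/core/vram/layers.py` (added later in this commit):

from diffsynth.core.vram.layers import AutoWrappedLinear

model = load_model_with_disk_offload(
    MyDiT,                                # hypothetical torch.nn.Module subclass
    path="models/dit.safetensors",        # hypothetical checkpoint path
    torch_dtype=torch.bfloat16,
    device="cuda",
    module_map={torch.nn.Linear: AutoWrappedLinear},
)
# Weights stay on disk; each wrapped layer streams its parameters in at forward time.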
diffsynth/core/npu_patch/npu_fused_operator.py ADDED
@@ -0,0 +1,30 @@
+ import torch
+ from ..device.npu_compatible_device import get_device_type
+ try:
+     import torch_npu
+ except ImportError:
+     pass
+
+
+ def rms_norm_forward_npu(self, hidden_states):
+     "NPU fused RMSNorm operator, replacing RMSNorm.forward from diffsynth/models/general_modules.py"
+     if hidden_states.dtype != self.weight.dtype:
+         hidden_states = hidden_states.to(self.weight.dtype)
+     return torch_npu.npu_rms_norm(hidden_states, self.weight, self.eps)[0]
+
+
+ def rms_norm_forward_transformers_npu(self, hidden_states):
+     "NPU fused RMSNorm operator for transformers"
+     if hidden_states.dtype != self.weight.dtype:
+         hidden_states = hidden_states.to(self.weight.dtype)
+     return torch_npu.npu_rms_norm(hidden_states, self.weight, self.variance_epsilon)[0]
+
+
+ def rotary_emb_Zimage_npu(self, x_in: torch.Tensor, freqs_cis: torch.Tensor):
+     "NPU fused RoPE operator for Z-Image"
+     with torch.amp.autocast(get_device_type(), enabled=False):
+         freqs_cis = freqs_cis.unsqueeze(2)
+         cos, sin = torch.chunk(torch.view_as_real(freqs_cis), 2, dim=-1)
+         cos = cos.expand(-1, -1, -1, -1, 2).flatten(-2)
+         sin = sin.expand(-1, -1, -1, -1, 2).flatten(-2)
+         return torch_npu.npu_rotary_mul(x_in, cos, sin, rotary_mode="interleave").to(x_in)
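These functions are drop-in replacements for the corresponding `forward` methods. A sketch of how such a patch might be applied; the attachment point is an assumption, since the actual dispatch lives elsewhere in the library:

from diffsynth.models.general_modules import RMSNorm  # assumed import path

RMSNorm.forward = rms_norm_forward_npu  # hypothetical monkey-patch, applied on NPU devices only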
diffsynth/core/vram/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .initialization import skip_model_initialization
+ from .layers import *
diffsynth/core/vram/disk_map.py ADDED
@@ -0,0 +1,93 @@
+ from safetensors import safe_open
+ import torch, os
+
+
+ class SafetensorsCompatibleTensor:
+     def __init__(self, tensor):
+         self.tensor = tensor
+
+     def get_shape(self):
+         return list(self.tensor.shape)
+
+
+ class SafetensorsCompatibleBinaryLoader:
+     def __init__(self, path, device):
+         print("Detected a non-safetensors file, which may slow down loading. It is recommended to convert it to a safetensors file.")
+         self.state_dict = torch.load(path, weights_only=True, map_location=device)
+
+     def keys(self):
+         return self.state_dict.keys()
+
+     def get_tensor(self, name):
+         return self.state_dict[name]
+
+     def get_slice(self, name):
+         return SafetensorsCompatibleTensor(self.state_dict[name])
+
+
+ class DiskMap:
+
+     def __init__(self, path, device, torch_dtype=None, state_dict_converter=None, buffer_size=10**9):
+         self.path = path if isinstance(path, list) else [path]
+         self.device = device
+         self.torch_dtype = torch_dtype
+         if os.environ.get('DIFFSYNTH_DISK_MAP_BUFFER_SIZE') is not None:
+             self.buffer_size = int(os.environ.get('DIFFSYNTH_DISK_MAP_BUFFER_SIZE'))
+         else:
+             self.buffer_size = buffer_size
+         self.files = []
+         self.flush_files()
+         self.name_map = {}
+         for file_id, file in enumerate(self.files):
+             for name in file.keys():
+                 self.name_map[name] = file_id
+         self.rename_dict = self.fetch_rename_dict(state_dict_converter)
+
+     def flush_files(self):
+         if len(self.files) == 0:
+             for path in self.path:
+                 if path.endswith(".safetensors"):
+                     self.files.append(safe_open(path, framework="pt", device=str(self.device)))
+                 else:
+                     self.files.append(SafetensorsCompatibleBinaryLoader(path, device=self.device))
+         else:
+             for i, path in enumerate(self.path):
+                 if path.endswith(".safetensors"):
+                     self.files[i] = safe_open(path, framework="pt", device=str(self.device))
+         self.num_params = 0
+
+     def __getitem__(self, name):
+         if self.rename_dict is not None: name = self.rename_dict[name]
+         file_id = self.name_map[name]
+         param = self.files[file_id].get_tensor(name)
+         if self.torch_dtype is not None and isinstance(param, torch.Tensor):
+             param = param.to(self.torch_dtype)
+         # `.clone()` detaches the tensor from the underlying mmap so the file handle can be released.
+         if isinstance(param, torch.Tensor) and param.device.type == "cpu":
+             param = param.clone()
+         if isinstance(param, torch.Tensor):
+             self.num_params += param.numel()
+             if self.num_params > self.buffer_size:
+                 self.flush_files()
+         return param
+
+     def fetch_rename_dict(self, state_dict_converter):
+         if state_dict_converter is None:
+             return None
+         state_dict = {}
+         for file in self.files:
+             for name in file.keys():
+                 state_dict[name] = name
+         state_dict = state_dict_converter(state_dict)
+         return state_dict
+
+     def __iter__(self):
+         if self.rename_dict is not None:
+             return self.rename_dict.__iter__()
+         else:
+             return self.name_map.__iter__()
+
+     def __contains__(self, x):
+         if self.rename_dict is not None:
+             return x in self.rename_dict
+         else:
+             return x in self.name_map
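A sketch of `DiskMap` as a lazy, dict-like state dict; the file path and tensor key are hypothetical:

disk_map = DiskMap("model.safetensors", device="cpu", torch_dtype=torch.bfloat16)
names = list(disk_map)                        # tensor names, no data loaded yet
weight = disk_map["blocks.0.attn.q.weight"]   # loads a single tensor from disk
# Once roughly `buffer_size` elements have been read, `flush_files` reopens the
# safetensors handles to release the memory held by the underlying mmap.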
diffsynth/core/vram/initialization.py ADDED
@@ -0,0 +1,21 @@
+ import torch
+ from contextlib import contextmanager
+
+
+ @contextmanager
+ def skip_model_initialization(device=torch.device("meta")):
+
+     def register_empty_parameter(module, name, param):
+         old_register_parameter(module, name, param)
+         if param is not None:
+             param_cls = type(module._parameters[name])
+             kwargs = module._parameters[name].__dict__
+             kwargs["requires_grad"] = param.requires_grad
+             module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
+
+     old_register_parameter = torch.nn.Module.register_parameter
+     torch.nn.Module.register_parameter = register_empty_parameter
+     try:
+         yield
+     finally:
+         torch.nn.Module.register_parameter = old_register_parameter
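A quick illustration of the context manager: inside it, every newly registered parameter is moved to the `meta` device, so no storage is allocated and no random-initialization cost is paid:

with skip_model_initialization():
    layer = torch.nn.Linear(4096, 4096)
print(layer.weight.device)  # meta
# Real weights are attached later, e.g. via load_state_dict(..., assign=True).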
diffsynth/core/vram/layers.py ADDED
@@ -0,0 +1,479 @@
+ import torch, copy
+ from typing import Union
+ from .initialization import skip_model_initialization
+ from .disk_map import DiskMap
+ from ..device import parse_device_type, get_device_name, IS_NPU_AVAILABLE
+
+
+ class AutoTorchModule(torch.nn.Module):
+
+     def __init__(
+         self,
+         offload_dtype: torch.dtype = None,
+         offload_device: Union[str, torch.device] = None,
+         onload_dtype: torch.dtype = None,
+         onload_device: Union[str, torch.device] = None,
+         preparing_dtype: torch.dtype = None,
+         preparing_device: Union[str, torch.device] = None,
+         computation_dtype: torch.dtype = None,
+         computation_device: Union[str, torch.device] = None,
+         vram_limit: float = None,
+     ):
+         super().__init__()
+         self.set_dtype_and_device(
+             offload_dtype,
+             offload_device,
+             onload_dtype,
+             onload_device,
+             preparing_dtype,
+             preparing_device,
+             computation_dtype,
+             computation_device,
+             vram_limit,
+         )
+         self.state = 0
+         self.name = ""
+         self.computation_device_type = parse_device_type(self.computation_device)
+
+     def set_dtype_and_device(
+         self,
+         offload_dtype: torch.dtype = None,
+         offload_device: Union[str, torch.device] = None,
+         onload_dtype: torch.dtype = None,
+         onload_device: Union[str, torch.device] = None,
+         preparing_dtype: torch.dtype = None,
+         preparing_device: Union[str, torch.device] = None,
+         computation_dtype: torch.dtype = None,
+         computation_device: Union[str, torch.device] = None,
+         vram_limit: float = None,
+     ):
+         self.offload_dtype = offload_dtype or computation_dtype
+         self.offload_device = offload_device or computation_device
+         self.onload_dtype = onload_dtype or computation_dtype
+         self.onload_device = onload_device or computation_device
+         self.preparing_dtype = preparing_dtype or computation_dtype
+         self.preparing_device = preparing_device or computation_device
+         self.computation_dtype = computation_dtype
+         self.computation_device = computation_device
+         self.vram_limit = vram_limit
+
+     def cast_to(self, weight, dtype, device):
+         r = torch.empty_like(weight, dtype=dtype, device=device)
+         r.copy_(weight)
+         return r
+
+     def check_free_vram(self):
+         device = self.computation_device if not IS_NPU_AVAILABLE else get_device_name()
+         gpu_mem_state = getattr(torch, self.computation_device_type).mem_get_info(device)
+         used_memory = (gpu_mem_state[1] - gpu_mem_state[0]) / (1024**3)
+         return used_memory < self.vram_limit
+
+     def offload(self):
+         if self.state != 0:
+             self.to(dtype=self.offload_dtype, device=self.offload_device)
+             self.state = 0
+
+     def onload(self):
+         if self.state != 1:
+             self.to(dtype=self.onload_dtype, device=self.onload_device)
+             self.state = 1
+
+     def param_name(self, name):
+         if self.name == "":
+             return name
+         else:
+             return self.name + "." + name
+
+
+ class AutoWrappedModule(AutoTorchModule):
+
+     def __init__(
+         self,
+         module: torch.nn.Module,
+         offload_dtype: torch.dtype = None,
+         offload_device: Union[str, torch.device] = None,
+         onload_dtype: torch.dtype = None,
+         onload_device: Union[str, torch.device] = None,
+         preparing_dtype: torch.dtype = None,
+         preparing_device: Union[str, torch.device] = None,
+         computation_dtype: torch.dtype = None,
+         computation_device: Union[str, torch.device] = None,
+         vram_limit: float = None,
+         name: str = "",
+         disk_map: DiskMap = None,
+         **kwargs
+     ):
+         super().__init__(
+             offload_dtype,
+             offload_device,
+             onload_dtype,
+             onload_device,
+             preparing_dtype,
+             preparing_device,
+             computation_dtype,
+             computation_device,
+             vram_limit,
+         )
+         self.module = module
+         if offload_dtype == "disk":
+             self.name = name
+             self.disk_map = disk_map
+             self.required_params = [name for name, _ in self.module.named_parameters()]
+             self.disk_offload = True
+         else:
+             self.disk_offload = False
+
+     def load_from_disk(self, torch_dtype, device, copy_module=False):
+         if copy_module:
+             module = copy.deepcopy(self.module)
+         else:
+             module = self.module
+         state_dict = {}
+         for name in self.required_params:
+             param = self.disk_map[self.param_name(name)]
+             param = param.to(dtype=torch_dtype, device=device)
+             state_dict[name] = param
+         module.load_state_dict(state_dict, assign=True)
+         module.to(dtype=torch_dtype, device=device)
+         return module
+
+     def offload_to_disk(self, model: torch.nn.Module):
+         for buf in model.buffers():
+             # If some parameters are registered as buffers (they are not in the state dict),
+             # we cannot offload this module as a whole; recurse into its children instead.
+             for child in model.children():
+                 self.offload_to_disk(child)
+             break
+         else:
+             model.to("meta")
+
+     def offload(self):
+         # offload / onload / preparing -> offload
+         if self.state != 0:
+             if self.disk_offload:
+                 self.offload_to_disk(self.module)
+             else:
+                 self.to(dtype=self.offload_dtype, device=self.offload_device)
+             self.state = 0
+
+     def onload(self):
+         # offload / onload / preparing -> onload
+         if self.state < 1:
+             if self.disk_offload and self.onload_device != "disk" and self.offload_device == "disk":
+                 self.load_from_disk(self.onload_dtype, self.onload_device)
+             elif self.onload_device != "disk":
+                 self.to(dtype=self.onload_dtype, device=self.onload_device)
+             self.state = 1
+
+     def preparing(self):
+         # onload / preparing -> preparing
+         if self.state != 2:
+             if self.disk_offload and self.preparing_device != "disk" and self.onload_device == "disk":
+                 self.load_from_disk(self.preparing_dtype, self.preparing_device)
+             elif self.preparing_device != "disk":
+                 self.to(dtype=self.preparing_dtype, device=self.preparing_device)
+             self.state = 2
+
+     def cast_to(self, module, dtype, device):
+         return copy.deepcopy(module).to(dtype=dtype, device=device)
+
+     def computation(self):
+         # onload / preparing -> computation (temporary)
+         if self.state == 2:
+             torch_dtype, device = self.preparing_dtype, self.preparing_device
+         else:
+             torch_dtype, device = self.onload_dtype, self.onload_device
+         if torch_dtype == self.computation_dtype and device == self.computation_device:
+             module = self.module
+         elif self.disk_offload and device == "disk":
+             module = self.load_from_disk(self.computation_dtype, self.computation_device, copy_module=True)
+         else:
+             module = self.cast_to(self.module, dtype=self.computation_dtype, device=self.computation_device)
+         return module
+
+     def forward(self, *args, **kwargs):
+         if self.state == 1 and (self.vram_limit is None or self.check_free_vram()):
+             self.preparing()
+         module = self.computation()
+         return module(*args, **kwargs)
+
+     def __getattr__(self, name):
+         if name in self.__dict__ or name == "module":
+             return super().__getattr__(name)
+         else:
+             return getattr(self.module, name)
+
+
+ class AutoWrappedNonRecurseModule(AutoWrappedModule):
+
+     def __init__(
+         self,
+         module: torch.nn.Module,
+         offload_dtype: torch.dtype = None,
+         offload_device: Union[str, torch.device] = None,
+         onload_dtype: torch.dtype = None,
+         onload_device: Union[str, torch.device] = None,
+         preparing_dtype: torch.dtype = None,
+         preparing_device: Union[str, torch.device] = None,
+         computation_dtype: torch.dtype = None,
+         computation_device: Union[str, torch.device] = None,
+         vram_limit: float = None,
+         name: str = "",
+         disk_map: DiskMap = None,
+         **kwargs
+     ):
+         super().__init__(
+             module,
+             offload_dtype,
+             offload_device,
+             onload_dtype,
+             onload_device,
+             preparing_dtype,
+             preparing_device,
+             computation_dtype,
+             computation_device,
+             vram_limit,
+             name,
+             disk_map,
+             **kwargs
+         )
+         if self.disk_offload:
+             self.required_params = [name for name, _ in self.module.named_parameters(recurse=False)]
+
+     def load_from_disk(self, torch_dtype, device, copy_module=False):
+         if copy_module:
+             module = copy.deepcopy(self.module)
+         else:
+             module = self.module
+         state_dict = {}
+         for name in self.required_params:
+             param = self.disk_map[self.param_name(name)]
+             param = param.to(dtype=torch_dtype, device=device)
+             state_dict[name] = param
+         module.load_state_dict(state_dict, assign=True, strict=False)
+         return module
+
+     def offload_to_disk(self, model: torch.nn.Module):
+         # `Tensor.to` is not in-place, so the moved parameters must be reassigned on the wrapped module.
+         for name in self.required_params:
+             param = self.module._parameters[name]
+             self.module._parameters[name] = torch.nn.Parameter(param.to("meta"), requires_grad=param.requires_grad)
+
+     def cast_to(self, module, dtype, device):
+         # Parameter casting is implemented in the model architecture.
+         return module
+
+     def __getattr__(self, name):
+         if name in self.__dict__ or name == "module":
+             return super().__getattr__(name)
+         else:
+             return getattr(self.module, name)
+
+
+ class AutoWrappedLinear(torch.nn.Linear, AutoTorchModule):
+     def __init__(
+         self,
+         module: torch.nn.Linear,
+         offload_dtype: torch.dtype = None,
+         offload_device: Union[str, torch.device] = None,
+         onload_dtype: torch.dtype = None,
+         onload_device: Union[str, torch.device] = None,
+         preparing_dtype: torch.dtype = None,
+         preparing_device: Union[str, torch.device] = None,
+         computation_dtype: torch.dtype = None,
+         computation_device: Union[str, torch.device] = None,
+         vram_limit: float = None,
+         name: str = "",
+         disk_map: DiskMap = None,
+         **kwargs
+     ):
+         with skip_model_initialization():
+             super().__init__(
+                 in_features=module.in_features,
+                 out_features=module.out_features,
+                 bias=module.bias is not None,
+             )
+         self.set_dtype_and_device(
+             offload_dtype,
+             offload_device,
+             onload_dtype,
+             onload_device,
+             preparing_dtype,
+             preparing_device,
+             computation_dtype,
+             computation_device,
+             vram_limit,
+         )
+         self.weight = module.weight
+         self.bias = module.bias
+         self.state = 0
+         self.name = name
+         self.lora_A_weights = []
+         self.lora_B_weights = []
+         self.lora_merger = None
+         self.enable_fp8 = computation_dtype in [torch.float8_e4m3fn, torch.float8_e4m3fnuz]
+         self.computation_device_type = parse_device_type(self.computation_device)
+
+         if offload_dtype == "disk":
+             self.disk_map = disk_map
+             self.disk_offload = True
+         else:
+             self.disk_offload = False
+
+     def fp8_linear(
+         self,
+         input: torch.Tensor,
+         weight: torch.Tensor,
+         bias: torch.Tensor = None,
+     ) -> torch.Tensor:
+         device = input.device
+         origin_dtype = input.dtype
+         origin_shape = input.shape
+         input = input.reshape(-1, origin_shape[-1])
+
+         x_max = torch.max(torch.abs(input), dim=-1, keepdim=True).values
+         fp8_max = 448.0
+         # For float8_e4m3fnuz, the maximum representable value is half of that of e4m3fn.
+         # To avoid overflow and ensure numerical compatibility during FP8 computation,
+         # we scale down the input by 2.0 in advance.
+         # This scaling will be compensated later during the final result scaling.
+         if self.computation_dtype == torch.float8_e4m3fnuz:
+             fp8_max = fp8_max / 2.0
+         scale_a = torch.clamp(x_max / fp8_max, min=1.0).float().to(device=device)
+         scale_b = torch.ones((weight.shape[0], 1)).to(device=device)
+         input = input / (scale_a + 1e-8)
+         input = input.to(self.computation_dtype)
+         weight = weight.to(self.computation_dtype)
+         if bias is not None:
+             bias = bias.to(torch.bfloat16)
+
+         result = torch._scaled_mm(
+             input,
+             weight.T,
+             scale_a=scale_a,
+             scale_b=scale_b.T,
+             bias=bias,
+             out_dtype=origin_dtype,
+         )
+         new_shape = origin_shape[:-1] + result.shape[-1:]
+         result = result.reshape(new_shape)
+         return result
+
+     def load_from_disk(self, torch_dtype, device, assign=True):
+         weight = self.disk_map[self.name + ".weight"].to(dtype=torch_dtype, device=device)
+         bias = None if self.bias is None else self.disk_map[self.name + ".bias"].to(dtype=torch_dtype, device=device)
+         if assign:
+             state_dict = {"weight": weight}
+             if bias is not None: state_dict["bias"] = bias
+             self.load_state_dict(state_dict, assign=True)
+         return weight, bias
+
+     def offload(self):
+         # offload / onload / preparing -> offload
+         if self.state != 0:
+             if self.disk_offload:
+                 self.to("meta")
+             else:
+                 self.to(dtype=self.offload_dtype, device=self.offload_device)
+             self.state = 0
+
+     def onload(self):
+         # offload / onload / preparing -> onload
+         if self.state < 1:
+             if self.disk_offload and self.onload_device != "disk" and self.offload_device == "disk":
+                 self.load_from_disk(self.onload_dtype, self.onload_device)
+             elif self.onload_device != "disk":
+                 self.to(dtype=self.onload_dtype, device=self.onload_device)
+             self.state = 1
+
+     def preparing(self):
+         # onload / preparing -> preparing
+         if self.state != 2:
+             if self.disk_offload and self.preparing_device != "disk" and self.onload_device == "disk":
+                 self.load_from_disk(self.preparing_dtype, self.preparing_device)
+             elif self.preparing_device != "disk":
+                 self.to(dtype=self.preparing_dtype, device=self.preparing_device)
+             self.state = 2
+
+     def computation(self):
+         # onload / preparing -> computation (temporary)
+         if self.state == 2:
+             torch_dtype, device = self.preparing_dtype, self.preparing_device
+         else:
+             torch_dtype, device = self.onload_dtype, self.onload_device
+         if torch_dtype == self.computation_dtype and device == self.computation_device:
+             weight, bias = self.weight, self.bias
+         elif self.disk_offload and device == "disk":
+             weight, bias = self.load_from_disk(self.computation_dtype, self.computation_device, assign=False)
+         else:
+             weight = self.cast_to(self.weight, self.computation_dtype, self.computation_device)
+             bias = None if self.bias is None else self.cast_to(self.bias, self.computation_dtype, self.computation_device)
+         return weight, bias
+
+     def linear_forward(self, x, weight, bias):
+         if self.enable_fp8:
+             out = self.fp8_linear(x, weight, bias)
+         else:
+             out = torch.nn.functional.linear(x, weight, bias)
+         return out
+
+     def lora_forward(self, x, out):
+         if self.lora_merger is None:
+             for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
+                 out = out + x @ lora_A.T.to(device=x.device, dtype=x.dtype) @ lora_B.T.to(device=x.device, dtype=x.dtype)
+         else:
+             lora_output = []
+             for lora_A, lora_B in zip(self.lora_A_weights, self.lora_B_weights):
+                 lora_output.append(x @ lora_A.T @ lora_B.T)
+             lora_output = torch.stack(lora_output)
+             out = self.lora_merger(out, lora_output)
+         return out
+
+     def forward(self, x, *args, **kwargs):
+         if self.state == 1 and (self.vram_limit is None or self.check_free_vram()):
+             self.preparing()
+         weight, bias = self.computation()
+         out = self.linear_forward(x, weight, bias)
+         if len(self.lora_A_weights) > 0:
+             out = self.lora_forward(x, out)
+         return out
+
+
+ def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, vram_config: dict, vram_limit=None, name_prefix="", disk_map=None, **kwargs):
+     if isinstance(model, AutoWrappedNonRecurseModule):
+         model = model.module
+     for name, module in model.named_children():
+         layer_name = name if name_prefix == "" else name_prefix + "." + name
+         for source_module, target_module in module_map.items():
+             if isinstance(module, source_module):
+                 module_ = target_module(module, **vram_config, vram_limit=vram_limit, name=layer_name, disk_map=disk_map, **kwargs)
+                 if isinstance(module_, AutoWrappedNonRecurseModule):
+                     enable_vram_management_recursively(module_, module_map, vram_config, vram_limit=vram_limit, name_prefix=layer_name, disk_map=disk_map, **kwargs)
+                 setattr(model, name, module_)
+                 break
+         else:
+             enable_vram_management_recursively(module, module_map, vram_config, vram_limit=vram_limit, name_prefix=layer_name, disk_map=disk_map, **kwargs)
+
+
+ def fill_vram_config(model, vram_config):
+     vram_config_ = vram_config.copy()
+     vram_config_["onload_dtype"] = vram_config["computation_dtype"]
+     vram_config_["onload_device"] = vram_config["computation_device"]
+     vram_config_["preparing_dtype"] = vram_config["computation_dtype"]
+     vram_config_["preparing_device"] = vram_config["computation_device"]
+     for k in vram_config:
+         if vram_config[k] != vram_config_[k]:
+             print(f"No fine-grained VRAM configuration is provided for {model.__class__.__name__}. [`onload`, `preparing`, `computation`] will be the same state. `vram_config` is set to {vram_config_}")
+             break
+     return vram_config_
+
+
+ def enable_vram_management(model: torch.nn.Module, module_map: dict, vram_config: dict, vram_limit=None, disk_map=None, **kwargs):
+     for source_module, target_module in module_map.items():
+         # If no fine-grained VRAM configuration is provided, the entire model will be managed uniformly.
+         if isinstance(model, source_module):
+             vram_config = fill_vram_config(model, vram_config)
+             model = target_module(model, **vram_config, vram_limit=vram_limit, disk_map=disk_map, **kwargs)
+             break
+     else:
+         enable_vram_management_recursively(model, module_map, vram_config, vram_limit=vram_limit, disk_map=disk_map, **kwargs)
+     # `vram_management_enabled` is a flag that allows the pipeline to determine whether VRAM management is enabled.
+     model.vram_management_enabled = True
+     return model
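A hedged end-to-end sketch of the offload/onload/preparing/computation state machine; the model, devices, and VRAM limit are placeholders:

# Assumed usage sketch; `model` is any torch.nn.Module.
vram_config = {
    "offload_dtype": torch.float8_e4m3fn, "offload_device": "cpu",
    "onload_dtype": torch.float8_e4m3fn, "onload_device": "cuda",
    "preparing_dtype": torch.float8_e4m3fn, "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16, "computation_device": "cuda",
}
model = enable_vram_management(model, {torch.nn.Linear: AutoWrappedLinear}, vram_config, vram_limit=20)
for m in model.modules():
    if hasattr(m, "onload"):
        m.onload()  # fp8 weights move to the GPU; bf16 casting happens per layer at forward time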
diffsynth/diffusion/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .flow_match import FlowMatchScheduler
+ from .training_module import DiffusionTrainingModule
+ from .logger import ModelLogger
+ from .runner import launch_training_task, launch_data_process_task
+ from .parsers import *
+ from .loss import *
diffsynth/diffusion/base_pipeline.py ADDED
@@ -0,0 +1,500 @@
1
+ from PIL import Image
2
+ import torch
3
+ import numpy as np
4
+ from einops import repeat, reduce
5
+ from typing import Union
6
+ from ..core import AutoTorchModule, AutoWrappedLinear, load_state_dict, ModelConfig, parse_device_type
7
+ from ..core.device.npu_compatible_device import get_device_type
8
+ from ..utils.lora import GeneralLoRALoader
9
+ from ..models.model_loader import ModelPool
10
+ from ..utils.controlnet import ControlNetInput
11
+ from ..core.device import get_device_name, IS_NPU_AVAILABLE
12
+
13
+
14
+ class PipelineUnit:
15
+ def __init__(
16
+ self,
17
+ seperate_cfg: bool = False,
18
+ take_over: bool = False,
19
+ input_params: tuple[str] = None,
20
+ output_params: tuple[str] = None,
21
+ input_params_posi: dict[str, str] = None,
22
+ input_params_nega: dict[str, str] = None,
23
+ onload_model_names: tuple[str] = None
24
+ ):
25
+ self.seperate_cfg = seperate_cfg
26
+ self.take_over = take_over
27
+ self.input_params = input_params
28
+ self.output_params = output_params
29
+ self.input_params_posi = input_params_posi
30
+ self.input_params_nega = input_params_nega
31
+ self.onload_model_names = onload_model_names
32
+
33
+ def fetch_input_params(self):
34
+ params = []
35
+ if self.input_params is not None:
36
+ for param in self.input_params:
37
+ params.append(param)
38
+ if self.input_params_posi is not None:
39
+ for _, param in self.input_params_posi.items():
40
+ params.append(param)
41
+ if self.input_params_nega is not None:
42
+ for _, param in self.input_params_nega.items():
43
+ params.append(param)
44
+ params = sorted(list(set(params)))
45
+ return params
46
+
47
+ def fetch_output_params(self):
48
+ params = []
49
+ if self.output_params is not None:
50
+ for param in self.output_params:
51
+ params.append(param)
52
+ return params
53
+
54
+ def process(self, pipe, **kwargs) -> dict:
55
+ return {}
56
+
57
+ def post_process(self, pipe, **kwargs) -> dict:
58
+ return {}
59
+
60
+
61
+ class BasePipeline(torch.nn.Module):
62
+
63
+ def __init__(
64
+ self,
65
+ device=get_device_type(), torch_dtype=torch.float16,
66
+ height_division_factor=64, width_division_factor=64,
67
+ time_division_factor=None, time_division_remainder=None,
68
+ ):
69
+ super().__init__()
70
+ # The device and torch_dtype is used for the storage of intermediate variables, not models.
71
+ self.device = device
72
+ self.torch_dtype = torch_dtype
73
+ self.device_type = parse_device_type(device)
74
+ # The following parameters are used for shape check.
75
+ self.height_division_factor = height_division_factor
76
+ self.width_division_factor = width_division_factor
77
+ self.time_division_factor = time_division_factor
78
+ self.time_division_remainder = time_division_remainder
79
+ # VRAM management
80
+ self.vram_management_enabled = False
81
+ # Pipeline Unit Runner
82
+ self.unit_runner = PipelineUnitRunner()
83
+ # LoRA Loader
84
+ self.lora_loader = GeneralLoRALoader
85
+
86
+
87
+ def to(self, *args, **kwargs):
88
+ device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
89
+ if device is not None:
90
+ self.device = device
91
+ if dtype is not None:
92
+ self.torch_dtype = dtype
93
+ super().to(*args, **kwargs)
94
+ return self
95
+
96
+
97
+ def check_resize_height_width(self, height, width, num_frames=None, verbose=1):
98
+ # Shape check
99
+ if height % self.height_division_factor != 0:
100
+ height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
101
+ if verbose > 0:
102
+ print(f"height % {self.height_division_factor} != 0. We round it up to {height}.")
103
+ if width % self.width_division_factor != 0:
104
+ width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
105
+ if verbose > 0:
106
+ print(f"width % {self.width_division_factor} != 0. We round it up to {width}.")
107
+ if num_frames is None:
108
+ return height, width
109
+ else:
110
+ if num_frames % self.time_division_factor != self.time_division_remainder:
111
+ num_frames = (num_frames + self.time_division_factor - 1) // self.time_division_factor * self.time_division_factor + self.time_division_remainder
112
+ if verbose > 0:
113
+ print(f"num_frames % {self.time_division_factor} != {self.time_division_remainder}. We round it up to {num_frames}.")
114
+ return height, width, num_frames
115
+
116
+
117
+ def preprocess_image(self, image, torch_dtype=None, device=None, pattern="B C H W", min_value=-1, max_value=1):
118
+ # Transform a PIL.Image to torch.Tensor
119
+ image = torch.Tensor(np.array(image, dtype=np.float32))
120
+ image = image.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
121
+ image = image * ((max_value - min_value) / 255) + min_value
122
+ image = repeat(image, f"H W C -> {pattern}", **({"B": 1} if "B" in pattern else {}))
123
+ return image
124
+
125
+
126
+ def preprocess_video(self, video, torch_dtype=None, device=None, pattern="B C T H W", min_value=-1, max_value=1):
127
+ # Transform a list of PIL.Image to torch.Tensor
128
+ video = [self.preprocess_image(image, torch_dtype=torch_dtype, device=device, min_value=min_value, max_value=max_value) for image in video]
129
+ video = torch.stack(video, dim=pattern.index("T") // 2)
130
+ return video
131
+
132
+
133
+ def vae_output_to_image(self, vae_output, pattern="B C H W", min_value=-1, max_value=1):
134
+ # Transform a torch.Tensor to PIL.Image
135
+ if pattern != "H W C":
136
+ vae_output = reduce(vae_output, f"{pattern} -> H W C", reduction="mean")
137
+ image = ((vae_output - min_value) * (255 / (max_value - min_value))).clip(0, 255)
138
+ image = image.to(device="cpu", dtype=torch.uint8)
139
+ image = Image.fromarray(image.numpy())
140
+ return image
141
+
142
+
143
+ def vae_output_to_video(self, vae_output, pattern="B C T H W", min_value=-1, max_value=1):
144
+ # Transform a torch.Tensor to list of PIL.Image
145
+ if pattern != "T H W C":
146
+ vae_output = reduce(vae_output, f"{pattern} -> T H W C", reduction="mean")
147
+ video = [self.vae_output_to_image(image, pattern="H W C", min_value=min_value, max_value=max_value) for image in vae_output]
148
+ return video
149
+
150
+ def output_audio_format_check(self, audio_output):
151
+ # output standard foramt: [C, T], output dtype: float()
152
+ # remove batch dim
153
+ if audio_output.ndim == 3:
154
+ audio_output = audio_output.squeeze(0)
155
+ return audio_output.float()
156
+
157
+ def load_models_to_device(self, model_names):
158
+ if self.vram_management_enabled:
159
+ # offload models
160
+ for name, model in self.named_children():
161
+ if name not in model_names:
162
+ if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
163
+ if hasattr(model, "offload"):
164
+ model.offload()
165
+ else:
166
+ for module in model.modules():
167
+ if hasattr(module, "offload"):
168
+ module.offload()
169
+ getattr(torch, self.device_type).empty_cache()
170
+ # onload models
171
+ for name, model in self.named_children():
172
+ if name in model_names:
173
+ if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
174
+ if hasattr(model, "onload"):
175
+ model.onload()
176
+ else:
177
+ for module in model.modules():
178
+ if hasattr(module, "onload"):
179
+ module.onload()
180
+
181
+
182
+ def generate_noise(self, shape, seed=None, rand_device="cpu", rand_torch_dtype=torch.float32, device=None, torch_dtype=None):
183
+ # Initialize Gaussian noise
184
+ generator = None if seed is None else torch.Generator(rand_device).manual_seed(seed)
185
+ noise = torch.randn(shape, generator=generator, device=rand_device, dtype=rand_torch_dtype)
186
+ noise = noise.to(dtype=torch_dtype or self.torch_dtype, device=device or self.device)
187
+ return noise
188
+
189
+
190
+ def get_vram(self):
191
+ device = self.device if not IS_NPU_AVAILABLE else get_device_name()
192
+ return getattr(torch, self.device_type).mem_get_info(device)[1] / (1024 ** 3)
193
+
194
+ def get_module(self, model, name):
195
+ if "." in name:
196
+ name, suffix = name[:name.index(".")], name[name.index(".") + 1:]
197
+ if name.isdigit():
198
+ return self.get_module(model[int(name)], suffix)
199
+ else:
200
+ return self.get_module(getattr(model, name), suffix)
201
+ else:
202
+ return getattr(model, name)
203
+
204
+ def freeze_except(self, model_names):
205
+ self.eval()
206
+ self.requires_grad_(False)
207
+ for name in model_names:
208
+ module = self.get_module(self, name)
209
+ if module is None:
210
+ print(f"No {name} models in the pipeline. We cannot enable training on the model. If this occurs during the data processing stage, it is normal.")
211
+ continue
212
+ module.train()
213
+ module.requires_grad_(True)
214
+
215
+
216
+ def blend_with_mask(self, base, addition, mask):
217
+ return base * (1 - mask) + addition * mask
218
+
219
+
220
+ def step(self, scheduler, latents, progress_id, noise_pred, input_latents=None, inpaint_mask=None, **kwargs):
221
+ timestep = scheduler.timesteps[progress_id]
222
+ if inpaint_mask is not None:
223
+ noise_pred_expected = scheduler.return_to_timestep(scheduler.timesteps[progress_id], latents, input_latents)
224
+ noise_pred = self.blend_with_mask(noise_pred_expected, noise_pred, inpaint_mask)
225
+ latents_next = scheduler.step(noise_pred, timestep, latents)
226
+ return latents_next
227
+
228
+
229
+ def split_pipeline_units(self, model_names: list[str]):
230
+ return PipelineUnitGraph().split_pipeline_units(self.units, model_names)
231
+
232
+
233
+ def flush_vram_management_device(self, device):
234
+ for module in self.modules():
235
+ if isinstance(module, AutoTorchModule):
236
+ module.offload_device = device
237
+ module.onload_device = device
238
+ module.preparing_device = device
239
+ module.computation_device = device
240
+
241
+
242
+ def load_lora(
243
+ self,
244
+ module: torch.nn.Module,
245
+ lora_config: Union[ModelConfig, str] = None,
246
+ alpha=1,
247
+ hotload=None,
248
+ state_dict=None,
249
+ verbose=1,
250
+ ):
251
+ if state_dict is None:
252
+ if isinstance(lora_config, str):
253
+ lora = load_state_dict(lora_config, torch_dtype=self.torch_dtype, device=self.device)
254
+ else:
255
+ lora_config.download_if_necessary()
256
+ lora = load_state_dict(lora_config.path, torch_dtype=self.torch_dtype, device=self.device)
257
+ else:
258
+ lora = state_dict
259
+ lora_loader = self.lora_loader(torch_dtype=self.torch_dtype, device=self.device)
260
+ lora = lora_loader.convert_state_dict(lora)
261
+ if hotload is None:
262
+ hotload = hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled")
263
+ if hotload:
264
+ if not (hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled")):
265
+ raise ValueError("VRAM Management is not enabled. LoRA hotloading is not supported.")
266
+ updated_num = 0
267
+ for _, module in module.named_modules():
268
+ if isinstance(module, AutoWrappedLinear):
269
+ name = module.name
270
+ lora_a_name = f'{name}.lora_A.weight'
271
+ lora_b_name = f'{name}.lora_B.weight'
272
+ if lora_a_name in lora and lora_b_name in lora:
273
+ updated_num += 1
274
+ module.lora_A_weights.append(lora[lora_a_name] * alpha)
275
+ module.lora_B_weights.append(lora[lora_b_name])
276
+ if verbose >= 1:
277
+ print(f"{updated_num} tensors are patched by LoRA. You can use `pipe.clear_lora()` to clear all LoRA layers.")
278
+ else:
279
+ lora_loader.fuse_lora_to_base_model(module, lora, alpha=alpha)
280
+
281
+
282
+ def clear_lora(self, verbose=1):
283
+ cleared_num = 0
284
+ for name, module in self.named_modules():
285
+ if isinstance(module, AutoWrappedLinear):
286
+ if hasattr(module, "lora_A_weights"):
287
+ if len(module.lora_A_weights) > 0:
288
+ cleared_num += 1
289
+ module.lora_A_weights.clear()
290
+ if hasattr(module, "lora_B_weights"):
291
+ module.lora_B_weights.clear()
292
+ if verbose >= 1:
293
+ print(f"{cleared_num} LoRA layers are cleared.")
294
+
295
+
296
+ def download_and_load_models(self, model_configs: list[ModelConfig] = [], vram_limit: float = None):
297
+ model_pool = ModelPool()
298
+ for model_config in model_configs:
299
+ model_config.download_if_necessary()
300
+ vram_config = model_config.vram_config()
301
+ vram_config["computation_dtype"] = vram_config["computation_dtype"] or self.torch_dtype
302
+ vram_config["computation_device"] = vram_config["computation_device"] or self.device
303
+ model_pool.auto_load_model(
304
+ model_config.path,
305
+ vram_config=vram_config,
306
+ vram_limit=vram_limit,
307
+ clear_parameters=model_config.clear_parameters,
308
+ state_dict=model_config.state_dict,
309
+ )
310
+ return model_pool
311
+
312
+
313
+ def check_vram_management_state(self):
314
+ vram_management_enabled = False
315
+ for module in self.children():
316
+ if hasattr(module, "vram_management_enabled") and getattr(module, "vram_management_enabled"):
317
+ vram_management_enabled = True
318
+ return vram_management_enabled
319
+
320
+
321
+ def cfg_guided_model_fn(self, model_fn, cfg_scale, inputs_shared, inputs_posi, inputs_nega, **inputs_others):
322
+ if inputs_shared.get("positive_only_lora", None) is not None:
323
+ self.clear_lora(verbose=0)
324
+ self.load_lora(self.dit, state_dict=inputs_shared["positive_only_lora"], verbose=0)
325
+ noise_pred_posi = model_fn(**inputs_posi, **inputs_shared, **inputs_others)
326
+ if cfg_scale != 1.0:
327
+ if inputs_shared.get("positive_only_lora", None) is not None:
328
+ self.clear_lora(verbose=0)
329
+ noise_pred_nega = model_fn(**inputs_nega, **inputs_shared, **inputs_others)
330
+ if isinstance(noise_pred_posi, tuple):
331
+ # Separately handling different output types of latents, eg. video and audio latents.
332
+ noise_pred = tuple(
333
+ n_nega + cfg_scale * (n_posi - n_nega)
334
+ for n_posi, n_nega in zip(noise_pred_posi, noise_pred_nega)
335
+ )
336
+ else:
337
+ noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
338
+ else:
339
+ noise_pred = noise_pred_posi
340
+ return noise_pred
341
+
342
+ def compile_pipeline(self, mode: str = "default", dynamic: bool = True, fullgraph: bool = False, compile_models: list = None, **kwargs):
343
+ """
344
+ compile the pipeline with torch.compile. The models that will be compiled are determined by the `compilable_models` attribute of the pipeline.
345
+ If a model has `_repeated_blocks` attribute, we will compile these blocks with regional compilation. Otherwise, we will compile the whole model.
346
+ See https://docs.pytorch.org/docs/stable/generated/torch.compile.html#torch.compile for details about compilation arguments.
347
+ Args:
348
+ mode: The compilation mode, which will be passed to `torch.compile`, options are "default", "reduce-overhead", "max-autotune" and "max-autotune-no-cudagraphs. Default to "default".
349
+ dynamic: Whether to enable dynamic graph compilation to support dynamic input shapes, which will be passed to `torch.compile`. Default to True (recommended).
350
+ fullgraph: Whether to use full graph compilation, which will be passed to `torch.compile`. Default to False (recommended).
351
+ compile_models: The list of model names to be compiled. If None, we will compile the models in `pipeline.compilable_models`. Default to None.
352
+ **kwargs: Other arguments for `torch.compile`.
353
+ """
354
+ compile_models = compile_models or getattr(self, "compilable_models", [])
355
+ if len(compile_models) == 0:
356
+ print("No compilable models in the pipeline. Skip compilation.")
357
+ return
358
+ for name in compile_models:
359
+ model = getattr(self, name, None)
360
+ if model is None:
361
+ print(f"Model '{name}' not found in the pipeline.")
362
+ continue
363
+ repeated_blocks = getattr(model, "_repeated_blocks", None)
364
+ # regional compilation for repeated blocks.
365
+ if repeated_blocks is not None:
366
+ for submod in model.modules():
367
+ if submod.__class__.__name__ in repeated_blocks:
368
+ submod.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
369
+ # compile the whole model.
370
+ else:
371
+ model.compile(mode=mode, dynamic=dynamic, fullgraph=fullgraph, **kwargs)
372
+ print(f"{name} is compiled with mode={mode}, dynamic={dynamic}, fullgraph={fullgraph}.")
373
+
374
+
375
+ class PipelineUnitGraph:
376
+ def __init__(self):
377
+ pass
378
+
379
+ def build_edges(self, units: list[PipelineUnit]):
380
+ # Establish dependencies between units
381
+        # to search for subsequent related computation units.
+        last_compute_unit_id = {}
+        edges = []
+        for unit_id, unit in enumerate(units):
+            for input_param in unit.fetch_input_params():
+                if input_param in last_compute_unit_id:
+                    edges.append((last_compute_unit_id[input_param], unit_id))
+            for output_param in unit.fetch_output_params():
+                last_compute_unit_id[output_param] = unit_id
+        return edges
+
+    def build_chains(self, units: list[PipelineUnit]):
+        # Establish updating chains for each variable
+        # to track their computation process.
+        params = sum([unit.fetch_input_params() + unit.fetch_output_params() for unit in units], [])
+        params = sorted(list(set(params)))
+        chains = {param: [] for param in params}
+        for unit_id, unit in enumerate(units):
+            for param in unit.fetch_output_params():
+                chains[param].append(unit_id)
+        return chains
+
+    def search_direct_unit_ids(self, units: list[PipelineUnit], model_names: list[str]):
+        # Search for units that directly participate in the model's computation.
+        related_unit_ids = []
+        for unit_id, unit in enumerate(units):
+            for model_name in model_names:
+                if unit.onload_model_names is not None and model_name in unit.onload_model_names:
+                    related_unit_ids.append(unit_id)
+                    break
+        return related_unit_ids
+
+    def search_related_unit_ids(self, edges, start_unit_ids, direction="target"):
+        # Search for subsequent related computation units.
+        related_unit_ids = [unit_id for unit_id in start_unit_ids]
+        while True:
+            neighbors = []
+            for source, target in edges:
+                if direction == "target" and source in related_unit_ids and target not in related_unit_ids:
+                    neighbors.append(target)
+                elif direction == "source" and source not in related_unit_ids and target in related_unit_ids:
+                    neighbors.append(source)
+            neighbors = sorted(list(set(neighbors)))
+            if len(neighbors) == 0:
+                break
+            else:
+                related_unit_ids.extend(neighbors)
+        related_unit_ids = sorted(list(set(related_unit_ids)))
+        return related_unit_ids
+
+    def search_updating_unit_ids(self, units: list[PipelineUnit], chains, related_unit_ids):
+        # If the input parameters of this subgraph are updated outside the subgraph,
+        # search for the units where these updates occur.
+        first_compute_unit_id = {}
+        for unit_id in related_unit_ids:
+            for param in units[unit_id].fetch_input_params():
+                if param not in first_compute_unit_id:
+                    first_compute_unit_id[param] = unit_id
+        updating_unit_ids = []
+        for param in first_compute_unit_id:
+            unit_id = first_compute_unit_id[param]
+            chain = chains[param]
+            if unit_id in chain and chain.index(unit_id) != len(chain) - 1:
+                for unit_id_ in chain[chain.index(unit_id) + 1:]:
+                    if unit_id_ not in related_unit_ids:
+                        updating_unit_ids.append(unit_id_)
+        related_unit_ids.extend(updating_unit_ids)
+        related_unit_ids = sorted(list(set(related_unit_ids)))
+        return related_unit_ids
+
+    def split_pipeline_units(self, units: list[PipelineUnit], model_names: list[str]):
+        # Split the computation graph,
+        # separating all model-related computations.
+        related_unit_ids = self.search_direct_unit_ids(units, model_names)
+        edges = self.build_edges(units)
+        chains = self.build_chains(units)
+        while True:
+            num_related_unit_ids = len(related_unit_ids)
+            related_unit_ids = self.search_related_unit_ids(edges, related_unit_ids, "target")
+            related_unit_ids = self.search_updating_unit_ids(units, chains, related_unit_ids)
+            if len(related_unit_ids) == num_related_unit_ids:
+                break
+            else:
+                num_related_unit_ids = len(related_unit_ids)
+        related_units = [units[i] for i in related_unit_ids]
+        unrelated_units = [units[i] for i in range(len(units)) if i not in related_unit_ids]
+        return related_units, unrelated_units
+
+
+class PipelineUnitRunner:
+    def __init__(self):
+        pass
+
+    def __call__(self, unit: PipelineUnit, pipe: BasePipeline, inputs_shared: dict, inputs_posi: dict, inputs_nega: dict) -> tuple[dict, dict, dict]:
+        if unit.take_over:
+            # Let the pipeline unit take over this function.
+            inputs_shared, inputs_posi, inputs_nega = unit.process(pipe, inputs_shared=inputs_shared, inputs_posi=inputs_posi, inputs_nega=inputs_nega)
+        elif unit.seperate_cfg:
+            # Positive side
+            processor_inputs = {name: inputs_posi.get(name_) for name, name_ in unit.input_params_posi.items()}
+            if unit.input_params is not None:
+                for name in unit.input_params:
+                    processor_inputs[name] = inputs_shared.get(name)
+            processor_outputs = unit.process(pipe, **processor_inputs)
+            inputs_posi.update(processor_outputs)
+            # Negative side
+            if inputs_shared["cfg_scale"] != 1:
+                processor_inputs = {name: inputs_nega.get(name_) for name, name_ in unit.input_params_nega.items()}
+                if unit.input_params is not None:
+                    for name in unit.input_params:
+                        processor_inputs[name] = inputs_shared.get(name)
+                processor_outputs = unit.process(pipe, **processor_inputs)
+                inputs_nega.update(processor_outputs)
+            else:
+                inputs_nega.update(processor_outputs)
+        else:
+            processor_inputs = {name: inputs_shared.get(name) for name in unit.input_params}
+            processor_outputs = unit.process(pipe, **processor_inputs)
+            inputs_shared.update(processor_outputs)
+        return inputs_shared, inputs_posi, inputs_nega
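
The splitting logic above treats each `PipelineUnit` as a node and adds an edge whenever a unit reads a parameter last written by an earlier unit. A minimal self-contained sketch of the edge-building rule (toy stand-ins, not the real `PipelineUnit` class):

```python
# Toy illustration of BasePipeline.build_edges: an edge (i, j) means
# unit j consumes a parameter that unit i was the last to produce.
class ToyUnit:
    def __init__(self, inputs, outputs):
        self.inputs, self.outputs = inputs, outputs
    def fetch_input_params(self):
        return self.inputs
    def fetch_output_params(self):
        return self.outputs

def build_edges(units):
    last_writer, edges = {}, []
    for unit_id, unit in enumerate(units):
        for param in unit.fetch_input_params():
            if param in last_writer:
                edges.append((last_writer[param], unit_id))
        for param in unit.fetch_output_params():
            last_writer[param] = unit_id
    return edges

units = [
    ToyUnit([], ["prompt_emb"]),                      # 0: text encoding
    ToyUnit([], ["latents"]),                         # 1: noise initialization
    ToyUnit(["prompt_emb", "latents"], ["latents"]),  # 2: denoising
]
print(build_edges(units))  # [(0, 2), (1, 2)]
```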
diffsynth/diffusion/flow_match.py ADDED
@@ -0,0 +1,236 @@
+import torch, math
+from typing_extensions import Literal
+
+
+class FlowMatchScheduler():
+
+    def __init__(self, template: Literal["FLUX.1", "Wan", "Qwen-Image", "FLUX.2", "Z-Image", "LTX-2", "Qwen-Image-Lightning"] = "FLUX.1"):
+        self.set_timesteps_fn = {
+            "FLUX.1": FlowMatchScheduler.set_timesteps_flux,
+            "Wan": FlowMatchScheduler.set_timesteps_wan,
+            "Qwen-Image": FlowMatchScheduler.set_timesteps_qwen_image,
+            "FLUX.2": FlowMatchScheduler.set_timesteps_flux2,
+            "Z-Image": FlowMatchScheduler.set_timesteps_z_image,
+            "LTX-2": FlowMatchScheduler.set_timesteps_ltx2,
+            "Qwen-Image-Lightning": FlowMatchScheduler.set_timesteps_qwen_image_lightning,
+        }.get(template, FlowMatchScheduler.set_timesteps_flux)
+        self.num_train_timesteps = 1000
+
+    @staticmethod
+    def set_timesteps_flux(num_inference_steps=100, denoising_strength=1.0, shift=None):
+        sigma_min = 0.003 / 1.002
+        sigma_max = 1.0
+        shift = 3 if shift is None else shift
+        num_train_timesteps = 1000
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps)
+        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
+    @staticmethod
+    def set_timesteps_wan(num_inference_steps=100, denoising_strength=1.0, shift=None):
+        sigma_min = 0.0
+        sigma_max = 1.0
+        shift = 5 if shift is None else shift
+        num_train_timesteps = 1000
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
+    @staticmethod
+    def _calculate_shift_qwen_image(image_seq_len, base_seq_len=256, max_seq_len=8192, base_shift=0.5, max_shift=0.9):
+        m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
+        b = base_shift - m * base_seq_len
+        mu = image_seq_len * m + b
+        return mu
+
+    @staticmethod
+    def set_timesteps_qwen_image(num_inference_steps=100, denoising_strength=1.0, exponential_shift_mu=None, dynamic_shift_len=None):
+        sigma_min = 0.0
+        sigma_max = 1.0
+        num_train_timesteps = 1000
+        shift_terminal = 0.02
+        # Sigmas
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        # Mu
+        if exponential_shift_mu is not None:
+            mu = exponential_shift_mu
+        elif dynamic_shift_len is not None:
+            mu = FlowMatchScheduler._calculate_shift_qwen_image(dynamic_shift_len)
+        else:
+            mu = 0.8
+        sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
+        # Shift terminal
+        one_minus_z = 1 - sigmas
+        scale_factor = one_minus_z[-1] / (1 - shift_terminal)
+        sigmas = 1 - (one_minus_z / scale_factor)
+        # Timesteps
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
+    @staticmethod
+    def set_timesteps_qwen_image_lightning(num_inference_steps=100, denoising_strength=1.0, exponential_shift_mu=None, dynamic_shift_len=None):
+        sigma_min = 0.0
+        sigma_max = 1.0
+        num_train_timesteps = 1000
+        base_shift = math.log(3)
+        max_shift = math.log(3)
+        # Sigmas
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        # Mu
+        if exponential_shift_mu is not None:
+            mu = exponential_shift_mu
+        elif dynamic_shift_len is not None:
+            mu = FlowMatchScheduler._calculate_shift_qwen_image(dynamic_shift_len, base_shift=base_shift, max_shift=max_shift)
+        else:
+            mu = 0.8
+        sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
+        # Timesteps
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
+    @staticmethod
+    def compute_empirical_mu(image_seq_len, num_steps):
+        a1, b1 = 8.73809524e-05, 1.89833333
+        a2, b2 = 0.00016927, 0.45666666
+
+        if image_seq_len > 4300:
+            mu = a2 * image_seq_len + b2
+            return float(mu)
+
+        m_200 = a2 * image_seq_len + b2
+        m_10 = a1 * image_seq_len + b1
+
+        a = (m_200 - m_10) / 190.0
+        b = m_200 - 200.0 * a
+        mu = a * num_steps + b
+
+        return float(mu)
+
+    @staticmethod
+    def set_timesteps_flux2(num_inference_steps=100, denoising_strength=1.0, dynamic_shift_len=None):
+        sigma_min = 1 / num_inference_steps
+        sigma_max = 1.0
+        num_train_timesteps = 1000
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps)
+        if dynamic_shift_len is None:
+            # If you ask me why I set mu=0.8,
+            # I can only say that it yields better training results.
+            mu = 0.8
+        else:
+            mu = FlowMatchScheduler.compute_empirical_mu(dynamic_shift_len, num_inference_steps)
+        sigmas = math.exp(mu) / (math.exp(mu) + (1 / sigmas - 1))
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
+    @staticmethod
+    def set_timesteps_z_image(num_inference_steps=100, denoising_strength=1.0, shift=None, target_timesteps=None):
+        sigma_min = 0.0
+        sigma_max = 1.0
+        shift = 3 if shift is None else shift
+        num_train_timesteps = 1000
+        sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+        sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+        sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)
+        timesteps = sigmas * num_train_timesteps
+        if target_timesteps is not None:
+            target_timesteps = target_timesteps.to(dtype=timesteps.dtype, device=timesteps.device)
+            for timestep in target_timesteps:
+                timestep_id = torch.argmin((timesteps - timestep).abs())
+                timesteps[timestep_id] = timestep
+        return sigmas, timesteps
+
+    @staticmethod
+    def set_timesteps_ltx2(num_inference_steps=100, denoising_strength=1.0, dynamic_shift_len=None, terminal=0.1, special_case=None):
+        num_train_timesteps = 1000
+        if special_case == "stage2":
+            sigmas = torch.Tensor([0.909375, 0.725, 0.421875])
+        elif special_case == "ditilled_stage1":
+            sigmas = torch.Tensor([1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875])
+        else:
+            dynamic_shift_len = dynamic_shift_len or 4096
+            sigma_shift = FlowMatchScheduler._calculate_shift_qwen_image(
+                image_seq_len=dynamic_shift_len,
+                base_seq_len=1024,
+                max_seq_len=4096,
+                base_shift=0.95,
+                max_shift=2.05,
+            )
+            sigma_min = 0.0
+            sigma_max = 1.0
+            sigma_start = sigma_min + (sigma_max - sigma_min) * denoising_strength
+            sigmas = torch.linspace(sigma_start, sigma_min, num_inference_steps + 1)[:-1]
+            sigmas = math.exp(sigma_shift) / (math.exp(sigma_shift) + (1 / sigmas - 1))
+            # Shift terminal
+            one_minus_z = 1.0 - sigmas
+            scale_factor = one_minus_z[-1] / (1 - terminal)
+            sigmas = 1.0 - (one_minus_z / scale_factor)
+        timesteps = sigmas * num_train_timesteps
+        return sigmas, timesteps
+
+    def set_training_weight(self):
+        steps = 1000
+        x = self.timesteps
+        y = torch.exp(-2 * ((x - steps / 2) / steps) ** 2)
+        y_shifted = y - y.min()
+        bsmntw_weighing = y_shifted * (steps / y_shifted.sum())
+        if len(self.timesteps) != 1000:
+            # This is an empirical formula.
+            bsmntw_weighing = bsmntw_weighing * (len(self.timesteps) / steps)
+            bsmntw_weighing = bsmntw_weighing + bsmntw_weighing[1]
+        self.linear_timesteps_weights = bsmntw_weighing
+
+    def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, **kwargs):
+        self.sigmas, self.timesteps = self.set_timesteps_fn(
+            num_inference_steps=num_inference_steps,
+            denoising_strength=denoising_strength,
+            **kwargs,
+        )
+        if training:
+            self.set_training_weight()
+            self.training = True
+        else:
+            self.training = False
+
+    def step(self, model_output, timestep, sample, to_final=False, **kwargs):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.cpu()
+        timestep_id = torch.argmin((self.timesteps - timestep).abs())
+        sigma = self.sigmas[timestep_id]
+        if to_final or timestep_id + 1 >= len(self.timesteps):
+            sigma_ = 0
+        else:
+            sigma_ = self.sigmas[timestep_id + 1]
+        prev_sample = sample + model_output * (sigma_ - sigma)
+        return prev_sample
+
+    def return_to_timestep(self, timestep, sample, sample_stablized):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.cpu()
+        timestep_id = torch.argmin((self.timesteps - timestep).abs())
+        sigma = self.sigmas[timestep_id]
+        model_output = (sample - sample_stablized) / sigma
+        return model_output
+
+    def add_noise(self, original_samples, noise, timestep):
+        if isinstance(timestep, torch.Tensor):
+            timestep = timestep.cpu()
+        timestep_id = torch.argmin((self.timesteps - timestep).abs())
+        sigma = self.sigmas[timestep_id]
+        sample = (1 - sigma) * original_samples + sigma * noise
+        return sample
+
+    def training_target(self, sample, noise, timestep):
+        target = noise - sample
+        return target
+
+    def training_weight(self, timestep):
+        timestep_id = torch.argmin((self.timesteps - timestep.to(self.timesteps.device)).abs())
+        weights = self.linear_timesteps_weights[timestep_id]
+        return weights
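
A minimal usage sketch of `FlowMatchScheduler` (the random tensor stands in for a real DiT's prediction; the shape is illustrative):

```python
import torch
from diffsynth.diffusion.flow_match import FlowMatchScheduler

scheduler = FlowMatchScheduler(template="FLUX.1")
scheduler.set_timesteps(num_inference_steps=20, shift=3)

latents = torch.randn(1, 16, 64, 64)  # stand-in for VAE latents
for timestep in scheduler.timesteps:
    model_output = torch.randn_like(latents)  # a real pipeline queries the DiT here
    latents = scheduler.step(model_output, timestep, latents)
# After the loop, latents corresponds to sigma = sigma_min (fully denoised).
```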
diffsynth/diffusion/logger.py ADDED
@@ -0,0 +1,43 @@
+import os, torch
+from accelerate import Accelerator
+
+
+class ModelLogger:
+    def __init__(self, output_path, remove_prefix_in_ckpt=None, state_dict_converter=lambda x: x, resume_step=0):
+        self.output_path = output_path
+        self.remove_prefix_in_ckpt = remove_prefix_in_ckpt
+        self.state_dict_converter = state_dict_converter
+        self.num_steps = resume_step
+
+
+    def on_step_end(self, accelerator: Accelerator, model: torch.nn.Module, save_steps=None, **kwargs):
+        self.num_steps += 1
+        if save_steps is not None and self.num_steps % save_steps == 0:
+            self.save_model(accelerator, model, f"step-{self.num_steps}.safetensors")
+
+
+    def on_epoch_end(self, accelerator: Accelerator, model: torch.nn.Module, epoch_id):
+        accelerator.wait_for_everyone()
+        state_dict = accelerator.get_state_dict(model)
+        if accelerator.is_main_process:
+            state_dict = accelerator.unwrap_model(model).export_trainable_state_dict(state_dict, remove_prefix=self.remove_prefix_in_ckpt)
+            state_dict = self.state_dict_converter(state_dict)
+            os.makedirs(self.output_path, exist_ok=True)
+            path = os.path.join(self.output_path, f"epoch-{epoch_id}.safetensors")
+            accelerator.save(state_dict, path, safe_serialization=True)
+
+
+    def on_training_end(self, accelerator: Accelerator, model: torch.nn.Module, save_steps=None):
+        if save_steps is not None and self.num_steps % save_steps != 0:
+            self.save_model(accelerator, model, f"step-{self.num_steps}.safetensors")
+
+
+    def save_model(self, accelerator: Accelerator, model: torch.nn.Module, file_name):
+        accelerator.wait_for_everyone()
+        state_dict = accelerator.get_state_dict(model)
+        if accelerator.is_main_process:
+            state_dict = accelerator.unwrap_model(model).export_trainable_state_dict(state_dict, remove_prefix=self.remove_prefix_in_ckpt)
+            state_dict = self.state_dict_converter(state_dict)
+            os.makedirs(self.output_path, exist_ok=True)
+            path = os.path.join(self.output_path, file_name)
+            accelerator.save(state_dict, path, safe_serialization=True)
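
How the hooks above are meant to be driven from a training loop (a sketch; the model must implement `export_trainable_state_dict`, as `DiffusionTrainingModule` does):

```python
from accelerate import Accelerator
from diffsynth.diffusion.logger import ModelLogger

accelerator = Accelerator()
model_logger = ModelLogger(output_path="./models", remove_prefix_in_ckpt="pipe.dit.")

# Inside the loop, once per optimizer step; saves every `save_steps` steps:
#     model_logger.on_step_end(accelerator, model, save_steps=500)
# After the loop, flush a final checkpoint if the last interval was partial:
#     model_logger.on_training_end(accelerator, model, save_steps=500)
```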
diffsynth/diffusion/loss.py ADDED
@@ -0,0 +1,158 @@
+from .base_pipeline import BasePipeline
+import torch
+
+
+def FlowMatchSFTLoss(pipe: BasePipeline, **inputs):
+    max_timestep_boundary = int(inputs.get("max_timestep_boundary", 1) * len(pipe.scheduler.timesteps))
+    min_timestep_boundary = int(inputs.get("min_timestep_boundary", 0) * len(pipe.scheduler.timesteps))
+
+    timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
+    timestep = pipe.scheduler.timesteps[timestep_id].to(dtype=pipe.torch_dtype, device=pipe.device)
+
+    noise = torch.randn_like(inputs["input_latents"])
+    inputs["latents"] = pipe.scheduler.add_noise(inputs["input_latents"], noise, timestep)
+    training_target = pipe.scheduler.training_target(inputs["input_latents"], noise, timestep)
+
+    if "first_frame_latents" in inputs:
+        inputs["latents"][:, :, 0:1] = inputs["first_frame_latents"]
+
+    models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
+    noise_pred = pipe.model_fn(**models, **inputs, timestep=timestep)
+
+    if "first_frame_latents" in inputs:
+        noise_pred = noise_pred[:, :, 1:]
+        training_target = training_target[:, :, 1:]
+
+    loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
+    loss = loss * pipe.scheduler.training_weight(timestep)
+    return loss
+
+
+def FlowMatchSFTAudioVideoLoss(pipe: BasePipeline, **inputs):
+    max_timestep_boundary = int(inputs.get("max_timestep_boundary", 1) * len(pipe.scheduler.timesteps))
+    min_timestep_boundary = int(inputs.get("min_timestep_boundary", 0) * len(pipe.scheduler.timesteps))
+
+    timestep_id = torch.randint(min_timestep_boundary, max_timestep_boundary, (1,))
+    timestep = pipe.scheduler.timesteps[timestep_id].to(dtype=pipe.torch_dtype, device=pipe.device)
+
+    # video
+    noise = torch.randn_like(inputs["input_latents"])
+    inputs["video_latents"] = pipe.scheduler.add_noise(inputs["input_latents"], noise, timestep)
+    training_target = pipe.scheduler.training_target(inputs["input_latents"], noise, timestep)
+
+    # audio
+    if inputs.get("audio_input_latents") is not None:
+        audio_noise = torch.randn_like(inputs["audio_input_latents"])
+        inputs["audio_latents"] = pipe.scheduler.add_noise(inputs["audio_input_latents"], audio_noise, timestep)
+        training_target_audio = pipe.scheduler.training_target(inputs["audio_input_latents"], audio_noise, timestep)
+
+    models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
+    noise_pred, noise_pred_audio = pipe.model_fn(**models, **inputs, timestep=timestep)
+
+    loss = torch.nn.functional.mse_loss(noise_pred.float(), training_target.float())
+    loss = loss * pipe.scheduler.training_weight(timestep)
+    if inputs.get("audio_input_latents") is not None:
+        loss_audio = torch.nn.functional.mse_loss(noise_pred_audio.float(), training_target_audio.float())
+        loss_audio = loss_audio * pipe.scheduler.training_weight(timestep)
+        loss = loss + loss_audio
+    return loss
+
+
+def DirectDistillLoss(pipe: BasePipeline, **inputs):
+    pipe.scheduler.set_timesteps(inputs["num_inference_steps"])
+    pipe.scheduler.training = True
+    models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
+    for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
+        timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
+        noise_pred = pipe.model_fn(**models, **inputs, timestep=timestep, progress_id=progress_id)
+        inputs["latents"] = pipe.step(pipe.scheduler, progress_id=progress_id, noise_pred=noise_pred, **inputs)
+    loss = torch.nn.functional.mse_loss(inputs["latents"].float(), inputs["input_latents"].float())
+    return loss
+
+
+class TrajectoryImitationLoss(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.initialized = False
+
+    def initialize(self, device):
+        import lpips  # TODO: remove it
+        self.loss_fn = lpips.LPIPS(net='alex').to(device)
+        self.initialized = True
+
+    def fetch_trajectory(self, pipe: BasePipeline, timesteps_student, inputs_shared, inputs_posi, inputs_nega, num_inference_steps, cfg_scale):
+        trajectory = [inputs_shared["latents"].clone()]
+
+        pipe.scheduler.set_timesteps(num_inference_steps, target_timesteps=timesteps_student)
+        models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
+        for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
+            timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
+            noise_pred = pipe.cfg_guided_model_fn(
+                pipe.model_fn, cfg_scale,
+                inputs_shared, inputs_posi, inputs_nega,
+                **models, timestep=timestep, progress_id=progress_id
+            )
+            inputs_shared["latents"] = pipe.step(pipe.scheduler, progress_id=progress_id, noise_pred=noise_pred.detach(), **inputs_shared)
+
+            trajectory.append(inputs_shared["latents"].clone())
+        return pipe.scheduler.timesteps, trajectory
+
+    def align_trajectory(self, pipe: BasePipeline, timesteps_teacher, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, num_inference_steps, cfg_scale):
+        loss = 0
+        pipe.scheduler.set_timesteps(num_inference_steps, training=True)
+        models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
+        for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
+            timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
+
+            progress_id_teacher = torch.argmin((timesteps_teacher - timestep).abs())
+            inputs_shared["latents"] = trajectory_teacher[progress_id_teacher]
+
+            noise_pred = pipe.cfg_guided_model_fn(
+                pipe.model_fn, cfg_scale,
+                inputs_shared, inputs_posi, inputs_nega,
+                **models, timestep=timestep, progress_id=progress_id
+            )
+
+            sigma = pipe.scheduler.sigmas[progress_id]
+            sigma_ = 0 if progress_id + 1 >= len(pipe.scheduler.timesteps) else pipe.scheduler.sigmas[progress_id + 1]
+            if progress_id + 1 >= len(pipe.scheduler.timesteps):
+                latents_ = trajectory_teacher[-1]
+            else:
+                progress_id_teacher = torch.argmin((timesteps_teacher - pipe.scheduler.timesteps[progress_id + 1]).abs())
+                latents_ = trajectory_teacher[progress_id_teacher]
+
+            denom = sigma_ - sigma
+            denom = torch.sign(denom) * torch.clamp(denom.abs(), min=1e-6)
+            target = (latents_ - inputs_shared["latents"]) / denom
+            loss = loss + torch.nn.functional.mse_loss(noise_pred.float(), target.float()) * pipe.scheduler.training_weight(timestep)
+        return loss
+
+    def compute_regularization(self, pipe: BasePipeline, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, num_inference_steps, cfg_scale):
+        inputs_shared["latents"] = trajectory_teacher[0]
+        pipe.scheduler.set_timesteps(num_inference_steps)
+        models = {name: getattr(pipe, name) for name in pipe.in_iteration_models}
+        for progress_id, timestep in enumerate(pipe.scheduler.timesteps):
+            timestep = timestep.unsqueeze(0).to(dtype=pipe.torch_dtype, device=pipe.device)
+            noise_pred = pipe.cfg_guided_model_fn(
+                pipe.model_fn, cfg_scale,
+                inputs_shared, inputs_posi, inputs_nega,
+                **models, timestep=timestep, progress_id=progress_id
+            )
+            inputs_shared["latents"] = pipe.step(pipe.scheduler, progress_id=progress_id, noise_pred=noise_pred.detach(), **inputs_shared)
+
+        image_pred = pipe.vae_decoder(inputs_shared["latents"])
+        image_real = pipe.vae_decoder(trajectory_teacher[-1])
+        loss = self.loss_fn(image_pred.float(), image_real.float())
+        return loss
+
+    def forward(self, pipe: BasePipeline, inputs_shared, inputs_posi, inputs_nega):
+        if not self.initialized:
+            self.initialize(pipe.device)
+        with torch.no_grad():
+            pipe.scheduler.set_timesteps(8)
+            timesteps_teacher, trajectory_teacher = self.fetch_trajectory(inputs_shared["teacher"], pipe.scheduler.timesteps, inputs_shared, inputs_posi, inputs_nega, 50, 2)
+            timesteps_teacher = timesteps_teacher.to(dtype=pipe.torch_dtype, device=pipe.device)
+        loss_1 = self.align_trajectory(pipe, timesteps_teacher, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, 8, 1)
+        loss_2 = self.compute_regularization(pipe, trajectory_teacher, inputs_shared, inputs_posi, inputs_nega, 8, 1)
+        loss = loss_1 + loss_2
+        return loss
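
The SFT losses above rely on the basic flow-matching identity: with `x_t = (1 - sigma) * x0 + sigma * noise` (as in `scheduler.add_noise`) and target `v = noise - x0` (as in `scheduler.training_target`), one exact Euler step from `sigma` to `0` recovers `x0`. A standalone numeric check:

```python
import torch

x0, noise, sigma = torch.randn(4), torch.randn(4), 0.7

x_t = (1 - sigma) * x0 + sigma * noise  # scheduler.add_noise
v = noise - x0                          # scheduler.training_target
x_rec = x_t + v * (0 - sigma)           # scheduler.step with sigma_ = 0
assert torch.allclose(x_rec, x0, atol=1e-6)
```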
diffsynth/diffusion/parsers.py ADDED
@@ -0,0 +1,71 @@
+import argparse
+
+
+def add_dataset_base_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--dataset_base_path", type=str, default="", required=True, help="Base path of the dataset.")
+    parser.add_argument("--dataset_metadata_path", type=str, default=None, help="Path to the metadata file of the dataset.")
+    parser.add_argument("--dataset_repeat", type=int, default=1, help="Number of times to repeat the dataset per epoch.")
+    parser.add_argument("--dataset_num_workers", type=int, default=0, help="Number of workers for data loading.")
+    parser.add_argument("--data_file_keys", type=str, default="image,video", help="Data file keys in the metadata. Comma-separated.")
+    return parser
+
+def add_image_size_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--height", type=int, default=None, help="Height of images. Leave `height` and `width` empty to enable dynamic resolution.")
+    parser.add_argument("--width", type=int, default=None, help="Width of images. Leave `height` and `width` empty to enable dynamic resolution.")
+    parser.add_argument("--max_pixels", type=int, default=1024*1024, help="Maximum number of pixels per frame, used for dynamic resolution.")
+    return parser
+
+def add_video_size_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--height", type=int, default=None, help="Height of video frames. Leave `height` and `width` empty to enable dynamic resolution.")
+    parser.add_argument("--width", type=int, default=None, help="Width of video frames. Leave `height` and `width` empty to enable dynamic resolution.")
+    parser.add_argument("--max_pixels", type=int, default=1024*1024, help="Maximum number of pixels per frame, used for dynamic resolution.")
+    parser.add_argument("--num_frames", type=int, default=81, help="Number of frames per video. Frames are sampled from the video prefix.")
+    return parser
+
+def add_model_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--model_paths", type=str, default=None, help="Paths to load models. In JSON format.")
+    parser.add_argument("--model_id_with_origin_paths", type=str, default=None, help="Model ID with origin paths, e.g., Wan-AI/Wan2.1-T2V-1.3B:diffusion_pytorch_model*.safetensors. Comma-separated.")
+    parser.add_argument("--extra_inputs", default=None, help="Additional model inputs, comma-separated.")
+    parser.add_argument("--fp8_models", default=None, help="Models with FP8 precision, comma-separated.")
+    parser.add_argument("--offload_models", default=None, help="Models with offload, comma-separated. Only used in split training.")
+    return parser
+
+def add_training_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate.")
+    parser.add_argument("--num_epochs", type=int, default=1, help="Number of epochs.")
+    parser.add_argument("--trainable_models", type=str, default=None, help="Models to train, e.g., dit, vae, text_encoder.")
+    parser.add_argument("--find_unused_parameters", default=False, action="store_true", help="Whether to find unused parameters in DDP.")
+    parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay.")
+    parser.add_argument("--task", type=str, default="sft", required=False, help="Task type.")
+    return parser
+
+def add_output_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--output_path", type=str, default="./models", help="Output save path.")
+    parser.add_argument("--remove_prefix_in_ckpt", type=str, default="pipe.dit.", help="Prefix to remove from parameter names in the saved checkpoint.")
+    parser.add_argument("--save_steps", type=int, default=None, help="Checkpoint saving interval in steps. If None, checkpoints will be saved every epoch.")
+    parser.add_argument("--resume_step", type=int, default=0, help="Starting step count when resuming. ModelLogger.num_steps initializes here; training stops when num_steps reaches num_epochs * steps_per_epoch.")
+    return parser
+
+def add_lora_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--lora_base_model", type=str, default=None, help="Which model LoRA is added to.")
+    parser.add_argument("--lora_target_modules", type=str, default="q,k,v,o,ffn.0,ffn.2", help="Which layers LoRA is added to.")
+    parser.add_argument("--lora_rank", type=int, default=32, help="Rank of LoRA.")
+    parser.add_argument("--lora_checkpoint", type=str, default=None, help="Path to the LoRA checkpoint. If provided, LoRA will be loaded from this checkpoint.")
+    parser.add_argument("--preset_lora_path", type=str, default=None, help="Path to the preset LoRA checkpoint. If provided, this LoRA will be fused into the base model.")
+    parser.add_argument("--preset_lora_model", type=str, default=None, help="Which model the preset LoRA is fused into.")
+    return parser
+
+def add_gradient_config(parser: argparse.ArgumentParser):
+    parser.add_argument("--use_gradient_checkpointing", default=False, action="store_true", help="Whether to use gradient checkpointing.")
+    parser.add_argument("--use_gradient_checkpointing_offload", default=False, action="store_true", help="Whether to offload gradient checkpointing to CPU memory.")
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Gradient accumulation steps.")
+    return parser
+
+def add_general_config(parser: argparse.ArgumentParser):
+    parser = add_dataset_base_config(parser)
+    parser = add_model_config(parser)
+    parser = add_training_config(parser)
+    parser = add_output_config(parser)
+    parser = add_lora_config(parser)
+    parser = add_gradient_config(parser)
+    return parser
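
A sketch of composing these helpers into a training CLI (argument values are illustrative):

```python
import argparse
from diffsynth.diffusion.parsers import add_general_config

parser = add_general_config(argparse.ArgumentParser(description="Training script"))
args = parser.parse_args([
    "--dataset_base_path", "./data",
    "--learning_rate", "1e-4",
    "--lora_base_model", "dit",
])
print(args.lora_rank)  # 32 (the default)
```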
diffsynth/diffusion/runner.py ADDED
@@ -0,0 +1,135 @@
+import os, torch
+from tqdm import tqdm
+from accelerate import Accelerator
+from .training_module import DiffusionTrainingModule
+from .logger import ModelLogger
+
+
+def launch_training_task(
+    accelerator: Accelerator,
+    dataset: torch.utils.data.Dataset,
+    model: DiffusionTrainingModule,
+    model_logger: ModelLogger,
+    learning_rate: float = 1e-5,
+    weight_decay: float = 1e-2,
+    num_workers: int = 1,
+    save_steps: int = None,
+    num_epochs: int = 1,
+    args=None,
+):
+    if args is not None:
+        learning_rate = args.learning_rate
+        weight_decay = args.weight_decay
+        num_workers = args.dataset_num_workers
+        save_steps = args.save_steps
+        num_epochs = args.num_epochs
+
+    optimizer = torch.optim.AdamW(model.trainable_modules(), lr=learning_rate, weight_decay=weight_decay)
+    scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer)
+    dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, collate_fn=lambda x: x[0], num_workers=num_workers)
+    model.to(device=accelerator.device)
+    # Exclude the VAE from DeepSpeed ZeRO-3 wrapping to avoid compatibility issues:
+    # store it outside the module tree so DeepSpeed doesn't touch it.
+    vae_module = getattr(model.pipe, 'vae', None)
+    if vae_module is not None:
+        del model.pipe._modules['vae']
+    model, optimizer, dataloader, scheduler = accelerator.prepare(model, optimizer, dataloader, scheduler)
+    if vae_module is not None:
+        vae_module.to(accelerator.device)
+        # Store the VAE as a non-module attribute so pipeline code can still use pipe.vae.
+        pipe = model.module.pipe if hasattr(model, 'module') else model.pipe
+        # Use object.__setattr__ to bypass nn.Module.__setattr__, which would register it as a submodule.
+        object.__setattr__(pipe, 'vae', vae_module)
+    initialize_deepspeed_gradient_checkpointing(accelerator)
+    # Training log file
+    log_path = os.path.join(model_logger.output_path, "training_log.txt")
+    if accelerator.is_main_process:
+        os.makedirs(model_logger.output_path, exist_ok=True)
+        log_file = open(log_path, "a")
+        log_file.write(f"Training started. Epochs: {num_epochs}, LR: {learning_rate}, Steps/epoch: {len(dataloader)}\n")
+        log_file.flush()
+    else:
+        log_file = None
+
+    total_target = num_epochs * len(dataloader)
+    reached_target = False
+    for epoch_id in range(num_epochs):
+        if reached_target:
+            break
+        progress = tqdm(
+            total=total_target,
+            initial=model_logger.num_steps,
+            desc=f"Epoch {epoch_id+1}/{num_epochs}",
+        )
+        for step_id, data in enumerate(dataloader):
+            if model_logger.num_steps >= total_target:
+                reached_target = True
+                break
+            with accelerator.accumulate(model):
+                optimizer.zero_grad()
+                if dataset.load_from_cache:
+                    loss = model({}, inputs=data)
+                else:
+                    loss = model(data)
+                accelerator.backward(loss)
+                optimizer.step()
+                model_logger.on_step_end(accelerator, model, save_steps, loss=loss)
+                scheduler.step()
+
+            # Log loss
+            loss_val = loss.item()
+            progress.update(1)
+            progress.set_postfix(loss=f"{loss_val:.4f}")
+            if accelerator.is_main_process and log_file is not None and (model_logger.num_steps % 10 == 0 or model_logger.num_steps <= 5):
+                log_file.write(f"epoch={epoch_id+1} step={model_logger.num_steps} loss={loss_val:.6f}\n")
+                log_file.flush()
+        progress.close()
+        if save_steps is None:
+            model_logger.on_epoch_end(accelerator, model, epoch_id)
+            if accelerator.is_main_process and log_file is not None:
+                log_file.write(f"Epoch {epoch_id+1} completed. Checkpoint saved.\n")
+                log_file.flush()
+    model_logger.on_training_end(accelerator, model, save_steps)
+    if log_file is not None:
+        log_file.close()
+
+
+def launch_data_process_task(
+    accelerator: Accelerator,
+    dataset: torch.utils.data.Dataset,
+    model: DiffusionTrainingModule,
+    model_logger: ModelLogger,
+    num_workers: int = 8,
+    args=None,
+):
+    if args is not None:
+        num_workers = args.dataset_num_workers
+
+    dataloader = torch.utils.data.DataLoader(dataset, shuffle=False, collate_fn=lambda x: x[0], num_workers=num_workers)
+    model.to(device=accelerator.device)
+    model, dataloader = accelerator.prepare(model, dataloader)
+
+    for data_id, data in enumerate(tqdm(dataloader)):
+        with accelerator.accumulate(model):
+            with torch.no_grad():
+                folder = os.path.join(model_logger.output_path, str(accelerator.process_index))
+                os.makedirs(folder, exist_ok=True)
+                save_path = os.path.join(folder, f"{data_id}.pth")
+                data = model(data)
+                torch.save(data, save_path)
+
+
+def initialize_deepspeed_gradient_checkpointing(accelerator: Accelerator):
+    if getattr(accelerator.state, "deepspeed_plugin", None) is not None:
+        ds_config = accelerator.state.deepspeed_plugin.deepspeed_config
+        if "activation_checkpointing" in ds_config:
+            import deepspeed
+            act_config = ds_config["activation_checkpointing"]
+            deepspeed.checkpointing.configure(
+                mpu_=None,
+                partition_activations=act_config.get("partition_activations", False),
+                checkpoint_in_cpu=act_config.get("cpu_checkpointing", False),
+                contiguous_checkpointing=act_config.get("contiguous_memory_optimization", False)
+            )
+        else:
+            print("No activation_checkpointing entry found in the DeepSpeed config; skipping DeepSpeed gradient checkpointing initialization.")
diffsynth/diffusion/training_module.py ADDED
@@ -0,0 +1,302 @@
+import torch, json, os, inspect
+from ..core import ModelConfig, load_state_dict
+from ..utils.controlnet import ControlNetInput
+from .base_pipeline import PipelineUnit
+from peft import LoraConfig, inject_adapter_in_model
+
+
+class GeneralUnit_RemoveCache(PipelineUnit):
+    def __init__(self, required_params=tuple(), force_remove_params_shared=tuple(), force_remove_params_posi=tuple(), force_remove_params_nega=tuple()):
+        super().__init__(take_over=True)
+        self.required_params = required_params
+        self.force_remove_params_shared = force_remove_params_shared
+        self.force_remove_params_posi = force_remove_params_posi
+        self.force_remove_params_nega = force_remove_params_nega
+
+    def process_params(self, inputs, required_params, force_remove_params):
+        inputs_ = {}
+        for name, param in inputs.items():
+            if name in required_params and name not in force_remove_params:
+                inputs_[name] = param
+        return inputs_
+
+    def process(self, pipe, inputs_shared, inputs_posi, inputs_nega):
+        inputs_shared = self.process_params(inputs_shared, self.required_params, self.force_remove_params_shared)
+        inputs_posi = self.process_params(inputs_posi, self.required_params, self.force_remove_params_posi)
+        inputs_nega = self.process_params(inputs_nega, self.required_params, self.force_remove_params_nega)
+        return inputs_shared, inputs_posi, inputs_nega
+
+
+class DiffusionTrainingModule(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+
+    def to(self, *args, **kwargs):
+        for name, model in self.named_children():
+            model.to(*args, **kwargs)
+        return self
+
+
+    def trainable_modules(self):
+        trainable_modules = filter(lambda p: p.requires_grad, self.parameters())
+        return trainable_modules
+
+
+    def trainable_param_names(self):
+        trainable_param_names = list(filter(lambda named_param: named_param[1].requires_grad, self.named_parameters()))
+        trainable_param_names = set([named_param[0] for named_param in trainable_param_names])
+        return trainable_param_names
+
+
+    def add_lora_to_model(self, model, target_modules, lora_rank, lora_alpha=None, upcast_dtype=None):
+        if lora_alpha is None:
+            lora_alpha = lora_rank
+        if isinstance(target_modules, list) and len(target_modules) == 1:
+            target_modules = target_modules[0]
+        lora_config = LoraConfig(r=lora_rank, lora_alpha=lora_alpha, target_modules=target_modules)
+        model = inject_adapter_in_model(lora_config, model)
+        if upcast_dtype is not None:
+            for param in model.parameters():
+                if param.requires_grad:
+                    param.data = param.to(upcast_dtype)
+        return model
+
+
+    def mapping_lora_state_dict(self, state_dict):
+        new_state_dict = {}
+        for key, value in state_dict.items():
+            if "lora_A.weight" in key or "lora_B.weight" in key:
+                new_key = key.replace("lora_A.weight", "lora_A.default.weight").replace("lora_B.weight", "lora_B.default.weight")
+                new_state_dict[new_key] = value
+            elif "lora_A.default.weight" in key or "lora_B.default.weight" in key:
+                new_state_dict[key] = value
+        return new_state_dict
+
+
+    def export_trainable_state_dict(self, state_dict, remove_prefix=None):
+        trainable_param_names = self.trainable_param_names()
+        state_dict = {name: param for name, param in state_dict.items() if name in trainable_param_names}
+        if remove_prefix is not None:
+            state_dict_ = {}
+            for name, param in state_dict.items():
+                if name.startswith(remove_prefix):
+                    name = name[len(remove_prefix):]
+                state_dict_[name] = param
+            state_dict = state_dict_
+        return state_dict
+
+
+    def transfer_data_to_device(self, data, device, torch_float_dtype=None):
+        if data is None:
+            return data
+        elif isinstance(data, torch.Tensor):
+            data = data.to(device)
+            if torch_float_dtype is not None and data.dtype in [torch.float, torch.float16, torch.bfloat16]:
+                data = data.to(torch_float_dtype)
+            return data
+        elif isinstance(data, tuple):
+            data = tuple(self.transfer_data_to_device(x, device, torch_float_dtype) for x in data)
+            return data
+        elif isinstance(data, list):
+            data = list(self.transfer_data_to_device(x, device, torch_float_dtype) for x in data)
+            return data
+        elif isinstance(data, dict):
+            data = {i: self.transfer_data_to_device(data[i], device, torch_float_dtype) for i in data}
+            return data
+        else:
+            return data
+
+    def parse_vram_config(self, fp8=False, offload=False, device="cpu"):
+        if fp8:
+            return {
+                "offload_dtype": torch.float8_e4m3fn,
+                "offload_device": device,
+                "onload_dtype": torch.float8_e4m3fn,
+                "onload_device": device,
+                "preparing_dtype": torch.float8_e4m3fn,
+                "preparing_device": device,
+                "computation_dtype": torch.bfloat16,
+                "computation_device": device,
+            }
+        elif offload:
+            return {
+                "offload_dtype": "disk",
+                "offload_device": "disk",
+                "onload_dtype": "disk",
+                "onload_device": "disk",
+                "preparing_dtype": torch.bfloat16,
+                "preparing_device": device,
+                "computation_dtype": torch.bfloat16,
+                "computation_device": device,
+                "clear_parameters": True,
+            }
+        else:
+            return {}
+
+    def parse_model_configs(self, model_paths, model_id_with_origin_paths, fp8_models=None, offload_models=None, device="cpu"):
+        fp8_models = [] if fp8_models is None else fp8_models.split(",")
+        offload_models = [] if offload_models is None else offload_models.split(",")
+        model_configs = []
+        if model_paths is not None:
+            model_paths = json.loads(model_paths)
+            for path in model_paths:
+                vram_config = self.parse_vram_config(
+                    fp8=path in fp8_models,
+                    offload=path in offload_models,
+                    device=device
+                )
+                model_configs.append(ModelConfig(path=path, **vram_config))
+        if model_id_with_origin_paths is not None:
+            model_id_with_origin_paths = model_id_with_origin_paths.split(",")
+            for model_id_with_origin_path in model_id_with_origin_paths:
+                vram_config = self.parse_vram_config(
+                    fp8=model_id_with_origin_path in fp8_models,
+                    offload=model_id_with_origin_path in offload_models,
+                    device=device
+                )
+                config = self.parse_path_or_model_id(model_id_with_origin_path)
+                model_configs.append(ModelConfig(model_id=config.model_id, origin_file_pattern=config.origin_file_pattern, **vram_config))
+        return model_configs
+
+
+    def parse_path_or_model_id(self, model_id_with_origin_path, default_value=None):
+        if model_id_with_origin_path is None:
+            return default_value
+        elif os.path.exists(model_id_with_origin_path):
+            return ModelConfig(path=model_id_with_origin_path)
+        else:
+            if ":" not in model_id_with_origin_path:
+                raise ValueError(f"Failed to parse model config: {model_id_with_origin_path}. This is neither a valid path nor in the format of `model_id:origin_file_pattern`.")
+            split_id = model_id_with_origin_path.rfind(":")
+            model_id = model_id_with_origin_path[:split_id]
+            origin_file_pattern = model_id_with_origin_path[split_id + 1:]
+            return ModelConfig(model_id=model_id, origin_file_pattern=origin_file_pattern)
+
+
+    def auto_detect_lora_target_modules(
+        self,
+        model: torch.nn.Module,
+        search_for_linear=False,
+        linear_detector=lambda x: min(x.weight.shape) >= 512,
+        block_list_detector=lambda x: isinstance(x, torch.nn.ModuleList) and len(x) > 1,
+        name_prefix="",
+    ):
+        lora_target_modules = []
+        if search_for_linear:
+            for name, module in model.named_modules():
+                module_name = name_prefix + ["", "."][name_prefix != ""] + name
+                if isinstance(module, torch.nn.Linear) and linear_detector(module):
+                    lora_target_modules.append(module_name)
+        else:
+            for name, module in model.named_children():
+                module_name = name_prefix + ["", "."][name_prefix != ""] + name
+                lora_target_modules += self.auto_detect_lora_target_modules(
+                    module,
+                    search_for_linear=block_list_detector(module),
+                    linear_detector=linear_detector,
+                    block_list_detector=block_list_detector,
+                    name_prefix=module_name,
+                )
+        return lora_target_modules
+
+
+    def parse_lora_target_modules(self, model, lora_target_modules):
+        if lora_target_modules == "":
+            print("No LoRA target modules specified. The framework will automatically search for them.")
+            lora_target_modules = self.auto_detect_lora_target_modules(model)
+            print(f"LoRA will be patched at {lora_target_modules}.")
+        else:
+            lora_target_modules = lora_target_modules.split(",")
+        return lora_target_modules
+
+
+    def switch_pipe_to_training_mode(
+        self,
+        pipe,
+        trainable_models=None,
+        lora_base_model=None, lora_target_modules="", lora_rank=32, lora_checkpoint=None,
+        preset_lora_path=None, preset_lora_model=None,
+        task="sft",
+    ):
+        # Scheduler
+        pipe.scheduler.set_timesteps(1000, training=True)
+
+        # Freeze untrainable models
+        pipe.freeze_except([] if trainable_models is None else trainable_models.split(","))
+
+        # Preset LoRA
+        if preset_lora_path is not None:
+            pipe.load_lora(getattr(pipe, preset_lora_model), preset_lora_path)
+
+        # FP8
+        # FP8 relies on a model-specific memory management scheme.
+        # It is delegated to the subclass.
+
+        # Add LoRA to the base models
+        if lora_base_model is not None and not task.endswith(":data_process"):
+            if (not hasattr(pipe, lora_base_model)) or getattr(pipe, lora_base_model) is None:
+                print(f"No {lora_base_model} models in the pipeline. We cannot patch LoRA on the model. If this occurs during the data processing stage, it is normal.")
+                return
+            model = self.add_lora_to_model(
+                getattr(pipe, lora_base_model),
+                target_modules=self.parse_lora_target_modules(getattr(pipe, lora_base_model), lora_target_modules),
+                lora_rank=lora_rank,
+                upcast_dtype=pipe.torch_dtype,
+            )
+            if lora_checkpoint is not None:
+                state_dict = load_state_dict(lora_checkpoint)
+                state_dict = self.mapping_lora_state_dict(state_dict)
+                load_result = model.load_state_dict(state_dict, strict=False)
+                print(f"LoRA checkpoint loaded: {lora_checkpoint}, total {len(state_dict)} keys")
+                if len(load_result[1]) > 0:
+                    print(f"Warning, LoRA key mismatch! Unexpected keys in LoRA checkpoint: {load_result[1]}")
+            setattr(pipe, lora_base_model, model)
+
+
+    def split_pipeline_units(
+        self, task, pipe,
+        trainable_models=None, lora_base_model=None,
+        # TODO: set `remove_unnecessary_params` to `True` by default
+        remove_unnecessary_params=False,
+        # TODO: move `loss_required_params` to `loss.py`
+        loss_required_params=("input_latents", "max_timestep_boundary", "min_timestep_boundary", "first_frame_latents", "video_latents", "audio_input_latents", "num_inference_steps"),
+        force_remove_params_shared=tuple(),
+        force_remove_params_posi=tuple(),
+        force_remove_params_nega=tuple(),
+    ):
+        models_require_backward = []
+        if trainable_models is not None:
+            models_require_backward += trainable_models.split(",")
+        if lora_base_model is not None:
+            models_require_backward += [lora_base_model]
+        if task.endswith(":data_process"):
+            other_units, pipe.units = pipe.split_pipeline_units(models_require_backward)
+            if remove_unnecessary_params:
+                required_params = list(loss_required_params) + [i for i in inspect.signature(self.pipe.model_fn).parameters]
+                for unit in other_units:
+                    required_params.extend(unit.fetch_input_params())
+                required_params = sorted(list(set(required_params)))
+                pipe.units.append(GeneralUnit_RemoveCache(required_params, force_remove_params_shared, force_remove_params_posi, force_remove_params_nega))
+        elif task.endswith(":train"):
+            pipe.units, _ = pipe.split_pipeline_units(models_require_backward)
+        return pipe
+
+    def parse_extra_inputs(self, data, extra_inputs, inputs_shared):
+        controlnet_keys_map = (
+            ("blockwise_controlnet_", "blockwise_controlnet_inputs"),
+            ("controlnet_", "controlnet_inputs"),
+        )
+        controlnet_inputs = {}
+        for extra_input in extra_inputs:
+            for prefix, name in controlnet_keys_map:
+                if extra_input.startswith(prefix):
+                    if name not in controlnet_inputs:
+                        controlnet_inputs[name] = {}
+                    controlnet_inputs[name][extra_input.replace(prefix, "")] = data[extra_input]
+                    break
+            else:
+                inputs_shared[extra_input] = data[extra_input]
+        for name, params in controlnet_inputs.items():
+            inputs_shared[name] = [ControlNetInput(**params)]
+        return inputs_shared
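
The auto-detection above only descends into `nn.ModuleList`s of repeated blocks (`block_list_detector`) and collects `nn.Linear` layers whose smaller weight dimension is at least 512 (`linear_detector`). A toy demonstration (the model below is hypothetical):

```python
import torch
from diffsynth.diffusion.training_module import DiffusionTrainingModule

class Block(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.q = torch.nn.Linear(1024, 1024)
        self.tiny = torch.nn.Linear(8, 8)  # filtered out: min dim < 512

class ToyDiT(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.blocks = torch.nn.ModuleList([Block() for _ in range(2)])

print(DiffusionTrainingModule().auto_detect_lora_target_modules(ToyDiT()))
# ['blocks.0.q', 'blocks.1.q']
```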
diffsynth/models/anima_dit.py ADDED
@@ -0,0 +1,1307 @@
1
+ # original code from: comfy/ldm/cosmos/predict2.py
2
+
3
+ import torch
4
+ from torch import nn
5
+ from einops import rearrange, repeat
6
+ from einops.layers.torch import Rearrange
7
+ import logging
8
+ from typing import Callable, Optional, Tuple, List
9
+ import math
10
+ from torchvision import transforms
11
+ from ..core.attention import attention_forward
12
+ from ..core.gradient import gradient_checkpoint_forward
13
+
14
+
15
+ class VideoPositionEmb(nn.Module):
16
+ def forward(self, x_B_T_H_W_C: torch.Tensor, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
17
+ """
18
+ It delegates the embedding generation to generate_embeddings function.
19
+ """
20
+ B_T_H_W_C = x_B_T_H_W_C.shape
21
+ embeddings = self.generate_embeddings(B_T_H_W_C, fps=fps, device=device, dtype=dtype)
22
+
23
+ return embeddings
24
+
25
+ def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None):
26
+ raise NotImplementedError
27
+
28
+
29
+ def normalize(x: torch.Tensor, dim: Optional[List[int]] = None, eps: float = 0) -> torch.Tensor:
30
+ """
31
+ Normalizes the input tensor along specified dimensions such that the average square norm of elements is adjusted.
32
+
33
+ Args:
34
+ x (torch.Tensor): The input tensor to normalize.
35
+ dim (list, optional): The dimensions over which to normalize. If None, normalizes over all dimensions except the first.
36
+ eps (float, optional): A small constant to ensure numerical stability during division.
37
+
38
+ Returns:
39
+ torch.Tensor: The normalized tensor.
40
+ """
41
+ if dim is None:
42
+ dim = list(range(1, x.ndim))
43
+ norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
44
+ norm = torch.add(eps, norm, alpha=math.sqrt(norm.numel() / x.numel()))
45
+ return x / norm.to(x.dtype)
46
+
47
+
48
+ class LearnablePosEmbAxis(VideoPositionEmb):
49
+ def __init__(
50
+ self,
51
+ *, # enforce keyword arguments
52
+ interpolation: str,
53
+ model_channels: int,
54
+ len_h: int,
55
+ len_w: int,
56
+ len_t: int,
57
+ device=None,
58
+ dtype=None,
59
+ **kwargs,
60
+ ):
61
+ """
62
+ Args:
63
+ interpolation (str): we curretly only support "crop", ideally when we need extrapolation capacity, we should adjust frequency or other more advanced methods. they are not implemented yet.
64
+ """
65
+ del kwargs # unused
66
+ super().__init__()
67
+ self.interpolation = interpolation
68
+ assert self.interpolation in ["crop"], f"Unknown interpolation method {self.interpolation}"
69
+
70
+ self.pos_emb_h = nn.Parameter(torch.empty(len_h, model_channels, device=device, dtype=dtype))
71
+ self.pos_emb_w = nn.Parameter(torch.empty(len_w, model_channels, device=device, dtype=dtype))
72
+ self.pos_emb_t = nn.Parameter(torch.empty(len_t, model_channels, device=device, dtype=dtype))
73
+
74
+ def generate_embeddings(self, B_T_H_W_C: torch.Size, fps=Optional[torch.Tensor], device=None, dtype=None) -> torch.Tensor:
75
+ B, T, H, W, _ = B_T_H_W_C
76
+ if self.interpolation == "crop":
77
+ emb_h_H = self.pos_emb_h[:H].to(device=device, dtype=dtype)
78
+ emb_w_W = self.pos_emb_w[:W].to(device=device, dtype=dtype)
79
+ emb_t_T = self.pos_emb_t[:T].to(device=device, dtype=dtype)
80
+ emb = (
81
+ repeat(emb_t_T, "t d-> b t h w d", b=B, h=H, w=W)
82
+ + repeat(emb_h_H, "h d-> b t h w d", b=B, t=T, w=W)
83
+ + repeat(emb_w_W, "w d-> b t h w d", b=B, t=T, h=H)
84
+ )
85
+ assert list(emb.shape)[:4] == [B, T, H, W], f"bad shape: {list(emb.shape)[:4]} != {B, T, H, W}"
86
+ else:
87
+ raise ValueError(f"Unknown interpolation method {self.interpolation}")
88
+
89
+ return normalize(emb, dim=-1, eps=1e-6)
90
+
91
+
92
+ class VideoRopePosition3DEmb(VideoPositionEmb):
93
+ def __init__(
94
+ self,
95
+ *, # enforce keyword arguments
96
+ head_dim: int,
97
+ len_h: int,
98
+ len_w: int,
99
+ len_t: int,
100
+ base_fps: int = 24,
101
+ h_extrapolation_ratio: float = 1.0,
102
+ w_extrapolation_ratio: float = 1.0,
103
+ t_extrapolation_ratio: float = 1.0,
104
+ enable_fps_modulation: bool = True,
105
+ device=None,
106
+ **kwargs, # used for compatibility with other positional embeddings; unused in this class
107
+ ):
108
+ del kwargs
109
+ super().__init__()
110
+ self.base_fps = base_fps
111
+ self.max_h = len_h
112
+ self.max_w = len_w
113
+ self.enable_fps_modulation = enable_fps_modulation
114
+
115
+ dim = head_dim
116
+ dim_h = dim // 6 * 2
117
+ dim_w = dim_h
118
+ dim_t = dim - 2 * dim_h
119
+ assert dim == dim_h + dim_w + dim_t, f"bad dim: {dim} != {dim_h} + {dim_w} + {dim_t}"
120
+ self.register_buffer(
121
+ "dim_spatial_range",
122
+ torch.arange(0, dim_h, 2, device=device)[: (dim_h // 2)].float() / dim_h,
123
+ persistent=False,
124
+ )
125
+ self.register_buffer(
126
+ "dim_temporal_range",
127
+ torch.arange(0, dim_t, 2, device=device)[: (dim_t // 2)].float() / dim_t,
128
+ persistent=False,
129
+ )
130
+
131
+ self.h_ntk_factor = h_extrapolation_ratio ** (dim_h / (dim_h - 2))
132
+ self.w_ntk_factor = w_extrapolation_ratio ** (dim_w / (dim_w - 2))
133
+ self.t_ntk_factor = t_extrapolation_ratio ** (dim_t / (dim_t - 2))
134
+
135
+ def generate_embeddings(
136
+ self,
137
+ B_T_H_W_C: torch.Size,
138
+ fps: Optional[torch.Tensor] = None,
139
+ h_ntk_factor: Optional[float] = None,
140
+ w_ntk_factor: Optional[float] = None,
141
+ t_ntk_factor: Optional[float] = None,
142
+ device=None,
143
+ dtype=None,
144
+ ):
145
+ """
146
+ Generate embeddings for the given input size.
147
+
148
+ Args:
149
+ B_T_H_W_C (torch.Size): Input tensor size (Batch, Time, Height, Width, Channels).
150
+ fps (Optional[torch.Tensor], optional): Frames per second. Defaults to None.
151
+ h_ntk_factor (Optional[float], optional): Height NTK factor. If None, uses self.h_ntk_factor.
152
+ w_ntk_factor (Optional[float], optional): Width NTK factor. If None, uses self.w_ntk_factor.
153
+ t_ntk_factor (Optional[float], optional): Time NTK factor. If None, uses self.t_ntk_factor.
154
+
155
+ Returns:
156
+ torch.Tensor: Rotary embedding frequencies of shape (T*H*W, head_dim // 2, 2, 2), stored as per-frequency 2x2 rotation matrices.
157
+ """
158
+ h_ntk_factor = h_ntk_factor if h_ntk_factor is not None else self.h_ntk_factor
159
+ w_ntk_factor = w_ntk_factor if w_ntk_factor is not None else self.w_ntk_factor
160
+ t_ntk_factor = t_ntk_factor if t_ntk_factor is not None else self.t_ntk_factor
161
+
162
+ h_theta = 10000.0 * h_ntk_factor
163
+ w_theta = 10000.0 * w_ntk_factor
164
+ t_theta = 10000.0 * t_ntk_factor
165
+
166
+ h_spatial_freqs = 1.0 / (h_theta**self.dim_spatial_range.to(device=device))
167
+ w_spatial_freqs = 1.0 / (w_theta**self.dim_spatial_range.to(device=device))
168
+ temporal_freqs = 1.0 / (t_theta**self.dim_temporal_range.to(device=device))
169
+
170
+ B, T, H, W, _ = B_T_H_W_C
171
+ seq = torch.arange(max(H, W, T), dtype=torch.float, device=device)
172
+ uniform_fps = (fps is None) or isinstance(fps, (int, float)) or (fps.min() == fps.max())
173
+ assert (
174
+ uniform_fps or B == 1 or T == 1
175
+ ), "For video batch, batch size should be 1 for non-uniform fps. For image batch, T should be 1"
176
+ half_emb_h = torch.outer(seq[:H].to(device=device), h_spatial_freqs)
177
+ half_emb_w = torch.outer(seq[:W].to(device=device), w_spatial_freqs)
178
+
179
+ # apply sequence scaling in temporal dimension
180
+ if fps is None or self.enable_fps_modulation is False: # image case
181
+ half_emb_t = torch.outer(seq[:T].to(device=device), temporal_freqs)
182
+ else:
183
+ half_emb_t = torch.outer(seq[:T].to(device=device) / fps * self.base_fps, temporal_freqs)
184
+
185
+ half_emb_h = torch.stack([torch.cos(half_emb_h), -torch.sin(half_emb_h), torch.sin(half_emb_h), torch.cos(half_emb_h)], dim=-1)
186
+ half_emb_w = torch.stack([torch.cos(half_emb_w), -torch.sin(half_emb_w), torch.sin(half_emb_w), torch.cos(half_emb_w)], dim=-1)
187
+ half_emb_t = torch.stack([torch.cos(half_emb_t), -torch.sin(half_emb_t), torch.sin(half_emb_t), torch.cos(half_emb_t)], dim=-1)
188
+
189
+ em_T_H_W_D = torch.cat(
190
+ [
191
+ repeat(half_emb_t, "t d x -> t h w d x", h=H, w=W),
192
+ repeat(half_emb_h, "h d x -> t h w d x", t=T, w=W),
193
+ repeat(half_emb_w, "w d x -> t h w d x", t=T, h=H),
194
+ ]
195
+ , dim=-2,
196
+ )
197
+
198
+ return rearrange(em_T_H_W_D, "t h w d (i j) -> (t h w) d i j", i=2, j=2).float()
199
+
200
+
201
+ def apply_rotary_pos_emb(
202
+ t: torch.Tensor,
203
+ freqs: torch.Tensor,
204
+ ) -> torch.Tensor:
205
+ t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float()
206
+ t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1]
207
+ t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t)
208
+ return t_out
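+
+
+ # Quick sanity sketch (shapes are assumptions): the (S, D//2, 2, 2) rotation matrices
+ # produced by VideoRopePosition3DEmb.generate_embeddings rotate coordinate pairs without
+ # changing vector norms.
+ def _demo_apply_rotary_pos_emb():
+     emb = VideoRopePosition3DEmb(head_dim=64, len_h=8, len_w=8, len_t=4)
+     freqs = emb.generate_embeddings(torch.Size([1, 4, 8, 8, 64]))  # (4*8*8, 32, 2, 2)
+     q = torch.randn(1, 4 * 8 * 8, 2, 64)  # (B, S, n_heads, head_dim)
+     q_rot = apply_rotary_pos_emb(q, freqs.unsqueeze(1).unsqueeze(0))
+     assert torch.allclose(q.norm(dim=-1), q_rot.norm(dim=-1), atol=1e-4)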
209
+
210
+
211
+ # ---------------------- Feed Forward Network -----------------------
212
+ class GPT2FeedForward(nn.Module):
213
+ def __init__(self, d_model: int, d_ff: int, device=None, dtype=None, operations=None) -> None:
214
+ super().__init__()
215
+ self.activation = nn.GELU()
216
+ self.layer1 = operations.Linear(d_model, d_ff, bias=False, device=device, dtype=dtype)
217
+ self.layer2 = operations.Linear(d_ff, d_model, bias=False, device=device, dtype=dtype)
218
+
219
+ self._layer_id = None
220
+ self._dim = d_model
221
+ self._hidden_dim = d_ff
222
+
223
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
224
+ x = self.layer1(x)
225
+
226
+ x = self.activation(x)
227
+ x = self.layer2(x)
228
+ return x
229
+
230
+
231
+ def torch_attention_op(q_B_S_H_D: torch.Tensor, k_B_S_H_D: torch.Tensor, v_B_S_H_D: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor:
232
+ """Computes multi-head attention using PyTorch's native implementation.
233
+
234
+ This function provides a PyTorch backend alternative to Transformer Engine's attention operation.
235
+ It rearranges the input tensors to match PyTorch's expected format, computes scaled dot-product
236
+ attention, and rearranges the output back to the original format.
237
+
238
+ The input tensor names use the following dimension conventions:
239
+
240
+ - B: batch size
241
+ - S: sequence length
242
+ - H: number of attention heads
243
+ - D: head dimension
244
+
245
+ Args:
246
+ q_B_S_H_D: Query tensor with shape (batch, seq_len, n_heads, head_dim)
247
+ k_B_S_H_D: Key tensor with shape (batch, seq_len, n_heads, head_dim)
248
+ v_B_S_H_D: Value tensor with shape (batch, seq_len, n_heads, head_dim)
249
+
250
+ Returns:
251
+ Attention output tensor with shape (batch, seq_len, n_heads * head_dim)
252
+ """
253
+ in_q_shape = q_B_S_H_D.shape
254
+ in_k_shape = k_B_S_H_D.shape
255
+ q_B_H_S_D = rearrange(q_B_S_H_D, "b ... h k -> b h ... k").view(in_q_shape[0], in_q_shape[-2], -1, in_q_shape[-1])
256
+ k_B_H_S_D = rearrange(k_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
257
+ v_B_H_S_D = rearrange(v_B_S_H_D, "b ... h v -> b h ... v").view(in_k_shape[0], in_k_shape[-2], -1, in_k_shape[-1])
258
+ return attention_forward(q_B_H_S_D, k_B_H_S_D, v_B_H_S_D, out_pattern="b s (n d)")
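+
+
+ # Usage sketch (shapes are assumptions): heads are merged into the last output dim,
+ # matching the "b s (n d)" out_pattern documented for attention_forward.
+ def _demo_torch_attention_op():
+     q = torch.randn(2, 10, 4, 8)  # (B, S, H, D)
+     out = torch_attention_op(q, q, q)
+     assert out.shape == (2, 10, 32)  # (B, S, H * D)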
259
+
260
+
261
+ class Attention(nn.Module):
262
+ """
263
+ A flexible attention module supporting both self-attention and cross-attention mechanisms.
264
+
265
+ This module implements a multi-head attention layer that can operate in either self-attention
266
+ or cross-attention mode. The mode is determined by whether a context dimension is provided.
267
+ The implementation uses scaled dot-product attention and supports optional bias terms and
268
+ dropout regularization.
269
+
270
+ Args:
271
+ query_dim (int): The dimensionality of the query vectors.
272
+ context_dim (int, optional): The dimensionality of the context (key/value) vectors.
273
+ If None, the module operates in self-attention mode using query_dim. Default: None
274
+ n_heads (int, optional): Number of attention heads for multi-head attention. Default: 8
275
+ head_dim (int, optional): The dimension of each attention head. Default: 64
276
+ dropout (float, optional): Dropout probability applied to the output. Default: 0.0
277
+ device, dtype (optional): Device and dtype used to allocate the projection weights.
+ operations: Namespace providing the Linear and RMSNorm implementations (e.g. torch.nn).
279
+
280
+ Examples:
281
+ >>> # Self-attention with 512 dimensions and 8 heads
282
+ >>> self_attn = Attention(query_dim=512, operations=torch.nn)
283
+ >>> x = torch.randn(32, 16, 512) # (batch_size, seq_len, dim)
284
+ >>> out = self_attn(x) # (32, 16, 512)
285
+
286
+ >>> # Cross-attention
287
+ >>> cross_attn = Attention(query_dim=512, context_dim=256, operations=torch.nn)
288
+ >>> query = torch.randn(32, 16, 512)
289
+ >>> context = torch.randn(32, 8, 256)
290
+ >>> out = cross_attn(query, context) # (32, 16, 512)
291
+ """
292
+
293
+ def __init__(
294
+ self,
295
+ query_dim: int,
296
+ context_dim: Optional[int] = None,
297
+ n_heads: int = 8,
298
+ head_dim: int = 64,
299
+ dropout: float = 0.0,
300
+ device=None,
301
+ dtype=None,
302
+ operations=None,
303
+ ) -> None:
304
+ super().__init__()
305
+ logging.debug(
306
+ f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
307
+ f"{n_heads} heads with a dimension of {head_dim}."
308
+ )
309
+ self.is_selfattn = context_dim is None # self attention
310
+
311
+ context_dim = query_dim if context_dim is None else context_dim
312
+ inner_dim = head_dim * n_heads
313
+
314
+ self.n_heads = n_heads
315
+ self.head_dim = head_dim
316
+ self.query_dim = query_dim
317
+ self.context_dim = context_dim
318
+
319
+ self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, device=device, dtype=dtype)
320
+ self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
321
+
322
+ self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
323
+ self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
324
+
325
+ self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
326
+ self.v_norm = nn.Identity()
327
+
328
+ self.output_proj = operations.Linear(inner_dim, query_dim, bias=False, device=device, dtype=dtype)
329
+ self.output_dropout = nn.Dropout(dropout) if dropout > 1e-4 else nn.Identity()
330
+
331
+ self.attn_op = torch_attention_op
332
+
333
+ self._query_dim = query_dim
334
+ self._context_dim = context_dim
335
+ self._inner_dim = inner_dim
336
+
337
+ def compute_qkv(
338
+ self,
339
+ x: torch.Tensor,
340
+ context: Optional[torch.Tensor] = None,
341
+ rope_emb: Optional[torch.Tensor] = None,
342
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
343
+ q = self.q_proj(x)
344
+ context = x if context is None else context
345
+ k = self.k_proj(context)
346
+ v = self.v_proj(context)
347
+ q, k, v = map(
348
+ lambda t: rearrange(t, "b ... (h d) -> b ... h d", h=self.n_heads, d=self.head_dim),
349
+ (q, k, v),
350
+ )
351
+
352
+ def apply_norm_and_rotary_pos_emb(
353
+ q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, rope_emb: Optional[torch.Tensor]
354
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
355
+ q = self.q_norm(q)
356
+ k = self.k_norm(k)
357
+ v = self.v_norm(v)
358
+ if self.is_selfattn and rope_emb is not None: # only apply to self-attention!
359
+ q = apply_rotary_pos_emb(q, rope_emb)
360
+ k = apply_rotary_pos_emb(k, rope_emb)
361
+ return q, k, v
362
+
363
+ q, k, v = apply_norm_and_rotary_pos_emb(q, k, v, rope_emb)
364
+
365
+ return q, k, v
366
+
367
+ def compute_attention(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, transformer_options: Optional[dict] = {}) -> torch.Tensor:
368
+ result = self.attn_op(q, k, v, transformer_options=transformer_options) # [B, S, H, D]
369
+ return self.output_dropout(self.output_proj(result))
370
+
371
+ def forward(
372
+ self,
373
+ x: torch.Tensor,
374
+ context: Optional[torch.Tensor] = None,
375
+ rope_emb: Optional[torch.Tensor] = None,
376
+ transformer_options: Optional[dict] = {},
377
+ ) -> torch.Tensor:
378
+ """
379
+ Args:
380
+ x (Tensor): The query tensor of shape [B, Mq, K]
381
+ context (Optional[Tensor]): The context tensor of shape [B, Mk, K]; if None, x is used as the context (self-attention)
382
+ """
383
+ q, k, v = self.compute_qkv(x, context, rope_emb=rope_emb)
384
+ return self.compute_attention(q, k, v, transformer_options=transformer_options)
385
+
386
+
387
+ class Timesteps(nn.Module):
388
+ def __init__(self, num_channels: int):
389
+ super().__init__()
390
+ self.num_channels = num_channels
391
+
392
+ def forward(self, timesteps_B_T: torch.Tensor) -> torch.Tensor:
393
+ assert timesteps_B_T.ndim == 2, f"Expected 2D input, got {timesteps_B_T.ndim}"
394
+ timesteps = timesteps_B_T.flatten().float()
395
+ half_dim = self.num_channels // 2
396
+ exponent = -math.log(10000) * torch.arange(half_dim, dtype=torch.float32, device=timesteps.device)
397
+ exponent = exponent / half_dim  # no downscale-frequency shift
398
+
399
+ emb = torch.exp(exponent)
400
+ emb = timesteps[:, None].float() * emb[None, :]
401
+
402
+ sin_emb = torch.sin(emb)
403
+ cos_emb = torch.cos(emb)
404
+ emb = torch.cat([cos_emb, sin_emb], dim=-1)
405
+
406
+ return rearrange(emb, "(b t) d -> b t d", b=timesteps_B_T.shape[0], t=timesteps_B_T.shape[1])
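+
+
+ # Usage sketch: per-(batch, frame) sinusoidal embeddings, cos half followed by sin half.
+ def _demo_timesteps():
+     te = Timesteps(num_channels=64)
+     t = torch.tensor([[0.0, 500.0], [999.0, 10.0]])  # (B, T)
+     assert te(t).shape == (2, 2, 64)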
407
+
408
+
409
+ class TimestepEmbedding(nn.Module):
410
+ def __init__(self, in_features: int, out_features: int, use_adaln_lora: bool = False, device=None, dtype=None, operations=None):
411
+ super().__init__()
412
+ logging.debug(
413
+ f"Using AdaLN LoRA Flag: {use_adaln_lora}. We enable bias if no AdaLN LoRA for backward compatibility."
414
+ )
415
+ self.in_dim = in_features
416
+ self.out_dim = out_features
417
+ self.linear_1 = operations.Linear(in_features, out_features, bias=not use_adaln_lora, device=device, dtype=dtype)
418
+ self.activation = nn.SiLU()
419
+ self.use_adaln_lora = use_adaln_lora
420
+ if use_adaln_lora:
421
+ self.linear_2 = operations.Linear(out_features, 3 * out_features, bias=False, device=device, dtype=dtype)
422
+ else:
423
+ self.linear_2 = operations.Linear(out_features, out_features, bias=False, device=device, dtype=dtype)
424
+
425
+ def forward(self, sample: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
426
+ emb = self.linear_1(sample)
427
+ emb = self.activation(emb)
428
+ emb = self.linear_2(emb)
429
+
430
+ if self.use_adaln_lora:
431
+ adaln_lora_B_T_3D = emb
432
+ emb_B_T_D = sample
433
+ else:
434
+ adaln_lora_B_T_3D = None
435
+ emb_B_T_D = emb
436
+
437
+ return emb_B_T_D, adaln_lora_B_T_3D
438
+
439
+
440
+ class PatchEmbed(nn.Module):
441
+ """
442
+ PatchEmbed is a module for embedding patches from an input tensor by flattening non-overlapping
+ spatio-temporal patches and projecting them with a linear layer. This module can process inputs with temporal (video) and spatial (image) dimensions,
444
+ making it suitable for video and image processing tasks. It supports dividing the input into patches
445
+ and embedding each patch into a vector of size `out_channels`.
446
+
447
+ Parameters:
448
+ - spatial_patch_size (int): The size of each spatial patch.
449
+ - temporal_patch_size (int): The size of each temporal patch.
450
+ - in_channels (int): Number of input channels. Default: 3.
451
+ - out_channels (int): The dimension of the embedding vector for each patch. Default: 768.
452
+ - bias (bool): If True, adds a learnable bias to the output of the convolutional layers. Default: True.
453
+ """
454
+
455
+ def __init__(
456
+ self,
457
+ spatial_patch_size: int,
458
+ temporal_patch_size: int,
459
+ in_channels: int = 3,
460
+ out_channels: int = 768,
461
+ device=None, dtype=None, operations=None
462
+ ):
463
+ super().__init__()
464
+ self.spatial_patch_size = spatial_patch_size
465
+ self.temporal_patch_size = temporal_patch_size
466
+
467
+ self.proj = nn.Sequential(
468
+ Rearrange(
469
+ "b c (t r) (h m) (w n) -> b t h w (c r m n)",
470
+ r=temporal_patch_size,
471
+ m=spatial_patch_size,
472
+ n=spatial_patch_size,
473
+ ),
474
+ operations.Linear(
475
+ in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size, out_channels, bias=False, device=device, dtype=dtype
476
+ ),
477
+ )
478
+ self.dim = in_channels * spatial_patch_size * spatial_patch_size * temporal_patch_size
479
+
480
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
481
+ """
482
+ Forward pass of the PatchEmbed module.
483
+
484
+ Parameters:
485
+ - x (torch.Tensor): The input tensor of shape (B, C, T, H, W) where
486
+ B is the batch size,
487
+ C is the number of channels,
488
+ T is the temporal dimension,
489
+ H is the height, and
490
+ W is the width of the input.
491
+
492
+ Returns:
493
+ - torch.Tensor: The embedded patches as a tensor, with shape b t h w c.
494
+ """
495
+ assert x.dim() == 5
496
+ _, _, T, H, W = x.shape
497
+ assert (
498
+ H % self.spatial_patch_size == 0 and W % self.spatial_patch_size == 0
499
+ ), f"H,W {(H, W)} should be divisible by spatial_patch_size {self.spatial_patch_size}"
500
+ assert T % self.temporal_patch_size == 0
501
+ x = self.proj(x)
502
+ return x
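+
+
+ # Usage sketch, assuming plain torch.nn as the operations namespace: each 1x2x2 patch of
+ # a 16-channel latent is flattened and projected to 32 channels.
+ def _demo_patch_embed():
+     pe = PatchEmbed(spatial_patch_size=2, temporal_patch_size=1, in_channels=16,
+                     out_channels=32, operations=torch.nn)
+     x = torch.randn(1, 16, 4, 8, 8)  # (B, C, T, H, W)
+     assert pe(x).shape == (1, 4, 4, 4, 32)  # (B, T, H/2, W/2, out_channels)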
503
+
504
+
505
+ class FinalLayer(nn.Module):
506
+ """
507
+ The final layer of video DiT.
508
+ """
509
+
510
+ def __init__(
511
+ self,
512
+ hidden_size: int,
513
+ spatial_patch_size: int,
514
+ temporal_patch_size: int,
515
+ out_channels: int,
516
+ use_adaln_lora: bool = False,
517
+ adaln_lora_dim: int = 256,
518
+ device=None, dtype=None, operations=None
519
+ ):
520
+ super().__init__()
521
+ self.layer_norm = operations.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
522
+ self.linear = operations.Linear(
523
+ hidden_size, spatial_patch_size * spatial_patch_size * temporal_patch_size * out_channels, bias=False, device=device, dtype=dtype
524
+ )
525
+ self.hidden_size = hidden_size
526
+ self.n_adaln_chunks = 2
527
+ self.use_adaln_lora = use_adaln_lora
528
+ self.adaln_lora_dim = adaln_lora_dim
529
+ if use_adaln_lora:
530
+ self.adaln_modulation = nn.Sequential(
531
+ nn.SiLU(),
532
+ operations.Linear(hidden_size, adaln_lora_dim, bias=False, device=device, dtype=dtype),
533
+ operations.Linear(adaln_lora_dim, self.n_adaln_chunks * hidden_size, bias=False, device=device, dtype=dtype),
534
+ )
535
+ else:
536
+ self.adaln_modulation = nn.Sequential(
537
+ nn.SiLU(), operations.Linear(hidden_size, self.n_adaln_chunks * hidden_size, bias=False, device=device, dtype=dtype)
538
+ )
539
+
540
+ def forward(
541
+ self,
542
+ x_B_T_H_W_D: torch.Tensor,
543
+ emb_B_T_D: torch.Tensor,
544
+ adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
545
+ ):
546
+ if self.use_adaln_lora:
547
+ assert adaln_lora_B_T_3D is not None
548
+ shift_B_T_D, scale_B_T_D = (
549
+ self.adaln_modulation(emb_B_T_D) + adaln_lora_B_T_3D[:, :, : 2 * self.hidden_size]
550
+ ).chunk(2, dim=-1)
551
+ else:
552
+ shift_B_T_D, scale_B_T_D = self.adaln_modulation(emb_B_T_D).chunk(2, dim=-1)
553
+
554
+ shift_B_T_1_1_D, scale_B_T_1_1_D = rearrange(shift_B_T_D, "b t d -> b t 1 1 d"), rearrange(
555
+ scale_B_T_D, "b t d -> b t 1 1 d"
556
+ )
557
+
558
+ def _fn(
559
+ _x_B_T_H_W_D: torch.Tensor,
560
+ _norm_layer: nn.Module,
561
+ _scale_B_T_1_1_D: torch.Tensor,
562
+ _shift_B_T_1_1_D: torch.Tensor,
563
+ ) -> torch.Tensor:
564
+ return _norm_layer(_x_B_T_H_W_D) * (1 + _scale_B_T_1_1_D) + _shift_B_T_1_1_D
565
+
566
+ x_B_T_H_W_D = _fn(x_B_T_H_W_D, self.layer_norm, scale_B_T_1_1_D, shift_B_T_1_1_D)
567
+ x_B_T_H_W_O = self.linear(x_B_T_H_W_D)
568
+ return x_B_T_H_W_O
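+
+
+ # Usage sketch (illustrative sizes): the AdaLN shift/scale come from emb_B_T_D, and the
+ # output dim equals spatial_patch_size**2 * temporal_patch_size * out_channels.
+ def _demo_final_layer():
+     fl = FinalLayer(hidden_size=32, spatial_patch_size=2, temporal_patch_size=1,
+                     out_channels=16, operations=torch.nn)
+     x = torch.randn(1, 4, 4, 4, 32)
+     emb = torch.randn(1, 4, 32)
+     assert fl(x, emb).shape == (1, 4, 4, 4, 64)  # 2 * 2 * 1 * 16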
569
+
570
+
571
+ class Block(nn.Module):
572
+ """
573
+ A transformer block that combines self-attention, cross-attention and MLP layers with AdaLN modulation.
574
+ Each component (self-attention, cross-attention, MLP) has its own layer normalization and AdaLN modulation.
575
+
576
+ Parameters:
577
+ x_dim (int): Dimension of input features
578
+ context_dim (int): Dimension of context features for cross-attention
579
+ num_heads (int): Number of attention heads
580
+ mlp_ratio (float): Multiplier for MLP hidden dimension. Default: 4.0
581
+ use_adaln_lora (bool): Whether to use AdaLN-LoRA modulation. Default: False
582
+ adaln_lora_dim (int): Hidden dimension for AdaLN-LoRA layers. Default: 256
583
+
584
+ The block applies the following sequence:
585
+ 1. Self-attention with AdaLN modulation
586
+ 2. Cross-attention with AdaLN modulation
587
+ 3. MLP with AdaLN modulation
588
+
589
+ Each component uses skip connections and layer normalization.
590
+ """
591
+
592
+ def __init__(
593
+ self,
594
+ x_dim: int,
595
+ context_dim: int,
596
+ num_heads: int,
597
+ mlp_ratio: float = 4.0,
598
+ use_adaln_lora: bool = False,
599
+ adaln_lora_dim: int = 256,
600
+ device=None,
601
+ dtype=None,
602
+ operations=None,
603
+ ):
604
+ super().__init__()
605
+ self.x_dim = x_dim
606
+ self.layer_norm_self_attn = operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype)
607
+ self.self_attn = Attention(x_dim, None, num_heads, x_dim // num_heads, device=device, dtype=dtype, operations=operations)
608
+
609
+ self.layer_norm_cross_attn = operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype)
610
+ self.cross_attn = Attention(
611
+ x_dim, context_dim, num_heads, x_dim // num_heads, device=device, dtype=dtype, operations=operations
612
+ )
613
+
614
+ self.layer_norm_mlp = operations.LayerNorm(x_dim, elementwise_affine=False, eps=1e-6, device=device, dtype=dtype)
615
+ self.mlp = GPT2FeedForward(x_dim, int(x_dim * mlp_ratio), device=device, dtype=dtype, operations=operations)
616
+
617
+ self.use_adaln_lora = use_adaln_lora
618
+ if self.use_adaln_lora:
619
+ self.adaln_modulation_self_attn = nn.Sequential(
620
+ nn.SiLU(),
621
+ operations.Linear(x_dim, adaln_lora_dim, bias=False, device=device, dtype=dtype),
622
+ operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, device=device, dtype=dtype),
623
+ )
624
+ self.adaln_modulation_cross_attn = nn.Sequential(
625
+ nn.SiLU(),
626
+ operations.Linear(x_dim, adaln_lora_dim, bias=False, device=device, dtype=dtype),
627
+ operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, device=device, dtype=dtype),
628
+ )
629
+ self.adaln_modulation_mlp = nn.Sequential(
630
+ nn.SiLU(),
631
+ operations.Linear(x_dim, adaln_lora_dim, bias=False, device=device, dtype=dtype),
632
+ operations.Linear(adaln_lora_dim, 3 * x_dim, bias=False, device=device, dtype=dtype),
633
+ )
634
+ else:
635
+ self.adaln_modulation_self_attn = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, 3 * x_dim, bias=False, device=device, dtype=dtype))
636
+ self.adaln_modulation_cross_attn = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, 3 * x_dim, bias=False, device=device, dtype=dtype))
637
+ self.adaln_modulation_mlp = nn.Sequential(nn.SiLU(), operations.Linear(x_dim, 3 * x_dim, bias=False, device=device, dtype=dtype))
638
+
639
+ def forward(
640
+ self,
641
+ x_B_T_H_W_D: torch.Tensor,
642
+ emb_B_T_D: torch.Tensor,
643
+ crossattn_emb: torch.Tensor,
644
+ rope_emb_L_1_1_D: Optional[torch.Tensor] = None,
645
+ adaln_lora_B_T_3D: Optional[torch.Tensor] = None,
646
+ extra_per_block_pos_emb: Optional[torch.Tensor] = None,
647
+ transformer_options: Optional[dict] = {},
648
+ ) -> torch.Tensor:
649
+ residual_dtype = x_B_T_H_W_D.dtype
650
+ compute_dtype = emb_B_T_D.dtype
651
+ if extra_per_block_pos_emb is not None:
652
+ x_B_T_H_W_D = x_B_T_H_W_D + extra_per_block_pos_emb
653
+
654
+ if self.use_adaln_lora:
655
+ shift_self_attn_B_T_D, scale_self_attn_B_T_D, gate_self_attn_B_T_D = (
656
+ self.adaln_modulation_self_attn(emb_B_T_D) + adaln_lora_B_T_3D
657
+ ).chunk(3, dim=-1)
658
+ shift_cross_attn_B_T_D, scale_cross_attn_B_T_D, gate_cross_attn_B_T_D = (
659
+ self.adaln_modulation_cross_attn(emb_B_T_D) + adaln_lora_B_T_3D
660
+ ).chunk(3, dim=-1)
661
+ shift_mlp_B_T_D, scale_mlp_B_T_D, gate_mlp_B_T_D = (
662
+ self.adaln_modulation_mlp(emb_B_T_D) + adaln_lora_B_T_3D
663
+ ).chunk(3, dim=-1)
664
+ else:
665
+ shift_self_attn_B_T_D, scale_self_attn_B_T_D, gate_self_attn_B_T_D = self.adaln_modulation_self_attn(
666
+ emb_B_T_D
667
+ ).chunk(3, dim=-1)
668
+ shift_cross_attn_B_T_D, scale_cross_attn_B_T_D, gate_cross_attn_B_T_D = self.adaln_modulation_cross_attn(
669
+ emb_B_T_D
670
+ ).chunk(3, dim=-1)
671
+ shift_mlp_B_T_D, scale_mlp_B_T_D, gate_mlp_B_T_D = self.adaln_modulation_mlp(emb_B_T_D).chunk(3, dim=-1)
672
+
673
+ # Reshape tensors from (B, T, D) to (B, T, 1, 1, D) for broadcasting
674
+ shift_self_attn_B_T_1_1_D = rearrange(shift_self_attn_B_T_D, "b t d -> b t 1 1 d")
675
+ scale_self_attn_B_T_1_1_D = rearrange(scale_self_attn_B_T_D, "b t d -> b t 1 1 d")
676
+ gate_self_attn_B_T_1_1_D = rearrange(gate_self_attn_B_T_D, "b t d -> b t 1 1 d")
677
+
678
+ shift_cross_attn_B_T_1_1_D = rearrange(shift_cross_attn_B_T_D, "b t d -> b t 1 1 d")
679
+ scale_cross_attn_B_T_1_1_D = rearrange(scale_cross_attn_B_T_D, "b t d -> b t 1 1 d")
680
+ gate_cross_attn_B_T_1_1_D = rearrange(gate_cross_attn_B_T_D, "b t d -> b t 1 1 d")
681
+
682
+ shift_mlp_B_T_1_1_D = rearrange(shift_mlp_B_T_D, "b t d -> b t 1 1 d")
683
+ scale_mlp_B_T_1_1_D = rearrange(scale_mlp_B_T_D, "b t d -> b t 1 1 d")
684
+ gate_mlp_B_T_1_1_D = rearrange(gate_mlp_B_T_D, "b t d -> b t 1 1 d")
685
+
686
+ B, T, H, W, D = x_B_T_H_W_D.shape
687
+
688
+ def _fn(_x_B_T_H_W_D, _norm_layer, _scale_B_T_1_1_D, _shift_B_T_1_1_D):
689
+ return _norm_layer(_x_B_T_H_W_D) * (1 + _scale_B_T_1_1_D) + _shift_B_T_1_1_D
690
+
691
+ normalized_x_B_T_H_W_D = _fn(
692
+ x_B_T_H_W_D,
693
+ self.layer_norm_self_attn,
694
+ scale_self_attn_B_T_1_1_D,
695
+ shift_self_attn_B_T_1_1_D,
696
+ )
697
+ result_B_T_H_W_D = rearrange(
698
+ self.self_attn(
699
+ # normalized_x_B_T_HW_D,
700
+ rearrange(normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
701
+ None,
702
+ rope_emb=rope_emb_L_1_1_D,
703
+ transformer_options=transformer_options,
704
+ ),
705
+ "b (t h w) d -> b t h w d",
706
+ t=T,
707
+ h=H,
708
+ w=W,
709
+ )
710
+ x_B_T_H_W_D = x_B_T_H_W_D + gate_self_attn_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
711
+
712
+ def _x_fn(
713
+ _x_B_T_H_W_D: torch.Tensor,
714
+ layer_norm_cross_attn: Callable,
715
+ _scale_cross_attn_B_T_1_1_D: torch.Tensor,
716
+ _shift_cross_attn_B_T_1_1_D: torch.Tensor,
717
+ transformer_options: Optional[dict] = {},
718
+ ) -> torch.Tensor:
719
+ _normalized_x_B_T_H_W_D = _fn(
720
+ _x_B_T_H_W_D, layer_norm_cross_attn, _scale_cross_attn_B_T_1_1_D, _shift_cross_attn_B_T_1_1_D
721
+ )
722
+ _result_B_T_H_W_D = rearrange(
723
+ self.cross_attn(
724
+ rearrange(_normalized_x_B_T_H_W_D.to(compute_dtype), "b t h w d -> b (t h w) d"),
725
+ crossattn_emb,
726
+ rope_emb=rope_emb_L_1_1_D,
727
+ transformer_options=transformer_options,
728
+ ),
729
+ "b (t h w) d -> b t h w d",
730
+ t=T,
731
+ h=H,
732
+ w=W,
733
+ )
734
+ return _result_B_T_H_W_D
735
+
736
+ result_B_T_H_W_D = _x_fn(
737
+ x_B_T_H_W_D,
738
+ self.layer_norm_cross_attn,
739
+ scale_cross_attn_B_T_1_1_D,
740
+ shift_cross_attn_B_T_1_1_D,
741
+ transformer_options=transformer_options,
742
+ )
743
+ x_B_T_H_W_D = result_B_T_H_W_D.to(residual_dtype) * gate_cross_attn_B_T_1_1_D.to(residual_dtype) + x_B_T_H_W_D
744
+
745
+ normalized_x_B_T_H_W_D = _fn(
746
+ x_B_T_H_W_D,
747
+ self.layer_norm_mlp,
748
+ scale_mlp_B_T_1_1_D,
749
+ shift_mlp_B_T_1_1_D,
750
+ )
751
+ result_B_T_H_W_D = self.mlp(normalized_x_B_T_H_W_D.to(compute_dtype))
752
+ x_B_T_H_W_D = x_B_T_H_W_D + gate_mlp_B_T_1_1_D.to(residual_dtype) * result_B_T_H_W_D.to(residual_dtype)
753
+ return x_B_T_H_W_D
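+
+
+ # Usage sketch with tiny illustrative dims, assuming torch.nn as the operations namespace
+ # (requires a PyTorch version that provides torch.nn.RMSNorm). With no RoPE and no
+ # AdaLN-LoRA, the block maps (B, T, H, W, D) to the same shape.
+ def _demo_block():
+     blk = Block(x_dim=32, context_dim=24, num_heads=4, operations=torch.nn)
+     x = torch.randn(1, 2, 4, 4, 32)
+     emb = torch.randn(1, 2, 32)
+     ctx = torch.randn(1, 7, 24)
+     assert blk(x, emb, ctx).shape == (1, 2, 4, 4, 32)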
754
+
755
+
756
+ class MiniTrainDIT(nn.Module):
757
+ """
758
+ A clean implementation of DiT that can load and reproduce the training results of the original DiT model in Cosmos 1.
+ A general implementation of an AdaLN-modulated ViT-like (DiT) transformer for video processing.
760
+
761
+ Args:
762
+ max_img_h (int): Maximum height of the input images.
763
+ max_img_w (int): Maximum width of the input images.
764
+ max_frames (int): Maximum number of frames in the video sequence.
765
+ in_channels (int): Number of input channels (e.g., RGB channels for color images).
766
+ out_channels (int): Number of output channels.
767
+ patch_spatial (tuple): Spatial resolution of patches for input processing.
768
+ patch_temporal (int): Temporal resolution of patches for input processing.
769
+ concat_padding_mask (bool): If True, includes a mask channel in the input to handle padding.
770
+ model_channels (int): Base number of channels used throughout the model.
771
+ num_blocks (int): Number of transformer blocks.
772
+ num_heads (int): Number of heads in the multi-head attention layers.
773
+ mlp_ratio (float): Expansion ratio for MLP blocks.
774
+ crossattn_emb_channels (int): Number of embedding channels for cross-attention.
775
+ pos_emb_cls (str): Type of positional embeddings.
776
+ pos_emb_learnable (bool): Whether positional embeddings are learnable.
777
+ pos_emb_interpolation (str): Method for interpolating positional embeddings.
778
+ min_fps (int): Minimum frames per second.
779
+ max_fps (int): Maximum frames per second.
780
+ use_adaln_lora (bool): Whether to use AdaLN-LoRA.
781
+ adaln_lora_dim (int): Dimension for AdaLN-LoRA.
782
+ rope_h_extrapolation_ratio (float): Height extrapolation ratio for RoPE.
783
+ rope_w_extrapolation_ratio (float): Width extrapolation ratio for RoPE.
784
+ rope_t_extrapolation_ratio (float): Temporal extrapolation ratio for RoPE.
785
+ extra_per_block_abs_pos_emb (bool): Whether to use extra per-block absolute positional embeddings.
786
+ extra_h_extrapolation_ratio (float): Height extrapolation ratio for extra embeddings.
787
+ extra_w_extrapolation_ratio (float): Width extrapolation ratio for extra embeddings.
788
+ extra_t_extrapolation_ratio (float): Temporal extrapolation ratio for extra embeddings.
789
+ """
790
+
791
+ def __init__(
792
+ self,
793
+ max_img_h: int,
794
+ max_img_w: int,
795
+ max_frames: int,
796
+ in_channels: int,
797
+ out_channels: int,
798
+ patch_spatial: int, # tuple,
799
+ patch_temporal: int,
800
+ concat_padding_mask: bool = True,
801
+ # attention settings
802
+ model_channels: int = 768,
803
+ num_blocks: int = 10,
804
+ num_heads: int = 16,
805
+ mlp_ratio: float = 4.0,
806
+ # cross attention settings
807
+ crossattn_emb_channels: int = 1024,
808
+ # positional embedding settings
809
+ pos_emb_cls: str = "sincos",
810
+ pos_emb_learnable: bool = False,
811
+ pos_emb_interpolation: str = "crop",
812
+ min_fps: int = 1,
813
+ max_fps: int = 30,
814
+ use_adaln_lora: bool = False,
815
+ adaln_lora_dim: int = 256,
816
+ rope_h_extrapolation_ratio: float = 1.0,
817
+ rope_w_extrapolation_ratio: float = 1.0,
818
+ rope_t_extrapolation_ratio: float = 1.0,
819
+ extra_per_block_abs_pos_emb: bool = False,
820
+ extra_h_extrapolation_ratio: float = 1.0,
821
+ extra_w_extrapolation_ratio: float = 1.0,
822
+ extra_t_extrapolation_ratio: float = 1.0,
823
+ rope_enable_fps_modulation: bool = True,
824
+ image_model=None,
825
+ device=None,
826
+ dtype=None,
827
+ operations=None,
828
+ ) -> None:
829
+ super().__init__()
830
+ self.dtype = dtype
831
+ self.max_img_h = max_img_h
832
+ self.max_img_w = max_img_w
833
+ self.max_frames = max_frames
834
+ self.in_channels = in_channels
835
+ self.out_channels = out_channels
836
+ self.patch_spatial = patch_spatial
837
+ self.patch_temporal = patch_temporal
838
+ self.num_heads = num_heads
839
+ self.num_blocks = num_blocks
840
+ self.model_channels = model_channels
841
+ self.concat_padding_mask = concat_padding_mask
842
+ # positional embedding settings
843
+ self.pos_emb_cls = pos_emb_cls
844
+ self.pos_emb_learnable = pos_emb_learnable
845
+ self.pos_emb_interpolation = pos_emb_interpolation
846
+ self.min_fps = min_fps
847
+ self.max_fps = max_fps
848
+ self.rope_h_extrapolation_ratio = rope_h_extrapolation_ratio
849
+ self.rope_w_extrapolation_ratio = rope_w_extrapolation_ratio
850
+ self.rope_t_extrapolation_ratio = rope_t_extrapolation_ratio
851
+ self.extra_per_block_abs_pos_emb = extra_per_block_abs_pos_emb
852
+ self.extra_h_extrapolation_ratio = extra_h_extrapolation_ratio
853
+ self.extra_w_extrapolation_ratio = extra_w_extrapolation_ratio
854
+ self.extra_t_extrapolation_ratio = extra_t_extrapolation_ratio
855
+ self.rope_enable_fps_modulation = rope_enable_fps_modulation
856
+
857
+ self.build_pos_embed(device=device, dtype=dtype)
858
+ self.use_adaln_lora = use_adaln_lora
859
+ self.adaln_lora_dim = adaln_lora_dim
860
+ self.t_embedder = nn.Sequential(
861
+ Timesteps(model_channels),
862
+ TimestepEmbedding(model_channels, model_channels, use_adaln_lora=use_adaln_lora, device=device, dtype=dtype, operations=operations,),
863
+ )
864
+
865
+ in_channels = in_channels + 1 if concat_padding_mask else in_channels
866
+ self.x_embedder = PatchEmbed(
867
+ spatial_patch_size=patch_spatial,
868
+ temporal_patch_size=patch_temporal,
869
+ in_channels=in_channels,
870
+ out_channels=model_channels,
871
+ device=device, dtype=dtype, operations=operations,
872
+ )
873
+
874
+ self.blocks = nn.ModuleList(
875
+ [
876
+ Block(
877
+ x_dim=model_channels,
878
+ context_dim=crossattn_emb_channels,
879
+ num_heads=num_heads,
880
+ mlp_ratio=mlp_ratio,
881
+ use_adaln_lora=use_adaln_lora,
882
+ adaln_lora_dim=adaln_lora_dim,
883
+ device=device, dtype=dtype, operations=operations,
884
+ )
885
+ for _ in range(num_blocks)
886
+ ]
887
+ )
888
+
889
+ self.final_layer = FinalLayer(
890
+ hidden_size=self.model_channels,
891
+ spatial_patch_size=self.patch_spatial,
892
+ temporal_patch_size=self.patch_temporal,
893
+ out_channels=self.out_channels,
894
+ use_adaln_lora=self.use_adaln_lora,
895
+ adaln_lora_dim=self.adaln_lora_dim,
896
+ device=device, dtype=dtype, operations=operations,
897
+ )
898
+
899
+ self.t_embedding_norm = operations.RMSNorm(model_channels, eps=1e-6, device=device, dtype=dtype)
900
+
901
+ def build_pos_embed(self, device=None, dtype=None) -> None:
902
+ if self.pos_emb_cls == "rope3d":
903
+ cls_type = VideoRopePosition3DEmb
904
+ else:
905
+ raise ValueError(f"Unknown pos_emb_cls {self.pos_emb_cls}")
906
+
907
+ logging.debug(f"Building positional embedding with {self.pos_emb_cls} class, impl {cls_type}")
908
+ kwargs = dict(
909
+ model_channels=self.model_channels,
910
+ len_h=self.max_img_h // self.patch_spatial,
911
+ len_w=self.max_img_w // self.patch_spatial,
912
+ len_t=self.max_frames // self.patch_temporal,
913
+ max_fps=self.max_fps,
914
+ min_fps=self.min_fps,
915
+ is_learnable=self.pos_emb_learnable,
916
+ interpolation=self.pos_emb_interpolation,
917
+ head_dim=self.model_channels // self.num_heads,
918
+ h_extrapolation_ratio=self.rope_h_extrapolation_ratio,
919
+ w_extrapolation_ratio=self.rope_w_extrapolation_ratio,
920
+ t_extrapolation_ratio=self.rope_t_extrapolation_ratio,
921
+ enable_fps_modulation=self.rope_enable_fps_modulation,
922
+ device=device,
923
+ )
924
+ self.pos_embedder = cls_type(
925
+ **kwargs, # type: ignore
926
+ )
927
+
928
+ if self.extra_per_block_abs_pos_emb:
929
+ kwargs["h_extrapolation_ratio"] = self.extra_h_extrapolation_ratio
930
+ kwargs["w_extrapolation_ratio"] = self.extra_w_extrapolation_ratio
931
+ kwargs["t_extrapolation_ratio"] = self.extra_t_extrapolation_ratio
932
+ kwargs["device"] = device
933
+ kwargs["dtype"] = dtype
934
+ self.extra_pos_embedder = LearnablePosEmbAxis(
935
+ **kwargs, # type: ignore
936
+ )
937
+
938
+ def prepare_embedded_sequence(
939
+ self,
940
+ x_B_C_T_H_W: torch.Tensor,
941
+ fps: Optional[torch.Tensor] = None,
942
+ padding_mask: Optional[torch.Tensor] = None,
943
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
944
+ """
945
+ Prepares an embedded sequence tensor by applying positional embeddings and handling padding masks.
946
+
947
+ Args:
948
+ x_B_C_T_H_W (torch.Tensor): video
949
+ fps (Optional[torch.Tensor]): Frames per second tensor to be used for positional embedding when required.
950
+ If None, a default value (`self.base_fps`) will be used.
951
+ padding_mask (Optional[torch.Tensor]): optional padding mask; when `concat_padding_mask` is True it is resized and concatenated to the input as an extra channel.
952
+
953
+ Returns:
954
+ Tuple[torch.Tensor, Optional[torch.Tensor]]:
955
+ - A tensor of shape (B, T, H, W, D) with the embedded sequence.
956
+ - An optional positional embedding tensor, returned only if the positional embedding class
957
+ (`self.pos_emb_cls`) includes 'rope'. Otherwise, None.
958
+
959
+ Notes:
960
+ - If `self.concat_padding_mask` is True, a padding mask channel is concatenated to the input tensor.
961
+ - The method of applying positional embeddings depends on the value of `self.pos_emb_cls`.
962
+ - If 'rope' is in `self.pos_emb_cls` (case insensitive), the positional embeddings are generated using
963
+ the `self.pos_embedder` with the shape [T, H, W].
964
+ - If "fps_aware" is in `self.pos_emb_cls`, the positional embeddings are generated using the
965
+ `self.pos_embedder` with the fps tensor.
966
+ - Otherwise, the positional embeddings are generated without considering fps.
967
+ """
968
+ if self.concat_padding_mask:
969
+ if padding_mask is None:
970
+ padding_mask = torch.zeros(x_B_C_T_H_W.shape[0], 1, x_B_C_T_H_W.shape[3], x_B_C_T_H_W.shape[4], dtype=x_B_C_T_H_W.dtype, device=x_B_C_T_H_W.device)
971
+ else:
972
+ padding_mask = transforms.functional.resize(
973
+ padding_mask, list(x_B_C_T_H_W.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
974
+ )
975
+ x_B_C_T_H_W = torch.cat(
976
+ [x_B_C_T_H_W, padding_mask.unsqueeze(1).repeat(1, 1, x_B_C_T_H_W.shape[2], 1, 1)], dim=1
977
+ )
978
+ x_B_T_H_W_D = self.x_embedder(x_B_C_T_H_W)
979
+
980
+ if self.extra_per_block_abs_pos_emb:
981
+ extra_pos_emb = self.extra_pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device, dtype=x_B_C_T_H_W.dtype)
982
+ else:
983
+ extra_pos_emb = None
984
+
985
+ if "rope" in self.pos_emb_cls.lower():
986
+ return x_B_T_H_W_D, self.pos_embedder(x_B_T_H_W_D, fps=fps, device=x_B_C_T_H_W.device), extra_pos_emb
987
+ x_B_T_H_W_D = x_B_T_H_W_D + self.pos_embedder(x_B_T_H_W_D, device=x_B_C_T_H_W.device) # [B, T, H, W, D]
988
+
989
+ return x_B_T_H_W_D, None, extra_pos_emb
990
+
991
+ def unpatchify(self, x_B_T_H_W_M: torch.Tensor) -> torch.Tensor:
992
+ x_B_C_Tt_Hp_Wp = rearrange(
993
+ x_B_T_H_W_M,
994
+ "B T H W (p1 p2 t C) -> B C (T t) (H p1) (W p2)",
995
+ p1=self.patch_spatial,
996
+ p2=self.patch_spatial,
997
+ t=self.patch_temporal,
998
+ )
999
+ return x_B_C_Tt_Hp_Wp
1000
+
1001
+ def pad_to_patch_size(self, img, patch_size=(2, 2), padding_mode="circular"):
1002
+ if padding_mode == "circular" and (torch.jit.is_tracing() or torch.jit.is_scripting()):
1003
+ padding_mode = "reflect"
1004
+
1005
+ pad = ()
1006
+ for i in range(img.ndim - 2):
1007
+ pad = (0, (patch_size[i] - img.shape[i + 2] % patch_size[i]) % patch_size[i]) + pad
1008
+
1009
+ return torch.nn.functional.pad(img, pad, mode=padding_mode)
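+
+ # Example: with patch_size=(1, 2, 2), an input of shape (1, 16, 5, 17, 17) is padded to
+ # (1, 16, 5, 18, 18) -- each trailing dim rounds up to the next multiple of its patch size.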
1010
+
1011
+ def forward(
1012
+ self,
1013
+ x: torch.Tensor,
1014
+ timesteps: torch.Tensor,
1015
+ context: torch.Tensor,
1016
+ fps: Optional[torch.Tensor] = None,
1017
+ padding_mask: Optional[torch.Tensor] = None,
1018
+ use_gradient_checkpointing=False,
1019
+ use_gradient_checkpointing_offload=False,
1020
+ **kwargs,
1021
+ ):
1022
+ orig_shape = list(x.shape)
1023
+ x = self.pad_to_patch_size(x, (self.patch_temporal, self.patch_spatial, self.patch_spatial))
1024
+ x_B_C_T_H_W = x
1025
+ timesteps_B_T = timesteps
1026
+ crossattn_emb = context
1027
+ """
1028
+ Args:
1029
+ x: (B, C, T, H, W) tensor of spatio-temporal inputs
1030
+ timesteps: (B, ) tensor of timesteps
1031
+ crossattn_emb: (B, N, D) tensor of cross-attention embeddings
1032
+ """
1033
+ x_B_T_H_W_D, rope_emb_L_1_1_D, extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D = self.prepare_embedded_sequence(
1034
+ x_B_C_T_H_W,
1035
+ fps=fps,
1036
+ padding_mask=padding_mask,
1037
+ )
1038
+
1039
+ if timesteps_B_T.ndim == 1:
1040
+ timesteps_B_T = timesteps_B_T.unsqueeze(1)
1041
+ t_embedding_B_T_D, adaln_lora_B_T_3D = self.t_embedder[1](self.t_embedder[0](timesteps_B_T).to(x_B_T_H_W_D.dtype))
1042
+ t_embedding_B_T_D = self.t_embedding_norm(t_embedding_B_T_D)
1043
+
1044
+ # for logging purpose
1045
+ affline_scale_log_info = {}
1046
+ affline_scale_log_info["t_embedding_B_T_D"] = t_embedding_B_T_D.detach()
1047
+ self.affline_scale_log_info = affline_scale_log_info
1048
+ self.affline_emb = t_embedding_B_T_D
1049
+ self.crossattn_emb = crossattn_emb
1050
+
1051
+ if extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D is not None:
1052
+ assert (
1053
+ x_B_T_H_W_D.shape == extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape
1054
+ ), f"{x_B_T_H_W_D.shape} != {extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D.shape}"
1055
+
1056
+ block_kwargs = {
1057
+ "rope_emb_L_1_1_D": rope_emb_L_1_1_D.unsqueeze(1).unsqueeze(0),
1058
+ "adaln_lora_B_T_3D": adaln_lora_B_T_3D,
1059
+ "extra_per_block_pos_emb": extra_pos_emb_B_T_H_W_D_or_T_H_W_B_D,
1060
+ "transformer_options": kwargs.get("transformer_options", {}),
1061
+ }
1062
+
1063
+ # The residual stream for this model has large values. To make fp16 compute_dtype work, we keep the residual stream
1064
+ # in fp32, but run attention and MLP modules in fp16.
1065
+ # An alternate method that clamps fp16 values "works" in the sense that it makes coherent images, but there is noticeable
1066
+ # quality degradation and visual artifacts.
1067
+ if x_B_T_H_W_D.dtype == torch.float16:
1068
+ x_B_T_H_W_D = x_B_T_H_W_D.float()
1069
+
1070
+ for block in self.blocks:
1071
+ x_B_T_H_W_D = gradient_checkpoint_forward(
1072
+ block,
1073
+ use_gradient_checkpointing=use_gradient_checkpointing,
1074
+ use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
1075
+ x_B_T_H_W_D=x_B_T_H_W_D,
1076
+ emb_B_T_D=t_embedding_B_T_D,
1077
+ crossattn_emb=crossattn_emb,
1078
+ **block_kwargs,
1079
+ )
1080
+
1081
+ x_B_T_H_W_O = self.final_layer(x_B_T_H_W_D.to(crossattn_emb.dtype), t_embedding_B_T_D, adaln_lora_B_T_3D=adaln_lora_B_T_3D)
1082
+ x_B_C_Tt_Hp_Wp = self.unpatchify(x_B_T_H_W_O)[:, :, :orig_shape[-3], :orig_shape[-2], :orig_shape[-1]]
1083
+ return x_B_C_Tt_Hp_Wp
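+
+
+ # End-to-end sketch with deliberately tiny hyperparameters (the real AnimaDiT config is
+ # far larger); assumes the VideoPositionEmb base class defined earlier in this file and
+ # torch.nn as the operations namespace.
+ def _demo_mini_train_dit():
+     model = MiniTrainDIT(
+         max_img_h=32, max_img_w=32, max_frames=4, in_channels=4, out_channels=4,
+         patch_spatial=2, patch_temporal=1, model_channels=64, num_blocks=1, num_heads=4,
+         crossattn_emb_channels=32, pos_emb_cls="rope3d", operations=torch.nn,
+     )
+     x = torch.randn(1, 4, 2, 16, 16)  # (B, C, T, H, W)
+     out = model(x, timesteps=torch.tensor([500.0]), context=torch.randn(1, 7, 32))
+     assert out.shape == x.shape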
1084
+
1085
+
1086
+ def rotate_half(x):
1087
+ x1 = x[..., : x.shape[-1] // 2]
1088
+ x2 = x[..., x.shape[-1] // 2 :]
1089
+ return torch.cat((-x2, x1), dim=-1)
1090
+
1091
+
1092
+ def apply_rotary_pos_emb2(x, cos, sin, unsqueeze_dim=1):
1093
+ cos = cos.unsqueeze(unsqueeze_dim)
1094
+ sin = sin.unsqueeze(unsqueeze_dim)
1095
+ x_embed = (x * cos) + (rotate_half(x) * sin)
1096
+ return x_embed
1097
+
1098
+
1099
+ class RotaryEmbedding(nn.Module):
1100
+ def __init__(self, head_dim):
1101
+ super().__init__()
1102
+ self.rope_theta = 10000
1103
+ inv_freq = 1.0 / (self.rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).to(dtype=torch.float) / head_dim))
1104
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
1105
+
1106
+ @torch.no_grad()
1107
+ def forward(self, x, position_ids):
1108
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
1109
+ position_ids_expanded = position_ids[:, None, :].float()
1110
+
1111
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
1112
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
1113
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
1114
+ emb = torch.cat((freqs, freqs), dim=-1)
1115
+ cos = emb.cos()
1116
+ sin = emb.sin()
1117
+
1118
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
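+
+
+ # Sanity sketch (assumed shapes): this half-split RoPE variant (rotate_half) is also
+ # norm-preserving.
+ def _demo_rotary_embedding():
+     rope = RotaryEmbedding(head_dim=8)
+     x = torch.randn(1, 2, 5, 8)  # (B, n_heads, S, head_dim)
+     cos, sin = rope(x, torch.arange(5).unsqueeze(0))
+     x_rot = apply_rotary_pos_emb2(x, cos, sin)
+     assert torch.allclose(x.norm(dim=-1), x_rot.norm(dim=-1), atol=1e-5)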
1119
+
1120
+
1121
+ class LLMAdapterAttention(nn.Module):
1122
+ def __init__(self, query_dim, context_dim, n_heads, head_dim, device=None, dtype=None, operations=None):
1123
+ super().__init__()
1124
+
1125
+ inner_dim = head_dim * n_heads
1126
+ self.n_heads = n_heads
1127
+ self.head_dim = head_dim
1128
+ self.query_dim = query_dim
1129
+ self.context_dim = context_dim
1130
+
1131
+ self.q_proj = operations.Linear(query_dim, inner_dim, bias=False, device=device, dtype=dtype)
1132
+ self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
1133
+
1134
+ self.k_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
1135
+ self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, device=device, dtype=dtype)
1136
+
1137
+ self.v_proj = operations.Linear(context_dim, inner_dim, bias=False, device=device, dtype=dtype)
1138
+
1139
+ self.o_proj = operations.Linear(inner_dim, query_dim, bias=False, device=device, dtype=dtype)
1140
+
1141
+ def forward(self, x, mask=None, context=None, position_embeddings=None, position_embeddings_context=None):
1142
+ context = x if context is None else context
1143
+ input_shape = x.shape[:-1]
1144
+ q_shape = (*input_shape, self.n_heads, self.head_dim)
1145
+ context_shape = context.shape[:-1]
1146
+ kv_shape = (*context_shape, self.n_heads, self.head_dim)
1147
+
1148
+ query_states = self.q_norm(self.q_proj(x).view(q_shape)).transpose(1, 2)
1149
+ key_states = self.k_norm(self.k_proj(context).view(kv_shape)).transpose(1, 2)
1150
+ value_states = self.v_proj(context).view(kv_shape).transpose(1, 2)
1151
+
1152
+ if position_embeddings is not None:
1153
+ assert position_embeddings_context is not None
1154
+ cos, sin = position_embeddings
1155
+ query_states = apply_rotary_pos_emb2(query_states, cos, sin)
1156
+ cos, sin = position_embeddings_context
1157
+ key_states = apply_rotary_pos_emb2(key_states, cos, sin)
1158
+
1159
+ attn_output = torch.nn.functional.scaled_dot_product_attention(query_states, key_states, value_states, attn_mask=mask)
1160
+
1161
+ attn_output = attn_output.transpose(1, 2).reshape(*input_shape, -1).contiguous()
1162
+ attn_output = self.o_proj(attn_output)
1163
+ return attn_output
1164
+
1165
+ def init_weights(self):
1166
+ torch.nn.init.zeros_(self.o_proj.weight)
1167
+
1168
+
1169
+ class LLMAdapterTransformerBlock(nn.Module):
1170
+ def __init__(self, source_dim, model_dim, num_heads=16, mlp_ratio=4.0, use_self_attn=False, layer_norm=False, device=None, dtype=None, operations=None):
1171
+ super().__init__()
1172
+ self.use_self_attn = use_self_attn
1173
+
1174
+ if self.use_self_attn:
1175
+ self.norm_self_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
1176
+ self.self_attn = LLMAdapterAttention(
1177
+ query_dim=model_dim,
1178
+ context_dim=model_dim,
1179
+ n_heads=num_heads,
1180
+ head_dim=model_dim//num_heads,
1181
+ device=device,
1182
+ dtype=dtype,
1183
+ operations=operations,
1184
+ )
1185
+
1186
+ self.norm_cross_attn = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
1187
+ self.cross_attn = LLMAdapterAttention(
1188
+ query_dim=model_dim,
1189
+ context_dim=source_dim,
1190
+ n_heads=num_heads,
1191
+ head_dim=model_dim//num_heads,
1192
+ device=device,
1193
+ dtype=dtype,
1194
+ operations=operations,
1195
+ )
1196
+
1197
+ self.norm_mlp = operations.LayerNorm(model_dim, device=device, dtype=dtype) if layer_norm else operations.RMSNorm(model_dim, eps=1e-6, device=device, dtype=dtype)
1198
+ self.mlp = nn.Sequential(
1199
+ operations.Linear(model_dim, int(model_dim * mlp_ratio), device=device, dtype=dtype),
1200
+ nn.GELU(),
1201
+ operations.Linear(int(model_dim * mlp_ratio), model_dim, device=device, dtype=dtype)
1202
+ )
1203
+
1204
+ def forward(self, x, context, target_attention_mask=None, source_attention_mask=None, position_embeddings=None, position_embeddings_context=None):
1205
+ if self.use_self_attn:
1206
+ normed = self.norm_self_attn(x)
1207
+ attn_out = self.self_attn(normed, mask=target_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings)
1208
+ x = x + attn_out
1209
+
1210
+ normed = self.norm_cross_attn(x)
1211
+ attn_out = self.cross_attn(normed, mask=source_attention_mask, context=context, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context)
1212
+ x = x + attn_out
1213
+
1214
+ x = x + self.mlp(self.norm_mlp(x))
1215
+ return x
1216
+
1217
+ def init_weights(self):
1218
+ torch.nn.init.zeros_(self.mlp[2].weight)
1219
+ self.cross_attn.init_weights()
1220
+
1221
+
1222
+ class LLMAdapter(nn.Module):
1223
+ def __init__(
1224
+ self,
1225
+ source_dim=1024,
1226
+ target_dim=1024,
1227
+ model_dim=1024,
1228
+ num_layers=6,
1229
+ num_heads=16,
1230
+ use_self_attn=True,
1231
+ layer_norm=False,
1232
+ device=None,
1233
+ dtype=None,
1234
+ operations=None,
1235
+ ):
1236
+ super().__init__()
1237
+
1238
+ self.embed = operations.Embedding(32128, target_dim, device=device, dtype=dtype)
1239
+ if model_dim != target_dim:
1240
+ self.in_proj = operations.Linear(target_dim, model_dim, device=device, dtype=dtype)
1241
+ else:
1242
+ self.in_proj = nn.Identity()
1243
+ self.rotary_emb = RotaryEmbedding(model_dim//num_heads)
1244
+ self.blocks = nn.ModuleList([
1245
+ LLMAdapterTransformerBlock(source_dim, model_dim, num_heads=num_heads, use_self_attn=use_self_attn, layer_norm=layer_norm, device=device, dtype=dtype, operations=operations) for _ in range(num_layers)
1246
+ ])
1247
+ self.out_proj = operations.Linear(model_dim, target_dim, device=device, dtype=dtype)
1248
+ self.norm = operations.RMSNorm(target_dim, eps=1e-6, device=device, dtype=dtype)
1249
+
1250
+ def forward(self, source_hidden_states, target_input_ids, target_attention_mask=None, source_attention_mask=None):
1251
+ if target_attention_mask is not None:
1252
+ target_attention_mask = target_attention_mask.to(torch.bool)
1253
+ if target_attention_mask.ndim == 2:
1254
+ target_attention_mask = target_attention_mask.unsqueeze(1).unsqueeze(1)
1255
+
1256
+ if source_attention_mask is not None:
1257
+ source_attention_mask = source_attention_mask.to(torch.bool)
1258
+ if source_attention_mask.ndim == 2:
1259
+ source_attention_mask = source_attention_mask.unsqueeze(1).unsqueeze(1)
1260
+
1261
+ context = source_hidden_states
1262
+ x = self.in_proj(self.embed(target_input_ids).to(context.dtype))
1263
+ position_ids = torch.arange(x.shape[1], device=x.device).unsqueeze(0)
1264
+ position_ids_context = torch.arange(context.shape[1], device=x.device).unsqueeze(0)
1265
+ position_embeddings = self.rotary_emb(x, position_ids)
1266
+ position_embeddings_context = self.rotary_emb(x, position_ids_context)
1267
+ for block in self.blocks:
1268
+ x = block(x, context, target_attention_mask=target_attention_mask, source_attention_mask=source_attention_mask, position_embeddings=position_embeddings, position_embeddings_context=position_embeddings_context)
1269
+ return self.norm(self.out_proj(x))
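+
+
+ # Usage sketch with small illustrative dims: the adapter cross-attends T5 token
+ # embeddings (32128-entry vocab) to the source LLM hidden states and returns
+ # per-target-token features.
+ def _demo_llm_adapter():
+     adapter = LLMAdapter(source_dim=64, target_dim=32, model_dim=32, num_layers=1,
+                          num_heads=4, operations=torch.nn)
+     source = torch.randn(2, 7, 64)          # LLM hidden states (B, S_src, source_dim)
+     ids = torch.randint(0, 32128, (2, 12))  # target token ids (B, S_tgt)
+     assert adapter(source, ids).shape == (2, 12, 32)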
1270
+
1271
+
1272
+ class AnimaDiT(MiniTrainDIT):
1273
+
1274
+ _repeated_blocks = ["Block"]
1275
+
1276
+ def __init__(self):
1277
+ kwargs = {'image_model': 'anima', 'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'model_channels': 2048, 'concat_padding_mask': True, 'crossattn_emb_channels': 1024, 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'min_fps': 1, 'max_fps': 30, 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'num_blocks': 28, 'num_heads': 16, 'extra_per_block_abs_pos_emb': False, 'rope_h_extrapolation_ratio': 4.0, 'rope_w_extrapolation_ratio': 4.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_h_extrapolation_ratio': 1.0, 'extra_w_extrapolation_ratio': 1.0, 'extra_t_extrapolation_ratio': 1.0, 'rope_enable_fps_modulation': False, 'dtype': torch.bfloat16, 'device': None, 'operations': torch.nn}
1278
+ super().__init__(**kwargs)
1279
+ self.llm_adapter = LLMAdapter(device=kwargs.get("device"), dtype=kwargs.get("dtype"), operations=kwargs.get("operations"))
1280
+
1281
+ def preprocess_text_embeds(self, text_embeds, text_ids, t5xxl_weights=None):
1282
+ if text_ids is not None:
1283
+ out = self.llm_adapter(text_embeds, text_ids)
1284
+ if t5xxl_weights is not None:
1285
+ out = out * t5xxl_weights
1286
+
1287
+ if out.shape[1] < 512:
1288
+ out = torch.nn.functional.pad(out, (0, 0, 0, 512 - out.shape[1]))
1289
+ return out
1290
+ else:
1291
+ return text_embeds
1292
+
1293
+ def forward(
1294
+ self,
1295
+ x, timesteps, context,
1296
+ use_gradient_checkpointing=False,
1297
+ use_gradient_checkpointing_offload=False,
1298
+ **kwargs
1299
+ ):
1300
+ t5xxl_ids = kwargs.pop("t5xxl_ids", None)
1301
+ if t5xxl_ids is not None:
1302
+ context = self.preprocess_text_embeds(context, t5xxl_ids, t5xxl_weights=kwargs.pop("t5xxl_weights", None))
1303
+ return super().forward(
1304
+ x, timesteps, context,
1305
+ use_gradient_checkpointing=use_gradient_checkpointing, use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
1306
+ **kwargs
1307
+ )
diffsynth/models/dinov3_image_encoder.py ADDED
@@ -0,0 +1,96 @@
1
+ from transformers import DINOv3ViTModel, DINOv3ViTImageProcessorFast
2
+ from transformers.models.dinov3_vit.modeling_dinov3_vit import DINOv3ViTConfig
3
+ import torch
4
+
5
+ from ..core.device.npu_compatible_device import get_device_type
6
+
7
+
8
+ class DINOv3ImageEncoder(DINOv3ViTModel):
9
+ def __init__(self):
10
+ config = DINOv3ViTConfig(
11
+ architectures = [
12
+ "DINOv3ViTModel"
13
+ ],
14
+ attention_dropout = 0.0,
15
+ drop_path_rate = 0.0,
16
+ dtype = "float32",
17
+ hidden_act = "silu",
18
+ hidden_size = 4096,
19
+ image_size = 224,
20
+ initializer_range = 0.02,
21
+ intermediate_size = 8192,
22
+ key_bias = False,
23
+ layer_norm_eps = 1e-05,
24
+ layerscale_value = 1.0,
25
+ mlp_bias = True,
26
+ model_type = "dinov3_vit",
27
+ num_attention_heads = 32,
28
+ num_channels = 3,
29
+ num_hidden_layers = 40,
30
+ num_register_tokens = 4,
31
+ patch_size = 16,
32
+ pos_embed_jitter = None,
33
+ pos_embed_rescale = 2.0,
34
+ pos_embed_shift = None,
35
+ proj_bias = True,
36
+ query_bias = False,
37
+ rope_theta = 100.0,
38
+ transformers_version = "4.56.1",
39
+ use_gated_mlp = True,
40
+ value_bias = False
41
+ )
42
+ super().__init__(config)
43
+ self.processor = DINOv3ViTImageProcessorFast(
44
+ crop_size = None,
45
+ data_format = "channels_first",
46
+ default_to_square = True,
47
+ device = None,
48
+ disable_grouping = None,
49
+ do_center_crop = None,
50
+ do_convert_rgb = None,
51
+ do_normalize = True,
52
+ do_rescale = True,
53
+ do_resize = True,
54
+ image_mean = [
55
+ 0.485,
56
+ 0.456,
57
+ 0.406
58
+ ],
59
+ image_processor_type = "DINOv3ViTImageProcessorFast",
60
+ image_std = [
61
+ 0.229,
62
+ 0.224,
63
+ 0.225
64
+ ],
65
+ input_data_format = None,
66
+ resample = 2,
67
+ rescale_factor = 0.00392156862745098,
68
+ return_tensors = None,
69
+ size = {
70
+ "height": 224,
71
+ "width": 224
72
+ }
73
+ )
74
+
75
+ def forward(self, image, torch_dtype=torch.bfloat16, device=get_device_type()):
76
+ inputs = self.processor(images=image, return_tensors="pt")
77
+ pixel_values = inputs["pixel_values"].to(dtype=torch_dtype, device=device)
78
+ bool_masked_pos = None
79
+ head_mask = None
80
+
81
+ pixel_values = pixel_values.to(torch_dtype)
82
+ hidden_states = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
83
+ position_embeddings = self.rope_embeddings(pixel_values)
84
+
85
+ for i, layer_module in enumerate(self.layer):
86
+ layer_head_mask = head_mask[i] if head_mask is not None else None
87
+ hidden_states = layer_module(
88
+ hidden_states,
89
+ attention_mask=layer_head_mask,
90
+ position_embeddings=position_embeddings,
91
+ )
92
+
93
+ sequence_output = self.norm(hidden_states)
94
+ pooled_output = sequence_output[:, 0, :]
95
+
96
+ return pooled_output
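+
+ # Usage sketch (hypothetical file name; pretrained weights would normally be loaded first):
+ # encoder = DINOv3ImageEncoder().to(torch.bfloat16)
+ # pooled = encoder(Image.open("example.jpg"))  # (B, 4096) CLS-token embedding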
diffsynth/models/flux2_dit.py ADDED
@@ -0,0 +1,1053 @@
1
+ import inspect
2
+ from typing import Any, Dict, List, Optional, Tuple, Union
3
+
4
+ import torch, math
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ from ..core.attention import attention_forward
9
+ from ..core.gradient import gradient_checkpoint_forward
10
+
11
+
12
+ def get_timestep_embedding(
13
+ timesteps: torch.Tensor,
14
+ embedding_dim: int,
15
+ flip_sin_to_cos: bool = False,
16
+ downscale_freq_shift: float = 1,
17
+ scale: float = 1,
18
+ max_period: int = 10000,
19
+ ) -> torch.Tensor:
20
+ """
21
+ Create sinusoidal timestep embeddings, matching the implementation in Denoising Diffusion Probabilistic Models.
22
+
23
+ Args:
24
+ timesteps (torch.Tensor):
25
+ a 1-D Tensor of N indices, one per batch element. These may be fractional.
26
+ embedding_dim (int):
27
+ the dimension of the output.
28
+ flip_sin_to_cos (bool):
29
+ Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False)
30
+ downscale_freq_shift (float):
31
+ Controls the delta between frequencies between dimensions
32
+ scale (float):
33
+ Scaling factor applied to the embeddings.
34
+ max_period (int):
35
+ Controls the maximum frequency of the embeddings
36
+ Returns:
37
+ torch.Tensor: an [N x dim] Tensor of positional embeddings.
38
+ """
39
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
40
+
41
+ half_dim = embedding_dim // 2
42
+ exponent = -math.log(max_period) * torch.arange(
43
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
44
+ )
45
+ exponent = exponent / (half_dim - downscale_freq_shift)
46
+
47
+ emb = torch.exp(exponent)
48
+ emb = timesteps[:, None].float() * emb[None, :]
49
+
50
+ # scale embeddings
51
+ emb = scale * emb
52
+
53
+ # concat sine and cosine embeddings
54
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
55
+
56
+ # flip sine and cosine embeddings
57
+ if flip_sin_to_cos:
58
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
59
+
60
+ # zero pad
61
+ if embedding_dim % 2 == 1:
62
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
63
+ return emb
64
+
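+ # Added illustration (not upstream code): a quick shape sketch for the embedding above.
+ #     t = torch.tensor([0.0, 250.0, 999.0])
+ #     emb = get_timestep_embedding(t, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0)
+ #     emb.shape  # torch.Size([3, 256]); with flip_sin_to_cos=True the first 128 columns are cosines
+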
65
+
66
+ class TimestepEmbedding(nn.Module):
67
+ def __init__(
68
+ self,
69
+ in_channels: int,
70
+ time_embed_dim: int,
71
+ act_fn: str = "silu",
72
+ out_dim: int = None,
73
+ post_act_fn: Optional[str] = None,
74
+ cond_proj_dim=None,
75
+ sample_proj_bias=True,
76
+ ):
77
+ super().__init__()
78
+
79
+ self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias)
80
+
81
+ if cond_proj_dim is not None:
82
+ self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
83
+ else:
84
+ self.cond_proj = None
85
+
86
+ self.act = torch.nn.SiLU()
87
+
88
+ if out_dim is not None:
89
+ time_embed_dim_out = out_dim
90
+ else:
91
+ time_embed_dim_out = time_embed_dim
92
+ self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias)
93
+
94
+         # `forward` reads `self.post_act`, so it must always be defined. Only
+         # `post_act_fn=None` is supported in this trimmed-down port.
+         if post_act_fn is not None:
+             raise NotImplementedError(f"post_act_fn={post_act_fn!r} is not supported.")
+         self.post_act = None
96
+
97
+ def forward(self, sample, condition=None):
98
+ if condition is not None:
99
+ sample = sample + self.cond_proj(condition)
100
+ sample = self.linear_1(sample)
101
+
102
+ if self.act is not None:
103
+ sample = self.act(sample)
104
+
105
+ sample = self.linear_2(sample)
106
+
107
+ if self.post_act is not None:
108
+ sample = self.post_act(sample)
109
+ return sample
110
+
111
+
112
+ class Timesteps(nn.Module):
113
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1):
114
+ super().__init__()
115
+ self.num_channels = num_channels
116
+ self.flip_sin_to_cos = flip_sin_to_cos
117
+ self.downscale_freq_shift = downscale_freq_shift
118
+ self.scale = scale
119
+
120
+ def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
121
+ t_emb = get_timestep_embedding(
122
+ timesteps,
123
+ self.num_channels,
124
+ flip_sin_to_cos=self.flip_sin_to_cos,
125
+ downscale_freq_shift=self.downscale_freq_shift,
126
+ scale=self.scale,
127
+ )
128
+ return t_emb
129
+
130
+
131
+ class AdaLayerNormContinuous(nn.Module):
132
+ r"""
133
+ Adaptive normalization layer with a norm layer (layer_norm or rms_norm).
134
+
135
+ Args:
136
+ embedding_dim (`int`): Embedding dimension to use during projection.
137
+ conditioning_embedding_dim (`int`): Dimension of the input condition.
138
+ elementwise_affine (`bool`, defaults to `True`):
139
+ Boolean flag to denote if affine transformation should be applied.
140
+ eps (`float`, defaults to 1e-5): Epsilon factor.
141
+ bias (`bool`, defaults to `True`): Boolean flag to denote if bias should be used.
142
+ norm_type (`str`, defaults to `"layer_norm"`):
143
+ Normalization layer to use. Values supported: "layer_norm", "rms_norm".
144
+ """
145
+
146
+ def __init__(
147
+ self,
148
+ embedding_dim: int,
149
+ conditioning_embedding_dim: int,
150
+ # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
151
+ # because the output is immediately scaled and shifted by the projected conditioning embeddings.
152
+ # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
153
+ # However, this is how it was implemented in the original code, and it's rather likely you should
154
+ # set `elementwise_affine` to False.
155
+ elementwise_affine=True,
156
+ eps=1e-5,
157
+ bias=True,
158
+ norm_type="layer_norm",
159
+ ):
160
+ super().__init__()
161
+ self.silu = nn.SiLU()
162
+ self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
163
+ if norm_type == "layer_norm":
164
+ self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias)
165
+
166
+ def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
167
+ # convert back to the original dtype in case `conditioning_embedding` is upcast to float32 (needed for hunyuanDiT)
168
+ emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
169
+ scale, shift = torch.chunk(emb, 2, dim=1)
170
+ x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
171
+ return x
172
+
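+ # Added illustration (not upstream code): the conditioning vector is projected to per-channel
+ # (scale, shift) and broadcast over the sequence dimension.
+ #     norm = AdaLayerNormContinuous(embedding_dim=64, conditioning_embedding_dim=128, elementwise_affine=False)
+ #     x, cond = torch.randn(2, 10, 64), torch.randn(2, 128)
+ #     norm(x, cond).shape  # torch.Size([2, 10, 64])
+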
173
+
174
+ def get_1d_rotary_pos_embed(
175
+ dim: int,
176
+ pos: Union[np.ndarray, int],
177
+ theta: float = 10000.0,
178
+ use_real=False,
179
+ linear_factor=1.0,
180
+ ntk_factor=1.0,
181
+ repeat_interleave_real=True,
182
+ freqs_dtype=torch.float32, # torch.float32, torch.float64 (flux)
183
+ ):
184
+ """
185
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
186
+
187
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim' and the end
188
+ index 'end'. The 'theta' parameter scales the frequencies. The returned tensor contains complex values in complex64
189
+ data type.
190
+
191
+ Args:
192
+ dim (`int`): Dimension of the frequency tensor.
193
+ pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar
194
+ theta (`float`, *optional*, defaults to 10000.0):
195
+ Scaling factor for frequency computation. Defaults to 10000.0.
196
+ use_real (`bool`, *optional*):
197
+ If True, return real part and imaginary part separately. Otherwise, return complex numbers.
198
+ linear_factor (`float`, *optional*, defaults to 1.0):
199
+ Scaling factor for the context extrapolation. Defaults to 1.0.
200
+ ntk_factor (`float`, *optional*, defaults to 1.0):
201
+ Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
202
+ repeat_interleave_real (`bool`, *optional*, defaults to `True`):
203
+ If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
204
+ Otherwise, they are concatenated with themselves.
205
+ freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
206
+ the dtype of the frequency tensor.
207
+ Returns:
208
+ `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
209
+ """
210
+ assert dim % 2 == 0
211
+
212
+ if isinstance(pos, int):
213
+ pos = torch.arange(pos)
214
+ if isinstance(pos, np.ndarray):
215
+ pos = torch.from_numpy(pos) # type: ignore # [S]
216
+
217
+ theta = theta * ntk_factor
218
+ freqs = (
219
+ 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype, device=pos.device) / dim)) / linear_factor
220
+ ) # [D/2]
221
+ freqs = torch.outer(pos, freqs) # type: ignore # [S, D/2]
222
+ is_npu = freqs.device.type == "npu"
223
+ if is_npu:
224
+ freqs = freqs.float()
225
+ if use_real and repeat_interleave_real:
226
+ # flux, hunyuan-dit, cogvideox
227
+ freqs_cos = freqs.cos().repeat_interleave(2, dim=1, output_size=freqs.shape[1] * 2).float() # [S, D]
228
+ freqs_sin = freqs.sin().repeat_interleave(2, dim=1, output_size=freqs.shape[1] * 2).float() # [S, D]
229
+ return freqs_cos, freqs_sin
230
+ elif use_real:
231
+ # stable audio, allegro
232
+ freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float() # [S, D]
233
+ freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float() # [S, D]
234
+ return freqs_cos, freqs_sin
235
+ else:
236
+ # lumina
237
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 # [S, D/2]
238
+ return freqs_cis
239
+
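+ # Added illustration (not upstream code): with use_real=True the function returns a (cos, sin)
+ # pair, each [S, D], where each of the D/2 frequencies appears twice.
+ #     cos, sin = get_1d_rotary_pos_embed(dim=64, pos=16, use_real=True, repeat_interleave_real=True)
+ #     cos.shape, sin.shape  # (torch.Size([16, 64]), torch.Size([16, 64]))
+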
240
+
241
+ def apply_rotary_emb(
242
+ x: torch.Tensor,
243
+ freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
244
+ use_real: bool = True,
245
+ use_real_unbind_dim: int = -1,
246
+ sequence_dim: int = 2,
247
+ ) -> torch.Tensor:
248
+ """
249
+ Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
250
+ to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
251
+ reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
252
+ tensors contain rotary embeddings and are returned as real tensors.
253
+
254
+ Args:
255
+ x (`torch.Tensor`):
256
+ Query or key tensor to apply rotary embeddings to, of shape [B, H, S, D].
257
+ freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
258
+
259
+ Returns:
260
+ torch.Tensor: The input tensor with rotary embeddings applied.
261
+ """
262
+ if use_real:
263
+ cos, sin = freqs_cis # [S, D]
264
+ if sequence_dim == 2:
265
+ cos = cos[None, None, :, :]
266
+ sin = sin[None, None, :, :]
267
+ elif sequence_dim == 1:
268
+ cos = cos[None, :, None, :]
269
+ sin = sin[None, :, None, :]
270
+ else:
271
+ raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")
272
+
273
+ cos, sin = cos.to(x.device), sin.to(x.device)
274
+
275
+ if use_real_unbind_dim == -1:
276
+ # Used for flux, cogvideox, hunyuan-dit
277
+ x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1) # [B, H, S, D//2]
278
+ x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
279
+ elif use_real_unbind_dim == -2:
280
+ # Used for Stable Audio, OmniGen, CogView4 and Cosmos
281
+ x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2) # [B, H, S, D//2]
282
+ x_rotated = torch.cat([-x_imag, x_real], dim=-1)
283
+ else:
284
+ raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
285
+
286
+ out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
287
+
288
+ return out
289
+ else:
290
+ # used for lumina
291
+ x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
292
+ freqs_cis = freqs_cis.unsqueeze(2)
293
+ x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
294
+
295
+ return x_out.type_as(x)
296
+
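+ # Added illustration (not upstream code): rotating a [B, S, H, D] query with the (cos, sin)
+ # pair from get_1d_rotary_pos_embed; sequence_dim=1 matches the layout used by the processors below.
+ #     q = torch.randn(1, 16, 8, 64)
+ #     q_rot = apply_rotary_emb(q, (cos, sin), sequence_dim=1)  # same shape, pairwise rotation
+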
297
+ def _get_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
298
+ query = attn.to_q(hidden_states)
299
+ key = attn.to_k(hidden_states)
300
+ value = attn.to_v(hidden_states)
301
+
302
+ encoder_query = encoder_key = encoder_value = None
303
+ if encoder_hidden_states is not None and attn.added_kv_proj_dim is not None:
304
+ encoder_query = attn.add_q_proj(encoder_hidden_states)
305
+ encoder_key = attn.add_k_proj(encoder_hidden_states)
306
+ encoder_value = attn.add_v_proj(encoder_hidden_states)
307
+
308
+ return query, key, value, encoder_query, encoder_key, encoder_value
309
+
310
+
311
+ def _get_fused_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
312
+ query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
313
+
314
+ encoder_query = encoder_key = encoder_value = None
315
+ if encoder_hidden_states is not None and hasattr(attn, "to_added_qkv"):
316
+ encoder_query, encoder_key, encoder_value = attn.to_added_qkv(encoder_hidden_states).chunk(3, dim=-1)
317
+
318
+ return query, key, value, encoder_query, encoder_key, encoder_value
319
+
320
+
321
+ def _get_qkv_projections(attn: "Flux2Attention", hidden_states, encoder_hidden_states=None):
322
+ return _get_projections(attn, hidden_states, encoder_hidden_states)
323
+
324
+
325
+ class Flux2SwiGLU(nn.Module):
326
+ """
327
+ Flux 2 uses a SwiGLU-style activation in the transformer feedforward sub-blocks, but with the linear projection
328
+ layer fused into the first linear layer of the FF sub-block. Thus, this module has no trainable parameters.
329
+ """
330
+
331
+ def __init__(self):
332
+ super().__init__()
333
+ self.gate_fn = nn.SiLU()
334
+
335
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
336
+ x1, x2 = x.chunk(2, dim=-1)
337
+ x = self.gate_fn(x1) * x2
338
+ return x
339
+
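+ # Added illustration (not upstream code): the gate/value split halves the last dimension.
+ #     x = torch.randn(2, 16, 512)
+ #     Flux2SwiGLU()(x).shape  # torch.Size([2, 16, 256]) == silu(x[..., :256]) * x[..., 256:]
+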
340
+
341
+ class Flux2FeedForward(nn.Module):
342
+ def __init__(
343
+ self,
344
+ dim: int,
345
+ dim_out: Optional[int] = None,
346
+ mult: float = 3.0,
347
+ inner_dim: Optional[int] = None,
348
+ bias: bool = False,
349
+ ):
350
+ super().__init__()
351
+ if inner_dim is None:
352
+ inner_dim = int(dim * mult)
353
+ dim_out = dim_out or dim
354
+
355
+ # Flux2SwiGLU will reduce the dimension by half
356
+ self.linear_in = nn.Linear(dim, inner_dim * 2, bias=bias)
357
+ self.act_fn = Flux2SwiGLU()
358
+ self.linear_out = nn.Linear(inner_dim, dim_out, bias=bias)
359
+
360
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
361
+ x = self.linear_in(x)
362
+ x = self.act_fn(x)
363
+ x = self.linear_out(x)
364
+ return x
365
+
366
+
367
+ class Flux2AttnProcessor:
368
+ _attention_backend = None
369
+ _parallel_config = None
370
+
371
+ def __init__(self):
372
+ if not hasattr(F, "scaled_dot_product_attention"):
373
+ raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your PyTorch version.")
374
+
375
+ def __call__(
376
+ self,
377
+ attn: "Flux2Attention",
378
+ hidden_states: torch.Tensor,
379
+ encoder_hidden_states: torch.Tensor = None,
380
+ attention_mask: Optional[torch.Tensor] = None,
381
+ image_rotary_emb: Optional[torch.Tensor] = None,
382
+ ) -> torch.Tensor:
383
+ query, key, value, encoder_query, encoder_key, encoder_value = _get_qkv_projections(
384
+ attn, hidden_states, encoder_hidden_states
385
+ )
386
+
387
+ query = query.unflatten(-1, (attn.heads, -1))
388
+ key = key.unflatten(-1, (attn.heads, -1))
389
+ value = value.unflatten(-1, (attn.heads, -1))
390
+
391
+ query = attn.norm_q(query)
392
+ key = attn.norm_k(key)
393
+
394
+ if attn.added_kv_proj_dim is not None:
395
+ encoder_query = encoder_query.unflatten(-1, (attn.heads, -1))
396
+ encoder_key = encoder_key.unflatten(-1, (attn.heads, -1))
397
+ encoder_value = encoder_value.unflatten(-1, (attn.heads, -1))
398
+
399
+ encoder_query = attn.norm_added_q(encoder_query)
400
+ encoder_key = attn.norm_added_k(encoder_key)
401
+
402
+ query = torch.cat([encoder_query, query], dim=1)
403
+ key = torch.cat([encoder_key, key], dim=1)
404
+ value = torch.cat([encoder_value, value], dim=1)
405
+
406
+ if image_rotary_emb is not None:
407
+ query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
408
+ key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
409
+
410
+ query, key, value = query.to(hidden_states.dtype), key.to(hidden_states.dtype), value.to(hidden_states.dtype)
411
+ hidden_states = attention_forward(
412
+ query,
413
+ key,
414
+ value,
415
+ q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s n d",
416
+ )
417
+ hidden_states = hidden_states.flatten(2, 3)
418
+ hidden_states = hidden_states.to(query.dtype)
419
+
420
+ if encoder_hidden_states is not None:
421
+ encoder_hidden_states, hidden_states = hidden_states.split_with_sizes(
422
+ [encoder_hidden_states.shape[1], hidden_states.shape[1] - encoder_hidden_states.shape[1]], dim=1
423
+ )
424
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
425
+
426
+ hidden_states = attn.to_out[0](hidden_states)
427
+ hidden_states = attn.to_out[1](hidden_states)
428
+
429
+ if encoder_hidden_states is not None:
430
+ return hidden_states, encoder_hidden_states
431
+ else:
432
+ return hidden_states
433
+
434
+
435
+ class Flux2Attention(torch.nn.Module):
436
+ _default_processor_cls = Flux2AttnProcessor
437
+ _available_processors = [Flux2AttnProcessor]
438
+
439
+ def __init__(
440
+ self,
441
+ query_dim: int,
442
+ heads: int = 8,
443
+ dim_head: int = 64,
444
+ dropout: float = 0.0,
445
+ bias: bool = False,
446
+ added_kv_proj_dim: Optional[int] = None,
447
+ added_proj_bias: Optional[bool] = True,
448
+ out_bias: bool = True,
449
+ eps: float = 1e-5,
450
+ out_dim: int = None,
451
+ elementwise_affine: bool = True,
452
+ processor=None,
453
+ ):
454
+ super().__init__()
455
+
456
+ self.head_dim = dim_head
457
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
458
+ self.query_dim = query_dim
459
+ self.out_dim = out_dim if out_dim is not None else query_dim
460
+ self.heads = out_dim // dim_head if out_dim is not None else heads
461
+
462
+ self.use_bias = bias
463
+ self.dropout = dropout
464
+
465
+ self.added_kv_proj_dim = added_kv_proj_dim
466
+ self.added_proj_bias = added_proj_bias
467
+
468
+ self.to_q = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
469
+ self.to_k = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
470
+ self.to_v = torch.nn.Linear(query_dim, self.inner_dim, bias=bias)
471
+
472
+ # QK Norm
473
+ self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
474
+ self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
475
+
476
+ self.to_out = torch.nn.ModuleList([])
477
+ self.to_out.append(torch.nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
478
+ self.to_out.append(torch.nn.Dropout(dropout))
479
+
480
+ if added_kv_proj_dim is not None:
481
+ self.norm_added_q = torch.nn.RMSNorm(dim_head, eps=eps)
482
+ self.norm_added_k = torch.nn.RMSNorm(dim_head, eps=eps)
483
+ self.add_q_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
484
+ self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
485
+ self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
486
+ self.to_add_out = torch.nn.Linear(self.inner_dim, query_dim, bias=out_bias)
487
+
488
+ if processor is None:
489
+ processor = self._default_processor_cls()
490
+ self.processor = processor
491
+
492
+ def forward(
493
+ self,
494
+ hidden_states: torch.Tensor,
495
+ encoder_hidden_states: Optional[torch.Tensor] = None,
496
+ attention_mask: Optional[torch.Tensor] = None,
497
+ image_rotary_emb: Optional[torch.Tensor] = None,
498
+ **kwargs,
499
+ ) -> torch.Tensor:
500
+ attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
501
+ kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
502
+ return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, image_rotary_emb, **kwargs)
503
+
504
+
505
+ class Flux2ParallelSelfAttnProcessor:
506
+ _attention_backend = None
507
+ _parallel_config = None
508
+
509
+ def __init__(self):
510
+ if not hasattr(F, "scaled_dot_product_attention"):
511
+ raise ImportError(f"{self.__class__.__name__} requires PyTorch 2.0. Please upgrade your PyTorch version.")
512
+
513
+ def __call__(
514
+ self,
515
+ attn: "Flux2ParallelSelfAttention",
516
+ hidden_states: torch.Tensor,
517
+ attention_mask: Optional[torch.Tensor] = None,
518
+ image_rotary_emb: Optional[torch.Tensor] = None,
519
+ ) -> torch.Tensor:
520
+ # Parallel in (QKV + MLP in) projection
521
+ hidden_states = attn.to_qkv_mlp_proj(hidden_states)
522
+ qkv, mlp_hidden_states = torch.split(
523
+ hidden_states, [3 * attn.inner_dim, attn.mlp_hidden_dim * attn.mlp_mult_factor], dim=-1
524
+ )
525
+
526
+ # Handle the attention logic
527
+ query, key, value = qkv.chunk(3, dim=-1)
528
+
529
+ query = query.unflatten(-1, (attn.heads, -1))
530
+ key = key.unflatten(-1, (attn.heads, -1))
531
+ value = value.unflatten(-1, (attn.heads, -1))
532
+
533
+ query = attn.norm_q(query)
534
+ key = attn.norm_k(key)
535
+
536
+ if image_rotary_emb is not None:
537
+ query = apply_rotary_emb(query, image_rotary_emb, sequence_dim=1)
538
+ key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)
539
+
540
+ query, key, value = query.to(hidden_states.dtype), key.to(hidden_states.dtype), value.to(hidden_states.dtype)
541
+ hidden_states = attention_forward(
542
+ query,
543
+ key,
544
+ value,
545
+ q_pattern="b s n d", k_pattern="b s n d", v_pattern="b s n d", out_pattern="b s n d",
546
+ )
547
+ hidden_states = hidden_states.flatten(2, 3)
548
+ hidden_states = hidden_states.to(query.dtype)
549
+
550
+ # Handle the feedforward (FF) logic
551
+ mlp_hidden_states = attn.mlp_act_fn(mlp_hidden_states)
552
+
553
+ # Concatenate and parallel output projection
554
+ hidden_states = torch.cat([hidden_states, mlp_hidden_states], dim=-1)
555
+ hidden_states = attn.to_out(hidden_states)
556
+
557
+ return hidden_states
558
+
559
+
560
+ class Flux2ParallelSelfAttention(torch.nn.Module):
561
+ """
562
+ Flux 2 parallel self-attention for the Flux 2 single-stream transformer blocks.
563
+
564
+ This implements a parallel transformer block, where the attention QKV projections are fused to the feedforward (FF)
565
+ input projections, and the attention output projections are fused to the FF output projections. See the [ViT-22B
566
+ paper](https://arxiv.org/abs/2302.05442) for a visual depiction of this type of transformer block.
567
+ """
568
+
569
+ _default_processor_cls = Flux2ParallelSelfAttnProcessor
570
+ _available_processors = [Flux2ParallelSelfAttnProcessor]
571
+ # Does not support QKV fusion as the QKV projections are always fused
572
+ _supports_qkv_fusion = False
573
+
574
+ def __init__(
575
+ self,
576
+ query_dim: int,
577
+ heads: int = 8,
578
+ dim_head: int = 64,
579
+ dropout: float = 0.0,
580
+ bias: bool = False,
581
+ out_bias: bool = True,
582
+ eps: float = 1e-5,
583
+ out_dim: int = None,
584
+ elementwise_affine: bool = True,
585
+ mlp_ratio: float = 4.0,
586
+ mlp_mult_factor: int = 2,
587
+ processor=None,
588
+ ):
589
+ super().__init__()
590
+
591
+ self.head_dim = dim_head
592
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
593
+ self.query_dim = query_dim
594
+ self.out_dim = out_dim if out_dim is not None else query_dim
595
+ self.heads = out_dim // dim_head if out_dim is not None else heads
596
+
597
+ self.use_bias = bias
598
+ self.dropout = dropout
599
+
600
+ self.mlp_ratio = mlp_ratio
601
+ self.mlp_hidden_dim = int(query_dim * self.mlp_ratio)
602
+ self.mlp_mult_factor = mlp_mult_factor
603
+
604
+ # Fused QKV projections + MLP input projection
605
+ self.to_qkv_mlp_proj = torch.nn.Linear(
606
+ self.query_dim, self.inner_dim * 3 + self.mlp_hidden_dim * self.mlp_mult_factor, bias=bias
607
+ )
608
+ self.mlp_act_fn = Flux2SwiGLU()
609
+
610
+ # QK Norm
611
+ self.norm_q = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
612
+ self.norm_k = torch.nn.RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
613
+
614
+ # Fused attention output projection + MLP output projection
615
+ self.to_out = torch.nn.Linear(self.inner_dim + self.mlp_hidden_dim, self.out_dim, bias=out_bias)
616
+
617
+ if processor is None:
618
+ processor = self._default_processor_cls()
619
+ self.processor = processor
620
+
621
+ def forward(
622
+ self,
623
+ hidden_states: torch.Tensor,
624
+ attention_mask: Optional[torch.Tensor] = None,
625
+ image_rotary_emb: Optional[torch.Tensor] = None,
626
+ **kwargs,
627
+ ) -> torch.Tensor:
628
+ attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())
629
+ kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}
630
+ return self.processor(self, hidden_states, attention_mask, image_rotary_emb, **kwargs)
631
+
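+ # Sizing sketch (added for clarity, not upstream code): with the Flux 2 defaults used by
+ # Flux2DiT below (query_dim=6144, heads=48, dim_head=128, mlp_ratio=3.0, mlp_mult_factor=2),
+ # the fused input projection is Linear(6144, 3*6144 + 18432*2) = Linear(6144, 55296) and the
+ # fused output projection is Linear(6144 + 18432, 6144).
+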
632
+
633
+ class Flux2SingleTransformerBlock(nn.Module):
634
+ def __init__(
635
+ self,
636
+ dim: int,
637
+ num_attention_heads: int,
638
+ attention_head_dim: int,
639
+ mlp_ratio: float = 3.0,
640
+ eps: float = 1e-6,
641
+ bias: bool = False,
642
+ ):
643
+ super().__init__()
644
+
645
+ self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
646
+
647
+ # Note that the MLP in/out linear layers are fused with the attention QKV/out projections, respectively; this
648
+ # is often called a "parallel" transformer block. See the [ViT-22B paper](https://arxiv.org/abs/2302.05442)
649
+ # for a visual depiction of this type of transformer block.
650
+ self.attn = Flux2ParallelSelfAttention(
651
+ query_dim=dim,
652
+ dim_head=attention_head_dim,
653
+ heads=num_attention_heads,
654
+ out_dim=dim,
655
+ bias=bias,
656
+ out_bias=bias,
657
+ eps=eps,
658
+ mlp_ratio=mlp_ratio,
659
+ mlp_mult_factor=2,
660
+ processor=Flux2ParallelSelfAttnProcessor(),
661
+ )
662
+
663
+ def forward(
664
+ self,
665
+ hidden_states: torch.Tensor,
666
+ encoder_hidden_states: Optional[torch.Tensor],
667
+ temb_mod_params: Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
668
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
669
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
670
+ split_hidden_states: bool = False,
671
+ text_seq_len: Optional[int] = None,
672
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
673
+ # If encoder_hidden_states is None, hidden_states is assumed to have encoder_hidden_states already
674
+ # concatenated
675
+ if encoder_hidden_states is not None:
676
+ text_seq_len = encoder_hidden_states.shape[1]
677
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
678
+
679
+ mod_shift, mod_scale, mod_gate = temb_mod_params
680
+
681
+ norm_hidden_states = self.norm(hidden_states)
682
+ norm_hidden_states = (1 + mod_scale) * norm_hidden_states + mod_shift
683
+
684
+ joint_attention_kwargs = joint_attention_kwargs or {}
685
+ attn_output = self.attn(
686
+ hidden_states=norm_hidden_states,
687
+ image_rotary_emb=image_rotary_emb,
688
+ **joint_attention_kwargs,
689
+ )
690
+
691
+ hidden_states = hidden_states + mod_gate * attn_output
692
+ if hidden_states.dtype == torch.float16:
693
+ hidden_states = hidden_states.clip(-65504, 65504)
694
+
695
+ if split_hidden_states:
696
+ encoder_hidden_states, hidden_states = hidden_states[:, :text_seq_len], hidden_states[:, text_seq_len:]
697
+ return encoder_hidden_states, hidden_states
698
+ else:
699
+ return hidden_states
700
+
701
+
702
+ class Flux2TransformerBlock(nn.Module):
703
+ def __init__(
704
+ self,
705
+ dim: int,
706
+ num_attention_heads: int,
707
+ attention_head_dim: int,
708
+ mlp_ratio: float = 3.0,
709
+ eps: float = 1e-6,
710
+ bias: bool = False,
711
+ ):
712
+ super().__init__()
713
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
714
+
715
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
716
+ self.norm1_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
717
+
718
+ self.attn = Flux2Attention(
719
+ query_dim=dim,
720
+ added_kv_proj_dim=dim,
721
+ dim_head=attention_head_dim,
722
+ heads=num_attention_heads,
723
+ out_dim=dim,
724
+ bias=bias,
725
+ added_proj_bias=bias,
726
+ out_bias=bias,
727
+ eps=eps,
728
+ processor=Flux2AttnProcessor(),
729
+ )
730
+
731
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
732
+ self.ff = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)
733
+
734
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
735
+ self.ff_context = Flux2FeedForward(dim=dim, dim_out=dim, mult=mlp_ratio, bias=bias)
736
+
737
+ def forward(
738
+ self,
739
+ hidden_states: torch.Tensor,
740
+ encoder_hidden_states: torch.Tensor,
741
+ temb_mod_params_img: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
742
+ temb_mod_params_txt: Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...],
743
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
744
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
745
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
746
+ joint_attention_kwargs = joint_attention_kwargs or {}
747
+
748
+ # Modulation parameters shape: [1, 1, self.dim]
749
+ (shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp) = temb_mod_params_img
750
+ (c_shift_msa, c_scale_msa, c_gate_msa), (c_shift_mlp, c_scale_mlp, c_gate_mlp) = temb_mod_params_txt
751
+
752
+ # Img stream
753
+ norm_hidden_states = self.norm1(hidden_states)
754
+ norm_hidden_states = (1 + scale_msa) * norm_hidden_states + shift_msa
755
+
756
+ # Conditioning txt stream
757
+ norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states)
758
+ norm_encoder_hidden_states = (1 + c_scale_msa) * norm_encoder_hidden_states + c_shift_msa
759
+
760
+ # Attention on concatenated img + txt stream
761
+ attention_outputs = self.attn(
762
+ hidden_states=norm_hidden_states,
763
+ encoder_hidden_states=norm_encoder_hidden_states,
764
+ image_rotary_emb=image_rotary_emb,
765
+ **joint_attention_kwargs,
766
+ )
767
+
768
+ attn_output, context_attn_output = attention_outputs
769
+
770
+ # Process attention outputs for the image stream (`hidden_states`).
771
+ attn_output = gate_msa * attn_output
772
+ hidden_states = hidden_states + attn_output
773
+
774
+ norm_hidden_states = self.norm2(hidden_states)
775
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
776
+
777
+ ff_output = self.ff(norm_hidden_states)
778
+ hidden_states = hidden_states + gate_mlp * ff_output
779
+
780
+ # Process attention outputs for the text stream (`encoder_hidden_states`).
781
+ context_attn_output = c_gate_msa * context_attn_output
782
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
783
+
784
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
785
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp) + c_shift_mlp
786
+
787
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
788
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp * context_ff_output
789
+ if encoder_hidden_states.dtype == torch.float16:
790
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
791
+
792
+ return encoder_hidden_states, hidden_states
793
+
794
+
795
+ class Flux2PosEmbed(nn.Module):
796
+ # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
797
+ def __init__(self, theta: int, axes_dim: List[int]):
798
+ super().__init__()
799
+ self.theta = theta
800
+ self.axes_dim = axes_dim
801
+
802
+ def forward(self, ids: torch.Tensor) -> torch.Tensor:
803
+ # Expected ids shape: [S, len(self.axes_dim)]
804
+ cos_out = []
805
+ sin_out = []
806
+ pos = ids.float()
807
+ is_mps = ids.device.type == "mps"
808
+ is_npu = ids.device.type == "npu"
809
+ freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
810
+ # Unlike Flux 1, loop over len(self.axes_dim) rather than ids.shape[-1]
811
+ for i in range(len(self.axes_dim)):
812
+ cos, sin = get_1d_rotary_pos_embed(
813
+ self.axes_dim[i],
814
+ pos[..., i],
815
+ theta=self.theta,
816
+ repeat_interleave_real=True,
817
+ use_real=True,
818
+ freqs_dtype=freqs_dtype,
819
+ )
820
+ cos_out.append(cos)
821
+ sin_out.append(sin)
822
+ freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
823
+ freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
824
+ return freqs_cos, freqs_sin
825
+
826
+
827
+ class Flux2TimestepGuidanceEmbeddings(nn.Module):
828
+ def __init__(
829
+ self,
830
+ in_channels: int = 256,
831
+ embedding_dim: int = 6144,
832
+ bias: bool = False,
833
+ guidance_embeds: bool = True,
834
+ ):
835
+ super().__init__()
836
+
837
+ self.time_proj = Timesteps(num_channels=in_channels, flip_sin_to_cos=True, downscale_freq_shift=0)
838
+ self.timestep_embedder = TimestepEmbedding(
839
+ in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
840
+ )
841
+
842
+ if guidance_embeds:
843
+ self.guidance_embedder = TimestepEmbedding(
844
+ in_channels=in_channels, time_embed_dim=embedding_dim, sample_proj_bias=bias
845
+ )
846
+ else:
847
+ self.guidance_embedder = None
848
+
849
+ def forward(self, timestep: torch.Tensor, guidance: torch.Tensor) -> torch.Tensor:
850
+ timesteps_proj = self.time_proj(timestep)
851
+ timesteps_emb = self.timestep_embedder(timesteps_proj.to(timestep.dtype)) # (N, D)
852
+
853
+ if guidance is not None and self.guidance_embedder is not None:
854
+ guidance_proj = self.time_proj(guidance)
855
+ guidance_emb = self.guidance_embedder(guidance_proj.to(guidance.dtype)) # (N, D)
856
+ time_guidance_emb = timesteps_emb + guidance_emb
857
+ return time_guidance_emb
858
+ else:
859
+ return timesteps_emb
860
+
861
+
862
+ class Flux2Modulation(nn.Module):
863
+ def __init__(self, dim: int, mod_param_sets: int = 2, bias: bool = False):
864
+ super().__init__()
865
+ self.mod_param_sets = mod_param_sets
866
+
867
+ self.linear = nn.Linear(dim, dim * 3 * self.mod_param_sets, bias=bias)
868
+ self.act_fn = nn.SiLU()
869
+
870
+ def forward(self, temb: torch.Tensor) -> Tuple[Tuple[torch.Tensor, torch.Tensor, torch.Tensor], ...]:
871
+ mod = self.act_fn(temb)
872
+ mod = self.linear(mod)
873
+
874
+ if mod.ndim == 2:
875
+ mod = mod.unsqueeze(1)
876
+ mod_params = torch.chunk(mod, 3 * self.mod_param_sets, dim=-1)
877
+ # Return tuple of 3-tuples of modulation params shift/scale/gate
878
+ return tuple(mod_params[3 * i : 3 * (i + 1)] for i in range(self.mod_param_sets))
879
+
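+ # Added illustration (not upstream code): with mod_param_sets=2 the forward returns
+ # ((shift_msa, scale_msa, gate_msa), (shift_mlp, scale_mlp, gate_mlp)), each [B, 1, dim].
+ #     mod = Flux2Modulation(dim=64, mod_param_sets=2)
+ #     attn_params, mlp_params = mod(torch.randn(2, 64))
+ #     len(attn_params), attn_params[0].shape  # (3, torch.Size([2, 1, 64]))
+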
880
+
881
+ class Flux2DiT(torch.nn.Module):
882
+
883
+ _repeated_blocks = ["Flux2TransformerBlock", "Flux2SingleTransformerBlock"]
884
+
885
+ def __init__(
886
+ self,
887
+ patch_size: int = 1,
888
+ in_channels: int = 128,
889
+ out_channels: Optional[int] = None,
890
+ num_layers: int = 8,
891
+ num_single_layers: int = 48,
892
+ attention_head_dim: int = 128,
893
+ num_attention_heads: int = 48,
894
+ joint_attention_dim: int = 15360,
895
+ timestep_guidance_channels: int = 256,
896
+ mlp_ratio: float = 3.0,
897
+ axes_dims_rope: Tuple[int, ...] = (32, 32, 32, 32),
898
+ rope_theta: int = 2000,
899
+ eps: float = 1e-6,
900
+ guidance_embeds: bool = True,
901
+ ):
902
+ super().__init__()
903
+ self.out_channels = out_channels or in_channels
904
+ self.inner_dim = num_attention_heads * attention_head_dim
905
+
906
+ # 1. Sinusoidal positional embedding for RoPE on image and text tokens
907
+ self.pos_embed = Flux2PosEmbed(theta=rope_theta, axes_dim=axes_dims_rope)
908
+
909
+ # 2. Combined timestep + guidance embedding
910
+ self.time_guidance_embed = Flux2TimestepGuidanceEmbeddings(
911
+ in_channels=timestep_guidance_channels,
912
+ embedding_dim=self.inner_dim,
913
+ bias=False,
914
+ guidance_embeds=guidance_embeds,
915
+ )
916
+
917
+ # 3. Modulation (double stream and single stream blocks share modulation parameters, resp.)
918
+ # Two sets of shift/scale/gate modulation parameters for the double stream attn and FF sub-blocks
919
+ self.double_stream_modulation_img = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
920
+ self.double_stream_modulation_txt = Flux2Modulation(self.inner_dim, mod_param_sets=2, bias=False)
921
+ # Only one set of modulation parameters as the attn and FF sub-blocks are run in parallel for single stream
922
+ self.single_stream_modulation = Flux2Modulation(self.inner_dim, mod_param_sets=1, bias=False)
923
+
924
+ # 4. Input projections
925
+ self.x_embedder = nn.Linear(in_channels, self.inner_dim, bias=False)
926
+ self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim, bias=False)
927
+
928
+ # 5. Double Stream Transformer Blocks
929
+ self.transformer_blocks = nn.ModuleList(
930
+ [
931
+ Flux2TransformerBlock(
932
+ dim=self.inner_dim,
933
+ num_attention_heads=num_attention_heads,
934
+ attention_head_dim=attention_head_dim,
935
+ mlp_ratio=mlp_ratio,
936
+ eps=eps,
937
+ bias=False,
938
+ )
939
+ for _ in range(num_layers)
940
+ ]
941
+ )
942
+
943
+ # 6. Single Stream Transformer Blocks
944
+ self.single_transformer_blocks = nn.ModuleList(
945
+ [
946
+ Flux2SingleTransformerBlock(
947
+ dim=self.inner_dim,
948
+ num_attention_heads=num_attention_heads,
949
+ attention_head_dim=attention_head_dim,
950
+ mlp_ratio=mlp_ratio,
951
+ eps=eps,
952
+ bias=False,
953
+ )
954
+ for _ in range(num_single_layers)
955
+ ]
956
+ )
957
+
958
+ # 7. Output layers
959
+ self.norm_out = AdaLayerNormContinuous(
960
+ self.inner_dim, self.inner_dim, elementwise_affine=False, eps=eps, bias=False
961
+ )
962
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=False)
963
+
964
+ self.gradient_checkpointing = False
965
+
966
+ def forward(
967
+ self,
968
+ hidden_states: torch.Tensor,
969
+ encoder_hidden_states: torch.Tensor = None,
970
+ timestep: torch.LongTensor = None,
971
+ img_ids: torch.Tensor = None,
972
+ txt_ids: torch.Tensor = None,
973
+ guidance: torch.Tensor = None,
974
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
975
+ use_gradient_checkpointing=False,
976
+ use_gradient_checkpointing_offload=False,
977
+ ):
978
+ # 0. Handle input arguments
979
+ if joint_attention_kwargs is not None:
980
+ joint_attention_kwargs = joint_attention_kwargs.copy()
981
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
982
+ else:
983
+ lora_scale = 1.0
984
+
985
+ num_txt_tokens = encoder_hidden_states.shape[1]
986
+
987
+ # 1. Calculate timestep embedding and modulation parameters
988
+ timestep = timestep.to(hidden_states.dtype) * 1000
989
+
990
+ if guidance is not None:
991
+ guidance = guidance.to(hidden_states.dtype) * 1000
992
+
993
+ temb = self.time_guidance_embed(timestep, guidance)
994
+
995
+ double_stream_mod_img = self.double_stream_modulation_img(temb)
996
+ double_stream_mod_txt = self.double_stream_modulation_txt(temb)
997
+ single_stream_mod = self.single_stream_modulation(temb)[0]
998
+
999
+ # 2. Input projection for image (hidden_states) and conditioning text (encoder_hidden_states)
1000
+ hidden_states = self.x_embedder(hidden_states)
1001
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
1002
+
1003
+ # 3. Calculate RoPE embeddings from image and text tokens
1004
+ # NOTE: the below logic means that we can't support batched inference with images of different resolutions or
1005
+ # text prompts of different lengths.
1006
+ if img_ids.ndim == 3:
1007
+ img_ids = img_ids[0]
1008
+ if txt_ids.ndim == 3:
1009
+ txt_ids = txt_ids[0]
1010
+
1011
+ image_rotary_emb = self.pos_embed(img_ids)
1012
+ text_rotary_emb = self.pos_embed(txt_ids)
1013
+ concat_rotary_emb = (
1014
+ torch.cat([text_rotary_emb[0], image_rotary_emb[0]], dim=0),
1015
+ torch.cat([text_rotary_emb[1], image_rotary_emb[1]], dim=0),
1016
+ )
1017
+
1018
+ # 4. Double Stream Transformer Blocks
1019
+ for index_block, block in enumerate(self.transformer_blocks):
1020
+ encoder_hidden_states, hidden_states = gradient_checkpoint_forward(
1021
+ block,
1022
+ use_gradient_checkpointing=use_gradient_checkpointing,
1023
+ use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
1024
+ hidden_states=hidden_states,
1025
+ encoder_hidden_states=encoder_hidden_states,
1026
+ temb_mod_params_img=double_stream_mod_img,
1027
+ temb_mod_params_txt=double_stream_mod_txt,
1028
+ image_rotary_emb=concat_rotary_emb,
1029
+ joint_attention_kwargs=joint_attention_kwargs,
1030
+ )
1031
+ # Concatenate text and image streams for single-block inference
1032
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
1033
+
1034
+ # 5. Single Stream Transformer Blocks
1035
+ for index_block, block in enumerate(self.single_transformer_blocks):
1036
+ hidden_states = gradient_checkpoint_forward(
1037
+ block,
1038
+ use_gradient_checkpointing=use_gradient_checkpointing,
1039
+ use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
1040
+ hidden_states=hidden_states,
1041
+ encoder_hidden_states=None,
1042
+ temb_mod_params=single_stream_mod,
1043
+ image_rotary_emb=concat_rotary_emb,
1044
+ joint_attention_kwargs=joint_attention_kwargs,
1045
+ )
1046
+ # Remove text tokens from concatenated stream
1047
+ hidden_states = hidden_states[:, num_txt_tokens:, ...]
1048
+
1049
+ # 6. Output layers
1050
+ hidden_states = self.norm_out(hidden_states, temb)
1051
+ output = self.proj_out(hidden_states)
1052
+
1053
+ return output
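+
+
+ # Added illustration (not upstream code): a minimal smoke-test sketch for Flux2DiT. All
+ # shapes are assumptions for demonstration; img_ids/txt_ids carry one position id per RoPE
+ # axis (len(axes_dims_rope) == 4 by default). This helper is never called by the library.
+ def _flux2_dit_smoke_test():
+     model = Flux2DiT(num_layers=1, num_single_layers=1, num_attention_heads=2,
+                      attention_head_dim=128, in_channels=128, joint_attention_dim=64)
+     hidden_states = torch.randn(1, 16, 128)         # 16 packed image tokens
+     encoder_hidden_states = torch.randn(1, 4, 64)   # 4 text tokens
+     img_ids = torch.zeros(16, 4)
+     txt_ids = torch.zeros(4, 4)
+     timestep = torch.tensor([0.5])
+     guidance = torch.tensor([4.0])
+     out = model(hidden_states, encoder_hidden_states, timestep, img_ids, txt_ids, guidance=guidance)
+     assert out.shape == (1, 16, 128)
+     return out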
diffsynth/models/flux2_text_encoder.py ADDED
@@ -0,0 +1,58 @@
 
 
1
+ from transformers import Mistral3ForConditionalGeneration, Mistral3Config
2
+
3
+
4
+ class Flux2TextEncoder(Mistral3ForConditionalGeneration):
5
+ def __init__(self):
6
+ config = Mistral3Config(**{
7
+ "architectures": [
8
+ "Mistral3ForConditionalGeneration"
9
+ ],
10
+ "dtype": "bfloat16",
11
+ "image_token_index": 10,
12
+ "model_type": "mistral3",
13
+ "multimodal_projector_bias": False,
14
+ "projector_hidden_act": "gelu",
15
+ "spatial_merge_size": 2,
16
+ "text_config": {
17
+ "attention_dropout": 0.0,
18
+ "dtype": "bfloat16",
19
+ "head_dim": 128,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 5120,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 32768,
24
+ "max_position_embeddings": 131072,
25
+ "model_type": "mistral",
26
+ "num_attention_heads": 32,
27
+ "num_hidden_layers": 40,
28
+ "num_key_value_heads": 8,
29
+ "rms_norm_eps": 1e-05,
30
+ "rope_theta": 1000000000.0,
31
+ "sliding_window": None,
32
+ "use_cache": True,
33
+ "vocab_size": 131072
34
+ },
35
+ "transformers_version": "4.57.1",
36
+ "vision_config": {
37
+ "attention_dropout": 0.0,
38
+ "dtype": "bfloat16",
39
+ "head_dim": 64,
40
+ "hidden_act": "silu",
41
+ "hidden_size": 1024,
42
+ "image_size": 1540,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 4096,
45
+ "model_type": "pixtral",
46
+ "num_attention_heads": 16,
47
+ "num_channels": 3,
48
+ "num_hidden_layers": 24,
49
+ "patch_size": 14,
50
+ "rope_theta": 10000.0
51
+ },
52
+ "vision_feature_layer": -1
53
+ })
54
+ super().__init__(config)
55
+
56
+ def forward(self, input_ids = None, pixel_values = None, attention_mask = None, position_ids = None, past_key_values = None, inputs_embeds = None, labels = None, use_cache = None, output_attentions = None, output_hidden_states = None, return_dict = None, cache_position = None, logits_to_keep = 0, image_sizes = None, **kwargs):
57
+ return super().forward(input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, image_sizes, **kwargs)
58
+
diffsynth/models/flux2_vae.py ADDED
The diff for this file is too large to render. See raw diff
 
diffsynth/models/flux_controlnet.py ADDED
@@ -0,0 +1,384 @@
 
 
1
+ import hashlib
+ import torch
2
+ from einops import rearrange, repeat
3
+ from .flux_dit import RoPEEmbedding, TimestepEmbeddings, FluxJointTransformerBlock, FluxSingleTransformerBlock, RMSNorm
4
+ # from .utils import hash_state_dict_keys, init_weights_on_device
5
+ from contextlib import contextmanager
6
+
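+ # `hash_state_dict_keys` below needs this helper, previously imported from `.utils` (see the
+ # commented import above). Re-inlined here as a minimal sketch assumed to match the original
+ # DiffSynth helper, so the bundled module stays dependency-free.
+ def convert_state_dict_keys_to_single_str(state_dict, with_shape=True):
+     keys = []
+     for key, value in state_dict.items():
+         if isinstance(value, torch.Tensor):
+             if with_shape:
+                 shape = "_".join(map(str, list(value.shape)))
+                 keys.append(key + ":" + shape)
+             keys.append(key)
+         elif isinstance(value, dict):
+             keys.append(key + "|" + convert_state_dict_keys_to_single_str(value, with_shape=with_shape))
+     keys.sort()
+     return ",".join(keys)
+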
7
+ def hash_state_dict_keys(state_dict, with_shape=True):
8
+ keys_str = convert_state_dict_keys_to_single_str(state_dict, with_shape=with_shape)
9
+ keys_str = keys_str.encode(encoding="UTF-8")
10
+ return hashlib.md5(keys_str).hexdigest()
11
+
12
+ @contextmanager
13
+ def init_weights_on_device(device=torch.device("meta"), include_buffers: bool = False):
14
+
15
+ old_register_parameter = torch.nn.Module.register_parameter
16
+ if include_buffers:
17
+ old_register_buffer = torch.nn.Module.register_buffer
18
+
19
+ def register_empty_parameter(module, name, param):
20
+ old_register_parameter(module, name, param)
21
+ if param is not None:
22
+ param_cls = type(module._parameters[name])
23
+ kwargs = module._parameters[name].__dict__
24
+ kwargs["requires_grad"] = param.requires_grad
25
+ module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
26
+
27
+ def register_empty_buffer(module, name, buffer, persistent=True):
28
+ old_register_buffer(module, name, buffer, persistent=persistent)
29
+ if buffer is not None:
30
+ module._buffers[name] = module._buffers[name].to(device)
31
+
32
+ def patch_tensor_constructor(fn):
33
+ def wrapper(*args, **kwargs):
34
+ kwargs["device"] = device
35
+ return fn(*args, **kwargs)
36
+
37
+ return wrapper
38
+
39
+ if include_buffers:
40
+ tensor_constructors_to_patch = {
41
+ torch_function_name: getattr(torch, torch_function_name)
42
+ for torch_function_name in ["empty", "zeros", "ones", "full"]
43
+ }
44
+ else:
45
+ tensor_constructors_to_patch = {}
46
+
47
+ try:
48
+ torch.nn.Module.register_parameter = register_empty_parameter
49
+ if include_buffers:
50
+ torch.nn.Module.register_buffer = register_empty_buffer
51
+ for torch_function_name in tensor_constructors_to_patch.keys():
52
+ setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
53
+ yield
54
+ finally:
55
+ torch.nn.Module.register_parameter = old_register_parameter
56
+ if include_buffers:
57
+ torch.nn.Module.register_buffer = old_register_buffer
58
+ for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
59
+ setattr(torch, torch_function_name, old_torch_function)
60
+
61
+ class FluxControlNet(torch.nn.Module):
62
+ def __init__(self, disable_guidance_embedder=False, num_joint_blocks=5, num_single_blocks=10, num_mode=0, mode_dict={}, additional_input_dim=0):
63
+ super().__init__()
64
+ self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
65
+ self.time_embedder = TimestepEmbeddings(256, 3072)
66
+ self.guidance_embedder = None if disable_guidance_embedder else TimestepEmbeddings(256, 3072)
67
+ self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.SiLU(), torch.nn.Linear(3072, 3072))
68
+ self.context_embedder = torch.nn.Linear(4096, 3072)
69
+ self.x_embedder = torch.nn.Linear(64, 3072)
70
+
71
+ self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(num_joint_blocks)])
72
+ self.single_blocks = torch.nn.ModuleList([FluxSingleTransformerBlock(3072, 24) for _ in range(num_single_blocks)])
73
+
74
+ self.controlnet_blocks = torch.nn.ModuleList([torch.nn.Linear(3072, 3072) for _ in range(num_joint_blocks)])
75
+ self.controlnet_single_blocks = torch.nn.ModuleList([torch.nn.Linear(3072, 3072) for _ in range(num_single_blocks)])
76
+
77
+ self.mode_dict = mode_dict
78
+ self.controlnet_mode_embedder = torch.nn.Embedding(num_mode, 3072) if len(mode_dict) > 0 else None
79
+ self.controlnet_x_embedder = torch.nn.Linear(64 + additional_input_dim, 3072)
80
+
81
+
82
+ def prepare_image_ids(self, latents):
83
+ batch_size, _, height, width = latents.shape
84
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3)
85
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
86
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
87
+
88
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
89
+
90
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
91
+ latent_image_ids = latent_image_ids.reshape(
92
+ batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
93
+ )
94
+ latent_image_ids = latent_image_ids.to(device=latents.device, dtype=latents.dtype)
95
+
96
+ return latent_image_ids
97
+
98
+
99
+ def patchify(self, hidden_states):
100
+ hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
101
+ return hidden_states
102
+
103
+
104
+ def align_res_stack_to_original_blocks(self, res_stack, num_blocks, hidden_states):
105
+ if len(res_stack) == 0:
106
+ return [torch.zeros_like(hidden_states)] * num_blocks
107
+ interval = (num_blocks + len(res_stack) - 1) // len(res_stack)
108
+ aligned_res_stack = [res_stack[block_id // interval] for block_id in range(num_blocks)]
109
+ return aligned_res_stack
110
+
111
+
112
+ def forward(
113
+ self,
114
+ hidden_states,
115
+ controlnet_conditioning,
116
+ timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids, image_ids=None,
117
+ processor_id=None,
118
+ tiled=False, tile_size=128, tile_stride=64,
119
+ **kwargs
120
+ ):
121
+ if image_ids is None:
122
+ image_ids = self.prepare_image_ids(hidden_states)
123
+
124
+ conditioning = self.time_embedder(timestep, hidden_states.dtype) + self.pooled_text_embedder(pooled_prompt_emb)
125
+ if self.guidance_embedder is not None:
126
+ guidance = guidance * 1000
127
+ conditioning = conditioning + self.guidance_embedder(guidance, hidden_states.dtype)
128
+ prompt_emb = self.context_embedder(prompt_emb)
129
+ if self.controlnet_mode_embedder is not None: # Different from FluxDiT
130
+ processor_id = torch.tensor([self.mode_dict[processor_id]], dtype=torch.int)
131
+ processor_id = repeat(processor_id, "D -> B D", B=1).to(text_ids.device)
132
+ prompt_emb = torch.concat([self.controlnet_mode_embedder(processor_id), prompt_emb], dim=1)
133
+ text_ids = torch.cat([text_ids[:, :1], text_ids], dim=1)
134
+ image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
135
+
136
+ hidden_states = self.patchify(hidden_states)
137
+ hidden_states = self.x_embedder(hidden_states)
138
+ controlnet_conditioning = self.patchify(controlnet_conditioning) # Different from FluxDiT
139
+ hidden_states = hidden_states + self.controlnet_x_embedder(controlnet_conditioning) # Different from FluxDiT
140
+
141
+ controlnet_res_stack = []
142
+ for block, controlnet_block in zip(self.blocks, self.controlnet_blocks):
143
+ hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
144
+ controlnet_res_stack.append(controlnet_block(hidden_states))
145
+
146
+ controlnet_single_res_stack = []
147
+ hidden_states = torch.cat([prompt_emb, hidden_states], dim=1)
148
+ for block, controlnet_block in zip(self.single_blocks, self.controlnet_single_blocks):
149
+ hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
150
+ controlnet_single_res_stack.append(controlnet_block(hidden_states[:, prompt_emb.shape[1]:]))
151
+
152
+ controlnet_res_stack = self.align_res_stack_to_original_blocks(controlnet_res_stack, 19, hidden_states[:, prompt_emb.shape[1]:])
153
+ controlnet_single_res_stack = self.align_res_stack_to_original_blocks(controlnet_single_res_stack, 38, hidden_states[:, prompt_emb.shape[1]:])
154
+
155
+ return controlnet_res_stack, controlnet_single_res_stack
156
+
157
+
158
+ # @staticmethod
159
+ # def state_dict_converter():
160
+ # return FluxControlNetStateDictConverter()
161
+
162
+ def quantize(self):
163
+ def cast_to(weight, dtype=None, device=None, copy=False):
164
+ if device is None or weight.device == device:
165
+ if not copy:
166
+ if dtype is None or weight.dtype == dtype:
167
+ return weight
168
+ return weight.to(dtype=dtype, copy=copy)
169
+
170
+ r = torch.empty_like(weight, dtype=dtype, device=device)
171
+ r.copy_(weight)
172
+ return r
173
+
174
+ def cast_weight(s, input=None, dtype=None, device=None):
175
+ if input is not None:
176
+ if dtype is None:
177
+ dtype = input.dtype
178
+ if device is None:
179
+ device = input.device
180
+ weight = cast_to(s.weight, dtype, device)
181
+ return weight
182
+
183
+ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
184
+ if input is not None:
185
+ if dtype is None:
186
+ dtype = input.dtype
187
+ if bias_dtype is None:
188
+ bias_dtype = dtype
189
+ if device is None:
190
+ device = input.device
191
+ bias = None
192
+ weight = cast_to(s.weight, dtype, device)
193
+ bias = cast_to(s.bias, bias_dtype, device)
194
+ return weight, bias
195
+
196
+ class quantized_layer:
197
+ class QLinear(torch.nn.Linear):
198
+ def __init__(self, *args, **kwargs):
199
+ super().__init__(*args, **kwargs)
200
+
201
+ def forward(self,input,**kwargs):
202
+ weight,bias= cast_bias_weight(self,input)
203
+ return torch.nn.functional.linear(input,weight,bias)
204
+
205
+ class QRMSNorm(torch.nn.Module):
206
+ def __init__(self, module):
207
+ super().__init__()
208
+ self.module = module
209
+
210
+ def forward(self,hidden_states,**kwargs):
211
+ weight= cast_weight(self.module,hidden_states)
212
+ input_dtype = hidden_states.dtype
213
+ variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
214
+ hidden_states = hidden_states * torch.rsqrt(variance + self.module.eps)
215
+ hidden_states = hidden_states.to(input_dtype) * weight
216
+ return hidden_states
217
+
218
+ class QEmbedding(torch.nn.Embedding):
219
+ def __init__(self, *args, **kwargs):
220
+ super().__init__(*args, **kwargs)
221
+
222
+ def forward(self,input,**kwargs):
223
+ weight= cast_weight(self,input)
224
+ return torch.nn.functional.embedding(
225
+ input, weight, self.padding_idx, self.max_norm,
226
+ self.norm_type, self.scale_grad_by_freq, self.sparse)
227
+
228
+ def replace_layer(model):
229
+ for name, module in model.named_children():
230
+ if isinstance(module,quantized_layer.QRMSNorm):
231
+ continue
232
+ if isinstance(module, torch.nn.Linear):
233
+ with init_weights_on_device():
234
+ new_layer = quantized_layer.QLinear(module.in_features,module.out_features)
235
+ new_layer.weight = module.weight
236
+ if module.bias is not None:
237
+ new_layer.bias = module.bias
238
+ setattr(model, name, new_layer)
239
+ elif isinstance(module, RMSNorm):
240
+ if hasattr(module,"quantized"):
241
+ continue
242
+ module.quantized = True
243
+ new_layer = quantized_layer.QRMSNorm(module)
244
+ setattr(model, name, new_layer)
245
+ elif isinstance(module,torch.nn.Embedding):
246
+ rows, cols = module.weight.shape
247
+ new_layer = quantized_layer.QEmbedding(
248
+ num_embeddings=rows,
249
+ embedding_dim=cols,
250
+ _weight=module.weight,
251
+ # _freeze=module.freeze,
252
+ padding_idx=module.padding_idx,
253
+ max_norm=module.max_norm,
254
+ norm_type=module.norm_type,
255
+ scale_grad_by_freq=module.scale_grad_by_freq,
256
+ sparse=module.sparse)
257
+ setattr(model, name, new_layer)
258
+ else:
259
+ replace_layer(module)
260
+
261
+ replace_layer(self)
262
+
263
+
264
+
265
+class FluxControlNetStateDictConverter:
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        hash_value = hash_state_dict_keys(state_dict)
+        global_rename_dict = {
+            "context_embedder": "context_embedder",
+            "x_embedder": "x_embedder",
+            "time_text_embed.timestep_embedder.linear_1": "time_embedder.timestep_embedder.0",
+            "time_text_embed.timestep_embedder.linear_2": "time_embedder.timestep_embedder.2",
+            "time_text_embed.guidance_embedder.linear_1": "guidance_embedder.timestep_embedder.0",
+            "time_text_embed.guidance_embedder.linear_2": "guidance_embedder.timestep_embedder.2",
+            "time_text_embed.text_embedder.linear_1": "pooled_text_embedder.0",
+            "time_text_embed.text_embedder.linear_2": "pooled_text_embedder.2",
+            "norm_out.linear": "final_norm_out.linear",
+            "proj_out": "final_proj_out",
+        }
+        rename_dict = {
+            "proj_out": "proj_out",
+            "norm1.linear": "norm1_a.linear",
+            "norm1_context.linear": "norm1_b.linear",
+            "attn.to_q": "attn.a_to_q",
+            "attn.to_k": "attn.a_to_k",
+            "attn.to_v": "attn.a_to_v",
+            "attn.to_out.0": "attn.a_to_out",
+            "attn.add_q_proj": "attn.b_to_q",
+            "attn.add_k_proj": "attn.b_to_k",
+            "attn.add_v_proj": "attn.b_to_v",
+            "attn.to_add_out": "attn.b_to_out",
+            "ff.net.0.proj": "ff_a.0",
+            "ff.net.2": "ff_a.2",
+            "ff_context.net.0.proj": "ff_b.0",
+            "ff_context.net.2": "ff_b.2",
+            "attn.norm_q": "attn.norm_q_a",
+            "attn.norm_k": "attn.norm_k_a",
+            "attn.norm_added_q": "attn.norm_q_b",
+            "attn.norm_added_k": "attn.norm_k_b",
+        }
+        rename_dict_single = {
+            "attn.to_q": "a_to_q",
+            "attn.to_k": "a_to_k",
+            "attn.to_v": "a_to_v",
+            "attn.norm_q": "norm_q_a",
+            "attn.norm_k": "norm_k_a",
+            "norm.linear": "norm.linear",
+            "proj_mlp": "proj_in_besides_attn",
+            "proj_out": "proj_out",
+        }
+        state_dict_ = {}
+        for name, param in state_dict.items():
+            if name.endswith(".weight") or name.endswith(".bias"):
+                suffix = ".weight" if name.endswith(".weight") else ".bias"
+                prefix = name[:-len(suffix)]
+                if prefix in global_rename_dict:
+                    state_dict_[global_rename_dict[prefix] + suffix] = param
+                elif prefix.startswith("transformer_blocks."):
+                    names = prefix.split(".")
+                    names[0] = "blocks"
+                    middle = ".".join(names[2:])
+                    if middle in rename_dict:
+                        name_ = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
+                        state_dict_[name_] = param
+                elif prefix.startswith("single_transformer_blocks."):
+                    names = prefix.split(".")
+                    names[0] = "single_blocks"
+                    middle = ".".join(names[2:])
+                    if middle in rename_dict_single:
+                        name_ = ".".join(names[:2] + [rename_dict_single[middle]] + [suffix[1:]])
+                        state_dict_[name_] = param
+                    else:
+                        state_dict_[name] = param
+                else:
+                    state_dict_[name] = param
+            else:
+                state_dict_[name] = param
+        for name in list(state_dict_.keys()):
+            if ".proj_in_besides_attn." in name:
+                name_ = name.replace(".proj_in_besides_attn.", ".to_qkv_mlp.")
+                param = torch.concat([
+                    state_dict_[name.replace(".proj_in_besides_attn.", ".a_to_q.")],
+                    state_dict_[name.replace(".proj_in_besides_attn.", ".a_to_k.")],
+                    state_dict_[name.replace(".proj_in_besides_attn.", ".a_to_v.")],
+                    state_dict_[name],
+                ], dim=0)
+                state_dict_[name_] = param
+                state_dict_.pop(name.replace(".proj_in_besides_attn.", ".a_to_q."))
+                state_dict_.pop(name.replace(".proj_in_besides_attn.", ".a_to_k."))
+                state_dict_.pop(name.replace(".proj_in_besides_attn.", ".a_to_v."))
+                state_dict_.pop(name)
+        for name in list(state_dict_.keys()):
+            for component in ["a", "b"]:
+                if f".{component}_to_q." in name:
+                    name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
+                    param = torch.concat([
+                        state_dict_[name],
+                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
+                        state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
+                    ], dim=0)
+                    state_dict_[name_] = param
+                    state_dict_.pop(name)
+                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
+                    state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))
+        if hash_value == "78d18b9101345ff695f312e7e62538c0":
+            extra_kwargs = {"num_mode": 10, "mode_dict": {"canny": 0, "tile": 1, "depth": 2, "blur": 3, "pose": 4, "gray": 5, "lq": 6}}
+        elif hash_value == "b001c89139b5f053c715fe772362dd2a":
+            extra_kwargs = {"num_single_blocks": 0}
+        elif hash_value == "52357cb26250681367488a8954c271e8":
+            extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0, "additional_input_dim": 4}
+        elif hash_value == "0cfd1740758423a2a854d67c136d1e8c":
+            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 1}
+        elif hash_value == "7f9583eb8ba86642abb9a21a4b2c9e16":
+            extra_kwargs = {"num_joint_blocks": 4, "num_single_blocks": 10}
+        elif hash_value == "43ad5aaa27dd4ee01b832ed16773fa52":
+            extra_kwargs = {"num_joint_blocks": 6, "num_single_blocks": 0}
+        else:
+            extra_kwargs = {}
+        return state_dict_, extra_kwargs
+
+    def from_civitai(self, state_dict):
+        return self.from_diffusers(state_dict)
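For reference, a minimal sketch of how `from_diffusers` rewrites one block-level key; the key and the single-entry rename table below are illustrative, not taken from a real checkpoint:

rename_dict = {"attn.to_q": "attn.a_to_q"}          # one entry from the table above
name = "transformer_blocks.3.attn.to_q.weight"      # illustrative diffusers-style key
suffix = ".weight" if name.endswith(".weight") else ".bias"
prefix = name[:-len(suffix)]                        # "transformer_blocks.3.attn.to_q"
names = prefix.split(".")
names[0] = "blocks"
middle = ".".join(names[2:])                        # "attn.to_q"
new_name = ".".join(names[:2] + [rename_dict[middle]] + [suffix[1:]])
assert new_name == "blocks.3.attn.a_to_q.weight"
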
diffsynth/models/flux_dit.py ADDED
@@ -0,0 +1,398 @@
+import torch
+from .general_modules import TimestepEmbeddings, AdaLayerNorm, RMSNorm
+from einops import rearrange
+
+
+def interact_with_ipadapter(hidden_states, q, ip_k, ip_v, scale=1.0):
+    batch_size, num_tokens = hidden_states.shape[0:2]
+    ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
+    ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, num_tokens, -1)
+    hidden_states = hidden_states + scale * ip_hidden_states
+    return hidden_states
+
+
+class RoPEEmbedding(torch.nn.Module):
+    def __init__(self, dim, theta, axes_dim):
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.axes_dim = axes_dim
+
+    def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
+        assert dim % 2 == 0, "The dimension must be even."
+
+        scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
+        omega = 1.0 / (theta**scale)
+
+        batch_size, seq_length = pos.shape
+        out = torch.einsum("...n,d->...nd", pos, omega)
+        cos_out = torch.cos(out)
+        sin_out = torch.sin(out)
+
+        stacked_out = torch.stack([cos_out, -sin_out, sin_out, cos_out], dim=-1)
+        out = stacked_out.view(batch_size, -1, dim // 2, 2, 2)
+        return out.float()
+
+    def forward(self, ids):
+        n_axes = ids.shape[-1]
+        emb = torch.cat([self.rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)], dim=-3)
+        return emb.unsqueeze(1)
+
+
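A small standalone check of the `rope`/`apply_rope` math above: each even/odd channel pair is rotated by the angle `pos * omega_i`. The sizes below are illustrative:

import torch

dim, theta = 4, 10000
pos = torch.tensor([[3.0]])                              # (batch=1, seq=1), position 3
scale = torch.arange(0, dim, 2, dtype=torch.float64) / dim
omega = 1.0 / (theta ** scale)                           # per-pair frequencies
out = torch.einsum("...n,d->...nd", pos, omega)
freqs = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
freqs = freqs.view(1, -1, dim // 2, 2, 2).float()        # (B, S, dim/2, 2, 2)

x = torch.randn(1, 1, dim)
x_ = x.reshape(1, 1, -1, 1, 2)                           # same indexing as apply_rope
rotated = freqs[..., 0] * x_[..., 0] + freqs[..., 1] * x_[..., 1]
angle = (pos * omega[0]).float()
expected = x[0, 0, 0] * torch.cos(angle) - x[0, 0, 1] * torch.sin(angle)
assert torch.allclose(rotated[0, 0, 0, 0], expected.squeeze(), atol=1e-5)
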
+class FluxJointAttention(torch.nn.Module):
+    def __init__(self, dim_a, dim_b, num_heads, head_dim, only_out_a=False):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.only_out_a = only_out_a
+
+        self.a_to_qkv = torch.nn.Linear(dim_a, dim_a * 3)
+        self.b_to_qkv = torch.nn.Linear(dim_b, dim_b * 3)
+
+        self.norm_q_a = RMSNorm(head_dim, eps=1e-6)
+        self.norm_k_a = RMSNorm(head_dim, eps=1e-6)
+        self.norm_q_b = RMSNorm(head_dim, eps=1e-6)
+        self.norm_k_b = RMSNorm(head_dim, eps=1e-6)
+
+        self.a_to_out = torch.nn.Linear(dim_a, dim_a)
+        if not only_out_a:
+            self.b_to_out = torch.nn.Linear(dim_b, dim_b)
+
+    def apply_rope(self, xq, xk, freqs_cis):
+        xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+        xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+        xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+        xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+        return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+
+    def forward(self, hidden_states_a, hidden_states_b, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
+        batch_size = hidden_states_a.shape[0]
+
+        # Part A
+        qkv_a = self.a_to_qkv(hidden_states_a)
+        qkv_a = qkv_a.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
+        q_a, k_a, v_a = qkv_a.chunk(3, dim=1)
+        q_a, k_a = self.norm_q_a(q_a), self.norm_k_a(k_a)
+
+        # Part B
+        qkv_b = self.b_to_qkv(hidden_states_b)
+        qkv_b = qkv_b.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
+        q_b, k_b, v_b = qkv_b.chunk(3, dim=1)
+        q_b, k_b = self.norm_q_b(q_b), self.norm_k_b(k_b)
+
+        q = torch.concat([q_b, q_a], dim=2)
+        k = torch.concat([k_b, k_a], dim=2)
+        v = torch.concat([v_b, v_a], dim=2)
+
+        q, k = self.apply_rope(q, k, image_rotary_emb)
+
+        hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
+        hidden_states = hidden_states.to(q.dtype)
+        hidden_states_b, hidden_states_a = hidden_states[:, :hidden_states_b.shape[1]], hidden_states[:, hidden_states_b.shape[1]:]
+        if ipadapter_kwargs_list is not None:
+            hidden_states_a = interact_with_ipadapter(hidden_states_a, q_a, **ipadapter_kwargs_list)
+        hidden_states_a = self.a_to_out(hidden_states_a)
+        if self.only_out_a:
+            return hidden_states_a
+        else:
+            hidden_states_b = self.b_to_out(hidden_states_b)
+            return hidden_states_a, hidden_states_b
+
+
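Note the concatenation order in `FluxJointAttention`: the text (B) stream goes first along the sequence axis and the attention output is split back in the same order. A shape-only sketch (token counts are illustrative):

import torch

q_text = torch.randn(1, 24, 77, 128)     # (B, heads, text_len, head_dim)
q_img = torch.randn(1, 24, 1024, 128)    # 1024 image tokens
q = torch.concat([q_text, q_img], dim=2)
text_out, img_out = q[:, :, :77], q[:, :, 77:]   # same split applied to the attention output
assert text_out.shape[2] == 77 and img_out.shape[2] == 1024
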
+class FluxJointTransformerBlock(torch.nn.Module):
+    def __init__(self, dim, num_attention_heads):
+        super().__init__()
+        self.norm1_a = AdaLayerNorm(dim)
+        self.norm1_b = AdaLayerNorm(dim)
+
+        self.attn = FluxJointAttention(dim, dim, num_attention_heads, dim // num_attention_heads)
+
+        self.norm2_a = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff_a = torch.nn.Sequential(
+            torch.nn.Linear(dim, dim*4),
+            torch.nn.GELU(approximate="tanh"),
+            torch.nn.Linear(dim*4, dim)
+        )
+
+        self.norm2_b = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+        self.ff_b = torch.nn.Sequential(
+            torch.nn.Linear(dim, dim*4),
+            torch.nn.GELU(approximate="tanh"),
+            torch.nn.Linear(dim*4, dim)
+        )
+
+    def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
+        norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
+        norm_hidden_states_b, gate_msa_b, shift_mlp_b, scale_mlp_b, gate_mlp_b = self.norm1_b(hidden_states_b, emb=temb)
+
+        # Attention
+        attn_output_a, attn_output_b = self.attn(norm_hidden_states_a, norm_hidden_states_b, image_rotary_emb, attn_mask, ipadapter_kwargs_list)
+
+        # Part A
+        hidden_states_a = hidden_states_a + gate_msa_a * attn_output_a
+        norm_hidden_states_a = self.norm2_a(hidden_states_a) * (1 + scale_mlp_a) + shift_mlp_a
+        hidden_states_a = hidden_states_a + gate_mlp_a * self.ff_a(norm_hidden_states_a)
+
+        # Part B
+        hidden_states_b = hidden_states_b + gate_msa_b * attn_output_b
+        norm_hidden_states_b = self.norm2_b(hidden_states_b) * (1 + scale_mlp_b) + shift_mlp_b
+        hidden_states_b = hidden_states_b + gate_mlp_b * self.ff_b(norm_hidden_states_b)
+
+        return hidden_states_a, hidden_states_b
+
+
+class FluxSingleAttention(torch.nn.Module):
+    def __init__(self, dim_a, dim_b, num_heads, head_dim):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+
+        self.a_to_qkv = torch.nn.Linear(dim_a, dim_a * 3)
+
+        self.norm_q_a = RMSNorm(head_dim, eps=1e-6)
+        self.norm_k_a = RMSNorm(head_dim, eps=1e-6)
+
+    def apply_rope(self, xq, xk, freqs_cis):
+        xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+        xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+        xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+        xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+        return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+
+    def forward(self, hidden_states, image_rotary_emb):
+        batch_size = hidden_states.shape[0]
+
+        qkv_a = self.a_to_qkv(hidden_states)
+        qkv_a = qkv_a.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
+        q_a, k_a, v = qkv_a.chunk(3, dim=1)
+        q_a, k_a = self.norm_q_a(q_a), self.norm_k_a(k_a)
+
+        q, k = self.apply_rope(q_a, k_a, image_rotary_emb)
+
+        hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
+        hidden_states = hidden_states.to(q.dtype)
+        return hidden_states
+
+
+class AdaLayerNormSingle(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.silu = torch.nn.SiLU()
+        self.linear = torch.nn.Linear(dim, 3 * dim, bias=True)
+        self.norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
+
+    def forward(self, x, emb):
+        emb = self.linear(self.silu(emb))
+        shift_msa, scale_msa, gate_msa = emb.chunk(3, dim=1)
+        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        return x, gate_msa
+
+
+class FluxSingleTransformerBlock(torch.nn.Module):
+    def __init__(self, dim, num_attention_heads):
+        super().__init__()
+        self.num_heads = num_attention_heads
+        self.head_dim = dim // num_attention_heads
+        self.dim = dim
+
+        self.norm = AdaLayerNormSingle(dim)
+        self.to_qkv_mlp = torch.nn.Linear(dim, dim * (3 + 4))
+        self.norm_q_a = RMSNorm(self.head_dim, eps=1e-6)
+        self.norm_k_a = RMSNorm(self.head_dim, eps=1e-6)
+
+        self.proj_out = torch.nn.Linear(dim * 5, dim)
+
+    def apply_rope(self, xq, xk, freqs_cis):
+        xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+        xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+        xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+        xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+        return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+
+    def process_attention(self, hidden_states, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
+        batch_size = hidden_states.shape[0]
+
+        qkv = hidden_states.view(batch_size, -1, 3 * self.num_heads, self.head_dim).transpose(1, 2)
+        q, k, v = qkv.chunk(3, dim=1)
+        q, k = self.norm_q_a(q), self.norm_k_a(k)
+
+        q, k = self.apply_rope(q, k, image_rotary_emb)
+
+        hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
+        hidden_states = hidden_states.to(q.dtype)
+        if ipadapter_kwargs_list is not None:
+            hidden_states = interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs_list)
+        return hidden_states
+
+    def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb, attn_mask=None, ipadapter_kwargs_list=None):
+        residual = hidden_states_a
+        norm_hidden_states, gate = self.norm(hidden_states_a, emb=temb)
+        hidden_states_a = self.to_qkv_mlp(norm_hidden_states)
+        attn_output, mlp_hidden_states = hidden_states_a[:, :, :self.dim * 3], hidden_states_a[:, :, self.dim * 3:]
+
+        attn_output = self.process_attention(attn_output, image_rotary_emb, attn_mask, ipadapter_kwargs_list)
+        mlp_hidden_states = torch.nn.functional.gelu(mlp_hidden_states, approximate="tanh")
+
+        hidden_states_a = torch.cat([attn_output, mlp_hidden_states], dim=2)
+        hidden_states_a = gate.unsqueeze(1) * self.proj_out(hidden_states_a)
+        hidden_states_a = residual + hidden_states_a
+
+        return hidden_states_a, hidden_states_b
+
+
+class AdaLayerNormContinuous(torch.nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.silu = torch.nn.SiLU()
+        self.linear = torch.nn.Linear(dim, dim * 2, bias=True)
+        self.norm = torch.nn.LayerNorm(dim, eps=1e-6, elementwise_affine=False)
+
+    def forward(self, x, conditioning):
+        emb = self.linear(self.silu(conditioning))
+        shift, scale = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None] + shift[:, None]
+        return x
+
+
278
+
279
+ _repeated_blocks = ["FluxJointTransformerBlock", "FluxSingleTransformerBlock"]
280
+
281
+ def __init__(self, disable_guidance_embedder=False, input_dim=64, num_blocks=19):
282
+ super().__init__()
283
+ self.pos_embedder = RoPEEmbedding(3072, 10000, [16, 56, 56])
284
+ self.time_embedder = TimestepEmbeddings(256, 3072)
285
+ self.guidance_embedder = None if disable_guidance_embedder else TimestepEmbeddings(256, 3072)
286
+ self.pooled_text_embedder = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.SiLU(), torch.nn.Linear(3072, 3072))
287
+ self.context_embedder = torch.nn.Linear(4096, 3072)
288
+ self.x_embedder = torch.nn.Linear(input_dim, 3072)
289
+
290
+ self.blocks = torch.nn.ModuleList([FluxJointTransformerBlock(3072, 24) for _ in range(num_blocks)])
291
+ self.single_blocks = torch.nn.ModuleList([FluxSingleTransformerBlock(3072, 24) for _ in range(38)])
292
+
293
+ self.final_norm_out = AdaLayerNormContinuous(3072)
294
+ self.final_proj_out = torch.nn.Linear(3072, 64)
295
+
296
+ self.input_dim = input_dim
297
+
298
+
299
+ def patchify(self, hidden_states):
300
+ hidden_states = rearrange(hidden_states, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
301
+ return hidden_states
302
+
303
+
304
+ def unpatchify(self, hidden_states, height, width):
305
+ hidden_states = rearrange(hidden_states, "B (H W) (C P Q) -> B C (H P) (W Q)", P=2, Q=2, H=height//2, W=width//2)
306
+ return hidden_states
307
+
308
+
309
+ def prepare_image_ids(self, latents):
310
+ batch_size, _, height, width = latents.shape
311
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3)
312
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
313
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
314
+
315
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
316
+
317
+ latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
318
+ latent_image_ids = latent_image_ids.reshape(
319
+ batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels
320
+ )
321
+ latent_image_ids = latent_image_ids.to(device=latents.device, dtype=latents.dtype)
322
+
323
+ return latent_image_ids
324
+
325
+
326
+ def construct_mask(self, entity_masks, prompt_seq_len, image_seq_len):
327
+ N = len(entity_masks)
328
+ batch_size = entity_masks[0].shape[0]
329
+ total_seq_len = N * prompt_seq_len + image_seq_len
330
+ patched_masks = [self.patchify(entity_masks[i]) for i in range(N)]
331
+ attention_mask = torch.ones((batch_size, total_seq_len, total_seq_len), dtype=torch.bool).to(device=entity_masks[0].device)
332
+
333
+ image_start = N * prompt_seq_len
334
+ image_end = N * prompt_seq_len + image_seq_len
335
+ # prompt-image mask
336
+ for i in range(N):
337
+ prompt_start = i * prompt_seq_len
338
+ prompt_end = (i + 1) * prompt_seq_len
339
+ image_mask = torch.sum(patched_masks[i], dim=-1) > 0
340
+ image_mask = image_mask.unsqueeze(1).repeat(1, prompt_seq_len, 1)
341
+ # prompt update with image
342
+ attention_mask[:, prompt_start:prompt_end, image_start:image_end] = image_mask
343
+ # image update with prompt
344
+ attention_mask[:, image_start:image_end, prompt_start:prompt_end] = image_mask.transpose(1, 2)
345
+ # prompt-prompt mask
346
+ for i in range(N):
347
+ for j in range(N):
348
+ if i != j:
349
+ prompt_start_i = i * prompt_seq_len
350
+ prompt_end_i = (i + 1) * prompt_seq_len
351
+ prompt_start_j = j * prompt_seq_len
352
+ prompt_end_j = (j + 1) * prompt_seq_len
353
+ attention_mask[:, prompt_start_i:prompt_end_i, prompt_start_j:prompt_end_j] = False
354
+
355
+ attention_mask = attention_mask.float()
356
+ attention_mask[attention_mask == 0] = float('-inf')
357
+ attention_mask[attention_mask == 1] = 0
358
+ return attention_mask
359
+
360
+
361
+ def process_entity_masks(self, hidden_states, prompt_emb, entity_prompt_emb, entity_masks, text_ids, image_ids, repeat_dim):
362
+ max_masks = 0
363
+ attention_mask = None
364
+ prompt_embs = [prompt_emb]
365
+ if entity_masks is not None:
366
+ # entity_masks
367
+ batch_size, max_masks = entity_masks.shape[0], entity_masks.shape[1]
368
+ entity_masks = entity_masks.repeat(1, 1, repeat_dim, 1, 1)
369
+ entity_masks = [entity_masks[:, i, None].squeeze(1) for i in range(max_masks)]
370
+ # global mask
371
+ global_mask = torch.ones_like(entity_masks[0]).to(device=hidden_states.device, dtype=hidden_states.dtype)
372
+ entity_masks = entity_masks + [global_mask] # append global to last
373
+ # attention mask
374
+ attention_mask = self.construct_mask(entity_masks, prompt_emb.shape[1], hidden_states.shape[1])
375
+ attention_mask = attention_mask.to(device=hidden_states.device, dtype=hidden_states.dtype)
376
+ attention_mask = attention_mask.unsqueeze(1)
377
+ # embds: n_masks * b * seq * d
378
+ local_embs = [entity_prompt_emb[:, i, None].squeeze(1) for i in range(max_masks)]
379
+ prompt_embs = local_embs + prompt_embs # append global to last
380
+ prompt_embs = [self.context_embedder(prompt_emb) for prompt_emb in prompt_embs]
381
+ prompt_emb = torch.cat(prompt_embs, dim=1)
382
+
383
+ # positional embedding
384
+ text_ids = torch.cat([text_ids] * (max_masks + 1), dim=1)
385
+ image_rotary_emb = self.pos_embedder(torch.cat((text_ids, image_ids), dim=1))
386
+ return prompt_emb, image_rotary_emb, attention_mask
387
+
388
+
389
+ def forward(
390
+ self,
391
+ hidden_states,
392
+ timestep, prompt_emb, pooled_prompt_emb, guidance, text_ids, image_ids=None,
393
+ tiled=False, tile_size=128, tile_stride=64, entity_prompt_emb=None, entity_masks=None,
394
+ use_gradient_checkpointing=False,
395
+ **kwargs
396
+ ):
397
+ # (Deprecated) The real forward is in `pipelines.flux_image`.
398
+ return None
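A quick round-trip check of the 2×2 `patchify`/`unpatchify` pair used by `FluxDiT` (latent sizes are illustrative):

import torch
from einops import rearrange

x = torch.randn(1, 16, 8, 8)                     # (B, C, H, W) latent
patched = rearrange(x, "B C (H P) (W Q) -> B (H W) (C P Q)", P=2, Q=2)
assert patched.shape == (1, 16, 64)              # (8//2)*(8//2) tokens, 16*2*2 channels
restored = rearrange(patched, "B (H W) (C P Q) -> B C (H P) (W Q)", P=2, Q=2, H=4, W=4)
assert torch.equal(x, restored)
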
diffsynth/models/flux_infiniteyou.py ADDED
@@ -0,0 +1,129 @@
+import math
+import torch
+import torch.nn as nn
+
+
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+
+
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # make contiguous; shape stays (bs, n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latents (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class InfiniteYouImageProjector(nn.Module):
+
+    def __init__(
+        self,
+        dim=1280,
+        depth=4,
+        dim_head=64,
+        heads=20,
+        num_queries=8,
+        embedding_dim=512,
+        output_dim=4096,
+        ff_mult=4,
+    ):
+        super().__init__()
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList([
+                    PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                    FeedForward(dim=dim, mult=ff_mult),
+                ]))
+
+    def forward(self, x):
+
+        latents = self.latents.repeat(x.size(0), 1, 1)
+        latents = latents.to(dtype=x.dtype, device=x.device)
+
+        x = self.proj_in(x)
+
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
+
+    @staticmethod
+    def state_dict_converter():
+        return FluxInfiniteYouImageProjectorStateDictConverter()
+
+
+class FluxInfiniteYouImageProjectorStateDictConverter:
+
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        return state_dict['image_proj']
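A minimal usage sketch of the projector, assuming the default hyperparameters; the input token count is illustrative:

import torch

proj = InfiniteYouImageProjector()      # defaults: 8 queries, 512-d input, 4096-d output
face_embeds = torch.randn(2, 1, 512)    # (batch, n_tokens, embedding_dim)
tokens = proj(face_embeds)
assert tokens.shape == (2, 8, 4096)
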
diffsynth/models/flux_ipadapter.py ADDED
@@ -0,0 +1,110 @@
+from .general_modules import RMSNorm
+from transformers import SiglipVisionModel, SiglipVisionConfig
+import torch
+
+
+class SiglipVisionModelSO400M(SiglipVisionModel):
+    def __init__(self):
+        config = SiglipVisionConfig(
+            hidden_size=1152,
+            image_size=384,
+            intermediate_size=4304,
+            model_type="siglip_vision_model",
+            num_attention_heads=16,
+            num_hidden_layers=27,
+            patch_size=14,
+            architectures=["SiglipModel"],
+            initializer_factor=1.0,
+            torch_dtype="float32",
+            transformers_version="4.37.0.dev0"
+        )
+        super().__init__(config)
+
+
+class MLPProjModel(torch.nn.Module):
+    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
+        super().__init__()
+
+        self.cross_attention_dim = cross_attention_dim
+        self.num_tokens = num_tokens
+
+        self.proj = torch.nn.Sequential(
+            torch.nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
+            torch.nn.GELU(),
+            torch.nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
+        )
+        self.norm = torch.nn.LayerNorm(cross_attention_dim)
+
+    def forward(self, id_embeds):
+        x = self.proj(id_embeds)
+        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
+        x = self.norm(x)
+        return x
+
+
+class IpAdapterModule(torch.nn.Module):
+    def __init__(self, num_attention_heads, attention_head_dim, input_dim):
+        super().__init__()
+        self.num_heads = num_attention_heads
+        self.head_dim = attention_head_dim
+        output_dim = num_attention_heads * attention_head_dim
+        self.to_k_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
+        self.to_v_ip = torch.nn.Linear(input_dim, output_dim, bias=False)
+        self.norm_added_k = RMSNorm(attention_head_dim, eps=1e-5, elementwise_affine=False)
+
+    def forward(self, hidden_states):
+        batch_size = hidden_states.shape[0]
+        # ip_k
+        ip_k = self.to_k_ip(hidden_states)
+        ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        ip_k = self.norm_added_k(ip_k)
+        # ip_v
+        ip_v = self.to_v_ip(hidden_states)
+        ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        return ip_k, ip_v
+
+
+class FluxIpAdapter(torch.nn.Module):
+    def __init__(self, num_attention_heads=24, attention_head_dim=128, cross_attention_dim=4096, num_tokens=128, num_blocks=57):
+        super().__init__()
+        self.ipadapter_modules = torch.nn.ModuleList([IpAdapterModule(num_attention_heads, attention_head_dim, cross_attention_dim) for _ in range(num_blocks)])
+        self.image_proj = MLPProjModel(cross_attention_dim=cross_attention_dim, id_embeddings_dim=1152, num_tokens=num_tokens)
+        self.set_adapter()
+
+    def set_adapter(self):
+        self.call_block_id = {i: i for i in range(len(self.ipadapter_modules))}
+
+    def forward(self, hidden_states, scale=1.0):
+        hidden_states = self.image_proj(hidden_states)
+        hidden_states = hidden_states.view(1, -1, hidden_states.shape[-1])
+        ip_kv_dict = {}
+        for block_id in self.call_block_id:
+            ipadapter_id = self.call_block_id[block_id]
+            ip_k, ip_v = self.ipadapter_modules[ipadapter_id](hidden_states)
+            ip_kv_dict[block_id] = {
+                "ip_k": ip_k,
+                "ip_v": ip_v,
+                "scale": scale
+            }
+        return ip_kv_dict
+
+    @staticmethod
+    def state_dict_converter():
+        return FluxIpAdapterStateDictConverter()
+
+
+class FluxIpAdapterStateDictConverter:
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        state_dict_ = {}
+        for name in state_dict["ip_adapter"]:
+            name_ = 'ipadapter_modules.' + name
+            state_dict_[name_] = state_dict["ip_adapter"][name]
+        for name in state_dict["image_proj"]:
+            name_ = "image_proj." + name
+            state_dict_[name_] = state_dict["image_proj"][name]
+        return state_dict_
+
+    def from_civitai(self, state_dict):
+        return self.from_diffusers(state_dict)
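A minimal usage sketch, assuming the default configuration (57 modules matching the DiT's 19 joint + 38 single blocks); the pooled SigLIP feature below is illustrative:

import torch

adapter = FluxIpAdapter()
image_embeds = torch.randn(1, 1152)     # pooled SigLIP embedding
ip_kv = adapter(image_embeds, scale=0.7)
assert len(ip_kv) == 57
assert ip_kv[0]["ip_k"].shape == (1, 24, 128, 128)   # (B, heads, num_tokens, head_dim)
assert ip_kv[0]["scale"] == 0.7
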
diffsynth/models/flux_lora_encoder.py ADDED
@@ -0,0 +1,521 @@
+import torch
+from einops import rearrange
+
+
+def low_version_attention(query, key, value, attn_bias=None):
+    scale = 1 / query.shape[-1] ** 0.5
+    query = query * scale
+    attn = torch.matmul(query, key.transpose(-2, -1))
+    if attn_bias is not None:
+        attn = attn + attn_bias
+    attn = attn.softmax(-1)
+    return attn @ value
+
+
+class Attention(torch.nn.Module):
+
+    def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
+        super().__init__()
+        dim_inner = head_dim * num_heads
+        kv_dim = kv_dim if kv_dim is not None else q_dim
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+
+        self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
+        self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
+        self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
+        self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
+
+    def interact_with_ipadapter(self, hidden_states, q, ip_k, ip_v, scale=1.0):
+        batch_size = q.shape[0]
+        ip_k = ip_k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        ip_v = ip_v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        ip_hidden_states = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v)
+        hidden_states = hidden_states + scale * ip_hidden_states
+        return hidden_states
+
+    def torch_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        batch_size = encoder_hidden_states.shape[0]
+
+        q = self.to_q(hidden_states)
+        k = self.to_k(encoder_hidden_states)
+        v = self.to_v(encoder_hidden_states)
+
+        q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if qkv_preprocessor is not None:
+            q, k, v = qkv_preprocessor(q, k, v)
+
+        hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+        if ipadapter_kwargs is not None:
+            hidden_states = self.interact_with_ipadapter(hidden_states, q, **ipadapter_kwargs)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
+        hidden_states = hidden_states.to(q.dtype)
+
+        hidden_states = self.to_out(hidden_states)
+
+        return hidden_states
+
+    def xformers_forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        q = self.to_q(hidden_states)
+        k = self.to_k(encoder_hidden_states)
+        v = self.to_v(encoder_hidden_states)
+
+        q = rearrange(q, "b f (n d) -> (b n) f d", n=self.num_heads)
+        k = rearrange(k, "b f (n d) -> (b n) f d", n=self.num_heads)
+        v = rearrange(v, "b f (n d) -> (b n) f d", n=self.num_heads)
+
+        if attn_mask is not None:
+            hidden_states = low_version_attention(q, k, v, attn_bias=attn_mask)
+        else:
+            import xformers.ops as xops
+            hidden_states = xops.memory_efficient_attention(q, k, v)
+        hidden_states = rearrange(hidden_states, "(b n) f d -> b f (n d)", n=self.num_heads)
+
+        hidden_states = hidden_states.to(q.dtype)
+        hidden_states = self.to_out(hidden_states)
+
+        return hidden_states
+
+    def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None, ipadapter_kwargs=None, qkv_preprocessor=None):
+        return self.torch_forward(hidden_states, encoder_hidden_states=encoder_hidden_states, attn_mask=attn_mask, ipadapter_kwargs=ipadapter_kwargs, qkv_preprocessor=qkv_preprocessor)
+
+
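A quick standalone check that the manual fallback above matches PyTorch's fused SDPA under the default 1/sqrt(d) scaling (tensor sizes are illustrative):

import torch

q, k, v = (torch.randn(2, 4, 8, 16) for _ in range(3))
manual = low_version_attention(q, k, v)
fused = torch.nn.functional.scaled_dot_product_attention(q, k, v)
assert torch.allclose(manual, fused, atol=1e-5)
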
+class CLIPEncoderLayer(torch.nn.Module):
+    def __init__(self, embed_dim, intermediate_size, num_heads=12, head_dim=64, use_quick_gelu=True):
+        super().__init__()
+        self.attn = Attention(q_dim=embed_dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
+        self.layer_norm1 = torch.nn.LayerNorm(embed_dim)
+        self.layer_norm2 = torch.nn.LayerNorm(embed_dim)
+        self.fc1 = torch.nn.Linear(embed_dim, intermediate_size)
+        self.fc2 = torch.nn.Linear(intermediate_size, embed_dim)
+
+        self.use_quick_gelu = use_quick_gelu
+
+    def quickGELU(self, x):
+        return x * torch.sigmoid(1.702 * x)
+
+    def forward(self, hidden_states, attn_mask=None):
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.fc1(hidden_states)
+        if self.use_quick_gelu:
+            hidden_states = self.quickGELU(hidden_states)
+        else:
+            hidden_states = torch.nn.functional.gelu(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class SDTextEncoder(torch.nn.Module):
+    def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=12, encoder_intermediate_size=3072):
+        super().__init__()
+
+        # token_embedding
+        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
+
+        # position_embeds (This is a fixed tensor)
+        self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
+
+        # encoders
+        self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
+
+        # attn_mask
+        self.attn_mask = self.attention_mask(max_position_embeddings)
+
+        # final_layer_norm
+        self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
+
+    def attention_mask(self, length):
+        mask = torch.empty(length, length)
+        mask.fill_(float("-inf"))
+        mask.triu_(1)
+        return mask
+
+    def forward(self, input_ids, clip_skip=1):
+        embeds = self.token_embedding(input_ids) + self.position_embeds
+        attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
+        for encoder_id, encoder in enumerate(self.encoders):
+            embeds = encoder(embeds, attn_mask=attn_mask)
+            if encoder_id + clip_skip == len(self.encoders):
+                break
+        embeds = self.final_layer_norm(embeds)
+        return embeds
+
+    @staticmethod
+    def state_dict_converter():
+        return SDTextEncoderStateDictConverter()
+
+
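The `attention_mask` above is strictly causal: `-inf` above the diagonal and zero elsewhere, so token i attends only to positions at or before i, while `clip_skip` stops encoding `clip_skip - 1` layers before the end. A small sketch of the mask construction:

import torch

mask = torch.empty(4, 4)
mask.fill_(float("-inf"))
mask.triu_(1)
print(mask)
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])
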
+class SDTextEncoderStateDictConverter:
+    def __init__(self):
+        pass
+
+    def from_diffusers(self, state_dict):
+        rename_dict = {
+            "text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "text_model.embeddings.position_embedding.weight": "position_embeds",
+            "text_model.final_layer_norm.weight": "final_layer_norm.weight",
+            "text_model.final_layer_norm.bias": "final_layer_norm.bias"
+        }
+        attn_rename_dict = {
+            "self_attn.q_proj": "attn.to_q",
+            "self_attn.k_proj": "attn.to_k",
+            "self_attn.v_proj": "attn.to_v",
+            "self_attn.out_proj": "attn.to_out",
+            "layer_norm1": "layer_norm1",
+            "layer_norm2": "layer_norm2",
+            "mlp.fc1": "fc1",
+            "mlp.fc2": "fc2",
+        }
+        state_dict_ = {}
+        for name in state_dict:
+            if name in rename_dict:
+                param = state_dict[name]
+                if name == "text_model.embeddings.position_embedding.weight":
+                    param = param.reshape((1, param.shape[0], param.shape[1]))
+                state_dict_[rename_dict[name]] = param
+            elif name.startswith("text_model.encoder.layers."):
+                param = state_dict[name]
+                names = name.split(".")
+                layer_id, layer_type, tail = names[3], ".".join(names[4:-1]), names[-1]
+                name_ = ".".join(["encoders", layer_id, attn_rename_dict[layer_type], tail])
+                state_dict_[name_] = param
+        return state_dict_
+
+    def from_civitai(self, state_dict):
+        rename_dict = {
+            "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.bias": "encoders.0.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm1.weight": "encoders.0.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.bias": "encoders.0.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.layer_norm2.weight": "encoders.0.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.bias": "encoders.0.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc1.weight": "encoders.0.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.bias": "encoders.0.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.mlp.fc2.weight": "encoders.0.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.bias": "encoders.0.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.k_proj.weight": "encoders.0.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.bias": "encoders.0.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.out_proj.weight": "encoders.0.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.bias": "encoders.0.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.q_proj.weight": "encoders.0.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.bias": "encoders.0.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.0.self_attn.v_proj.weight": "encoders.0.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.bias": "encoders.1.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm1.weight": "encoders.1.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.bias": "encoders.1.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.layer_norm2.weight": "encoders.1.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.bias": "encoders.1.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc1.weight": "encoders.1.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.bias": "encoders.1.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.mlp.fc2.weight": "encoders.1.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.bias": "encoders.1.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.k_proj.weight": "encoders.1.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.bias": "encoders.1.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.out_proj.weight": "encoders.1.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.bias": "encoders.1.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.q_proj.weight": "encoders.1.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.bias": "encoders.1.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.1.self_attn.v_proj.weight": "encoders.1.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.bias": "encoders.10.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm1.weight": "encoders.10.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.bias": "encoders.10.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.layer_norm2.weight": "encoders.10.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.bias": "encoders.10.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc1.weight": "encoders.10.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.bias": "encoders.10.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.mlp.fc2.weight": "encoders.10.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.bias": "encoders.10.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.k_proj.weight": "encoders.10.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.bias": "encoders.10.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.out_proj.weight": "encoders.10.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.bias": "encoders.10.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.q_proj.weight": "encoders.10.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.bias": "encoders.10.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.10.self_attn.v_proj.weight": "encoders.10.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.bias": "encoders.11.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm1.weight": "encoders.11.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.bias": "encoders.11.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.layer_norm2.weight": "encoders.11.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.bias": "encoders.11.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc1.weight": "encoders.11.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.bias": "encoders.11.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.mlp.fc2.weight": "encoders.11.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias": "encoders.11.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight": "encoders.11.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.bias": "encoders.11.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.out_proj.weight": "encoders.11.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias": "encoders.11.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight": "encoders.11.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias": "encoders.11.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight": "encoders.11.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.bias": "encoders.2.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm1.weight": "encoders.2.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.bias": "encoders.2.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.layer_norm2.weight": "encoders.2.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.bias": "encoders.2.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc1.weight": "encoders.2.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.bias": "encoders.2.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.mlp.fc2.weight": "encoders.2.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.bias": "encoders.2.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.k_proj.weight": "encoders.2.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.bias": "encoders.2.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.out_proj.weight": "encoders.2.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.bias": "encoders.2.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.q_proj.weight": "encoders.2.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.bias": "encoders.2.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.2.self_attn.v_proj.weight": "encoders.2.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.bias": "encoders.3.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm1.weight": "encoders.3.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.bias": "encoders.3.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.layer_norm2.weight": "encoders.3.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.bias": "encoders.3.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc1.weight": "encoders.3.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.bias": "encoders.3.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.mlp.fc2.weight": "encoders.3.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.bias": "encoders.3.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.k_proj.weight": "encoders.3.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.bias": "encoders.3.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.out_proj.weight": "encoders.3.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.bias": "encoders.3.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.q_proj.weight": "encoders.3.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.bias": "encoders.3.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.3.self_attn.v_proj.weight": "encoders.3.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.bias": "encoders.4.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm1.weight": "encoders.4.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.bias": "encoders.4.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.layer_norm2.weight": "encoders.4.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.bias": "encoders.4.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc1.weight": "encoders.4.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.bias": "encoders.4.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.mlp.fc2.weight": "encoders.4.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.bias": "encoders.4.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.k_proj.weight": "encoders.4.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.bias": "encoders.4.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.out_proj.weight": "encoders.4.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.bias": "encoders.4.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.q_proj.weight": "encoders.4.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.bias": "encoders.4.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.4.self_attn.v_proj.weight": "encoders.4.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.bias": "encoders.5.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm1.weight": "encoders.5.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.bias": "encoders.5.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.layer_norm2.weight": "encoders.5.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.bias": "encoders.5.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc1.weight": "encoders.5.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.bias": "encoders.5.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.mlp.fc2.weight": "encoders.5.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.bias": "encoders.5.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.k_proj.weight": "encoders.5.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.bias": "encoders.5.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.out_proj.weight": "encoders.5.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.bias": "encoders.5.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.q_proj.weight": "encoders.5.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.bias": "encoders.5.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.5.self_attn.v_proj.weight": "encoders.5.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.bias": "encoders.6.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm1.weight": "encoders.6.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.bias": "encoders.6.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.layer_norm2.weight": "encoders.6.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.bias": "encoders.6.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc1.weight": "encoders.6.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.bias": "encoders.6.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.mlp.fc2.weight": "encoders.6.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.bias": "encoders.6.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.k_proj.weight": "encoders.6.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.bias": "encoders.6.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.out_proj.weight": "encoders.6.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.bias": "encoders.6.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.q_proj.weight": "encoders.6.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.bias": "encoders.6.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.6.self_attn.v_proj.weight": "encoders.6.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.bias": "encoders.7.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm1.weight": "encoders.7.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.bias": "encoders.7.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.layer_norm2.weight": "encoders.7.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.bias": "encoders.7.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc1.weight": "encoders.7.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.bias": "encoders.7.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.mlp.fc2.weight": "encoders.7.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.bias": "encoders.7.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.k_proj.weight": "encoders.7.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.bias": "encoders.7.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.out_proj.weight": "encoders.7.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.bias": "encoders.7.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.q_proj.weight": "encoders.7.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.bias": "encoders.7.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.7.self_attn.v_proj.weight": "encoders.7.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.bias": "encoders.8.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm1.weight": "encoders.8.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.bias": "encoders.8.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.layer_norm2.weight": "encoders.8.layer_norm2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.bias": "encoders.8.fc1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc1.weight": "encoders.8.fc1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.bias": "encoders.8.fc2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.mlp.fc2.weight": "encoders.8.fc2.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.bias": "encoders.8.attn.to_k.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.k_proj.weight": "encoders.8.attn.to_k.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.bias": "encoders.8.attn.to_out.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.out_proj.weight": "encoders.8.attn.to_out.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.bias": "encoders.8.attn.to_q.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.q_proj.weight": "encoders.8.attn.to_q.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.bias": "encoders.8.attn.to_v.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.8.self_attn.v_proj.weight": "encoders.8.attn.to_v.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.bias": "encoders.9.layer_norm1.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm1.weight": "encoders.9.layer_norm1.weight",
+            "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.bias": "encoders.9.layer_norm2.bias",
+            "cond_stage_model.transformer.text_model.encoder.layers.9.layer_norm2.weight": "encoders.9.layer_norm2.weight",
388
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.bias": "encoders.9.fc1.bias",
389
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc1.weight": "encoders.9.fc1.weight",
390
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.bias": "encoders.9.fc2.bias",
391
+ "cond_stage_model.transformer.text_model.encoder.layers.9.mlp.fc2.weight": "encoders.9.fc2.weight",
392
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.bias": "encoders.9.attn.to_k.bias",
393
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.k_proj.weight": "encoders.9.attn.to_k.weight",
394
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.bias": "encoders.9.attn.to_out.bias",
395
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.out_proj.weight": "encoders.9.attn.to_out.weight",
396
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.bias": "encoders.9.attn.to_q.bias",
397
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.q_proj.weight": "encoders.9.attn.to_q.weight",
398
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.bias": "encoders.9.attn.to_v.bias",
399
+ "cond_stage_model.transformer.text_model.encoder.layers.9.self_attn.v_proj.weight": "encoders.9.attn.to_v.weight",
400
+ "cond_stage_model.transformer.text_model.final_layer_norm.bias": "final_layer_norm.bias",
401
+ "cond_stage_model.transformer.text_model.final_layer_norm.weight": "final_layer_norm.weight",
402
+ "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight": "position_embeds"
403
+ }
404
+ state_dict_ = {}
405
+ for name in state_dict:
406
+ if name in rename_dict:
407
+ param = state_dict[name]
408
+ if name == "cond_stage_model.transformer.text_model.embeddings.position_embedding.weight":
409
+ param = param.reshape((1, param.shape[0], param.shape[1]))
410
+ state_dict_[rename_dict[name]] = param
411
+ return state_dict_
412
+
413
+
414
+
415
+ class LoRALayerBlock(torch.nn.Module):
416
+ def __init__(self, L, dim_in, dim_out):
417
+ super().__init__()
418
+ self.x = torch.nn.Parameter(torch.randn(1, L, dim_in))
419
+ self.layer_norm = torch.nn.LayerNorm(dim_out)
420
+
421
+ def forward(self, lora_A, lora_B):
422
+ x = self.x @ lora_A.T @ lora_B.T
423
+ x = self.layer_norm(x)
424
+ return x
425
+
426
+
427
+ class LoRAEmbedder(torch.nn.Module):
428
+ def __init__(self, lora_patterns=None, L=1, out_dim=2048):
429
+ super().__init__()
430
+ if lora_patterns is None:
431
+ lora_patterns = self.default_lora_patterns()
432
+
433
+ model_dict = {}
434
+ for lora_pattern in lora_patterns:
435
+ name, dim = lora_pattern["name"], lora_pattern["dim"]
436
+ model_dict[name.replace(".", "___")] = LoRALayerBlock(L, dim[0], dim[1])
437
+ self.model_dict = torch.nn.ModuleDict(model_dict)
438
+
439
+ proj_dict = {}
440
+ for lora_pattern in lora_patterns:
441
+ layer_type, dim = lora_pattern["type"], lora_pattern["dim"]
442
+ if layer_type not in proj_dict:
443
+ proj_dict[layer_type.replace(".", "___")] = torch.nn.Linear(dim[1], out_dim)
444
+ self.proj_dict = torch.nn.ModuleDict(proj_dict)
445
+
446
+ self.lora_patterns = lora_patterns
447
+
448
+
449
+ def default_lora_patterns(self):
450
+ lora_patterns = []
451
+ lora_dict = {
452
+ "attn.a_to_qkv": (3072, 9216), "attn.a_to_out": (3072, 3072), "ff_a.0": (3072, 12288), "ff_a.2": (12288, 3072), "norm1_a.linear": (3072, 18432),
453
+ "attn.b_to_qkv": (3072, 9216), "attn.b_to_out": (3072, 3072), "ff_b.0": (3072, 12288), "ff_b.2": (12288, 3072), "norm1_b.linear": (3072, 18432),
454
+ }
455
+ for i in range(19):
456
+ for suffix in lora_dict:
457
+ lora_patterns.append({
458
+ "name": f"blocks.{i}.{suffix}",
459
+ "dim": lora_dict[suffix],
460
+ "type": suffix,
461
+ })
462
+ lora_dict = {"to_qkv_mlp": (3072, 21504), "proj_out": (15360, 3072), "norm.linear": (3072, 9216)}
463
+ for i in range(38):
464
+ for suffix in lora_dict:
465
+ lora_patterns.append({
466
+ "name": f"single_blocks.{i}.{suffix}",
467
+ "dim": lora_dict[suffix],
468
+ "type": suffix,
469
+ })
470
+ return lora_patterns
471
+
472
+ def forward(self, lora):
473
+ lora_emb = []
474
+ for lora_pattern in self.lora_patterns:
475
+ name, layer_type = lora_pattern["name"], lora_pattern["type"]
476
+ lora_A = lora[name + ".lora_A.weight"]
477
+ lora_B = lora[name + ".lora_B.weight"]
478
+ lora_out = self.model_dict[name.replace(".", "___")](lora_A, lora_B)
479
+ lora_out = self.proj_dict[layer_type.replace(".", "___")](lora_out)
480
+ lora_emb.append(lora_out)
481
+ lora_emb = torch.concat(lora_emb, dim=1)
482
+ return lora_emb
483
+
484
+
485
+ class FluxLoRAEncoder(torch.nn.Module):
486
+ def __init__(self, embed_dim=4096, encoder_intermediate_size=8192, num_encoder_layers=1, num_embeds_per_lora=16, num_special_embeds=1):
487
+ super().__init__()
488
+ self.num_embeds_per_lora = num_embeds_per_lora
489
+ # embedder
490
+ self.embedder = LoRAEmbedder(L=num_embeds_per_lora, out_dim=embed_dim)
491
+
492
+ # encoders
493
+ self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size, num_heads=32, head_dim=128) for _ in range(num_encoder_layers)])
494
+
495
+ # special embedding
496
+ self.special_embeds = torch.nn.Parameter(torch.randn(1, num_special_embeds, embed_dim))
497
+ self.num_special_embeds = num_special_embeds
498
+
499
+ # final layer
500
+ self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
501
+ self.final_linear = torch.nn.Linear(embed_dim, embed_dim)
502
+
503
+ def forward(self, lora):
504
+ lora_embeds = self.embedder(lora)
505
+ special_embeds = self.special_embeds.to(dtype=lora_embeds.dtype, device=lora_embeds.device)
506
+ embeds = torch.concat([special_embeds, lora_embeds], dim=1)
507
+ for encoder_id, encoder in enumerate(self.encoders):
508
+ embeds = encoder(embeds)
509
+ embeds = embeds[:, :self.num_special_embeds]
510
+ embeds = self.final_layer_norm(embeds)
511
+ embeds = self.final_linear(embeds)
512
+ return embeds
513
+
514
+ @staticmethod
515
+ def state_dict_converter():
516
+ return FluxLoRAEncoderStateDictConverter()
517
+
518
+
519
+ class FluxLoRAEncoderStateDictConverter:
520
+ def from_civitai(self, state_dict):
521
+ return state_dict
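
A hypothetical end-to-end sketch of the encoder above (not part of this commit): it builds a random rank-16 LoRA matching the default FLUX patterns and condenses it into the special conditioning tokens. All weights are random, and the full default pattern set (304 layers) makes this compute-heavy on CPU.

```python
import torch

# Sketch only: random weights, rank-16 LoRA, default patterns (304 layers).
encoder = FluxLoRAEncoder()
rank = 16
lora = {}
for pattern in encoder.embedder.lora_patterns:
    dim_in, dim_out = pattern["dim"]
    lora[pattern["name"] + ".lora_A.weight"] = torch.randn(rank, dim_in)
    lora[pattern["name"] + ".lora_B.weight"] = torch.zeros(dim_out, rank)
with torch.no_grad():
    embeds = encoder(lora)
print(embeds.shape)  # torch.Size([1, 1, 4096]) -- one special token, embed_dim 4096
```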
diffsynth/models/flux_lora_patcher.py ADDED
@@ -0,0 +1,306 @@
1
+ import torch, math
2
+ from ..core.loader import load_state_dict
3
+ from typing import Union
4
+
5
+ class GeneralLoRALoader:
6
+ def __init__(self, device="cpu", torch_dtype=torch.float32):
7
+ self.device = device
8
+ self.torch_dtype = torch_dtype
9
+
10
+
11
+ def get_name_dict(self, lora_state_dict):
12
+ lora_name_dict = {}
13
+ for key in lora_state_dict:
14
+ if ".lora_B." not in key:
15
+ continue
16
+ keys = key.split(".")
17
+ if len(keys) > keys.index("lora_B") + 2:
18
+ keys.pop(keys.index("lora_B") + 1)
19
+ keys.pop(keys.index("lora_B"))
20
+ if keys[0] == "diffusion_model":
21
+ keys.pop(0)
22
+ keys.pop(-1)
23
+ target_name = ".".join(keys)
24
+ lora_name_dict[target_name] = (key, key.replace(".lora_B.", ".lora_A."))
25
+ return lora_name_dict
26
+
27
+
28
+ def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
29
+ updated_num = 0
30
+ lora_name_dict = self.get_name_dict(state_dict_lora)
31
+ for name, module in model.named_modules():
32
+ if name in lora_name_dict:
33
+ weight_up = state_dict_lora[lora_name_dict[name][0]].to(device=self.device, dtype=self.torch_dtype)
34
+ weight_down = state_dict_lora[lora_name_dict[name][1]].to(device=self.device, dtype=self.torch_dtype)
35
+ if len(weight_up.shape) == 4:
36
+ weight_up = weight_up.squeeze(3).squeeze(2)
37
+ weight_down = weight_down.squeeze(3).squeeze(2)
38
+ weight_lora = alpha * torch.mm(weight_up, weight_down).unsqueeze(2).unsqueeze(3)
39
+ else:
40
+ weight_lora = alpha * torch.mm(weight_up, weight_down)
41
+ state_dict = module.state_dict()
42
+ state_dict["weight"] = state_dict["weight"].to(device=self.device, dtype=self.torch_dtype) + weight_lora
43
+ module.load_state_dict(state_dict)
44
+ updated_num += 1
45
+ print(f"{updated_num} tensors are updated by LoRA.")
46
+
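
A minimal numeric sketch of the per-layer update that `load` performs: the fused weight is W' = W + alpha * (lora_B @ lora_A). The shapes here are illustrative, not tied to any particular FLUX layer.

```python
import torch

W = torch.randn(64, 32)      # base weight (out_features, in_features)
lora_A = torch.randn(4, 32)  # down-projection, rank 4
lora_B = torch.randn(64, 4)  # up-projection
alpha = 1.0

W_fused = W + alpha * (lora_B @ lora_A)  # what load() writes back into the module
assert W_fused.shape == W.shape
```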
47
+ class FluxLoRALoader(GeneralLoRALoader):
48
+ def __init__(self, device="cpu", torch_dtype=torch.float32):
49
+ super().__init__(device=device, torch_dtype=torch_dtype)
50
+
51
+ self.diffusers_rename_dict = {
52
+ "transformer.single_transformer_blocks.blockid.attn.to_k.lora_A.weight":"single_blocks.blockid.a_to_k.lora_A.default.weight",
53
+ "transformer.single_transformer_blocks.blockid.attn.to_k.lora_B.weight":"single_blocks.blockid.a_to_k.lora_B.default.weight",
54
+ "transformer.single_transformer_blocks.blockid.attn.to_q.lora_A.weight":"single_blocks.blockid.a_to_q.lora_A.default.weight",
55
+ "transformer.single_transformer_blocks.blockid.attn.to_q.lora_B.weight":"single_blocks.blockid.a_to_q.lora_B.default.weight",
56
+ "transformer.single_transformer_blocks.blockid.attn.to_v.lora_A.weight":"single_blocks.blockid.a_to_v.lora_A.default.weight",
57
+ "transformer.single_transformer_blocks.blockid.attn.to_v.lora_B.weight":"single_blocks.blockid.a_to_v.lora_B.default.weight",
58
+ "transformer.single_transformer_blocks.blockid.norm.linear.lora_A.weight":"single_blocks.blockid.norm.linear.lora_A.default.weight",
59
+ "transformer.single_transformer_blocks.blockid.norm.linear.lora_B.weight":"single_blocks.blockid.norm.linear.lora_B.default.weight",
60
+ "transformer.single_transformer_blocks.blockid.proj_mlp.lora_A.weight":"single_blocks.blockid.proj_in_besides_attn.lora_A.default.weight",
61
+ "transformer.single_transformer_blocks.blockid.proj_mlp.lora_B.weight":"single_blocks.blockid.proj_in_besides_attn.lora_B.default.weight",
62
+ "transformer.single_transformer_blocks.blockid.proj_out.lora_A.weight":"single_blocks.blockid.proj_out.lora_A.default.weight",
63
+ "transformer.single_transformer_blocks.blockid.proj_out.lora_B.weight":"single_blocks.blockid.proj_out.lora_B.default.weight",
64
+ "transformer.transformer_blocks.blockid.attn.add_k_proj.lora_A.weight":"blocks.blockid.attn.b_to_k.lora_A.default.weight",
65
+ "transformer.transformer_blocks.blockid.attn.add_k_proj.lora_B.weight":"blocks.blockid.attn.b_to_k.lora_B.default.weight",
66
+ "transformer.transformer_blocks.blockid.attn.add_q_proj.lora_A.weight":"blocks.blockid.attn.b_to_q.lora_A.default.weight",
67
+ "transformer.transformer_blocks.blockid.attn.add_q_proj.lora_B.weight":"blocks.blockid.attn.b_to_q.lora_B.default.weight",
68
+ "transformer.transformer_blocks.blockid.attn.add_v_proj.lora_A.weight":"blocks.blockid.attn.b_to_v.lora_A.default.weight",
69
+ "transformer.transformer_blocks.blockid.attn.add_v_proj.lora_B.weight":"blocks.blockid.attn.b_to_v.lora_B.default.weight",
70
+ "transformer.transformer_blocks.blockid.attn.to_add_out.lora_A.weight":"blocks.blockid.attn.b_to_out.lora_A.default.weight",
71
+ "transformer.transformer_blocks.blockid.attn.to_add_out.lora_B.weight":"blocks.blockid.attn.b_to_out.lora_B.default.weight",
72
+ "transformer.transformer_blocks.blockid.attn.to_k.lora_A.weight":"blocks.blockid.attn.a_to_k.lora_A.default.weight",
73
+ "transformer.transformer_blocks.blockid.attn.to_k.lora_B.weight":"blocks.blockid.attn.a_to_k.lora_B.default.weight",
74
+ "transformer.transformer_blocks.blockid.attn.to_out.0.lora_A.weight":"blocks.blockid.attn.a_to_out.lora_A.default.weight",
75
+ "transformer.transformer_blocks.blockid.attn.to_out.0.lora_B.weight":"blocks.blockid.attn.a_to_out.lora_B.default.weight",
76
+ "transformer.transformer_blocks.blockid.attn.to_q.lora_A.weight":"blocks.blockid.attn.a_to_q.lora_A.default.weight",
77
+ "transformer.transformer_blocks.blockid.attn.to_q.lora_B.weight":"blocks.blockid.attn.a_to_q.lora_B.default.weight",
78
+ "transformer.transformer_blocks.blockid.attn.to_v.lora_A.weight":"blocks.blockid.attn.a_to_v.lora_A.default.weight",
79
+ "transformer.transformer_blocks.blockid.attn.to_v.lora_B.weight":"blocks.blockid.attn.a_to_v.lora_B.default.weight",
80
+ "transformer.transformer_blocks.blockid.ff.net.0.proj.lora_A.weight":"blocks.blockid.ff_a.0.lora_A.default.weight",
81
+ "transformer.transformer_blocks.blockid.ff.net.0.proj.lora_B.weight":"blocks.blockid.ff_a.0.lora_B.default.weight",
82
+ "transformer.transformer_blocks.blockid.ff.net.2.lora_A.weight":"blocks.blockid.ff_a.2.lora_A.default.weight",
83
+ "transformer.transformer_blocks.blockid.ff.net.2.lora_B.weight":"blocks.blockid.ff_a.2.lora_B.default.weight",
84
+ "transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_A.weight":"blocks.blockid.ff_b.0.lora_A.default.weight",
85
+ "transformer.transformer_blocks.blockid.ff_context.net.0.proj.lora_B.weight":"blocks.blockid.ff_b.0.lora_B.default.weight",
86
+ "transformer.transformer_blocks.blockid.ff_context.net.2.lora_A.weight":"blocks.blockid.ff_b.2.lora_A.default.weight",
87
+ "transformer.transformer_blocks.blockid.ff_context.net.2.lora_B.weight":"blocks.blockid.ff_b.2.lora_B.default.weight",
88
+ "transformer.transformer_blocks.blockid.norm1.linear.lora_A.weight":"blocks.blockid.norm1_a.linear.lora_A.default.weight",
89
+ "transformer.transformer_blocks.blockid.norm1.linear.lora_B.weight":"blocks.blockid.norm1_a.linear.lora_B.default.weight",
90
+ "transformer.transformer_blocks.blockid.norm1_context.linear.lora_A.weight":"blocks.blockid.norm1_b.linear.lora_A.default.weight",
91
+ "transformer.transformer_blocks.blockid.norm1_context.linear.lora_B.weight":"blocks.blockid.norm1_b.linear.lora_B.default.weight",
92
+ }
93
+
94
+ self.civitai_rename_dict = {
95
+ "lora_unet_double_blocks_blockid_img_mod_lin.lora_down.weight": "blocks.blockid.norm1_a.linear.lora_A.default.weight",
96
+ "lora_unet_double_blocks_blockid_img_mod_lin.lora_up.weight": "blocks.blockid.norm1_a.linear.lora_B.default.weight",
97
+ "lora_unet_double_blocks_blockid_txt_mod_lin.lora_down.weight": "blocks.blockid.norm1_b.linear.lora_A.default.weight",
98
+ "lora_unet_double_blocks_blockid_txt_mod_lin.lora_up.weight": "blocks.blockid.norm1_b.linear.lora_B.default.weight",
99
+ "lora_unet_double_blocks_blockid_img_attn_qkv.lora_down.weight": "blocks.blockid.attn.a_to_qkv.lora_A.default.weight",
100
+ "lora_unet_double_blocks_blockid_img_attn_qkv.lora_up.weight": "blocks.blockid.attn.a_to_qkv.lora_B.default.weight",
101
+ "lora_unet_double_blocks_blockid_txt_attn_qkv.lora_down.weight": "blocks.blockid.attn.b_to_qkv.lora_A.default.weight",
102
+ "lora_unet_double_blocks_blockid_txt_attn_qkv.lora_up.weight": "blocks.blockid.attn.b_to_qkv.lora_B.default.weight",
103
+ "lora_unet_double_blocks_blockid_img_attn_proj.lora_down.weight": "blocks.blockid.attn.a_to_out.lora_A.default.weight",
104
+ "lora_unet_double_blocks_blockid_img_attn_proj.lora_up.weight": "blocks.blockid.attn.a_to_out.lora_B.default.weight",
105
+ "lora_unet_double_blocks_blockid_txt_attn_proj.lora_down.weight": "blocks.blockid.attn.b_to_out.lora_A.default.weight",
106
+ "lora_unet_double_blocks_blockid_txt_attn_proj.lora_up.weight": "blocks.blockid.attn.b_to_out.lora_B.default.weight",
107
+ "lora_unet_double_blocks_blockid_img_mlp_0.lora_down.weight": "blocks.blockid.ff_a.0.lora_A.default.weight",
108
+ "lora_unet_double_blocks_blockid_img_mlp_0.lora_up.weight": "blocks.blockid.ff_a.0.lora_B.default.weight",
109
+ "lora_unet_double_blocks_blockid_img_mlp_2.lora_down.weight": "blocks.blockid.ff_a.2.lora_A.default.weight",
110
+ "lora_unet_double_blocks_blockid_img_mlp_2.lora_up.weight": "blocks.blockid.ff_a.2.lora_B.default.weight",
111
+ "lora_unet_double_blocks_blockid_txt_mlp_0.lora_down.weight": "blocks.blockid.ff_b.0.lora_A.default.weight",
112
+ "lora_unet_double_blocks_blockid_txt_mlp_0.lora_up.weight": "blocks.blockid.ff_b.0.lora_B.default.weight",
113
+ "lora_unet_double_blocks_blockid_txt_mlp_2.lora_down.weight": "blocks.blockid.ff_b.2.lora_A.default.weight",
114
+ "lora_unet_double_blocks_blockid_txt_mlp_2.lora_up.weight": "blocks.blockid.ff_b.2.lora_B.default.weight",
115
+ "lora_unet_single_blocks_blockid_modulation_lin.lora_down.weight": "single_blocks.blockid.norm.linear.lora_A.default.weight",
116
+ "lora_unet_single_blocks_blockid_modulation_lin.lora_up.weight": "single_blocks.blockid.norm.linear.lora_B.default.weight",
117
+ "lora_unet_single_blocks_blockid_linear1.lora_down.weight": "single_blocks.blockid.to_qkv_mlp.lora_A.default.weight",
118
+ "lora_unet_single_blocks_blockid_linear1.lora_up.weight": "single_blocks.blockid.to_qkv_mlp.lora_B.default.weight",
119
+ "lora_unet_single_blocks_blockid_linear2.lora_down.weight": "single_blocks.blockid.proj_out.lora_A.default.weight",
120
+ "lora_unet_single_blocks_blockid_linear2.lora_up.weight": "single_blocks.blockid.proj_out.lora_B.default.weight",
121
+ }
122
+
123
+ def load(self, model: torch.nn.Module, state_dict_lora, alpha=1.0):
124
+ super().load(model, state_dict_lora, alpha)
125
+
126
+
127
+ def convert_state_dict(self, state_dict):
+
+ def guess_block_id(name, model_resource):
130
+ if model_resource == 'civitai':
131
+ names = name.split("_")
132
+ for i in names:
133
+ if i.isdigit():
134
+ return i, name.replace(f"_{i}_", "_blockid_")
135
+ if model_resource == 'diffusers':
136
+ names = name.split(".")
137
+ for i in names:
138
+ if i.isdigit():
139
+ return i, name.replace(f"transformer_blocks.{i}.", "transformer_blocks.blockid.")
140
+ return None, None
141
+
142
+ def guess_resource(state_dict):
143
+ for k in state_dict:
144
+ if "lora_unet_" in k:
145
+ return 'civitai'
146
+ elif k.startswith("transformer."):
147
+ return 'diffusers'
148
+ return None  # fell through the loop: no key matched either naming scheme
150
+
151
+ model_resource = guess_resource(state_dict)
152
+ if model_resource is None:
153
+ return state_dict
154
+
155
+ rename_dict = self.diffusers_rename_dict if model_resource == 'diffusers' else self.civitai_rename_dict
156
+ def guess_alpha(state_dict):
157
+ for name, param in state_dict.items():
158
+ if ".alpha" in name:
159
+ for suffix in [".lora_down.weight", ".lora_A.weight"]:
160
+ name_ = name.replace(".alpha", suffix)
161
+ if name_ in state_dict:
162
+ lora_alpha = param.item() / state_dict[name_].shape[0]
163
+ lora_alpha = math.sqrt(lora_alpha)
164
+ return lora_alpha
165
+
166
+ return 1
167
+
168
+ alpha = guess_alpha(state_dict)
169
+
170
+ state_dict_ = {}
171
+ for name, param in state_dict.items():
172
+ block_id, source_name = guess_block_id(name, model_resource)
173
+ if alpha != 1:
174
+ param *= alpha
175
+ if source_name in rename_dict:
176
+ target_name = rename_dict[source_name]
177
+ target_name = target_name.replace(".blockid.", f".{block_id}.")
178
+ state_dict_[target_name] = param
179
+ else:
180
+ state_dict_[name] = param
181
+
182
+ if model_resource == 'diffusers':
183
+ for name in list(state_dict_.keys()):
184
+ if "single_blocks." in name and ".a_to_q." in name:
185
+ mlp = state_dict_.get(name.replace(".a_to_q.", ".proj_in_besides_attn."), None)
186
+ if mlp is None:
187
+ dim = 4
188
+ if 'lora_A' in name:
189
+ dim = 1
190
+ mlp = torch.zeros(dim * state_dict_[name].shape[0],
191
+ *state_dict_[name].shape[1:],
192
+ dtype=state_dict_[name].dtype)
193
+ else:
194
+ state_dict_.pop(name.replace(".a_to_q.", ".proj_in_besides_attn."))
195
+ if 'lora_A' in name:
196
+ param = torch.concat([
197
+ state_dict_.pop(name),
198
+ state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
199
+ state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
200
+ mlp,
201
+ ], dim=0)
202
+ elif 'lora_B' in name:
203
+ d, r = state_dict_[name].shape
204
+ param = torch.zeros((3*d+mlp.shape[0], 3*r+mlp.shape[1]), dtype=state_dict_[name].dtype, device=state_dict_[name].device)
205
+ param[:d, :r] = state_dict_.pop(name)
206
+ param[d:2*d, r:2*r] = state_dict_.pop(name.replace(".a_to_q.", ".a_to_k."))
207
+ param[2*d:3*d, 2*r:3*r] = state_dict_.pop(name.replace(".a_to_q.", ".a_to_v."))
208
+ param[3*d:, 3*r:] = mlp
209
+ else:
210
+ param = torch.concat([
211
+ state_dict_.pop(name),
212
+ state_dict_.pop(name.replace(".a_to_q.", ".a_to_k.")),
213
+ state_dict_.pop(name.replace(".a_to_q.", ".a_to_v.")),
214
+ mlp,
215
+ ], dim=0)
216
+ name_ = name.replace(".a_to_q.", ".to_qkv_mlp.")
217
+ state_dict_[name_] = param
218
+ for name in list(state_dict_.keys()):
219
+ for component in ["a", "b"]:
220
+ if f".{component}_to_q." in name:
221
+ name_ = name.replace(f".{component}_to_q.", f".{component}_to_qkv.")
222
+ if 'lora_A' in name:
+ param = torch.concat([
+ state_dict_[name],
+ state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
+ state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
+ ], dim=0)
+ elif 'lora_B' in name:
+ origin = state_dict_[name]
+ d, r = origin.shape
+ # Place the q/k/v up-projections block-diagonally so that each output
+ # block only reads its own rank slice of the stacked lora_A factors.
+ param = torch.zeros((3*d, 3*r), dtype=origin.dtype, device=origin.device)
+ param[:d, :r] = origin
+ param[d:2*d, r:2*r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")]
+ param[2*d:3*d, 2*r:3*r] = state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")]
+ else:
+ param = torch.concat([
+ state_dict_[name],
+ state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_k.")],
+ state_dict_[name.replace(f".{component}_to_q.", f".{component}_to_v.")],
+ ], dim=0)
+ state_dict_[name_] = param
+ state_dict_.pop(name)
+ state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_k."))
+ state_dict_.pop(name.replace(f".{component}_to_q.", f".{component}_to_v."))
247
+ return state_dict_
248
+
249
+
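
The q/k/v fusion above relies on a block-diagonal placement of the lora_B factors: lora_A factors are stacked along the rank axis, and each output block of the fused lora_B must only read its own rank slice, so that the fused product equals the concatenation of the per-branch products. A standalone check with hypothetical toy shapes:

```python
import torch

d, r, h = 8, 4, 8  # toy output dim, rank, hidden size
A = {k: torch.randn(r, h) for k in "qkv"}
B = {k: torch.randn(d, r) for k in "qkv"}

A_fused = torch.cat([A["q"], A["k"], A["v"]], dim=0)  # (3r, h), stacked ranks
B_fused = torch.zeros(3 * d, 3 * r)                   # block-diagonal
for i, k in enumerate("qkv"):
    B_fused[i * d:(i + 1) * d, i * r:(i + 1) * r] = B[k]

expected = torch.cat([B[k] @ A[k] for k in "qkv"], dim=0)  # (3d, h)
assert torch.allclose(B_fused @ A_fused, expected, atol=1e-5)
```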
250
+ class LoraMerger(torch.nn.Module):
251
+ def __init__(self, dim):
252
+ super().__init__()
253
+ self.weight_base = torch.nn.Parameter(torch.randn((dim,)))
254
+ self.weight_lora = torch.nn.Parameter(torch.randn((dim,)))
255
+ self.weight_cross = torch.nn.Parameter(torch.randn((dim,)))
256
+ self.weight_out = torch.nn.Parameter(torch.ones((dim,)))
257
+ self.bias = torch.nn.Parameter(torch.randn((dim,)))
258
+ self.activation = torch.nn.Sigmoid()
259
+ self.norm_base = torch.nn.LayerNorm(dim, eps=1e-5)
260
+ self.norm_lora = torch.nn.LayerNorm(dim, eps=1e-5)
261
+
262
+ def forward(self, base_output, lora_outputs):
263
+ norm_base_output = self.norm_base(base_output)
264
+ norm_lora_outputs = self.norm_lora(lora_outputs)
265
+ gate = self.activation(
266
+ norm_base_output * self.weight_base \
267
+ + norm_lora_outputs * self.weight_lora \
268
+ + norm_base_output * norm_lora_outputs * self.weight_cross + self.bias
269
+ )
270
+ output = base_output + (self.weight_out * gate * lora_outputs).sum(dim=0)
271
+ return output
272
+
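
A minimal sketch of the gating in `LoraMerger.forward` (toy width 4): the sigmoid gate mixes the normalized base activation, the normalized LoRA activations, and their elementwise interaction, and the gated LoRA branches are summed back onto the base.

```python
import torch

merger = LoraMerger(dim=4)
base = torch.randn(1, 3, 4)    # base layer output
loras = torch.randn(2, 3, 4)   # two LoRA branch outputs, stacked on dim 0
out = merger(base, loras)      # base + sum(weight_out * gate * lora)
assert out.shape == base.shape
```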
273
+ class FluxLoraPatcher(torch.nn.Module):
274
+ def __init__(self, lora_patterns=None):
275
+ super().__init__()
276
+ if lora_patterns is None:
277
+ lora_patterns = self.default_lora_patterns()
278
+ model_dict = {}
279
+ for lora_pattern in lora_patterns:
280
+ name, dim = lora_pattern["name"], lora_pattern["dim"]
281
+ model_dict[name.replace(".", "___")] = LoraMerger(dim)
282
+ self.model_dict = torch.nn.ModuleDict(model_dict)
283
+
284
+ def default_lora_patterns(self):
285
+ lora_patterns = []
286
+ lora_dict = {
287
+ "attn.a_to_qkv": 9216, "attn.a_to_out": 3072, "ff_a.0": 12288, "ff_a.2": 3072, "norm1_a.linear": 18432,
288
+ "attn.b_to_qkv": 9216, "attn.b_to_out": 3072, "ff_b.0": 12288, "ff_b.2": 3072, "norm1_b.linear": 18432,
289
+ }
290
+ for i in range(19):
291
+ for suffix in lora_dict:
292
+ lora_patterns.append({
293
+ "name": f"blocks.{i}.{suffix}",
294
+ "dim": lora_dict[suffix]
295
+ })
296
+ lora_dict = {"to_qkv_mlp": 21504, "proj_out": 3072, "norm.linear": 9216}
297
+ for i in range(38):
298
+ for suffix in lora_dict:
299
+ lora_patterns.append({
300
+ "name": f"single_blocks.{i}.{suffix}",
301
+ "dim": lora_dict[suffix]
302
+ })
303
+ return lora_patterns
304
+
305
+ def forward(self, base_output, lora_outputs, name):
306
+ return self.model_dict[name.replace(".", "___")](base_output, lora_outputs)
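
A hypothetical forward through `FluxLoraPatcher`: module names use `.` in the DiT but `___` as ModuleDict keys, and `forward` translates between the two before dispatching to the named merger. Shapes here match the default pattern `blocks.0.attn.a_to_qkv` (dim 9216).

```python
import torch

patcher = FluxLoraPatcher()
base = torch.randn(1, 16, 9216)
loras = torch.randn(2, 16, 9216)  # outputs of two LoRA branches
out = patcher(base, loras, name="blocks.0.attn.a_to_qkv")
print(out.shape)  # torch.Size([1, 16, 9216])
```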
diffsynth/models/flux_text_encoder_clip.py ADDED
@@ -0,0 +1,112 @@
1
+ import torch
2
+
3
+
4
+ class Attention(torch.nn.Module):
5
+
6
+ def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
7
+ super().__init__()
8
+ dim_inner = head_dim * num_heads
9
+ kv_dim = kv_dim if kv_dim is not None else q_dim
10
+ self.num_heads = num_heads
11
+ self.head_dim = head_dim
12
+
13
+ self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
14
+ self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
15
+ self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
16
+ self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
17
+
18
+ def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
19
+ if encoder_hidden_states is None:
20
+ encoder_hidden_states = hidden_states
21
+
22
+ batch_size = encoder_hidden_states.shape[0]
23
+
24
+ q = self.to_q(hidden_states)
25
+ k = self.to_k(encoder_hidden_states)
26
+ v = self.to_v(encoder_hidden_states)
27
+
28
+ q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
29
+ k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
30
+ v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
31
+
32
+ hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
33
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
34
+ hidden_states = hidden_states.to(q.dtype)
35
+
36
+ hidden_states = self.to_out(hidden_states)
37
+
38
+ return hidden_states
39
+
40
+
41
+ class CLIPEncoderLayer(torch.nn.Module):
42
+ def __init__(self, embed_dim, intermediate_size, num_heads=12, head_dim=64, use_quick_gelu=True):
43
+ super().__init__()
44
+ self.attn = Attention(q_dim=embed_dim, num_heads=num_heads, head_dim=head_dim, bias_q=True, bias_kv=True, bias_out=True)
45
+ self.layer_norm1 = torch.nn.LayerNorm(embed_dim)
46
+ self.layer_norm2 = torch.nn.LayerNorm(embed_dim)
47
+ self.fc1 = torch.nn.Linear(embed_dim, intermediate_size)
48
+ self.fc2 = torch.nn.Linear(intermediate_size, embed_dim)
49
+
50
+ self.use_quick_gelu = use_quick_gelu
51
+
52
+ def quickGELU(self, x):
53
+ return x * torch.sigmoid(1.702 * x)
54
+
55
+ def forward(self, hidden_states, attn_mask=None):
56
+ residual = hidden_states
57
+
58
+ hidden_states = self.layer_norm1(hidden_states)
59
+ hidden_states = self.attn(hidden_states, attn_mask=attn_mask)
60
+ hidden_states = residual + hidden_states
61
+
62
+ residual = hidden_states
63
+ hidden_states = self.layer_norm2(hidden_states)
64
+ hidden_states = self.fc1(hidden_states)
65
+ if self.use_quick_gelu:
66
+ hidden_states = self.quickGELU(hidden_states)
67
+ else:
68
+ hidden_states = torch.nn.functional.gelu(hidden_states)
69
+ hidden_states = self.fc2(hidden_states)
70
+ hidden_states = residual + hidden_states
71
+
72
+ return hidden_states
73
+
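
QuickGELU (`x * sigmoid(1.702 * x)`) is the activation used by the original CLIP; it is only an approximation of the exact GELU, as a quick numeric check shows:

```python
import torch

x = torch.linspace(-3, 3, 7)
quick = x * torch.sigmoid(1.702 * x)
exact = torch.nn.functional.gelu(x)
print((quick - exact).abs().max())  # on the order of 1e-2, not identical
```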
74
+
75
+ class FluxTextEncoderClip(torch.nn.Module):
76
+ def __init__(self, embed_dim=768, vocab_size=49408, max_position_embeddings=77, num_encoder_layers=12, encoder_intermediate_size=3072):
77
+ super().__init__()
78
+
79
+ # token_embedding
80
+ self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim)
81
+
82
+ # position_embeds (This is a fixed tensor)
83
+ self.position_embeds = torch.nn.Parameter(torch.zeros(1, max_position_embeddings, embed_dim))
84
+
85
+ # encoders
86
+ self.encoders = torch.nn.ModuleList([CLIPEncoderLayer(embed_dim, encoder_intermediate_size) for _ in range(num_encoder_layers)])
87
+
88
+ # attn_mask
89
+ self.attn_mask = self.attention_mask(max_position_embeddings)
90
+
91
+ # final_layer_norm
92
+ self.final_layer_norm = torch.nn.LayerNorm(embed_dim)
93
+
94
+ def attention_mask(self, length):
95
+ mask = torch.empty(length, length)
96
+ mask.fill_(float("-inf"))
97
+ mask.triu_(1)
98
+ return mask
99
+
100
+ def forward(self, input_ids, clip_skip=2, extra_mask=None):
101
+ embeds = self.token_embedding(input_ids)
102
+ embeds = embeds + self.position_embeds.to(dtype=embeds.dtype, device=input_ids.device)
103
+ attn_mask = self.attn_mask.to(device=embeds.device, dtype=embeds.dtype)
104
+ if extra_mask is not None:
105
+ attn_mask[:, extra_mask[0]==0] = float("-inf")
106
+ for encoder_id, encoder in enumerate(self.encoders):
107
+ embeds = encoder(embeds, attn_mask=attn_mask)
108
+ if encoder_id + clip_skip == len(self.encoders):
109
+ hidden_states = embeds
110
+ embeds = self.final_layer_norm(embeds)
111
+ pooled_embeds = embeds[torch.arange(embeds.shape[0]), input_ids.to(dtype=torch.int).argmax(dim=-1)]
112
+ return pooled_embeds, hidden_states
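
A note on the `clip_skip` indexing above: with the default 12 layers and `clip_skip=2`, the captured `hidden_states` is the output of the layer with index 10, i.e. the penultimate layer, which is the conventional meaning of clip_skip=2. The control flow reduces to this sketch:

```python
num_layers, clip_skip = 12, 2
for encoder_id in range(num_layers):
    # embeds = encoder(embeds, ...)
    if encoder_id + clip_skip == num_layers:
        print(f"hidden_states captured after layer index {encoder_id}")  # 10
```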
diffsynth/models/flux_text_encoder_t5.py ADDED
@@ -0,0 +1,43 @@
1
+ import torch
2
+ from transformers import T5EncoderModel, T5Config
3
+
4
+
5
+ class FluxTextEncoderT5(T5EncoderModel):
6
+ def __init__(self):
7
+ config = T5Config(**{
8
+ "architectures": [
9
+ "T5EncoderModel"
10
+ ],
11
+ "classifier_dropout": 0.0,
12
+ "d_ff": 10240,
13
+ "d_kv": 64,
14
+ "d_model": 4096,
15
+ "decoder_start_token_id": 0,
16
+ "dense_act_fn": "gelu_new",
17
+ "dropout_rate": 0.1,
18
+ "dtype": "bfloat16",
19
+ "eos_token_id": 1,
20
+ "feed_forward_proj": "gated-gelu",
21
+ "initializer_factor": 1.0,
22
+ "is_encoder_decoder": True,
23
+ "is_gated_act": True,
24
+ "layer_norm_epsilon": 1e-06,
25
+ "model_type": "t5",
26
+ "num_decoder_layers": 24,
27
+ "num_heads": 64,
28
+ "num_layers": 24,
29
+ "output_past": True,
30
+ "pad_token_id": 0,
31
+ "relative_attention_max_distance": 128,
32
+ "relative_attention_num_buckets": 32,
33
+ "tie_word_embeddings": False,
34
+ "transformers_version": "4.57.1",
35
+ "use_cache": True,
36
+ "vocab_size": 32128
37
+ })
38
+ super().__init__(config)
39
+
40
+ def forward(self, input_ids):
41
+ outputs = super().forward(input_ids=input_ids)
42
+ prompt_emb = outputs.last_hidden_state
43
+ return prompt_emb
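
The configuration above is T5-XXL sized (~4.8B parameters), so instantiating it just to inspect the forward contract is impractical; the same contract on a shrunken T5 encoder (hypothetical tiny config, for illustration only):

```python
import torch
from transformers import T5Config, T5EncoderModel

tiny = T5EncoderModel(T5Config(d_model=64, d_ff=128, d_kv=16, num_heads=4,
                               num_layers=2, vocab_size=32128))
input_ids = torch.randint(0, 32128, (1, 8))
emb = tiny(input_ids=input_ids).last_hidden_state  # what forward() returns
print(emb.shape)  # torch.Size([1, 8, 64]); the real encoder yields (1, 8, 4096)
```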
diffsynth/models/flux_vae.py ADDED
@@ -0,0 +1,451 @@
1
+ import torch
2
+ from einops import rearrange, repeat
3
+
4
+
5
+ class TileWorker:
6
+ def __init__(self):
7
+ pass
8
+
9
+
10
+ def mask(self, height, width, border_width):
11
+ # Create a mask with shape (height, width).
12
+ # The centre area is filled with 1, and the border region ramps linearly through values in (0, 1].
13
+ x = torch.arange(height).repeat(width, 1).T
14
+ y = torch.arange(width).repeat(height, 1)
15
+ mask = torch.stack([x + 1, height - x, y + 1, width - y]).min(dim=0).values
16
+ mask = (mask / border_width).clip(0, 1)
17
+ return mask
18
+
19
+
20
+ def tile(self, model_input, tile_size, tile_stride, tile_device, tile_dtype):
21
+ # Convert a tensor (b, c, h, w) to (b, c, tile_size, tile_size, tile_num)
22
+ batch_size, channel, _, _ = model_input.shape
23
+ model_input = model_input.to(device=tile_device, dtype=tile_dtype)
24
+ unfold_operator = torch.nn.Unfold(
25
+ kernel_size=(tile_size, tile_size),
26
+ stride=(tile_stride, tile_stride)
27
+ )
28
+ model_input = unfold_operator(model_input)
29
+ model_input = model_input.view((batch_size, channel, tile_size, tile_size, -1))
30
+
31
+ return model_input
32
+
33
+
34
+ def tiled_inference(self, forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype):
35
+ # Call y=forward_fn(x) for each tile
36
+ tile_num = model_input.shape[-1]
37
+ model_output_stack = []
38
+
39
+ for tile_id in range(0, tile_num, tile_batch_size):
40
+
41
+ # process input
42
+ tile_id_ = min(tile_id + tile_batch_size, tile_num)
43
+ x = model_input[:, :, :, :, tile_id: tile_id_]
44
+ x = x.to(device=inference_device, dtype=inference_dtype)
45
+ x = rearrange(x, "b c h w n -> (n b) c h w")
46
+
47
+ # process output
48
+ y = forward_fn(x)
49
+ y = rearrange(y, "(n b) c h w -> b c h w n", n=tile_id_-tile_id)
50
+ y = y.to(device=tile_device, dtype=tile_dtype)
51
+ model_output_stack.append(y)
52
+
53
+ model_output = torch.concat(model_output_stack, dim=-1)
54
+ return model_output
55
+
56
+
57
+ def io_scale(self, model_output, tile_size):
58
+ # Determine the size modification happened in forward_fn
59
+ # We only consider the same scale on height and width.
60
+ io_scale = model_output.shape[2] / tile_size
61
+ return io_scale
62
+
63
+
64
+ def untile(self, model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype):
65
+ # The reversed function of tile
66
+ mask = self.mask(tile_size, tile_size, border_width)
67
+ mask = mask.to(device=tile_device, dtype=tile_dtype)
68
+ mask = rearrange(mask, "h w -> 1 1 h w 1")
69
+ model_output = model_output * mask
70
+
71
+ fold_operator = torch.nn.Fold(
72
+ output_size=(height, width),
73
+ kernel_size=(tile_size, tile_size),
74
+ stride=(tile_stride, tile_stride)
75
+ )
76
+ mask = repeat(mask[0, 0, :, :, 0], "h w -> 1 (h w) n", n=model_output.shape[-1])
77
+ model_output = rearrange(model_output, "b c h w n -> b (c h w) n")
78
+ model_output = fold_operator(model_output) / fold_operator(mask)
79
+
80
+ return model_output
81
+
82
+
83
+ def tiled_forward(self, forward_fn, model_input, tile_size, tile_stride, tile_batch_size=1, tile_device="cpu", tile_dtype=torch.float32, border_width=None):
84
+ # Prepare
85
+ inference_device, inference_dtype = model_input.device, model_input.dtype
86
+ height, width = model_input.shape[2], model_input.shape[3]
87
+ border_width = int(tile_stride*0.5) if border_width is None else border_width
88
+
89
+ # tile
90
+ model_input = self.tile(model_input, tile_size, tile_stride, tile_device, tile_dtype)
91
+
92
+ # inference
93
+ model_output = self.tiled_inference(forward_fn, model_input, tile_batch_size, inference_device, inference_dtype, tile_device, tile_dtype)
94
+
95
+ # resize
96
+ io_scale = self.io_scale(model_output, tile_size)
97
+ height, width = int(height*io_scale), int(width*io_scale)
98
+ tile_size, tile_stride = int(tile_size*io_scale), int(tile_stride*io_scale)
99
+ border_width = int(border_width*io_scale)
100
+
101
+ # untile
102
+ model_output = self.untile(model_output, height, width, tile_size, tile_stride, border_width, tile_device, tile_dtype)
103
+
104
+ # Done!
105
+ model_output = model_output.to(device=inference_device, dtype=inference_dtype)
106
+ return model_output
107
+
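
A quick sanity check on the tile/blend/fold round trip: with an identity `forward_fn`, overlapping tiles are re-assembled into (approximately) the original tensor, since `untile` computes a mask-weighted average over the overlapping regions.

```python
import torch

worker = TileWorker()
x = torch.randn(1, 3, 128, 128)
y = worker.tiled_forward(lambda t: t, x, tile_size=64, tile_stride=32)
print((x - y).abs().max())  # near zero (float rounding only)
```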
108
+
109
+ class ConvAttention(torch.nn.Module):
110
+
111
+ def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
112
+ super().__init__()
113
+ dim_inner = head_dim * num_heads
114
+ kv_dim = kv_dim if kv_dim is not None else q_dim
115
+ self.num_heads = num_heads
116
+ self.head_dim = head_dim
117
+
118
+ self.to_q = torch.nn.Conv2d(q_dim, dim_inner, kernel_size=(1, 1), bias=bias_q)
119
+ self.to_k = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
120
+ self.to_v = torch.nn.Conv2d(kv_dim, dim_inner, kernel_size=(1, 1), bias=bias_kv)
121
+ self.to_out = torch.nn.Conv2d(dim_inner, q_dim, kernel_size=(1, 1), bias=bias_out)
122
+
123
+ def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
124
+ if encoder_hidden_states is None:
125
+ encoder_hidden_states = hidden_states
126
+
127
+ batch_size = encoder_hidden_states.shape[0]
128
+
129
+ conv_input = rearrange(hidden_states, "B L C -> B C L 1")
130
+ q = self.to_q(conv_input)
131
+ q = rearrange(q[:, :, :, 0], "B C L -> B L C")
132
+ conv_input = rearrange(encoder_hidden_states, "B L C -> B C L 1")
133
+ k = self.to_k(conv_input)
134
+ v = self.to_v(conv_input)
135
+ k = rearrange(k[:, :, :, 0], "B C L -> B L C")
136
+ v = rearrange(v[:, :, :, 0], "B C L -> B L C")
137
+
138
+ q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
139
+ k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
140
+ v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
141
+
142
+ hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
143
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
144
+ hidden_states = hidden_states.to(q.dtype)
145
+
146
+ conv_input = rearrange(hidden_states, "B L C -> B C L 1")
147
+ hidden_states = self.to_out(conv_input)
148
+ hidden_states = rearrange(hidden_states[:, :, :, 0], "B C L -> B L C")
149
+
150
+ return hidden_states
151
+
152
+
153
+ class Attention(torch.nn.Module):
154
+
155
+ def __init__(self, q_dim, num_heads, head_dim, kv_dim=None, bias_q=False, bias_kv=False, bias_out=False):
156
+ super().__init__()
157
+ dim_inner = head_dim * num_heads
158
+ kv_dim = kv_dim if kv_dim is not None else q_dim
159
+ self.num_heads = num_heads
160
+ self.head_dim = head_dim
161
+
162
+ self.to_q = torch.nn.Linear(q_dim, dim_inner, bias=bias_q)
163
+ self.to_k = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
164
+ self.to_v = torch.nn.Linear(kv_dim, dim_inner, bias=bias_kv)
165
+ self.to_out = torch.nn.Linear(dim_inner, q_dim, bias=bias_out)
166
+
167
+ def forward(self, hidden_states, encoder_hidden_states=None, attn_mask=None):
168
+ if encoder_hidden_states is None:
169
+ encoder_hidden_states = hidden_states
170
+
171
+ batch_size = encoder_hidden_states.shape[0]
172
+
173
+ q = self.to_q(hidden_states)
174
+ k = self.to_k(encoder_hidden_states)
175
+ v = self.to_v(encoder_hidden_states)
176
+
177
+ q = q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
178
+ k = k.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
179
+ v = v.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
180
+
181
+ hidden_states = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
182
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_dim)
183
+ hidden_states = hidden_states.to(q.dtype)
184
+
185
+ hidden_states = self.to_out(hidden_states)
186
+
187
+ return hidden_states
188
+
189
+
190
+ class VAEAttentionBlock(torch.nn.Module):
191
+
192
+ def __init__(self, num_attention_heads, attention_head_dim, in_channels, num_layers=1, norm_num_groups=32, eps=1e-5, use_conv_attention=True):
193
+ super().__init__()
194
+ inner_dim = num_attention_heads * attention_head_dim
195
+
196
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=eps, affine=True)
197
+
198
+ if use_conv_attention:
199
+ self.transformer_blocks = torch.nn.ModuleList([
200
+ ConvAttention(
201
+ inner_dim,
202
+ num_attention_heads,
203
+ attention_head_dim,
204
+ bias_q=True,
205
+ bias_kv=True,
206
+ bias_out=True
207
+ )
208
+ for d in range(num_layers)
209
+ ])
210
+ else:
211
+ self.transformer_blocks = torch.nn.ModuleList([
212
+ Attention(
213
+ inner_dim,
214
+ num_attention_heads,
215
+ attention_head_dim,
216
+ bias_q=True,
217
+ bias_kv=True,
218
+ bias_out=True
219
+ )
220
+ for d in range(num_layers)
221
+ ])
222
+
223
+ def forward(self, hidden_states, time_emb, text_emb, res_stack):
224
+ batch, _, height, width = hidden_states.shape
225
+ residual = hidden_states
226
+
227
+ hidden_states = self.norm(hidden_states)
228
+ inner_dim = hidden_states.shape[1]
229
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
230
+
231
+ for block in self.transformer_blocks:
232
+ hidden_states = block(hidden_states)
233
+
234
+ hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
235
+ hidden_states = hidden_states + residual
236
+
237
+ return hidden_states, time_emb, text_emb, res_stack
238
+
239
+
240
+ class ResnetBlock(torch.nn.Module):
241
+ def __init__(self, in_channels, out_channels, temb_channels=None, groups=32, eps=1e-5):
242
+ super().__init__()
243
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
244
+ self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
245
+ if temb_channels is not None:
246
+ self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels)
247
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
248
+ self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
249
+ self.nonlinearity = torch.nn.SiLU()
250
+ self.conv_shortcut = None
251
+ if in_channels != out_channels:
252
+ self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True)
253
+
254
+ def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
255
+ x = hidden_states
256
+ x = self.norm1(x)
257
+ x = self.nonlinearity(x)
258
+ x = self.conv1(x)
259
+ if time_emb is not None:
260
+ emb = self.nonlinearity(time_emb)
261
+ emb = self.time_emb_proj(emb)[:, :, None, None]
262
+ x = x + emb
263
+ x = self.norm2(x)
264
+ x = self.nonlinearity(x)
265
+ x = self.conv2(x)
266
+ if self.conv_shortcut is not None:
267
+ hidden_states = self.conv_shortcut(hidden_states)
268
+ hidden_states = hidden_states + x
269
+ return hidden_states, time_emb, text_emb, res_stack
270
+
271
+
272
+ class UpSampler(torch.nn.Module):
273
+ def __init__(self, channels):
274
+ super().__init__()
275
+ self.conv = torch.nn.Conv2d(channels, channels, 3, padding=1)
276
+
277
+ def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
278
+ hidden_states = torch.nn.functional.interpolate(hidden_states, scale_factor=2.0, mode="nearest")
279
+ hidden_states = self.conv(hidden_states)
280
+ return hidden_states, time_emb, text_emb, res_stack
281
+
282
+
283
+ class DownSampler(torch.nn.Module):
284
+ def __init__(self, channels, padding=1, extra_padding=False):
285
+ super().__init__()
286
+ self.conv = torch.nn.Conv2d(channels, channels, 3, stride=2, padding=padding)
287
+ self.extra_padding = extra_padding
288
+
289
+ def forward(self, hidden_states, time_emb, text_emb, res_stack, **kwargs):
290
+ if self.extra_padding:
291
+ hidden_states = torch.nn.functional.pad(hidden_states, (0, 1, 0, 1), mode="constant", value=0)
292
+ hidden_states = self.conv(hidden_states)
293
+ return hidden_states, time_emb, text_emb, res_stack
294
+
295
+
296
+ class FluxVAEDecoder(torch.nn.Module):
297
+ def __init__(self, use_conv_attention=True):
298
+ super().__init__()
299
+ self.scaling_factor = 0.3611
300
+ self.shift_factor = 0.1159
301
+ self.conv_in = torch.nn.Conv2d(16, 512, kernel_size=3, padding=1) # Different from SD 1.x
302
+
303
+ self.blocks = torch.nn.ModuleList([
304
+ # UNetMidBlock2D
305
+ ResnetBlock(512, 512, eps=1e-6),
306
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, use_conv_attention=use_conv_attention),
307
+ ResnetBlock(512, 512, eps=1e-6),
308
+ # UpDecoderBlock2D
309
+ ResnetBlock(512, 512, eps=1e-6),
310
+ ResnetBlock(512, 512, eps=1e-6),
311
+ ResnetBlock(512, 512, eps=1e-6),
312
+ UpSampler(512),
313
+ # UpDecoderBlock2D
314
+ ResnetBlock(512, 512, eps=1e-6),
315
+ ResnetBlock(512, 512, eps=1e-6),
316
+ ResnetBlock(512, 512, eps=1e-6),
317
+ UpSampler(512),
318
+ # UpDecoderBlock2D
319
+ ResnetBlock(512, 256, eps=1e-6),
320
+ ResnetBlock(256, 256, eps=1e-6),
321
+ ResnetBlock(256, 256, eps=1e-6),
322
+ UpSampler(256),
323
+ # UpDecoderBlock2D
324
+ ResnetBlock(256, 128, eps=1e-6),
325
+ ResnetBlock(128, 128, eps=1e-6),
326
+ ResnetBlock(128, 128, eps=1e-6),
327
+ ])
328
+
329
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6)
330
+ self.conv_act = torch.nn.SiLU()
331
+ self.conv_out = torch.nn.Conv2d(128, 3, kernel_size=3, padding=1)
332
+
333
+ def tiled_forward(self, sample, tile_size=64, tile_stride=32):
334
+ hidden_states = TileWorker().tiled_forward(
335
+ lambda x: self.forward(x),
336
+ sample,
337
+ tile_size,
338
+ tile_stride,
339
+ tile_device=sample.device,
340
+ tile_dtype=sample.dtype
341
+ )
342
+ return hidden_states
343
+
344
+ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
345
+ # For VAE Decoder, we do not need to apply the tiler on each layer.
346
+ if tiled:
347
+ return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
348
+
349
+ # 1. pre-process
350
+ hidden_states = sample / self.scaling_factor + self.shift_factor
351
+ hidden_states = self.conv_in(hidden_states)
352
+ time_emb = None
353
+ text_emb = None
354
+ res_stack = None
355
+
356
+ # 2. blocks
357
+ for i, block in enumerate(self.blocks):
358
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
359
+
360
+ # 3. output
361
+ hidden_states = self.conv_norm_out(hidden_states)
362
+ hidden_states = self.conv_act(hidden_states)
363
+ hidden_states = self.conv_out(hidden_states)
364
+
365
+ return hidden_states
366
+
367
+
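
A hypothetical decode sketch (random weights; real use loads the converted VAE checkpoint): FLUX latents carry 16 channels at 1/8 resolution, and the three upsamplers restore the full image size.

```python
import torch

decoder = FluxVAEDecoder()
latents = torch.randn(1, 16, 64, 64)
with torch.no_grad():
    image = decoder(latents)
print(image.shape)  # torch.Size([1, 3, 512, 512])
```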
368
+ class FluxVAEEncoder(torch.nn.Module):
369
+ def __init__(self, use_conv_attention=True):
370
+ super().__init__()
371
+ self.scaling_factor = 0.3611
372
+ self.shift_factor = 0.1159
373
+ self.conv_in = torch.nn.Conv2d(3, 128, kernel_size=3, padding=1)
374
+
375
+ self.blocks = torch.nn.ModuleList([
376
+ # DownEncoderBlock2D
377
+ ResnetBlock(128, 128, eps=1e-6),
378
+ ResnetBlock(128, 128, eps=1e-6),
379
+ DownSampler(128, padding=0, extra_padding=True),
380
+ # DownEncoderBlock2D
381
+ ResnetBlock(128, 256, eps=1e-6),
382
+ ResnetBlock(256, 256, eps=1e-6),
383
+ DownSampler(256, padding=0, extra_padding=True),
384
+ # DownEncoderBlock2D
385
+ ResnetBlock(256, 512, eps=1e-6),
386
+ ResnetBlock(512, 512, eps=1e-6),
387
+ DownSampler(512, padding=0, extra_padding=True),
388
+ # DownEncoderBlock2D
389
+ ResnetBlock(512, 512, eps=1e-6),
390
+ ResnetBlock(512, 512, eps=1e-6),
391
+ # UNetMidBlock2D
392
+ ResnetBlock(512, 512, eps=1e-6),
393
+ VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, use_conv_attention=use_conv_attention),
394
+ ResnetBlock(512, 512, eps=1e-6),
395
+ ])
396
+
397
+ self.conv_norm_out = torch.nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6)
398
+ self.conv_act = torch.nn.SiLU()
399
+ self.conv_out = torch.nn.Conv2d(512, 32, kernel_size=3, padding=1)
400
+
401
+ def tiled_forward(self, sample, tile_size=64, tile_stride=32):
402
+ hidden_states = TileWorker().tiled_forward(
403
+ lambda x: self.forward(x),
404
+ sample,
405
+ tile_size,
406
+ tile_stride,
407
+ tile_device=sample.device,
408
+ tile_dtype=sample.dtype
409
+ )
410
+ return hidden_states
411
+
412
+ def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
413
+ # For the VAE encoder, we do not need to apply the tiler on each layer.
414
+ if tiled:
415
+ return self.tiled_forward(sample, tile_size=tile_size, tile_stride=tile_stride)
416
+
417
+ # 1. pre-process
418
+ hidden_states = self.conv_in(sample)
419
+ time_emb = None
420
+ text_emb = None
421
+ res_stack = None
422
+
423
+ # 2. blocks
424
+ for i, block in enumerate(self.blocks):
425
+ hidden_states, time_emb, text_emb, res_stack = block(hidden_states, time_emb, text_emb, res_stack)
426
+
427
+ # 3. output
428
+ hidden_states = self.conv_norm_out(hidden_states)
429
+ hidden_states = self.conv_act(hidden_states)
430
+ hidden_states = self.conv_out(hidden_states)
431
+ hidden_states = hidden_states[:, :16]
432
+ hidden_states = (hidden_states - self.shift_factor) * self.scaling_factor
433
+
434
+ return hidden_states
435
+
436
+ def encode_video(self, sample, batch_size=8):
437
+ B = sample.shape[0]
438
+ hidden_states = []
439
+
440
+ for i in range(0, sample.shape[2], batch_size):
441
+
442
+ j = min(i + batch_size, sample.shape[2])
443
+ sample_batch = rearrange(sample[:,:,i:j], "B C T H W -> (B T) C H W")
444
+
445
+ hidden_states_batch = self(sample_batch)
446
+ hidden_states_batch = rearrange(hidden_states_batch, "(B T) C H W -> B C T H W", B=B)
447
+
448
+ hidden_states.append(hidden_states_batch)
449
+
450
+ hidden_states = torch.concat(hidden_states, dim=2)
451
+ return hidden_states
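
The encoder mirrors the decoder's shape contract: images enter at (B, 3, H, W) and leave as (B, 16, H/8, W/8), and `encode_video` folds the time axis into the batch in chunks of `batch_size` frames. A sketch with random weights:

```python
import torch

encoder = FluxVAEEncoder()
video = torch.randn(1, 3, 9, 256, 256)  # (B, C, T, H, W)
with torch.no_grad():
    latents = encoder.encode_video(video, batch_size=4)
print(latents.shape)  # torch.Size([1, 16, 9, 32, 32])
```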
diffsynth/models/flux_value_control.py ADDED
@@ -0,0 +1,56 @@
1
+ import torch
2
+ from .general_modules import TemporalTimesteps
3
+
4
+
5
+ class MultiValueEncoder(torch.nn.Module):
6
+ def __init__(self, encoders=()):
7
+ super().__init__()
8
+ if not isinstance(encoders, (list, tuple)):  # accept the default empty tuple as well
+ encoders = [encoders]
10
+ self.encoders = torch.nn.ModuleList(encoders)
11
+
12
+ def __call__(self, values, dtype):
13
+ emb = []
14
+ for encoder, value in zip(self.encoders, values):
15
+ if value is not None:
16
+ value = value.unsqueeze(0)
17
+ emb.append(encoder(value, dtype))
18
+ emb = torch.concat(emb, dim=0)
19
+ return emb
20
+
21
+
22
+ class SingleValueEncoder(torch.nn.Module):
23
+ def __init__(self, dim_in=256, dim_out=4096, prefer_len=32, computation_device=None):
24
+ super().__init__()
25
+ self.prefer_len = prefer_len
26
+ self.prefer_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0, computation_device=computation_device)
27
+ self.prefer_value_embedder = torch.nn.Sequential(
28
+ torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
29
+ )
30
+ self.positional_embedding = torch.nn.Parameter(
31
+ torch.randn(self.prefer_len, dim_out)
32
+ )
33
+
34
+ def forward(self, value, dtype):
35
+ value = value * 1000
36
+ emb = self.prefer_proj(value).to(dtype)
37
+ emb = self.prefer_value_embedder(emb).squeeze(0)
38
+ base_embeddings = emb.expand(self.prefer_len, -1)
39
+ positional_embedding = self.positional_embedding.to(dtype=base_embeddings.dtype, device=base_embeddings.device)
40
+ learned_embeddings = base_embeddings + positional_embedding
41
+ return learned_embeddings
42
+
43
+ @staticmethod
44
+ def state_dict_converter():
45
+ return SingleValueEncoderStateDictConverter()
46
+
47
+
48
+ class SingleValueEncoderStateDictConverter:
49
+ def __init__(self):
50
+ pass
51
+
52
+ def from_diffusers(self, state_dict):
53
+ return state_dict
54
+
55
+ def from_civitai(self, state_dict):
56
+ return state_dict
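
A hypothetical usage of `SingleValueEncoder`: a scalar conditioning value is scaled by 1000 into the timestep range, embedded sinusoidally, projected, and broadcast over `prefer_len` learned positions.

```python
import torch

enc = SingleValueEncoder(dim_in=256, dim_out=4096, prefer_len=32)
value = torch.tensor([0.75])              # already 1-d, as forward() expects
emb = enc(value, dtype=torch.float32)
print(emb.shape)  # torch.Size([32, 4096])
```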
diffsynth/models/general_modules.py ADDED
@@ -0,0 +1,146 @@
1
+ import torch, math
2
+
3
+
4
+ def get_timestep_embedding(
5
+ timesteps: torch.Tensor,
6
+ embedding_dim: int,
7
+ flip_sin_to_cos: bool = False,
8
+ downscale_freq_shift: float = 1,
9
+ scale: float = 1,
10
+ max_period: int = 10000,
11
+ computation_device = None,
12
+ align_dtype_to_timestep = False,
13
+ ):
14
+ assert len(timesteps.shape) == 1, "timesteps should be a 1-d tensor"
15
+
16
+ half_dim = embedding_dim // 2
17
+ exponent = -math.log(max_period) * torch.arange(
18
+ start=0, end=half_dim, dtype=torch.float32, device=timesteps.device if computation_device is None else computation_device
19
+ )
20
+ exponent = exponent / (half_dim - downscale_freq_shift)
21
+
22
+ emb = torch.exp(exponent)
23
+ if align_dtype_to_timestep:
24
+ emb = emb.to(timesteps.dtype)
25
+ emb = timesteps[:, None].float() * emb[None, :]
26
+
27
+ # scale embeddings
28
+ emb = scale * emb
29
+
30
+ # concat sine and cosine embeddings
31
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
32
+
33
+ # flip sine and cosine embeddings
34
+ if flip_sin_to_cos:
35
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
36
+
37
+ # zero pad
38
+ if embedding_dim % 2 == 1:
39
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
40
+ return emb
41
+
42
+
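
A quick shape check for `get_timestep_embedding` (a minimal sketch, assuming the function above is in scope):

```python
import torch

t = torch.tensor([0.0, 500.0, 999.0])
emb = get_timestep_embedding(t, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0)
print(emb.shape)  # torch.Size([3, 256]); with flip_sin_to_cos the cos half comes first
```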
43
+ class TemporalTimesteps(torch.nn.Module):
44
+ def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, computation_device = None, scale=1, align_dtype_to_timestep=False):
45
+ super().__init__()
46
+ self.num_channels = num_channels
47
+ self.flip_sin_to_cos = flip_sin_to_cos
48
+ self.downscale_freq_shift = downscale_freq_shift
49
+ self.computation_device = computation_device
50
+ self.scale = scale
51
+ self.align_dtype_to_timestep = align_dtype_to_timestep
52
+
53
+ def forward(self, timesteps):
54
+ t_emb = get_timestep_embedding(
55
+ timesteps,
56
+ self.num_channels,
57
+ flip_sin_to_cos=self.flip_sin_to_cos,
58
+ downscale_freq_shift=self.downscale_freq_shift,
59
+ computation_device=self.computation_device,
60
+ scale=self.scale,
61
+ align_dtype_to_timestep=self.align_dtype_to_timestep,
62
+ )
63
+ return t_emb
64
+
65
+
66
+ class DiffusersCompatibleTimestepProj(torch.nn.Module):
67
+ def __init__(self, dim_in, dim_out):
68
+ super().__init__()
69
+ self.linear_1 = torch.nn.Linear(dim_in, dim_out)
70
+ self.act = torch.nn.SiLU()
71
+ self.linear_2 = torch.nn.Linear(dim_out, dim_out)
72
+
73
+ def forward(self, x):
74
+ x = self.linear_1(x)
75
+ x = self.act(x)
76
+ x = self.linear_2(x)
77
+ return x
78
+
79
+
80
+ class TimestepEmbeddings(torch.nn.Module):
81
+ def __init__(self, dim_in, dim_out, computation_device=None, diffusers_compatible_format=False, scale=1, align_dtype_to_timestep=False, use_additional_t_cond=False):
82
+ super().__init__()
83
+ self.time_proj = TemporalTimesteps(num_channels=dim_in, flip_sin_to_cos=True, downscale_freq_shift=0, computation_device=computation_device, scale=scale, align_dtype_to_timestep=align_dtype_to_timestep)
84
+ if diffusers_compatible_format:
85
+ self.timestep_embedder = DiffusersCompatibleTimestepProj(dim_in, dim_out)
86
+ else:
87
+ self.timestep_embedder = torch.nn.Sequential(
88
+ torch.nn.Linear(dim_in, dim_out), torch.nn.SiLU(), torch.nn.Linear(dim_out, dim_out)
89
+ )
90
+ self.use_additional_t_cond = use_additional_t_cond
91
+ if use_additional_t_cond:
92
+ self.addition_t_embedding = torch.nn.Embedding(2, dim_out)
93
+
94
+ def forward(self, timestep, dtype, addition_t_cond=None):
95
+ time_emb = self.time_proj(timestep).to(dtype)
96
+ time_emb = self.timestep_embedder(time_emb)
97
+ if addition_t_cond is not None:
98
+ addition_t_emb = self.addition_t_embedding(addition_t_cond)
99
+ addition_t_emb = addition_t_emb.to(dtype=dtype)
100
+ time_emb = time_emb + addition_t_emb
101
+ return time_emb
102
+
103
+
104
+ class RMSNorm(torch.nn.Module):
105
+ def __init__(self, dim, eps, elementwise_affine=True):
106
+ super().__init__()
107
+ self.eps = eps
108
+ if elementwise_affine:
109
+ self.weight = torch.nn.Parameter(torch.ones((dim,)))
110
+ else:
111
+ self.weight = None
112
+
113
+ def forward(self, hidden_states):
114
+ input_dtype = hidden_states.dtype
115
+ variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
116
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
117
+ hidden_states = hidden_states.to(input_dtype)
118
+ if self.weight is not None:
119
+ hidden_states = hidden_states * self.weight
120
+ return hidden_states
121
+
122
+
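
The RMSNorm above normalizes by the root mean square of the last dimension, with no mean subtraction; a minimal numerical check (assuming `RMSNorm` from this file is in scope):

```python
import torch

norm = RMSNorm(dim=8, eps=1e-6)
x = torch.randn(2, 8)
manual = x / torch.sqrt(x.square().mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), manual * norm.weight, atol=1e-5)
```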
123
+ class AdaLayerNorm(torch.nn.Module):
124
+ def __init__(self, dim, single=False, dual=False):
125
+ super().__init__()
126
+ self.single = single
127
+ self.dual = dual
128
+ self.linear = torch.nn.Linear(dim, dim * [[6, 2][single], 9][dual])  # 9 modulation params if dual, 2 if single, else 6
129
+ self.norm = torch.nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
130
+
131
+ def forward(self, x, emb):
132
+ emb = self.linear(torch.nn.functional.silu(emb))
133
+ if self.single:
134
+ scale, shift = emb.unsqueeze(1).chunk(2, dim=2)
135
+ x = self.norm(x) * (1 + scale) + shift
136
+ return x
137
+ elif self.dual:
138
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp, shift_msa2, scale_msa2, gate_msa2 = emb.unsqueeze(1).chunk(9, dim=2)
139
+ norm_x = self.norm(x)
140
+ x = norm_x * (1 + scale_msa) + shift_msa
141
+ norm_x2 = norm_x * (1 + scale_msa2) + shift_msa2
142
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_x2, gate_msa2
143
+ else:
144
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.unsqueeze(1).chunk(6, dim=2)
145
+ x = self.norm(x) * (1 + scale_msa) + shift_msa
146
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
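
The nested-list indexing in `AdaLayerNorm.__init__` resolves to 9 modulation vectors when `dual=True`, 2 when `single=True`, and 6 otherwise, matching the three branches of `forward`. A minimal sketch (assuming `AdaLayerNorm` from this file is in scope):

```python
for single, dual, expected in [(False, False, 6), (True, False, 2), (False, True, 9)]:
    assert [[6, 2][single], 9][dual] == expected       # bools index as 0/1
    layer = AdaLayerNorm(dim=64, single=single, dual=dual)
    assert layer.linear.out_features == 64 * expected
```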
diffsynth/models/longcat_video_dit.py ADDED
@@ -0,0 +1,902 @@
1
+ from typing import List, Optional, Tuple
2
+
3
+ import math
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.amp as amp
7
+
8
+ import numpy as np
9
+ import torch.nn.functional as F
10
+ from einops import rearrange, repeat
11
+ from .wan_video_dit import flash_attention
12
+ from ..core.device.npu_compatible_device import get_device_type
13
+ from ..core.gradient import gradient_checkpoint_forward
14
+
15
+
16
+ class RMSNorm_FP32(torch.nn.Module):
17
+ def __init__(self, dim: int, eps: float):
18
+ super().__init__()
19
+ self.eps = eps
20
+ self.weight = nn.Parameter(torch.ones(dim))
21
+
22
+ def _norm(self, x):
23
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
24
+
25
+ def forward(self, x):
26
+ output = self._norm(x.float()).type_as(x)
27
+ return output * self.weight
28
+
29
+
30
+ def broadcat(tensors, dim=-1):
31
+ num_tensors = len(tensors)
32
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
33
+ assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
34
+ shape_len = list(shape_lens)[0]
35
+ dim = (dim + shape_len) if dim < 0 else dim
36
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
37
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
38
+ assert all(
39
+ [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
40
+ ), "invalid dimensions for broadcastable concatentation"
41
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
42
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
43
+ expanded_dims.insert(dim, (dim, dims[dim]))
44
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
45
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
46
+ return torch.cat(tensors, dim=dim)
47
+
48
+
49
+ def rotate_half(x):
50
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
51
+ x1, x2 = x.unbind(dim=-1)
52
+ x = torch.stack((-x2, x1), dim=-1)
53
+ return rearrange(x, "... d r -> ... (d r)")
54
+
55
+
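
`rotate_half` rotates each adjacent channel pair by 90 degrees, the core operation of rotary embeddings; a tiny check (assuming `rotate_half` from this file is in scope):

```python
import torch

x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
print(rotate_half(x))  # tensor([[-2., 1., -4., 3.]]): each pair (x1, x2) -> (-x2, x1)
```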
56
+ class RotaryPositionalEmbedding(nn.Module):
57
+
58
+ def __init__(self,
59
+ head_dim,
60
+ cp_split_hw=None
61
+ ):
62
+ """Rotary positional embedding for 3D
63
+ Reference : https://blog.eleuther.ai/rotary-embeddings/
64
+ Paper: https://arxiv.org/pdf/2104.09864.pdf
65
+ Args:
66
+ head_dim: Dimension of each attention head
67
+ cp_split_hw: Optional [rows, cols] context-parallel split of the spatial grid
68
+ """
69
+ super().__init__()
70
+ self.head_dim = head_dim
71
+ assert self.head_dim % 8 == 0, 'Dim must be a multiple of 8 for 3D RoPE.'
72
+ self.cp_split_hw = cp_split_hw
73
+ # We assume the longest side of the grid will not be larger than 512, i.e. 512 * 8 = 4096 input pixels
74
+ self.base = 10000
75
+ self.freqs_dict = {}
76
+
77
+ def register_grid_size(self, grid_size):
78
+ if grid_size not in self.freqs_dict:
79
+ self.freqs_dict.update({
80
+ grid_size: self.precompute_freqs_cis_3d(grid_size)
81
+ })
82
+
83
+ def precompute_freqs_cis_3d(self, grid_size):
84
+ num_frames, height, width = grid_size
85
+ dim_t = self.head_dim - 4 * (self.head_dim // 6)
86
+ dim_h = 2 * (self.head_dim // 6)
87
+ dim_w = 2 * (self.head_dim // 6)
88
+ freqs_t = 1.0 / (self.base ** (torch.arange(0, dim_t, 2)[: (dim_t // 2)].float() / dim_t))
89
+ freqs_h = 1.0 / (self.base ** (torch.arange(0, dim_h, 2)[: (dim_h // 2)].float() / dim_h))
90
+ freqs_w = 1.0 / (self.base ** (torch.arange(0, dim_w, 2)[: (dim_w // 2)].float() / dim_w))
91
+ grid_t = np.linspace(0, num_frames, num_frames, endpoint=False, dtype=np.float32)
92
+ grid_h = np.linspace(0, height, height, endpoint=False, dtype=np.float32)
93
+ grid_w = np.linspace(0, width, width, endpoint=False, dtype=np.float32)
94
+ grid_t = torch.from_numpy(grid_t).float()
95
+ grid_h = torch.from_numpy(grid_h).float()
96
+ grid_w = torch.from_numpy(grid_w).float()
97
+ freqs_t = torch.einsum("..., f -> ... f", grid_t, freqs_t)
98
+ freqs_h = torch.einsum("..., f -> ... f", grid_h, freqs_h)
99
+ freqs_w = torch.einsum("..., f -> ... f", grid_w, freqs_w)
100
+ freqs_t = repeat(freqs_t, "... n -> ... (n r)", r=2)
101
+ freqs_h = repeat(freqs_h, "... n -> ... (n r)", r=2)
102
+ freqs_w = repeat(freqs_w, "... n -> ... (n r)", r=2)
103
+ freqs = broadcat((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1)
104
+ # (T H W D)
105
+ freqs = rearrange(freqs, "T H W D -> (T H W) D")
106
+ # if self.cp_split_hw[0] * self.cp_split_hw[1] > 1:
107
+ # with torch.no_grad():
108
+ # freqs = rearrange(freqs, "(T H W) D -> T H W D", T=num_frames, H=height, W=width)
109
+ # freqs = context_parallel_util.split_cp_2d(freqs, seq_dim_hw=(1, 2), split_hw=self.cp_split_hw)
110
+ # freqs = rearrange(freqs, "T H W D -> (T H W) D")
111
+
112
+ return freqs
113
+
114
+ def forward(self, q, k, grid_size):
115
+ """3D RoPE.
116
+
117
+ Args:
118
+ query: [B, head, seq, head_dim]
119
+ key: [B, head, seq, head_dim]
120
+ Returns:
121
+ query and key with the same shape as input.
122
+ """
123
+
124
+ if grid_size not in self.freqs_dict:
125
+ self.register_grid_size(grid_size)
126
+
127
+ freqs_cis = self.freqs_dict[grid_size].to(q.device)
128
+ q_, k_ = q.float(), k.float()
129
+ freqs_cis = freqs_cis.float().to(q.device)
130
+ cos, sin = freqs_cis.cos(), freqs_cis.sin()
131
+ cos, sin = rearrange(cos, 'n d -> 1 1 n d'), rearrange(sin, 'n d -> 1 1 n d')
132
+ q_ = (q_ * cos) + (rotate_half(q_) * sin)
133
+ k_ = (k_ * cos) + (rotate_half(k_) * sin)
134
+
135
+ return q_.type_as(q), k_.type_as(k)
136
+
137
+
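
`precompute_freqs_cis_3d` splits the head dimension across time, height, and width, with the remainder after the two spatial shares going to time. The arithmetic for a typical head size (a minimal sketch of the split used above):

```python
head_dim = 128
dim_h = dim_w = 2 * (head_dim // 6)     # 42 channels each for height and width
dim_t = head_dim - 4 * (head_dim // 6)  # 44 channels for time (absorbs the remainder)
assert dim_t + dim_h + dim_w == head_dim
```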
138
+ class Attention(nn.Module):
139
+ def __init__(
140
+ self,
141
+ dim: int,
142
+ num_heads: int,
143
+ enable_flashattn3: bool = False,
144
+ enable_flashattn2: bool = False,
145
+ enable_xformers: bool = False,
146
+ enable_bsa: bool = False,
147
+ bsa_params: dict = None,
148
+ cp_split_hw: Optional[List[int]] = None
149
+ ) -> None:
150
+ super().__init__()
151
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
152
+ self.dim = dim
153
+ self.num_heads = num_heads
154
+ self.head_dim = dim // num_heads
155
+ self.scale = self.head_dim**-0.5
156
+ self.enable_flashattn3 = enable_flashattn3
157
+ self.enable_flashattn2 = enable_flashattn2
158
+ self.enable_xformers = enable_xformers
159
+ self.enable_bsa = enable_bsa
160
+ self.bsa_params = bsa_params
161
+ self.cp_split_hw = cp_split_hw
162
+
163
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
164
+ self.q_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
165
+ self.k_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
166
+ self.proj = nn.Linear(dim, dim)
167
+
168
+ self.rope_3d = RotaryPositionalEmbedding(
169
+ self.head_dim,
170
+ cp_split_hw=cp_split_hw
171
+ )
172
+
173
+ def _process_attn(self, q, k, v, shape):
174
+ q = rearrange(q, "B H S D -> B S (H D)")
175
+ k = rearrange(k, "B H S D -> B S (H D)")
176
+ v = rearrange(v, "B H S D -> B S (H D)")
177
+ x = flash_attention(q, k, v, num_heads=self.num_heads)
178
+ x = rearrange(x, "B S (H D) -> B H S D", H=self.num_heads)
179
+ return x
180
+
181
+ def forward(self, x: torch.Tensor, shape=None, num_cond_latents=None, return_kv=False) -> torch.Tensor:
182
+ """
183
+ """
184
+ B, N, C = x.shape
185
+ qkv = self.qkv(x)
186
+
187
+ qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
188
+ qkv = qkv.view(qkv_shape).permute((2, 0, 3, 1, 4)) # [3, B, H, N, D]
189
+ q, k, v = qkv.unbind(0)
190
+ q, k = self.q_norm(q), self.k_norm(k)
191
+
192
+ if return_kv:
193
+ k_cache, v_cache = k.clone(), v.clone()
194
+
195
+ q, k = self.rope_3d(q, k, shape)
196
+
197
+ # cond mode
198
+ if num_cond_latents is not None and num_cond_latents > 0:
199
+ num_cond_latents_thw = num_cond_latents * (N // shape[0])
200
+ # process the condition tokens
201
+ q_cond = q[:, :, :num_cond_latents_thw].contiguous()
202
+ k_cond = k[:, :, :num_cond_latents_thw].contiguous()
203
+ v_cond = v[:, :, :num_cond_latents_thw].contiguous()
204
+ x_cond = self._process_attn(q_cond, k_cond, v_cond, shape)
205
+ # process the noise tokens
206
+ q_noise = q[:, :, num_cond_latents_thw:].contiguous()
207
+ x_noise = self._process_attn(q_noise, k, v, shape)
208
+ # merge x_cond and x_noise
209
+ x = torch.cat([x_cond, x_noise], dim=2).contiguous()
210
+ else:
211
+ x = self._process_attn(q, k, v, shape)
212
+
213
+ x_output_shape = (B, N, C)
214
+ x = x.transpose(1, 2) # [B, H, N, D] --> [B, N, H, D]
215
+ x = x.reshape(x_output_shape) # [B, N, H, D] --> [B, N, C]
216
+ x = self.proj(x)
217
+
218
+ if return_kv:
219
+ return x, (k_cache, v_cache)
220
+ else:
221
+ return x
222
+
223
+ def forward_with_kv_cache(self, x: torch.Tensor, shape=None, num_cond_latents=None, kv_cache=None) -> torch.Tensor:
224
+ """
225
+ """
226
+ B, N, C = x.shape
227
+ qkv = self.qkv(x)
228
+
229
+ qkv_shape = (B, N, 3, self.num_heads, self.head_dim)
230
+ qkv = qkv.view(qkv_shape).permute((2, 0, 3, 1, 4)) # [3, B, H, N, D]
231
+ q, k, v = qkv.unbind(0)
232
+ q, k = self.q_norm(q), self.k_norm(k)
233
+
234
+ T, H, W = shape
235
+ k_cache, v_cache = kv_cache
236
+ assert k_cache.shape[0] == v_cache.shape[0] and k_cache.shape[0] in [1, B]
237
+ if k_cache.shape[0] == 1:
238
+ k_cache = k_cache.repeat(B, 1, 1, 1)
239
+ v_cache = v_cache.repeat(B, 1, 1, 1)
240
+
241
+ if num_cond_latents is not None and num_cond_latents > 0:
242
+ k_full = torch.cat([k_cache, k], dim=2).contiguous()
243
+ v_full = torch.cat([v_cache, v], dim=2).contiguous()
244
+ q_padding = torch.cat([torch.empty_like(k_cache), q], dim=2).contiguous()  # placeholder prefix only sets RoPE positions; sliced off below
245
+ q_padding, k_full = self.rope_3d(q_padding, k_full, (T + num_cond_latents, H, W))
246
+ q = q_padding[:, :, -N:].contiguous()
247
+
248
+ x = self._process_attn(q, k_full, v_full, shape)
249
+
250
+ x_output_shape = (B, N, C)
251
+ x = x.transpose(1, 2) # [B, H, N, D] --> [B, N, H, D]
252
+ x = x.reshape(x_output_shape) # [B, N, H, D] --> [B, N, C]
253
+ x = self.proj(x)
254
+
255
+ return x
256
+
257
+
258
+ class MultiHeadCrossAttention(nn.Module):
259
+ def __init__(
260
+ self,
261
+ dim,
262
+ num_heads,
263
+ enable_flashattn3=False,
264
+ enable_flashattn2=False,
265
+ enable_xformers=False,
266
+ ):
267
+ super(MultiHeadCrossAttention, self).__init__()
268
+ assert dim % num_heads == 0, "dim must be divisible by num_heads"
269
+
270
+ self.dim = dim
271
+ self.num_heads = num_heads
272
+ self.head_dim = dim // num_heads
273
+
274
+ self.q_linear = nn.Linear(dim, dim)
275
+ self.kv_linear = nn.Linear(dim, dim * 2)
276
+ self.proj = nn.Linear(dim, dim)
277
+
278
+ self.q_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
279
+ self.k_norm = RMSNorm_FP32(self.head_dim, eps=1e-6)
280
+
281
+ self.enable_flashattn3 = enable_flashattn3
282
+ self.enable_flashattn2 = enable_flashattn2
283
+ self.enable_xformers = enable_xformers
284
+
285
+ def _process_cross_attn(self, x, cond, kv_seqlen):
286
+ B, N, C = x.shape
287
+ assert C == self.dim and cond.shape[2] == self.dim
288
+
289
+ q = self.q_linear(x).view(1, -1, self.num_heads, self.head_dim)
290
+ kv = self.kv_linear(cond).view(1, -1, 2, self.num_heads, self.head_dim)
291
+ k, v = kv.unbind(2)
292
+
293
+ q, k = self.q_norm(q), self.k_norm(k)
294
+
295
+ q = rearrange(q, "B S H D -> B S (H D)")
296
+ k = rearrange(k, "B S H D -> B S (H D)")
297
+ v = rearrange(v, "B S H D -> B S (H D)")
298
+ x = flash_attention(q, k, v, num_heads=self.num_heads)
299
+
300
+ x = x.view(B, -1, C)
301
+ x = self.proj(x)
302
+ return x
303
+
304
+ def forward(self, x, cond, kv_seqlen, num_cond_latents=None, shape=None):
305
+ """
306
+ x: [B, N, C]
307
+ cond: [B, M, C]
308
+ """
309
+ if num_cond_latents is None or num_cond_latents == 0:
310
+ return self._process_cross_attn(x, cond, kv_seqlen)
311
+ else:
312
+ B, N, C = x.shape
313
+ if num_cond_latents is not None and num_cond_latents > 0:
314
+ assert shape is not None, "shape must be provided when num_cond_latents > 0"
315
+ num_cond_latents_thw = num_cond_latents * (N // shape[0])
316
+ x_noise = x[:, num_cond_latents_thw:] # [B, N_noise, C]
317
+ output_noise = self._process_cross_attn(x_noise, cond, kv_seqlen) # [B, N_noise, C]
318
+ output = torch.cat([
319
+ torch.zeros((B, num_cond_latents_thw, C), dtype=output_noise.dtype, device=output_noise.device),
320
+ output_noise
321
+ ], dim=1).contiguous()
322
+ else:
323
+ raise NotImplementedError
324
+
325
+ return output
326
+
327
+
328
+ class LayerNorm_FP32(nn.LayerNorm):
329
+ def __init__(self, dim, eps, elementwise_affine):
330
+ super().__init__(dim, eps=eps, elementwise_affine=elementwise_affine)
331
+
332
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
333
+ origin_dtype = inputs.dtype
334
+ out = F.layer_norm(
335
+ inputs.float(),
336
+ self.normalized_shape,
337
+ None if self.weight is None else self.weight.float(),
338
+ None if self.bias is None else self.bias.float(),
339
+ self.eps
340
+ ).to(origin_dtype)
341
+ return out
342
+
343
+
344
+ def modulate_fp32(norm_func, x, shift, scale):
345
+ # Suppose x is (B, N, D), shift is (B, -1, D), scale is (B, -1, D)
346
+ # ensure the modulation params be fp32
347
+ assert shift.dtype == torch.float32 and scale.dtype == torch.float32
348
+ dtype = x.dtype
349
+ x = norm_func(x.to(torch.float32))
350
+ x = x * (scale + 1) + shift
351
+ x = x.to(dtype)
352
+ return x
353
+
354
+
355
+ class FinalLayer_FP32(nn.Module):
356
+ """
357
+ The final layer of DiT.
358
+ """
359
+
360
+ def __init__(self, hidden_size, num_patch, out_channels, adaln_tembed_dim):
361
+ super().__init__()
362
+ self.hidden_size = hidden_size
363
+ self.num_patch = num_patch
364
+ self.out_channels = out_channels
365
+ self.adaln_tembed_dim = adaln_tembed_dim
366
+
367
+ self.norm_final = LayerNorm_FP32(hidden_size, elementwise_affine=False, eps=1e-6)
368
+ self.linear = nn.Linear(hidden_size, num_patch * out_channels, bias=True)
369
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(adaln_tembed_dim, 2 * hidden_size, bias=True))
370
+
371
+ def forward(self, x, t, latent_shape):
372
+ # timestep shape: [B, T, C]
373
+ assert t.dtype == torch.float32
374
+ B, N, C = x.shape
375
+ T, _, _ = latent_shape
376
+
377
+ with amp.autocast(get_device_type(), dtype=torch.float32):
378
+ shift, scale = self.adaLN_modulation(t).unsqueeze(2).chunk(2, dim=-1) # [B, T, 1, C]
379
+ x = modulate_fp32(self.norm_final, x.view(B, T, -1, C), shift, scale).view(B, N, C)
380
+ x = self.linear(x)
381
+ return x
382
+
383
+
384
+ class FeedForwardSwiGLU(nn.Module):
385
+ def __init__(
386
+ self,
387
+ dim: int,
388
+ hidden_dim: int,
389
+ multiple_of: int = 256,
390
+ ffn_dim_multiplier: Optional[float] = None,
391
+ ):
392
+ super().__init__()
393
+ hidden_dim = int(2 * hidden_dim / 3)
394
+ # custom dim factor multiplier
395
+ if ffn_dim_multiplier is not None:
396
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
397
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
398
+
399
+ self.dim = dim
400
+ self.hidden_dim = hidden_dim
401
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
402
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
403
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
404
+
405
+ def forward(self, x):
406
+ return self.w2(F.silu(self.w1(x)) * self.w3(x))
407
+
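
`FeedForwardSwiGLU` rescales the requested hidden width by 2/3 (so the three-matrix SwiGLU has roughly the parameter count of a two-matrix MLP) and rounds up to `multiple_of`. For the model defaults (a minimal sketch of the width computation above):

```python
dim, mlp_ratio, multiple_of = 4096, 4, 256
hidden_dim = int(2 * (dim * mlp_ratio) / 3)                         # 10922
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
assert hidden_dim == 11008                                          # rounded up to a multiple of 256
```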
408
+
409
+ class TimestepEmbedder(nn.Module):
410
+ """
411
+ Embeds scalar timesteps into vector representations.
412
+ """
413
+
414
+ def __init__(self, t_embed_dim, frequency_embedding_size=256):
415
+ super().__init__()
416
+ self.t_embed_dim = t_embed_dim
417
+ self.frequency_embedding_size = frequency_embedding_size
418
+ self.mlp = nn.Sequential(
419
+ nn.Linear(frequency_embedding_size, t_embed_dim, bias=True),
420
+ nn.SiLU(),
421
+ nn.Linear(t_embed_dim, t_embed_dim, bias=True),
422
+ )
423
+
424
+ @staticmethod
425
+ def timestep_embedding(t, dim, max_period=10000):
426
+ """
427
+ Create sinusoidal timestep embeddings.
428
+ :param t: a 1-D Tensor of N indices, one per batch element.
429
+ These may be fractional.
430
+ :param dim: the dimension of the output.
431
+ :param max_period: controls the minimum frequency of the embeddings.
432
+ :return: an (N, D) Tensor of positional embeddings.
433
+ """
434
+ half = dim // 2
435
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half)
436
+ freqs = freqs.to(device=t.device)
437
+ args = t[:, None].float() * freqs[None]
438
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
439
+ if dim % 2:
440
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
441
+ return embedding
442
+
443
+ def forward(self, t, dtype):
444
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
445
+ if t_freq.dtype != dtype:
446
+ t_freq = t_freq.to(dtype)
447
+ t_emb = self.mlp(t_freq)
448
+ return t_emb
449
+
450
+
451
+ class CaptionEmbedder(nn.Module):
452
+ """
453
+ Embeds class labels into vector representations.
454
+ """
455
+
456
+ def __init__(self, in_channels, hidden_size):
457
+ super().__init__()
458
+ self.in_channels = in_channels
459
+ self.hidden_size = hidden_size
460
+ self.y_proj = nn.Sequential(
461
+ nn.Linear(in_channels, hidden_size, bias=True),
462
+ nn.GELU(approximate="tanh"),
463
+ nn.Linear(hidden_size, hidden_size, bias=True),
464
+ )
465
+
466
+ def forward(self, caption):
467
+ B, _, N, C = caption.shape
468
+ caption = self.y_proj(caption)
469
+ return caption
470
+
471
+
472
+ class PatchEmbed3D(nn.Module):
473
+ """Video to Patch Embedding.
474
+
475
+ Args:
476
+ patch_size (int): Patch token size. Default: (2,4,4).
477
+ in_chans (int): Number of input video channels. Default: 3.
478
+ embed_dim (int): Number of linear projection output channels. Default: 96.
479
+ norm_layer (nn.Module, optional): Normalization layer. Default: None
480
+ """
481
+
482
+ def __init__(
483
+ self,
484
+ patch_size=(2, 4, 4),
485
+ in_chans=3,
486
+ embed_dim=96,
487
+ norm_layer=None,
488
+ flatten=True,
489
+ ):
490
+ super().__init__()
491
+ self.patch_size = patch_size
492
+ self.flatten = flatten
493
+
494
+ self.in_chans = in_chans
495
+ self.embed_dim = embed_dim
496
+
497
+ self.proj = nn.Conv3d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
498
+ if norm_layer is not None:
499
+ self.norm = norm_layer(embed_dim)
500
+ else:
501
+ self.norm = None
502
+
503
+ def forward(self, x):
504
+ """Forward function."""
505
+ # padding
506
+ _, _, D, H, W = x.size()
507
+ if W % self.patch_size[2] != 0:
508
+ x = F.pad(x, (0, self.patch_size[2] - W % self.patch_size[2]))
509
+ if H % self.patch_size[1] != 0:
510
+ x = F.pad(x, (0, 0, 0, self.patch_size[1] - H % self.patch_size[1]))
511
+ if D % self.patch_size[0] != 0:
512
+ x = F.pad(x, (0, 0, 0, 0, 0, self.patch_size[0] - D % self.patch_size[0]))
513
+
514
+ B, C, T, H, W = x.shape
515
+ x = self.proj(x) # (B C T H W)
516
+ if self.norm is not None:
517
+ D, Wh, Ww = x.size(2), x.size(3), x.size(4)
518
+ x = x.flatten(2).transpose(1, 2)
519
+ x = self.norm(x)
520
+ x = x.transpose(1, 2).view(-1, self.embed_dim, D, Wh, Ww)
521
+ if self.flatten:
522
+ x = x.flatten(2).transpose(1, 2) # BCTHW -> BNC
523
+ return x
524
+
525
+
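
With the transformer's default `patch_size=(1, 2, 2)`, `PatchEmbed3D` leaves the temporal axis untouched and halves each spatial axis; the resulting token count (a minimal sketch with assumed latent dimensions):

```python
T, H, W = 16, 60, 104            # assumed latent frames, height, width
patch_size = (1, 2, 2)
tokens = (T // patch_size[0]) * (H // patch_size[1]) * (W // patch_size[2])
assert tokens == 16 * 30 * 52    # 24960 tokens per sample
```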
526
+ class LongCatSingleStreamBlock(nn.Module):
527
+ def __init__(
528
+ self,
529
+ hidden_size: int,
530
+ num_heads: int,
531
+ mlp_ratio: int,
532
+ adaln_tembed_dim: int,
533
+ enable_flashattn3: bool = False,
534
+ enable_flashattn2: bool = False,
535
+ enable_xformers: bool = False,
536
+ enable_bsa: bool = False,
537
+ bsa_params=None,
538
+ cp_split_hw=None
539
+ ):
540
+ super().__init__()
541
+
542
+ self.hidden_size = hidden_size
543
+
544
+ # scale and gate modulation
545
+ self.adaLN_modulation = nn.Sequential(
546
+ nn.SiLU(),
547
+ nn.Linear(adaln_tembed_dim, 6 * hidden_size, bias=True)
548
+ )
549
+
550
+ self.mod_norm_attn = LayerNorm_FP32(hidden_size, eps=1e-6, elementwise_affine=False)
551
+ self.mod_norm_ffn = LayerNorm_FP32(hidden_size, eps=1e-6, elementwise_affine=False)
552
+ self.pre_crs_attn_norm = LayerNorm_FP32(hidden_size, eps=1e-6, elementwise_affine=True)
553
+
554
+ self.attn = Attention(
555
+ dim=hidden_size,
556
+ num_heads=num_heads,
557
+ enable_flashattn3=enable_flashattn3,
558
+ enable_flashattn2=enable_flashattn2,
559
+ enable_xformers=enable_xformers,
560
+ enable_bsa=enable_bsa,
561
+ bsa_params=bsa_params,
562
+ cp_split_hw=cp_split_hw
563
+ )
564
+ self.cross_attn = MultiHeadCrossAttention(
565
+ dim=hidden_size,
566
+ num_heads=num_heads,
567
+ enable_flashattn3=enable_flashattn3,
568
+ enable_flashattn2=enable_flashattn2,
569
+ enable_xformers=enable_xformers,
570
+ )
571
+ self.ffn = FeedForwardSwiGLU(dim=hidden_size, hidden_dim=int(hidden_size * mlp_ratio))
572
+
573
+ def forward(self, x, y, t, y_seqlen, latent_shape, num_cond_latents=None, return_kv=False, kv_cache=None, skip_crs_attn=False):
574
+ """
575
+ x: [B, N, C]
576
+ y: [1, N_valid_tokens, C]
577
+ t: [B, T, C_t]
578
+ y_seqlen: [B]; type of a list
579
+ latent_shape: latent shape of a single item
580
+ """
581
+ x_dtype = x.dtype
582
+
583
+ B, N, C = x.shape
584
+ T, _, _ = latent_shape # S != T*H*W in case of CP split on H*W.
585
+
586
+ # compute modulation params in fp32
587
+ with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
588
+ shift_msa, scale_msa, gate_msa, \
589
+ shift_mlp, scale_mlp, gate_mlp = \
590
+ self.adaLN_modulation(t).unsqueeze(2).chunk(6, dim=-1) # [B, T, 1, C]
591
+
592
+ # self attn with modulation
593
+ x_m = modulate_fp32(self.mod_norm_attn, x.view(B, T, -1, C), shift_msa, scale_msa).view(B, N, C)
594
+
595
+ if kv_cache is not None:
596
+ kv_cache = (kv_cache[0].to(x.device), kv_cache[1].to(x.device))
597
+ attn_outputs = self.attn.forward_with_kv_cache(x_m, shape=latent_shape, num_cond_latents=num_cond_latents, kv_cache=kv_cache)
598
+ else:
599
+ attn_outputs = self.attn(x_m, shape=latent_shape, num_cond_latents=num_cond_latents, return_kv=return_kv)
600
+
601
+ if return_kv:
602
+ x_s, kv_cache = attn_outputs
603
+ else:
604
+ x_s = attn_outputs
605
+
606
+ with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
607
+ x = x + (gate_msa * x_s.view(B, -1, N//T, C)).view(B, -1, C) # [B, N, C]
608
+ x = x.to(x_dtype)
609
+
610
+ # cross attn
611
+ if not skip_crs_attn:
612
+ if kv_cache is not None:
613
+ num_cond_latents = None
614
+ x = x + self.cross_attn(self.pre_crs_attn_norm(x), y, y_seqlen, num_cond_latents=num_cond_latents, shape=latent_shape)
615
+
616
+ # ffn with modulation
617
+ x_m = modulate_fp32(self.mod_norm_ffn, x.view(B, -1, N//T, C), shift_mlp, scale_mlp).view(B, -1, C)
618
+ x_s = self.ffn(x_m)
619
+ with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
620
+ x = x + (gate_mlp * x_s.view(B, -1, N//T, C)).view(B, -1, C) # [B, N, C]
621
+ x = x.to(x_dtype)
622
+
623
+ if return_kv:
624
+ return x, kv_cache
625
+ else:
626
+ return x
627
+
628
+
629
+ class LongCatVideoTransformer3DModel(torch.nn.Module):
630
+ def __init__(
631
+ self,
632
+ in_channels: int = 16,
633
+ out_channels: int = 16,
634
+ hidden_size: int = 4096,
635
+ depth: int = 48,
636
+ num_heads: int = 32,
637
+ caption_channels: int = 4096,
638
+ mlp_ratio: int = 4,
639
+ adaln_tembed_dim: int = 512,
640
+ frequency_embedding_size: int = 256,
641
+ # default params
642
+ patch_size: Tuple[int] = (1, 2, 2),
643
+ # attention config
644
+ enable_flashattn3: bool = False,
645
+ enable_flashattn2: bool = True,
646
+ enable_xformers: bool = False,
647
+ enable_bsa: bool = False,
648
+ bsa_params: dict = {'sparsity': 0.9375, 'chunk_3d_shape_q': [4, 4, 4], 'chunk_3d_shape_k': [4, 4, 4]},
649
+ cp_split_hw: Optional[List[int]] = [1, 1],
650
+ text_tokens_zero_pad: bool = True,
651
+ ) -> None:
652
+ super().__init__()
653
+
654
+ self.patch_size = patch_size
655
+ self.in_channels = in_channels
656
+ self.out_channels = out_channels
657
+ self.cp_split_hw = cp_split_hw
658
+
659
+ self.x_embedder = PatchEmbed3D(patch_size, in_channels, hidden_size)
660
+ self.t_embedder = TimestepEmbedder(t_embed_dim=adaln_tembed_dim, frequency_embedding_size=frequency_embedding_size)
661
+ self.y_embedder = CaptionEmbedder(
662
+ in_channels=caption_channels,
663
+ hidden_size=hidden_size,
664
+ )
665
+
666
+ self.blocks = nn.ModuleList(
667
+ [
668
+ LongCatSingleStreamBlock(
669
+ hidden_size=hidden_size,
670
+ num_heads=num_heads,
671
+ mlp_ratio=mlp_ratio,
672
+ adaln_tembed_dim=adaln_tembed_dim,
673
+ enable_flashattn3=enable_flashattn3,
674
+ enable_flashattn2=enable_flashattn2,
675
+ enable_xformers=enable_xformers,
676
+ enable_bsa=enable_bsa,
677
+ bsa_params=bsa_params,
678
+ cp_split_hw=cp_split_hw
679
+ )
680
+ for i in range(depth)
681
+ ]
682
+ )
683
+
684
+ self.final_layer = FinalLayer_FP32(
685
+ hidden_size,
686
+ np.prod(self.patch_size),
687
+ out_channels,
688
+ adaln_tembed_dim,
689
+ )
690
+
691
+ self.gradient_checkpointing = False
692
+ self.text_tokens_zero_pad = text_tokens_zero_pad
693
+
694
+ self.lora_dict = {}
695
+ self.active_loras = []
696
+
697
+ def enable_loras(self, lora_key_list=[]):
698
+ self.disable_all_loras()
699
+
700
+ module_loras = {} # {module_name: [lora1, lora2, ...]}
701
+ model_device = next(self.parameters()).device
702
+ model_dtype = next(self.parameters()).dtype
703
+
704
+ for lora_key in lora_key_list:
705
+ if lora_key in self.lora_dict:
706
+ for lora in self.lora_dict[lora_key].loras:
707
+ lora.to(model_device, dtype=model_dtype, non_blocking=True)
708
+ module_name = lora.lora_name.replace("lora___lorahyphen___", "").replace("___lorahyphen___", ".")
709
+ if module_name not in module_loras:
710
+ module_loras[module_name] = []
711
+ module_loras[module_name].append(lora)
712
+ self.active_loras.append(lora_key)
713
+
714
+ for module_name, loras in module_loras.items():
715
+ module = self._get_module_by_name(module_name)
716
+ if not hasattr(module, 'org_forward'):
717
+ module.org_forward = module.forward
718
+ module.forward = self._create_multi_lora_forward(module, loras)
719
+
720
+ def _create_multi_lora_forward(self, module, loras):
721
+ def multi_lora_forward(x, *args, **kwargs):
722
+ weight_dtype = x.dtype
723
+ org_output = module.org_forward(x, *args, **kwargs)
724
+
725
+ total_lora_output = 0
726
+ for lora in loras:
727
+ if lora.use_lora:
728
+ lx = lora.lora_down(x.to(lora.lora_down.weight.dtype))
729
+ lx = lora.lora_up(lx)
730
+ lora_output = lx.to(weight_dtype) * lora.multiplier * lora.alpha_scale
731
+ total_lora_output += lora_output
732
+
733
+ return org_output + total_lora_output
734
+
735
+ return multi_lora_forward
736
+
737
+ def _get_module_by_name(self, module_name):
738
+ try:
739
+ module = self
740
+ for part in module_name.split('.'):
741
+ module = getattr(module, part)
742
+ return module
743
+ except AttributeError as e:
744
+ raise ValueError(f"Cannot find module: {module_name}, error: {e}")
745
+
746
+ def disable_all_loras(self):
747
+ for name, module in self.named_modules():
748
+ if hasattr(module, 'org_forward'):
749
+ module.forward = module.org_forward
750
+ delattr(module, 'org_forward')
751
+
752
+ for lora_key, lora_network in self.lora_dict.items():
753
+ for lora in lora_network.loras:
754
+ lora.to("cpu")
755
+
756
+ self.active_loras.clear()
757
+
758
+ def enable_bsa(self,):
759
+ for block in self.blocks:
760
+ block.attn.enable_bsa = True
761
+
762
+ def disable_bsa(self,):
763
+ for block in self.blocks:
764
+ block.attn.enable_bsa = False
765
+
766
+ def forward(
767
+ self,
768
+ hidden_states,
769
+ timestep,
770
+ encoder_hidden_states,
771
+ encoder_attention_mask=None,
772
+ num_cond_latents=0,
773
+ return_kv=False,
774
+ kv_cache_dict={},
775
+ skip_crs_attn=False,
776
+ offload_kv_cache=False,
777
+ use_gradient_checkpointing=False,
778
+ use_gradient_checkpointing_offload=False,
779
+ ):
780
+
781
+ B, _, T, H, W = hidden_states.shape
782
+
783
+ N_t = T // self.patch_size[0]
784
+ N_h = H // self.patch_size[1]
785
+ N_w = W // self.patch_size[2]
786
+
787
+ assert self.patch_size[0]==1, "Currently, 3D x_embedder should not compress the temporal dimension."
788
+
789
+ # expand the shape of timestep from [B] to [B, T]
790
+ if len(timestep.shape) == 1:
791
+ timestep = timestep.unsqueeze(1).expand(-1, N_t).clone() # [B, T]
792
+ timestep[:, :num_cond_latents] = 0
793
+
794
+ dtype = hidden_states.dtype
795
+ hidden_states = hidden_states.to(dtype)
796
+ timestep = timestep.to(dtype)
797
+ encoder_hidden_states = encoder_hidden_states.to(dtype)
798
+
799
+ hidden_states = self.x_embedder(hidden_states) # [B, N, C]
800
+
801
+ with amp.autocast(device_type=get_device_type(), dtype=torch.float32):
802
+ t = self.t_embedder(timestep.float().flatten(), dtype=torch.float32).reshape(B, N_t, -1) # [B, T, C_t]
803
+
804
+ encoder_hidden_states = self.y_embedder(encoder_hidden_states) # [B, 1, N_token, C]
805
+
806
+ if self.text_tokens_zero_pad and encoder_attention_mask is not None:
807
+ encoder_hidden_states = encoder_hidden_states * encoder_attention_mask[:, None, :, None]
808
+ encoder_attention_mask = (encoder_attention_mask * 0 + 1).to(encoder_attention_mask.dtype)  # all tokens count as valid after zero-padding
809
+
810
+ if encoder_attention_mask is not None:
811
+ encoder_attention_mask = encoder_attention_mask.squeeze(1).squeeze(1)
812
+ encoder_hidden_states = encoder_hidden_states.squeeze(1).masked_select(encoder_attention_mask.unsqueeze(-1) != 0).view(1, -1, hidden_states.shape[-1]) # [1, N_valid_tokens, C]
813
+ y_seqlens = encoder_attention_mask.sum(dim=1).tolist() # [B]
814
+ else:
815
+ y_seqlens = [encoder_hidden_states.shape[2]] * encoder_hidden_states.shape[0]
816
+ encoder_hidden_states = encoder_hidden_states.squeeze(1).view(1, -1, hidden_states.shape[-1])
817
+
818
+ # if self.cp_split_hw[0] * self.cp_split_hw[1] > 1:
819
+ # hidden_states = rearrange(hidden_states, "B (T H W) C -> B T H W C", T=N_t, H=N_h, W=N_w)
820
+ # hidden_states = context_parallel_util.split_cp_2d(hidden_states, seq_dim_hw=(2, 3), split_hw=self.cp_split_hw)
821
+ # hidden_states = rearrange(hidden_states, "B T H W C -> B (T H W) C")
822
+
823
+ # blocks
824
+ kv_cache_dict_ret = {}
825
+ for i, block in enumerate(self.blocks):
826
+ block_outputs = gradient_checkpoint_forward(
827
+ block,
828
+ use_gradient_checkpointing=use_gradient_checkpointing,
829
+ use_gradient_checkpointing_offload=use_gradient_checkpointing_offload,
830
+ x=hidden_states,
831
+ y=encoder_hidden_states,
832
+ t=t,
833
+ y_seqlen=y_seqlens,
834
+ latent_shape=(N_t, N_h, N_w),
835
+ num_cond_latents=num_cond_latents,
836
+ return_kv=return_kv,
837
+ kv_cache=kv_cache_dict.get(i, None),
838
+ skip_crs_attn=skip_crs_attn,
839
+ )
840
+
841
+ if return_kv:
842
+ hidden_states, kv_cache = block_outputs
843
+ if offload_kv_cache:
844
+ kv_cache_dict_ret[i] = (kv_cache[0].cpu(), kv_cache[1].cpu())
845
+ else:
846
+ kv_cache_dict_ret[i] = (kv_cache[0].contiguous(), kv_cache[1].contiguous())
847
+ else:
848
+ hidden_states = block_outputs
849
+
850
+ hidden_states = self.final_layer(hidden_states, t, (N_t, N_h, N_w)) # [B, N, C=T_p*H_p*W_p*C_out]
851
+
852
+ # if self.cp_split_hw[0] * self.cp_split_hw[1] > 1:
853
+ # hidden_states = context_parallel_util.gather_cp_2d(hidden_states, shape=(N_t, N_h, N_w), split_hw=self.cp_split_hw)
854
+
855
+ hidden_states = self.unpatchify(hidden_states, N_t, N_h, N_w) # [B, C_out, T, H, W]
856
+
857
+ # cast to float32 for better accuracy
858
+ hidden_states = hidden_states.to(torch.float32)
859
+
860
+ if return_kv:
861
+ return hidden_states, kv_cache_dict_ret
862
+ else:
863
+ return hidden_states
864
+
865
+
866
+ def unpatchify(self, x, N_t, N_h, N_w):
867
+ """
868
+ Args:
869
+ x (torch.Tensor): of shape [B, N, C]
870
+
871
+ Return:
872
+ x (torch.Tensor): of shape [B, C_out, T, H, W]
873
+ """
874
+ T_p, H_p, W_p = self.patch_size
875
+ x = rearrange(
876
+ x,
877
+ "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
878
+ N_t=N_t,
879
+ N_h=N_h,
880
+ N_w=N_w,
881
+ T_p=T_p,
882
+ H_p=H_p,
883
+ W_p=W_p,
884
+ C_out=self.out_channels,
885
+ )
886
+ return x
887
+
888
+ @staticmethod
889
+ def state_dict_converter():
890
+ return LongCatVideoTransformer3DModelDictConverter()
891
+
892
+
893
+ class LongCatVideoTransformer3DModelDictConverter:
894
+ def __init__(self):
895
+ pass
896
+
897
+ def from_diffusers(self, state_dict):
898
+ return state_dict
899
+
900
+ def from_civitai(self, state_dict):
901
+ return state_dict
902
+
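
The `unpatchify` rearrange above can be sanity-checked in isolation (a minimal sketch, assuming `patch_size=(1, 2, 2)` and `out_channels=16`):

```python
import torch
from einops import rearrange

B, N_t, N_h, N_w, C_out = 1, 4, 6, 8, 16
x = torch.randn(B, N_t * N_h * N_w, 1 * 2 * 2 * C_out)
y = rearrange(
    x,
    "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
    N_t=N_t, N_h=N_h, N_w=N_w, T_p=1, H_p=2, W_p=2, C_out=C_out,
)
print(y.shape)  # torch.Size([1, 16, 4, 12, 16])
```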
diffsynth/models/ltx2_audio_vae.py ADDED
@@ -0,0 +1,1872 @@
1
+ from typing import Set, Tuple, Optional, List
2
+ from enum import Enum
3
+ import math
4
+ import einops
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torchaudio
9
+ from .ltx2_common import VideoLatentShape, AudioLatentShape, Patchifier, NormType, build_normalization_layer
10
+
11
+
12
+ class AudioProcessor(nn.Module):
13
+ """Converts audio waveforms to log-mel spectrograms with optional resampling."""
14
+
15
+ def __init__(
16
+ self,
17
+ sample_rate: int = 16000,
18
+ mel_bins: int = 64,
19
+ mel_hop_length: int = 160,
20
+ n_fft: int = 1024,
21
+ ) -> None:
22
+ super().__init__()
23
+ self.sample_rate = sample_rate
24
+ self.mel_transform = torchaudio.transforms.MelSpectrogram(
25
+ sample_rate=sample_rate,
26
+ n_fft=n_fft,
27
+ win_length=n_fft,
28
+ hop_length=mel_hop_length,
29
+ f_min=0.0,
30
+ f_max=sample_rate / 2.0,
31
+ n_mels=mel_bins,
32
+ window_fn=torch.hann_window,
33
+ center=True,
34
+ pad_mode="reflect",
35
+ power=1.0,
36
+ mel_scale="slaney",
37
+ norm="slaney",
38
+ )
39
+
40
+ def resample_waveform(
41
+ self,
42
+ waveform: torch.Tensor,
43
+ source_rate: int,
44
+ target_rate: int,
45
+ ) -> torch.Tensor:
46
+ """Resample waveform to target sample rate if needed."""
47
+ if source_rate == target_rate:
48
+ return waveform
49
+ resampled = torchaudio.functional.resample(waveform, source_rate, target_rate)
50
+ return resampled.to(device=waveform.device, dtype=waveform.dtype)
51
+
52
+ def waveform_to_mel(
53
+ self,
54
+ waveform: torch.Tensor,
55
+ waveform_sample_rate: int,
56
+ ) -> torch.Tensor:
57
+ """Convert waveform to log-mel spectrogram [batch, channels, time, n_mels]."""
58
+ waveform = self.resample_waveform(waveform, waveform_sample_rate, self.sample_rate)
59
+
60
+ mel = self.mel_transform(waveform)
61
+ mel = torch.log(torch.clamp(mel, min=1e-5))
62
+
63
+ mel = mel.to(device=waveform.device, dtype=waveform.dtype)
64
+ return mel.permute(0, 1, 3, 2).contiguous()
65
+
66
+
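
With the defaults above (`sample_rate=16000`, `mel_hop_length=160`, `center=True`), one second of audio yields 101 mel frames; the frame-count arithmetic (a minimal sketch):

```python
sample_rate, hop_length, seconds = 16000, 160, 1
num_frames = seconds * sample_rate // hop_length + 1  # centered STFT adds one frame
assert num_frames == 101
```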
67
+ class AudioPatchifier(Patchifier):
68
+ def __init__(
69
+ self,
70
+ patch_size: int,
71
+ sample_rate: int = 16000,
72
+ hop_length: int = 160,
73
+ audio_latent_downsample_factor: int = 4,
74
+ is_causal: bool = True,
75
+ shift: int = 0,
76
+ ):
77
+ """
78
+ Patchifier tailored for spectrogram/audio latents.
79
+ Args:
80
+ patch_size: Number of mel bins combined into a single patch. This
81
+ controls the resolution along the frequency axis.
82
+ sample_rate: Original waveform sampling rate. Used to map latent
83
+ indices back to seconds so downstream consumers can align audio
84
+ and video cues.
85
+ hop_length: Window hop length used for the spectrogram. Determines
86
+ how many real-time samples separate two consecutive latent frames.
87
+ audio_latent_downsample_factor: Ratio between spectrogram frames and
88
+ latent frames; compensates for additional downsampling inside the
89
+ VAE encoder.
90
+ is_causal: When True, timing is shifted to account for causal
91
+ receptive fields so timestamps do not peek into the future.
92
+ shift: Integer offset applied to the latent indices. Enables
93
+ constructing overlapping windows from the same latent sequence.
94
+ """
95
+ self.hop_length = hop_length
96
+ self.sample_rate = sample_rate
97
+ self.audio_latent_downsample_factor = audio_latent_downsample_factor
98
+ self.is_causal = is_causal
99
+ self.shift = shift
100
+ self._patch_size = (1, patch_size, patch_size)
101
+
102
+ @property
103
+ def patch_size(self) -> Tuple[int, int, int]:
104
+ return self._patch_size
105
+
106
+ def get_token_count(self, tgt_shape: AudioLatentShape) -> int:
107
+ return tgt_shape.frames
108
+
109
+ def _get_audio_latent_time_in_sec(
110
+ self,
111
+ start_latent: int,
112
+ end_latent: int,
113
+ dtype: torch.dtype,
114
+ device: Optional[torch.device] = None,
115
+ ) -> torch.Tensor:
116
+ """
117
+ Converts latent indices into real-time seconds while honoring causal
118
+ offsets and the configured hop length.
119
+ Args:
120
+ start_latent: Inclusive start index inside the latent sequence. This
121
+ sets the first timestamp returned.
122
+ end_latent: Exclusive end index. Determines how many timestamps get
123
+ generated.
124
+ dtype: Floating-point dtype used for the returned tensor, allowing
125
+ callers to control precision.
126
+ device: Target device for the timestamp tensor. When omitted the
127
+ computation occurs on CPU to avoid surprising GPU allocations.
128
+ """
129
+ if device is None:
130
+ device = torch.device("cpu")
131
+
132
+ audio_latent_frame = torch.arange(start_latent, end_latent, dtype=dtype, device=device)
133
+
134
+ audio_mel_frame = audio_latent_frame * self.audio_latent_downsample_factor
135
+
136
+ if self.is_causal:
137
+ # Frame offset for causal alignment.
138
+ # The "+1" ensures the timestamp corresponds to the first sample that is fully available.
139
+ causal_offset = 1
140
+ audio_mel_frame = (audio_mel_frame + causal_offset - self.audio_latent_downsample_factor).clip(min=0)
141
+
142
+ return audio_mel_frame * self.hop_length / self.sample_rate
143
+
144
+ def _compute_audio_timings(
145
+ self,
146
+ batch_size: int,
147
+ num_steps: int,
148
+ device: Optional[torch.device] = None,
149
+ ) -> torch.Tensor:
150
+ """
151
+ Builds a `(B, 1, T, 2)` tensor containing timestamps for each latent frame.
152
+ This helper method underpins `get_patch_grid_bounds` for the audio patchifier.
153
+ Args:
154
+ batch_size: Number of sequences to broadcast the timings over.
155
+ num_steps: Number of latent frames (time steps) to convert into timestamps.
156
+ device: Device on which the resulting tensor should reside.
157
+ """
158
+ resolved_device = device
159
+ if resolved_device is None:
160
+ resolved_device = torch.device("cpu")
161
+
162
+ start_timings = self._get_audio_latent_time_in_sec(
163
+ self.shift,
164
+ num_steps + self.shift,
165
+ torch.float32,
166
+ resolved_device,
167
+ )
168
+ start_timings = start_timings.unsqueeze(0).expand(batch_size, -1).unsqueeze(1)
169
+
170
+ end_timings = self._get_audio_latent_time_in_sec(
171
+ self.shift + 1,
172
+ num_steps + self.shift + 1,
173
+ torch.float32,
174
+ resolved_device,
175
+ )
176
+ end_timings = end_timings.unsqueeze(0).expand(batch_size, -1).unsqueeze(1)
177
+
178
+ return torch.stack([start_timings, end_timings], dim=-1)
179
+
180
+ def patchify(
181
+ self,
182
+ audio_latents: torch.Tensor,
183
+ ) -> torch.Tensor:
184
+ """
185
+ Flattens the audio latent tensor along time. Use `get_patch_grid_bounds`
186
+ to derive timestamps for each latent frame based on the configured hop
187
+ length and downsampling.
188
+ Args:
189
+ audio_latents: Latent tensor to patchify.
190
+ Returns:
191
+ Flattened patch tokens tensor. Use `get_patch_grid_bounds` to compute the
192
+ corresponding timing metadata when needed.
193
+ """
194
+ audio_latents = einops.rearrange(
195
+ audio_latents,
196
+ "b c t f -> b t (c f)",
197
+ )
198
+
199
+ return audio_latents
200
+
201
+ def unpatchify(
202
+ self,
203
+ audio_latents: torch.Tensor,
204
+ output_shape: AudioLatentShape,
205
+ ) -> torch.Tensor:
206
+ """
207
+ Restores the `(B, C, T, F)` spectrogram tensor from flattened patches.
208
+ Use `get_patch_grid_bounds` to recompute the timestamps that describe each
209
+ frame's position in real time.
210
+ Args:
211
+ audio_latents: Latent tensor to unpatchify.
212
+ output_shape: Shape of the unpatched output tensor.
213
+ Returns:
214
+ Unpatched latent tensor. Use `get_patch_grid_bounds` to compute the timing
215
+ metadata associated with the restored latents.
216
+ """
217
+ # audio_latents shape: (batch, time, freq * channels)
218
+ audio_latents = einops.rearrange(
219
+ audio_latents,
220
+ "b t (c f) -> b c t f",
221
+ c=output_shape.channels,
222
+ f=output_shape.mel_bins,
223
+ )
224
+
225
+ return audio_latents
226
+
227
+ def unpatchify_audio(
228
+ self,
229
+ audio_latents: torch.Tensor,
230
+ channels: int,
231
+ mel_bins: int
232
+ ) -> torch.Tensor:
233
+ audio_latents = einops.rearrange(
234
+ audio_latents,
235
+ "b t (c f) -> b c t f",
236
+ c=channels,
237
+ f=mel_bins,
238
+ )
239
+ return audio_latents
240
+
241
+ def get_patch_grid_bounds(
242
+ self,
243
+ output_shape: AudioLatentShape | VideoLatentShape,
244
+ device: Optional[torch.device] = None,
245
+ ) -> torch.Tensor:
246
+ """
247
+ Return the temporal bounds `[inclusive start, exclusive end)` for every
248
+ patch emitted by `patchify`. For audio this corresponds to timestamps in
249
+ seconds aligned with the original spectrogram grid.
250
+ The returned tensor has shape `[batch_size, 1, time_steps, 2]`, where:
251
+ - axis 1 (size 1) represents the temporal dimension
252
+ - axis 3 (size 2) stores the `[start, end)` timestamps per patch
253
+ Args:
254
+ output_shape: Audio grid specification describing the number of time steps.
255
+ device: Target device for the returned tensor.
256
+ """
257
+ if not isinstance(output_shape, AudioLatentShape):
258
+ raise ValueError("AudioPatchifier expects AudioLatentShape when computing coordinates")
259
+
260
+ return self._compute_audio_timings(output_shape.batch, output_shape.frames, device)
261
+
262
+
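
Tracing `_get_audio_latent_time_in_sec` by hand for the defaults (`hop_length=160`, `sample_rate=16000`, `audio_latent_downsample_factor=4`, `is_causal=True`); a minimal sketch of the same arithmetic:

```python
latent_idx = 10
mel_frame = latent_idx * 4             # undo the VAE's temporal downsampling
mel_frame = max(mel_frame + 1 - 4, 0)  # causal offset, clipped at zero
seconds = mel_frame * 160 / 16000
assert abs(seconds - 0.37) < 1e-9      # 37 mel frames at 10 ms per frame
```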
263
+ class AttentionType(Enum):
264
+ """Enum for specifying the attention mechanism type."""
265
+
266
+ VANILLA = "vanilla"
267
+ LINEAR = "linear"
268
+ NONE = "none"
269
+
270
+
271
+ class AttnBlock(torch.nn.Module):
272
+ def __init__(
273
+ self,
274
+ in_channels: int,
275
+ norm_type: NormType = NormType.GROUP,
276
+ ) -> None:
277
+ super().__init__()
278
+ self.in_channels = in_channels
279
+
280
+ self.norm = build_normalization_layer(in_channels, normtype=norm_type)
281
+ self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
282
+ self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
283
+ self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
284
+ self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
285
+
286
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
287
+ h_ = x
288
+ h_ = self.norm(h_)
289
+ q = self.q(h_)
290
+ k = self.k(h_)
291
+ v = self.v(h_)
292
+
293
+ # compute attention
294
+ b, c, h, w = q.shape
295
+ q = q.reshape(b, c, h * w).contiguous()
296
+ q = q.permute(0, 2, 1).contiguous() # b,hw,c
297
+ k = k.reshape(b, c, h * w).contiguous() # b,c,hw
298
+ w_ = torch.bmm(q, k).contiguous() # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
299
+ w_ = w_ * (int(c) ** (-0.5))
300
+ w_ = torch.nn.functional.softmax(w_, dim=2)
301
+
302
+ # attend to values
303
+ v = v.reshape(b, c, h * w).contiguous()
304
+ w_ = w_.permute(0, 2, 1).contiguous() # b,hw,hw (first hw of k, second of q)
305
+ h_ = torch.bmm(v, w_).contiguous() # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
306
+ h_ = h_.reshape(b, c, h, w).contiguous()
307
+
308
+ h_ = self.proj_out(h_)
309
+
310
+ return x + h_
311
+
312
+
313
+ def make_attn(
314
+ in_channels: int,
315
+ attn_type: AttentionType = AttentionType.VANILLA,
316
+ norm_type: NormType = NormType.GROUP,
317
+ ) -> torch.nn.Module:
318
+ match attn_type:
319
+ case AttentionType.VANILLA:
320
+ return AttnBlock(in_channels, norm_type=norm_type)
321
+ case AttentionType.NONE:
322
+ return torch.nn.Identity()
323
+ case AttentionType.LINEAR:
324
+ raise NotImplementedError(f"Attention type {attn_type.value} is not supported yet.")
325
+ case _:
326
+ raise ValueError(f"Unknown attention type: {attn_type}")
327
+
328
+
329
+ class CausalityAxis(Enum):
330
+ """Enum for specifying the causality axis in causal convolutions."""
331
+
332
+ NONE = None
333
+ WIDTH = "width"
334
+ HEIGHT = "height"
335
+ WIDTH_COMPATIBILITY = "width-compatibility"
336
+
337
+
338
+ class CausalConv2d(torch.nn.Module):
339
+ """
340
+ A causal 2D convolution.
341
+ This layer ensures that the output at time `t` only depends on inputs
342
+ at time `t` and earlier. It achieves this by applying asymmetric padding
343
+ to the time dimension (width) before the convolution.
344
+ """
345
+
346
+ def __init__(
347
+ self,
348
+ in_channels: int,
349
+ out_channels: int,
350
+ kernel_size: int | tuple[int, int],
351
+ stride: int = 1,
352
+ dilation: int | tuple[int, int] = 1,
353
+ groups: int = 1,
354
+ bias: bool = True,
355
+ causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
356
+ ) -> None:
357
+ super().__init__()
358
+
359
+ self.causality_axis = causality_axis
360
+
361
+ # Ensure kernel_size and dilation are tuples
362
+ kernel_size = torch.nn.modules.utils._pair(kernel_size)
363
+ dilation = torch.nn.modules.utils._pair(dilation)
364
+
365
+ # Calculate padding dimensions
366
+ pad_h = (kernel_size[0] - 1) * dilation[0]
367
+ pad_w = (kernel_size[1] - 1) * dilation[1]
368
+
369
+ # The padding tuple for F.pad is (pad_left, pad_right, pad_top, pad_bottom)
370
+ match self.causality_axis:
371
+ case CausalityAxis.NONE:
372
+ self.padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
373
+ case CausalityAxis.WIDTH | CausalityAxis.WIDTH_COMPATIBILITY:
374
+ self.padding = (pad_w, 0, pad_h // 2, pad_h - pad_h // 2)
375
+ case CausalityAxis.HEIGHT:
376
+ self.padding = (pad_w // 2, pad_w - pad_w // 2, pad_h, 0)
377
+ case _:
378
+ raise ValueError(f"Invalid causality_axis: {causality_axis}")
379
+
380
+ # The internal convolution layer uses no padding, as we handle it manually
381
+ self.conv = torch.nn.Conv2d(
382
+ in_channels,
383
+ out_channels,
384
+ kernel_size,
385
+ stride=stride,
386
+ padding=0,
387
+ dilation=dilation,
388
+ groups=groups,
389
+ bias=bias,
390
+ )
391
+
392
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
393
+ # Apply causal padding before convolution
394
+ x = F.pad(x, self.padding)
395
+ return self.conv(x)
396
+
397
+
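
With `causality_axis=HEIGHT` and a 3x3 kernel, all height padding is applied before the input, so output row `t` only sees input rows `<= t`; a shape check (a minimal sketch, assuming `CausalConv2d` and `CausalityAxis` from this file are in scope):

```python
import torch

conv = CausalConv2d(1, 1, kernel_size=3, causality_axis=CausalityAxis.HEIGHT)
print(conv.padding)   # (1, 1, 2, 0): symmetric on width, fully causal on height
x = torch.randn(1, 1, 8, 8)
print(conv(x).shape)  # torch.Size([1, 1, 8, 8]) -- causal padding preserves size
```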
398
+ def make_conv2d(
399
+ in_channels: int,
400
+ out_channels: int,
401
+ kernel_size: int | tuple[int, int],
402
+ stride: int = 1,
403
+ padding: tuple[int, int, int, int] | None = None,
404
+ dilation: int = 1,
405
+ groups: int = 1,
406
+ bias: bool = True,
407
+ causality_axis: CausalityAxis | None = None,
408
+ ) -> torch.nn.Module:
409
+ """
410
+ Create a 2D convolution layer that can be either causal or non-causal.
411
+ Args:
412
+ in_channels: Number of input channels
413
+ out_channels: Number of output channels
414
+ kernel_size: Size of the convolution kernel
415
+ stride: Convolution stride
416
+ padding: Padding (if None, computed from kernel_size; ignored when causality_axis is set)
417
+ dilation: Dilation rate
418
+ groups: Number of groups for grouped convolution
419
+ bias: Whether to use bias
420
+ causality_axis: Dimension along which to apply causality.
421
+ Returns:
422
+ Either a regular Conv2d or CausalConv2d layer
423
+ """
424
+ if causality_axis is not None:
425
+ # For causal convolution, padding is handled internally by CausalConv2d
426
+ return CausalConv2d(in_channels, out_channels, kernel_size, stride, dilation, groups, bias, causality_axis)
427
+ else:
428
+ # For non-causal convolution, use symmetric padding if not specified
429
+ if padding is None:
430
+ padding = kernel_size // 2 if isinstance(kernel_size, int) else tuple(k // 2 for k in kernel_size)
431
+
432
+ return torch.nn.Conv2d(
433
+ in_channels,
434
+ out_channels,
435
+ kernel_size,
436
+ stride,
437
+ padding,
438
+ dilation,
439
+ groups,
440
+ bias,
441
+ )
442
+
443
+
444
+
445
+ LRELU_SLOPE = 0.1
446
+
447
+
448
+ class ResBlock1(torch.nn.Module):
449
+ def __init__(self, channels: int, kernel_size: int = 3, dilation: Tuple[int, int, int] = (1, 3, 5)):
450
+ super().__init__()
451
+ self.convs1 = torch.nn.ModuleList(
452
+ [
453
+ torch.nn.Conv1d(
454
+ channels,
455
+ channels,
456
+ kernel_size,
457
+ 1,
458
+ dilation=dilation[0],
459
+ padding="same",
460
+ ),
461
+ torch.nn.Conv1d(
462
+ channels,
463
+ channels,
464
+ kernel_size,
465
+ 1,
466
+ dilation=dilation[1],
467
+ padding="same",
468
+ ),
469
+ torch.nn.Conv1d(
470
+ channels,
471
+ channels,
472
+ kernel_size,
473
+ 1,
474
+ dilation=dilation[2],
475
+ padding="same",
476
+ ),
477
+ ]
478
+ )
479
+
480
+ self.convs2 = torch.nn.ModuleList(
481
+ [
482
+ torch.nn.Conv1d(
483
+ channels,
484
+ channels,
485
+ kernel_size,
486
+ 1,
487
+ dilation=1,
488
+ padding="same",
489
+ ),
490
+ torch.nn.Conv1d(
491
+ channels,
492
+ channels,
493
+ kernel_size,
494
+ 1,
495
+ dilation=1,
496
+ padding="same",
497
+ ),
498
+ torch.nn.Conv1d(
499
+ channels,
500
+ channels,
501
+ kernel_size,
502
+ 1,
503
+ dilation=1,
504
+ padding="same",
505
+ ),
506
+ ]
507
+ )
508
+
509
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
510
+ for conv1, conv2 in zip(self.convs1, self.convs2, strict=True):
511
+ xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
512
+ xt = conv1(xt)
513
+ xt = torch.nn.functional.leaky_relu(xt, LRELU_SLOPE)
514
+ xt = conv2(xt)
515
+ x = xt + x
516
+ return x
517
+
518
+
519
+ class ResBlock2(torch.nn.Module):
520
+ def __init__(self, channels: int, kernel_size: int = 3, dilation: Tuple[int, int] = (1, 3)):
521
+ super().__init__()
522
+ self.convs = torch.nn.ModuleList(
523
+ [
524
+ torch.nn.Conv1d(
525
+ channels,
526
+ channels,
527
+ kernel_size,
528
+ 1,
529
+ dilation=dilation[0],
530
+ padding="same",
531
+ ),
532
+ torch.nn.Conv1d(
533
+ channels,
534
+ channels,
535
+ kernel_size,
536
+ 1,
537
+ dilation=dilation[1],
538
+ padding="same",
539
+ ),
540
+ ]
541
+ )
542
+
543
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
544
+ for conv in self.convs:
545
+ xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
546
+ xt = conv(xt)
547
+ x = xt + x
548
+ return x
549
+
550
+
551
+ class ResnetBlock(torch.nn.Module):
552
+ def __init__(
553
+ self,
554
+ *,
555
+ in_channels: int,
556
+ out_channels: int | None = None,
557
+ conv_shortcut: bool = False,
558
+ dropout: float = 0.0,
559
+ temb_channels: int = 512,
560
+ norm_type: NormType = NormType.GROUP,
561
+ causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
562
+ ) -> None:
563
+ super().__init__()
564
+ self.causality_axis = causality_axis
565
+
566
+ if self.causality_axis != CausalityAxis.NONE and norm_type == NormType.GROUP:
567
+ raise ValueError("Causal ResnetBlock with GroupNorm is not supported.")
568
+ self.in_channels = in_channels
569
+ out_channels = in_channels if out_channels is None else out_channels
570
+ self.out_channels = out_channels
571
+ self.use_conv_shortcut = conv_shortcut
572
+
573
+ self.norm1 = build_normalization_layer(in_channels, normtype=norm_type)
574
+ self.non_linearity = torch.nn.SiLU()
575
+ self.conv1 = make_conv2d(in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
576
+ if temb_channels > 0:
577
+ self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
578
+ self.norm2 = build_normalization_layer(out_channels, normtype=norm_type)
579
+ self.dropout = torch.nn.Dropout(dropout)
580
+ self.conv2 = make_conv2d(out_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
581
+ if self.in_channels != self.out_channels:
582
+ if self.use_conv_shortcut:
583
+ self.conv_shortcut = make_conv2d(
584
+ in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis
585
+ )
586
+ else:
587
+ self.nin_shortcut = make_conv2d(
588
+ in_channels, out_channels, kernel_size=1, stride=1, causality_axis=causality_axis
589
+ )
590
+
591
+ def forward(
592
+ self,
593
+ x: torch.Tensor,
594
+ temb: torch.Tensor | None = None,
595
+ ) -> torch.Tensor:
596
+ h = x
597
+ h = self.norm1(h)
598
+ h = self.non_linearity(h)
599
+ h = self.conv1(h)
600
+
601
+ if temb is not None:
602
+ h = h + self.temb_proj(self.non_linearity(temb))[:, :, None, None]
603
+
604
+ h = self.norm2(h)
605
+ h = self.non_linearity(h)
606
+ h = self.dropout(h)
607
+ h = self.conv2(h)
608
+
609
+ if self.in_channels != self.out_channels:
610
+ x = self.conv_shortcut(x) if self.use_conv_shortcut else self.nin_shortcut(x)
611
+
612
+ return x + h
613
+
614
+
615
+ class Downsample(torch.nn.Module):
616
+ """
617
+ A downsampling layer that can use either a strided convolution
618
+ or average pooling. Supports standard and causal padding for the
619
+ convolutional mode.
620
+ """
621
+
622
+ def __init__(
623
+ self,
624
+ in_channels: int,
625
+ with_conv: bool,
626
+ causality_axis: CausalityAxis = CausalityAxis.WIDTH,
627
+ ) -> None:
628
+ super().__init__()
629
+ self.with_conv = with_conv
630
+ self.causality_axis = causality_axis
631
+
632
+ if self.causality_axis != CausalityAxis.NONE and not self.with_conv:
633
+ raise ValueError("causality is only supported when `with_conv=True`.")
634
+
635
+ if self.with_conv:
636
+ # Stride-2 downsampling conv. torch.nn.Conv2d has no asymmetric
637
+ # padding, so we pad manually in forward().
638
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
639
+
640
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
641
+ if self.with_conv:
642
+ # Padding tuple is in the order: (left, right, top, bottom).
643
+ match self.causality_axis:
644
+ case CausalityAxis.NONE:
645
+ pad = (0, 1, 0, 1)
646
+ case CausalityAxis.WIDTH:
647
+ pad = (2, 0, 0, 1)
648
+ case CausalityAxis.HEIGHT:
649
+ pad = (0, 1, 2, 0)
650
+ case CausalityAxis.WIDTH_COMPATIBILITY:
651
+ pad = (1, 0, 0, 1)
652
+ case _:
653
+ raise ValueError(f"Invalid causality_axis: {self.causality_axis}")
654
+
655
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
656
+ x = self.conv(x)
657
+ else:
658
+ # This branch is only taken if with_conv=False, which implies causality_axis is NONE.
659
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
660
+
661
+ return x
662
+
663
+
664
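A quick length check of the `CausalityAxis.WIDTH` branch of `Downsample` above, as a standalone sketch mirroring its padding tuple: a causal width of 1 + 2n maps to 1 + n after the stride-2 convolution.

```python
import torch
import torch.nn.functional as F

conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=0)
x = torch.randn(1, 1, 8, 9)            # width 9 = 1 + 2*4
y = conv(F.pad(x, (2, 0, 0, 1)))       # (left, right, top, bottom)
assert y.shape[-1] == 5                # 1 + 4
```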
+ def build_downsampling_path( # noqa: PLR0913
665
+ *,
666
+ ch: int,
667
+ ch_mult: Tuple[int, ...],
668
+ num_resolutions: int,
669
+ num_res_blocks: int,
670
+ resolution: int,
671
+ temb_channels: int,
672
+ dropout: float,
673
+ norm_type: NormType,
674
+ causality_axis: CausalityAxis,
675
+ attn_type: AttentionType,
676
+ attn_resolutions: Set[int],
677
+ resamp_with_conv: bool,
678
+ ) -> tuple[torch.nn.ModuleList, int]:
679
+ """Build the downsampling path with residual blocks, attention, and downsampling layers."""
680
+ down_modules = torch.nn.ModuleList()
681
+ curr_res = resolution
682
+ in_ch_mult = (1, *tuple(ch_mult))
683
+ block_in = ch
684
+
685
+ for i_level in range(num_resolutions):
686
+ block = torch.nn.ModuleList()
687
+ attn = torch.nn.ModuleList()
688
+ block_in = ch * in_ch_mult[i_level]
689
+ block_out = ch * ch_mult[i_level]
690
+
691
+ for _ in range(num_res_blocks):
692
+ block.append(
693
+ ResnetBlock(
694
+ in_channels=block_in,
695
+ out_channels=block_out,
696
+ temb_channels=temb_channels,
697
+ dropout=dropout,
698
+ norm_type=norm_type,
699
+ causality_axis=causality_axis,
700
+ )
701
+ )
702
+ block_in = block_out
703
+ if curr_res in attn_resolutions:
704
+ attn.append(make_attn(block_in, attn_type=attn_type, norm_type=norm_type))
705
+
706
+ down = torch.nn.Module()
707
+ down.block = block
708
+ down.attn = attn
709
+ if i_level != num_resolutions - 1:
710
+ down.downsample = Downsample(block_in, resamp_with_conv, causality_axis=causality_axis)
711
+ curr_res = curr_res // 2
712
+ down_modules.append(down)
713
+
714
+ return down_modules, block_in
715
+
716
+
717
+ class Upsample(torch.nn.Module):
718
+ def __init__(
719
+ self,
720
+ in_channels: int,
721
+ with_conv: bool,
722
+ causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
723
+ ) -> None:
724
+ super().__init__()
725
+ self.with_conv = with_conv
726
+ self.causality_axis = causality_axis
727
+ if self.with_conv:
728
+ self.conv = make_conv2d(in_channels, in_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
729
+
730
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
731
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
732
+ if self.with_conv:
733
+ x = self.conv(x)
734
+ # Drop FIRST element in the causal axis to undo encoder's padding, while keeping the length 1 + 2 * n.
735
+ # For example, if the input is [0, 1, 2], after interpolation, the output is [0, 0, 1, 1, 2, 2].
736
+ # The causal convolution will pad the first element as [-, -, 0, 0, 1, 1, 2, 2],
737
+ # So the output elements rely on the following windows:
738
+ # 0: [-,-,0]
739
+ # 1: [-,0,0]
740
+ # 2: [0,0,1]
741
+ # 3: [0,1,1]
742
+ # 4: [1,1,2]
743
+ # 5: [1,2,2]
744
+ # Notice that the first and second elements in the output rely only on the first element in the input,
745
+ # while all other elements rely on two elements in the input.
746
+ # So we can drop the first element to undo the padding (rather than the last element).
747
+ # This is a no-op for non-causal convolutions.
748
+ match self.causality_axis:
749
+ case CausalityAxis.NONE:
750
+ pass # x remains unchanged
751
+ case CausalityAxis.HEIGHT:
752
+ x = x[:, :, 1:, :]
753
+ case CausalityAxis.WIDTH:
754
+ x = x[:, :, :, 1:]
755
+ case CausalityAxis.WIDTH_COMPATIBILITY:
756
+ pass # x remains unchanged
757
+ case _:
758
+ raise ValueError(f"Invalid causality_axis: {self.causality_axis}")
759
+
760
+ return x
761
+
762
+
763
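The drop-first bookkeeping in `Upsample` can be verified with a tiny standalone sketch: a causal length of 1 + n frames becomes 1 + 2n after nearest 2x interpolation plus dropping the first element.

```python
import torch
import torch.nn.functional as F

x = torch.arange(3.0).view(1, 1, 3, 1)                          # frames [0, 1, 2]
up = F.interpolate(x, scale_factor=(2.0, 1.0), mode="nearest")  # [0, 0, 1, 1, 2, 2]
up = up[:, :, 1:, :]                                            # drop first -> [0, 1, 1, 2, 2]
assert up.shape[2] == 2 * x.shape[2] - 1
```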
+ def build_upsampling_path( # noqa: PLR0913
764
+ *,
765
+ ch: int,
766
+ ch_mult: Tuple[int, ...],
767
+ num_resolutions: int,
768
+ num_res_blocks: int,
769
+ resolution: int,
770
+ temb_channels: int,
771
+ dropout: float,
772
+ norm_type: NormType,
773
+ causality_axis: CausalityAxis,
774
+ attn_type: AttentionType,
775
+ attn_resolutions: Set[int],
776
+ resamp_with_conv: bool,
777
+ initial_block_channels: int,
778
+ ) -> tuple[torch.nn.ModuleList, int]:
779
+ """Build the upsampling path with residual blocks, attention, and upsampling layers."""
780
+ up_modules = torch.nn.ModuleList()
781
+ block_in = initial_block_channels
782
+ curr_res = resolution // (2 ** (num_resolutions - 1))
783
+
784
+ for level in reversed(range(num_resolutions)):
785
+ stage = torch.nn.Module()
786
+ stage.block = torch.nn.ModuleList()
787
+ stage.attn = torch.nn.ModuleList()
788
+ block_out = ch * ch_mult[level]
789
+
790
+ for _ in range(num_res_blocks + 1):
791
+ stage.block.append(
792
+ ResnetBlock(
793
+ in_channels=block_in,
794
+ out_channels=block_out,
795
+ temb_channels=temb_channels,
796
+ dropout=dropout,
797
+ norm_type=norm_type,
798
+ causality_axis=causality_axis,
799
+ )
800
+ )
801
+ block_in = block_out
802
+ if curr_res in attn_resolutions:
803
+ stage.attn.append(make_attn(block_in, attn_type=attn_type, norm_type=norm_type))
804
+
805
+ if level != 0:
806
+ stage.upsample = Upsample(block_in, resamp_with_conv, causality_axis=causality_axis)
807
+ curr_res *= 2
808
+
809
+ up_modules.insert(0, stage)
810
+
811
+ return up_modules, block_in
812
+
813
+
814
+ class PerChannelStatistics(nn.Module):
815
+ """
816
+ Per-channel statistics for normalizing and denormalizing the latent representation.
817
+ These statistics are computed over the entire dataset and stored in the model's checkpoint under the AudioVAE state_dict.
818
+ """
819
+
820
+ def __init__(self, latent_channels: int = 128) -> None:
821
+ super().__init__()
822
+ self.register_buffer("std-of-means", torch.empty(latent_channels))
823
+ self.register_buffer("mean-of-means", torch.empty(latent_channels))
824
+
825
+ def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
826
+ return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
827
+
828
+ def normalize(self, x: torch.Tensor) -> torch.Tensor:
829
+ return (x - self.get_buffer("mean-of-means").to(x)) / self.get_buffer("std-of-means").to(x)
830
+
831
+
832
+ LATENT_DOWNSAMPLE_FACTOR = 4
833
+
834
+
835
+ def build_mid_block(
836
+ channels: int,
837
+ temb_channels: int,
838
+ dropout: float,
839
+ norm_type: NormType,
840
+ causality_axis: CausalityAxis,
841
+ attn_type: AttentionType,
842
+ add_attention: bool,
843
+ ) -> torch.nn.Module:
844
+ """Build the middle block with two ResNet blocks and optional attention."""
845
+ mid = torch.nn.Module()
846
+ mid.block_1 = ResnetBlock(
847
+ in_channels=channels,
848
+ out_channels=channels,
849
+ temb_channels=temb_channels,
850
+ dropout=dropout,
851
+ norm_type=norm_type,
852
+ causality_axis=causality_axis,
853
+ )
854
+ mid.attn_1 = make_attn(channels, attn_type=attn_type, norm_type=norm_type) if add_attention else torch.nn.Identity()
855
+ mid.block_2 = ResnetBlock(
856
+ in_channels=channels,
857
+ out_channels=channels,
858
+ temb_channels=temb_channels,
859
+ dropout=dropout,
860
+ norm_type=norm_type,
861
+ causality_axis=causality_axis,
862
+ )
863
+ return mid
864
+
865
+
866
+ def run_mid_block(mid: torch.nn.Module, features: torch.Tensor) -> torch.Tensor:
867
+ """Run features through the middle block."""
868
+ features = mid.block_1(features, temb=None)
869
+ features = mid.attn_1(features)
870
+ return mid.block_2(features, temb=None)
871
+
872
+
873
+ class LTX2AudioEncoder(torch.nn.Module):
874
+ """
875
+ Encoder that compresses audio spectrograms into latent representations.
876
+ The encoder uses a series of downsampling blocks with residual connections,
877
+ attention mechanisms, and configurable causal convolutions.
878
+ """
879
+
880
+ def __init__( # noqa: PLR0913
881
+ self,
882
+ *,
883
+ ch: int = 128,
884
+ ch_mult: Tuple[int, ...] = (1, 2, 4),
885
+ num_res_blocks: int = 2,
886
+ attn_resolutions: Set[int] = set(),
887
+ dropout: float = 0.0,
888
+ resamp_with_conv: bool = True,
889
+ in_channels: int = 2,
890
+ resolution: int = 256,
891
+ z_channels: int = 8,
892
+ double_z: bool = True,
893
+ attn_type: AttentionType = AttentionType.VANILLA,
894
+ mid_block_add_attention: bool = False,
895
+ norm_type: NormType = NormType.PIXEL,
896
+ causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
897
+ sample_rate: int = 16000,
898
+ mel_hop_length: int = 160,
899
+ n_fft: int = 1024,
900
+ is_causal: bool = True,
901
+ mel_bins: int = 64,
902
+ **_ignore_kwargs,
903
+ ) -> None:
904
+ """
905
+ Initialize the Encoder.
906
+ Args:
907
+ Arguments are configuration parameters, loaded from the audio VAE checkpoint config
908
+ (audio_vae.model.params.ddconfig):
909
+ ch: Base number of feature channels used in the first convolution layer.
910
+ ch_mult: Multiplicative factors for the number of channels at each resolution level.
911
+ num_res_blocks: Number of residual blocks to use at each resolution level.
912
+ attn_resolutions: Spatial resolutions (e.g., in time/frequency) at which to apply attention.
913
+ resolution: Input spatial resolution of the spectrogram (height, width).
914
+ z_channels: Number of channels in the latent representation.
915
+ norm_type: Normalization layer type to use within the network (e.g., group, batch).
916
+ causality_axis: Axis along which convolutions should be causal (e.g., time axis).
917
+ sample_rate: Audio sample rate in Hz for the input signals.
918
+ mel_hop_length: Hop length used when computing the mel spectrogram.
919
+ n_fft: FFT size used to compute the spectrogram.
920
+ mel_bins: Number of mel-frequency bins in the input spectrogram.
921
+ in_channels: Number of channels in the input spectrogram tensor.
922
+ double_z: If True, predict both mean and log-variance (doubling latent channels).
923
+ is_causal: If True, use causal convolutions suitable for streaming setups.
924
+ dropout: Dropout probability used in residual and mid blocks.
925
+ attn_type: Type of attention mechanism to use in attention blocks.
926
+ resamp_with_conv: If True, perform resolution changes using strided convolutions.
927
+ mid_block_add_attention: If True, add an attention block in the mid-level of the encoder.
928
+ """
929
+ super().__init__()
930
+
931
+ self.per_channel_statistics = PerChannelStatistics(latent_channels=ch)
932
+ self.sample_rate = sample_rate
933
+ self.mel_hop_length = mel_hop_length
934
+ self.n_fft = n_fft
935
+ self.is_causal = is_causal
936
+ self.mel_bins = mel_bins
937
+
938
+ self.patchifier = AudioPatchifier(
939
+ patch_size=1,
940
+ audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
941
+ sample_rate=sample_rate,
942
+ hop_length=mel_hop_length,
943
+ is_causal=is_causal,
944
+ )
945
+
946
+ self.ch = ch
947
+ self.temb_ch = 0
948
+ self.num_resolutions = len(ch_mult)
949
+ self.num_res_blocks = num_res_blocks
950
+ self.resolution = resolution
951
+ self.in_channels = in_channels
952
+ self.z_channels = z_channels
953
+ self.double_z = double_z
954
+ self.norm_type = norm_type
955
+ self.causality_axis = causality_axis
956
+ self.attn_type = attn_type
957
+
958
+ # downsampling
959
+ self.conv_in = make_conv2d(
960
+ in_channels,
961
+ self.ch,
962
+ kernel_size=3,
963
+ stride=1,
964
+ causality_axis=self.causality_axis,
965
+ )
966
+
967
+ self.non_linearity = torch.nn.SiLU()
968
+
969
+ self.down, block_in = build_downsampling_path(
970
+ ch=ch,
971
+ ch_mult=ch_mult,
972
+ num_resolutions=self.num_resolutions,
973
+ num_res_blocks=num_res_blocks,
974
+ resolution=resolution,
975
+ temb_channels=self.temb_ch,
976
+ dropout=dropout,
977
+ norm_type=self.norm_type,
978
+ causality_axis=self.causality_axis,
979
+ attn_type=self.attn_type,
980
+ attn_resolutions=attn_resolutions,
981
+ resamp_with_conv=resamp_with_conv,
982
+ )
983
+
984
+ self.mid = build_mid_block(
985
+ channels=block_in,
986
+ temb_channels=self.temb_ch,
987
+ dropout=dropout,
988
+ norm_type=self.norm_type,
989
+ causality_axis=self.causality_axis,
990
+ attn_type=self.attn_type,
991
+ add_attention=mid_block_add_attention,
992
+ )
993
+
994
+ self.norm_out = build_normalization_layer(block_in, normtype=self.norm_type)
995
+ self.conv_out = make_conv2d(
996
+ block_in,
997
+ 2 * z_channels if double_z else z_channels,
998
+ kernel_size=3,
999
+ stride=1,
1000
+ causality_axis=self.causality_axis,
1001
+ )
1002
+
1003
+ def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
1004
+ """
1005
+ Encode audio spectrogram into latent representations.
1006
+ Args:
1007
+ spectrogram: Input spectrogram of shape (batch, channels, time, frequency)
1008
+ Returns:
1009
+ Encoded latent representation of shape (batch, channels, frames, mel_bins)
1010
+ """
1011
+ h = self.conv_in(spectrogram)
1012
+ h = self._run_downsampling_path(h)
1013
+ h = run_mid_block(self.mid, h)
1014
+ h = self._finalize_output(h)
1015
+
1016
+ return self._normalize_latents(h)
1017
+
1018
+ def _run_downsampling_path(self, h: torch.Tensor) -> torch.Tensor:
1019
+ for level in range(self.num_resolutions):
1020
+ stage = self.down[level]
1021
+ for block_idx in range(self.num_res_blocks):
1022
+ h = stage.block[block_idx](h, temb=None)
1023
+ if stage.attn:
1024
+ h = stage.attn[block_idx](h)
1025
+
1026
+ if level != self.num_resolutions - 1:
1027
+ h = stage.downsample(h)
1028
+
1029
+ return h
1030
+
1031
+ def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
1032
+ h = self.norm_out(h)
1033
+ h = self.non_linearity(h)
1034
+ return self.conv_out(h)
1035
+
1036
+ def _normalize_latents(self, latent_output: torch.Tensor) -> torch.Tensor:
1037
+ """
1038
+ Normalize encoder latents using per-channel statistics.
1039
+ When the encoder is configured with ``double_z=True``, the final
1040
+ convolution produces twice the number of latent channels, typically
1041
+ interpreted as two concatenated tensors along the channel dimension
1042
+ (e.g., mean and variance or other auxiliary parameters).
1043
+ This method intentionally uses only the first half of the channels
1044
+ (the "mean" component) as input to the patchifier and normalization
1045
+ logic. The remaining channels are left unchanged by this method and
1046
+ are expected to be consumed elsewhere in the VAE pipeline.
1047
+ Note that ``torch.chunk(x, 2, dim=1)`` always halves the channel
1048
+ dimension, so this method assumes the encoder runs with ``double_z=True``.
1049
+ """
1050
+ means = torch.chunk(latent_output, 2, dim=1)[0]
1051
+ latent_shape = AudioLatentShape(
1052
+ batch=means.shape[0],
1053
+ channels=means.shape[1],
1054
+ frames=means.shape[2],
1055
+ mel_bins=means.shape[3],
1056
+ )
1057
+ latent_patched = self.patchifier.patchify(means)
1058
+ latent_normalized = self.per_channel_statistics.normalize(latent_patched)
1059
+ return self.patchifier.unpatchify(latent_normalized, latent_shape)
1060
+
1061
+
1062
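A toy illustration of the `double_z` chunking performed in `_normalize_latents` above (shapes here are invented for the example): the final conv emits `2 * z_channels`, and only the first half, the mean component, is patchified and normalized.

```python
import torch

z_channels = 8
latent = torch.randn(2, 2 * z_channels, 10, 16)  # (batch, 2*z, frames, mel_bins)
means = torch.chunk(latent, 2, dim=1)[0]
assert means.shape == (2, z_channels, 10, 16)
```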
+ class LTX2AudioDecoder(torch.nn.Module):
1063
+ """
1064
+ Symmetric decoder that reconstructs audio spectrograms from latent features.
1065
+ The decoder mirrors the encoder structure with configurable channel multipliers,
1066
+ attention resolutions, and causal convolutions.
1067
+ """
1068
+
1069
+ def __init__( # noqa: PLR0913
1070
+ self,
1071
+ *,
1072
+ ch: int = 128,
1073
+ out_ch: int = 2,
1074
+ ch_mult: Tuple[int, ...] = (1, 2, 4),
1075
+ num_res_blocks: int = 2,
1076
+ attn_resolutions: Set[int] = set(),
1077
+ resolution: int = 256,
1078
+ z_channels: int = 8,
1079
+ norm_type: NormType = NormType.PIXEL,
1080
+ causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
1081
+ dropout: float = 0.0,
1082
+ mid_block_add_attention: bool = False,
1083
+ sample_rate: int = 16000,
1084
+ mel_hop_length: int = 160,
1085
+ is_causal: bool = True,
1086
+ mel_bins: int | None = 64,
1087
+ ) -> None:
1088
+ """
1089
+ Initialize the Decoder.
1090
+ Args:
1091
+ Arguments are configuration parameters, loaded from the audio VAE checkpoint config
1092
+ (audio_vae.model.params.ddconfig):
1093
+ - ch, out_ch, ch_mult, num_res_blocks, attn_resolutions
1094
+ - resolution, z_channels
1095
+ - norm_type, causality_axis
1096
+ """
1097
+ super().__init__()
1098
+
1099
+ # Internal behavioural defaults that are not driven by the checkpoint.
1100
+ resamp_with_conv = True
1101
+ attn_type = AttentionType.VANILLA
1102
+
1103
+ # Per-channel statistics for denormalizing latents
1104
+ self.per_channel_statistics = PerChannelStatistics(latent_channels=ch)
1105
+ self.sample_rate = sample_rate
1106
+ self.mel_hop_length = mel_hop_length
1107
+ self.is_causal = is_causal
1108
+ self.mel_bins = mel_bins
1109
+ self.patchifier = AudioPatchifier(
1110
+ patch_size=1,
1111
+ audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
1112
+ sample_rate=sample_rate,
1113
+ hop_length=mel_hop_length,
1114
+ is_causal=is_causal,
1115
+ )
1116
+
1117
+ self.ch = ch
1118
+ self.temb_ch = 0
1119
+ self.num_resolutions = len(ch_mult)
1120
+ self.num_res_blocks = num_res_blocks
1121
+ self.resolution = resolution
1122
+ self.out_ch = out_ch
1123
+ self.give_pre_end = False
1124
+ self.tanh_out = False
1125
+ self.norm_type = norm_type
1126
+ self.z_channels = z_channels
1127
+ self.channel_multipliers = ch_mult
1128
+ self.attn_resolutions = attn_resolutions
1129
+ self.causality_axis = causality_axis
1130
+ self.attn_type = attn_type
1131
+
1132
+ base_block_channels = ch * self.channel_multipliers[-1]
1133
+ base_resolution = resolution // (2 ** (self.num_resolutions - 1))
1134
+ self.z_shape = (1, z_channels, base_resolution, base_resolution)
1135
+
1136
+ self.conv_in = make_conv2d(
1137
+ z_channels, base_block_channels, kernel_size=3, stride=1, causality_axis=self.causality_axis
1138
+ )
1139
+ self.non_linearity = torch.nn.SiLU()
1140
+ self.mid = build_mid_block(
1141
+ channels=base_block_channels,
1142
+ temb_channels=self.temb_ch,
1143
+ dropout=dropout,
1144
+ norm_type=self.norm_type,
1145
+ causality_axis=self.causality_axis,
1146
+ attn_type=self.attn_type,
1147
+ add_attention=mid_block_add_attention,
1148
+ )
1149
+ self.up, final_block_channels = build_upsampling_path(
1150
+ ch=ch,
1151
+ ch_mult=ch_mult,
1152
+ num_resolutions=self.num_resolutions,
1153
+ num_res_blocks=num_res_blocks,
1154
+ resolution=resolution,
1155
+ temb_channels=self.temb_ch,
1156
+ dropout=dropout,
1157
+ norm_type=self.norm_type,
1158
+ causality_axis=self.causality_axis,
1159
+ attn_type=self.attn_type,
1160
+ attn_resolutions=attn_resolutions,
1161
+ resamp_with_conv=resamp_with_conv,
1162
+ initial_block_channels=base_block_channels,
1163
+ )
1164
+
1165
+ self.norm_out = build_normalization_layer(final_block_channels, normtype=self.norm_type)
1166
+ self.conv_out = make_conv2d(
1167
+ final_block_channels, out_ch, kernel_size=3, stride=1, causality_axis=self.causality_axis
1168
+ )
1169
+
1170
+ def forward(self, sample: torch.Tensor) -> torch.Tensor:
1171
+ """
1172
+ Decode latent features back to audio spectrograms.
1173
+ Args:
1174
+ sample: Encoded latent representation of shape (batch, channels, frames, mel_bins)
1175
+ Returns:
1176
+ Reconstructed audio spectrogram of shape (batch, channels, time, frequency)
1177
+ """
1178
+ sample, target_shape = self._denormalize_latents(sample)
1179
+
1180
+ h = self.conv_in(sample)
1181
+ h = run_mid_block(self.mid, h)
1182
+ h = self._run_upsampling_path(h)
1183
+ h = self._finalize_output(h)
1184
+
1185
+ return self._adjust_output_shape(h, target_shape)
1186
+
1187
+ def _denormalize_latents(self, sample: torch.Tensor) -> tuple[torch.Tensor, AudioLatentShape]:
1188
+ latent_shape = AudioLatentShape(
1189
+ batch=sample.shape[0],
1190
+ channels=sample.shape[1],
1191
+ frames=sample.shape[2],
1192
+ mel_bins=sample.shape[3],
1193
+ )
1194
+
1195
+ sample_patched = self.patchifier.patchify(sample)
1196
+ sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
1197
+ sample = self.patchifier.unpatchify(sample_denormalized, latent_shape)
1198
+
1199
+ target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR
1200
+ if self.causality_axis != CausalityAxis.NONE:
1201
+ target_frames = max(target_frames - (LATENT_DOWNSAMPLE_FACTOR - 1), 1)
1202
+
1203
+ target_shape = AudioLatentShape(
1204
+ batch=latent_shape.batch,
1205
+ channels=self.out_ch,
1206
+ frames=target_frames,
1207
+ mel_bins=self.mel_bins if self.mel_bins is not None else latent_shape.mel_bins,
1208
+ )
1209
+
1210
+ return sample, target_shape
1211
+
1212
+ def _adjust_output_shape(
1213
+ self,
1214
+ decoded_output: torch.Tensor,
1215
+ target_shape: AudioLatentShape,
1216
+ ) -> torch.Tensor:
1217
+ """
1218
+ Adjust output shape to match target dimensions for variable-length audio.
1219
+ This function handles the common case where decoded audio spectrograms need to be
1220
+ resized to match a specific target shape.
1221
+ Args:
1222
+ decoded_output: Tensor of shape (batch, channels, time, frequency)
1223
+ target_shape: AudioLatentShape describing (batch, channels, time, mel bins)
1224
+ Returns:
1225
+ Tensor adjusted to match target_shape exactly
1226
+ """
1227
+ # Current output shape: (batch, channels, time, frequency)
1228
+ _, _, current_time, current_freq = decoded_output.shape
1229
+ target_channels = target_shape.channels
1230
+ target_time = target_shape.frames
1231
+ target_freq = target_shape.mel_bins
1232
+
1233
+ # Step 1: Crop first to avoid exceeding target dimensions
1234
+ decoded_output = decoded_output[
1235
+ :, :target_channels, : min(current_time, target_time), : min(current_freq, target_freq)
1236
+ ]
1237
+
1238
+ # Step 2: Calculate padding needed for time and frequency dimensions
1239
+ time_padding_needed = target_time - decoded_output.shape[2]
1240
+ freq_padding_needed = target_freq - decoded_output.shape[3]
1241
+
1242
+ # Step 3: Apply padding if needed
1243
+ if time_padding_needed > 0 or freq_padding_needed > 0:
1244
+ # PyTorch padding format: (pad_left, pad_right, pad_top, pad_bottom)
1245
+ # For audio: pad_left/right = frequency, pad_top/bottom = time
1246
+ padding = (
1247
+ 0,
1248
+ max(freq_padding_needed, 0), # frequency padding (left, right)
1249
+ 0,
1250
+ max(time_padding_needed, 0), # time padding (top, bottom)
1251
+ )
1252
+ decoded_output = F.pad(decoded_output, padding)
1253
+
1254
+ # Step 4: Final safety crop to ensure exact target shape
1255
+ decoded_output = decoded_output[:, :target_channels, :target_time, :target_freq]
1256
+
1257
+ return decoded_output
1258
+
1259
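A shape-only sketch of the crop-then-pad logic in `_adjust_output_shape` with made-up sizes: the frequency axis is cropped, the time axis is zero-padded, and the result lands exactly on the target shape.

```python
import torch
import torch.nn.functional as F

out = torch.randn(1, 2, 37, 70)                          # decoded (B, C, time, freq)
target_t, target_f = 40, 64                              # need more time, fewer freq bins
out = out[:, :, :min(37, target_t), :min(70, target_f)]  # crop -> (1, 2, 37, 64)
out = F.pad(out, (0, 0, 0, target_t - out.shape[2]))     # pad time -> (1, 2, 40, 64)
assert out.shape == (1, 2, 40, 64)
```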
+ def _run_upsampling_path(self, h: torch.Tensor) -> torch.Tensor:
1260
+ for level in reversed(range(self.num_resolutions)):
1261
+ stage = self.up[level]
1262
+ for block_idx, block in enumerate(stage.block):
1263
+ h = block(h, temb=None)
1264
+ if stage.attn:
1265
+ h = stage.attn[block_idx](h)
1266
+
1267
+ if level != 0 and hasattr(stage, "upsample"):
1268
+ h = stage.upsample(h)
1269
+
1270
+ return h
1271
+
1272
+ def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
1273
+ if self.give_pre_end:
1274
+ return h
1275
+
1276
+ h = self.norm_out(h)
1277
+ h = self.non_linearity(h)
1278
+ h = self.conv_out(h)
1279
+ return torch.tanh(h) if self.tanh_out else h
1280
+
1281
+
1282
+ def get_padding(kernel_size: int, dilation: int = 1) -> int:
1283
+ return int((kernel_size * dilation - dilation) / 2)
1284
+
1285
+
1286
+ # ---------------------------------------------------------------------------
1287
+ # Anti-aliased resampling helpers (kaiser-sinc filters) for BigVGAN v2
1288
+ # Adopted from https://github.com/NVIDIA/BigVGAN
1289
+ # ---------------------------------------------------------------------------
1290
+
1291
+
1292
+ def _sinc(x: torch.Tensor) -> torch.Tensor:
1293
+ return torch.where(
1294
+ x == 0,
1295
+ torch.tensor(1.0, device=x.device, dtype=x.dtype),
1296
+ torch.sin(math.pi * x) / math.pi / x,
1297
+ )
1298
+
1299
+
1300
+ def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: int) -> torch.Tensor:
1301
+ even = kernel_size % 2 == 0
1302
+ half_size = kernel_size // 2
1303
+ delta_f = 4 * half_width
1304
+ amplitude = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
1305
+ if amplitude > 50.0:
1306
+ beta = 0.1102 * (amplitude - 8.7)
1307
+ elif amplitude >= 21.0:
1308
+ beta = 0.5842 * (amplitude - 21) ** 0.4 + 0.07886 * (amplitude - 21.0)
1309
+ else:
1310
+ beta = 0.0
1311
+ window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
1312
+ time = torch.arange(-half_size, half_size) + 0.5 if even else torch.arange(kernel_size) - half_size
1313
+ if cutoff == 0:
1314
+ filter_ = torch.zeros_like(time)
1315
+ else:
1316
+ filter_ = 2 * cutoff * window * _sinc(2 * cutoff * time)
1317
+ filter_ /= filter_.sum()
1318
+ return filter_.view(1, 1, kernel_size)
1319
+
1320
+
1321
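A sanity check on the filter construction (assuming `kaiser_sinc_filter1d` above is in scope): for any nonzero cutoff the taps are normalized to unit DC gain, so low-pass filtering preserves the mean level of the signal.

```python
import torch

f = kaiser_sinc_filter1d(cutoff=0.25, half_width=0.3, kernel_size=12)
assert f.shape == (1, 1, 12)
assert torch.allclose(f.sum(), torch.tensor(1.0), atol=1e-6)
```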
+ class LowPassFilter1d(nn.Module):
1322
+ def __init__(
1323
+ self,
1324
+ cutoff: float = 0.5,
1325
+ half_width: float = 0.6,
1326
+ stride: int = 1,
1327
+ padding: bool = True,
1328
+ padding_mode: str = "replicate",
1329
+ kernel_size: int = 12,
1330
+ ) -> None:
1331
+ super().__init__()
1332
+ if cutoff < 0.0:
1333
+ raise ValueError("Cutoff must be non-negative.")
1334
+ if cutoff > 0.5:
1335
+ raise ValueError("A cutoff above 0.5 does not make sense.")
1336
+ self.kernel_size = kernel_size
1337
+ self.even = kernel_size % 2 == 0
1338
+ self.pad_left = kernel_size // 2 - int(self.even)
1339
+ self.pad_right = kernel_size // 2
1340
+ self.stride = stride
1341
+ self.padding = padding
1342
+ self.padding_mode = padding_mode
1343
+ self.register_buffer("filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size))
1344
+
1345
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1346
+ _, n_channels, _ = x.shape
1347
+ if self.padding:
1348
+ x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
1349
+ return F.conv1d(x, self.filter.expand(n_channels, -1, -1), stride=self.stride, groups=n_channels)
1350
+
1351
+
1352
+ class UpSample1d(nn.Module):
1353
+ def __init__(
1354
+ self,
1355
+ ratio: int = 2,
1356
+ kernel_size: int | None = None,
1357
+ persistent: bool = True,
1358
+ window_type: str = "kaiser",
1359
+ ) -> None:
1360
+ super().__init__()
1361
+ self.ratio = ratio
1362
+ self.stride = ratio
1363
+
1364
+ if window_type == "hann":
1365
+ # Hann-windowed sinc filter equivalent to torchaudio.functional.resample
1366
+ rolloff = 0.99
1367
+ lowpass_filter_width = 6
1368
+ width = math.ceil(lowpass_filter_width / rolloff)
1369
+ self.kernel_size = 2 * width * ratio + 1
1370
+ self.pad = width
1371
+ self.pad_left = 2 * width * ratio
1372
+ self.pad_right = self.kernel_size - ratio
1373
+ time_axis = (torch.arange(self.kernel_size) / ratio - width) * rolloff
1374
+ time_clamped = time_axis.clamp(-lowpass_filter_width, lowpass_filter_width)
1375
+ window = torch.cos(time_clamped * math.pi / lowpass_filter_width / 2) ** 2
1376
+ sinc_filter = (torch.sinc(time_axis) * window * rolloff / ratio).view(1, 1, -1)
1377
+ else:
1378
+ # Kaiser-windowed sinc filter (BigVGAN default).
1379
+ self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
1380
+ self.pad = self.kernel_size // ratio - 1
1381
+ self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
1382
+ self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
1383
+ sinc_filter = kaiser_sinc_filter1d(
1384
+ cutoff=0.5 / ratio,
1385
+ half_width=0.6 / ratio,
1386
+ kernel_size=self.kernel_size,
1387
+ )
1388
+
1389
+ self.register_buffer("filter", sinc_filter, persistent=persistent)
1390
+
1391
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1392
+ _, n_channels, _ = x.shape
1393
+ x = F.pad(x, (self.pad, self.pad), mode="replicate")
1394
+ filt = self.filter.to(dtype=x.dtype, device=x.device).expand(n_channels, -1, -1)
1395
+ x = self.ratio * F.conv_transpose1d(x, filt, stride=self.stride, groups=n_channels)
1396
+ return x[..., self.pad_left : -self.pad_right]
1397
+
1398
+
1399
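A length check for `UpSample1d` (assuming the class above is in scope): with the default kaiser filter, the replicate padding and output slicing are sized so that upsampling by `ratio` returns exactly `ratio * T` samples.

```python
import torch

up = UpSample1d(ratio=2)
x = torch.randn(1, 1, 100)
assert up(x).shape == (1, 1, 200)
```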
+ class DownSample1d(nn.Module):
1400
+ def __init__(self, ratio: int = 2, kernel_size: int | None = None) -> None:
1401
+ super().__init__()
1402
+ self.ratio = ratio
1403
+ self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
1404
+ self.lowpass = LowPassFilter1d(
1405
+ cutoff=0.5 / ratio,
1406
+ half_width=0.6 / ratio,
1407
+ stride=ratio,
1408
+ kernel_size=self.kernel_size,
1409
+ )
1410
+
1411
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1412
+ return self.lowpass(x)
1413
+
1414
+
1415
+ class Activation1d(nn.Module):
1416
+ def __init__(
1417
+ self,
1418
+ activation: nn.Module,
1419
+ up_ratio: int = 2,
1420
+ down_ratio: int = 2,
1421
+ up_kernel_size: int = 12,
1422
+ down_kernel_size: int = 12,
1423
+ ) -> None:
1424
+ super().__init__()
1425
+ self.act = activation
1426
+ self.upsample = UpSample1d(up_ratio, up_kernel_size)
1427
+ self.downsample = DownSample1d(down_ratio, down_kernel_size)
1428
+
1429
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1430
+ x = self.upsample(x)
1431
+ x = self.act(x)
1432
+ return self.downsample(x)
1433
+
1434
+
1435
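`Activation1d` applies the nonlinearity at 2x the sample rate (upsample, activate, downsample) to suppress aliasing from the nonlinearity's harmonics; the round trip preserves the temporal length, as this sketch checks (using `nn.Tanh` as a stand-in activation):

```python
import torch
from torch import nn

act = Activation1d(nn.Tanh())
x = torch.randn(1, 4, 64)
assert act(x).shape == x.shape
```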
+ class Snake(nn.Module):
1436
+ def __init__(
1437
+ self,
1438
+ in_features: int,
1439
+ alpha: float = 1.0,
1440
+ alpha_trainable: bool = True,
1441
+ alpha_logscale: bool = True,
1442
+ ) -> None:
1443
+ super().__init__()
1444
+ self.alpha_logscale = alpha_logscale
1445
+ self.alpha = nn.Parameter(torch.zeros(in_features) if alpha_logscale else torch.ones(in_features) * alpha)
1446
+ self.alpha.requires_grad = alpha_trainable
1447
+ self.eps = 1e-9
1448
+
1449
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1450
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
1451
+ if self.alpha_logscale:
1452
+ alpha = torch.exp(alpha)
1453
+ return x + (1.0 / (alpha + self.eps)) * torch.sin(x * alpha).pow(2)
1454
+
1455
+
1456
+ class SnakeBeta(nn.Module):
1457
+ def __init__(
1458
+ self,
1459
+ in_features: int,
1460
+ alpha: float = 1.0,
1461
+ alpha_trainable: bool = True,
1462
+ alpha_logscale: bool = True,
1463
+ ) -> None:
1464
+ super().__init__()
1465
+ self.alpha_logscale = alpha_logscale
1466
+ self.alpha = nn.Parameter(torch.zeros(in_features) if alpha_logscale else torch.ones(in_features) * alpha)
1467
+ self.alpha.requires_grad = alpha_trainable
1468
+ self.beta = nn.Parameter(torch.zeros(in_features) if alpha_logscale else torch.ones(in_features) * alpha)
1469
+ self.beta.requires_grad = alpha_trainable
1470
+ self.eps = 1e-9
1471
+
1472
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1473
+ alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
1474
+ beta = self.beta.unsqueeze(0).unsqueeze(-1)
1475
+ if self.alpha_logscale:
1476
+ alpha = torch.exp(alpha)
1477
+ beta = torch.exp(beta)
1478
+ return x + (1.0 / (beta + self.eps)) * torch.sin(x * alpha).pow(2)
1479
+
1480
+
1481
+ class AMPBlock1(nn.Module):
1482
+ def __init__(
1483
+ self,
1484
+ channels: int,
1485
+ kernel_size: int = 3,
1486
+ dilation: tuple[int, int, int] = (1, 3, 5),
1487
+ activation: str = "snake",
1488
+ ) -> None:
1489
+ super().__init__()
1490
+ act_cls = SnakeBeta if activation == "snakebeta" else Snake
1491
+ self.convs1 = nn.ModuleList(
1492
+ [
1493
+ nn.Conv1d(
1494
+ channels,
1495
+ channels,
1496
+ kernel_size,
1497
+ 1,
1498
+ dilation=dilation[0],
1499
+ padding=get_padding(kernel_size, dilation[0]),
1500
+ ),
1501
+ nn.Conv1d(
1502
+ channels,
1503
+ channels,
1504
+ kernel_size,
1505
+ 1,
1506
+ dilation=dilation[1],
1507
+ padding=get_padding(kernel_size, dilation[1]),
1508
+ ),
1509
+ nn.Conv1d(
1510
+ channels,
1511
+ channels,
1512
+ kernel_size,
1513
+ 1,
1514
+ dilation=dilation[2],
1515
+ padding=get_padding(kernel_size, dilation[2]),
1516
+ ),
1517
+ ]
1518
+ )
1519
+
1520
+ self.convs2 = nn.ModuleList(
1521
+ [
1522
+ nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
1523
+ nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
1524
+ nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
1525
+ ]
1526
+ )
1527
+
1528
+ self.acts1 = nn.ModuleList([Activation1d(act_cls(channels)) for _ in range(len(self.convs1))])
1529
+ self.acts2 = nn.ModuleList([Activation1d(act_cls(channels)) for _ in range(len(self.convs2))])
1530
+
1531
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1532
+ for c1, c2, a1, a2 in zip(self.convs1, self.convs2, self.acts1, self.acts2, strict=True):
1533
+ xt = a1(x)
1534
+ xt = c1(xt)
1535
+ xt = a2(xt)
1536
+ xt = c2(xt)
1537
+ x = x + xt
1538
+ return x
1539
+
1540
+
1541
+ class LTX2Vocoder(torch.nn.Module):
1542
+ """
1543
+ LTX2Vocoder model for synthesizing audio from Mel spectrograms.
1544
+ Args:
1545
+ resblock_kernel_sizes: List of kernel sizes for the residual blocks.
1546
+ This value is read from the checkpoint at `config.vocoder.resblock_kernel_sizes`.
1547
+ upsample_rates: List of upsampling rates.
1548
+ This value is read from the checkpoint at `config.vocoder.upsample_rates`.
1549
+ upsample_kernel_sizes: List of kernel sizes for the upsampling layers.
1550
+ This value is read from the checkpoint at `config.vocoder.upsample_kernel_sizes`.
1551
+ resblock_dilation_sizes: List of dilation sizes for the residual blocks.
1552
+ This value is read from the checkpoint at `config.vocoder.resblock_dilation_sizes`.
1553
+ upsample_initial_channel: Initial number of channels for the upsampling layers.
1554
+ This value is read from the checkpoint at `config.vocoder.upsample_initial_channel`.
1555
+ resblock: Type of residual block to use ("1" for ResBlock1, otherwise AMPBlock1).
1556
+ This value is read from the checkpoint at `config.vocoder.resblock`.
1557
+ output_sampling_rate: Waveform sample rate.
1558
+ This value is read from the checkpoint at `config.vocoder.output_sampling_rate`.
1559
+ activation: Activation type for BigVGAN v2 ("snake" or "snakebeta"). Only used when resblock="AMP1".
1560
+ use_tanh_at_final: Apply tanh at the output (when apply_final_activation=True).
1561
+ apply_final_activation: Whether to apply the final tanh/clamp activation.
1562
+ use_bias_at_final: Whether to use bias in the final conv layer.
1563
+ """
1564
+
1565
+ def __init__( # noqa: PLR0913
1566
+ self,
1567
+ resblock_kernel_sizes: List[int] | None = None,
1568
+ upsample_rates: List[int] | None = None,
1569
+ upsample_kernel_sizes: List[int] | None = None,
1570
+ resblock_dilation_sizes: List[List[int]] | None = None,
1571
+ upsample_initial_channel: int = 1024,
1572
+ resblock: str = "1",
1573
+ output_sampling_rate: int = 24000,
1574
+ activation: str = "snake",
1575
+ use_tanh_at_final: bool = True,
1576
+ apply_final_activation: bool = True,
1577
+ use_bias_at_final: bool = True,
1578
+ ) -> None:
1579
+ super().__init__()
1580
+
1581
+ # Mutable default values are not supported as default arguments.
1582
+ if resblock_kernel_sizes is None:
1583
+ resblock_kernel_sizes = [3, 7, 11]
1584
+ if upsample_rates is None:
1585
+ upsample_rates = [6, 5, 2, 2, 2]
1586
+ if upsample_kernel_sizes is None:
1587
+ upsample_kernel_sizes = [16, 15, 8, 4, 4]
1588
+ if resblock_dilation_sizes is None:
1589
+ resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
1590
+
1591
+ self.output_sampling_rate = output_sampling_rate
1592
+ self.num_kernels = len(resblock_kernel_sizes)
1593
+ self.num_upsamples = len(upsample_rates)
1594
+ self.use_tanh_at_final = use_tanh_at_final
1595
+ self.apply_final_activation = apply_final_activation
1596
+ self.is_amp = resblock == "AMP1"
1597
+
1598
+ # All production checkpoints are stereo: 128 input channels (2 stereo channels x 64 mel
1599
+ # bins each), 2 output channels.
1600
+ self.conv_pre = nn.Conv1d(
1601
+ in_channels=128,
1602
+ out_channels=upsample_initial_channel,
1603
+ kernel_size=7,
1604
+ stride=1,
1605
+ padding=3,
1606
+ )
1607
+ resblock_cls = ResBlock1 if resblock == "1" else AMPBlock1
1608
+
1609
+ self.ups = nn.ModuleList(
1610
+ nn.ConvTranspose1d(
1611
+ upsample_initial_channel // (2**i),
1612
+ upsample_initial_channel // (2 ** (i + 1)),
1613
+ kernel_size,
1614
+ stride,
1615
+ padding=(kernel_size - stride) // 2,
1616
+ )
1617
+ for i, (stride, kernel_size) in enumerate(zip(upsample_rates, upsample_kernel_sizes, strict=True))
1618
+ )
1619
+
1620
+ final_channels = upsample_initial_channel // (2 ** len(upsample_rates))
1621
+ self.resblocks = nn.ModuleList()
1622
+
1623
+ for i in range(len(upsample_rates)):
1624
+ ch = upsample_initial_channel // (2 ** (i + 1))
1625
+ for kernel_size, dilations in zip(resblock_kernel_sizes, resblock_dilation_sizes, strict=True):
1626
+ if self.is_amp:
1627
+ self.resblocks.append(resblock_cls(ch, kernel_size, dilations, activation=activation))
1628
+ else:
1629
+ self.resblocks.append(resblock_cls(ch, kernel_size, dilations))
1630
+
1631
+ if self.is_amp:
1632
+ self.act_post: nn.Module = Activation1d(SnakeBeta(final_channels))
1633
+ else:
1634
+ self.act_post = nn.LeakyReLU()
1635
+
1636
+ # All production checkpoints are stereo: this final conv maps `final_channels` to 2 output channels (stereo).
1637
+ self.conv_post = nn.Conv1d(
1638
+ in_channels=final_channels,
1639
+ out_channels=2,
1640
+ kernel_size=7,
1641
+ stride=1,
1642
+ padding=3,
1643
+ bias=use_bias_at_final,
1644
+ )
1645
+
1646
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
1647
+ """
1648
+ Forward pass of the vocoder.
1649
+ Args:
1650
+ x: Input Mel spectrogram tensor. Can be either:
1651
+ - 3D: (batch_size, time, mel_bins) for mono
1652
+ - 4D: (batch_size, 2, time, mel_bins) for stereo
1653
+ Returns:
1654
+ Audio waveform tensor of shape (batch_size, out_channels, audio_length)
1655
+ """
1656
+ x = x.transpose(-2, -1)  # (..., time, mel_bins) -> (..., mel_bins, time); works for both 3D mono and 4D stereo inputs
1657
+
1658
+ if x.dim() == 4: # stereo
1659
+ assert x.shape[1] == 2, "Input must have 2 channels for stereo"
1660
+ x = einops.rearrange(x, "b s c t -> b (s c) t")
1661
+
1662
+ x = self.conv_pre(x)
1663
+
1664
+ for i in range(self.num_upsamples):
1665
+ if not self.is_amp:
1666
+ x = F.leaky_relu(x, LRELU_SLOPE)
1667
+ x = self.ups[i](x)
1668
+ start = i * self.num_kernels
1669
+ end = start + self.num_kernels
1670
+
1671
+ # Evaluate all resblocks with the same input tensor so they can run
1672
+ # independently (and thus in parallel on accelerator hardware) before
1673
+ # aggregating their outputs via mean.
1674
+ block_outputs = torch.stack(
1675
+ [self.resblocks[idx](x) for idx in range(start, end)],
1676
+ dim=0,
1677
+ )
1678
+ x = block_outputs.mean(dim=0)
1679
+
1680
+ x = self.act_post(x)
1681
+ x = self.conv_post(x)
1682
+
1683
+ if self.apply_final_activation:
1684
+ x = torch.tanh(x) if self.use_tanh_at_final else torch.clamp(x, -1, 1)
1685
+
1686
+ return x
1687
+
1688
+
1689
+ class _STFTFn(nn.Module):
1690
+ """Implements STFT as a convolution with precomputed DFT x Hann-window bases.
1691
+ The DFT basis rows (real and imaginary parts interleaved) multiplied by the causal
1692
+ Hann window are stored as buffers and loaded from the checkpoint. Using the exact
1693
+ bfloat16 bases from training ensures the mel values fed to the BWE generator are
1694
+ bit-identical to what it was trained on.
1695
+ """
1696
+
1697
+ def __init__(self, filter_length: int, hop_length: int, win_length: int) -> None:
1698
+ super().__init__()
1699
+ self.hop_length = hop_length
1700
+ self.win_length = win_length
1701
+ n_freqs = filter_length // 2 + 1
1702
+ self.register_buffer("forward_basis", torch.zeros(n_freqs * 2, 1, filter_length))
1703
+ self.register_buffer("inverse_basis", torch.zeros(n_freqs * 2, 1, filter_length))
1704
+
1705
+ def forward(self, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
1706
+ """Compute magnitude and phase spectrogram from a batch of waveforms.
1707
+ Applies causal (left-only) padding of win_length - hop_length samples so that
1708
+ each output frame depends only on past and present input — no lookahead.
1709
+ Args:
1710
+ y: Waveform tensor of shape (B, T).
1711
+ Returns:
1712
+ magnitude: Linear amplitude spectrogram, shape (B, n_freqs, T_frames).
1713
+ phase: Phase spectrogram in radians, shape (B, n_freqs, T_frames).
1714
+ """
1715
+ if y.dim() == 2:
1716
+ y = y.unsqueeze(1) # (B, 1, T)
1717
+ left_pad = max(0, self.win_length - self.hop_length) # causal: left-only
1718
+ y = F.pad(y, (left_pad, 0))
1719
+ spec = F.conv1d(y, self.forward_basis, stride=self.hop_length, padding=0)
1720
+ n_freqs = spec.shape[1] // 2
1721
+ real, imag = spec[:, :n_freqs], spec[:, n_freqs:]
1722
+ magnitude = torch.sqrt(real**2 + imag**2)
1723
+ phase = torch.atan2(imag.float(), real.float()).to(real.dtype)
1724
+ return magnitude, phase
1725
+
1726
+
1727
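For reference, a forward basis of the kind `_STFTFn` loads can be reconstructed as the real/imaginary DFT rows times the analysis window; this sketch is illustrative only, since the checkpoint ships the exact bfloat16 buffers used in training.

```python
import torch

filter_length = 512
n_freqs = filter_length // 2 + 1
fourier = torch.fft.rfft(torch.eye(filter_length))        # (512, 257) complex
basis = torch.cat([fourier.real, fourier.imag], dim=1).T  # (2*n_freqs, 512)
window = torch.hann_window(filter_length, periodic=True)
forward_basis = (basis * window).unsqueeze(1)             # (2*n_freqs, 1, 512)
assert forward_basis.shape == (2 * n_freqs, 1, filter_length)
```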
+ class MelSTFT(nn.Module):
1728
+ """Causal log-mel spectrogram module whose buffers are loaded from the checkpoint.
1729
+ Computes a log-mel spectrogram by running the causal STFT (_STFTFn) on the input
1730
+ waveform and projecting the linear magnitude spectrum onto the mel filterbank.
1731
+ The module's state dict layout matches the 'mel_stft.*' keys stored in the checkpoint
1732
+ (mel_basis, stft_fn.forward_basis, stft_fn.inverse_basis).
1733
+ """
1734
+
1735
+ def __init__(
1736
+ self,
1737
+ filter_length: int,
1738
+ hop_length: int,
1739
+ win_length: int,
1740
+ n_mel_channels: int,
1741
+ ) -> None:
1742
+ super().__init__()
1743
+ self.stft_fn = _STFTFn(filter_length, hop_length, win_length)
1744
+
1745
+ # Initialized to zeros; load_state_dict overwrites with the checkpoint's
1746
+ # exact bfloat16 filterbank (vocoder.mel_stft.mel_basis, shape [n_mels, n_freqs]).
1747
+ n_freqs = filter_length // 2 + 1
1748
+ self.register_buffer("mel_basis", torch.zeros(n_mel_channels, n_freqs))
1749
+
1750
+ def mel_spectrogram(self, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
1751
+ """Compute log-mel spectrogram and auxiliary spectral quantities.
1752
+ Args:
1753
+ y: Waveform tensor of shape (B, T).
1754
+ Returns:
1755
+ log_mel: Log-compressed mel spectrogram, shape (B, n_mel_channels, T_frames).
1756
+ magnitude: Linear amplitude spectrogram, shape (B, n_freqs, T_frames).
1757
+ phase: Phase spectrogram in radians, shape (B, n_freqs, T_frames).
1758
+ energy: Per-frame energy (L2 norm over frequency), shape (B, T_frames).
1759
+ """
1760
+ magnitude, phase = self.stft_fn(y)
1761
+ energy = torch.norm(magnitude, dim=1)
1762
+ mel = torch.matmul(self.mel_basis.to(magnitude.dtype), magnitude)
1763
+ log_mel = torch.log(torch.clamp(mel, min=1e-5))
1764
+ return log_mel, magnitude, phase, energy
1765
+
1766
+
1767
+ class LTX2VocoderWithBWE(nn.Module):
1768
+ """LTX2Vocoder with bandwidth extension (BWE) upsampling.
1769
+ Chains a mel-to-wav vocoder with a BWE module that upsamples the output
1770
+ to a higher sample rate. The BWE computes a mel spectrogram from the
1771
+ vocoder output, runs it through a second generator to predict a residual,
1772
+ and adds it to a sinc-resampled skip connection.
1773
+ """
1774
+
1775
+ def __init__(
1776
+ self,
1777
+ input_sampling_rate: int = 16000,
1778
+ output_sampling_rate: int = 48000,
1779
+ hop_length: int = 80,
1780
+ ) -> None:
1781
+ super().__init__()
1782
+ self.vocoder = LTX2Vocoder(
1783
+ resblock_kernel_sizes=[3, 7, 11],
1784
+ upsample_rates=[5, 2, 2, 2, 2, 2],
1785
+ upsample_kernel_sizes=[11, 4, 4, 4, 4, 4],
1786
+ resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
1787
+ upsample_initial_channel=1536,
1788
+ resblock="AMP1",
1789
+ activation="snakebeta",
1790
+ use_tanh_at_final=False,
1791
+ apply_final_activation=True,
1792
+ use_bias_at_final=False,
1793
+ output_sampling_rate=input_sampling_rate,
1794
+ )
1795
+ self.bwe_generator = LTX2Vocoder(
1796
+ resblock_kernel_sizes=[3, 7, 11],
1797
+ upsample_rates=[6, 5, 2, 2, 2],
1798
+ upsample_kernel_sizes=[12, 11, 4, 4, 4],
1799
+ resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
1800
+ upsample_initial_channel=512,
1801
+ resblock="AMP1",
1802
+ activation="snakebeta",
1803
+ use_tanh_at_final=False,
1804
+ apply_final_activation=False,
1805
+ use_bias_at_final=False,
1806
+ output_sampling_rate=output_sampling_rate,
1807
+ )
1808
+
1809
+ self.mel_stft = MelSTFT(
1810
+ filter_length=512,
1811
+ hop_length=hop_length,
1812
+ win_length=512,
1813
+ n_mel_channels=64,
1814
+ )
1815
+ self.input_sampling_rate = input_sampling_rate
1816
+ self.output_sampling_rate = output_sampling_rate
1817
+ self.hop_length = hop_length
1818
+ # Compute the resampler on CPU so the sinc filter is materialized even when
1819
+ # the model is constructed on meta device (SingleGPUModelBuilder pattern).
1820
+ # The filter is not stored in the checkpoint (persistent=False).
1821
+ with torch.device("cpu"):
1822
+ self.resampler = UpSample1d(
1823
+ ratio=output_sampling_rate // input_sampling_rate, persistent=False, window_type="hann"
1824
+ )
1825
+
1826
+ @property
1827
+ def conv_pre(self) -> nn.Conv1d:
1828
+ return self.vocoder.conv_pre
1829
+
1830
+ @property
1831
+ def conv_post(self) -> nn.Conv1d:
1832
+ return self.vocoder.conv_post
1833
+
1834
+ def _compute_mel(self, audio: torch.Tensor) -> torch.Tensor:
1835
+ """Compute log-mel spectrogram from waveform using causal STFT bases.
1836
+ Args:
1837
+ audio: Waveform tensor of shape (B, C, T).
1838
+ Returns:
1839
+ mel: Log-mel spectrogram of shape (B, C, n_mels, T_frames).
1840
+ """
1841
+ batch, n_channels, _ = audio.shape
1842
+ flat = audio.reshape(batch * n_channels, -1) # (B*C, T)
1843
+ mel, _, _, _ = self.mel_stft.mel_spectrogram(flat) # (B*C, n_mels, T_frames)
1844
+ return mel.reshape(batch, n_channels, mel.shape[1], mel.shape[2]) # (B, C, n_mels, T_frames)
1845
+
1846
+ def forward(self, mel_spec: torch.Tensor) -> torch.Tensor:
1847
+ """Run the full vocoder + BWE forward pass.
1848
+ Args:
1849
+ mel_spec: Mel spectrogram of shape (B, 2, T, mel_bins) for stereo
1850
+ or (B, T, mel_bins) for mono. Same format as LTX2Vocoder.forward.
1851
+ Returns:
1852
+ Waveform tensor of shape (B, out_channels, T_out) clipped to [-1, 1].
1853
+ """
1854
+ x = self.vocoder(mel_spec)
1855
+ _, _, length_low_rate = x.shape
1856
+ output_length = length_low_rate * self.output_sampling_rate // self.input_sampling_rate
1857
+
1858
+ # Pad to multiple of hop_length for exact mel frame count
1859
+ remainder = length_low_rate % self.hop_length
1860
+ if remainder != 0:
1861
+ x = F.pad(x, (0, self.hop_length - remainder))
1862
+
1863
+ # Compute mel spectrogram from vocoder output: (B, C, n_mels, T_frames)
1864
+ mel = self._compute_mel(x)
1865
+
1866
+ # LTX2Vocoder.forward expects (B, C, T, mel_bins) — transpose before calling bwe_generator
1867
+ mel_for_bwe = mel.transpose(2, 3) # (B, C, T_frames, mel_bins)
1868
+ residual = self.bwe_generator(mel_for_bwe)
1869
+ skip = self.resampler(x)
1870
+ assert residual.shape == skip.shape, f"residual {residual.shape} != skip {skip.shape}"
1871
+
1872
+ return torch.clamp(residual + skip, -1, 1)[..., :output_length]
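A hypothetical end-to-end usage sketch (random weights and zeroed buffers, so the audio itself is meaningless; real runs load the LTX2 checkpoint, and the shapes follow the docstrings above):

```python
import torch

bwe = LTX2VocoderWithBWE(input_sampling_rate=16000, output_sampling_rate=48000)
mel = torch.randn(1, 2, 20, 64)  # (batch, stereo, T_frames, mel_bins)
wav = bwe(mel)                   # 20 frames * 160x vocoder * 3x BWE = 9600 samples
assert wav.shape == (1, 2, 9600)
```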
diffsynth/models/ltx2_common.py ADDED
@@ -0,0 +1,388 @@
1
+ from dataclasses import dataclass
2
+ from typing import NamedTuple, Protocol, Tuple
3
+ import torch
4
+ from torch import nn
5
+ from enum import Enum
6
+
7
+
8
+ class VideoPixelShape(NamedTuple):
9
+ """
10
+ Shape of the tensor representing the video pixel array. Assumes BGR channel format.
11
+ """
12
+
13
+ batch: int
14
+ frames: int
15
+ height: int
16
+ width: int
17
+ fps: float
18
+
19
+
20
+ class SpatioTemporalScaleFactors(NamedTuple):
21
+ """
22
+ Describes the spatiotemporal downscaling between decoded video space and
23
+ the corresponding VAE latent grid.
24
+ """
25
+
26
+ time: int
27
+ width: int
28
+ height: int
29
+
30
+ @classmethod
31
+ def default(cls) -> "SpatioTemporalScaleFactors":
32
+ return cls(time=8, width=32, height=32)
33
+
34
+
35
+ VIDEO_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
+
+
+ class VideoLatentShape(NamedTuple):
+     """
+     Shape of the tensor representing video in VAE latent space.
+     The latent representation is a 5D tensor with dimensions ordered as
+     (batch, channels, frames, height, width). Spatial and temporal dimensions
+     are downscaled relative to pixel space according to the VAE's scale factors.
+     """
+
+     batch: int
+     channels: int
+     frames: int
+     height: int
+     width: int
+
+     def to_torch_shape(self) -> torch.Size:
+         return torch.Size([self.batch, self.channels, self.frames, self.height, self.width])
+
+     @staticmethod
+     def from_torch_shape(shape: torch.Size) -> "VideoLatentShape":
+         return VideoLatentShape(
+             batch=shape[0],
+             channels=shape[1],
+             frames=shape[2],
+             height=shape[3],
+             width=shape[4],
+         )
+
+     def mask_shape(self) -> "VideoLatentShape":
+         return self._replace(channels=1)
+
+     @staticmethod
+     def from_pixel_shape(
+         shape: VideoPixelShape,
+         latent_channels: int = 128,
+         scale_factors: SpatioTemporalScaleFactors = VIDEO_SCALE_FACTORS,
+     ) -> "VideoLatentShape":
+         # Use named fields rather than positional indexing: the field order of
+         # SpatioTemporalScaleFactors is (time, width, height), so indexing [1]/[2]
+         # would silently swap the width and height factors whenever they differ.
+         frames = (shape.frames - 1) // scale_factors.time + 1
+         height = shape.height // scale_factors.height
+         width = shape.width // scale_factors.width
+
+         return VideoLatentShape(
+             batch=shape.batch,
+             channels=latent_channels,
+             frames=frames,
+             height=height,
+             width=width,
+         )
+
+     def upscale(self, scale_factors: SpatioTemporalScaleFactors = VIDEO_SCALE_FACTORS) -> "VideoLatentShape":
+         return self._replace(
+             channels=3,
+             frames=(self.frames - 1) * scale_factors.time + 1,
+             height=self.height * scale_factors.height,
+             width=self.width * scale_factors.width,
+         )
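A minimal shape check of `from_pixel_shape` and `upscale` with the default (8, 32, 32) factors (illustrative numbers, not taken from this diff):

```python
pixel = VideoPixelShape(batch=1, frames=121, height=704, width=1216, fps=30.0)

latent = VideoLatentShape.from_pixel_shape(pixel)
# frames: (121 - 1) // 8 + 1 = 16; height: 704 // 32 = 22; width: 1216 // 32 = 38
assert (latent.frames, latent.height, latent.width) == (16, 22, 38)

# upscale inverts the temporal formula: (16 - 1) * 8 + 1 = 121
up = latent.upscale()
assert (up.frames, up.height, up.width) == (121, 704, 1216)
```

The `(frames - 1) // time + 1` formula reflects the causal VAE convention noted later in `get_pixel_coords`: the first pixel frame maps to one latent frame, and every subsequent group of `time` frames adds one more.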
+
+
+ class AudioLatentShape(NamedTuple):
+     """
+     Shape of audio in VAE latent space: (batch, channels, frames, mel_bins).
+     mel_bins is the number of frequency bins from the mel-spectrogram encoding.
+     """
+
+     batch: int
+     channels: int
+     frames: int
+     mel_bins: int
+
+     def to_torch_shape(self) -> torch.Size:
+         return torch.Size([self.batch, self.channels, self.frames, self.mel_bins])
+
+     def mask_shape(self) -> "AudioLatentShape":
+         return self._replace(channels=1, mel_bins=1)
+
+     @staticmethod
+     def from_torch_shape(shape: torch.Size) -> "AudioLatentShape":
+         return AudioLatentShape(
+             batch=shape[0],
+             channels=shape[1],
+             frames=shape[2],
+             mel_bins=shape[3],
+         )
+
+     @staticmethod
+     def from_duration(
+         batch: int,
+         duration: float,
+         channels: int = 8,
+         mel_bins: int = 16,
+         sample_rate: int = 16000,
+         hop_length: int = 160,
+         audio_latent_downsample_factor: int = 4,
+     ) -> "AudioLatentShape":
+         latents_per_second = float(sample_rate) / float(hop_length) / float(audio_latent_downsample_factor)
+
+         return AudioLatentShape(
+             batch=batch,
+             channels=channels,
+             frames=round(duration * latents_per_second),
+             mel_bins=mel_bins,
+         )
+
+     @staticmethod
+     def from_video_pixel_shape(
+         shape: VideoPixelShape,
+         channels: int = 8,
+         mel_bins: int = 16,
+         sample_rate: int = 16000,
+         hop_length: int = 160,
+         audio_latent_downsample_factor: int = 4,
+     ) -> "AudioLatentShape":
+         return AudioLatentShape.from_duration(
+             batch=shape.batch,
+             duration=float(shape.frames) / float(shape.fps),
+             channels=channels,
+             mel_bins=mel_bins,
+             sample_rate=sample_rate,
+             hop_length=hop_length,
+             audio_latent_downsample_factor=audio_latent_downsample_factor,
+         )
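With the defaults, the audio latent rate works out to `16000 / 160 / 4 = 25` latent frames per second, which makes it easy to match audio latents to a video clip's duration. A worked example (illustrative inputs):

```python
# 25 latent frames per second: sample_rate / hop_length / downsample = 16000 / 160 / 4
audio = AudioLatentShape.from_duration(batch=1, duration=5.0)
assert audio.frames == 125  # round(5.0 * 25)

# A 121-frame clip at 30 fps lasts ~4.033 s -> round(4.0333... * 25) = 101 latent frames
video = VideoPixelShape(batch=1, frames=121, height=704, width=1216, fps=30.0)
assert AudioLatentShape.from_video_pixel_shape(video).frames == 101
```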
+
+
+ @dataclass(frozen=True)
+ class LatentState:
+     """
+     State of latents during the diffusion denoising process.
+
+     Attributes:
+         latent: The current noisy latent tensor being denoised.
+         denoise_mask: Mask encoding the denoising strength for each token
+             (1 = full denoising, 0 = no denoising).
+         positions: Positional indices for each latent element, used for positional embeddings.
+         clean_latent: Initial state of the latent before denoising; may include conditioning latents.
+     """
+
+     latent: torch.Tensor
+     denoise_mask: torch.Tensor
+     positions: torch.Tensor
+     clean_latent: torch.Tensor
+
+     def clone(self) -> "LatentState":
+         return LatentState(
+             latent=self.latent.clone(),
+             denoise_mask=self.denoise_mask.clone(),
+             positions=self.positions.clone(),
+             clean_latent=self.clean_latent.clone(),
+         )
+
+
+ class NormType(Enum):
+     """Normalization layer types: GROUP (GroupNorm) or PIXEL (per-location RMS norm)."""
+
+     GROUP = "group"
+     PIXEL = "pixel"
+
+
+ class PixelNorm(nn.Module):
+     """
+     Per-pixel (per-location) RMS normalization layer.
+
+     For each element along the chosen dimension, this layer normalizes the tensor
+     by the root-mean-square of its values across that dimension:
+         y = x / sqrt(mean(x^2, dim=dim, keepdim=True) + eps)
+     """
+
+     def __init__(self, dim: int = 1, eps: float = 1e-8) -> None:
+         """
+         Args:
+             dim: Dimension along which to compute the RMS (typically channels).
+             eps: Small constant added for numerical stability.
+         """
+         super().__init__()
+         self.dim = dim
+         self.eps = eps
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Apply RMS normalization along the configured dimension.
+         """
+         # Compute mean of squared values along `dim`, keep dimensions for broadcasting.
+         mean_sq = torch.mean(x**2, dim=self.dim, keepdim=True)
+         # Normalize by the root-mean-square (RMS).
+         rms = torch.sqrt(mean_sq + self.eps)
+         return x / rms
+
+
+ def build_normalization_layer(
+     in_channels: int, *, num_groups: int = 32, normtype: NormType = NormType.GROUP
+ ) -> nn.Module:
+     """
+     Create a normalization layer based on the normalization type.
+
+     Args:
+         in_channels: Number of input channels.
+         num_groups: Number of groups for group normalization.
+         normtype: Type of normalization: "group" or "pixel".
+
+     Returns:
+         A normalization layer.
+     """
+     if normtype == NormType.GROUP:
+         return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+     if normtype == NormType.PIXEL:
+         return PixelNorm(dim=1, eps=1e-6)
+     raise ValueError(f"Invalid normalization type: {normtype}")
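A minimal check of the two branches (illustrative shapes): `GroupNorm` normalizes within channel groups and applies a learned affine transform, while `PixelNorm` rescales each location to unit RMS across channels with no learned parameters:

```python
import torch

x = torch.randn(2, 64, 8, 16, 16)  # (B, C, F, H, W), illustrative

gn = build_normalization_layer(64)                           # GroupNorm(32, 64)
pn = build_normalization_layer(64, normtype=NormType.PIXEL)  # PixelNorm over dim=1

y = pn(x)
rms = y.pow(2).mean(dim=1).sqrt()  # per-location RMS over channels
assert torch.allclose(rms, torch.ones_like(rms), atol=1e-3)
assert gn(x).shape == y.shape == x.shape
```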
+
+
+ def rms_norm(x: torch.Tensor, weight: torch.Tensor | None = None, eps: float = 1e-6) -> torch.Tensor:
+     """
+     Root-mean-square (RMS) normalize `x` over its last dimension.
+
+     Thin wrapper around `torch.nn.functional.rms_norm` that infers the normalized
+     shape and forwards `weight` and `eps`.
+     """
+     return torch.nn.functional.rms_norm(x, (x.shape[-1],), weight=weight, eps=eps)
+
+
+ @dataclass(frozen=True)
+ class Modality:
+     """
+     Input data for a single modality (video or audio) in the transformer.
+
+     Bundles the latent tokens, timestep embeddings, positional information,
+     and text conditioning context for processing by the diffusion transformer.
+
+     Attributes:
+         latent: Patchified latent tokens, shape ``(B, T, D)`` where *B* is
+             the batch size, *T* is the total number of tokens (noisy +
+             conditioning), and *D* is the input dimension.
+         sigma: Current sigma value, shape ``(B,)``, used for cross-attention
+             timestep calculation.
+         timesteps: Per-token timestep embeddings, shape ``(B, T)``.
+         positions: Positional coordinates, shape ``(B, 3, T)`` for video
+             (time, height, width) or ``(B, 1, T)`` for audio.
+         context: Text conditioning embeddings from the prompt encoder.
+         enabled: Whether this modality is active in the current forward pass.
+         context_mask: Optional mask for the text context tokens.
+         attention_mask: Optional 2-D self-attention mask, shape ``(B, T, T)``.
+             Values in ``[0, 1]`` where ``1`` = full attention and ``0`` = no
+             attention. ``None`` means unrestricted (full) attention between
+             all tokens. Built incrementally by conditioning items; see
+             :class:`~ltx_core.conditioning.types.attention_strength_wrapper.ConditioningItemAttentionStrengthWrapper`.
+     """
+
+     latent: torch.Tensor  # (B, T, D): batch size, number of tokens, input dimension
+     sigma: torch.Tensor  # (B,): current sigma value, used for cross-attention timestep calculation
+     timesteps: torch.Tensor  # (B, T): per-token timesteps
+     positions: torch.Tensor  # (B, 3, T) for video: positional axes, then tokens
+     context: torch.Tensor
+     enabled: bool = True
+     context_mask: torch.Tensor | None = None
+     attention_mask: torch.Tensor | None = None
+
+
+ def to_denoised(
+     sample: torch.Tensor,
+     velocity: torch.Tensor,
+     sigma: float | torch.Tensor,
+     calc_dtype: torch.dtype = torch.float32,
+ ) -> torch.Tensor:
+     """
+     Convert a sample and its predicted denoising velocity to the denoised sample:
+     ``denoised = sample - velocity * sigma``.
+
+     Returns:
+         Denoised sample, cast back to the dtype of `sample`.
+     """
+     if isinstance(sigma, torch.Tensor):
+         sigma = sigma.to(calc_dtype)
+     return (sample.to(calc_dtype) - velocity.to(calc_dtype) * sigma).to(sample.dtype)
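This is the standard rectified-flow identity: assuming the noisy sample lies on the linear path `x_sigma = (1 - sigma) * x0 + sigma * noise` and the model predicts the velocity `v = noise - x0`, then `x_sigma - sigma * v` recovers `x0` exactly. A numeric check with synthetic tensors (not model outputs):

```python
import torch

x0 = torch.randn(1, 128, 16, 22, 38)  # clean latent, illustrative shape
noise = torch.randn_like(x0)
sigma = 0.7

x_sigma = (1 - sigma) * x0 + sigma * noise  # sample on the linear path
velocity = noise - x0                       # flow-matching velocity target

assert torch.allclose(to_denoised(x_sigma, velocity, sigma), x0, atol=1e-5)
```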
+
+
+ class Patchifier(Protocol):
+     """
+     Protocol for patchifiers that convert latent tensors into patches and assemble them back.
+     """
+
+     def patchify(
+         self,
+         latents: torch.Tensor,
+     ) -> torch.Tensor:
+         """
+         Convert latent tensors into flattened patch tokens.
+
+         Args:
+             latents: Latent tensor to patchify.
+
+         Returns:
+             Flattened patch tokens tensor.
+         """
+         ...
+
+     def unpatchify(
+         self,
+         latents: torch.Tensor,
+         output_shape: AudioLatentShape | VideoLatentShape,
+     ) -> torch.Tensor:
+         """
+         Convert flattened patch tokens back into a dense latent tensor.
+
+         Args:
+             latents: Patch tokens to be rearranged back into the latent grid produced by `patchify`.
+             output_shape: Shape of the output tensor, either an AudioLatentShape or a
+                 VideoLatentShape.
+
+         Returns:
+             Dense latent tensor restored from the flattened representation.
+         """
+         ...
+
+     @property
+     def patch_size(self) -> Tuple[int, int, int]:
+         """
+         The patch size as a tuple of (temporal, height, width) dimensions.
+         """
+         ...
+
+     def get_patch_grid_bounds(
+         self,
+         output_shape: AudioLatentShape | VideoLatentShape,
+         device: torch.device | None = None,
+     ) -> torch.Tensor:
+         """
+         Compute metadata describing where each latent patch resides within the
+         grid specified by `output_shape`.
+
+         Args:
+             output_shape: Target grid layout for the patches.
+             device: Target device for the returned tensor.
+
+         Returns:
+             Tensor containing patch coordinate metadata such as spatial or temporal intervals.
+         """
+         ...
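For intuition, a minimal sketch of a conforming implementation (not part of this diff): a toy video patchifier with patch size (1, 2, 2) that folds each patch into the token dimension via reshape/permute, and inverts the operation in `unpatchify`:

```python
import torch

class SimpleVideoPatchifier:
    """Toy Patchifier: (B, C, F, H, W) <-> (B, T, D) with
    T = (F/pt)*(H/ph)*(W/pw) tokens and D = C*pt*ph*pw features."""

    def __init__(self, patch_size: tuple[int, int, int] = (1, 2, 2)) -> None:
        self._patch_size = patch_size

    @property
    def patch_size(self) -> tuple[int, int, int]:
        return self._patch_size

    def patchify(self, latents: torch.Tensor) -> torch.Tensor:
        pt, ph, pw = self._patch_size
        b, c, f, h, w = latents.shape
        # Split each axis into (grid, patch) pairs, then flatten the grid into tokens.
        x = latents.reshape(b, c, f // pt, pt, h // ph, ph, w // pw, pw)
        x = x.permute(0, 2, 4, 6, 1, 3, 5, 7)  # (B, F', H', W', C, pt, ph, pw)
        return x.reshape(b, (f // pt) * (h // ph) * (w // pw), c * pt * ph * pw)

    def unpatchify(self, latents: torch.Tensor, output_shape: VideoLatentShape) -> torch.Tensor:
        pt, ph, pw = self._patch_size
        b, c, f, h, w = output_shape.to_torch_shape()
        # Undo the flattening, then interleave the (grid, patch) pairs back together.
        x = latents.reshape(b, f // pt, h // ph, w // pw, c, pt, ph, pw)
        x = x.permute(0, 4, 1, 5, 2, 6, 3, 7)  # (B, C, F', pt, H', ph, W', pw)
        return x.reshape(b, c, f, h, w)
```

Round-tripping holds whenever the latent grid divides evenly by the patch size: `unpatchify(patchify(x), VideoLatentShape.from_torch_shape(x.shape))` returns `x`.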
+
+
+ def get_pixel_coords(
+     latent_coords: torch.Tensor,
+     scale_factors: SpatioTemporalScaleFactors,
+     causal_fix: bool = False,
+ ) -> torch.Tensor:
+     """
+     Map latent-space `[start, end)` coordinates to their pixel-space equivalents by scaling
+     each axis (frame/time, height, width) with the corresponding VAE downsampling factors.
+     Optionally compensate for causal encoding that keeps the first frame at unit temporal scale.
+
+     Args:
+         latent_coords: Tensor of latent bounds shaped `(batch, 3, num_patches, 2)`.
+         scale_factors: SpatioTemporalScaleFactors with integer scale factors applied per axis.
+         causal_fix: When True, rewrites the temporal axis of the first frame so causal VAEs
+             that treat frame zero differently still yield non-negative timestamps.
+     """
+     # Broadcast the VAE scale factors so they align with the `(batch, axis, patch, bound)` layout.
+     # The axis dimension is ordered (frame/time, height, width), while the fields of
+     # SpatioTemporalScaleFactors are ordered (time, width, height), so the factors are
+     # listed explicitly rather than passing the NamedTuple directly.
+     broadcast_shape = [1] * latent_coords.ndim
+     broadcast_shape[1] = -1  # axis dimension corresponds to (frame/time, height, width)
+     scale_tensor = torch.tensor(
+         [scale_factors.time, scale_factors.height, scale_factors.width],
+         device=latent_coords.device,
+     ).view(*broadcast_shape)
+
+     # Apply per-axis scaling to convert latent bounds into pixel-space coordinates.
+     pixel_coords = latent_coords * scale_tensor
+
+     if causal_fix:
+         # The VAE temporal stride for the very first frame is 1 instead of `scale_factors.time`.
+         # Shift and clamp to keep the first-frame timestamps causal and non-negative.
+         pixel_coords[:, 0, ...] = (pixel_coords[:, 0, ...] + 1 - scale_factors.time).clamp(min=0)
+
+     return pixel_coords
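A worked example of the causal fix with the default (8, 32, 32) factors (illustrative coordinates): the first latent frame covers pixel frames [0, 8) after naive scaling, but a causal VAE emits only one pixel frame for it, so the temporal bounds shift by `1 - 8 = -7` and clamp at zero:

```python
import torch

# One patch with bounds [0, 1) on each latent axis: shape (batch=1, axis=3, patches=1, bounds=2)
latent_coords = torch.tensor([[[[0, 1]], [[0, 1]], [[0, 1]]]])

coords = get_pixel_coords(latent_coords, VIDEO_SCALE_FACTORS, causal_fix=True)
assert coords[0, 0, 0].tolist() == [0, 1]   # time: [0, 8) shifted by -7, clamped -> [0, 1)
assert coords[0, 1, 0].tolist() == [0, 32]  # height: scaled by 32
assert coords[0, 2, 0].tolist() == [0, 32]  # width: scaled by 32
```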