Arrokothwhi committed
Commit 0d0c9d9 · 1 Parent(s): d77af5d

Add I2V demo
README.md CHANGED
@@ -1,13 +1,23 @@
  ---
- title: Test
- emoji: 💻
- colorFrom: red
- colorTo: gray
+ title: RefDecoder I2V Demo
+ emoji: 🎬
+ colorFrom: green
+ colorTo: blue
  sdk: gradio
  sdk_version: 6.14.0
- python_version: '3.12'
+ python_version: "3.10"
  app_file: app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # RefDecoder I2V Demo
+
+ This Space:
+
+ 1. Generates Wan I2V latents from an input image and prompt
+ 2. Saves the latent tensor as a `.pt` file
+ 3. Decodes the same latents with the Wan VAE
+ 4. Decodes the same latents with RefDecoder
+
+ The RefDecoder checkpoint is downloaded at runtime from:
+ `Arrokothwhi/RefDecoder` -> `I2V_Wan2.1/model.pt`
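+
+ For reference, the saved latents file is a plain `torch.save` dictionary, so it
+ can be inspected offline. A minimal sketch (the keys below are the ones
+ `app.py` writes):
+
+ ```python
+ import torch
+
+ data = torch.load("wan_latents.pt", map_location="cpu")
+ print(data["latents"].shape)  # [B, C, T, H, W] latent tensor
+ print(data["seed"], data["height"], data["width"], data["prompt"])
+ ```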
app.py ADDED
@@ -0,0 +1,301 @@
+ import random
+ import sys
+ import tempfile
+ from functools import lru_cache
+ from pathlib import Path
+
+ import gradio as gr
+ import imageio
+ import numpy as np
+ import torch
+ from diffusers import AutoencoderKLWan as DiffusersWanVAE
+ from diffusers import WanImageToVideoPipeline
+ from huggingface_hub import hf_hub_download
+ from transformers import CLIPVisionModel
+
+ # Make the repo root importable before importing the local src package.
+ ROOT = Path(__file__).resolve().parent
+ if str(ROOT) not in sys.path:
+     sys.path.insert(0, str(ROOT))
+
+ from src.models.Wan.autoencoder_wanT import AutoencoderKLWan
+ from src.models.Wan.transformer_wan import WanDecoderTransformer
+
+ MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
+ REFDECODER_REPO_ID = "Arrokothwhi/RefDecoder"
+ REFDECODER_CKPT_PATH_IN_REPO = "I2V_Wan2.1/model.pt"
+ NEGATIVE_PROMPT = (
+     "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, "
+     "images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, "
+     "incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, "
+     "misshapen limbs, fused fingers, still picture, messy background, three legs, many people "
+     "in the background, walking backwards"
+ )
+ TARGET_AREA = 480 * 832
+ FPS = 16
+ NUM_FRAMES = 17
+ NUM_INFERENCE_STEPS = 50
+ GUIDANCE_SCALE = 5.0
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ PIPE_DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32
+
+
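+ # With the Wan 2.1 VAE's 4x temporal compression, NUM_FRAMES = 17 corresponds
+ # to (17 - 1) / 4 + 1 = 5 latent frames; this appears to line up with the
+ # chunk=5 passed to WanDecoderTransformer below (an assumption based on the
+ # standard Wan 2.1 temporal stride, not stated explicitly in this repo).
+
+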
+ @lru_cache(maxsize=1)
+ def get_generation_pipe():
+     image_encoder = CLIPVisionModel.from_pretrained(
+         MODEL_ID,
+         subfolder="image_encoder",
+         torch_dtype=torch.float32,
+     )
+     vae = DiffusersWanVAE.from_pretrained(
+         MODEL_ID,
+         subfolder="vae",
+         torch_dtype=torch.float32,
+     )
+     pipe = WanImageToVideoPipeline.from_pretrained(
+         MODEL_ID,
+         vae=vae,
+         image_encoder=image_encoder,
+         torch_dtype=PIPE_DTYPE,
+     )
+     if DEVICE == "cuda":
+         pipe.enable_model_cpu_offload()
+     else:
+         pipe = pipe.to(DEVICE)
+     return pipe
+
+
+ @lru_cache(maxsize=1)
+ def get_wan_vae():
+     vae = DiffusersWanVAE.from_pretrained(
+         MODEL_ID,
+         subfolder="vae",
+         torch_dtype=torch.float32,
+     )
+     vae = vae.to(DEVICE)
+     vae.eval()
+     return vae
+
+
+ @lru_cache(maxsize=1)
+ def get_refdecoder_module():
+     vae = AutoencoderKLWan(
+         dropout_p=0.0,
+         use_reference=True,
+     ).eval()
+     transformer = WanDecoderTransformer(
+         chunk=5,
+         num_layers=10,
+         num_heads=12,
+         head_dim=128,
+         reusing=True,
+         pretrained=False,
+     ).eval()
+
+     ckpt_path = hf_hub_download(
+         repo_id=REFDECODER_REPO_ID,
+         filename=REFDECODER_CKPT_PATH_IN_REPO,
+     )
+     checkpoint = torch.load(ckpt_path, map_location="cpu")
+     state_dict = checkpoint.get("state_dict", checkpoint.get("module", checkpoint))
+
+     vae_sd = {}
+     transformer_sd = {}
+     for key, value in state_dict.items():
+         if key.startswith("vae."):
+             vae_sd[key[len("vae.") :]] = value
+         elif key.startswith("transformer."):
+             transformer_sd[key[len("transformer.") :]] = value
+
+     vae.load_state_dict(vae_sd, strict=False)
+     transformer.load_state_dict(transformer_sd, strict=False)
+
+     vae = vae.to(DEVICE).eval()
+     transformer = transformer.to(DEVICE).eval()
+     return vae, transformer
+
+
+ def resize_image_for_wan(image, pipe):
+     image = image.convert("RGB")
+     aspect_ratio = image.height / image.width
+     mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+     height = round(np.sqrt(TARGET_AREA * aspect_ratio)) // mod_value * mod_value
+     width = round(np.sqrt(TARGET_AREA / aspect_ratio)) // mod_value * mod_value
+     resized = image.resize((width, height))
+     return resized, height, width
+
+
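+ # Worked example (assuming Wan 2.1's vae_scale_factor_spatial = 8 and a
+ # transformer patch_size[1] of 2, i.e. mod_value = 16): a 1280x720 input has
+ # aspect_ratio = 0.5625, so height = round(sqrt(399360 * 0.5625)) // 16 * 16
+ # = 464 and width = round(sqrt(399360 / 0.5625)) // 16 * 16 = 832.
+
+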
+ def build_reference_frame(image, device):
+     ref_array = np.asarray(image).astype(np.float32)
+     ref_tensor = torch.from_numpy(ref_array).permute(2, 0, 1)
+     ref_tensor = (ref_tensor / 255.0 - 0.5) * 2.0
+     return ref_tensor.unsqueeze(0).unsqueeze(2).to(device=device, dtype=torch.float32)
+
+
+ def normalize_latent_shape(latents):
+     if isinstance(latents, list):
+         latents = latents[0]
+     if latents.ndim == 4:
+         latents = latents.unsqueeze(0)
+     if latents.ndim != 5:
+         raise ValueError(f"Expected latent shape [B,C,T,H,W], got {tuple(latents.shape)}")
+     return latents
+
+
+ def save_video_tensor(video_tensor, output_path):
+     video = (video_tensor / 2 + 0.5).clamp(0, 1)
+     video = video.squeeze(0).permute(1, 2, 3, 0).detach().cpu().float().numpy()
+     video = (video * 255).astype(np.uint8)
+     imageio.mimwrite(output_path, video, fps=FPS, quality=10)
+     return str(output_path)
+
+
+ def decode_with_wan_vae(latents):
+     vae = get_wan_vae()
+     latents = latents.to(device=DEVICE, dtype=torch.float32)
+     latents_mean = torch.tensor(vae.config.latents_mean, device=DEVICE, dtype=torch.float32).view(1, -1, 1, 1, 1)
+     latents_std = torch.tensor(vae.config.latents_std, device=DEVICE, dtype=torch.float32).view(1, -1, 1, 1, 1)
+     latents = latents * latents_std + latents_mean
+     with torch.no_grad():
+         video = vae.decode(latents, return_dict=False)[0]
+     return video
+
+
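+ # Both decode paths first undo the pipeline's per-channel latent normalization:
+ # the diffusion latents are stored as z_hat = (z - mean) / std, so the VAE needs
+ # z = z_hat * std + mean before decoding. This appears to mirror what the
+ # Diffusers Wan pipelines do internally when they decode to pixels themselves.
+
+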
+ def decode_with_refdecoder(latents, reference_frame):
+     vae, transformer = get_refdecoder_module()
+     latents = latents.to(device=DEVICE, dtype=torch.float32)
+     latents_mean = torch.tensor(
+         vae.config.latents_mean,
+         device=DEVICE,
+         dtype=torch.float32,
+     ).view(1, -1, 1, 1, 1)
+     latents_std = torch.tensor(
+         vae.config.latents_std,
+         device=DEVICE,
+         dtype=torch.float32,
+     ).view(1, -1, 1, 1, 1)
+     latents = latents * latents_std + latents_mean
+     with torch.no_grad():
+         video = vae.decode(
+             latents,
+             transformer,
+             return_dict=True,
+             reference_frame=reference_frame,
+             skip=False,
+             window_size=-1,
+         ).sample
+     if hasattr(vae, "clear_cache"):
+         vae.clear_cache()
+     return video
+
+
+ def generate_and_decode(image, prompt, seed, progress=gr.Progress(track_tqdm=False)):
+     if image is None:
+         raise gr.Error("Please upload an input image.")
+     if not prompt or not prompt.strip():
+         raise gr.Error("Please enter a prompt.")
+     if DEVICE != "cuda":
+         raise gr.Error("This demo expects a CUDA GPU to run Wan I2V generation.")
+
+     seed = int(seed) if seed is not None else random.randint(0, 2**32 - 1)
+     run_dir = Path(tempfile.mkdtemp(prefix="refdecoder_demo_"))
+
+     progress(0.05, desc="Loading Wan I2V pipeline")
+     pipe = get_generation_pipe()
+
+     progress(0.15, desc="Preparing image")
+     resized_image, height, width = resize_image_for_wan(image, pipe)
+     reference_frame = build_reference_frame(resized_image, DEVICE)
+     generator = torch.Generator(device=DEVICE).manual_seed(seed)
+
+     progress(0.3, desc="Generating latent video")
+     with torch.no_grad():
+         output = pipe(
+             image=resized_image,
+             prompt=prompt.strip(),
+             negative_prompt=NEGATIVE_PROMPT,
+             height=height,
+             width=width,
+             num_frames=NUM_FRAMES,
+             num_inference_steps=NUM_INFERENCE_STEPS,
+             guidance_scale=GUIDANCE_SCALE,
+             generator=generator,
+             output_type="latent",
+         )
+     latents = normalize_latent_shape(output.frames).detach().cpu()
+
+     latent_path = run_dir / "wan_latents.pt"
+     torch.save(
+         {
+             "latents": latents,
+             "height": height,
+             "width": width,
+             "prompt": prompt.strip(),
+             "seed": seed,
+         },
+         latent_path,
+     )
+
+     progress(0.65, desc="Decoding with Wan VAE")
+     wan_video = decode_with_wan_vae(latents)
+     wan_video_path = save_video_tensor(wan_video, run_dir / "wan_vae.mp4")
+
+     progress(0.82, desc="Decoding with RefDecoder")
+     ref_video = decode_with_refdecoder(latents, reference_frame)
+     ref_video_path = save_video_tensor(ref_video, run_dir / "refdecoder.mp4")
+
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+
+     status = (
+         f"Seed: {seed}\n"
+         f"Resolution: {width}x{height}\n"
+         f"Frames: {NUM_FRAMES}\n"
+         f"Latents: {tuple(latents.shape)}"
+     )
+     progress(1.0, desc="Done")
+     return str(latent_path), wan_video_path, ref_video_path, status
+
+
+ with gr.Blocks(title="RefDecoder I2V Demo") as demo:
+     gr.Markdown(
+         """
+         # RefDecoder I2V Demo
+         Upload one image and one prompt. The app generates Wan I2V latents once, then decodes the same latents with:
+         1. Wan's original VAE
+         2. RefDecoder (checkpoint `Arrokothwhi/RefDecoder` -> `I2V_Wan2.1/model.pt`)
+         """
+     )
+
+     with gr.Row():
+         image_input = gr.Image(label="Input Image", type="pil")
+         with gr.Column():
+             prompt_input = gr.Textbox(
+                 label="Prompt",
+                 lines=4,
+                 placeholder="Describe the motion you want to generate...",
+             )
+             seed_input = gr.Number(
+                 label="Seed",
+                 value=0,
+                 precision=0,
+                 info="Use a fixed seed for reproducible results.",
+             )
+             run_button = gr.Button("Generate and Decode", variant="primary")
+
+     with gr.Row():
+         latent_output = gr.File(label="Wan Latents (.pt)")
+         status_output = gr.Textbox(label="Run Info")
+
+     with gr.Row():
+         wan_video_output = gr.Video(label="Wan VAE Decode")
+         ref_video_output = gr.Video(label="RefDecoder Decode")
+
+     run_button.click(
+         fn=generate_and_decode,
+         inputs=[image_input, prompt_input, seed_input],
+         outputs=[latent_output, wan_video_output, ref_video_output, status_output],
+     )
+
+
+ if __name__ == "__main__":
+     demo.queue(max_size=2).launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ gradio==6.14.0
+ imageio==2.37.0
+ numpy==1.26.4
+ torch==2.7.0
+ transformers==4.56.2
+ diffusers==0.36.0
+ accelerate==1.10.1
+ einops==0.8.1
+ sentencepiece==0.2.1
+ safetensors==0.6.2
+ peft==0.18.0
+ huggingface-hub==0.34.4
src/__init__.py ADDED
@@ -0,0 +1 @@
+
src/models/Wan/__init__.py ADDED
@@ -0,0 +1 @@
+
src/models/Wan/autoencoder_wanT.py ADDED
@@ -0,0 +1,1916 @@
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.utils.checkpoint
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.loaders import FromOriginalModelMixin
+ from diffusers.models.autoencoders.autoencoder_kl import (
+     AutoencoderKLOutput,
+     DecoderOutput,
+     DiagonalGaussianDistribution,
+ )
+ from diffusers.models.embeddings import get_1d_rotary_pos_embed
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.utils import logging
+ from diffusers.utils.accelerate_utils import apply_forward_hook
+ from einops import rearrange
+
+ _ACTS = {
+     "silu": nn.SiLU,
+     "swish": nn.SiLU,
+     "gelu": nn.GELU,
+     "relu": nn.ReLU,
+     "mish": nn.Mish,
+     "tanh": nn.Tanh,
+     "sigmoid": nn.Sigmoid,
+     "identity": nn.Identity,
+     "none": nn.Identity,
+ }
+
+
+ def resolve_activation(x):
+     if x is None:
+         return nn.Identity()
+     if isinstance(x, nn.Module):
+         return x
+     name = str(x).strip().lower()
+     if name in _ACTS:
+         return _ACTS[name]()
+     if name in ("lrelu", "leaky_relu"):
+         return nn.LeakyReLU(0.01)
+     raise ValueError(f"Unknown activation: {x}")
+
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+ CACHE_T = 0
+ LATENT_T_STRIDE = 100
+ GRADIENT_CHECKPOINTING = False
+
+
+ class AvgDown3D(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         out_channels,
+         factor_t,
+         factor_s=1,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.factor_t = factor_t
+         self.factor_s = factor_s
+         self.factor = self.factor_t * self.factor_s * self.factor_s
+
+         assert in_channels * self.factor % out_channels == 0
+         self.group_size = in_channels * self.factor // out_channels
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
+         pad = (0, 0, 0, 0, pad_t, 0)
+         x = F.pad(x, pad)
+         B, C, T, H, W = x.shape
+         x = x.view(
+             B,
+             C,
+             T // self.factor_t,
+             self.factor_t,
+             H // self.factor_s,
+             self.factor_s,
+             W // self.factor_s,
+             self.factor_s,
+         )
+         x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
+         x = x.view(
+             B,
+             C * self.factor,
+             T // self.factor_t,
+             H // self.factor_s,
+             W // self.factor_s,
+         )
+         x = x.view(
+             B,
+             self.out_channels,
+             self.group_size,
+             T // self.factor_t,
+             H // self.factor_s,
+             W // self.factor_s,
+         )
+         x = x.mean(dim=2)
+         return x
+
+
+ class DupUp3D(nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         factor_t,
+         factor_s=1,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+
+         self.factor_t = factor_t
+         self.factor_s = factor_s
+         self.factor = self.factor_t * self.factor_s * self.factor_s
+
+         assert out_channels * self.factor % in_channels == 0
+         self.repeats = out_channels * self.factor // in_channels
+
+     def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
+         x = x.repeat_interleave(self.repeats, dim=1)
+         x = x.view(
+             x.size(0),
+             self.out_channels,
+             self.factor_t,
+             self.factor_s,
+             self.factor_s,
+             x.size(2),
+             x.size(3),
+             x.size(4),
+         )
+         x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
+         x = x.view(
+             x.size(0),
+             self.out_channels,
+             x.size(2) * self.factor_t,
+             x.size(4) * self.factor_s,
+             x.size(6) * self.factor_s,
+         )
+         if first_chunk:
+             x = x[:, :, self.factor_t - 1 :, :, :]
+         return x
+
+
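+ # AvgDown3D and DupUp3D are (approximate) inverses built purely from reshapes:
+ # AvgDown3D folds factor_t * factor_s * factor_s pixels into channels and then
+ # averages channel groups down to out_channels, while DupUp3D repeats channels
+ # and unfolds them back into time/space. For example, with in_channels=16,
+ # out_channels=32, factor_t=2, factor_s=2 (factor=8), AvgDown3D maps
+ # [B, 16, T, H, W] -> [B, 128, T/2, H/2, W/2] -> group mean (group_size =
+ # 16 * 8 // 32 = 4) -> [B, 32, T/2, H/2, W/2].
+
+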
+ class WanCausalConv3d(nn.Conv3d):
+     r"""
+     A custom 3D causal convolution layer with feature caching support.
+
+     This layer extends the standard Conv3D layer by ensuring causality in the time dimension and handling feature
+     caching for efficient inference.
+
+     Args:
+         in_channels (int): Number of channels in the input image
+         out_channels (int): Number of channels produced by the convolution
+         kernel_size (int or tuple): Size of the convolving kernel
+         stride (int or tuple, optional): Stride of the convolution. Default: 1
+         padding (int or tuple, optional): Zero-padding added to all three sides of the input. Default: 0
+     """
+
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         kernel_size: Union[int, Tuple[int, int, int]],
+         stride: Union[int, Tuple[int, int, int]] = 1,
+         padding: Union[int, Tuple[int, int, int]] = 0,
+     ) -> None:
+         super().__init__(
+             in_channels=in_channels,
+             out_channels=out_channels,
+             kernel_size=kernel_size,
+             stride=stride,
+             padding=padding,
+         )
+
+         # Set up causal padding
+         self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
+         self.padding = (0, 0, 0)
+
+     def forward(self, x, cache_x=None, mode=None):
+         padding = list(self._padding)
+         if cache_x is not None and self._padding[4] > 0:
+             cache_x = cache_x.to(x.device)
+             x = torch.cat([cache_x, x], dim=2)
+             padding[4] -= cache_x.shape[2]
+
+         if mode == 'upsample3d':
+             # x: BCTHW
+             assert self.stride[0] == 1 and self.stride[1] == 1 and self.stride[2] == 1
+             assert self.kernel_size[0] == 3
+
+             assert padding[0] == padding[1] and padding[2] == padding[3]
+
+             results = []
+             for i in range(x.shape[2] if padding[-2] == 2 else x.shape[2] - 1):
+                 if padding[-2] == 2:
+                     if i == 0:
+                         out = F.conv3d(x[:, :, 0:1, :, :], self.weight, self.bias, self.stride, (2, padding[2], padding[0]))[:, :, :-2]  # BC1HW
+                     elif i == 1:
+                         out = F.conv3d(x[:, :, 0:2, :, :], self.weight, self.bias, self.stride, (1, padding[2], padding[0]))[:, :, :-1]  # BC1HW
+                     else:
+                         out = F.conv3d(x[:, :, i - 2 : i - 2 + self.kernel_size[0], :, :], self.weight, self.bias, self.stride, (0, padding[2], padding[0]))  # BC1HW
+                 elif padding[-2] == 1:
+                     if i == 0:
+                         out = F.conv3d(x[:, :, 0:2, :, :], self.weight, self.bias, self.stride, (1, padding[2], padding[0]))[:, :, :-1]  # BC1HW
+                     else:
+                         out = F.conv3d(x[:, :, i - 1 : i - 1 + self.kernel_size[0], :, :], self.weight, self.bias, self.stride, (0, padding[2], padding[0]))  # BC1HW
+                 else:
+                     raise ValueError("Invalid padding for causal conv3d in upsample3d mode.")
+                 results.append(out)
+
+             if not results:
+                 raise RuntimeError("Causal conv3d produced no output frames in upsample3d mode.")
+
+             return torch.cat(results, dim=2)  # BCTHW
+
+         x = F.pad(x, padding)
+         return super().forward(x)
+
+
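+ # Causality comes from the asymmetric temporal padding set up in __init__: for
+ # a temporal kernel of 3 (padding[0] = 1), _padding pads 2 frames on the left
+ # of the time axis and 0 on the right, so each output frame t depends only on
+ # input frames <= t. The cache_x argument lets chunked inference substitute the
+ # previous chunk's trailing frames for that left padding.
+
+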
+ class WanRMS_norm(nn.Module):
+     r"""
+     A custom RMS normalization layer.
+
+     Args:
+         dim (int): The number of dimensions to normalize over.
+         channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
+             Default is True.
+         images (bool, optional): Whether the input represents image data. Default is True.
+         bias (bool, optional): Whether to include a learnable bias term. Default is False.
+     """
+
+     def __init__(self, dim: int, channel_first: bool = True, images: bool = True, bias: bool = False) -> None:
+         super().__init__()
+         broadcastable_dims = (1, 1, 1) if not images else (1, 1)
+         shape = (dim, *broadcastable_dims) if channel_first else (dim,)
+
+         self.channel_first = channel_first
+         self.scale = dim**0.5
+         self.gamma = nn.Parameter(torch.ones(shape))
+         self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
+
+     def forward(self, x):
+         return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
+
+
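+ # Equivalent formula: F.normalize divides by the L2 norm along the channel dim,
+ # and multiplying by scale = sqrt(dim) turns that into RMS normalization,
+ #     y = x / ||x||_2 * sqrt(dim) * gamma + bias = x / RMS(x) * gamma + bias,
+ # where RMS(x) = sqrt(mean_c(x_c^2)).
+
+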
+ class WanUpsample(nn.Upsample):
+     r"""
+     Perform upsampling while ensuring the output tensor has the same data type as the input.
+
+     Args:
+         x (torch.Tensor): Input tensor to be upsampled.
+
+     Returns:
+         torch.Tensor: Upsampled tensor with the same data type as the input.
+     """
+
+     def forward(self, x):
+         return super().forward(x.float()).type_as(x)
+
+
+ class WanResample(nn.Module):
+     r"""
+     A custom resampling module for 2D and 3D data.
+
+     Args:
+         dim (int): The number of input/output channels.
+         mode (str): The resampling mode. Must be one of:
+             - 'none': No resampling (identity operation).
+             - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
+             - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
+             - 'downsample2d': 2D downsampling with zero-padding and convolution.
+             - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
+     """
+
+     def __init__(self, dim: int, mode: str, upsample_out_dim: Optional[int] = None) -> None:
+         super().__init__()
+         self.dim = dim
+         self.mode = mode
+
+         # default to dim // 2
+         if upsample_out_dim is None:
+             upsample_out_dim = dim // 2
+
+         # layers
+         if mode == "upsample2d":
+             self.resample = nn.Sequential(
+                 WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                 nn.Conv2d(dim, upsample_out_dim, 3, padding=1),
+             )
+         elif mode == "upsample3d":
+             self.resample = nn.Sequential(
+                 WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
+                 nn.Conv2d(dim, upsample_out_dim, 3, padding=1),
+             )
+             self.time_conv = WanCausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
+
+         elif mode == "downsample2d":
+             self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+         elif mode == "downsample3d":
+             self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
+             self.time_conv = WanCausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
+
+         else:
+             self.resample = nn.Identity()
+
+     def forward(self, x, feat_cache=None, feat_idx=[0], is_reference=False, first_chunk=False):
+         b, c, t, h, w = x.size()
+
+         if self.mode == "upsample3d":
+             if feat_cache is not None and not is_reference:
+                 # Latent frames: full caching logic
+                 idx = feat_idx[0]
+
+                 if feat_cache[idx] is None:
+                     if t <= 1:
+                         feat_cache[idx] = "Rep"
+                         feat_idx[0] += 1
+                     else:
+                         subseq = x[:, :, 1:]
+                         cache_x = subseq[:, :, -CACHE_T:, :, :].clone() if CACHE_T > 0 else subseq[:, :, :0, :, :]
+                         if cache_x.shape[2] < 2:
+                             cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
+
+                         subseq = self.time_conv(subseq, mode=self.mode)
+
+                         feat_cache[idx] = cache_x
+                         feat_idx[0] += 1
+
+                         subseq = subseq.reshape(b, 2, c, t - 1, h, w)
+                         subseq = torch.stack((subseq[:, 0, :, :, :, :], subseq[:, 1, :, :, :, :]), 3)
+                         subseq = subseq.reshape(b, c, (t - 1) * 2, h, w)
+                         x = torch.cat([x[:, :, :1, :, :], subseq], dim=2)
+                 else:
+                     cache_x = x[:, :, -CACHE_T:, :, :].clone() if CACHE_T > 0 else x[:, :, :0, :, :]
+                     if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
+                         cache_x = torch.cat([feat_cache[idx][:, :, -1:, :, :].to(cache_x.device), cache_x], dim=2)
+                     if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
+                         cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
+
+                     if feat_cache[idx] == "Rep":
+                         x = self.time_conv(x, mode=self.mode)
+                     else:
+                         x = self.time_conv(x, feat_cache[idx], mode=self.mode)
+
+                     feat_cache[idx] = cache_x
+                     feat_idx[0] += 1
+
+                     x = x.reshape(b, 2, c, t, h, w)
+                     x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
+                     x = x.reshape(b, c, t * 2, h, w)
+
+         # Spatial resampling (applies to all paths)
+         t = x.shape[2]
+         x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
+         x = self.resample(x)
+         x = x.view(b, t, x.size(1), x.size(2), x.size(3)).permute(0, 2, 1, 3, 4)
+
+         if self.mode == "downsample3d":
+             if feat_cache is not None and not is_reference:
+                 idx = feat_idx[0]
+                 if feat_cache[idx] is None:
+                     if t <= 1:
+                         feat_cache[idx] = x.clone()
+                         feat_idx[0] += 1
+                     else:
+                         subseq = x[:, :, 1:]
+                         cache_x = subseq[:, :, -1:, :, :].clone()
+                         subseq = self.time_conv(x)
+                         x = torch.cat([x[:, :, :1, :, :], subseq], dim=2)
+                         feat_cache[idx] = cache_x
+                         feat_idx[0] += 1
+                 else:
+                     cache_x = x[:, :, -1:, :, :].clone()
+                     x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
+                     feat_cache[idx] = cache_x
+                     feat_idx[0] += 1
+         return x
+
+
+ class WanResidualBlock(nn.Module):
+     r"""
+     A custom residual block module.
+
+     Args:
+         in_dim (int): Number of input channels.
+         out_dim (int): Number of output channels.
+         dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
+         non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
+     """
+
+     def __init__(
+         self,
+         in_dim: int,
+         out_dim: int,
+         dropout: float = 0.0,
+         non_linearity: str = "silu",
+     ) -> None:
+         super().__init__()
+         self.in_dim = in_dim
+         self.out_dim = out_dim
+         self.nonlinearity = resolve_activation(non_linearity)
+
+         # layers
+         self.norm1 = WanRMS_norm(in_dim, images=False)
+         self.conv1 = WanCausalConv3d(in_dim, out_dim, 3, padding=1)
+         self.norm2 = WanRMS_norm(out_dim, images=False)
+         self.dropout = nn.Dropout(dropout)
+         self.conv2 = WanCausalConv3d(out_dim, out_dim, 3, padding=1)
+         self.conv_shortcut = WanCausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()
+
+     def forward(self, x, feat_cache=None, feat_idx=[0]):
+         # Apply shortcut connection
+         h = self.conv_shortcut(x)
+
+         # First normalization and activation
+         x = self.norm1(x)
+         x = self.nonlinearity(x)
+
+         if feat_cache is not None:
+             idx = feat_idx[0]
+             cache_x = x[:, :, -CACHE_T:, :, :].clone() if CACHE_T > 0 else x[:, :, :0, :, :]
+             if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                 cache_x = torch.cat([feat_cache[idx][:, :, -1:, :, :].to(cache_x.device), cache_x], dim=2)
+
+             x = self.conv1(x, feat_cache[idx], mode='upsample3d')
+             feat_cache[idx] = cache_x
+             feat_idx[0] += 1
+         else:
+             x = self.conv1(x, mode='upsample3d')
+
+         # Second normalization and activation
+         x = self.norm2(x)
+         x = self.nonlinearity(x)
+
+         # Dropout
+         x = self.dropout(x)
+
+         if feat_cache is not None:
+             idx = feat_idx[0]
+             cache_x = x[:, :, -CACHE_T:, :, :].clone() if CACHE_T > 0 else x[:, :, :0, :, :]
+             if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                 cache_x = torch.cat([feat_cache[idx][:, :, -1:, :, :].to(cache_x.device), cache_x], dim=2)
+
+             x = self.conv2(x, feat_cache[idx], mode='upsample3d')
+             feat_cache[idx] = cache_x
+             feat_idx[0] += 1
+         else:
+             x = self.conv2(x, mode='upsample3d')
+
+         # Add residual connection
+         return x + h
+
+
+ class WanAttentionBlock(nn.Module):
+     """
+     Per-frame spatial self-attention with a single head.
+
+     Args:
+         dim (int): The number of channels in the input tensor.
+     """
+
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+
+         # layers
+         self.norm = WanRMS_norm(dim)
+         self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
+         self.proj = nn.Conv2d(dim, dim, 1)
+
+     def forward(self, x):
+         identity = x
+         batch_size, channels, time, height, width = x.size()
+
+         x = x.permute(0, 2, 1, 3, 4).reshape(batch_size * time, channels, height, width)
+         x = self.norm(x)
+
+         # compute query, key, value
+         qkv = self.to_qkv(x)
+         qkv = qkv.reshape(batch_size * time, 1, channels * 3, -1)
+         qkv = qkv.permute(0, 1, 3, 2).contiguous()
+         q, k, v = qkv.chunk(3, dim=-1)
+
+         # apply attention
+         x = F.scaled_dot_product_attention(q, k, v)
+
+         x = x.squeeze(1).permute(0, 2, 1).reshape(batch_size * time, channels, height, width)
+
+         # output projection
+         x = self.proj(x)
+
+         # Reshape back: [(b*t), c, h, w] -> [b, c, t, h, w]
+         x = x.view(batch_size, time, channels, height, width)
+         x = x.permute(0, 2, 1, 3, 4)
+
+         return x + identity
+
+
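+ # Shape walkthrough for WanAttentionBlock: the [B, C, T, H, W] input is folded
+ # into [(B*T), C, H, W], so attention runs independently per frame over the
+ # H*W spatial positions with a single head of width C; there is no temporal
+ # mixing in this block.
+
+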
+ class WanMidBlock(nn.Module):
+     """
+     Middle block for WanVAE encoder and decoder.
+
+     Args:
+         dim (int): Number of input/output channels.
+         dropout (float): Dropout rate.
+         non_linearity (str): Type of non-linearity to use.
+     """
+
+     def __init__(self, dim: int, dropout: float = 0.0, non_linearity: str = "silu", num_layers: int = 1):
+         super().__init__()
+         self.dim = dim
+
+         # Create the components
+         resnets = [WanResidualBlock(dim, dim, dropout, non_linearity)]
+         attentions = []
+         for _ in range(num_layers):
+             attentions.append(WanAttentionBlock(dim))
+             resnets.append(WanResidualBlock(dim, dim, dropout, non_linearity))
+         self.attentions = nn.ModuleList(attentions)
+         self.resnets = nn.ModuleList(resnets)
+
+         self.gradient_checkpointing = GRADIENT_CHECKPOINTING
+
+     def forward(self, x, feat_cache=None, feat_idx=[0]):
+         # First residual block
+         x = self.resnets[0](x, feat_cache, feat_idx)
+
+         # Process through attention and residual blocks
+         for attn, resnet in zip(self.attentions, self.resnets[1:]):
+             if attn is not None:
+                 if self.gradient_checkpointing:
+                     x = torch.utils.checkpoint.checkpoint(
+                         attn,
+                         x,
+                         use_reentrant=False,
+                     )
+                 else:
+                     x = attn(x)
+
+             if self.gradient_checkpointing and feat_cache is not None:
+                 # Save mutable state before checkpoint; it will be restored on recompute.
+                 initial_idx = feat_idx[0]
+                 initial_cache_snapshot = [
+                     (c.clone() if isinstance(c, torch.Tensor) else c)
+                     for c in feat_cache
+                 ]
+
+                 def checkpoint_fn(x, block=resnet):
+                     feat_idx[0] = initial_idx
+                     for j in range(len(feat_cache)):
+                         val = initial_cache_snapshot[j]
+                         feat_cache[j] = val.clone() if isinstance(val, torch.Tensor) else val
+                     return block(x, feat_cache, feat_idx)
+
+                 x = torch.utils.checkpoint.checkpoint(
+                     checkpoint_fn,
+                     x,
+                     use_reentrant=False,
+                 )
+             else:
+                 x = resnet(x, feat_cache, feat_idx)
+
+         return x
+
+
+ class WanResidualDownBlock(nn.Module):
+     def __init__(self, in_dim, out_dim, dropout, num_res_blocks, temperal_downsample=False, down_flag=False):
+         super().__init__()
+
+         # Shortcut path with downsample
+         self.avg_shortcut = AvgDown3D(
+             in_dim,
+             out_dim,
+             factor_t=2 if temperal_downsample else 1,
+             factor_s=2 if down_flag else 1,
+         )
+
+         # Main path with residual blocks and downsample
+         resnets = []
+         for _ in range(num_res_blocks):
+             resnets.append(WanResidualBlock(in_dim, out_dim, dropout))
+             in_dim = out_dim
+         self.resnets = nn.ModuleList(resnets)
+
+         # Add the final downsample block
+         if down_flag:
+             mode = "downsample3d" if temperal_downsample else "downsample2d"
+             self.downsampler = WanResample(out_dim, mode=mode)
+         else:
+             self.downsampler = None
+
+     def forward(self, x, feat_cache=None, feat_idx=[0]):
+         x_copy = x.clone()
+         for resnet in self.resnets:
+             x = resnet(x, feat_cache, feat_idx)
+         if self.downsampler is not None:
+             x = self.downsampler(x, feat_cache, feat_idx)
+
+         return x + self.avg_shortcut(x_copy)
+
+
+ class WanEncoder3d(nn.Module):
+     r"""
+     A 3D encoder module.
+
+     Args:
+         dim (int): The base number of channels in the first layer.
+         z_dim (int): The dimensionality of the latent space.
+         dim_mult (list of int): Multipliers for the number of channels in each block.
+         num_res_blocks (int): Number of residual blocks in each block.
+         attn_scales (list of float): Scales at which to apply attention mechanisms.
+         temperal_downsample (list of bool): Whether to downsample temporally in each block.
+         dropout (float): Dropout rate for the dropout layers.
+         non_linearity (str): Type of non-linearity to use.
+     """
+
+     def __init__(
+         self,
+         in_channels: int = 3,
+         dim=128,
+         z_dim=4,
+         dim_mult=[1, 2, 4, 4],
+         num_res_blocks=2,
+         attn_scales=[],
+         temperal_downsample=[True, True, False],
+         dropout=0.0,
+         non_linearity: str = "silu",
+         is_residual: bool = False,  # wan 2.2 vae uses a residual down block
+     ):
+         super().__init__()
+         self.dim = dim
+         self.z_dim = z_dim
+         self.dim_mult = dim_mult
+         self.num_res_blocks = num_res_blocks
+         self.attn_scales = attn_scales
+         self.temperal_downsample = temperal_downsample
+         self.nonlinearity = resolve_activation(non_linearity)
+
+         # dimensions
+         dims = [dim * u for u in [1] + dim_mult]
+         scale = 1.0
+
+         # init block
+         self.conv_in = WanCausalConv3d(in_channels, dims[0], 3, padding=1)
+
+         # downsample blocks
+         self.down_blocks = nn.ModuleList([])
+         for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+             # residual (+attention) blocks
+             if is_residual:
+                 self.down_blocks.append(
+                     WanResidualDownBlock(
+                         in_dim,
+                         out_dim,
+                         dropout,
+                         num_res_blocks,
+                         temperal_downsample=temperal_downsample[i] if i != len(dim_mult) - 1 else False,
+                         down_flag=i != len(dim_mult) - 1,
+                     )
+                 )
+             else:
+                 for _ in range(num_res_blocks):
+                     self.down_blocks.append(WanResidualBlock(in_dim, out_dim, dropout))
+                     if scale in attn_scales:
+                         self.down_blocks.append(WanAttentionBlock(out_dim))
+                     in_dim = out_dim
+
+                 # downsample block
+                 if i != len(dim_mult) - 1:
+                     mode = "downsample3d" if temperal_downsample[i] else "downsample2d"
+                     self.down_blocks.append(WanResample(out_dim, mode=mode))
+                     scale /= 2.0
+
+         # middle blocks
+         self.mid_block = WanMidBlock(out_dim, dropout, non_linearity, num_layers=1)
+
+         # output blocks
+         self.norm_out = WanRMS_norm(out_dim, images=False)
+         self.conv_out = WanCausalConv3d(out_dim, z_dim, 3, padding=1)
+
+         self.gradient_checkpointing = False
+
+     def forward(self, x, feat_cache=None, feat_idx=[0]):
+         if feat_cache is not None:
+             idx = feat_idx[0]
+             cache_x = x[:, :, -CACHE_T:, :, :].clone() if CACHE_T > 0 else x[:, :, :0, :, :]
+             if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                 # cache the last frame of the previous chunk
+                 cache_x = torch.cat([feat_cache[idx][:, :, -1:, :, :].to(cache_x.device), cache_x], dim=2)
+             x = self.conv_in(x, feat_cache[idx])
+             feat_cache[idx] = cache_x
+             feat_idx[0] += 1
+         else:
+             x = self.conv_in(x)
+
+         ## downsamples
+         for layer in self.down_blocks:
+             if feat_cache is not None:
+                 x = layer(x, feat_cache, feat_idx)
+             else:
+                 x = layer(x)
+
+         ## middle
+         x = self.mid_block(x, feat_cache, feat_idx)
+
+         ## head
+         x = self.norm_out(x)
+         x = self.nonlinearity(x)
+
+         if feat_cache is not None:
+             idx = feat_idx[0]
+             cache_x = x[:, :, -CACHE_T:, :, :].clone() if CACHE_T > 0 else x[:, :, :0, :, :]
+             if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
+                 # cache the last frame of the previous chunk
+                 cache_x = torch.cat([feat_cache[idx][:, :, -1:, :, :].to(cache_x.device), cache_x], dim=2)
+             x = self.conv_out(x, feat_cache[idx])
+             feat_cache[idx] = cache_x
+             feat_idx[0] += 1
+         else:
+             x = self.conv_out(x)
+         return x
+
+
+ class WanResidualUpBlock(nn.Module):
+     """
+     A block that handles upsampling for the WanVAE decoder.
+
+     Args:
+         in_dim (int): Input dimension
+         out_dim (int): Output dimension
+         num_res_blocks (int): Number of residual blocks
+         dropout (float): Dropout rate
+         temperal_upsample (bool): Whether to upsample on temporal dimension
+         up_flag (bool): Whether to upsample or not
+         non_linearity (str): Type of non-linearity to use
+     """
+
+     def __init__(
+         self,
+         in_dim: int,
+         out_dim: int,
+         num_res_blocks: int,
+         dropout: float = 0.0,
+         temperal_upsample: bool = False,
+         up_flag: bool = False,
+         non_linearity: str = "silu",
+     ):
+         super().__init__()
+         self.in_dim = in_dim
+         self.out_dim = out_dim
+
+         if up_flag:
+             self.avg_shortcut = DupUp3D(
+                 in_dim,
+                 out_dim,
+                 factor_t=2 if temperal_upsample else 1,
+                 factor_s=2,
+             )
+         else:
+             self.avg_shortcut = None
+
+         # create residual blocks
+         resnets = []
+         current_dim = in_dim
+         for _ in range(num_res_blocks + 1):
+             resnets.append(WanResidualBlock(current_dim, out_dim, dropout, non_linearity))
+             current_dim = out_dim
+
+         self.resnets = nn.ModuleList(resnets)
+
+         # Add upsampling layer if needed
+         if up_flag:
+             upsample_mode = "upsample3d" if temperal_upsample else "upsample2d"
+             self.upsampler = WanResample(out_dim, mode=upsample_mode, upsample_out_dim=out_dim)
+         else:
+             self.upsampler = None
+
+         self.gradient_checkpointing = False
+
+     def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False, is_reference=False):
+         """
+         Forward pass through the upsampling block.
+
+         Args:
+             x (torch.Tensor): Input tensor
+             feat_cache (list, optional): Feature cache for causal convolutions
+             feat_idx (list, optional): Feature index for cache management
+             first_chunk (bool, optional): Whether this is the first chunk
+             is_reference (bool, optional): Whether processing reference tokens
+
+         Returns:
+             torch.Tensor: Output tensor
+         """
+         x_copy = x.clone()
+
+         # WanResidualBlock does not accept is_reference
+         for resnet in self.resnets:
+             if feat_cache is not None:
+                 x = resnet(x, feat_cache, feat_idx)
+             else:
+                 x = resnet(x)
+
+         if self.upsampler is not None:
+             if feat_cache is not None:
+                 x = self.upsampler(x, feat_cache, feat_idx)
+             else:
+                 # Pass is_reference to upsampler
+                 x = self.upsampler(x, is_reference=is_reference)
+
+         if self.avg_shortcut is not None:
+             # DupUp3D only accepts first_chunk
+             x = x + self.avg_shortcut(x_copy, first_chunk=first_chunk)
+
+         return x
+
+
+ class WanUpBlock(nn.Module):
+     """
+     A block that handles upsampling for the WanVAE decoder.
+
+     Args:
+         in_dim (int): Input dimension
+         out_dim (int): Output dimension
+         num_res_blocks (int): Number of residual blocks
+         dropout (float): Dropout rate
+         upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
+         non_linearity (str): Type of non-linearity to use
+     """
+
+     def __init__(
+         self,
+         in_dim: int,
+         out_dim: int,
+         num_res_blocks: int,
+         dropout: float = 0.0,
+         upsample_mode: Optional[str] = None,
+         non_linearity: str = "silu",
+     ):
+         super().__init__()
+         self.in_dim = in_dim
+         self.out_dim = out_dim
+
+         # Create layers list
+         resnets = []
+         # Add residual blocks and attention if needed
+         current_dim = in_dim
+         for _ in range(num_res_blocks + 1):
+             resnets.append(WanResidualBlock(current_dim, out_dim, dropout, non_linearity))
+             current_dim = out_dim
+
+         self.resnets = nn.ModuleList(resnets)
+
+         # Add upsampling layer if needed
+         self.upsamplers = None
+         if upsample_mode is not None:
+             self.upsamplers = nn.ModuleList([WanResample(out_dim, mode=upsample_mode)])
+
+         self.gradient_checkpointing = False
+
+     def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=None, is_reference=False):
+         """
+         Forward pass through the upsampling block.
+
+         Args:
+             x (torch.Tensor): Input tensor
+             feat_cache (list, optional): Feature cache for causal convolutions
+             feat_idx (list, optional): Feature index for cache management
+             first_chunk (bool, optional): Whether this is the first chunk
+             is_reference (bool, optional): Whether processing reference tokens
+
+         Returns:
+             torch.Tensor: Output tensor
+         """
+         # The resnets do not consume is_reference; only the upsampler uses it
+         for resnet in self.resnets:
+             if feat_cache is not None:
+                 x = resnet(x, feat_cache, feat_idx)
+             else:
+                 x = resnet(x)
+
+         # Pass first_chunk / is_reference through to the upsampler
+         if self.upsamplers is not None:
+             if feat_cache is not None:
+                 x = self.upsamplers[0](x, feat_cache, feat_idx)
+             else:
+                 x = self.upsamplers[0](x, first_chunk=first_chunk, is_reference=is_reference)
+         return x
+
+
+ class RefConvIn(nn.Module):
+     """
+     Tokenizes reference videos by converting spatial resolution into channels.
+     Uses only reshape operations.
+     Converts [b, c, T, h, w] to [b, c_out, T, h/patch_size, w/patch_size]
+     """
+
+     def __init__(
+         self,
+         in_channels=3,
+         out_channels=384,
+         patch_size=8,
+     ):
+         """
+         Args:
+             in_channels (int): Number of input channels (e.g., 3 for RGB)
+             out_channels (int): Number of output channels
+             patch_size (int): Size of spatial patches for downsampling
+         """
+         super().__init__()
+
+         self.in_channels = in_channels
+         self.out_channels = out_channels
+         self.patch_size = patch_size
+
+         # Calculate intermediate channels after patchification
+         self.patch_channels = in_channels * patch_size * patch_size
+
+         # Conv2d layer to project from patch_channels to out_channels
+         self.proj = nn.Conv2d(self.patch_channels, self.out_channels, kernel_size=3, stride=1, padding=1)
+         self.norm = WanRMS_norm(self.out_channels, images=True)
+
+         # Sanity check: the projection width must be an integer multiple of the patch channels
+         assert (
+             self.out_channels % self.patch_channels == 0
+         ), f"out_channels ({self.out_channels}) must be divisible by patch_channels ({self.patch_channels})"
+
+     def forward(self, x):
+         """
+         Tokenize reference input using only reshape operations.
+
+         Args:
+             x: Input tensor [b, in_channels, T, h, w]
+
+         Returns:
+             Tokenized tensor [b, out_channels, T, h/patch_size, w/patch_size]
+         """
+         b, c, T, h, w = x.shape
+         patch_size = self.patch_size
+
+         # Ensure dimensions are divisible by patch_size
+         assert h % patch_size == 0, f"Height {h} must be divisible by patch_size {patch_size}"
+         assert w % patch_size == 0, f"Width {w} must be divisible by patch_size {patch_size}"
+
+         # Step 1: Reshape into patches
+         x = x.view(b, c, T, h // patch_size, patch_size, w // patch_size, patch_size)
+
+         # Step 2: Rearrange dimensions
+         x = x.permute(0, 1, 4, 6, 2, 3, 5).contiguous()
+
+         # Step 3: Flatten patches into channels
+         x = x.view(b, c * patch_size * patch_size, T, h // patch_size, w // patch_size)
+
+         # Step 4: Apply Conv2d projection for each time step
+         # Reshape to merge batch and time dimensions
+         x = x.view(b * T, self.patch_channels, h // patch_size, w // patch_size)
+
+         # Apply convolution
+         x = self.proj(x)
+         x = self.norm(x)
+
+         # Reshape back to separate batch and time dimensions
+         x = x.view(b, self.out_channels, T, h // patch_size, w // patch_size)
+
+         return x
+
+
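+ # Example: a [B, 3, 1, 480, 832] reference frame with patch_size=8 is reshaped
+ # to [B, 192, 1, 60, 104] (patch_channels = 3 * 8 * 8 = 192) and projected by
+ # the 3x3 conv to [B, 384, 1, 60, 104], i.e. one token grid at 1/8 of the
+ # input's spatial resolution.
+
+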
+ class WanRotaryPosEmbed(nn.Module):
+     def __init__(
+         self,
+         attention_head_dim: int,
+         patch_size: Tuple[int, int, int],
+         max_seq_len: int,
+         theta: float = 10000.0,
+     ):
+         super().__init__()
+
+         self.attention_head_dim = attention_head_dim
+         self.patch_size = patch_size
+         self.max_seq_len = max_seq_len
+
+         h_dim = w_dim = 2 * (attention_head_dim // 6)
+         t_dim = attention_head_dim - h_dim - w_dim
+         freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+
+         freqs_cos = []
+         freqs_sin = []
+
+         for dim in [t_dim, h_dim, w_dim]:
+             freq_cos, freq_sin = get_1d_rotary_pos_embed(
+                 dim,
+                 max_seq_len,
+                 theta,
+                 use_real=True,
+                 repeat_interleave_real=True,
+                 freqs_dtype=freqs_dtype,
+             )
+             freqs_cos.append(freq_cos)
+             freqs_sin.append(freq_sin)
+
+         self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
+         self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
+
+     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         batch_size, num_channels, num_frames, height, width = hidden_states.shape
+         p_t, p_h, p_w = self.patch_size
+         ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+
+         split_sizes = [
+             self.attention_head_dim - 2 * (self.attention_head_dim // 3),
+             self.attention_head_dim // 3,
+             self.attention_head_dim // 3,
+         ]
+
+         freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
+         freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
+
+         freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+         freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+         freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+         freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+
+         return freqs_cos, freqs_sin
+
+
+ class ReferenceRemover:
+     """
+     Removes reference frame tokens that were concatenated along temporal dimension.
+     Handles cases where temporal upsampling may have occurred.
+     """
+
+     def __init__(self, ref_frame_count: int = 1):
+         """
+         Args:
+             ref_frame_count: Number of reference frames concatenated (default: 1)
+         """
+         self.ref_frame_count = ref_frame_count
+
+     def __call__(self, x: torch.Tensor, original_temporal_dim: int) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         Split the reference frames off the temporal dimension.
+
+         Args:
+             x: Tensor of shape [B, C, T, H, W]
+             original_temporal_dim: The temporal dimension before concatenating reference
+
+         Returns:
+             Tuple of (reference frames, remaining frames), split along the temporal dimension
+         """
+         current_temporal_dim = x.shape[2]
+
+         # Calculate temporal scale factor from upsampling
+         original_input_frames = original_temporal_dim + 1
+         temporal_scale = current_temporal_dim // original_input_frames
+
+         # Calculate how many frames to remove (scaled reference frames)
+         frames_to_remove = self.ref_frame_count * temporal_scale
+
+         # Split reference frames from the beginning
+         return (x[:, :, :frames_to_remove, :, :], x[:, :, frames_to_remove:, :, :])
+
+
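+ # Worked example: if the decoder saw 1 reference frame plus 5 latent frames
+ # (original_temporal_dim = 5, so original_input_frames = 6) and temporal
+ # upsampling brought the clip to 24 frames, then temporal_scale = 24 // 6 = 4
+ # and the first ref_frame_count * 4 = 4 frames are split off as the upsampled
+ # reference.
+
+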
1110
+ class WanDecoder3d(nn.Module):
+     r"""
+     A 3D decoder module.
+
+     Args:
+         dim (int): The base number of channels in the first layer.
+         z_dim (int): The dimensionality of the latent space.
+         dim_mult (list of int): Multipliers for the number of channels in each block.
+         num_res_blocks (int): Number of residual blocks in each block.
+         attn_scales (list of float): Scales at which to apply attention mechanisms.
+         temperal_upsample (list of bool): Whether to upsample temporally in each block.
+         dropout (float): Dropout rate for the dropout layers.
+         non_linearity (str): Type of non-linearity to use.
+         skip_decoder_attention (bool): If True, skip all attention blocks in the decoder.
+     """
+
+     def __init__(
+         self,
+         dim=128,
+         z_dim=4,
+         dim_mult=[1, 2, 4, 4],
+         num_res_blocks=2,
+         attn_scales=[],
+         temperal_upsample=[False, True, True],
+         dropout=0.0,
+         non_linearity: str = "silu",
+         out_channels: int = 3,
+         is_residual: bool = False,
+         use_reference: bool = False,
+         skip_decoder_attention: bool = False,
+         dc_factor: int = 2,
+     ):
+         super().__init__()
+         self.dim = dim
+         self.z_dim = z_dim
+         self.dim_mult = dim_mult
+         self.num_res_blocks = num_res_blocks
+         self.attn_scales = attn_scales
+         self.temperal_upsample = temperal_upsample
+         self.use_reference = use_reference
+         self.skip_decoder_attention = skip_decoder_attention
+         self.dc_factor = dc_factor
+         self.nonlinearity = resolve_activation(non_linearity)
+
+         # dimensions
+         dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+
+         # init block
+         self.conv_in = WanCausalConv3d(z_dim, dims[0], 3, padding=1)
+
+         # middle blocks
+         self.mid_block = WanMidBlock(dims[0], dropout, non_linearity, num_layers=1)
+
+         self.ref_conv_in = RefConvIn(out_channels=dims[0]) if self.use_reference else None
+
+         # upsample block & attention block 1, 2 and 3
+         self.up_blocks = nn.ModuleList([])
+
+         for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
+             # residual (+attention) blocks
+             if i > 0 and not is_residual:
+                 # wan vae 2.1
+                 in_dim = in_dim // 2
+
+             # determine if we need upsampling
+             up_flag = i != len(dim_mult) - 1
+             # determine upsampling mode, if not upsampling, set to None
+             upsample_mode = None
+             if up_flag and temperal_upsample[i]:
+                 upsample_mode = "upsample3d"
+             elif up_flag:
+                 upsample_mode = "upsample2d"
+             # Create and add the upsampling block
+             if is_residual:
+                 up_block = WanResidualUpBlock(
+                     in_dim=in_dim,
+                     out_dim=out_dim,
+                     num_res_blocks=num_res_blocks,
+                     dropout=dropout,
+                     temperal_upsample=temperal_upsample[i] if up_flag else False,
+                     up_flag=up_flag,
+                     non_linearity=non_linearity,
+                 )
+             else:
+                 up_block = WanUpBlock(
+                     in_dim=in_dim,
+                     out_dim=out_dim,
+                     num_res_blocks=num_res_blocks,
+                     dropout=dropout,
+                     upsample_mode=upsample_mode,
+                     non_linearity=non_linearity,
+                 )
+
+             self.up_blocks.append(up_block)
+
+         # output blocks
+         self.norm_out = WanRMS_norm(out_dim, images=False)
+         self.conv_out = WanCausalConv3d(out_dim, out_channels, 3, padding=1)
+
+         self.gradient_checkpointing = GRADIENT_CHECKPOINTING
+
+     def forward(self, x, transformer, feat_cache=None, feat_idx=[0], first_chunk=False, reference_frame=None, skip=False, window_size=-1):
+         run_attn = not self.skip_decoder_attention and not skip
+         if self.gradient_checkpointing:
+             x = torch.utils.checkpoint.checkpoint(
+                 self.conv_in,
+                 x,
+                 use_reentrant=False
+             )
+         else:
+             x = self.conv_in(x)
+
+         ## middle
+         x = self.mid_block(x, feat_cache, feat_idx)
+         ref_tokens = None
+         if self.use_reference and reference_frame is not None:
+             # ref_tokens: [B, C, 1, H, W] - single frame
+             if self.gradient_checkpointing:
+                 ref_tokens = torch.utils.checkpoint.checkpoint(
+                     self.ref_conv_in,
+                     reference_frame,
+                     use_reentrant=False
+                 )
+             else:
+                 ref_tokens = self.ref_conv_in(reference_frame)
+
+         # Transformer + upblock
+         if run_attn:
+             for i in range(4):
+                 if i <= 2:
+                     if ref_tokens is not None:
+                         x = torch.cat([ref_tokens, x], dim=2)
+                     transformer_output = transformer(
+                         hidden_states=x,
+                         stage_idx=i,
+                         return_dict=True,
+                         window_size=window_size,
+                     )
+                     # Extract the output sample
+                     x = transformer_output.sample if hasattr(transformer_output, 'sample') else transformer_output[0]
+                     if ref_tokens is not None:
+                         ref_tokens, x = x[:, :, :1], x[:, :, 1:]
+                         if i <= 1:
+                             if self.gradient_checkpointing:
+                                 ref_tokens = torch.utils.checkpoint.checkpoint(
+                                     self.up_blocks[i],
+                                     ref_tokens,
+                                     None,
+                                     [0],
+                                     first_chunk,
+                                     True,
+                                     use_reentrant=False
+                                 )
+                             else:
+                                 ref_tokens = self.up_blocks[i](ref_tokens, is_reference=True, first_chunk=first_chunk)
+
+                 if self.gradient_checkpointing:
+                     # Save mutable state before checkpoint - will be restored on each forward run
+                     # (both original forward and backward recompute)
+                     initial_idx = feat_idx[0]
+                     initial_cache_snapshot = [
+                         (c.clone() if isinstance(c, torch.Tensor) else c)
+                         for c in feat_cache
+                     ] if feat_cache is not None else None
+
+                     def checkpoint_fn(x, block_idx=i):
+                         # Restore state before each run to ensure consistency
+                         feat_idx[0] = initial_idx
+                         if initial_cache_snapshot is not None:
+                             for j in range(len(feat_cache)):
+                                 val = initial_cache_snapshot[j]
+                                 feat_cache[j] = val.clone() if isinstance(val, torch.Tensor) else val
+                         return self.up_blocks[block_idx](x, feat_cache, feat_idx, first_chunk=first_chunk)
+
+                     x = torch.utils.checkpoint.checkpoint(
+                         checkpoint_fn,
+                         x,
+                         use_reentrant=False,
+                     )
+                 else:
+                     x = self.up_blocks[i](x, feat_cache, feat_idx, first_chunk=first_chunk)
+         else:
+             print("[DEBUG] transformer skipped")
+             for i in range(4):
+                 x = self.up_blocks[i](x, feat_cache, feat_idx, first_chunk=first_chunk)
+
+         ## head
+         x = self.norm_out(x)
+         x = self.nonlinearity(x)
+
+         if self.gradient_checkpointing:
+             x = torch.utils.checkpoint.checkpoint(
+                 self.conv_out,
+                 x,
+                 None,
+                 'upsample3d',
+                 use_reentrant=False,
+             )
+         else:
+             x = self.conv_out(x, mode='upsample3d')
+         return x
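+
+     # Note on the reference path above (illustrative walkthrough; the shapes are
+     # taken from the code comments, everything else is an assumption): when
+     # `use_reference` is set, the reference frame is embedded by `ref_conv_in`
+     # into a single token-frame and concatenated in front of `x` on the time
+     # axis, so every transformer stage attends jointly over [ref | video]:
+     #
+     #     ref_tokens: [B, C, 1, H, W]   x: [B, C, T, H, W]
+     #     torch.cat([ref_tokens, x], dim=2)  ->  [B, C, 1 + T, H, W]
+     #
+     # After each stage the first temporal slice is split back off
+     # (`ref_tokens, x = x[:, :, :1], x[:, :, 1:]`) and, for the first two
+     # stages, upsampled with `is_reference=True` so it keeps matching x's
+     # growing resolution.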
+
+
+ def patchify(x, patch_size):
+     if patch_size == 1:
+         return x
+
+     if x.dim() != 5:
+         raise ValueError(f"Invalid input shape: {x.shape}")
+     # x shape: [batch_size, channels, frames, height, width]
+     batch_size, channels, frames, height, width = x.shape
+
+     # Ensure height and width are divisible by patch_size
+     if height % patch_size != 0 or width % patch_size != 0:
+         raise ValueError(f"Height ({height}) and width ({width}) must be divisible by patch_size ({patch_size})")
+
+     # Reshape to [batch_size, channels, frames, height//patch_size, patch_size, width//patch_size, patch_size]
+     x = x.view(batch_size, channels, frames, height // patch_size, patch_size, width // patch_size, patch_size)
+
+     # Rearrange to [batch_size, channels * patch_size * patch_size, frames, height//patch_size, width//patch_size]
+     x = x.permute(0, 1, 6, 4, 2, 3, 5).contiguous()
+     x = x.view(batch_size, channels * patch_size * patch_size, frames, height // patch_size, width // patch_size)
+
+     return x
+
+
+ def unpatchify(x, patch_size):
+     if patch_size == 1:
+         return x
+
+     if x.dim() != 5:
+         raise ValueError(f"Invalid input shape: {x.shape}")
+     # x shape: [batch_size, (channels * patch_size * patch_size), frame, height, width]
+     batch_size, c_patches, frames, height, width = x.shape
+     channels = c_patches // (patch_size * patch_size)
+
+     # Reshape to [b, c, patch_size, patch_size, f, h, w]
+     x = x.view(batch_size, channels, patch_size, patch_size, frames, height, width)
+
+     # Rearrange to [b, c, f, h * patch_size, w * patch_size]
+     x = x.permute(0, 1, 4, 5, 3, 6, 2).contiguous()
+     x = x.view(batch_size, channels, frames, height * patch_size, width * patch_size)
+
+     return x
+
+
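+ # Illustrative round trip for patchify/unpatchify (sketch; the dummy shapes
+ # below are assumptions, not values used in this repo). patchify folds each
+ # patch_size x patch_size spatial block into the channel axis; unpatchify is
+ # its exact inverse:
+ #
+ #     >>> x = torch.randn(1, 3, 5, 64, 64)        # [B, C, T, H, W]
+ #     >>> y = patchify(x, patch_size=2)           # [1, 12, 5, 32, 32]
+ #     >>> bool(torch.equal(unpatchify(y, patch_size=2), x))
+ #     True
+
+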
+ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
+     r"""
+     A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
+     Introduced in [Wan 2.1].
+
+     This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+     for all models (such as downloading or saving).
+     """
+
+     _supports_gradient_checkpointing = False
+
+     @register_to_config
+     def __init__(
+         self,
+         base_dim: int = 96,
+         decoder_base_dim: Optional[int] = None,
+         use_reference: bool = False,
+         skip_decoder_attention: bool = False,
+         z_dim: int = 16,
+         dim_mult: List[int] = [1, 2, 4, 4],
+         num_res_blocks: int = 2,
+         attn_scales: List[float] = [],
+         temperal_downsample: List[bool] = [False, True, True],
+         dropout: float = 0.0,
+         latents_mean: List[float] = [
+             -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
+             0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921,
+         ],
+         latents_std: List[float] = [
+             2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
+             3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160,
+         ],
+         is_residual: bool = False,
+         in_channels: int = 3,
+         out_channels: int = 3,
+         patch_size: Optional[int] = None,
+         scale_factor_temporal: Optional[int] = 4,
+         scale_factor_spatial: Optional[int] = 8,
+         inference_w_dropout=False,
+         dropout_p=0.7,
+         gradient_checkpointing=False,
+         **kwargs,
+     ) -> None:
+         global GRADIENT_CHECKPOINTING
+         GRADIENT_CHECKPOINTING = gradient_checkpointing
+         super().__init__()
+         self.inference_w_dropout = inference_w_dropout
+         self.dropout_p = dropout_p
+
+         self.z_dim = z_dim
+         self.temperal_downsample = temperal_downsample
+         self.temperal_upsample = temperal_downsample[::-1]
+
+         if decoder_base_dim is None:
+             decoder_base_dim = base_dim
+
+         self.encoder = WanEncoder3d(
+             in_channels=in_channels,
+             dim=base_dim,
+             z_dim=z_dim * 2,
+             dim_mult=dim_mult,
+             num_res_blocks=num_res_blocks,
+             attn_scales=attn_scales,
+             temperal_downsample=temperal_downsample,
+             dropout=dropout,
+             is_residual=is_residual,
+         )
+         self.quant_conv = WanCausalConv3d(z_dim * 2, z_dim * 2, 1)
+         self.post_quant_conv = WanCausalConv3d(z_dim, z_dim, 1)
+
+         self.decoder = WanDecoder3d(
+             dim=decoder_base_dim,
+             z_dim=z_dim,
+             dim_mult=dim_mult,
+             num_res_blocks=num_res_blocks,
+             attn_scales=attn_scales,
+             temperal_upsample=self.temperal_upsample,
+             dropout=dropout,
+             out_channels=out_channels,
+             is_residual=is_residual,
+             use_reference=use_reference,
+             skip_decoder_attention=skip_decoder_attention,
+         )
+
+         self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
+
+         # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
+         # to perform decoding of a single video latent at a time.
+         self.use_slicing = False
+
+         # When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
+         # frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
+         # intermediate tiles together, the memory requirement can be lowered.
+         self.use_tiling = False
+
+         # The minimal tile height and width for spatial tiling to be used
+         self.tile_sample_min_height = 256
+         self.tile_sample_min_width = 256
+
+         # The minimal distance between two spatial tiles
+         self.tile_sample_stride_height = 192
+         self.tile_sample_stride_width = 192
+
+         # Precompute and cache conv counts for encoder and decoder for clear_cache speedup
+         self._cached_conv_counts = {
+             "decoder": (
+                 sum(isinstance(m, WanCausalConv3d) for m in self.decoder.modules()) if self.decoder is not None else 0
+             ),
+             "encoder": (
+                 sum(isinstance(m, WanCausalConv3d) for m in self.encoder.modules()) if self.encoder is not None else 0
+             ),
+         }
+
+         self.reference_frame = None
+
+     def _init_ref_conv_in(self):
+         ref_conv_in = getattr(self.decoder, "ref_conv_in", None)
+         if ref_conv_in is None:
+             return
+
+         with torch.no_grad():
+             nn.init.xavier_uniform_(ref_conv_in.proj.weight)
+             if ref_conv_in.proj.bias is not None:
+                 nn.init.constant_(ref_conv_in.proj.bias, 0.0)
+
+     def _apply_token_dropout(self, x: torch.Tensor) -> torch.Tensor:
+         """
+         Apply token dropout to the input tensor.
+
+         Args:
+             x: Input tensor of shape [B, C, T, H, W]
+
+         Returns:
+             Tensor with random tokens dropped (set to zero)
+         """
+         if self.inference_w_dropout or self.training:
+             if self.training:
+                 p = torch.rand(1).item() * self.dropout_p
+             else:
+                 p = self.dropout_p
+             dropped = torch.rand_like(x[:, :1, :1, :, :]) < p
+             x = torch.where(dropped, torch.zeros_like(x), x)
+         return x
+
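+     # Sketch of the dropout-mask semantics above (illustrative; the dummy
+     # values are assumptions): the Bernoulli mask is drawn once per spatial
+     # location and broadcast over channels and time, so a dropped token is
+     # zeroed in every channel of every frame.
+     #
+     #     >>> x = torch.ones(1, 16, 4, 8, 8)                   # [B, C, T, H, W]
+     #     >>> dropped = torch.rand_like(x[:, :1, :1, :, :]) < 0.5
+     #     >>> torch.where(dropped, torch.zeros_like(x), x)     # zeros span C and T
+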
+     def enable_tiling(
+         self,
+         tile_sample_min_height: Optional[int] = None,
+         tile_sample_min_width: Optional[int] = None,
+         tile_sample_stride_height: Optional[float] = None,
+         tile_sample_stride_width: Optional[float] = None,
+     ) -> None:
+         r"""
+         Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+         compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+         processing larger images.
+
+         Args:
+             tile_sample_min_height (`int`, *optional*):
+                 The minimum height required for a sample to be separated into tiles across the height dimension.
+             tile_sample_min_width (`int`, *optional*):
+                 The minimum width required for a sample to be separated into tiles across the width dimension.
+             tile_sample_stride_height (`int`, *optional*):
+                 The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
+                 no tiling artifacts produced across the height dimension.
+             tile_sample_stride_width (`int`, *optional*):
+                 The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
+                 artifacts produced across the width dimension.
+         """
+         self.use_tiling = True
+         self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
+         self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
+         self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
+         self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
+
+     def disable_tiling(self) -> None:
+         r"""
+         Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+         decoding in one step.
+         """
+         self.use_tiling = False
+
+     def enable_slicing(self) -> None:
+         r"""
+         Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+         compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+         """
+         self.use_slicing = True
+
+     def disable_slicing(self) -> None:
+         r"""
+         Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+         decoding in one step.
+         """
+         self.use_slicing = False
+
+     def clear_cache(self):
+         # Use cached conv counts for decoder and encoder to avoid re-iterating modules each call
+         self._conv_num = self._cached_conv_counts["decoder"]
+         self._conv_idx = [0]
+         self._feat_map = [None] * self._conv_num
+         # cache encode
+         self._enc_conv_num = self._cached_conv_counts["encoder"]
+         self._enc_conv_idx = [0]
+         self._enc_feat_map = [None] * self._enc_conv_num
+
+     def _encode(self, x: torch.Tensor):
+         _, _, num_frame, height, width = x.shape
+
+         if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
+             # `tiled_encode` only takes the input tensor (the stray `is_reference`
+             # argument here was undefined in this scope).
+             return self.tiled_encode(x)
+
+         self.clear_cache()
+         if self.config.patch_size is not None:
+             x = patchify(x, patch_size=self.config.patch_size)
+         iter_ = 1  #TODO
+         for i in range(0, iter_):
+             self._enc_conv_idx = [0]
+             if i == 0:
+                 out = self.encoder(
+                     x[:, :, : 4 * LATENT_T_STRIDE - 3, :, :],
+                     feat_cache=self._enc_feat_map,
+                     feat_idx=self._enc_conv_idx,
+                 )
+             else:
+                 out_ = self.encoder(
+                     x[:, :, i * 4 * LATENT_T_STRIDE - 3 : (i + 1) * 4 * LATENT_T_STRIDE - 3, :, :],
+                     feat_cache=self._enc_feat_map,
+                     feat_idx=self._enc_conv_idx,
+                 )
+                 out = torch.cat([out, out_], 2)
+
+         enc = self.quant_conv(out)
+         self.clear_cache()
+         return enc
+
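+     # Temporal chunking sketch for `_encode` (illustrative arithmetic; assumes
+     # the usual Wan 4x temporal compression and LATENT_T_STRIDE == 1): the first
+     # chunk of 4 * LATENT_T_STRIDE - 3 = 1 frame yields one latent step, and
+     # each further chunk of 4 frames adds one more, so T frames map to
+     # 1 + (T - 1) // 4 latent frames (e.g. 17 frames -> 5 latent frames), the
+     # same count used by `frame_range` in `tiled_encode` below.
+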
+     @apply_forward_hook
+     def encode(
+         self, x: torch.Tensor, return_dict: bool = True
+     ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
+         r"""
+         Encode a batch of images into latents.
+
+         Args:
+             x (`torch.Tensor`): Input batch of images.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+         Returns:
+             The latent representations of the encoded videos. If `return_dict` is True, a
+             [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
+         """
+
+         if self.use_slicing and x.shape[0] > 1:
+             encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
+             h = torch.cat(encoded_slices)
+         else:
+             h = self._encode(x)
+
+         posterior = DiagonalGaussianDistribution(h)
+
+         if not return_dict:
+             return (posterior,)
+         return AutoencoderKLOutput(latent_dist=posterior)
+
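+     # Hedged usage sketch for `encode` (the shapes are assumptions, not outputs
+     # of a real run): with z_dim=16, 4x temporal and 8x spatial compression,
+     #
+     #     >>> video = torch.randn(1, 3, 17, 480, 832)    # pixels in [-1, 1]
+     #     >>> posterior = vae.encode(video).latent_dist
+     #     >>> z = posterior.sample()                     # [1, 16, 5, 60, 104]
+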
+     def _decode(self, z: torch.Tensor, transformer, return_dict: bool = True, reference_frame=None, skip=False, window_size=-1):
+         _, _, num_frame, height, width = z.shape
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+
+         if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
+             # Forward the decoder transformer so the tiled path can call the decoder.
+             return self.tiled_decode(z, transformer, return_dict=return_dict, reference_frame=reference_frame, skip=skip)
+
+         self.clear_cache()
+
+         x = self.post_quant_conv(z)
+
+         x = self._apply_token_dropout(x)
+
+         for i in range(0, num_frame, LATENT_T_STRIDE):
+             self._conv_idx = [0]
+             self._conv_idx_ref = [0]
+             if i == 0:
+                 out = self.decoder(
+                     x[:, :, i : i + LATENT_T_STRIDE, :, :],
+                     transformer=transformer,
+                     feat_cache=self._feat_map,
+                     feat_idx=self._conv_idx,
+                     first_chunk=True,
+                     reference_frame=reference_frame,
+                     skip=skip,
+                     window_size=window_size,
+                 )
+             else:
+                 out_ = self.decoder(
+                     x[:, :, i : i + LATENT_T_STRIDE, :, :],
+                     transformer=transformer,
+                     feat_cache=self._feat_map,
+                     feat_idx=self._conv_idx,
+                     reference_frame=reference_frame,
+                     skip=skip,
+                     window_size=window_size,
+                 )
+                 out = torch.cat([out, out_], 2)
+
+         if self.config.patch_size is not None:
+             out = unpatchify(out, patch_size=self.config.patch_size)
+
+         out = torch.clamp(out, min=-1.0, max=1.0)
+
+         self.clear_cache()
+         if not return_dict:
+             return (out,)
+
+         return DecoderOutput(sample=out)
+
+     @apply_forward_hook
+     def decode(
+         self, z: torch.Tensor, transformer, return_dict: bool = True, reference_frame=None, skip=False, window_size=-1
+     ) -> Union[DecoderOutput, torch.Tensor]:
+         r"""
+         Decode a batch of images.
+
+         Args:
+             z (`torch.Tensor`): Input batch of latent vectors.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+             reference_frame (`torch.Tensor`, *optional*):
+                 Reference frame for decoder attention.
+             skip (`bool`, *optional*, defaults to `False`):
+                 Whether to skip attention in the decoder.
+
+         Returns:
+             [`~models.vae.DecoderOutput`] or `tuple`:
+                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                 returned.
+         """
+         # Use passed reference_frame or fall back to stored one
+         ref_frame = reference_frame if reference_frame is not None else self.reference_frame
+
+         if self.use_slicing and z.shape[0] > 1:
+             decoded_slices = [
+                 self._decode(z_slice, transformer, reference_frame=ref_frame, skip=skip, window_size=window_size).sample
+                 for z_slice in z.split(1)
+             ]
+             decoded = torch.cat(decoded_slices)
+         else:
+             decoded = self._decode(z, transformer, reference_frame=ref_frame, skip=skip, window_size=window_size).sample
+
+         if not return_dict:
+             return (decoded,)
+         return DecoderOutput(sample=decoded)
+
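+     # Hedged usage sketch for `decode` (argument values are assumptions): unlike
+     # the stock Wan VAE, this decoder also needs the RefDecoder transformer and,
+     # optionally, a reference frame:
+     #
+     #     >>> out = vae.decode(z, transformer, reference_frame=ref).sample
+     #     >>> out.shape    # e.g. [1, 3, 17, 480, 832], clamped to [-1, 1]
+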
+     def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+         blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
+         for y in range(blend_extent):
+             b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
+                 y / blend_extent
+             )
+         return b
+
+     def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+         blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
+         for x in range(blend_extent):
+             b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
+                 x / blend_extent
+             )
+         return b
+
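+     # Worked example of the linear crossfade in blend_v/blend_h: with
+     # blend_extent = 4, tile `b` is weighted 0, 1/4, 2/4, 3/4 across the overlap
+     # while tile `a` gets the complementary 1, 3/4, 2/4, 1/4, so the seam fades
+     # smoothly from one tile into the next.
+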
+     def tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
+         r"""Encode a batch of images using a tiled encoder.
+
+         Args:
+             x (`torch.Tensor`): Input batch of videos.
+
+         Returns:
+             `torch.Tensor`:
+                 The latent representation of the encoded videos.
+         """
+         _, _, num_frames, height, width = x.shape
+         latent_height = height // self.spatial_compression_ratio
+         latent_width = width // self.spatial_compression_ratio
+
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+         tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+         tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+         blend_height = tile_latent_min_height - tile_latent_stride_height
+         blend_width = tile_latent_min_width - tile_latent_stride_width
+
+         # Split x into overlapping tiles and encode them separately.
+         # The tiles have an overlap to avoid seams between tiles.
+         rows = []
+         for i in range(0, height, self.tile_sample_stride_height):
+             row = []
+             for j in range(0, width, self.tile_sample_stride_width):
+                 self.clear_cache()
+                 time = []
+                 frame_range = 1 + (num_frames - 1) // 4
+                 for k in range(frame_range):
+                     self._enc_conv_idx = [0]
+                     if k == 0:
+                         tile = x[:, :, :1, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
+                     else:
+                         tile = x[
+                             :,
+                             :,
+                             1 + 4 * (k - 1) : 1 + 4 * k,
+                             i : i + self.tile_sample_min_height,
+                             j : j + self.tile_sample_min_width,
+                         ]
+                     tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
+                     tile = self.quant_conv(tile)
+                     time.append(tile)
+                 row.append(torch.cat(time, dim=2))
+             rows.append(row)
+         self.clear_cache()
+
+         result_rows = []
+         for i, row in enumerate(rows):
+             result_row = []
+             for j, tile in enumerate(row):
+                 # blend the above tile and the left tile
+                 # to the current tile and add the current tile to the result row
+                 if i > 0:
+                     tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                 if j > 0:
+                     tile = self.blend_h(row[j - 1], tile, blend_width)
+                 result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
+             result_rows.append(torch.cat(result_row, dim=-1))
+
+         enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
+         return enc
+
+     def tiled_decode(
+         self, z: torch.Tensor, transformer, return_dict: bool = True, reference_frame=None, skip=False
+     ) -> Union[DecoderOutput, torch.Tensor]:
+         r"""
+         Decode a batch of images using a tiled decoder.
+
+         Args:
+             z (`torch.Tensor`): Input batch of latent vectors.
+             transformer: The decoder transformer, forwarded to each per-tile decoder call.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+
+         Returns:
+             [`~models.vae.DecoderOutput`] or `tuple`:
+                 If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                 returned.
+         """
+         _, _, num_frames, height, width = z.shape
+         sample_height = height * self.spatial_compression_ratio
+         sample_width = width * self.spatial_compression_ratio
+
+         tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
+         tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
+         tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
+         tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
+
+         blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
+         blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
+
+         # Split z into overlapping tiles and decode them separately.
+         # The tiles have an overlap to avoid seams between tiles.
+         rows = []
+         for i in range(0, height, tile_latent_stride_height):
+             row = []
+             for j in range(0, width, tile_latent_stride_width):
+                 self.clear_cache()
+                 time = []
+                 for k in range(num_frames):
+                     self._conv_idx = [0]
+                     tile = z[:, :, k : k + 1, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
+                     tile = self.post_quant_conv(tile)
+
+                     tile = self._apply_token_dropout(tile)
+
+                     decoded = self.decoder(
+                         tile,
+                         transformer=transformer,
+                         feat_cache=self._feat_map,
+                         feat_idx=self._conv_idx,
+                         reference_frame=reference_frame,
+                         skip=skip,
+                     )
+                     time.append(decoded)
+                 row.append(torch.cat(time, dim=2))
+             rows.append(row)
+         self.clear_cache()
+
+         result_rows = []
+         for i, row in enumerate(rows):
+             result_row = []
+             for j, tile in enumerate(row):
+                 # blend the above tile and the left tile
+                 # to the current tile and add the current tile to the result row
+                 if i > 0:
+                     tile = self.blend_v(rows[i - 1][j], tile, blend_height)
+                 if j > 0:
+                     tile = self.blend_h(row[j - 1], tile, blend_width)
+                 result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
+             result_rows.append(torch.cat(result_row, dim=-1))
+
+         dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
+
+         if not return_dict:
+             return (dec,)
+         return DecoderOutput(sample=dec)
+
+     def forward(
+         self,
+         sample: torch.Tensor,
+         sample_posterior: bool = False,
+         return_dict: bool = True,
+         generator: Optional[torch.Generator] = None,
+     ) -> Union[DecoderOutput, torch.Tensor]:
+         """
+         Args:
+             sample (`torch.Tensor`): Input sample.
+             return_dict (`bool`, *optional*, defaults to `True`):
+                 Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
+         """
+         x = sample
+
+         # Store reference frame if using reference attention
+         if self.decoder.use_reference:
+             idx = torch.randint(0, x.size(2), ()).item()
+             self.reference_frame = x[:, :, idx : idx + 1, :, :].clone()
+         else:
+             self.reference_frame = None
+
+         posterior = self.encode(x).latent_dist
+         if sample_posterior:
+             z = posterior.sample(generator=generator)
+         else:
+             z = posterior
src/models/Wan/transformer_wan.py ADDED
@@ -0,0 +1,1049 @@
+ # Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from typing import Any, Dict, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+ from peft import LoraConfig, get_peft_model, TaskType
+ from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
+ from diffusers.models.attention import AttentionMixin, AttentionModuleMixin, FeedForward
+ from diffusers.models.attention_dispatch import dispatch_attention_fn
+ from diffusers.models.cache_utils import CacheMixin
+ from diffusers.models.embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
+ from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.models.normalization import FP32LayerNorm
+
+
+ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+ def _get_qkv_projections(attn: "WanAttention", hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor):
+     # encoder_hidden_states is only passed for cross-attention
+     if encoder_hidden_states is None:
+         encoder_hidden_states = hidden_states
+
+     if attn.fused_projections:
+         if attn.cross_attention_dim_head is None:
+             # In self-attention layers, we can fuse the entire QKV projection into a single linear
+             query, key, value = attn.to_qkv(hidden_states).chunk(3, dim=-1)
+         else:
+             # In cross-attention layers, we can only fuse the KV projections into a single linear
+             query = attn.to_q(hidden_states)
+             key, value = attn.to_kv(encoder_hidden_states).chunk(2, dim=-1)
+     else:
+         query = attn.to_q(hidden_states)
+         key = attn.to_k(encoder_hidden_states)
+         value = attn.to_v(encoder_hidden_states)
+     return query, key, value
+
+
+ def _get_added_kv_projections(attn: "WanAttention", encoder_hidden_states_img: torch.Tensor):
+     if attn.fused_projections:
+         key_img, value_img = attn.to_added_kv(encoder_hidden_states_img).chunk(2, dim=-1)
+     else:
+         key_img = attn.add_k_proj(encoder_hidden_states_img)
+         value_img = attn.add_v_proj(encoder_hidden_states_img)
+     return key_img, value_img
+
+
+ class WanAttnProcessor:
+     _attention_backend = None
+
+     def __init__(self, return_attention_maps):
+         if not hasattr(F, "scaled_dot_product_attention"):
+             raise ImportError("WanAttnProcessor requires PyTorch 2.0.")
+         self.return_attention_maps = return_attention_maps
+
+     def __call__(
+         self,
+         attn: "WanAttention",
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         encoder_hidden_states_img = None
+         if attn.add_k_proj is not None:
+             # 512 is the context length of the text encoder, hardcoded for now
+             image_context_length = encoder_hidden_states.shape[1] - 512
+             encoder_hidden_states_img = encoder_hidden_states[:, :image_context_length]
+             encoder_hidden_states = encoder_hidden_states[:, image_context_length:]
+
+         query, key, value = _get_qkv_projections(attn, hidden_states, encoder_hidden_states)
+
+         query = attn.norm_q(query)
+         key = attn.norm_k(key)
+
+         query = query.unflatten(2, (attn.heads, -1))
+         key = key.unflatten(2, (attn.heads, -1))
+         value = value.unflatten(2, (attn.heads, -1))
+
+         if rotary_emb is not None:
+
+             def apply_rotary_emb(
+                 hidden_states: torch.Tensor,
+                 freqs_cos: torch.Tensor,
+                 freqs_sin: torch.Tensor,
+             ):
+                 x1, x2 = hidden_states.unflatten(-1, (-1, 2)).unbind(-1)
+                 cos = freqs_cos[..., 0::2]
+                 sin = freqs_sin[..., 1::2]
+                 out = torch.empty_like(hidden_states)
+                 out[..., 0::2] = x1 * cos - x2 * sin
+                 out[..., 1::2] = x1 * sin + x2 * cos
+                 return out.type_as(hidden_states)
+
+             query = apply_rotary_emb(query, *rotary_emb)
+             key = apply_rotary_emb(key, *rotary_emb)
+
+         # I2V task
+         hidden_states_img = None
+         if encoder_hidden_states_img is not None:
+             key_img, value_img = _get_added_kv_projections(attn, encoder_hidden_states_img)
+             key_img = attn.norm_added_k(key_img)
+
+             key_img = key_img.unflatten(2, (attn.heads, -1))
+             value_img = value_img.unflatten(2, (attn.heads, -1))
+
+             hidden_states_img = dispatch_attention_fn(
+                 query,
+                 key_img,
+                 value_img,
+                 attn_mask=None,
+                 dropout_p=0.0,
+                 is_causal=False,
+                 backend=self._attention_backend,
+             )
+             hidden_states_img = hidden_states_img.flatten(2, 3)
+             hidden_states_img = hidden_states_img.type_as(query)
+
+         if not self.return_attention_maps:
+             # Use fast dispatch
+             # Cast attention_mask to match query dtype to avoid dtype mismatch
+             attn_mask = attention_mask.to(query.dtype) if attention_mask is not None else None
+
+             hidden_states = dispatch_attention_fn(
+                 query,
+                 key,
+                 value,
+                 attn_mask=attn_mask,
+                 dropout_p=0.0,
+                 is_causal=False,
+                 backend=self._attention_backend,
+             )
+             hidden_states = hidden_states.flatten(2, 3)
+             attn_weights = None
+         else:
+             # Manual attention computation to get attention maps
+             # query, key, value: (B, S, H, D) where H=heads, D=head_dim
+
+             # Transpose to (B, H, S, D) for batched matrix multiplication
+             q = query.transpose(1, 2)  # (B, H, S, D)
+             k = key.transpose(1, 2)  # (B, H, S, D)
+             v = value.transpose(1, 2)  # (B, H, S, D)
+
+             # Compute attention scores: (B, H, S, S)
+             scale = q.size(-1) ** -0.5
+             attn_scores = torch.matmul(q, k.transpose(-2, -1)) * scale
+
+             # Apply attention mask if provided
+             if attention_mask is not None:
+                 attn_scores = attn_scores + attention_mask
+
+             # Compute attention weights
+             attn_weights = F.softmax(attn_scores, dim=-1)  # (B, H, S, S)
+
+             # Apply attention to values
+             hidden_states = torch.matmul(attn_weights, v)  # (B, H, S, D)
+
+             # Transpose back and flatten: (B, S, H, D) -> (B, S, H*D)
+             hidden_states = hidden_states.transpose(1, 2).flatten(2, 3)
+
+         hidden_states = hidden_states.type_as(query)
+
+         if hidden_states_img is not None:
+             hidden_states = hidden_states + hidden_states_img
+
+         hidden_states = attn.to_out[0](hidden_states)
+         hidden_states = attn.to_out[1](hidden_states)
+
+         return hidden_states, attn_weights
+
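+ # Equivalence sketch for the manual attention path above (illustrative check;
+ # the dummy shapes and tolerance are assumptions): the explicit
+ # softmax(q @ k^T / sqrt(d)) @ v computation matches PyTorch's fused kernel up
+ # to numerics, it just also exposes the attention maps:
+ #
+ #     >>> q = k = v = torch.randn(1, 8, 16, 64)   # (B, H, S, D)
+ #     >>> ref = F.scaled_dot_product_attention(q, k, v)
+ #     >>> w = torch.softmax(q @ k.transpose(-2, -1) * q.size(-1) ** -0.5, dim=-1)
+ #     >>> bool(torch.allclose(w @ v, ref, atol=1e-5))
+ #     True
+
+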
+ class WanAttention(torch.nn.Module, AttentionModuleMixin):
+     _default_processor_cls = WanAttnProcessor
+     _available_processors = [WanAttnProcessor]
+
+     def __init__(
+         self,
+         dim: int,
+         heads: int = 8,
+         dim_head: int = 64,
+         eps: float = 1e-5,
+         dropout: float = 0.0,
+         added_kv_proj_dim: Optional[int] = None,  # image embedding dimension
+         cross_attention_dim_head: Optional[int] = None,  # text embedding dimension
+         processor=None,
+         is_cross_attention=None,
+     ):
+         super().__init__()
+
+         self.inner_dim = dim_head * heads
+         self.heads = heads
+         self.added_kv_proj_dim = added_kv_proj_dim
+         self.cross_attention_dim_head = cross_attention_dim_head
+         self.kv_inner_dim = self.inner_dim if cross_attention_dim_head is None else cross_attention_dim_head * heads
+
+         self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+         self.to_k = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+         self.to_v = torch.nn.Linear(dim, self.kv_inner_dim, bias=True)
+         self.to_out = torch.nn.ModuleList(
+             [
+                 torch.nn.Linear(self.inner_dim, dim, bias=True),
+                 torch.nn.Dropout(dropout),
+             ]
+         )
+         self.norm_q = torch.nn.RMSNorm(dim_head * heads, eps=eps, elementwise_affine=True)
+         self.norm_k = torch.nn.RMSNorm(dim_head * heads, eps=eps, elementwise_affine=True)
+
+         self.add_k_proj = self.add_v_proj = None
+         if added_kv_proj_dim is not None:
+             self.add_k_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
+             self.add_v_proj = torch.nn.Linear(added_kv_proj_dim, self.inner_dim, bias=True)
+             self.norm_added_k = torch.nn.RMSNorm(dim_head * heads, eps=eps)
+
+         self.is_cross_attention = cross_attention_dim_head is not None
+
+         self.set_processor(processor)
+
+     def fuse_projections(self):
+         if getattr(self, "fused_projections", False):
+             return
+
+         if self.cross_attention_dim_head is None:
+             concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data])
+             concatenated_bias = torch.cat([self.to_q.bias.data, self.to_k.bias.data, self.to_v.bias.data])
+             out_features, in_features = concatenated_weights.shape
+             with torch.device("meta"):
+                 self.to_qkv = nn.Linear(in_features, out_features, bias=True)
+             self.to_qkv.load_state_dict(
+                 {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+             )
+         else:
+             concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+             concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
+             out_features, in_features = concatenated_weights.shape
+             with torch.device("meta"):
+                 self.to_kv = nn.Linear(in_features, out_features, bias=True)
+             self.to_kv.load_state_dict(
+                 {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+             )
+
+         if self.added_kv_proj_dim is not None:
+             concatenated_weights = torch.cat([self.add_k_proj.weight.data, self.add_v_proj.weight.data])
+             concatenated_bias = torch.cat([self.add_k_proj.bias.data, self.add_v_proj.bias.data])
+             out_features, in_features = concatenated_weights.shape
+             with torch.device("meta"):
+                 self.to_added_kv = nn.Linear(in_features, out_features, bias=True)
+             self.to_added_kv.load_state_dict(
+                 {"weight": concatenated_weights, "bias": concatenated_bias}, strict=True, assign=True
+             )
+
+         self.fused_projections = True
+
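+     # Why `fuse_projections` is safe (sketch; the tensors named here are
+     # assumptions for illustration): concatenating the three weight matrices
+     # along the output dimension and chunking the fused result reproduces the
+     # separate projections exactly:
+     #
+     #     >>> w = torch.cat([attn.to_q.weight, attn.to_k.weight, attn.to_v.weight])
+     #     >>> b = torch.cat([attn.to_q.bias, attn.to_k.bias, attn.to_v.bias])
+     #     >>> q, k, v = F.linear(x, w, b).chunk(3, dim=-1)
+     #     >>> # q, k, v == attn.to_q(x), attn.to_k(x), attn.to_v(x)
+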
+     @torch.no_grad()
+     def unfuse_projections(self):
+         if not getattr(self, "fused_projections", False):
+             return
+
+         if hasattr(self, "to_qkv"):
+             delattr(self, "to_qkv")
+         if hasattr(self, "to_kv"):
+             delattr(self, "to_kv")
+         if hasattr(self, "to_added_kv"):
+             delattr(self, "to_added_kv")
+
+         self.fused_projections = False
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         **kwargs,
+     ) -> torch.Tensor:
+         return self.processor(self, hidden_states, encoder_hidden_states, attention_mask, rotary_emb, **kwargs)
+
+
+ class WanImageEmbedding(torch.nn.Module):
+     def __init__(self, in_features: int, out_features: int, pos_embed_seq_len=None):
+         super().__init__()
+
+         self.norm1 = FP32LayerNorm(in_features)
+         self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
+         self.norm2 = FP32LayerNorm(out_features)
+         if pos_embed_seq_len is not None:
+             self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_seq_len, in_features))
+         else:
+             self.pos_embed = None
+
+     def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
+         if self.pos_embed is not None:
+             batch_size, seq_len, embed_dim = encoder_hidden_states_image.shape
+             encoder_hidden_states_image = encoder_hidden_states_image.view(-1, 2 * seq_len, embed_dim)
+             encoder_hidden_states_image = encoder_hidden_states_image + self.pos_embed
+
+         hidden_states = self.norm1(encoder_hidden_states_image)
+         hidden_states = self.ff(hidden_states)
+         hidden_states = self.norm2(hidden_states)
+         return hidden_states
+
+
+ class WanTimeTextImageEmbedding(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         time_freq_dim: int,
+         time_proj_dim: int,
+         text_embed_dim: int,
+         image_embed_dim: Optional[int] = None,
+         pos_embed_seq_len: Optional[int] = None,
+     ):
+         super().__init__()
+
+         self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+         self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+         self.act_fn = nn.SiLU()
+         self.time_proj = nn.Linear(dim, time_proj_dim)
+         self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
+
+         self.image_embedder = None
+         if image_embed_dim is not None:
+             self.image_embedder = WanImageEmbedding(image_embed_dim, dim, pos_embed_seq_len=pos_embed_seq_len)
+
+     def forward(
+         self,
+         timestep: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         encoder_hidden_states_image: Optional[torch.Tensor] = None,
+         timestep_seq_len: Optional[int] = None,
+     ):
+         timestep = self.timesteps_proj(timestep)
+         if timestep_seq_len is not None:
+             timestep = timestep.unflatten(0, (-1, timestep_seq_len))
+
+         time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
+         if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
+             timestep = timestep.to(time_embedder_dtype)
+         temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
+         timestep_proj = self.time_proj(self.act_fn(temb))
+
+         encoder_hidden_states = self.text_embedder(encoder_hidden_states)
+         if encoder_hidden_states_image is not None:
+             encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
+
+         return temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
+
+
+ class WanRotaryPosEmbed(nn.Module):
+     def __init__(
+         self,
+         attention_head_dim: int,
+         patch_size: Tuple[int, int, int],
+         max_seq_len: int,
+         theta: float = 10000.0,
+     ):
+         super().__init__()
+
+         self.attention_head_dim = attention_head_dim
+         self.patch_size = patch_size
+         self.max_seq_len = max_seq_len
+
+         h_dim = w_dim = 2 * (attention_head_dim // 6)
+         t_dim = attention_head_dim - h_dim - w_dim
+         freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
+
+         freqs_cos = []
+         freqs_sin = []
+
+         for dim in [t_dim, h_dim, w_dim]:
+             freq_cos, freq_sin = get_1d_rotary_pos_embed(
+                 dim,
+                 max_seq_len,
+                 theta,
+                 use_real=True,
+                 repeat_interleave_real=True,
+                 freqs_dtype=freqs_dtype,
+             )
+             freqs_cos.append(freq_cos)
+             freqs_sin.append(freq_sin)
+
+         self.register_buffer("freqs_cos", torch.cat(freqs_cos, dim=1), persistent=False)
+         self.register_buffer("freqs_sin", torch.cat(freqs_sin, dim=1), persistent=False)
+
+     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         batch_size, num_channels, num_frames, height, width = hidden_states.shape
+         p_t, p_h, p_w = self.patch_size
+         ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
+
+         split_sizes = [
+             self.attention_head_dim - 2 * (self.attention_head_dim // 3),
+             self.attention_head_dim // 3,
+             self.attention_head_dim // 3,
+         ]
+
+         freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
+         freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
+
+         freqs_cos_f = freqs_cos[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_cos_h = freqs_cos[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_cos_w = freqs_cos[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+         freqs_sin_f = freqs_sin[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_sin_h = freqs_sin[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+         freqs_sin_w = freqs_sin[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+         freqs_cos = torch.cat([freqs_cos_f, freqs_cos_h, freqs_cos_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+         freqs_sin = torch.cat([freqs_sin_f, freqs_sin_h, freqs_sin_w], dim=-1).reshape(1, ppf * pph * ppw, 1, -1)
+
+         return freqs_cos, freqs_sin
+
+
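+ # Worked dimension split for WanRotaryPosEmbed with attention_head_dim = 128
+ # (the head_dim used elsewhere in this file): h_dim = w_dim = 2 * (128 // 6)
+ # = 42 and t_dim = 128 - 42 - 42 = 44, which matches `split_sizes` in forward,
+ # [128 - 2 * (128 // 3), 128 // 3, 128 // 3] = [44, 42, 42]. Time, height and
+ # width thus get factorized 1D rotary tables that are broadcast over the
+ # (frames, height, width) grid and concatenated per token.
+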
+ @maybe_allow_in_graph
+ class WanTransformerBlockOG(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         ffn_dim: int,
+         num_heads: int,
+         qk_norm: str = "rms_norm_across_heads",
+         cross_attn_norm: bool = False,
+         eps: float = 1e-6,
+         added_kv_proj_dim: Optional[int] = None,
+     ):
+         super().__init__()
+
+         # 1. Self-attention
+         self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+         self.attn1 = WanAttention(
+             dim=dim,
+             heads=num_heads,
+             dim_head=dim // num_heads,
+             eps=eps,
+             cross_attention_dim_head=None,
+             # WanAttnProcessor requires return_attention_maps; this legacy block
+             # does not use the maps, so disable them.
+             processor=WanAttnProcessor(return_attention_maps=False),
+         )
+
+         # 2. Cross-attention
+         self.attn2 = WanAttention(
+             dim=dim,
+             heads=num_heads,
+             dim_head=dim // num_heads,
+             eps=eps,
+             added_kv_proj_dim=added_kv_proj_dim,
+             cross_attention_dim_head=dim // num_heads,
+             processor=WanAttnProcessor(return_attention_maps=False),
+         )
+         self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+         # 3. Feed-forward
+         self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+         self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+         self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         encoder_hidden_states: torch.Tensor,
+         temb: torch.Tensor,
+         rotary_emb: torch.Tensor,
+     ) -> torch.Tensor:
+         if temb.ndim == 4:
+             # temb: batch_size, seq_len, 6, inner_dim (wan2.2 ti2v)
+             shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                 self.scale_shift_table.unsqueeze(0) + temb.float()
+             ).chunk(6, dim=2)
+             # batch_size, seq_len, 1, inner_dim
+             shift_msa = shift_msa.squeeze(2)
+             scale_msa = scale_msa.squeeze(2)
+             gate_msa = gate_msa.squeeze(2)
+             c_shift_msa = c_shift_msa.squeeze(2)
+             c_scale_msa = c_scale_msa.squeeze(2)
+             c_gate_msa = c_gate_msa.squeeze(2)
+         else:
+             # temb: batch_size, 6, inner_dim (wan2.1/wan2.2 14B)
+             shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+                 self.scale_shift_table + temb.float()
+             ).chunk(6, dim=1)
+
+         # 1. Self-attention (the processor returns (output, attn_weights))
+         norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+         attn_output, _ = self.attn1(norm_hidden_states, None, None, rotary_emb)
+         hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+
+         # 2. Cross-attention
+         norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+         attn_output, _ = self.attn2(norm_hidden_states, encoder_hidden_states, None, None)
+         hidden_states = hidden_states + attn_output
+
+         # 3. Feed-forward
+         norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
+             hidden_states
+         )
+         ff_output = self.ffn(norm_hidden_states)
+         hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+
+         return hidden_states
+
+ @maybe_allow_in_graph
+ class WanTransformerBlock(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         ffn_dim: int,
+         num_heads: int,
+         return_attention_maps: bool,
+         qk_norm: str = "rms_norm_across_heads",
+         eps: float = 1e-6,
+     ):
+         super().__init__()
+
+         # 1. Self-attention
+         self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+         self.attn1 = WanAttention(
+             dim=dim,
+             heads=num_heads,
+             dim_head=dim // num_heads,
+             eps=eps,
+             cross_attention_dim_head=None,
+             processor=WanAttnProcessor(return_attention_maps=return_attention_maps),
+         )
+
+         # 2. Feed-forward
+         self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+         self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+         # 3. Curriculum learning parameter for spatial attention
+         self.attention_window = -1  # -1 = full attention (default)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         rotary_emb: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         attn_weights = None
+
+         # 1. Self-attention
+         norm_hidden_states = self.norm1(hidden_states.float()).type_as(hidden_states)
+         attn_output, attn_weights = self.attn1(norm_hidden_states, None, attention_mask, rotary_emb)
+         hidden_states = (hidden_states.float() + attn_output).type_as(hidden_states)
+
+         # 2. Feed-forward
+         norm_hidden_states = self.norm3(hidden_states.float()).type_as(hidden_states)
+         ff_output = self.ffn(norm_hidden_states)
+         hidden_states = (hidden_states.float() + ff_output.float()).type_as(hidden_states)
+
+         return hidden_states, attn_weights
+
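+ # The `attention_window` knob above only stores the window size; a mask that
+ # realizes it could look like this hypothetical sketch (an assumption, not
+ # code from this repo): each token attends only within +-w positions.
+ #
+ #     >>> idx = torch.arange(seq_len)
+ #     >>> keep = (idx[None, :] - idx[:, None]).abs() <= w
+ #     >>> bias = torch.zeros(seq_len, seq_len).masked_fill(~keep, float("-inf"))
+ #     >>> # pass `bias` as `attention_mask` to WanTransformerBlock.forward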
+
+ class WanTransformer3DModel(
+     ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
+ ):
+     r"""
+     A Transformer model for video-like data used in the Wan model.
+
+     Args:
+         patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+             3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+         num_attention_heads (`int`, defaults to `40`):
+             The number of attention heads.
+         attention_head_dim (`int`, defaults to `128`):
+             The number of channels in each head.
+         in_channels (`int`, defaults to `16`):
+             The number of channels in the input.
+         out_channels (`int`, defaults to `16`):
+             The number of channels in the output.
+         text_dim (`int`, defaults to `512`):
+             Input dimension for text embeddings.
+         freq_dim (`int`, defaults to `256`):
+             Dimension for sinusoidal time embeddings.
+         ffn_dim (`int`, defaults to `13824`):
+             Intermediate dimension in feed-forward network.
+         num_layers (`int`, defaults to `40`):
+             The number of layers of transformer blocks to use.
+         window_size (`Tuple[int]`, defaults to `(-1, -1)`):
+             Window size for local attention (-1 indicates global attention).
+         cross_attn_norm (`bool`, defaults to `True`):
+             Enable cross-attention normalization.
+         qk_norm (`bool`, defaults to `True`):
+             Enable query/key normalization.
+         eps (`float`, defaults to `1e-6`):
+             Epsilon value for normalization layers.
+         add_img_emb (`bool`, defaults to `False`):
+             Whether to use img_emb.
+         added_kv_proj_dim (`int`, *optional*, defaults to `None`):
+             The number of channels to use for the added key and value projections. If `None`, no projection is used.
+     """
+
+     _supports_gradient_checkpointing = True
+     _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+     _no_split_modules = ["WanTransformerBlock"]
+     _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+     _keys_to_ignore_on_load_unexpected = ["norm_added_q"]
+     _repeated_blocks = ["WanTransformerBlock"]
+
+     @register_to_config
+     def __init__(
+         self,
+         num_attention_heads: int = 40,
+         attention_head_dim: int = 128,
+         ffn_dim: int = 13824,
+         num_layers: int = 40,
+         qk_norm: Optional[str] = "rms_norm_across_heads",
+         eps: float = 1e-6,
+         gradient_checkpointing: bool = False,
+     ) -> None:
+         super().__init__()
+
+         inner_dim = num_attention_heads * attention_head_dim
+
+         # Transformer blocks
+         self.blocks = nn.ModuleList(
+             [
+                 WanTransformerBlock(
+                     inner_dim, ffn_dim, num_attention_heads, False, qk_norm, eps
+                 )
+                 for _ in range(num_layers)
+             ]
+         )
+
+         self.gradient_checkpointing = gradient_checkpointing
+
+ class WanDecoderTransformer(torch.nn.Module):
+     def __init__(
+         self,
+         chunk: int = 2,
+         rope_max_seq_len=None,
+         patch_size=[(1, 2, 2), (1, 4, 4), (1, 8, 8)],
+         num_layers: int = 30,
+         num_heads=12,
+         head_dim=128,
+         channels=[384, 192, 192],
+         use_lora: bool = False,
+         lora_rank: int = 8,
+         lora_alpha: int = 32,
+         lora_dropout: float = 0.1,
+         reusing: bool = False,
+         pretrained: bool = True,
+         gradient_checkpointing: bool = False,
+     ) -> None:
+         super().__init__()
+
+         self.chunk = chunk
+         self.use_lora = use_lora
+         self.attn_weights = []
+
+         # Initialize the transformer
+         if pretrained:
+             self.transformer = WanTransformer3DModel.from_pretrained(
+                 "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+                 subfolder="transformer",
+                 num_attention_heads=12,
+                 attention_head_dim=128,
+                 num_layers=30,
+                 ffn_dim=8960,
+                 eps=1e-6,
+                 qk_norm="rms_norm_across_heads",
+                 gradient_checkpointing=gradient_checkpointing,
+                 torch_dtype=torch.float32,
+                 device_map=None,
+                 ignore_mismatched_sizes=True,
+                 strict=False,
+             )
+         else:
+             self.transformer = WanTransformer3DModel(
+                 num_attention_heads=num_heads,
+                 attention_head_dim=head_dim,
+                 num_layers=num_layers,
+                 ffn_dim=8960,
+                 eps=1e-6,
+                 qk_norm="rms_norm_across_heads",
+                 gradient_checkpointing=gradient_checkpointing,
+             )
+
+         # Apply LoRA if requested
+         if self.use_lora:
+             self._apply_lora(lora_rank, lora_alpha, lora_dropout)
+
+         # Configuration
+         self.channels = channels
+         self.num_attention_heads = num_heads
+         self.attention_head_dim = head_dim
+         self.num_layers = num_layers
+         self.reusing = reusing
+         inner_dim = self.num_attention_heads * self.attention_head_dim
+
+         # Ensure each image has 1560 tokens
+         seq_len_per_chunk = 1560
+         chunk = self.chunk
+         self.patch_size = patch_size
+         if rope_max_seq_len is None:
+             self.rope_max_seq_len = [seq_len_per_chunk * (chunk + 1), seq_len_per_chunk * (2 * chunk), seq_len_per_chunk * (4 * chunk - 2)]
+         else:
+             self.rope_max_seq_len = rope_max_seq_len
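+         # Where 1560 comes from (illustrative arithmetic; the 480x832 input
+         # size is an assumption): an 8x-spatial VAE gives a 60x104 latent grid,
+         # and the first-stage patch size (1, 2, 2) then yields
+         # (60 / 2) * (104 / 2) = 30 * 52 = 1560 tokens per frame.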
719
+ eps = 1e-6
720
+
721
+ # 1. Patch & position embedding
722
+ self.patch_embeddings = nn.ModuleList([
723
+ nn.Conv3d(channels[0], inner_dim, kernel_size=self.patch_size[0], stride=self.patch_size[0]), # First upblock output
724
+ nn.Conv3d(channels[1], inner_dim, kernel_size=self.patch_size[1], stride=self.patch_size[1]), # Second upblock output
725
+ nn.Conv3d(channels[2], inner_dim, kernel_size=self.patch_size[2], stride=self.patch_size[2]), # Third upblock output
726
+ ])
727
+
728
+ self.rope = nn.ModuleList([
729
+ WanRotaryPosEmbed(self.attention_head_dim, self.patch_size[i], self.rope_max_seq_len[i]) for i in range(3)
730
+ ])
731
+
732
+ # Output norms & projections for three resolutions
733
+ self.norm_outs = nn.ModuleList([
734
+ FP32LayerNorm(inner_dim, eps, elementwise_affine=False),
735
+ FP32LayerNorm(inner_dim, eps, elementwise_affine=False),
736
+ FP32LayerNorm(inner_dim, eps, elementwise_affine=False),
737
+ ])
738
+
739
+ self.proj_outs = nn.ModuleList([
740
+ nn.Linear(inner_dim, channels[0] * math.prod(self.patch_size[0])),
741
+ nn.Linear(inner_dim, channels[1] * math.prod(self.patch_size[1])),
742
+ nn.Linear(inner_dim, channels[2] * math.prod(self.patch_size[2])),
743
+ ])
744
+
745
+ self.initialize_decoder_components()
746
+
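With the default `chunk=2` the per-stage RoPE budgets evaluate to 1560 * 3 = 4680, 1560 * 4 = 6240, and 1560 * 6 = 9360 tokens. A hedged construction sketch; `pretrained=False` skips the Hub download, and the assertion just restates the arithmetic above:

```python
# Sketch: build the decoder offline and check the derived RoPE budgets.
from src.models.Wan.transformer_wan import WanDecoderTransformer

decoder = WanDecoderTransformer(chunk=2, pretrained=False)

# seq_len_per_chunk = 1560, chunk = 2:
#   stage 0: 1560 * (2 + 1)     = 4680
#   stage 1: 1560 * (2 * 2)     = 6240
#   stage 2: 1560 * (4 * 2 - 2) = 9360
assert decoder.rope_max_seq_len == [4680, 6240, 9360]
```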
+     def initialize_decoder_components(self):
+         """Initialize the patch embeddings and output projections."""
+         # Re-initialize patch embeddings with their default scheme
+         for patch_embed in self.patch_embeddings:
+             patch_embed.reset_parameters()
+
+         # Xavier-initialize output projections, zero their biases
+         for proj_out in self.proj_outs:
+             nn.init.xavier_uniform_(proj_out.weight)
+             if proj_out.bias is not None:
+                 nn.init.zeros_(proj_out.bias)
+
+     def _apply_lora(self, lora_rank, lora_alpha, lora_dropout):
+         """Wrap the transformer with LoRA adapters on its attention and FFN projections."""
+         lora_config = LoraConfig(
+             r=lora_rank,
+             lora_alpha=lora_alpha,
+             target_modules=[
+                 "to_q", "to_k", "to_v", "to_out.0",
+                 "ffn.net.0.proj", "ffn.net.2",
+             ],
+             lora_dropout=lora_dropout,
+             bias="none",
+             task_type=TaskType.FEATURE_EXTRACTION,
+         )
+
+         self.transformer = get_peft_model(self.transformer, lora_config)
+
+     def get_lora_target_modules(self):
+         """Return the target modules configured for LoRA on the wrapped transformer."""
+         if not self.use_lora:
+             return []
+
+         transformer = getattr(self, "transformer", None)
+         if transformer is None:
+             return []
+
+         peft_config = getattr(transformer, "peft_config", None)
+         if not peft_config:
+             return []
+
+         active_adapter = getattr(transformer, "active_adapter", None)
+         if active_adapter and active_adapter in peft_config:
+             config = peft_config[active_adapter]
+         else:
+             config = next(iter(peft_config.values()))
+
+         target_modules = getattr(config, "target_modules", None)
+         if target_modules is None:
+             return []
+
+         return list(target_modules)
+
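A short sketch of the LoRA path: enable adapters at construction time, then read back which projections PEFT wraps. The printed list mirrors `target_modules` above; PEFT may store it as a set, so it is sorted here for a deterministic result:

```python
# Sketch: decoder with LoRA adapters enabled (offline), inspecting targets.
from src.models.Wan.transformer_wan import WanDecoderTransformer

decoder = WanDecoderTransformer(pretrained=False, use_lora=True, lora_rank=8)
print(sorted(decoder.get_lora_target_modules()))
# ['ffn.net.0.proj', 'ffn.net.2', 'to_k', 'to_out.0', 'to_q', 'to_v']
```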
+     def fuse_lora_weights(self):
+         """
+         Fuse LoRA weights into the base model weights.
+
+         This merges the low-rank adaptation matrices (A and B) with the original weights:
+             W' = W + (scaling * B @ A)
+
+         After fusing, the model behaves identically but without the LoRA overhead,
+         making it more efficient for inference.
+
+         Returns:
+             bool: True if fusion was successful, False otherwise
+         """
+         if not self.use_lora:
+             print("⚠ LoRA is not enabled, nothing to fuse")
+             return False
+
+         try:
+             print("Fusing LoRA weights into base model...")
+             # PEFT merges the adapters into the base weights and drops the wrappers
+             self.transformer = self.transformer.merge_and_unload()
+
+             # LoRA is now folded into the base weights
+             self.use_lora = False
+
+             print("✓ Successfully fused LoRA weights into base model")
+             return True
+
+         except Exception as e:
+             print(f"✗ Error fusing LoRA weights: {e}")
+             return False
+
+     def unfuse_lora_weights(self):
+         """
+         Unfuse/unmerge LoRA weights from the base model.
+
+         This separates the LoRA weights from the base weights if they were
+         previously merged. Note: this only works while the model still has LoRA
+         adapters loaded; after fuse_lora_weights() the adapters are gone and
+         cannot be unmerged.
+
+         Returns:
+             bool: True if unfusion was successful, False otherwise
+         """
+         if not self.use_lora:
+             print("⚠ LoRA is not enabled or already unfused")
+             return False
+
+         try:
+             print("Unfusing LoRA weights from base model...")
+             # PEFT subtracts the merged adapter deltas from the base weights
+             self.transformer.unmerge_adapter()
+
+             print("✓ Successfully unfused LoRA weights from base model")
+             return True
+
+         except Exception as e:
+             print(f"✗ Error unfusing LoRA weights: {e}")
+             return False
+
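To make the fused update concrete, here is a standalone numerical check of the merge rule W' = W + (scaling * B @ A) on a bare `nn.Linear`; it uses only PyTorch and is independent of PEFT:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
base = nn.Linear(16, 16, bias=False)
rank, alpha = 4, 32
scaling = alpha / rank                 # PEFT's default scaling factor
A = torch.randn(rank, 16) * 0.01       # lora_A: (r, in_features)
B = torch.randn(16, rank) * 0.01       # lora_B: (out_features, r)

x = torch.randn(2, 16)
unfused = base(x) + scaling * (x @ A.T @ B.T)  # adapter applied at runtime

fused = nn.Linear(16, 16, bias=False)
with torch.no_grad():
    fused.weight.copy_(base.weight + scaling * (B @ A))  # W' = W + s*B@A

assert torch.allclose(unfused, fused(x), atol=1e-5)
```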
+     def get_map(self):
+         """Return the attention maps collected during forward passes."""
+         return self.attn_weights
+
+     def clear_map(self):
+         """Reset the collected attention maps."""
+         self.attn_weights = []
+
+     def create_spatial_mask(self, attention_window, num_frames, height, width, device):
+         """
+         Create an attention mask for self-attention over the flattened video tokens.
+
+         Each query token at (t, h, w) is restricted to attend to exactly one key:
+         the token at the same spatial position (h, w) in the first frame (t=0).
+
+         Args:
+             attention_window: If negative, return None (full attention); otherwise
+                 apply the frame-0 spatial restriction described above
+             num_frames: Number of temporal frames
+             height: Spatial height of the feature map
+             width: Spatial width of the feature map
+             device: torch device
+
+         Returns:
+             Attention mask [1, 1, seq_len, seq_len], or None for full attention
+         """
+         if attention_window < 0:
+             return None  # Full attention
+
+         seq_len = num_frames * height * width
+
+         # Tokens are ordered as [t0_h0_w0, t0_h0_w1, ..., t0_hH_wW, t1_h0_w0, ...]
+         # Query token i at (t_q, h_q, w_q) attends to the key token at (t=0, h_q, w_q).
+
+         # Indices for spatial positions (h, w) - reused across frames
+         h_indices = torch.arange(height, device=device).repeat_interleave(width)  # [0,0,...,0,1,1,...,1,...]
+         w_indices = torch.arange(width, device=device).repeat(height)  # [0,1,...,W-1,0,1,...,W-1,...]
+
+         # For each spatial position, the corresponding key index in the first frame
+         key_indices_per_spatial_pos = h_indices * width + w_indices  # [height * width]
+
+         # Repeat the pattern for all frames (each query frame uses the same spatial mapping)
+         key_indices = key_indices_per_spatial_pos.repeat(num_frames)  # [seq_len]
+
+         # Initialize with -inf (block all attention), then open one key per query
+         attention_mask = torch.full((seq_len, seq_len), float('-inf'), dtype=torch.float32, device=device)
+         query_indices = torch.arange(seq_len, device=device)
+         attention_mask[query_indices, key_indices] = 0.0
+
+         # Add batch and head dimensions: [1, 1, seq_len, seq_len]
+         attention_mask = attention_mask.unsqueeze(0).unsqueeze(0)
+
+         return attention_mask
+
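A quick check of the mask semantics on a tiny grid: with 2 frames of 2×2 tokens, every query row has exactly one zero entry, pointing at the same (h, w) position in frame 0. Note the dense `(seq_len, seq_len)` buffer grows quadratically, so this is only cheap for small feature maps:

```python
# Sketch: inspect the mask for T=2 frames of 2x2 tokens (seq_len = 8).
from src.models.Wan.transformer_wan import WanDecoderTransformer

decoder = WanDecoderTransformer(pretrained=False)
mask = decoder.create_spatial_mask(
    attention_window=0, num_frames=2, height=2, width=2, device="cpu"
)
print(mask.shape)                      # torch.Size([1, 1, 8, 8])
allowed = (mask[0, 0] == 0).nonzero()  # one allowed (query, key) pair per row
print(allowed[:, 1].tolist())          # [0, 1, 2, 3, 0, 1, 2, 3]
```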
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         stage_idx: int = 0,
+         return_dict: bool = True,
+         window_size=-1,
+         attention_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+         """
+         Args:
+             hidden_states: Input tensor (B, C, T, H, W) where C is 384 or 192
+             stage_idx: 0 for the first stage (384 channels); 1 or 2 for the later
+                 stages (192 channels each)
+             return_dict: Whether to return a Transformer2DModelOutput or a tuple
+             window_size: Negative for full attention; otherwise apply the frame-0
+                 spatial mask from create_spatial_mask
+             attention_kwargs: Additional attention arguments (may carry a LoRA "scale")
+
+         Note: attention maps accumulate in self.attn_weights across calls;
+         use clear_map() to reset them.
+         """
+         assert stage_idx in [0, 1, 2], f"stage_idx must be 0, 1, or 2, got {stage_idx}"
+
+         if attention_kwargs is not None:
+             attention_kwargs = attention_kwargs.copy()
+             lora_scale = attention_kwargs.pop("scale", 1.0)
+         else:
+             lora_scale = 1.0
+
+         if USE_PEFT_BACKEND:
+             scale_lora_layers(self, lora_scale)
+         elif lora_scale != 1.0:
+             logger.warning(
+                 "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+             )
+
+         # Get input dimensions
+         batch_size, num_channels, num_frames, height, width = hidden_states.shape
+         p_t, p_h, p_w = self.patch_size[stage_idx]
+
+         # Keep the exact output shape even when T/H/W are not divisible by the
+         # patch size: pad before patch embedding and crop back after unpatchify.
+         pad_t = (p_t - (num_frames % p_t)) % p_t
+         pad_h = (p_h - (height % p_h)) % p_h
+         pad_w = (p_w - (width % p_w)) % p_w
+         if pad_t or pad_h or pad_w:
+             hidden_states = F.pad(hidden_states, (0, pad_w, 0, pad_h, 0, pad_t))
+
+         _, _, padded_num_frames, padded_height, padded_width = hidden_states.shape
+         post_patch_num_frames = padded_num_frames // p_t
+         post_patch_height = padded_height // p_h
+         post_patch_width = padded_width // p_w
+
+         # Select the patch embedding and RoPE table for this stage
+         patch_embedding = self.patch_embeddings[stage_idx]
+         rotary_emb = self.rope[stage_idx](hidden_states)
+
+         # Patch embedding
+         hidden_states = patch_embedding(hidden_states)
+         hidden_states = hidden_states.flatten(2).transpose(1, 2)  # (B, seq_len, inner_dim)
+         assert hidden_states.shape[1] <= self.rope_max_seq_len[stage_idx], (
+             f"Sequence length {hidden_states.shape[1]} is greater than maximum sequence length "
+             f"{self.rope_max_seq_len[stage_idx]} for stage {stage_idx}"
+         )
+
+         # Select transformer blocks: share the full stack across stages when
+         # reusing, otherwise give each stage its own contiguous third.
+         if self.reusing:
+             transformer_blocks = self.transformer.blocks
+         else:
+             blocks_per_stage = self.num_layers // 3
+             transformer_blocks = self.transformer.blocks[stage_idx * blocks_per_stage : (stage_idx + 1) * blocks_per_stage]
+
+         # Run transformer blocks
+         attention_mask = self.create_spatial_mask(
+             window_size,
+             post_patch_num_frames,
+             post_patch_height,
+             post_patch_width,
+             hidden_states.device,
+         )
+         if torch.is_grad_enabled() and getattr(self.transformer, 'gradient_checkpointing', False):
+             for block in transformer_blocks:
+                 hidden_states, attn_weight = torch.utils.checkpoint.checkpoint(
+                     block,
+                     hidden_states,
+                     rotary_emb,
+                     attention_mask,
+                     use_reentrant=False,
+                 )
+                 self.attn_weights.append(attn_weight)
+         else:
+             for block in transformer_blocks:
+                 hidden_states, attn_weight = block(
+                     hidden_states,
+                     rotary_emb,
+                     attention_mask,
+                 )
+                 self.attn_weights.append(attn_weight)
+
+         # Output norm & projection
+         norm_out = self.norm_outs[stage_idx]
+         proj_out = self.proj_outs[stage_idx]
+
+         hidden_states = norm_out(hidden_states.float()).type_as(hidden_states)
+         hidden_states = proj_out(hidden_states)
+
+         # Unpatchify: (B, L, C * pt * ph * pw) -> (B, C, T, H, W)
+         out_channels = self.channels[stage_idx]
+         hidden_states = hidden_states.reshape(
+             batch_size, post_patch_num_frames, post_patch_height, post_patch_width,
+             p_t, p_h, p_w, out_channels
+         )
+         hidden_states = hidden_states.permute(0, 7, 1, 4, 2, 5, 3, 6)
+         output = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+         if pad_t or pad_h or pad_w:
+             output = output[:, :, :num_frames, :height, :width]
+
+         if USE_PEFT_BACKEND:
+             unscale_lora_layers(self, lora_scale)
+
+         if not return_dict:
+             return (output,)
+
+         return Transformer2DModelOutput(sample=output)
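Putting the pieces together, a hedged end-to-end sketch of one stage: stage 0 takes 384 channels with patch size (1, 2, 2), so a 9-frame 30×52 feature map becomes 9 * 15 * 26 = 3510 tokens, inside the 4680-token stage-0 budget, and the pad/crop logic returns exactly the input resolution:

```python
# Sketch: run stage 0 of the decoder on a dummy feature map (offline;
# shapes are illustrative, not the production configuration).
import torch
from src.models.Wan.transformer_wan import WanDecoderTransformer

decoder = WanDecoderTransformer(pretrained=False).eval()

feat = torch.randn(1, 384, 9, 30, 52)  # (B, C, T, H, W), stage-0 channels
with torch.no_grad():
    out = decoder(feat, stage_idx=0, window_size=-1, return_dict=False)[0]

print(out.shape)     # torch.Size([1, 384, 9, 30, 52]) - unchanged resolution
decoder.clear_map()  # attention maps accumulate across calls; reset them
```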
src/models/__init__.py ADDED
@@ -0,0 +1 @@
+