Sapir committed on
Commit
86b1a7e
1 Parent(s): e7d5e3c
xora/examples/image_to_video.py ADDED
@@ -0,0 +1,87 @@
1
+ import torch
2
+ from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
3
+ from xora.models.transformers.transformer3d import Transformer3DModel
4
+ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
5
+ from xora.schedulers.rf import RectifiedFlowScheduler
6
+ from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
7
+ from pathlib import Path
8
+ from transformers import T5EncoderModel
9
+
10
+
11
+ model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
12
+ vae_local_path = Path("/opt/models/checkpoints/vae_training/causal_vvae_32x32x8_420m_cont_32/step_2296000")
13
+ dtype = torch.float32
14
+ vae = CausalVideoAutoencoder.from_pretrained(
15
+ pretrained_model_name_or_path=vae_local_path,
16
+ revision=False,
17
+ torch_dtype=torch.bfloat16,
18
+ load_in_8bit=False,
19
+ ).cuda()
20
+ transformer_config_path = Path("/opt/txt2img/txt2img/config/transformer3d/xora_v1.2-L.json")
21
+ transformer_config = Transformer3DModel.load_config(transformer_config_path)
22
+ transformer = Transformer3DModel.from_config(transformer_config)
23
+ transformer_local_path = Path("/opt/models/logs/v1.2-vae-mf-medHR-mr-cvae-first-frame-cond-4k-seq/ckpt/01822000/model.pt")
24
+ transformer_ckpt_state_dict = torch.load(transformer_local_path)
25
+ transformer.load_state_dict(transformer_ckpt_state_dict, True)
26
+ transformer = transformer.cuda()
27
+ unet = transformer
28
+ scheduler_config_path = Path("/opt/txt2img/txt2img/config/scheduler/RF_SD3_shifted.json")
29
+ scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
30
+ scheduler = RectifiedFlowScheduler.from_config(scheduler_config)
31
+ patchifier = SymmetricPatchifier(patch_size=1)
32
+ # text_encoder = T5EncoderModel.from_pretrained("t5-v1_1-xxl")
33
+
34
+ submodel_dict = {
35
+ "unet": unet,
36
+ "transformer": transformer,
37
+ "patchifier": patchifier,
38
+ "text_encoder": None,
39
+ "scheduler": scheduler,
40
+ "vae": vae,
41
+
42
+ }
43
+
44
+ pipeline = VideoPixArtAlphaPipeline.from_pretrained(model_name_or_path,
45
+ safety_checker=None,
46
+ revision=None,
47
+ torch_dtype=dtype,
48
+ **submodel_dict,
49
+ )
50
+
51
+ num_inference_steps=20
52
+ num_images_per_prompt=2
53
+ guidance_scale=3
54
+ height=512
55
+ width=768
56
+ num_frames=57
57
+ frame_rate=25
58
+ # sample = {
59
+ # "prompt": "A cat", # (B, L, E)
60
+ # 'prompt_attention_mask': None, # (B , L)
61
+ # 'negative_prompt': "Ugly deformed",
62
+ # 'negative_prompt_attention_mask': None # (B , L)
63
+ # }
64
+
65
+ sample = torch.load("/opt/sample.pt")
66
+ for _, item in sample.items():
67
+ if item is not None:
68
+ item = item.cuda()
69
+ media_items = torch.load("/opt/sample_media.pt")
70
+
71
+ images = pipeline(
72
+ num_inference_steps=num_inference_steps,
73
+ num_images_per_prompt=num_images_per_prompt,
74
+ guidance_scale=guidance_scale,
75
+ generator=None,
76
+ output_type="pt",
77
+ callback_on_step_end=None,
78
+ height=height,
79
+ width=width,
80
+ num_frames=num_frames,
81
+ frame_rate=frame_rate,
82
+ **sample,
83
+ is_video=True,
84
+ vae_per_channel_normalize=True,
85
+ ).images
86
+
87
+ print()
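For reference, a minimal post-processing sketch for the script above (not part of the commit): it assumes the pipeline returns a tensor of shape (batch, channels, frames, height, width) with values in [0, 1] when output_type="pt" and is_video=True, and that imageio with its ffmpeg plugin is available.

import imageio

video = images[0]                                   # assumed (C, F, H, W), values in [0, 1]
frames = (video.clamp(0, 1) * 255).to(torch.uint8)  # convert to 8-bit
frames = frames.permute(1, 2, 3, 0).cpu().numpy()   # (F, H, W, C) frame stack for the writer
imageio.mimsave("sample_video.mp4", list(frames), fps=frame_rate)  # frame_rate is defined above (25)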
xora/models/autoencoders/causal_video_autoencoder.py CHANGED
@@ -8,11 +8,13 @@ import torch
8
  import numpy as np
9
  from einops import rearrange
10
  from torch import nn
 
11
 
12
  from xora.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
13
  from xora.models.autoencoders.pixel_norm import PixelNorm
14
  from xora.models.autoencoders.vae import AutoencoderKLWrapper
15
 
 
16
 
17
  class CausalVideoAutoencoder(AutoencoderKLWrapper):
18
  @classmethod
@@ -138,7 +140,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
138
  key = key.replace(k, v)
139
 
140
  if "norm" in key and key not in model_keys:
141
- print(f"Removing key {key} from state_dict as it is not present in the model")
142
  continue
143
 
144
  converted_state_dict[key] = value
 
8
  import numpy as np
9
  from einops import rearrange
10
  from torch import nn
11
+ from diffusers.utils import logging
12
 
13
  from xora.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
14
  from xora.models.autoencoders.pixel_norm import PixelNorm
15
  from xora.models.autoencoders.vae import AutoencoderKLWrapper
16
 
17
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
18
 
19
  class CausalVideoAutoencoder(AutoencoderKLWrapper):
20
  @classmethod
 
140
  key = key.replace(k, v)
141
 
142
  if "norm" in key and key not in model_keys:
143
+ logger.info(f"Removing key {key} from state_dict as it is not present in the model")
144
  continue
145
 
146
  converted_state_dict[key] = value
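Since the print was replaced with logger.info, the key-removal messages above are hidden by default; if they are needed while debugging checkpoint conversion, diffusers' logging verbosity can be raised before loading, e.g.:

from diffusers.utils import logging

logging.set_verbosity_info()  # surfaces logger.info messages such as the state_dict key removals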
xora/models/autoencoders/vae_encode.py CHANGED
@@ -1,44 +1,12 @@
1
  import torch
2
- from torch import nn
3
  from diffusers import AutoencoderKL
4
  from einops import rearrange
5
  from torch import Tensor
6
- from torch.nn import functional
7
 
8
 
9
  from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
10
-
11
- class Downsample3D(nn.Module):
12
- def __init__(self, dims, in_channels: int, out_channels: int, kernel_size: int = 3, padding: int = 1):
13
- super().__init__()
14
- stride: int = 2
15
- self.padding = padding
16
- self.in_channels = in_channels
17
- self.dims = dims
18
- self.conv = make_conv_nd(
19
- dims=dims,
20
- in_channels=in_channels,
21
- out_channels=out_channels,
22
- kernel_size=kernel_size,
23
- stride=stride,
24
- padding=padding,
25
- )
26
-
27
- def forward(self, x, downsample_in_time=True):
28
- conv = self.conv
29
- if self.padding == 0:
30
- if self.dims == 2:
31
- padding = (0, 1, 0, 1)
32
- else:
33
- padding = (0, 1, 0, 1, 0, 1 if downsample_in_time else 0)
34
-
35
- x = functional.pad(x, padding, mode="constant", value=0)
36
-
37
- if self.dims == (2, 1) and not downsample_in_time:
38
- return conv(x, skip_time_conv=True)
39
-
40
- return conv(x)
41
-
42
 
43
 
44
  def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae_per_channel_normalize=False) -> Tensor:
@@ -78,7 +46,7 @@ def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae
78
  if channels != 3:
79
  raise ValueError(f"Expects tensors with 3 channels, got {channels}.")
80
 
81
- if is_video_shaped and not isinstance(vae, (CausalVideoAutoencoder)):
82
  media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
83
  if split_size > 1:
84
  if len(media_items) % split_size != 0:
@@ -86,14 +54,16 @@ def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae
86
  encode_bs = len(media_items) // split_size
87
  # latents = [vae.encode(image_batch).latent_dist.sample() for image_batch in media_items.split(encode_bs)]
88
  latents = []
 
89
  for image_batch in media_items.split(encode_bs):
90
  latents.append(vae.encode(image_batch).latent_dist.sample())
 
91
  latents = torch.cat(latents, dim=0)
92
  else:
93
  latents = vae.encode(media_items).latent_dist.sample()
94
 
95
  latents = normalize_latents(latents, vae, vae_per_channel_normalize)
96
- if is_video_shaped and not isinstance(vae, (CausalVideoAutoencoder)):
97
  latents = rearrange(latents, "(b n) c h w -> b c n h w", b=batch_size)
98
  return latents
99
 
@@ -104,7 +74,7 @@ def vae_decode(
104
  is_video_shaped = latents.dim() == 5
105
  batch_size = latents.shape[0]
106
 
107
- if is_video_shaped and not isinstance(vae, (CausalVideoAutoencoder)):
108
  latents = rearrange(latents, "b c n h w -> (b n) c h w")
109
  if split_size > 1:
110
  if len(latents) % split_size != 0:
@@ -118,13 +88,13 @@ def vae_decode(
118
  else:
119
  images = _run_decoder(latents, vae, is_video, vae_per_channel_normalize)
120
 
121
- if is_video_shaped and not isinstance(vae, (CausalVideoAutoencoder)):
122
  images = rearrange(images, "(b n) c h w -> b c n h w", b=batch_size)
123
  return images
124
 
125
 
126
  def _run_decoder(latents: Tensor, vae: AutoencoderKL, is_video: bool, vae_per_channel_normalize=False) -> Tensor:
127
- if isinstance(vae, (CausalVideoAutoencoder)):
128
  *_, fl, hl, wl = latents.shape
129
  temporal_scale, spatial_scale, _ = get_vae_size_scale_factor(vae)
130
  latents = latents.to(vae.dtype)
@@ -148,7 +118,7 @@ def get_vae_size_scale_factor(vae: AutoencoderKL) -> float:
148
  else:
149
  down_blocks = len([block for block in vae.encoder.down_blocks if isinstance(block.downsample, Downsample3D)])
150
  spatial = vae.config.patch_size * 2**down_blocks
151
- temporal = vae.config.patch_size_t * 2 ** down_blocks if isinstance(vae) else 1
152
 
153
  return (temporal, spatial, spatial)
154
 
@@ -168,4 +138,4 @@ def un_normalize_latents(latents: Tensor, vae: AutoencoderKL, vae_per_channel_no
168
  + vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
169
  if vae_per_channel_normalize
170
  else latents / vae.config.scaling_factor
171
- )
 
1
  import torch
 
2
  from diffusers import AutoencoderKL
3
  from einops import rearrange
4
  from torch import Tensor
 
5
 
6
 
7
  from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
8
+ from xora.models.autoencoders.video_autoencoder import Downsample3D, VideoAutoencoder
9
+ import xora.utils.dist_util
10
 
11
 
12
  def vae_encode(media_items: Tensor, vae: AutoencoderKL, split_size: int = 1, vae_per_channel_normalize=False) -> Tensor:
 
46
  if channels != 3:
47
  raise ValueError(f"Expects tensors with 3 channels, got {channels}.")
48
 
49
+ if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
50
  media_items = rearrange(media_items, "b c n h w -> (b n) c h w")
51
  if split_size > 1:
52
  if len(media_items) % split_size != 0:
 
54
  encode_bs = len(media_items) // split_size
55
  # latents = [vae.encode(image_batch).latent_dist.sample() for image_batch in media_items.split(encode_bs)]
56
  latents = []
57
+ dist_util.execute_graph()
58
  for image_batch in media_items.split(encode_bs):
59
  latents.append(vae.encode(image_batch).latent_dist.sample())
60
+ dist_util.execute_graph()
61
  latents = torch.cat(latents, dim=0)
62
  else:
63
  latents = vae.encode(media_items).latent_dist.sample()
64
 
65
  latents = normalize_latents(latents, vae, vae_per_channel_normalize)
66
+ if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
67
  latents = rearrange(latents, "(b n) c h w -> b c n h w", b=batch_size)
68
  return latents
69
 
 
74
  is_video_shaped = latents.dim() == 5
75
  batch_size = latents.shape[0]
76
 
77
+ if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
78
  latents = rearrange(latents, "b c n h w -> (b n) c h w")
79
  if split_size > 1:
80
  if len(latents) % split_size != 0:
 
88
  else:
89
  images = _run_decoder(latents, vae, is_video, vae_per_channel_normalize)
90
 
91
+ if is_video_shaped and not isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
92
  images = rearrange(images, "(b n) c h w -> b c n h w", b=batch_size)
93
  return images
94
 
95
 
96
  def _run_decoder(latents: Tensor, vae: AutoencoderKL, is_video: bool, vae_per_channel_normalize=False) -> Tensor:
97
+ if isinstance(vae, (VideoAutoencoder, CausalVideoAutoencoder)):
98
  *_, fl, hl, wl = latents.shape
99
  temporal_scale, spatial_scale, _ = get_vae_size_scale_factor(vae)
100
  latents = latents.to(vae.dtype)
 
118
  else:
119
  down_blocks = len([block for block in vae.encoder.down_blocks if isinstance(block.downsample, Downsample3D)])
120
  spatial = vae.config.patch_size * 2**down_blocks
121
+ temporal = vae.config.patch_size_t * 2 ** down_blocks if isinstance(vae, VideoAutoencoder) else 1
122
 
123
  return (temporal, spatial, spatial)
124
 
 
138
  + vae.mean_of_means.to(latents.dtype).view(1, -1, 1, 1, 1)
139
  if vae_per_channel_normalize
140
  else latents / vae.config.scaling_factor
141
+ )
xora/models/autoencoders/video_autoencoder.py ADDED
@@ -0,0 +1,912 @@
1
+ import json
2
+ import os
3
+ from functools import partial
4
+ from types import SimpleNamespace
5
+ from typing import Any, Mapping, Optional, Tuple, Union
6
+
7
+ import torch
8
+ from einops import rearrange
9
+ from torch import nn
10
+ from torch.nn import functional
11
+
12
+ from diffusers.utils import logging
13
+
14
+ from txt2img.models.layers.nn import Identity
15
+ from xora.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
16
+ from xora.models.autoencoders.pixel_norm import PixelNorm
17
+ from xora.models.autoencoders.vae import AutoencoderKLWrapper
18
+
19
+ logger = logging.get_logger(__name__)
20
+
21
+
22
+ class VideoAutoencoder(AutoencoderKLWrapper):
23
+ @classmethod
24
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *args, **kwargs):
25
+ config_local_path = pretrained_model_name_or_path / "config.json"
26
+ config = cls.load_config(config_local_path, **kwargs)
27
+ video_vae = cls.from_config(config)
28
+ video_vae.to(kwargs["torch_dtype"])
29
+
30
+ model_local_path = pretrained_model_name_or_path / "autoencoder.pth"
31
+ ckpt_state_dict = torch.load(model_local_path)
32
+ video_vae.load_state_dict(ckpt_state_dict)
33
+
34
+ statistics_local_path = pretrained_model_name_or_path / "per_channel_statistics.json"
35
+ if statistics_local_path.exists():
36
+ with open(statistics_local_path, "r") as file:
37
+ data = json.load(file)
38
+ transposed_data = list(zip(*data["data"]))
39
+ data_dict = {col: torch.tensor(vals) for col, vals in zip(data["columns"], transposed_data)}
40
+ video_vae.register_buffer("std_of_means", data_dict["std-of-means"])
41
+ video_vae.register_buffer(
42
+ "mean_of_means", data_dict.get("mean-of-means", torch.zeros_like(data_dict["std-of-means"]))
43
+ )
44
+
45
+ return video_vae
46
+
47
+ @staticmethod
48
+ def from_config(config):
49
+ assert config["_class_name"] == "VideoAutoencoder", "config must have _class_name=VideoAutoencoder"
50
+ if isinstance(config["dims"], list):
51
+ config["dims"] = tuple(config["dims"])
52
+
53
+ assert config["dims"] in [2, 3, (2, 1)], "dims must be 2, 3 or (2, 1)"
54
+
55
+ double_z = config.get("double_z", True)
56
+ latent_log_var = config.get("latent_log_var", "per_channel" if double_z else "none")
57
+ use_quant_conv = config.get("use_quant_conv", True)
58
+
59
+ if use_quant_conv and latent_log_var == "uniform":
60
+ raise ValueError("uniform latent_log_var requires use_quant_conv=False")
61
+
62
+ encoder = Encoder(
63
+ dims=config["dims"],
64
+ in_channels=config.get("in_channels", 3),
65
+ out_channels=config["latent_channels"],
66
+ block_out_channels=config["block_out_channels"],
67
+ patch_size=config.get("patch_size", 1),
68
+ latent_log_var=latent_log_var,
69
+ norm_layer=config.get("norm_layer", "group_norm"),
70
+ patch_size_t=config.get("patch_size_t", config.get("patch_size", 1)),
71
+ add_channel_padding=config.get("add_channel_padding", False),
72
+ )
73
+
74
+ decoder = Decoder(
75
+ dims=config["dims"],
76
+ in_channels=config["latent_channels"],
77
+ out_channels=config.get("out_channels", 3),
78
+ block_out_channels=config["block_out_channels"],
79
+ patch_size=config.get("patch_size", 1),
80
+ norm_layer=config.get("norm_layer", "group_norm"),
81
+ patch_size_t=config.get("patch_size_t", config.get("patch_size", 1)),
82
+ add_channel_padding=config.get("add_channel_padding", False),
83
+ )
84
+
85
+ dims = config["dims"]
86
+ return VideoAutoencoder(
87
+ encoder=encoder,
88
+ decoder=decoder,
89
+ latent_channels=config["latent_channels"],
90
+ dims=dims,
91
+ use_quant_conv=use_quant_conv,
92
+ )
93
+
94
+ @property
95
+ def config(self):
96
+ return SimpleNamespace(
97
+ _class_name="VideoAutoencoder",
98
+ dims=self.dims,
99
+ in_channels=self.encoder.conv_in.in_channels // (self.encoder.patch_size_t * self.encoder.patch_size**2),
100
+ out_channels=self.decoder.conv_out.out_channels // (self.decoder.patch_size_t * self.decoder.patch_size**2),
101
+ latent_channels=self.decoder.conv_in.in_channels,
102
+ block_out_channels=[
103
+ self.encoder.down_blocks[i].res_blocks[-1].conv1.out_channels
104
+ for i in range(len(self.encoder.down_blocks))
105
+ ],
106
+ scaling_factor=1.0,
107
+ norm_layer=self.encoder.norm_layer,
108
+ patch_size=self.encoder.patch_size,
109
+ latent_log_var=self.encoder.latent_log_var,
110
+ use_quant_conv=self.use_quant_conv,
111
+ patch_size_t=self.encoder.patch_size_t,
112
+ add_channel_padding=self.encoder.add_channel_padding,
113
+ )
114
+
115
+ @property
116
+ def is_video_supported(self):
117
+ """
118
+ Check if the model supports video inputs of shape (B, C, F, H, W). Otherwise, the model only supports 2D images.
119
+ """
120
+ return self.dims != 2
121
+
122
+ @property
123
+ def downscale_factor(self):
124
+ return self.encoder.downsample_factor
125
+
126
+ def to_json_string(self) -> str:
127
+ import json
128
+
129
+ return json.dumps(self.config.__dict__)
130
+
131
+ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
132
+ model_keys = set(name for name, _ in self.named_parameters())
133
+
134
+ key_mapping = {
135
+ ".resnets.": ".res_blocks.",
136
+ "downsamplers.0": "downsample",
137
+ "upsamplers.0": "upsample",
138
+ }
139
+
140
+ converted_state_dict = {}
141
+ for key, value in state_dict.items():
142
+ for k, v in key_mapping.items():
143
+ key = key.replace(k, v)
144
+
145
+ if "norm" in key and key not in model_keys:
146
+ logger.info(f"Removing key {key} from state_dict as it is not present in the model")
147
+ continue
148
+
149
+ converted_state_dict[key] = value
150
+
151
+ super().load_state_dict(converted_state_dict, strict=strict)
152
+
153
+ def last_layer(self):
154
+ if hasattr(self.decoder, "conv_out"):
155
+ if isinstance(self.decoder.conv_out, nn.Sequential):
156
+ last_layer = self.decoder.conv_out[-1]
157
+ else:
158
+ last_layer = self.decoder.conv_out
159
+ else:
160
+ last_layer = self.decoder.layers[-1]
161
+ return last_layer
162
+
163
+
164
+ class Encoder(nn.Module):
165
+ r"""
166
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
167
+
168
+ Args:
169
+ in_channels (`int`, *optional*, defaults to 3):
170
+ The number of input channels.
171
+ out_channels (`int`, *optional*, defaults to 3):
172
+ The number of output channels.
173
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
174
+ The number of output channels for each block.
175
+ layers_per_block (`int`, *optional*, defaults to 2):
176
+ The number of layers per block.
177
+ norm_num_groups (`int`, *optional*, defaults to 32):
178
+ The number of groups for normalization.
179
+ patch_size (`int`, *optional*, defaults to 1):
180
+ The patch size to use. Should be a power of 2.
181
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
182
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
183
+ latent_log_var (`str`, *optional*, defaults to `per_channel`):
184
+ The number of channels for the log variance. Can be either `per_channel`, `uniform`, or `none`.
185
+ """
186
+
187
+ def __init__(
188
+ self,
189
+ dims: Union[int, Tuple[int, int]] = 3,
190
+ in_channels: int = 3,
191
+ out_channels: int = 3,
192
+ block_out_channels: Tuple[int, ...] = (64,),
193
+ layers_per_block: int = 2,
194
+ norm_num_groups: int = 32,
195
+ patch_size: Union[int, Tuple[int]] = 1,
196
+ norm_layer: str = "group_norm", # group_norm, pixel_norm
197
+ latent_log_var: str = "per_channel",
198
+ patch_size_t: Optional[int] = None,
199
+ add_channel_padding: Optional[bool] = False,
200
+ ):
201
+ super().__init__()
202
+ self.patch_size = patch_size
203
+ self.patch_size_t = patch_size_t if patch_size_t is not None else patch_size
204
+ self.add_channel_padding = add_channel_padding
205
+ self.layers_per_block = layers_per_block
206
+ self.norm_layer = norm_layer
207
+ self.latent_channels = out_channels
208
+ self.latent_log_var = latent_log_var
209
+ if add_channel_padding:
210
+ in_channels = in_channels * self.patch_size**3
211
+ else:
212
+ in_channels = in_channels * self.patch_size_t * self.patch_size**2
213
+ self.in_channels = in_channels
214
+ output_channel = block_out_channels[0]
215
+
216
+ self.conv_in = make_conv_nd(
217
+ dims=dims,
218
+ in_channels=in_channels,
219
+ out_channels=output_channel,
220
+ kernel_size=3,
221
+ stride=1,
222
+ padding=1,
223
+ )
224
+
225
+ self.down_blocks = nn.ModuleList([])
226
+
227
+ for i in range(len(block_out_channels)):
228
+ input_channel = output_channel
229
+ output_channel = block_out_channels[i]
230
+ is_final_block = i == len(block_out_channels) - 1
231
+
232
+ down_block = DownEncoderBlock3D(
233
+ dims=dims,
234
+ in_channels=input_channel,
235
+ out_channels=output_channel,
236
+ num_layers=self.layers_per_block,
237
+ add_downsample=not is_final_block and 2**i >= patch_size,
238
+ resnet_eps=1e-6,
239
+ downsample_padding=0,
240
+ resnet_groups=norm_num_groups,
241
+ norm_layer=norm_layer,
242
+ )
243
+ self.down_blocks.append(down_block)
244
+
245
+ self.mid_block = UNetMidBlock3D(
246
+ dims=dims,
247
+ in_channels=block_out_channels[-1],
248
+ num_layers=self.layers_per_block,
249
+ resnet_eps=1e-6,
250
+ resnet_groups=norm_num_groups,
251
+ norm_layer=norm_layer,
252
+ )
253
+
254
+ # out
255
+ if norm_layer == "group_norm":
256
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
257
+ elif norm_layer == "pixel_norm":
258
+ self.conv_norm_out = PixelNorm()
259
+ self.conv_act = nn.SiLU()
260
+
261
+ conv_out_channels = out_channels
262
+ if latent_log_var == "per_channel":
263
+ conv_out_channels *= 2
264
+ elif latent_log_var == "uniform":
265
+ conv_out_channels += 1
266
+ elif latent_log_var != "none":
267
+ raise ValueError(f"Invalid latent_log_var: {latent_log_var}")
268
+ self.conv_out = make_conv_nd(dims, block_out_channels[-1], conv_out_channels, 3, padding=1)
269
+
270
+ self.gradient_checkpointing = False
271
+
272
+ @property
273
+ def downscale_factor(self):
274
+ return (
275
+ 2 ** len([block for block in self.down_blocks if isinstance(block.downsample, Downsample3D)])
276
+ * self.patch_size
277
+ )
278
+
279
+ def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor:
280
+ r"""The forward method of the `Encoder` class."""
281
+
282
+ downsample_in_time = sample.shape[2] != 1
283
+
284
+ # patchify
285
+ patch_size_t = self.patch_size_t if downsample_in_time else 1
286
+ sample = patchify(
287
+ sample,
288
+ patch_size_hw=self.patch_size,
289
+ patch_size_t=patch_size_t,
290
+ add_channel_padding=self.add_channel_padding,
291
+ )
292
+
293
+ sample = self.conv_in(sample)
294
+
295
+ checkpoint_fn = (
296
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
297
+ if self.gradient_checkpointing and self.training
298
+ else lambda x: x
299
+ )
300
+
301
+ for down_block in self.down_blocks:
302
+ sample = checkpoint_fn(down_block)(sample, downsample_in_time=downsample_in_time)
303
+
304
+ sample = checkpoint_fn(self.mid_block)(sample)
305
+
306
+ # post-process
307
+ sample = self.conv_norm_out(sample)
308
+ sample = self.conv_act(sample)
309
+ sample = self.conv_out(sample)
310
+
311
+ if self.latent_log_var == "uniform":
312
+ last_channel = sample[:, -1:, ...]
313
+ num_dims = sample.dim()
314
+
315
+ if num_dims == 4:
316
+ # For shape (B, C, H, W)
317
+ repeated_last_channel = last_channel.repeat(1, sample.shape[1] - 2, 1, 1)
318
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
319
+ elif num_dims == 5:
320
+ # For shape (B, C, F, H, W)
321
+ repeated_last_channel = last_channel.repeat(1, sample.shape[1] - 2, 1, 1, 1)
322
+ sample = torch.cat([sample, repeated_last_channel], dim=1)
323
+ else:
324
+ raise ValueError(f"Invalid input shape: {sample.shape}")
325
+
326
+ return sample
327
+
328
+
329
+ class Decoder(nn.Module):
330
+ r"""
331
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
332
+
333
+ Args:
334
+ in_channels (`int`, *optional*, defaults to 3):
335
+ The number of input channels.
336
+ out_channels (`int`, *optional*, defaults to 3):
337
+ The number of output channels.
338
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
339
+ The number of output channels for each block.
340
+ layers_per_block (`int`, *optional*, defaults to 2):
341
+ The number of layers per block.
342
+ norm_num_groups (`int`, *optional*, defaults to 32):
343
+ The number of groups for normalization.
344
+ patch_size (`int`, *optional*, defaults to 1):
345
+ The patch size to use. Should be a power of 2.
346
+ norm_layer (`str`, *optional*, defaults to `group_norm`):
347
+ The normalization layer to use. Can be either `group_norm` or `pixel_norm`.
348
+ """
349
+
350
+ def __init__(
351
+ self,
352
+ dims,
353
+ in_channels: int = 3,
354
+ out_channels: int = 3,
355
+ block_out_channels: Tuple[int, ...] = (64,),
356
+ layers_per_block: int = 2,
357
+ norm_num_groups: int = 32,
358
+ patch_size: int = 1,
359
+ norm_layer: str = "group_norm",
360
+ patch_size_t: Optional[int] = None,
361
+ add_channel_padding: Optional[bool] = False,
362
+ ):
363
+ super().__init__()
364
+ self.patch_size = patch_size
365
+ self.patch_size_t = patch_size_t if patch_size_t is not None else patch_size
366
+ self.add_channel_padding = add_channel_padding
367
+ self.layers_per_block = layers_per_block
368
+ if add_channel_padding:
369
+ out_channels = out_channels * self.patch_size**3
370
+ else:
371
+ out_channels = out_channels * self.patch_size_t * self.patch_size**2
372
+ self.out_channels = out_channels
373
+
374
+ self.conv_in = make_conv_nd(
375
+ dims,
376
+ in_channels,
377
+ block_out_channels[-1],
378
+ kernel_size=3,
379
+ stride=1,
380
+ padding=1,
381
+ )
382
+
383
+ self.mid_block = None
384
+ self.up_blocks = nn.ModuleList([])
385
+
386
+ self.mid_block = UNetMidBlock3D(
387
+ dims=dims,
388
+ in_channels=block_out_channels[-1],
389
+ num_layers=self.layers_per_block,
390
+ resnet_eps=1e-6,
391
+ resnet_groups=norm_num_groups,
392
+ norm_layer=norm_layer,
393
+ )
394
+
395
+ reversed_block_out_channels = list(reversed(block_out_channels))
396
+ output_channel = reversed_block_out_channels[0]
397
+ for i in range(len(reversed_block_out_channels)):
398
+ prev_output_channel = output_channel
399
+ output_channel = reversed_block_out_channels[i]
400
+
401
+ is_final_block = i == len(block_out_channels) - 1
402
+
403
+ up_block = UpDecoderBlock3D(
404
+ dims=dims,
405
+ num_layers=self.layers_per_block + 1,
406
+ in_channels=prev_output_channel,
407
+ out_channels=output_channel,
408
+ add_upsample=not is_final_block and 2 ** (len(block_out_channels) - i - 1) > patch_size,
409
+ resnet_eps=1e-6,
410
+ resnet_groups=norm_num_groups,
411
+ norm_layer=norm_layer,
412
+ )
413
+ self.up_blocks.append(up_block)
414
+
415
+ if norm_layer == "group_norm":
416
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
417
+ elif norm_layer == "pixel_norm":
418
+ self.conv_norm_out = PixelNorm()
419
+
420
+ self.conv_act = nn.SiLU()
421
+ self.conv_out = make_conv_nd(dims, block_out_channels[0], out_channels, 3, padding=1)
422
+
423
+ self.gradient_checkpointing = False
424
+
425
+ def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
426
+ r"""The forward method of the `Decoder` class."""
427
+ assert target_shape is not None, "target_shape must be provided"
428
+ upsample_in_time = sample.shape[2] < target_shape[2]
429
+
430
+ sample = self.conv_in(sample)
431
+
432
+ upscale_dtype = next(iter(self.up_blocks.parameters())).dtype
433
+
434
+ checkpoint_fn = (
435
+ partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
436
+ if self.gradient_checkpointing and self.training
437
+ else lambda x: x
438
+ )
439
+
440
+ sample = checkpoint_fn(self.mid_block)(sample)
441
+ sample = sample.to(upscale_dtype)
442
+
443
+ for up_block in self.up_blocks:
444
+ sample = checkpoint_fn(up_block)(sample, upsample_in_time=upsample_in_time)
445
+
446
+ # post-process
447
+ sample = self.conv_norm_out(sample)
448
+ sample = self.conv_act(sample)
449
+ sample = self.conv_out(sample)
450
+
451
+ # un-patchify
452
+ patch_size_t = self.patch_size_t if upsample_in_time else 1
453
+ sample = unpatchify(
454
+ sample,
455
+ patch_size_hw=self.patch_size,
456
+ patch_size_t=patch_size_t,
457
+ add_channel_padding=self.add_channel_padding,
458
+ )
459
+
460
+ return sample
461
+
462
+
463
+ class DownEncoderBlock3D(nn.Module):
464
+ def __init__(
465
+ self,
466
+ dims: Union[int, Tuple[int, int]],
467
+ in_channels: int,
468
+ out_channels: int,
469
+ dropout: float = 0.0,
470
+ num_layers: int = 1,
471
+ resnet_eps: float = 1e-6,
472
+ resnet_groups: int = 32,
473
+ add_downsample: bool = True,
474
+ downsample_padding: int = 1,
475
+ norm_layer: str = "group_norm",
476
+ ):
477
+ super().__init__()
478
+ res_blocks = []
479
+
480
+ for i in range(num_layers):
481
+ in_channels = in_channels if i == 0 else out_channels
482
+ res_blocks.append(
483
+ ResnetBlock3D(
484
+ dims=dims,
485
+ in_channels=in_channels,
486
+ out_channels=out_channels,
487
+ eps=resnet_eps,
488
+ groups=resnet_groups,
489
+ dropout=dropout,
490
+ norm_layer=norm_layer,
491
+ )
492
+ )
493
+
494
+ self.res_blocks = nn.ModuleList(res_blocks)
495
+
496
+ if add_downsample:
497
+ self.downsample = Downsample3D(dims, out_channels, out_channels=out_channels, padding=downsample_padding)
498
+ else:
499
+ self.downsample = Identity()
500
+
501
+ def forward(self, hidden_states: torch.FloatTensor, downsample_in_time) -> torch.FloatTensor:
502
+ for resnet in self.res_blocks:
503
+ hidden_states = resnet(hidden_states)
504
+
505
+ hidden_states = self.downsample(hidden_states, downsample_in_time=downsample_in_time)
506
+
507
+ return hidden_states
508
+
509
+
510
+ class UNetMidBlock3D(nn.Module):
511
+ """
512
+ A 3D UNet mid-block [`UNetMidBlock3D`] with multiple residual blocks.
513
+
514
+ Args:
515
+ in_channels (`int`): The number of input channels.
516
+ dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
517
+ num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
518
+ resnet_eps (`float`, *optional*, defaults to 1e-6): The epsilon value for the resnet blocks.
519
+ resnet_groups (`int`, *optional*, defaults to 32):
520
+ The number of groups to use in the group normalization layers of the resnet blocks.
521
+
522
+ Returns:
523
+ `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
524
+ in_channels, height, width)`.
525
+
526
+ """
527
+
528
+ def __init__(
529
+ self,
530
+ dims: Union[int, Tuple[int, int]],
531
+ in_channels: int,
532
+ dropout: float = 0.0,
533
+ num_layers: int = 1,
534
+ resnet_eps: float = 1e-6,
535
+ resnet_groups: int = 32,
536
+ norm_layer: str = "group_norm",
537
+ ):
538
+ super().__init__()
539
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
540
+
541
+ self.res_blocks = nn.ModuleList(
542
+ [
543
+ ResnetBlock3D(
544
+ dims=dims,
545
+ in_channels=in_channels,
546
+ out_channels=in_channels,
547
+ eps=resnet_eps,
548
+ groups=resnet_groups,
549
+ dropout=dropout,
550
+ norm_layer=norm_layer,
551
+ )
552
+ for _ in range(num_layers)
553
+ ]
554
+ )
555
+
556
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
557
+ for resnet in self.res_blocks:
558
+ hidden_states = resnet(hidden_states)
559
+
560
+ return hidden_states
561
+
562
+
563
+ class UpDecoderBlock3D(nn.Module):
564
+ def __init__(
565
+ self,
566
+ dims: Union[int, Tuple[int, int]],
567
+ in_channels: int,
568
+ out_channels: int,
569
+ resolution_idx: Optional[int] = None,
570
+ dropout: float = 0.0,
571
+ num_layers: int = 1,
572
+ resnet_eps: float = 1e-6,
573
+ resnet_groups: int = 32,
574
+ add_upsample: bool = True,
575
+ norm_layer: str = "group_norm",
576
+ ):
577
+ super().__init__()
578
+ res_blocks = []
579
+
580
+ for i in range(num_layers):
581
+ input_channels = in_channels if i == 0 else out_channels
582
+
583
+ res_blocks.append(
584
+ ResnetBlock3D(
585
+ dims=dims,
586
+ in_channels=input_channels,
587
+ out_channels=out_channels,
588
+ eps=resnet_eps,
589
+ groups=resnet_groups,
590
+ dropout=dropout,
591
+ norm_layer=norm_layer,
592
+ )
593
+ )
594
+
595
+ self.res_blocks = nn.ModuleList(res_blocks)
596
+
597
+ if add_upsample:
598
+ self.upsample = Upsample3D(dims=dims, channels=out_channels, out_channels=out_channels)
599
+ else:
600
+ self.upsample = Identity()
601
+
602
+ self.resolution_idx = resolution_idx
603
+
604
+ def forward(self, hidden_states: torch.FloatTensor, upsample_in_time=True) -> torch.FloatTensor:
605
+ for resnet in self.res_blocks:
606
+ hidden_states = resnet(hidden_states)
607
+
608
+ hidden_states = self.upsample(hidden_states, upsample_in_time=upsample_in_time)
609
+
610
+ return hidden_states
611
+
612
+
613
+ class ResnetBlock3D(nn.Module):
614
+ r"""
615
+ A Resnet block.
616
+
617
+ Parameters:
618
+ in_channels (`int`): The number of channels in the input.
619
+ out_channels (`int`, *optional*, default to be `None`):
620
+ The number of output channels for the first conv layer. If None, same as `in_channels`.
621
+ dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
622
+ groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
623
+ eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
624
+ """
625
+
626
+ def __init__(
627
+ self,
628
+ dims: Union[int, Tuple[int, int]],
629
+ in_channels: int,
630
+ out_channels: Optional[int] = None,
631
+ conv_shortcut: bool = False,
632
+ dropout: float = 0.0,
633
+ groups: int = 32,
634
+ eps: float = 1e-6,
635
+ norm_layer: str = "group_norm",
636
+ ):
637
+ super().__init__()
638
+ self.in_channels = in_channels
639
+ out_channels = in_channels if out_channels is None else out_channels
640
+ self.out_channels = out_channels
641
+ self.use_conv_shortcut = conv_shortcut
642
+
643
+ if norm_layer == "group_norm":
644
+ self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
645
+ elif norm_layer == "pixel_norm":
646
+ self.norm1 = PixelNorm()
647
+
648
+ self.non_linearity = nn.SiLU()
649
+
650
+ self.conv1 = make_conv_nd(dims, in_channels, out_channels, kernel_size=3, stride=1, padding=1)
651
+
652
+ if norm_layer == "group_norm":
653
+ self.norm2 = torch.nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
654
+ elif norm_layer == "pixel_norm":
655
+ self.norm2 = PixelNorm()
656
+
657
+ self.dropout = torch.nn.Dropout(dropout)
658
+
659
+ self.conv2 = make_conv_nd(dims, out_channels, out_channels, kernel_size=3, stride=1, padding=1)
660
+
661
+ self.conv_shortcut = (
662
+ make_linear_nd(dims=dims, in_channels=in_channels, out_channels=out_channels)
663
+ if in_channels != out_channels
664
+ else nn.Identity()
665
+ )
666
+
667
+ def forward(
668
+ self,
669
+ input_tensor: torch.FloatTensor,
670
+ ) -> torch.FloatTensor:
671
+ hidden_states = input_tensor
672
+
673
+ hidden_states = self.norm1(hidden_states)
674
+
675
+ hidden_states = self.non_linearity(hidden_states)
676
+
677
+ hidden_states = self.conv1(hidden_states)
678
+
679
+ hidden_states = self.norm2(hidden_states)
680
+
681
+ hidden_states = self.non_linearity(hidden_states)
682
+
683
+ hidden_states = self.dropout(hidden_states)
684
+
685
+ hidden_states = self.conv2(hidden_states)
686
+
687
+ input_tensor = self.conv_shortcut(input_tensor)
688
+
689
+ output_tensor = input_tensor + hidden_states
690
+
691
+ return output_tensor
692
+
693
+
694
+ class Downsample3D(nn.Module):
695
+ def __init__(self, dims, in_channels: int, out_channels: int, kernel_size: int = 3, padding: int = 1):
696
+ super().__init__()
697
+ stride: int = 2
698
+ self.padding = padding
699
+ self.in_channels = in_channels
700
+ self.dims = dims
701
+ self.conv = make_conv_nd(
702
+ dims=dims,
703
+ in_channels=in_channels,
704
+ out_channels=out_channels,
705
+ kernel_size=kernel_size,
706
+ stride=stride,
707
+ padding=padding,
708
+ )
709
+
710
+ def forward(self, x, downsample_in_time=True):
711
+ conv = self.conv
712
+ if self.padding == 0:
713
+ if self.dims == 2:
714
+ padding = (0, 1, 0, 1)
715
+ else:
716
+ padding = (0, 1, 0, 1, 0, 1 if downsample_in_time else 0)
717
+
718
+ x = functional.pad(x, padding, mode="constant", value=0)
719
+
720
+ if self.dims == (2, 1) and not downsample_in_time:
721
+ return conv(x, skip_time_conv=True)
722
+
723
+ return conv(x)
724
+
725
+
726
+ class Upsample3D(nn.Module):
727
+ """
728
+ An upsampling layer for 3D tensors of shape (B, C, D, H, W).
729
+
730
+ :param channels: channels in the inputs and outputs.
731
+ """
732
+
733
+ def __init__(self, dims, channels, out_channels=None):
734
+ super().__init__()
735
+ self.dims = dims
736
+ self.channels = channels
737
+ self.out_channels = out_channels or channels
738
+ self.conv = make_conv_nd(dims, channels, out_channels, kernel_size=3, padding=1, bias=True)
739
+
740
+ def forward(self, x, upsample_in_time):
741
+ if self.dims == 2:
742
+ x = functional.interpolate(x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest")
743
+ else:
744
+ time_scale_factor = 2 if upsample_in_time else 1
745
+ # print("before:", x.shape)
746
+ b, c, d, h, w = x.shape
747
+ x = rearrange(x, "b c d h w -> (b d) c h w")
748
+ # height and width interpolate
749
+ x = functional.interpolate(x, (x.shape[2] * 2, x.shape[3] * 2), mode="nearest")
750
+ _, _, h, w = x.shape
751
+
752
+ if not upsample_in_time and self.dims == (2, 1):
753
+ x = rearrange(x, "(b d) c h w -> b c d h w ", b=b, h=h, w=w)
754
+ return self.conv(x, skip_time_conv=True)
755
+
756
+ # Second ** upsampling ** which is essentially treated as a 1D convolution across the 'd' dimension
757
+ x = rearrange(x, "(b d) c h w -> (b h w) c 1 d", b=b)
758
+
759
+ # (b h w) c 1 d
760
+ new_d = x.shape[-1] * time_scale_factor
761
+ x = functional.interpolate(x, (1, new_d), mode="nearest")
762
+ # (b h w) c 1 new_d
763
+ x = rearrange(x, "(b h w) c 1 new_d -> b c new_d h w", b=b, h=h, w=w, new_d=new_d)
764
+ # b c d h w
765
+
766
+ # x = functional.interpolate(
767
+ # x, (x.shape[2] * time_scale_factor, x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
768
+ # )
769
+ # print("after:", x.shape)
770
+
771
+ return self.conv(x)
772
+
773
+
774
+ def patchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
775
+ if patch_size_hw == 1 and patch_size_t == 1:
776
+ return x
777
+ if x.dim() == 4:
778
+ x = rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size_hw, r=patch_size_hw)
779
+ elif x.dim() == 5:
780
+ x = rearrange(x, "b c (f p) (h q) (w r) -> b (c p r q) f h w", p=patch_size_t, q=patch_size_hw, r=patch_size_hw)
781
+ else:
782
+ raise ValueError(f"Invalid input shape: {x.shape}")
783
+
784
+ if (x.dim() == 5) and (patch_size_hw > patch_size_t) and (patch_size_t > 1 or add_channel_padding):
785
+ channels_to_pad = x.shape[1] * (patch_size_hw // patch_size_t) - x.shape[1]
786
+ padding_zeros = torch.zeros(
787
+ x.shape[0],
788
+ channels_to_pad,
789
+ x.shape[2],
790
+ x.shape[3],
791
+ x.shape[4],
792
+ device=x.device,
793
+ dtype=x.dtype,
794
+ )
795
+ x = torch.cat([padding_zeros, x], dim=1)
796
+
797
+ return x
798
+
799
+
800
+ def unpatchify(x, patch_size_hw, patch_size_t=1, add_channel_padding=False):
801
+ if patch_size_hw == 1 and patch_size_t == 1:
802
+ return x
803
+
804
+ if (x.dim() == 5) and (patch_size_hw > patch_size_t) and (patch_size_t > 1 or add_channel_padding):
805
+ channels_to_keep = int(x.shape[1] * (patch_size_t / patch_size_hw))
806
+ x = x[:, :channels_to_keep, :, :, :]
807
+
808
+ if x.dim() == 4:
809
+ x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size_hw, r=patch_size_hw)
810
+ elif x.dim() == 5:
811
+ x = rearrange(x, "b (c p r q) f h w -> b c (f p) (h q) (w r)", p=patch_size_t, q=patch_size_hw, r=patch_size_hw)
812
+
813
+ return x
814
+
815
+
816
+ def create_video_autoencoder_config(
817
+ latent_channels: int = 4,
818
+ ):
819
+ config = {
820
+ "_class_name": "VideoAutoencoder",
821
+ "dims": (2, 1), # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
822
+ "in_channels": 3, # Number of input color channels (e.g., RGB)
823
+ "out_channels": 3, # Number of output color channels
824
+ "latent_channels": latent_channels, # Number of channels in the latent space representation
825
+ "block_out_channels": [128, 256, 512, 512], # Number of output channels of each encoder / decoder inner block
826
+ "patch_size": 1,
827
+ }
828
+
829
+ return config
830
+
831
+
832
+ def create_video_autoencoder_pathify4x4x4_config(
833
+ latent_channels: int = 4,
834
+ ):
835
+ config = {
836
+ "_class_name": "VideoAutoencoder",
837
+ "dims": (2, 1), # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
838
+ "in_channels": 3, # Number of input color channels (e.g., RGB)
839
+ "out_channels": 3, # Number of output color channels
840
+ "latent_channels": latent_channels, # Number of channels in the latent space representation
841
+ "block_out_channels": [512] * 4, # Number of output channels of each encoder / decoder inner block
842
+ "patch_size": 4,
843
+ "latent_log_var": "uniform",
844
+ }
845
+
846
+ return config
847
+
848
+
849
+ def create_video_autoencoder_pathify4x4_config(
850
+ latent_channels: int = 4,
851
+ ):
852
+ config = {
853
+ "_class_name": "VideoAutoencoder",
854
+ "dims": 2, # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
855
+ "in_channels": 3, # Number of input color channels (e.g., RGB)
856
+ "out_channels": 3, # Number of output color channels
857
+ "latent_channels": latent_channels, # Number of channels in the latent space representation
858
+ "block_out_channels": [512] * 4, # Number of output channels of each encoder / decoder inner block
859
+ "patch_size": 4,
860
+ "norm_layer": "pixel_norm",
861
+ }
862
+
863
+ return config
864
+
865
+
866
+ def test_vae_patchify_unpatchify():
867
+ import torch
868
+
869
+ x = torch.randn(2, 3, 8, 64, 64)
870
+ x_patched = patchify(x, patch_size_hw=4, patch_size_t=4)
871
+ x_unpatched = unpatchify(x_patched, patch_size_hw=4, patch_size_t=4)
872
+ assert torch.allclose(x, x_unpatched)
873
+
874
+
875
+ def demo_video_autoencoder_forward_backward():
876
+ # Configuration for the VideoAutoencoder
877
+ config = create_video_autoencoder_pathify4x4x4_config()
878
+
879
+ # Instantiate the VideoAutoencoder with the specified configuration
880
+ video_autoencoder = VideoAutoencoder.from_config(config)
881
+
882
+ print(video_autoencoder)
883
+
884
+ # Print the total number of parameters in the video autoencoder
885
+ total_params = sum(p.numel() for p in video_autoencoder.parameters())
886
+ print(f"Total number of parameters in VideoAutoencoder: {total_params:,}")
887
+
888
+ # Create a mock input tensor simulating a batch of videos
889
+ # Shape: (batch_size, channels, depth, height, width)
890
+ # E.g., 4 videos, each with 3 color channels, 16 frames, and 64x64 pixels per frame
891
+ input_videos = torch.randn(2, 3, 8, 64, 64)
892
+
893
+ # Forward pass: encode and decode the input videos
894
+ latent = video_autoencoder.encode(input_videos).latent_dist.mode()
895
+ print(f"input shape={input_videos.shape}")
896
+ print(f"latent shape={latent.shape}")
897
+ reconstructed_videos = video_autoencoder.decode(latent, target_shape=input_videos.shape).sample
898
+
899
+ print(f"reconstructed shape={reconstructed_videos.shape}")
900
+
901
+ # Calculate the loss (e.g., mean squared error)
902
+ loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)
903
+
904
+ # Perform backward pass
905
+ loss.backward()
906
+
907
+ print(f"Demo completed with loss: {loss.item()}")
908
+
909
+
910
+ # Ensure to call the demo function to execute the forward and backward pass
911
+ if __name__ == "__main__":
912
+ demo_video_autoencoder_forward_backward()
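A small shape-check sketch for the patchify/unpatchify helpers above, mirroring test_vae_patchify_unpatchify (it assumes the module and its txt2img dependency are importable from the repository root):

import torch
from xora.models.autoencoders.video_autoencoder import patchify, unpatchify

x = torch.randn(2, 3, 8, 64, 64)                    # (B, C, F, H, W)
x_p = patchify(x, patch_size_hw=4, patch_size_t=4)  # folds each 4x4x4 patch into the channel dim
print(x_p.shape)                                    # torch.Size([2, 192, 2, 16, 16]); 192 = 3*4*4*4
x_r = unpatchify(x_p, patch_size_hw=4, patch_size_t=4)
assert torch.allclose(x, x_r)                       # the rearrange is exactly invertible here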
xora/models/transformers/embeddings.py ADDED
@@ -0,0 +1,125 @@
1
+ # Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch
6
+ from einops import rearrange
7
+ from torch import nn
8
+
9
+
10
+ def get_timestep_embedding(
11
+ timesteps: torch.Tensor,
12
+ embedding_dim: int,
13
+ flip_sin_to_cos: bool = False,
14
+ downscale_freq_shift: float = 1,
15
+ scale: float = 1,
16
+ max_period: int = 10000,
17
+ ):
18
+ """
19
+ This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
20
+
21
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
22
+ These may be fractional.
23
+ :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
24
+ embeddings. :return: an [N x dim] Tensor of positional embeddings.
25
+ """
26
+ assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
27
+
28
+ half_dim = embedding_dim // 2
29
+ exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
30
+ exponent = exponent / (half_dim - downscale_freq_shift)
31
+
32
+ emb = torch.exp(exponent)
33
+ emb = timesteps[:, None].float() * emb[None, :]
34
+
35
+ # scale embeddings
36
+ emb = scale * emb
37
+
38
+ # concat sine and cosine embeddings
39
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
40
+
41
+ # flip sine and cosine embeddings
42
+ if flip_sin_to_cos:
43
+ emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
44
+
45
+ # zero pad
46
+ if embedding_dim % 2 == 1:
47
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
48
+ return emb
49
+
50
+
51
+ def get_3d_sincos_pos_embed(embed_dim, grid, w, h, f):
52
+ """
53
+ grid: a (3, f*h*w) array holding the (frame, height, width) index of every token.
54
+ return: pos_embed of shape [f*h*w, embed_dim].
55
+ """
56
+ grid = rearrange(grid, "c (f h w) -> c f h w", h=h, w=w)
57
+ grid = rearrange(grid, "c f h w -> c h w f", h=h, w=w)
58
+ grid = grid.reshape([3, 1, w, h, f])
59
+ pos_embed = get_3d_sincos_pos_embed_from_grid(embed_dim, grid)
60
+ pos_embed = pos_embed.transpose(1, 0, 2, 3)
61
+ return rearrange(pos_embed, "h w f c -> (f h w) c")
62
+
63
+
64
+ def get_3d_sincos_pos_embed_from_grid(embed_dim, grid):
65
+ if embed_dim % 3 != 0:
66
+ raise ValueError("embed_dim must be divisible by 3")
67
+
68
+ # use half of dimensions to encode grid_h
69
+ emb_f = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[0]) # (H*W*T, D/3)
70
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[1]) # (H*W*T, D/3)
71
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 3, grid[2]) # (H*W*T, D/3)
72
+
73
+ emb = np.concatenate([emb_h, emb_w, emb_f], axis=-1) # (H*W*T, D)
74
+ return emb
75
+
76
+
77
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
78
+ """
79
+ embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
80
+ """
81
+ if embed_dim % 2 != 0:
82
+ raise ValueError("embed_dim must be divisible by 2")
83
+
84
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
85
+ omega /= embed_dim / 2.0
86
+ omega = 1.0 / 10000**omega # (D/2,)
87
+
88
+ pos_shape = pos.shape
89
+
90
+ pos = pos.reshape(-1)
91
+ out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
92
+ out = out.reshape([*pos_shape, -1])[0]
93
+
94
+ emb_sin = np.sin(out) # (M, D/2)
95
+ emb_cos = np.cos(out) # (M, D/2)
96
+
97
+ emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (M, D)
98
+ return emb
99
+
100
+
101
+ class SinusoidalPositionalEmbedding(nn.Module):
102
+ """Apply positional information to a sequence of embeddings.
103
+
104
+ Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
105
+ them
106
+
107
+ Args:
108
+ embed_dim: (int): Dimension of the positional embedding.
109
+ max_seq_length: Maximum sequence length to apply positional embeddings
110
+
111
+ """
112
+
113
+ def __init__(self, embed_dim: int, max_seq_length: int = 32):
114
+ super().__init__()
115
+ position = torch.arange(max_seq_length).unsqueeze(1)
116
+ div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
117
+ pe = torch.zeros(1, max_seq_length, embed_dim)
118
+ pe[0, :, 0::2] = torch.sin(position * div_term)
119
+ pe[0, :, 1::2] = torch.cos(position * div_term)
120
+ self.register_buffer("pe", pe)
121
+
122
+ def forward(self, x):
123
+ _, seq_length, _ = x.shape
124
+ x = x + self.pe[:, :seq_length]
125
+ return x
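A quick shape sketch for the helpers above (assuming the new module is importable as xora.models.transformers.embeddings; for the 3D variant, embed_dim must be divisible by 6):

import numpy as np
import torch
from xora.models.transformers.embeddings import get_timestep_embedding, get_3d_sincos_pos_embed

# Sinusoidal timestep embedding: one row per timestep, embedding_dim columns.
t = torch.tensor([0.0, 250.0, 999.0])
print(get_timestep_embedding(t, embedding_dim=128).shape)  # torch.Size([3, 128])

# 3D sin/cos positional embedding for a 2x4x4 (frames x height x width) token grid.
f, h, w = 2, 4, 4
ff, hh, ww = np.meshgrid(np.arange(f), np.arange(h), np.arange(w), indexing="ij")
grid = np.stack([ff, hh, ww]).reshape(3, -1).astype(np.float64)  # rows: frame, height, width indices
pos = get_3d_sincos_pos_embed(192, grid, w=w, h=h, f=f)
print(pos.shape)                                                 # (32, 192) == (f*h*w, embed_dim)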
xora/models/transformers/transformer3d.py CHANGED
@@ -1,7 +1,7 @@
1
  # Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/models/transformers/transformer_2d.py
2
  import math
3
  from dataclasses import dataclass
4
- from typing import Any, Dict, List, Optional
5
 
6
  import torch
7
  from diffusers.configuration_utils import ConfigMixin, register_to_config
@@ -9,10 +9,13 @@ from diffusers.models.embeddings import PixArtAlphaTextProjection
9
  from diffusers.models.modeling_utils import ModelMixin
10
  from diffusers.models.normalization import AdaLayerNormSingle
11
  from diffusers.utils import BaseOutput, is_torch_version
 
12
  from torch import nn
13
 
14
  from xora.models.transformers.attention import BasicTransformerBlock
 
15
 
 
16
 
17
  @dataclass
18
  class Transformer3DModelOutput(BaseOutput):
@@ -143,6 +146,61 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
143
 
144
  self.gradient_checkpointing = False
145
 
146
  def _set_gradient_checkpointing(self, module, value=False):
147
  if hasattr(module, "gradient_checkpointing"):
148
  module.gradient_checkpointing = value
@@ -287,10 +345,14 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
287
  if self.timestep_scale_multiplier:
288
  timestep = self.timestep_scale_multiplier * timestep
289
 
290
- if self.positional_embedding_type == "rope":
 
  freqs_cis = self.precompute_freqs_cis(indices_grid)
292
- else:
293
- raise NotImplementedError("Only rope pos embed supported.")
294
 
295
  batch_size = hidden_states.shape[0]
296
  timestep, embedded_timestep = self.adaln_single(
@@ -358,3 +420,14 @@ class Transformer3DModel(ModelMixin, ConfigMixin):
358
 
359
  return Transformer3DModelOutput(sample=hidden_states)
360
 
1
  # Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/models/transformers/transformer_2d.py
2
  import math
3
  from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Optional, Literal
5
 
6
  import torch
7
  from diffusers.configuration_utils import ConfigMixin, register_to_config
 
9
  from diffusers.models.modeling_utils import ModelMixin
10
  from diffusers.models.normalization import AdaLayerNormSingle
11
  from diffusers.utils import BaseOutput, is_torch_version
12
+ from diffusers.utils import logging
13
  from torch import nn
14
 
15
  from xora.models.transformers.attention import BasicTransformerBlock
16
+ from xora.models.transformers.embeddings import get_3d_sincos_pos_embed
17
 
18
+ logger = logging.get_logger(__name__)
19
 
20
  @dataclass
21
  class Transformer3DModelOutput(BaseOutput):
 
146
 
147
  self.gradient_checkpointing = False
148
 
149
+ def set_use_tpu_flash_attention(self):
150
+ r"""
151
+ Function sets the flag in this object and propagates down the children. The flag will enforce the usage of TPU
152
+ attention kernel.
153
+ """
154
+ logger.info(" ENABLE TPU FLASH ATTENTION -> TRUE")
155
+ # if using TPU -> configure components to use TPU flash attention
156
+ if dist_util.acceleration_type() == dist_util.AccelerationType.TPU:
157
+ self.use_tpu_flash_attention = True
158
+ # push config down to the attention modules
159
+ for block in self.transformer_blocks:
160
+ block.set_use_tpu_flash_attention()
161
+
162
+ def initialize(self, embedding_std: float, mode: Literal["xora", "pixart"]):
163
+ def _basic_init(module):
164
+ if isinstance(module, nn.Linear):
165
+ torch.nn.init.xavier_uniform_(module.weight)
166
+ if module.bias is not None:
167
+ nn.init.constant_(module.bias, 0)
168
+
169
+ self.apply(_basic_init)
170
+
171
+ # Initialize timestep embedding MLP:
172
+ nn.init.normal_(self.adaln_single.emb.timestep_embedder.linear_1.weight, std=embedding_std)
173
+ nn.init.normal_(self.adaln_single.emb.timestep_embedder.linear_2.weight, std=embedding_std)
174
+ nn.init.normal_(self.adaln_single.linear.weight, std=embedding_std)
175
+
176
+ if hasattr(self.adaln_single.emb, "resolution_embedder"):
177
+ nn.init.normal_(self.adaln_single.emb.resolution_embedder.linear_1.weight, std=embedding_std)
178
+ nn.init.normal_(self.adaln_single.emb.resolution_embedder.linear_2.weight, std=embedding_std)
179
+ if hasattr(self.adaln_single.emb, "aspect_ratio_embedder"):
180
+ nn.init.normal_(self.adaln_single.emb.aspect_ratio_embedder.linear_1.weight, std=embedding_std)
181
+ nn.init.normal_(self.adaln_single.emb.aspect_ratio_embedder.linear_2.weight, std=embedding_std)
182
+
183
+ # Initialize caption embedding MLP:
184
+ nn.init.normal_(self.caption_projection.linear_1.weight, std=embedding_std)
185
+ nn.init.normal_(self.caption_projection.linear_1.weight, std=embedding_std)
186
+
187
+ # Zero-out adaLN modulation layers in PixArt blocks:
188
+ for block in self.transformer_blocks:
189
+ if mode == "xora":
190
+ nn.init.constant_(block.attn1.to_out[0].weight, 0)
191
+ nn.init.constant_(block.attn1.to_out[0].bias, 0)
192
+
193
+ nn.init.constant_(block.attn2.to_out[0].weight, 0)
194
+ nn.init.constant_(block.attn2.to_out[0].bias, 0)
195
+
196
+ if mode == "xora":
197
+ nn.init.constant_(block.ff.net[2].weight, 0)
198
+ nn.init.constant_(block.ff.net[2].bias, 0)
199
+
200
+ # Zero-out output layers:
201
+ nn.init.constant_(self.proj_out.weight, 0)
202
+ nn.init.constant_(self.proj_out.bias, 0)
203
+
204
  def _set_gradient_checkpointing(self, module, value=False):
205
  if hasattr(module, "gradient_checkpointing"):
206
  module.gradient_checkpointing = value
 
345
  if self.timestep_scale_multiplier:
346
  timestep = self.timestep_scale_multiplier * timestep
347
 
348
+ if self.positional_embedding_type == "absolute":
349
+ pos_embed_3d = self.get_absolute_pos_embed(indices_grid).to(hidden_states.device)
350
+ if self.project_to_2d_pos:
351
+ pos_embed = self.to_2d_proj(pos_embed_3d)
352
+ hidden_states = (hidden_states + pos_embed).to(hidden_states.dtype)
353
+ freqs_cis = None
354
+ elif self.positional_embedding_type == "rope":
355
  freqs_cis = self.precompute_freqs_cis(indices_grid)
356
 
357
  batch_size = hidden_states.shape[0]
358
  timestep, embedded_timestep = self.adaln_single(
 
420
 
421
  return Transformer3DModelOutput(sample=hidden_states)
422
 
423
+ def get_absolute_pos_embed(self, grid):
424
+ grid_np = grid[0].cpu().numpy()
425
+ embed_dim_3d = math.ceil((self.inner_dim / 2) * 3) if self.project_to_2d_pos else self.inner_dim
426
+ pos_embed = get_3d_sincos_pos_embed( # (f h w)
427
+ embed_dim_3d,
428
+ grid_np,
429
+ h=int(max(grid_np[1]) + 1),
430
+ w=int(max(grid_np[2]) + 1),
431
+ f=int(max(grid_np[0] + 1)),
432
+ )
433
+ return torch.from_numpy(pos_embed).float().unsqueeze(0)
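To summarise the positional-embedding branch added above: when `positional_embedding_type` is `"absolute"`, a 3D sin-cos table is built over the (frame, row, column) index grid and, if `project_to_2d_pos` is set, projected down to `inner_dim` before being added to the hidden states; otherwise the `"rope"` path precomputes rotary frequencies. A minimal sketch of the grid and size bookkeeping, where `inner_dim`, the token grid, and the projection layer are illustrative assumptions and `get_3d_sincos_pos_embed` / `to_2d_proj` are left as comments because only their call sites are visible in this diff:

```py
import math
import torch

inner_dim = 1152           # transformer width (assumption)
f, h, w = 8, 16, 24        # latent frames / rows / columns in tokens (assumption)

# One (frame, row, column) triple per token, analogous to indices_grid above.
grid = torch.stack(
    torch.meshgrid(torch.arange(f), torch.arange(h), torch.arange(w), indexing="ij")
).reshape(3, -1)           # (3, f*h*w)

# Sized as ceil(inner_dim / 2) * 3, presumably so the table splits evenly across the
# three axes before to_2d_proj maps it back down to inner_dim.
embed_dim_3d = math.ceil((inner_dim / 2) * 3)

# pos_embed_3d = get_3d_sincos_pos_embed(embed_dim_3d, grid.numpy(), h=h, w=w, f=f)
# pos_embed = to_2d_proj(torch.from_numpy(pos_embed_3d).float())   # (f*h*w, inner_dim)
# hidden_states = hidden_states + pos_embed.unsqueeze(0)
```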
xora/pipelines/pipeline_video_pixart_alpha.py CHANGED
@@ -32,16 +32,106 @@ from xora.models.transformers.symmetric_patchifier import Patchifier
32
  from xora.models.autoencoders.vae_encode import get_vae_size_scale_factor, vae_decode, vae_encode
33
  from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
34
  from xora.schedulers.rf import TimestepShifter
 
35
 
36
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
37
 
38
-
39
  if is_bs4_available():
40
  from bs4 import BeautifulSoup
41
 
42
  if is_ftfy_available():
43
  import ftfy
44
 
45
  def retrieve_timesteps(
46
  scheduler,
47
  num_inference_steps: Optional[int] = None,
@@ -520,14 +610,7 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
520
 
521
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
522
  def prepare_latents(
523
- self,
524
- batch_size,
525
- num_latent_channels,
526
- num_patches,
527
- dtype,
528
- device,
529
- generator,
530
- latents=None,
531
  ):
532
  shape = (
533
  batch_size,
@@ -543,6 +626,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
543
 
544
  if latents is None:
545
  latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
546
  else:
547
  latents = latents.to(device)
548
 
@@ -582,8 +668,8 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
582
 
583
  return samples
584
 
585
-
586
  @torch.no_grad()
 
587
  def __call__(
588
  self,
589
  height: int,
@@ -607,6 +693,7 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
607
  return_dict: bool = True,
608
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
609
  clean_caption: bool = True,
 
610
  **kwargs,
611
  ) -> Union[ImagePipelineOutput, Tuple]:
612
  """
@@ -736,8 +823,15 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
736
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
737
  prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
738
 
739
- # 4. Prepare latents.
740
  self.video_scale_factor = self.video_scale_factor if is_video else 1
741
  latent_height = height // self.vae_scale_factor
742
  latent_width = width // self.vae_scale_factor
743
  latent_num_frames = num_frames // self.video_scale_factor
@@ -752,7 +846,12 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
752
  dtype=prompt_embeds.dtype,
753
  device=device,
754
  generator=generator,
755
  )
756
 
757
  # 5. Prepare timesteps
758
  retrieve_timesteps_kwargs = {}
@@ -790,7 +889,7 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
790
  elif len(current_timestep.shape) == 0:
791
  current_timestep = current_timestep[None].to(latent_model_input.device)
792
  # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
793
- current_timestep = current_timestep.expand(latent_model_input.shape[0])
794
  scale_grid = (
795
  (1 / latent_frame_rates, self.vae_scale_factor, self.vae_scale_factor)
796
  if self.transformer.use_rope
@@ -805,6 +904,9 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
805
  device=latents.device,
806
  )
807
 
808
  # predict noise model_output
809
  noise_pred = self.transformer(
810
  latent_model_input.to(self.transformer.dtype),
@@ -819,13 +921,20 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
819
  if do_classifier_free_guidance:
820
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
821
  noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
 
822
 
823
  # learned sigma
824
  if self.transformer.config.out_channels // 2 == self.transformer.config.in_channels:
825
  noise_pred = noise_pred.chunk(2, dim=1)[0]
826
 
827
  # compute previous image: x_t -> x_t-1
828
- latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
829
 
830
  # call the callback, if provided
831
  if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -857,3 +966,62 @@ class VideoPixArtAlphaPipeline(DiffusionPipeline):
857
  return (image,)
858
 
859
  return ImagePipelineOutput(images=image)
 
32
  from xora.models.autoencoders.vae_encode import get_vae_size_scale_factor, vae_decode, vae_encode
33
  from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
34
  from xora.schedulers.rf import TimestepShifter
35
+ from xora.utils.conditioning_method import ConditioningMethod
36
 
37
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
 
 
39
  if is_bs4_available():
40
  from bs4 import BeautifulSoup
41
 
42
  if is_ftfy_available():
43
  import ftfy
44
 
45
+ EXAMPLE_DOC_STRING = """
46
+ Examples:
47
+ ```py
48
+ >>> import torch
49
+ >>> from diffusers import PixArtAlphaPipeline
50
+
51
+ >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too.
52
+ >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16)
53
+ >>> # Enable memory optimizations.
54
+ >>> pipe.enable_model_cpu_offload()
55
+
56
+ >>> prompt = "A small cactus with a happy face in the Sahara desert."
57
+ >>> image = pipe(prompt).images[0]
58
+ ```
59
+ """
60
+
61
+ ASPECT_RATIO_1024_BIN = {
62
+ "0.25": [512.0, 2048.0],
63
+ "0.28": [512.0, 1856.0],
64
+ "0.32": [576.0, 1792.0],
65
+ "0.33": [576.0, 1728.0],
66
+ "0.35": [576.0, 1664.0],
67
+ "0.4": [640.0, 1600.0],
68
+ "0.42": [640.0, 1536.0],
69
+ "0.48": [704.0, 1472.0],
70
+ "0.5": [704.0, 1408.0],
71
+ "0.52": [704.0, 1344.0],
72
+ "0.57": [768.0, 1344.0],
73
+ "0.6": [768.0, 1280.0],
74
+ "0.68": [832.0, 1216.0],
75
+ "0.72": [832.0, 1152.0],
76
+ "0.78": [896.0, 1152.0],
77
+ "0.82": [896.0, 1088.0],
78
+ "0.88": [960.0, 1088.0],
79
+ "0.94": [960.0, 1024.0],
80
+ "1.0": [1024.0, 1024.0],
81
+ "1.07": [1024.0, 960.0],
82
+ "1.13": [1088.0, 960.0],
83
+ "1.21": [1088.0, 896.0],
84
+ "1.29": [1152.0, 896.0],
85
+ "1.38": [1152.0, 832.0],
86
+ "1.46": [1216.0, 832.0],
87
+ "1.67": [1280.0, 768.0],
88
+ "1.75": [1344.0, 768.0],
89
+ "2.0": [1408.0, 704.0],
90
+ "2.09": [1472.0, 704.0],
91
+ "2.4": [1536.0, 640.0],
92
+ "2.5": [1600.0, 640.0],
93
+ "3.0": [1728.0, 576.0],
94
+ "4.0": [2048.0, 512.0],
95
+ }
96
+
97
+ ASPECT_RATIO_512_BIN = {
98
+ "0.25": [256.0, 1024.0],
99
+ "0.28": [256.0, 928.0],
100
+ "0.32": [288.0, 896.0],
101
+ "0.33": [288.0, 864.0],
102
+ "0.35": [288.0, 832.0],
103
+ "0.4": [320.0, 800.0],
104
+ "0.42": [320.0, 768.0],
105
+ "0.48": [352.0, 736.0],
106
+ "0.5": [352.0, 704.0],
107
+ "0.52": [352.0, 672.0],
108
+ "0.57": [384.0, 672.0],
109
+ "0.6": [384.0, 640.0],
110
+ "0.68": [416.0, 608.0],
111
+ "0.72": [416.0, 576.0],
112
+ "0.78": [448.0, 576.0],
113
+ "0.82": [448.0, 544.0],
114
+ "0.88": [480.0, 544.0],
115
+ "0.94": [480.0, 512.0],
116
+ "1.0": [512.0, 512.0],
117
+ "1.07": [512.0, 480.0],
118
+ "1.13": [544.0, 480.0],
119
+ "1.21": [544.0, 448.0],
120
+ "1.29": [576.0, 448.0],
121
+ "1.38": [576.0, 416.0],
122
+ "1.46": [608.0, 416.0],
123
+ "1.67": [640.0, 384.0],
124
+ "1.75": [672.0, 384.0],
125
+ "2.0": [704.0, 352.0],
126
+ "2.09": [736.0, 352.0],
127
+ "2.4": [768.0, 320.0],
128
+ "2.5": [800.0, 320.0],
129
+ "3.0": [864.0, 288.0],
130
+ "4.0": [1024.0, 256.0],
131
+ }
132
+
133
+
134
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
135
  def retrieve_timesteps(
136
  scheduler,
137
  num_inference_steps: Optional[int] = None,
 
610
 
611
  # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
612
  def prepare_latents(
613
+ self, batch_size, num_latent_channels, num_patches, dtype, device, generator, latents=None, latents_mask=None
614
  ):
615
  shape = (
616
  batch_size,
 
626
 
627
  if latents is None:
628
  latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
629
+ elif latents_mask is not None:
630
+ noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
631
+ latents = latents * latents_mask[..., None] + noise * (1 - latents_mask[..., None])
632
  else:
633
  latents = latents.to(device)
634
 
 
668
 
669
  return samples
670
 
 
671
  @torch.no_grad()
672
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
673
  def __call__(
674
  self,
675
  height: int,
 
693
  return_dict: bool = True,
694
  callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
695
  clean_caption: bool = True,
696
+ media_items: Optional[torch.FloatTensor] = None,
697
  **kwargs,
698
  ) -> Union[ImagePipelineOutput, Tuple]:
699
  """
 
823
  prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
824
  prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
825
 
826
+ # 3b. Encode and prepare conditioning data
827
  self.video_scale_factor = self.video_scale_factor if is_video else 1
828
+ conditioning_method = kwargs.get("conditioning_method", None)
829
+ vae_per_channel_normalize = kwargs.get("vae_per_channel_normalize", False)
830
+ init_latents, conditioning_mask = self.prepare_conditioning(
831
+ media_items, num_frames, height, width, conditioning_method, vae_per_channel_normalize
832
+ )
833
+
834
+ # 4. Prepare latents.
835
  latent_height = height // self.vae_scale_factor
836
  latent_width = width // self.vae_scale_factor
837
  latent_num_frames = num_frames // self.video_scale_factor
 
846
  dtype=prompt_embeds.dtype,
847
  device=device,
848
  generator=generator,
849
+ latents=init_latents,
850
+ latents_mask=conditioning_mask,
851
  )
852
+ if conditioning_mask is not None and is_video:
853
+ assert num_images_per_prompt == 1
854
+ conditioning_mask = torch.cat([conditioning_mask] * 2) if do_classifier_free_guidance else conditioning_mask
855
 
856
  # 5. Prepare timesteps
857
  retrieve_timesteps_kwargs = {}
 
889
  elif len(current_timestep.shape) == 0:
890
  current_timestep = current_timestep[None].to(latent_model_input.device)
891
  # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
892
+ current_timestep = current_timestep.expand(latent_model_input.shape[0]).unsqueeze(-1)
893
  scale_grid = (
894
  (1 / latent_frame_rates, self.vae_scale_factor, self.vae_scale_factor)
895
  if self.transformer.use_rope
 
904
  device=latents.device,
905
  )
906
 
907
+ if conditioning_mask is not None:
908
+ current_timestep = current_timestep * (1 - conditioning_mask)
909
+
910
  # predict noise model_output
911
  noise_pred = self.transformer(
912
  latent_model_input.to(self.transformer.dtype),
 
921
  if do_classifier_free_guidance:
922
  noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
923
  noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
924
+ current_timestep, _ = current_timestep.chunk(2)
925
 
926
  # learned sigma
927
  if self.transformer.config.out_channels // 2 == self.transformer.config.in_channels:
928
  noise_pred = noise_pred.chunk(2, dim=1)[0]
929
 
930
  # compute previous image: x_t -> x_t-1
931
+ latents = self.scheduler.step(
932
+ noise_pred,
933
+ t if current_timestep is None else current_timestep,
934
+ latents,
935
+ **extra_step_kwargs,
936
+ return_dict=False,
937
+ )[0]
938
 
939
  # call the callback, if provided
940
  if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
 
966
  return (image,)
967
 
968
  return ImagePipelineOutput(images=image)
969
+
970
+ def prepare_conditioning(
971
+ self,
972
+ media_items: torch.Tensor,
973
+ num_frames: int,
974
+ height: int,
975
+ width: int,
976
+ method: ConditioningMethod = ConditioningMethod.UNCONDITIONAL,
977
+ vae_per_channel_normalize: bool = False,
978
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
979
+ """
980
+ Prepare the conditioning data for the video generation. If an input media item is provided, encode it
981
+ and set the conditioning_mask to indicate which tokens to condition on. Input media item should have
982
+ the same height and width as the generated video.
983
+
984
+ Args:
985
+ media_items (torch.Tensor): media items to condition on (images or videos)
986
+ num_frames (int): number of frames to generate
987
+ height (int): height of the generated video
988
+ width (int): width of the generated video
989
+ method (ConditioningMethod, optional): conditioning method to use. Defaults to ConditioningMethod.UNCONDITIONAL.
990
+ vae_per_channel_normalize (bool, optional): whether to normalize the input to the VAE per channel. Defaults to False.
991
+
992
+ Returns:
993
+ Tuple[torch.Tensor, torch.Tensor]: the conditioning latents and the conditioning mask
994
+ """
995
+ if media_items is None or method == ConditioningMethod.UNCONDITIONAL:
996
+ return None, None
997
+
998
+ assert media_items.ndim == 5
999
+ assert height == media_items.shape[-2] and width == media_items.shape[-1]
1000
+
1001
+ # Encode the input video and repeat to the required number of frame-tokens
1002
+ init_latents = vae_encode(
1003
+ media_items.to(dtype=self.vae.dtype, device=self.vae.device),
1004
+ self.vae,
1005
+ vae_per_channel_normalize=vae_per_channel_normalize,
1006
+ ).float()
1007
+
1008
+ init_len, target_len = init_latents.shape[2], num_frames // self.video_scale_factor
1009
+ if isinstance(self.vae, CausalVideoAutoencoder):
1010
+ target_len += 1
1011
+ init_latents = init_latents[:, :, :target_len]
1012
+ if target_len > init_len:
1013
+ repeat_factor = (target_len + init_len - 1) // init_len # Ceiling division
1014
+ init_latents = init_latents.repeat(1, 1, repeat_factor, 1, 1)[:, :, :target_len]
1015
+
1016
+ # Prepare the conditioning mask (1.0 = condition on this token)
1017
+ b, n, f, h, w = init_latents.shape
1018
+ conditioning_mask = torch.zeros([b, 1, f, h, w], device=init_latents.device)
1019
+ if method in [ConditioningMethod.FIRST_FRAME, ConditioningMethod.FIRST_AND_LAST_FRAME]:
1020
+ conditioning_mask[:, :, 0] = 1.0
1021
+ if method in [ConditioningMethod.LAST_FRAME, ConditioningMethod.FIRST_AND_LAST_FRAME]:
1022
+ conditioning_mask[:, :, -1] = 1.0
1023
+
1024
+ # Patchify the init latents and the mask
1025
+ conditioning_mask = self.patchifier.patchify(conditioning_mask).squeeze(-1)
1026
+ init_latents = self.patchifier.patchify(latents=init_latents)
1027
+ return init_latents, conditioning_mask
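Taken together, the conditioning changes above reduce to two masking steps: `prepare_latents` keeps the encoded media latents wherever the conditioning mask is 1 and injects noise where it is 0, and inside the denoising loop the per-token timestep is multiplied by `(1 - mask)` so conditioned tokens carry timestep 0 and are left untouched by the scheduler. A stripped-down sketch of that arithmetic with made-up shapes (this is not the pipeline's API, just the tensor math):

```py
import torch

b, n_tokens, channels = 2, 16, 4
init_latents = torch.randn(b, n_tokens, channels)    # stand-in for the patchified VAE latents
conditioning_mask = torch.zeros(b, n_tokens)
conditioning_mask[:, 0] = 1.0                        # e.g. ConditioningMethod.FIRST_FRAME

# prepare_latents: keep conditioned tokens, noise everything else
noise = torch.randn_like(init_latents)
latents = init_latents * conditioning_mask[..., None] + noise * (1 - conditioning_mask[..., None])

# denoising loop: zero the per-token timestep on conditioned tokens
t = torch.full((b, 1), 0.7)                          # broadcast like current_timestep above
current_timestep = t * (1 - conditioning_mask)       # (b, n_tokens); 0 on conditioned tokens
```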
xora/schedulers/rf.py CHANGED
@@ -9,7 +9,7 @@ from diffusers.schedulers.scheduling_utils import SchedulerMixin
9
  from diffusers.utils import BaseOutput
10
  from torch import Tensor
11
 
12
- from xora.utils.torch_utils import append_dims
13
 
14
 
15
  def simple_diffusion_resolution_dependent_timestep_shift(
@@ -199,8 +199,17 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
199
  "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
200
  )
201
 
202
- current_index = (self.timesteps - timestep).abs().argmin()
203
- dt = self.delta_timesteps.gather(0, current_index.unsqueeze(0))
204
 
205
  prev_sample = sample - dt * model_output
206
 
@@ -219,4 +228,4 @@ class RectifiedFlowScheduler(SchedulerMixin, ConfigMixin, TimestepShifter):
219
  sigmas = append_dims(sigmas, original_samples.ndim)
220
  alphas = 1 - sigmas
221
  noisy_samples = alphas * original_samples + sigmas * noise
222
- return noisy_samples
 
9
  from diffusers.utils import BaseOutput
10
  from torch import Tensor
11
 
12
+ from txt2img.common.torch_utils import append_dims
13
 
14
 
15
  def simple_diffusion_resolution_dependent_timestep_shift(
 
199
  "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
200
  )
201
 
202
+ if timestep.ndim == 0:
203
+ # Global timestep
204
+ current_index = (self.timesteps - timestep).abs().argmin()
205
+ dt = self.delta_timesteps.gather(0, current_index.unsqueeze(0))
206
+ else:
207
+ # Timestep per token
208
+ assert timestep.ndim == 2
209
+ current_index = (self.timesteps[:, None, None] - timestep[None]).abs().argmin(dim=0)
210
+ dt = self.delta_timesteps[current_index]
211
+ # Special treatment for zero timestep tokens - set dt to 0 so prev_sample = sample
212
+ dt = torch.where(timestep == 0.0, torch.zeros_like(dt), dt)[..., None]
213
 
214
  prev_sample = sample - dt * model_output
215
 
 
228
  sigmas = append_dims(sigmas, original_samples.ndim)
229
  alphas = 1 - sigmas
230
  noisy_samples = alphas * original_samples + sigmas * noise
231
+ return noisy_samples
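The per-token branch added to `RectifiedFlowScheduler.step` above vectorises the nearest-timestep lookup: for a `(batch, tokens)` timestep tensor, every token finds the index of its closest scheduler timestep, gathers the matching `delta_timesteps` entry, and tokens whose timestep is exactly 0 (the conditioned ones) get `dt = 0`, so `prev_sample == sample` for them. A self-contained sketch of that lookup with made-up timestep tables:

```py
import torch

timesteps = torch.tensor([1.0, 0.75, 0.5, 0.25])                  # illustrative scheduler timesteps
delta_timesteps = timesteps - torch.cat([timesteps[1:], torch.zeros(1)])

token_t = torch.tensor([[1.00, 0.00, 0.50],                       # (batch=2, tokens=3)
                        [0.75, 0.75, 0.00]])

# (steps, 1, 1) vs. (1, batch, tokens) -> nearest scheduler step per token
current_index = (timesteps[:, None, None] - token_t[None]).abs().argmin(dim=0)
dt = delta_timesteps[current_index]                               # (batch, tokens)

# conditioned tokens carry timestep 0 -> no update for them
dt = torch.where(token_t == 0.0, torch.zeros_like(dt), dt)
```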
xora/utils/conditioning_method.py ADDED
@@ -0,0 +1,7 @@
1
+ from enum import Enum
2
+
3
+ class ConditioningMethod(Enum):
4
+ UNCONDITIONAL = "unconditional"
5
+ FIRST_FRAME = "first_frame"
6
+ LAST_FRAME = "last_frame"
7
+ FIRST_AND_LAST_FRAME = "first_and_last_frame"
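The enum above is consumed by the pipeline through `kwargs.get("conditioning_method", None)` together with `media_items`, so image-to-video conditioning is selected at call time. A hedged call sketch (the pipeline construction is assumed to have happened elsewhere, and all argument values are illustrative):

```py
from xora.utils.conditioning_method import ConditioningMethod

# Assumed to exist already: a constructed VideoPixArtAlphaPipeline named `pipeline`
# and a (B, C, 1, H, W) image tensor `first_frame` matching the target height/width.
video = pipeline(
    prompt="A sailboat drifting at sunset",   # assumption: prompt handled as in the docstring example
    height=256,
    width=256,
    num_frames=33,
    frame_rate=24,
    is_video=True,
    media_items=first_frame,
    conditioning_method=ConditioningMethod.FIRST_FRAME,
).images
```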
xora/utils/dist_util.py ADDED
@@ -0,0 +1,11 @@
1
+ from enum import Enum
2
+
3
+ class AccelerationType(Enum):
4
+ CPU = "cpu"
5
+ GPU = "gpu"
6
+ TPU = "tpu"
7
+ MPS = "mps"
8
+
9
+ def execute_graph() -> None:
10
+ if _acceleration_type == AccelerationType.TPU:
11
+ xm.mark_step()
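Two loose ends are visible in the added `dist_util.py`: the 11-line hunk references `xm` and `_acceleration_type` without defining them, and `Transformer3DModel.set_use_tpu_flash_attention` calls `dist_util.acceleration_type()` (and uses `dist_util` without an import visible in this diff, presumably `from xora.utils import dist_util` higher up in the file). A hedged sketch of what the complete module presumably looks like; the detection logic and the torch_xla import are assumptions, not the author's code:

```py
from enum import Enum

import torch

try:
    import torch_xla.core.xla_model as xm  # assumption: only available on TPU hosts
except ImportError:
    xm = None


class AccelerationType(Enum):
    CPU = "cpu"
    GPU = "gpu"
    TPU = "tpu"
    MPS = "mps"


def _detect_acceleration_type() -> AccelerationType:
    # Assumption: pick TPU if torch_xla is importable, otherwise fall back to CUDA / MPS / CPU.
    if xm is not None:
        return AccelerationType.TPU
    if torch.cuda.is_available():
        return AccelerationType.GPU
    if torch.backends.mps.is_available():
        return AccelerationType.MPS
    return AccelerationType.CPU


_acceleration_type = _detect_acceleration_type()


def acceleration_type() -> AccelerationType:
    return _acceleration_type


def execute_graph() -> None:
    # On TPU, flush the lazily-built XLA graph.
    if _acceleration_type == AccelerationType.TPU:
        xm.mark_step()
```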