jiuhai committed
Commit 4e1e978 · verified · Parent(s): 4cd1d55

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. packages/ltx-core/src/ltx_core/__init__.py +0 -0
  2. packages/ltx-core/src/ltx_core/__pycache__/__init__.cpython-312.pyc +0 -0
  3. packages/ltx-core/src/ltx_core/__pycache__/types.cpython-312.pyc +0 -0
  4. packages/ltx-core/src/ltx_core/__pycache__/utils.cpython-312.pyc +0 -0
  5. packages/ltx-core/src/ltx_core/components/__init__.py +10 -0
  6. packages/ltx-core/src/ltx_core/components/diffusion_steps.py +95 -0
  7. packages/ltx-core/src/ltx_core/components/guiders.py +364 -0
  8. packages/ltx-core/src/ltx_core/components/noisers.py +35 -0
  9. packages/ltx-core/src/ltx_core/components/patchifiers.py +348 -0
  10. packages/ltx-core/src/ltx_core/components/protocols.py +101 -0
  11. packages/ltx-core/src/ltx_core/components/schedulers.py +130 -0
  12. packages/ltx-core/src/ltx_core/conditioning/__init__.py +19 -0
  13. packages/ltx-core/src/ltx_core/conditioning/exceptions.py +4 -0
  14. packages/ltx-core/src/ltx_core/conditioning/item.py +20 -0
  15. packages/ltx-core/src/ltx_core/conditioning/mask_utils.py +210 -0
  16. packages/ltx-core/src/ltx_core/guidance/__init__.py +15 -0
  17. packages/ltx-core/src/ltx_core/guidance/perturbations.py +79 -0
  18. packages/ltx-core/src/ltx_core/loader/__init__.py +48 -0
  19. packages/ltx-core/src/ltx_core/loader/__pycache__/__init__.cpython-312.pyc +0 -0
  20. packages/ltx-core/src/ltx_core/loader/__pycache__/fuse_loras.cpython-312.pyc +0 -0
  21. packages/ltx-core/src/ltx_core/loader/__pycache__/module_ops.cpython-312.pyc +0 -0
  22. packages/ltx-core/src/ltx_core/loader/__pycache__/primitives.cpython-312.pyc +0 -0
  23. packages/ltx-core/src/ltx_core/loader/__pycache__/registry.cpython-312.pyc +0 -0
  24. packages/ltx-core/src/ltx_core/loader/__pycache__/sd_ops.cpython-312.pyc +0 -0
  25. packages/ltx-core/src/ltx_core/loader/__pycache__/sft_loader.cpython-312.pyc +0 -0
  26. packages/ltx-core/src/ltx_core/loader/__pycache__/single_gpu_model_builder.cpython-312.pyc +0 -0
  27. packages/ltx-core/src/ltx_core/loader/fuse_loras.py +153 -0
  28. packages/ltx-core/src/ltx_core/loader/kernels.py +72 -0
  29. packages/ltx-core/src/ltx_core/loader/module_ops.py +14 -0
  30. packages/ltx-core/src/ltx_core/loader/primitives.py +109 -0
  31. packages/ltx-core/src/ltx_core/loader/registry.py +84 -0
  32. packages/ltx-core/src/ltx_core/loader/sd_ops.py +127 -0
  33. packages/ltx-core/src/ltx_core/loader/sft_loader.py +66 -0
  34. packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py +116 -0
  35. packages/ltx-core/src/ltx_core/model/__init__.py +8 -0
  36. packages/ltx-core/src/ltx_core/model/__pycache__/__init__.cpython-312.pyc +0 -0
  37. packages/ltx-core/src/ltx_core/model/__pycache__/model_protocol.cpython-312.pyc +0 -0
  38. packages/ltx-core/src/ltx_core/model/audio_vae/__init__.py +29 -0
  39. packages/ltx-core/src/ltx_core/model/audio_vae/attention.py +71 -0
  40. packages/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py +508 -0
  41. packages/ltx-core/src/ltx_core/model/audio_vae/causal_conv_2d.py +110 -0
  42. packages/ltx-core/src/ltx_core/model/audio_vae/downsample.py +110 -0
  43. packages/ltx-core/src/ltx_core/model/audio_vae/model_configurator.py +200 -0
  44. packages/ltx-core/src/ltx_core/model/audio_vae/ops.py +73 -0
  45. packages/ltx-core/src/ltx_core/model/audio_vae/resnet.py +176 -0
  46. packages/ltx-core/src/ltx_core/model/audio_vae/upsample.py +106 -0
  47. packages/ltx-core/src/ltx_core/model/audio_vae/vocoder.py +575 -0
  48. packages/ltx-core/src/ltx_core/model/model_protocol.py +10 -0
  49. packages/ltx-core/src/ltx_core/model/transformer/feed_forward.py +15 -0
  50. packages/ltx-core/src/ltx_core/model/upsampler/__init__.py +10 -0
packages/ltx-core/src/ltx_core/__init__.py ADDED
File without changes
packages/ltx-core/src/ltx_core/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (170 Bytes).
 
packages/ltx-core/src/ltx_core/__pycache__/types.cpython-312.pyc ADDED
Binary file (10 kB).
 
packages/ltx-core/src/ltx_core/__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.68 kB).
 
packages/ltx-core/src/ltx_core/components/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ Diffusion pipeline components.
+ Submodules:
+     diffusion_steps - Diffusion stepping algorithms (EulerDiffusionStep)
+     guiders - Guidance strategies (CFGGuider, STGGuider, APG variants)
+     noisers - Noise samplers (GaussianNoiser)
+     patchifiers - Latent patchification (VideoLatentPatchifier, AudioPatchifier)
+     protocols - Protocol definitions (Patchifier, etc.)
+     schedulers - Sigma schedulers (LTX2Scheduler, LinearQuadraticScheduler)
+ """
packages/ltx-core/src/ltx_core/components/diffusion_steps.py ADDED
@@ -0,0 +1,95 @@
+ import torch
+
+ from ltx_core.components.protocols import DiffusionStepProtocol
+ from ltx_core.utils import to_velocity
+
+
+ class EulerDiffusionStep(DiffusionStepProtocol):
+     """
+     First-order Euler method for diffusion sampling.
+     Takes a single step from the current noise level (sigma) to the next by
+     computing velocity from the denoised prediction and applying: sample + velocity * dt.
+     """
+
+     def step(
+         self, sample: torch.Tensor, denoised_sample: torch.Tensor, sigmas: torch.Tensor, step_index: int, **_kwargs
+     ) -> torch.Tensor:
+         sigma = sigmas[step_index]
+         sigma_next = sigmas[step_index + 1]
+         dt = sigma_next - sigma
+         velocity = to_velocity(sample, sigma, denoised_sample)
+
+         return (sample.to(torch.float32) + velocity.to(torch.float32) * dt).to(sample.dtype)
+
+
+ class Res2sDiffusionStep(DiffusionStepProtocol):
+     """
+     Second-order diffusion step for res_2s sampling with SDE noise injection.
+     Used by the res_2s denoising loop. Advances the sample from the current
+     sigma to the next by mixing a deterministic update (from the denoised
+     prediction) with injected noise via ``get_sde_coeff``, producing
+     variance-preserving transitions.
+     """
+
+     @staticmethod
+     def get_sde_coeff(
+         sigma_next: torch.Tensor,
+         sigma_up: torch.Tensor | None = None,
+         sigma_down: torch.Tensor | None = None,
+         sigma_max: torch.Tensor | None = None,
+     ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """
+         Compute SDE coefficients (alpha_ratio, sigma_down, sigma_up) for the step.
+         Given either ``sigma_down`` or ``sigma_up``, returns the mixing
+         coefficients used for variance-preserving noise injection. If
+         ``sigma_up`` is provided, ``sigma_down`` and ``alpha_ratio`` are
+         derived; if ``sigma_down`` is provided, ``sigma_up`` and
+         ``alpha_ratio`` are derived.
+         """
+         if sigma_down is not None:
+             alpha_ratio = (1 - sigma_next) / (1 - sigma_down)
+             sigma_up = (sigma_next**2 - sigma_down**2 * alpha_ratio**2).clamp(min=0) ** 0.5
+         elif sigma_up is not None:
+             # Fallback to avoid sqrt(neg_num)
+             sigma_up.clamp_(max=sigma_next * 0.9999)
+             sigmax = sigma_max if sigma_max is not None else torch.ones_like(sigma_next)
+             sigma_signal = sigmax - sigma_next
+             sigma_residual = (sigma_next**2 - sigma_up**2).clamp(min=0) ** 0.5
+             alpha_ratio = sigma_signal + sigma_residual
+             sigma_down = sigma_residual / alpha_ratio
+         else:
+             alpha_ratio = torch.ones_like(sigma_next)
+             sigma_down = sigma_next
+             sigma_up = torch.zeros_like(sigma_next)
+
+         sigma_up = torch.nan_to_num(sigma_up if sigma_up is not None else torch.zeros_like(sigma_next), 0.0)
+         # Replace NaNs in sigma_down with corresponding sigma_next elements (float32)
+         nan_mask = torch.isnan(sigma_down)
+         sigma_down[nan_mask] = sigma_next[nan_mask].to(sigma_down.dtype)
+         alpha_ratio = torch.nan_to_num(alpha_ratio, 1.0)
+
+         return alpha_ratio, sigma_down, sigma_up
+
+     def step(
+         self,
+         sample: torch.Tensor,
+         denoised_sample: torch.Tensor,
+         sigmas: torch.Tensor,
+         step_index: int,
+         noise: torch.Tensor,
+     ) -> torch.Tensor:
+         """Advance one step with SDE noise injection via get_sde_coeff."""
+         sigma = sigmas[step_index]
+         sigma_next = sigmas[step_index + 1]
+         alpha_ratio, sigma_down, sigma_up = self.get_sde_coeff(sigma_next, sigma_up=sigma_next * 0.5)
+         output_dtype = denoised_sample.dtype
+         if torch.any(sigma_up == 0) or torch.any(sigma_next == 0):
+             return denoised_sample
+
+         # Extract epsilon prediction
+         eps_next = (sample - denoised_sample) / (sigma - sigma_next)
+         denoised_next = sample - sigma * eps_next
+
+         # Mix deterministic and stochastic components
+         x_noised = alpha_ratio * (denoised_next + sigma_down * eps_next) + sigma_up * noise
+         return x_noised.to(output_dtype)
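
A minimal usage sketch for the Euler stepper above, assuming the ltx_core package from this commit is importable. The `denoise` callable below is a hypothetical stand-in for the model's denoised prediction and is not part of this commit:

import torch

from ltx_core.components.diffusion_steps import EulerDiffusionStep

def denoise(sample: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
    # Hypothetical placeholder for the model's denoised prediction.
    return sample * (1.0 - sigma)

sigmas = torch.linspace(1.0, 0.0, 11)  # 10 Euler steps from pure noise to clean
sample = torch.randn(1, 4, 8, 16, 16)
stepper = EulerDiffusionStep()
for i in range(len(sigmas) - 1):
    sample = stepper.step(sample, denoise(sample, sigmas[i]), sigmas, i)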
packages/ltx-core/src/ltx_core/components/guiders.py ADDED
@@ -0,0 +1,364 @@
+ import math
+ from collections.abc import Mapping, Sequence
+ from dataclasses import dataclass, field
+
+ import torch
+
+ from ltx_core.components.protocols import GuiderProtocol
+
+
+ @dataclass(frozen=True)
+ class CFGGuider(GuiderProtocol):
+     """
+     Classifier-free guidance (CFG) guider.
+     Computes the guidance delta as (scale - 1) * (cond - uncond), steering the
+     denoising process toward the conditioned prediction.
+     Attributes:
+         scale: Guidance strength. 1.0 means no guidance, higher values increase
+             adherence to the conditioning.
+     """
+
+     scale: float
+
+     def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
+         return (self.scale - 1) * (cond - uncond)
+
+     def enabled(self) -> bool:
+         return self.scale != 1.0
+
+
+ @dataclass(frozen=True)
+ class CFGStarRescalingGuider(GuiderProtocol):
+     """
+     Calculates the CFG delta between conditioned and unconditioned samples.
+     To minimize offset in the denoising direction and move mostly along the
+     conditioning axis within the distribution, the unconditioned sample is
+     rescaled in accordance with the norm of the conditioned sample.
+     Attributes:
+         scale (float):
+             Global guidance strength. A value of 1.0 corresponds to no extra
+             guidance beyond the base model prediction. Values > 1.0 increase
+             the influence of the conditioned sample relative to the
+             unconditioned one.
+     """
+
+     scale: float
+
+     def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
+         rescaled_neg = projection_coef(cond, uncond) * uncond
+         return (self.scale - 1) * (cond - rescaled_neg)
+
+     def enabled(self) -> bool:
+         return self.scale != 1.0
+
+
+ @dataclass(frozen=True)
+ class STGGuider(GuiderProtocol):
+     """
+     Calculates the STG delta between conditioned and perturbed denoised samples.
+     Perturbed samples are the result of the denoising process with perturbations,
+     e.g. attentions acting as passthrough for certain layers and modalities.
+     Attributes:
+         scale (float):
+             Global strength of the STG guidance. A value of 0.0 disables the
+             guidance. Larger values increase the correction applied in the
+             direction of (pos_denoised - perturbed_denoised).
+     """
+
+     scale: float
+
+     def delta(self, pos_denoised: torch.Tensor, perturbed_denoised: torch.Tensor) -> torch.Tensor:
+         return self.scale * (pos_denoised - perturbed_denoised)
+
+     def enabled(self) -> bool:
+         return self.scale != 0.0
+
+
+ @dataclass(frozen=True)
+ class LtxAPGGuider(GuiderProtocol):
+     """
+     Calculates the APG (adaptive projected guidance) delta between conditioned
+     and unconditioned samples.
+     To minimize offset in the denoising direction and move mostly along the
+     conditioning axis within the distribution, the (cond - uncond) delta is
+     decomposed into components parallel and orthogonal to the conditioned
+     sample. The `eta` parameter weights the parallel component, while `scale`
+     is applied to the orthogonal component. Optionally, a norm threshold can
+     be used to suppress guidance when the magnitude of the correction is small.
+     Attributes:
+         scale (float):
+             Strength applied to the component of the guidance that is orthogonal
+             to the conditioned sample. Controls how aggressively we move in
+             directions that change semantics but stay consistent with the
+             conditioning manifold.
+         eta (float):
+             Weight of the component of the guidance that is parallel to the
+             conditioned sample. A value of 1.0 keeps the full parallel
+             component; values in [0, 1] attenuate it, and values > 1.0 amplify
+             motion along the conditioning direction.
+         norm_threshold (float):
+             Minimum L2 norm of the guidance delta below which the guidance
+             can be reduced or ignored (depending on implementation).
+             This is useful for avoiding noisy or unstable updates when the
+             guidance signal is very small.
+     """
+
+     scale: float
+     eta: float = 1.0
+     norm_threshold: float = 0.0
+
+     def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
+         guidance = cond - uncond
+         if self.norm_threshold > 0:
+             ones = torch.ones_like(guidance)
+             guidance_norm = guidance.norm(p=2, dim=[-1, -2, -3], keepdim=True)
+             scale_factor = torch.minimum(ones, self.norm_threshold / guidance_norm)
+             guidance = guidance * scale_factor
+         proj_coeff = projection_coef(guidance, cond)
+         g_parallel = proj_coeff * cond
+         g_orth = guidance - g_parallel
+         g_apg = g_parallel * self.eta + g_orth
+
+         return g_apg * (self.scale - 1)
+
+     def enabled(self) -> bool:
+         return self.scale != 1.0
+
+
+ @dataclass(frozen=False)
+ class LegacyStatefulAPGGuider(GuiderProtocol):
+     """
+     Calculates the APG (adaptive projected guidance) delta between conditioned
+     and unconditioned samples.
+     To minimize offset in the denoising direction and move mostly along the
+     conditioning axis within the distribution, the (cond - uncond) delta is
+     decomposed into components parallel and orthogonal to the conditioned
+     sample. The `eta` parameter weights the parallel component, while `scale`
+     is applied to the orthogonal component. Optionally, a norm threshold can
+     be used to suppress guidance when the magnitude of the correction is small.
+     Attributes:
+         scale (float):
+             Strength applied to the component of the guidance that is orthogonal
+             to the conditioned sample. Controls how aggressively we move in
+             directions that change semantics but stay consistent with the
+             conditioning manifold.
+         eta (float):
+             Weight of the component of the guidance that is parallel to the
+             conditioned sample. A value of 1.0 keeps the full parallel
+             component; values in [0, 1] attenuate it, and values > 1.0 amplify
+             motion along the conditioning direction.
+         norm_threshold (float):
+             Minimum L2 norm of the guidance delta below which the guidance
+             can be reduced or ignored (depending on implementation).
+             This is useful for avoiding noisy or unstable updates when the
+             guidance signal is very small.
+         momentum (float):
+             Exponential moving-average coefficient for accumulating guidance
+             over time. running_avg = momentum * running_avg + guidance
+     """
+
+     scale: float
+     eta: float
+     norm_threshold: float = 5.0
+     momentum: float = 0.0
+     # It is the user's responsibility not to reuse the same APGGuider across several
+     # denoisings or modalities, so the accumulated average is not shared between them.
+     running_avg: torch.Tensor | None = None
+
+     def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
+         guidance = cond - uncond
+         if self.momentum != 0:
+             if self.running_avg is None:
+                 self.running_avg = guidance.clone()
+             else:
+                 self.running_avg = self.momentum * self.running_avg + guidance
+             guidance = self.running_avg
+
+         if self.norm_threshold > 0:
+             ones = torch.ones_like(guidance)
+             guidance_norm = guidance.norm(p=2, dim=[-1, -2, -3], keepdim=True)
+             scale_factor = torch.minimum(ones, self.norm_threshold / guidance_norm)
+             guidance = guidance * scale_factor
+
+         proj_coeff = projection_coef(guidance, cond)
+         g_parallel = proj_coeff * cond
+         g_orth = guidance - g_parallel
+         g_apg = g_parallel * self.eta + g_orth
+
+         return g_apg * self.scale
+
+     def enabled(self) -> bool:
+         return self.scale != 0.0
+
+
+ @dataclass(frozen=True)
+ class MultiModalGuiderParams:
+     """
+     Parameters for the multi-modal guider.
+     """
+
+     cfg_scale: float = 1.0
+     "CFG (Classifier-free guidance) scale controlling how strongly the model adheres to the prompt."
+     stg_scale: float = 0.0
+     "STG (Spatio-Temporal Guidance) scale controlling how strongly the model reacts to the attention perturbation."
+     stg_blocks: list[int] | None = field(default_factory=list)
+     "Which transformer blocks to perturb for STG."
+     rescale_scale: float = 0.0
+     "Rescale scale controlling how strongly the model rescales the modality after applying other guidance."
+     modality_scale: float = 1.0
+     "Modality scale controlling how strongly the model reacts to the perturbation of the modality."
+     skip_step: int = 0
+     "With skip_step = k, guidance is applied only on every (k + 1)-th step; the other steps are skipped."
+
+
+ def _params_for_sigma_from_sorted_dict(
+     sigma: float, params_by_sigma: Sequence[tuple[float, MultiModalGuiderParams]]
+ ) -> MultiModalGuiderParams:
+     """
+     Return params for the given sigma from a sorted (sigma_upper_bound -> params) structure.
+     Keys are sorted descending (bin upper bounds). Bin i is (key_{i+1}, key_i].
+     Take all keys >= sigma and use the last of them (the smallest such key, i.e. the upper
+     bound of the bin containing sigma), or the first entry (the max key) if no key
+     qualifies (sigma above max key).
+     """
+     if not params_by_sigma:
+         raise ValueError("params_by_sigma must be non-empty")
+     sigma = float(sigma)
+     keys_desc = [k for k, _ in params_by_sigma]
+     keys_ge_sigma = [k for k in keys_desc if k >= sigma]
+     # sigma above all keys: use first bin (max key)
+     key = keys_ge_sigma[-1] if keys_ge_sigma else keys_desc[0]
+     return next(p for k, p in params_by_sigma if k == key)
+
+
+ @dataclass(frozen=True)
+ class MultiModalGuider:
+     """
+     Multi-modal guider with constant params per instance.
+     For sigma-dependent params, use MultiModalGuiderFactory.build_from_sigma(sigma) to
+     obtain a guider for each step.
+     """
+
+     params: MultiModalGuiderParams
+     negative_context: torch.Tensor | None = None
+
+     def calculate(
+         self,
+         cond: torch.Tensor,
+         uncond_text: torch.Tensor | float,
+         uncond_perturbed: torch.Tensor | float,
+         uncond_modality: torch.Tensor | float,
+     ) -> torch.Tensor:
+         """
+         The guider calculates the guidance delta as (scale - 1) * (cond - uncond) for cfg and modality cfg,
+         and as scale * (cond - uncond) for stg, steering the denoising process away from the unconditioned
+         prediction.
+         """
+         pred = (
+             cond
+             + (self.params.cfg_scale - 1) * (cond - uncond_text)
+             + self.params.stg_scale * (cond - uncond_perturbed)
+             + (self.params.modality_scale - 1) * (cond - uncond_modality)
+         )
+
+         if self.params.rescale_scale != 0:
+             factor = cond.std() / pred.std()
+             factor = self.params.rescale_scale * factor + (1 - self.params.rescale_scale)
+             pred = pred * factor
+
+         return pred
+
+     def do_unconditional_generation(self) -> bool:
+         """Returns True if the guider is doing unconditional generation."""
+         return not math.isclose(self.params.cfg_scale, 1.0)
+
+     def do_perturbed_generation(self) -> bool:
+         """Returns True if the guider is doing perturbed generation."""
+         return not math.isclose(self.params.stg_scale, 0.0)
+
+     def do_isolated_modality_generation(self) -> bool:
+         """Returns True if the guider is doing isolated modality generation."""
+         return not math.isclose(self.params.modality_scale, 1.0)
+
+     def should_skip_step(self, step: int) -> bool:
+         """Returns True if the guider should skip the step."""
+         if self.params.skip_step == 0:
+             return False
+         return step % (self.params.skip_step + 1) != 0
+
+
+ @dataclass(frozen=True)
+ class MultiModalGuiderFactory:
+     """
+     Factory that creates a MultiModalGuider for a given sigma.
+     Single source of truth: _params_by_sigma (schedule). Use constant() for a
+     single set of params for all sigmas, from_dict() for sigma-binned params.
+     """
+
+     negative_context: torch.Tensor | None = None
+     _params_by_sigma: tuple[tuple[float, MultiModalGuiderParams], ...] = ()
+
+     @classmethod
+     def constant(
+         cls,
+         params: MultiModalGuiderParams,
+         negative_context: torch.Tensor | None = None,
+     ) -> "MultiModalGuiderFactory":
+         """Build a factory with constant params (same guider for all sigma)."""
+         return cls(
+             negative_context=negative_context,
+             _params_by_sigma=((float("inf"), params),),
+         )
+
+     @classmethod
+     def from_dict(
+         cls,
+         sigma_to_params: Mapping[float, MultiModalGuiderParams],
+         negative_context: torch.Tensor | None = None,
+     ) -> "MultiModalGuiderFactory":
+         """
+         Build a factory from a dict of sigma_value -> MultiModalGuiderParams.
+         Keys are sorted descending and used for bin lookup in params(sigma).
+         """
+         if not sigma_to_params:
+             raise ValueError("sigma_to_params must be non-empty")
+         sorted_items = tuple(sorted(sigma_to_params.items(), key=lambda x: x[0], reverse=True))
+         return cls(negative_context=negative_context, _params_by_sigma=sorted_items)
+
+     def params(self, sigma: float | torch.Tensor) -> MultiModalGuiderParams:
+         """Return params effective for the given sigma (getter; single source of truth)."""
+         sigma_val = float(sigma.item() if isinstance(sigma, torch.Tensor) else sigma)
+         return _params_for_sigma_from_sorted_dict(sigma_val, self._params_by_sigma)
+
+     def build_from_sigma(self, sigma: float | torch.Tensor) -> MultiModalGuider:
+         """Return a MultiModalGuider with params effective for the given sigma."""
+         return MultiModalGuider(
+             params=self.params(sigma),
+             negative_context=self.negative_context,
+         )
+
+
+ def create_multimodal_guider_factory(
+     params: MultiModalGuiderParams | MultiModalGuiderFactory,
+     negative_context: torch.Tensor | None = None,
+ ) -> MultiModalGuiderFactory:
+     """
+     Create or return a MultiModalGuiderFactory. Pass constant params for a
+     single-params factory (uses MultiModalGuiderFactory.constant), or an existing
+     MultiModalGuiderFactory. When given a factory, returns it as-is unless
+     negative_context is provided. For sigma-dependent params use
+     MultiModalGuiderFactory.from_dict(...) and pass that as params.
+     """
+     if isinstance(params, MultiModalGuiderFactory):
+         if negative_context is not None and params.negative_context is not negative_context:
+             return MultiModalGuiderFactory.from_dict(dict(params._params_by_sigma), negative_context=negative_context)
+         return params
+     return MultiModalGuiderFactory.constant(params, negative_context=negative_context)
+
+
+ def projection_coef(to_project: torch.Tensor, project_onto: torch.Tensor) -> torch.Tensor:
+     batch_size = to_project.shape[0]
+     positive_flat = to_project.reshape(batch_size, -1)
+     negative_flat = project_onto.reshape(batch_size, -1)
+     dot_product = torch.sum(positive_flat * negative_flat, dim=1, keepdim=True)
+     squared_norm = torch.sum(negative_flat**2, dim=1, keepdim=True) + 1e-8
+     return dot_product / squared_norm
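
As the GuiderProtocol docstring later in this commit notes, each guider returns an additive delta, so several guiders can be chained by accumulating their outputs onto the conditioned prediction. A minimal sketch, assuming `cond`, `uncond`, and `perturbed` are same-shaped denoised predictions (the random tensors here are illustrative only):

import torch

from ltx_core.components.guiders import CFGGuider, STGGuider

cond = torch.randn(1, 128, 2048)       # conditioned denoised prediction
uncond = torch.randn(1, 128, 2048)     # unconditioned denoised prediction
perturbed = torch.randn(1, 128, 2048)  # STG-perturbed denoised prediction

guided = cond.clone()
for guider, reference in ((CFGGuider(scale=3.0), uncond), (STGGuider(scale=1.0), perturbed)):
    if guider.enabled():
        guided = guided + guider.delta(cond, reference)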
packages/ltx-core/src/ltx_core/components/noisers.py ADDED
@@ -0,0 +1,35 @@
+ from dataclasses import replace
+ from typing import Protocol
+
+ import torch
+
+ from ltx_core.types import LatentState
+
+
+ class Noiser(Protocol):
+     """Protocol for adding noise to a latent state during diffusion."""
+
+     def __call__(self, latent_state: LatentState, noise_scale: float) -> LatentState: ...
+
+
+ class GaussianNoiser(Noiser):
+     """Adds Gaussian noise to a latent state, scaled by the denoise mask."""
+
+     def __init__(self, generator: torch.Generator):
+         super().__init__()
+
+         self.generator = generator
+
+     def __call__(self, latent_state: LatentState, noise_scale: float = 1.0) -> LatentState:
+         noise = torch.randn(
+             *latent_state.latent.shape,
+             device=latent_state.latent.device,
+             dtype=latent_state.latent.dtype,
+             generator=self.generator,
+         )
+         scaled_mask = latent_state.denoise_mask * noise_scale
+         latent = noise * scaled_mask + latent_state.latent * (1 - scaled_mask)
+         return replace(
+             latent_state,
+             latent=latent.to(latent_state.latent.dtype),
+         )
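
A usage sketch for GaussianNoiser. The noiser only touches the `latent` and `denoise_mask` fields it reads above; the LatentState construction shown here is an assumption, since its actual definition lives in ltx_core.types and may require additional fields:

import torch

from ltx_core.components.noisers import GaussianNoiser
from ltx_core.types import LatentState  # assumed constructible from latent + denoise_mask

generator = torch.Generator().manual_seed(42)
noiser = GaussianNoiser(generator)
state = LatentState(latent=torch.zeros(1, 256, 64), denoise_mask=torch.ones(1, 256, 1))
noisy = noiser(state, noise_scale=0.8)  # blend: 80% fresh noise where the mask is 1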
packages/ltx-core/src/ltx_core/components/patchifiers.py ADDED
@@ -0,0 +1,348 @@
+ import math
+ from typing import Optional, Tuple
+
+ import einops
+ import torch
+
+ from ltx_core.components.protocols import Patchifier
+ from ltx_core.types import AudioLatentShape, SpatioTemporalScaleFactors, VideoLatentShape
+
+
+ class VideoLatentPatchifier(Patchifier):
+     def __init__(self, patch_size: int):
+         # Patch sizes for video latents.
+         self._patch_size = (
+             1,  # temporal dimension
+             patch_size,  # height dimension
+             patch_size,  # width dimension
+         )
+
+     @property
+     def patch_size(self) -> Tuple[int, int, int]:
+         return self._patch_size
+
+     def get_token_count(self, tgt_shape: VideoLatentShape) -> int:
+         return math.prod(tgt_shape.to_torch_shape()[2:]) // math.prod(self._patch_size)
+
+     def patchify(
+         self,
+         latents: torch.Tensor,
+     ) -> torch.Tensor:
+         latents = einops.rearrange(
+             latents,
+             "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
+             p1=self._patch_size[0],
+             p2=self._patch_size[1],
+             p3=self._patch_size[2],
+         )
+
+         return latents
+
+     def unpatchify(
+         self,
+         latents: torch.Tensor,
+         output_shape: VideoLatentShape,
+     ) -> torch.Tensor:
+         assert self._patch_size[0] == 1, "Temporal patch size must be 1 for symmetric patchifier"
+
+         patch_grid_frames = output_shape.frames // self._patch_size[0]
+         patch_grid_height = output_shape.height // self._patch_size[1]
+         patch_grid_width = output_shape.width // self._patch_size[2]
+
+         latents = einops.rearrange(
+             latents,
+             "b (f h w) (c p q) -> b c f (h p) (w q)",
+             f=patch_grid_frames,
+             h=patch_grid_height,
+             w=patch_grid_width,
+             p=self._patch_size[1],
+             q=self._patch_size[2],
+         )
+
+         return latents
+
+     def get_patch_grid_bounds(
+         self,
+         output_shape: AudioLatentShape | VideoLatentShape,
+         device: Optional[torch.device] = None,
+     ) -> torch.Tensor:
+         """
+         Return the per-dimension bounds [inclusive start, exclusive end) for every
+         patch produced by `patchify`. The bounds are expressed in the original
+         video grid coordinates: frame/time, height, and width.
+         The resulting tensor is shaped `[batch_size, 3, num_patches, 2]`, where:
+         - axis 1 (size 3) enumerates (frame/time, height, width) dimensions
+         - axis 3 (size 2) stores `[start, end)` indices within each dimension
+         Args:
+             output_shape: Video grid description containing frames, height, and width.
+             device: Device of the latent tensor.
+         """
+         if not isinstance(output_shape, VideoLatentShape):
+             raise ValueError("VideoLatentPatchifier expects VideoLatentShape when computing coordinates")
+
+         frames = output_shape.frames
+         height = output_shape.height
+         width = output_shape.width
+         batch_size = output_shape.batch
+
+         # Validate inputs to ensure positive dimensions
+         assert frames > 0, f"frames must be positive, got {frames}"
+         assert height > 0, f"height must be positive, got {height}"
+         assert width > 0, f"width must be positive, got {width}"
+         assert batch_size > 0, f"batch_size must be positive, got {batch_size}"
+
+         # Generate grid coordinates for each dimension (frame, height, width)
+         # We use torch.arange to create the starting coordinates for each patch.
+         # indexing='ij' ensures the dimensions are in the order (frame, height, width).
+         grid_coords = torch.meshgrid(
+             torch.arange(start=0, end=frames, step=self._patch_size[0], device=device),
+             torch.arange(start=0, end=height, step=self._patch_size[1], device=device),
+             torch.arange(start=0, end=width, step=self._patch_size[2], device=device),
+             indexing="ij",
+         )
+
+         # Stack the grid coordinates to create the start coordinates tensor.
+         # Shape becomes (3, grid_f, grid_h, grid_w)
+         patch_starts = torch.stack(grid_coords, dim=0)
+
+         # Create a tensor containing the size of a single patch:
+         # (frame_patch_size, height_patch_size, width_patch_size).
+         # Reshape to (3, 1, 1, 1) to enable broadcasting when adding to the start coordinates.
+         patch_size_delta = torch.tensor(
+             self._patch_size,
+             device=patch_starts.device,
+             dtype=patch_starts.dtype,
+         ).view(3, 1, 1, 1)
+
+         # Calculate end coordinates: start + patch_size
+         # Shape becomes (3, grid_f, grid_h, grid_w)
+         patch_ends = patch_starts + patch_size_delta
+
+         # Stack start and end coordinates together along the last dimension
+         # Shape becomes (3, grid_f, grid_h, grid_w, 2), where the last dimension is [start, end]
+         latent_coords = torch.stack((patch_starts, patch_ends), dim=-1)
+
+         # Broadcast to batch size and flatten all spatial/temporal dimensions into one sequence.
+         # Final Shape: (batch_size, 3, num_patches, 2)
+         latent_coords = einops.repeat(
+             latent_coords,
+             "c f h w bounds -> b c (f h w) bounds",
+             b=batch_size,
+             bounds=2,
+         )
+
+         return latent_coords
+
+
+ def get_pixel_coords(
+     latent_coords: torch.Tensor,
+     scale_factors: SpatioTemporalScaleFactors,
+     causal_fix: bool = False,
+ ) -> torch.Tensor:
+     """
+     Map latent-space `[start, end)` coordinates to their pixel-space equivalents by scaling
+     each axis (frame/time, height, width) with the corresponding VAE downsampling factors.
+     Optionally compensate for causal encoding that keeps the first frame at unit temporal scale.
+     Args:
+         latent_coords: Tensor of latent bounds shaped `(batch, 3, num_patches, 2)`.
+         scale_factors: SpatioTemporalScaleFactors tuple `(temporal, height, width)` with integer scale factors applied
+             per axis.
+         causal_fix: When True, rewrites the temporal axis of the first frame so causal VAEs
+             that treat frame zero differently still yield non-negative timestamps.
+     """
+     # Broadcast the VAE scale factors so they align with the `(batch, axis, patch, bound)` layout.
+     broadcast_shape = [1] * latent_coords.ndim
+     broadcast_shape[1] = -1  # axis dimension corresponds to (frame/time, height, width)
+     scale_tensor = torch.tensor(scale_factors, device=latent_coords.device).view(*broadcast_shape)
+
+     # Apply per-axis scaling to convert latent bounds into pixel-space coordinates.
+     pixel_coords = latent_coords * scale_tensor
+
+     if causal_fix:
+         # VAE temporal stride for the very first frame is 1 instead of `scale_factors[0]`.
+         # Shift and clamp to keep the first-frame timestamps causal and non-negative.
+         pixel_coords[:, 0, ...] = (pixel_coords[:, 0, ...] + 1 - scale_factors[0]).clamp(min=0)
+
+     return pixel_coords
+
+
+ class AudioPatchifier(Patchifier):
+     def __init__(
+         self,
+         patch_size: int,
+         sample_rate: int = 16000,
+         hop_length: int = 160,
+         audio_latent_downsample_factor: int = 4,
+         is_causal: bool = True,
+         shift: int = 0,
+     ):
+         """
+         Patchifier tailored for spectrogram/audio latents.
+         Args:
+             patch_size: Number of mel bins combined into a single patch. This
+                 controls the resolution along the frequency axis.
+             sample_rate: Original waveform sampling rate. Used to map latent
+                 indices back to seconds so downstream consumers can align audio
+                 and video cues.
+             hop_length: Window hop length used for the spectrogram. Determines
+                 how many real-time samples separate two consecutive latent frames.
+             audio_latent_downsample_factor: Ratio between spectrogram frames and
+                 latent frames; compensates for additional downsampling inside the
+                 VAE encoder.
+             is_causal: When True, timing is shifted to account for causal
+                 receptive fields so timestamps do not peek into the future.
+             shift: Integer offset applied to the latent indices. Enables
+                 constructing overlapping windows from the same latent sequence.
+         """
+         self.hop_length = hop_length
+         self.sample_rate = sample_rate
+         self.audio_latent_downsample_factor = audio_latent_downsample_factor
+         self.is_causal = is_causal
+         self.shift = shift
+         self._patch_size = (1, patch_size, patch_size)
+
+     @property
+     def patch_size(self) -> Tuple[int, int, int]:
+         return self._patch_size
+
+     def get_token_count(self, tgt_shape: AudioLatentShape) -> int:
+         return tgt_shape.frames
+
+     def _get_audio_latent_time_in_sec(
+         self,
+         start_latent: int,
+         end_latent: int,
+         dtype: torch.dtype,
+         device: Optional[torch.device] = None,
+     ) -> torch.Tensor:
+         """
+         Converts latent indices into real-time seconds while honoring causal
+         offsets and the configured hop length.
+         Args:
+             start_latent: Inclusive start index inside the latent sequence. This
+                 sets the first timestamp returned.
+             end_latent: Exclusive end index. Determines how many timestamps get
+                 generated.
+             dtype: Floating-point dtype used for the returned tensor, allowing
+                 callers to control precision.
+             device: Target device for the timestamp tensor. When omitted the
+                 computation occurs on CPU to avoid surprising GPU allocations.
+         """
+         if device is None:
+             device = torch.device("cpu")
+
+         audio_latent_frame = torch.arange(start_latent, end_latent, dtype=dtype, device=device)
+
+         audio_mel_frame = audio_latent_frame * self.audio_latent_downsample_factor
+
+         if self.is_causal:
+             # Frame offset for causal alignment.
+             # The "+1" ensures the timestamp corresponds to the first sample that is fully available.
+             causal_offset = 1
+             audio_mel_frame = (audio_mel_frame + causal_offset - self.audio_latent_downsample_factor).clip(min=0)
+
+         return audio_mel_frame * self.hop_length / self.sample_rate
+
+     def _compute_audio_timings(
+         self,
+         batch_size: int,
+         num_steps: int,
+         device: Optional[torch.device] = None,
+     ) -> torch.Tensor:
+         """
+         Builds a `(B, 1, T, 2)` tensor containing timestamps for each latent frame.
+         This helper method underpins `get_patch_grid_bounds` for the audio patchifier.
+         Args:
+             batch_size: Number of sequences to broadcast the timings over.
+             num_steps: Number of latent frames (time steps) to convert into timestamps.
+             device: Device on which the resulting tensor should reside.
+         """
+         resolved_device = device
+         if resolved_device is None:
+             resolved_device = torch.device("cpu")
+
+         start_timings = self._get_audio_latent_time_in_sec(
+             self.shift,
+             num_steps + self.shift,
+             torch.float32,
+             resolved_device,
+         )
+         start_timings = start_timings.unsqueeze(0).expand(batch_size, -1).unsqueeze(1)
+
+         end_timings = self._get_audio_latent_time_in_sec(
+             self.shift + 1,
+             num_steps + self.shift + 1,
+             torch.float32,
+             resolved_device,
+         )
+         end_timings = end_timings.unsqueeze(0).expand(batch_size, -1).unsqueeze(1)
+
+         return torch.stack([start_timings, end_timings], dim=-1)
+
+     def patchify(
+         self,
+         audio_latents: torch.Tensor,
+     ) -> torch.Tensor:
+         """
+         Flattens the audio latent tensor along time. Use `get_patch_grid_bounds`
+         to derive timestamps for each latent frame based on the configured hop
+         length and downsampling.
+         Args:
+             audio_latents: Latent tensor to patchify.
+         Returns:
+             Flattened patch tokens tensor. Use `get_patch_grid_bounds` to compute the
+             corresponding timing metadata when needed.
+         """
+         audio_latents = einops.rearrange(
+             audio_latents,
+             "b c t f -> b t (c f)",
+         )
+
+         return audio_latents
+
+     def unpatchify(
+         self,
+         audio_latents: torch.Tensor,
+         output_shape: AudioLatentShape,
+     ) -> torch.Tensor:
+         """
+         Restores the `(B, C, T, F)` spectrogram tensor from flattened patches.
+         Use `get_patch_grid_bounds` to recompute the timestamps that describe each
+         frame's position in real time.
+         Args:
+             audio_latents: Latent tensor to unpatchify.
+             output_shape: Shape of the unpatched output tensor.
+         Returns:
+             Unpatched latent tensor. Use `get_patch_grid_bounds` to compute the timing
+             metadata associated with the restored latents.
+         """
+         # audio_latents shape: (batch, time, freq * channels)
+         audio_latents = einops.rearrange(
+             audio_latents,
+             "b t (c f) -> b c t f",
+             c=output_shape.channels,
+             f=output_shape.mel_bins,
+         )
+
+         return audio_latents
+
+     def get_patch_grid_bounds(
+         self,
+         output_shape: AudioLatentShape | VideoLatentShape,
+         device: Optional[torch.device] = None,
+     ) -> torch.Tensor:
+         """
+         Return the temporal bounds `[inclusive start, exclusive end)` for every
+         patch emitted by `patchify`. For audio this corresponds to timestamps in
+         seconds aligned with the original spectrogram grid.
+         The returned tensor has shape `[batch_size, 1, time_steps, 2]`, where:
+         - axis 1 (size 1) represents the temporal dimension
+         - axis 3 (size 2) stores the `[start, end)` timestamps per patch
+         Args:
+             output_shape: Audio grid specification describing the number of time steps.
+             device: Target device for the returned tensor.
+         """
+         if not isinstance(output_shape, AudioLatentShape):
+             raise ValueError("AudioPatchifier expects AudioLatentShape when computing coordinates")
+
+         return self._compute_audio_timings(output_shape.batch, output_shape.frames, device)
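
A shape sketch for the video patchifier. With a temporal patch of 1 and a spatial patch of 2, a (B, C, F, H, W) latent becomes F*(H/2)*(W/2) tokens of C*2*2 channels (the round trip back through `unpatchify` needs a VideoLatentShape from ltx_core.types, whose constructor is not shown in this commit):

import torch

from ltx_core.components.patchifiers import VideoLatentPatchifier

patchifier = VideoLatentPatchifier(patch_size=2)
latents = torch.randn(1, 16, 8, 32, 32)  # (B, C, F, H, W)
tokens = patchifier.patchify(latents)    # -> (1, 8*16*16, 16*1*2*2)
print(tokens.shape)                      # torch.Size([1, 2048, 64])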
packages/ltx-core/src/ltx_core/components/protocols.py ADDED
@@ -0,0 +1,101 @@
+ from typing import Protocol, Tuple
+
+ import torch
+
+ from ltx_core.types import AudioLatentShape, VideoLatentShape
+
+
+ class Patchifier(Protocol):
+     """
+     Protocol for patchifiers that convert latent tensors into patches and assemble them back.
+     """
+
+     def patchify(
+         self,
+         latents: torch.Tensor,
+     ) -> torch.Tensor:
+         ...
+         """
+         Convert latent tensors into flattened patch tokens.
+         Args:
+             latents: Latent tensor to patchify.
+         Returns:
+             Flattened patch tokens tensor.
+         """
+
+     def unpatchify(
+         self,
+         latents: torch.Tensor,
+         output_shape: AudioLatentShape | VideoLatentShape,
+     ) -> torch.Tensor:
+         """
+         Converts latent tensors between spatio-temporal formats and flattened sequence representations.
+         Args:
+             latents: Patch tokens that must be rearranged back into the latent grid constructed by `patchify`.
+             output_shape: Shape of the output tensor. Note that output_shape is either AudioLatentShape or
+                 VideoLatentShape.
+         Returns:
+             Dense latent tensor restored from the flattened representation.
+         """
+
+     @property
+     def patch_size(self) -> Tuple[int, int, int]:
+         ...
+         """
+         Returns the patch size as a tuple of (temporal, height, width) dimensions.
+         """
+
+     def get_patch_grid_bounds(
+         self,
+         output_shape: AudioLatentShape | VideoLatentShape,
+         device: torch.device | None = None,
+     ) -> torch.Tensor:
+         ...
+         """
+         Compute metadata describing where each latent patch resides within the
+         grid specified by `output_shape`.
+         Args:
+             output_shape: Target grid layout for the patches.
+             device: Target device for the returned tensor.
+         Returns:
+             Tensor containing patch coordinate metadata such as spatial or temporal intervals.
+         """
+
+
+ class SchedulerProtocol(Protocol):
+     """
+     Protocol for schedulers that provide a sigmas schedule tensor for a
+     given number of steps. Device is cpu.
+     """
+
+     def execute(self, steps: int, **kwargs) -> torch.FloatTensor: ...
+
+
+ class GuiderProtocol(Protocol):
+     """
+     Protocol for guiders that compute a delta tensor given conditioning inputs.
+     The returned delta should be added to the conditional output (cond), enabling
+     multiple guiders to be chained together by accumulating their deltas.
+     """
+
+     scale: float
+
+     def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor: ...
+
+     def enabled(self) -> bool:
+         """
+         Returns whether the corresponding perturbation is enabled. E.g. for CFG, this should return False if the scale
+         is 1.0.
+         """
+         ...
+
+
+ class DiffusionStepProtocol(Protocol):
+     """
+     Protocol for diffusion steps that provide a next sample tensor for a given current sample tensor,
+     current denoised sample tensor, and sigmas tensor.
+     """
+
+     def step(
+         self, sample: torch.Tensor, denoised_sample: torch.Tensor, sigmas: torch.Tensor, step_index: int, **kwargs
+     ) -> torch.Tensor: ...
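
Because these are `typing.Protocol` classes, components are matched structurally: a new guider, scheduler, or stepper only needs the right attributes and signatures, with no inheritance required. An illustrative sketch (not part of this commit) of a class that satisfies GuiderProtocol:

import torch

class IdentityGuider:
    """Structurally satisfies GuiderProtocol without subclassing it."""

    scale: float = 1.0

    def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
        # Contributes nothing when its delta is accumulated onto cond.
        return torch.zeros_like(cond)

    def enabled(self) -> bool:
        return False  # scale of 1.0 means no guidance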
packages/ltx-core/src/ltx_core/components/schedulers.py ADDED
@@ -0,0 +1,130 @@
+ import math
+ from functools import lru_cache
+
+ import numpy
+ import scipy.stats
+ import torch
+
+ from ltx_core.components.protocols import SchedulerProtocol
+
+ BASE_SHIFT_ANCHOR = 1024
+ MAX_SHIFT_ANCHOR = 4096
+
+
+ class LTX2Scheduler(SchedulerProtocol):
+     """
+     Default scheduler for LTX-2 diffusion sampling.
+     Generates a sigma schedule with token-count-dependent shifting and optional
+     stretching to a terminal value.
+     """
+
+     def execute(
+         self,
+         steps: int,
+         latent: torch.Tensor | None = None,
+         max_shift: float = 2.05,
+         base_shift: float = 0.95,
+         stretch: bool = True,
+         terminal: float = 0.1,
+         default_number_of_tokens: int = MAX_SHIFT_ANCHOR,
+         **_kwargs,
+     ) -> torch.FloatTensor:
+         tokens = math.prod(latent.shape[2:]) if latent is not None else default_number_of_tokens
+         sigmas = torch.linspace(1.0, 0.0, steps + 1)
+
+         x1 = BASE_SHIFT_ANCHOR
+         x2 = MAX_SHIFT_ANCHOR
+         mm = (max_shift - base_shift) / (x2 - x1)
+         b = base_shift - mm * x1
+         sigma_shift = tokens * mm + b
+
+         power = 1
+         sigmas = torch.where(
+             sigmas != 0,
+             math.exp(sigma_shift) / (math.exp(sigma_shift) + (1 / sigmas - 1) ** power),
+             0,
+         )
+
+         # Stretch sigmas so that the final value matches the given terminal value.
+         if stretch:
+             non_zero_mask = sigmas != 0
+             non_zero_sigmas = sigmas[non_zero_mask]
+             one_minus_z = 1.0 - non_zero_sigmas
+             scale_factor = one_minus_z[-1] / (1.0 - terminal)
+             stretched = 1.0 - (one_minus_z / scale_factor)
+             sigmas[non_zero_mask] = stretched
+
+         return sigmas.to(torch.float32)
+
+
+ class LinearQuadraticScheduler(SchedulerProtocol):
+     """
+     Scheduler with linear steps followed by quadratic steps.
+     Produces a sigma schedule that transitions linearly up to a threshold,
+     then follows a quadratic curve for the remaining steps.
+     """
+
+     def execute(
+         self, steps: int, threshold_noise: float = 0.025, linear_steps: int | None = None, **_kwargs
+     ) -> torch.FloatTensor:
+         if steps == 1:
+             return torch.FloatTensor([1.0, 0.0])
+
+         if linear_steps is None:
+             linear_steps = steps // 2
+         linear_sigma_schedule = [i * threshold_noise / linear_steps for i in range(linear_steps)]
+         threshold_noise_step_diff = linear_steps - threshold_noise * steps
+         quadratic_steps = steps - linear_steps
+         quadratic_sigma_schedule = []
+         if quadratic_steps > 0:
+             quadratic_coef = threshold_noise_step_diff / (linear_steps * quadratic_steps**2)
+             linear_coef = threshold_noise / linear_steps - 2 * threshold_noise_step_diff / (quadratic_steps**2)
+             const = quadratic_coef * (linear_steps**2)
+             quadratic_sigma_schedule = [
+                 quadratic_coef * (i**2) + linear_coef * i + const for i in range(linear_steps, steps)
+             ]
+         sigma_schedule = linear_sigma_schedule + quadratic_sigma_schedule + [1.0]
+         sigma_schedule = [1.0 - x for x in sigma_schedule]
+         return torch.FloatTensor(sigma_schedule)
+
+
+ class BetaScheduler(SchedulerProtocol):
+     """
+     Scheduler using a beta distribution to sample timesteps.
+     Based on: https://arxiv.org/abs/2407.12173
+     """
+
+     shift = 2.37
+     timesteps_length = 10000
+
+     def execute(self, steps: int, alpha: float = 0.6, beta: float = 0.6) -> torch.FloatTensor:
+         """
+         Execute the beta scheduler.
+         Args:
+             steps: The number of steps to execute the scheduler for.
+             alpha: The alpha parameter for the beta distribution.
+             beta: The beta parameter for the beta distribution.
+         Warnings:
+             The number of entries in `sigmas` may be less than `steps + 1`
+             because identical timesteps are deduplicated.
+         Returns:
+             A tensor of sigmas.
+         """
+         model_sampling_sigmas = _precalculate_model_sampling_sigmas(self.shift, self.timesteps_length)
+         total_timesteps = len(model_sampling_sigmas) - 1
+         ts = 1 - numpy.linspace(0, 1, steps, endpoint=False)
+         ts = numpy.rint(scipy.stats.beta.ppf(ts, alpha, beta) * total_timesteps).tolist()
+         ts = list(dict.fromkeys(ts))
+
+         sigmas = [float(model_sampling_sigmas[int(t)]) for t in ts] + [0.0]
+         return torch.FloatTensor(sigmas)
+
+
+ @lru_cache(maxsize=5)
+ def _precalculate_model_sampling_sigmas(shift: float, timesteps_length: int) -> torch.Tensor:
+     timesteps = torch.arange(1, timesteps_length + 1, 1) / timesteps_length
+     return torch.Tensor([flux_time_shift(shift, 1.0, t) for t in timesteps])
+
+
+ def flux_time_shift(mu: float, sigma: float, t: float) -> float:
+     return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
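
A minimal sketch comparing the schedules above. Passing a latent to LTX2Scheduler shifts the schedule based on its token count (the product of the dimensions after batch and channel); without one, `default_number_of_tokens` is used:

import torch

from ltx_core.components.schedulers import LTX2Scheduler, LinearQuadraticScheduler

sigmas_default = LTX2Scheduler().execute(steps=8)  # 9 values from ~1.0 down to 0.0
latent = torch.zeros(1, 128, 8, 32, 32)            # 8*32*32 = 8192 tokens -> stronger shift
sigmas_shifted = LTX2Scheduler().execute(steps=8, latent=latent)
sigmas_linquad = LinearQuadraticScheduler().execute(steps=8, threshold_noise=0.025)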
packages/ltx-core/src/ltx_core/conditioning/__init__.py ADDED
@@ -0,0 +1,19 @@
+ """Conditioning utilities: latent state, tools, and conditioning types."""
+
+ from ltx_core.conditioning.exceptions import ConditioningError
+ from ltx_core.conditioning.item import ConditioningItem
+ from ltx_core.conditioning.types import (
+     ConditioningItemAttentionStrengthWrapper,
+     VideoConditionByKeyframeIndex,
+     VideoConditionByLatentIndex,
+     VideoConditionByReferenceLatent,
+ )
+
+ __all__ = [
+     "ConditioningError",
+     "ConditioningItem",
+     "ConditioningItemAttentionStrengthWrapper",
+     "VideoConditionByKeyframeIndex",
+     "VideoConditionByLatentIndex",
+     "VideoConditionByReferenceLatent",
+ ]
packages/ltx-core/src/ltx_core/conditioning/exceptions.py ADDED
@@ -0,0 +1,4 @@
+ class ConditioningError(Exception):
+     """
+     Class for conditioning-related errors.
+     """
packages/ltx-core/src/ltx_core/conditioning/item.py ADDED
@@ -0,0 +1,20 @@
+ from typing import Protocol
+
+ from ltx_core.tools import LatentTools
+ from ltx_core.types import LatentState
+
+
+ class ConditioningItem(Protocol):
+     """Protocol for conditioning items that modify latent state during diffusion."""
+
+     def apply_to(self, latent_state: LatentState, latent_tools: LatentTools) -> LatentState:
+         """
+         Apply the conditioning to the latent state.
+         Args:
+             latent_state: The latent state to apply the conditioning to. This state is always patchified.
+         Returns:
+             The latent state after the conditioning has been applied.
+         IMPORTANT: If the conditioning needs to add extra tokens to the latent, it should add them to the end of the
+         latent.
+         """
+         ...
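
Like the other protocols in this commit, ConditioningItem is matched structurally. An illustrative no-op implementation (not part of this commit) that satisfies the protocol:

from ltx_core.tools import LatentTools
from ltx_core.types import LatentState

class NoOpConditioning:
    """Structurally satisfies ConditioningItem: returns the state unchanged."""

    def apply_to(self, latent_state: LatentState, latent_tools: LatentTools) -> LatentState:
        # A real item would write conditioning latents into the (patchified) state
        # and, if it adds extra tokens, append them at the end of the latent sequence.
        return latent_state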
packages/ltx-core/src/ltx_core/conditioning/mask_utils.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utilities for building 2D self-attention masks for conditioning items."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import torch
8
+
9
+ if TYPE_CHECKING:
10
+ from ltx_core.types import LatentState
11
+
12
+
13
+ def resolve_cross_mask(
14
+ attention_mask: float | int | torch.Tensor,
15
+ num_new_tokens: int,
16
+ batch_size: int,
17
+ device: torch.device,
18
+ dtype: torch.dtype,
19
+ ) -> torch.Tensor:
20
+ """Convert an attention_mask (scalar or tensor) to a (B, M) cross_mask tensor.
21
+ Args:
22
+ attention_mask: Scalar value applied uniformly, 1D tensor of shape (M,)
23
+ broadcast across batch, or 2D tensor of shape (B, M).
24
+ num_new_tokens: Number of new conditioning tokens M.
25
+ batch_size: Batch size B.
26
+ device: Device for the output tensor.
27
+ dtype: Data type for the output tensor.
28
+ Returns:
29
+ Cross-mask tensor of shape (B, M).
30
+ """
31
+ if isinstance(attention_mask, (int, float)):
32
+ return torch.full(
33
+ (batch_size, num_new_tokens),
34
+ fill_value=float(attention_mask),
35
+ device=device,
36
+ dtype=dtype,
37
+ )
38
+ mask = attention_mask.to(device=device, dtype=dtype)
39
+
40
+ # Handle scalar (0-D) tensor like a Python scalar.
41
+ if mask.dim() == 0:
42
+ return torch.full(
43
+ (batch_size, num_new_tokens),
44
+ fill_value=float(mask.item()),
45
+ device=device,
46
+ dtype=dtype,
47
+ )
48
+
49
+ if mask.dim() == 1:
50
+ if mask.shape[0] != num_new_tokens:
51
+ raise ValueError(
52
+ f"1-D attention_mask length must equal num_new_tokens ({num_new_tokens}), got shape {tuple(mask.shape)}"
53
+ )
54
+ mask = mask.unsqueeze(0).expand(batch_size, -1)
55
+     elif mask.dim() == 2:
+         b, m = mask.shape
+         if m != num_new_tokens:
+             raise ValueError(
+                 f"2-D attention_mask second dimension must equal num_new_tokens ({num_new_tokens}), "
+                 f"got shape {tuple(mask.shape)}"
+             )
+         if b not in (batch_size, 1):
+             raise ValueError(
+                 f"2-D attention_mask batch dimension must equal batch_size ({batch_size}) or 1, "
+                 f"got shape {tuple(mask.shape)}"
+             )
+         if b == 1 and batch_size > 1:
+             mask = mask.expand(batch_size, -1)
+     else:
+         raise ValueError(
+             f"attention_mask tensor must be 0-D, 1-D, or 2-D, got {mask.dim()}-D with shape {tuple(mask.shape)}"
+         )
+     return mask
+
+
+ def update_attention_mask(
+     latent_state: LatentState,
+     attention_mask: float | torch.Tensor | None,
+     num_noisy_tokens: int,
+     num_new_tokens: int,
+     batch_size: int,
+     device: torch.device,
+     dtype: torch.dtype,
+ ) -> torch.Tensor | None:
+     """Build or update the self-attention mask for newly appended conditioning tokens.
+
+     If *attention_mask* is ``None`` and no existing mask is present, returns
+     ``None``. If *attention_mask* is ``None`` but an existing mask is present,
+     the mask is expanded with full attention (1s) for the new tokens so that
+     its dimensions stay consistent with the growing latent sequence. Otherwise,
+     resolves *attention_mask* to a per-token cross-mask and expands the 2-D
+     attention mask via :func:`build_attention_mask`.
+
+     Args:
+         latent_state: Current latent state (provides the existing mask and total
+             existing-token count).
+         attention_mask: Per-token attention weight. Scalar, 1-D ``(M,)``, 2-D
+             ``(B, M)`` tensor, or ``None`` (no-op).
+         num_noisy_tokens: Number of original noisy tokens (from
+             ``latent_tools.target_shape.token_count()``).
+         num_new_tokens: Number of new conditioning tokens being appended.
+         batch_size: Batch size.
+         device: Device for the output tensor.
+         dtype: Data type for the output tensor.
+
+     Returns:
+         Updated attention mask of shape ``(B, N+M, N+M)``, or ``None`` if no
+         masking is needed.
+     """
+     if attention_mask is None:
+         if latent_state.attention_mask is None:
+             return None
+         # Existing mask present but no new mask requested: pad with 1s (full
+         # attention) so the mask dimensions stay consistent with the growing
+         # latent sequence.
+         cross_mask = torch.ones(batch_size, num_new_tokens, device=device, dtype=dtype)
+         return build_attention_mask(
+             existing_mask=latent_state.attention_mask,
+             num_noisy_tokens=num_noisy_tokens,
+             num_new_tokens=num_new_tokens,
+             num_existing_tokens=latent_state.latent.shape[1],
+             cross_mask=cross_mask,
+             device=device,
+             dtype=dtype,
+         )
+
+     cross_mask = resolve_cross_mask(attention_mask, num_new_tokens, batch_size, device, dtype)
+     return build_attention_mask(
+         existing_mask=latent_state.attention_mask,
+         num_noisy_tokens=num_noisy_tokens,
+         num_new_tokens=num_new_tokens,
+         num_existing_tokens=latent_state.latent.shape[1],
+         cross_mask=cross_mask,
+         device=device,
+         dtype=dtype,
+     )
+
+
+ def build_attention_mask(
+     existing_mask: torch.Tensor | None,
+     num_noisy_tokens: int,
+     num_new_tokens: int,
+     num_existing_tokens: int,
+     cross_mask: torch.Tensor,
+     device: torch.device,
+     dtype: torch.dtype,
+ ) -> torch.Tensor:
+     """
+     Expand the attention mask to include newly appended conditioning tokens.
+
+     Each conditioning item appends M new reference tokens to the sequence. This function
+     builds a (B, N+M, N+M) attention mask with the following block structure:
+
+                     noisy       prev_ref    new_ref
+                    (N_noisy)  (N-N_noisy)     (M)
+                  ┌───────────┬───────────┬───────────┐
+       noisy      │           │           │           │
+       (N_noisy)  │ existing  │ existing  │   cross   │
+                  │           │           │           │
+                  ├───────────┼───────────┼───────────┤
+       prev_ref   │           │           │           │
+       (N-N_noisy)│ existing  │ existing  │     0     │
+                  │           │           │           │
+                  ├───────────┼───────────┼───────────┤
+       new_ref    │           │           │           │
+       (M)        │   cross   │     0     │     1     │
+                  │           │           │           │
+                  └───────────┴───────────┴───────────┘
+
+     Where:
+     - **existing**: preserved from the previous mask (or 1.0 if first conditioning)
+     - **cross**: values from *cross_mask* (shape B, M), in [0, 1]
+     - **0**: no attention between different reference groups
+
+     Args:
+         existing_mask: Current attention mask of shape (B, N, N), or None if no mask exists yet.
+             When None, the top-left NxN block is filled with 1s (full attention between all
+             existing tokens including any prior reference tokens that had no mask).
+         num_noisy_tokens: Number of original noisy tokens (always at positions [0:num_noisy_tokens]).
+         num_new_tokens: Number of new conditioning tokens M being appended.
+         num_existing_tokens: Total number of current tokens N (noisy + any prior conditioning tokens).
+         cross_mask: Per-token attention weight of shape (B, M) controlling attention between
+             new reference tokens and noisy tokens. Values in [0, 1].
+         device: Device for the output tensor.
+         dtype: Data type for the output tensor.
+
+     Returns:
+         Attention mask of shape (B, N+M, N+M) with values in [0, 1].
+     """
+     batch_size = cross_mask.shape[0]
+     total = num_existing_tokens + num_new_tokens
+
+     # Start with zeros
+     mask = torch.zeros((batch_size, total, total), device=device, dtype=dtype)
+
+     # Top-left: preserve existing mask or fill with 1s for noisy tokens
+     if existing_mask is not None:
+         mask[:, :num_existing_tokens, :num_existing_tokens] = existing_mask
+     else:
+         mask[:, :num_existing_tokens, :num_existing_tokens] = 1.0
+
+     # Bottom-right: new reference tokens fully attend to themselves
+     mask[:, num_existing_tokens:, num_existing_tokens:] = 1.0
+
+     # Cross-attention between noisy tokens and new reference tokens
+     # cross_mask shape: (B, M) -> broadcast to (B, N_noisy, M) and (B, M, N_noisy)
+
+     # Noisy tokens attending to new reference tokens: [0:N_noisy, N:N+M]
+     # Each column j in this block gets cross_mask[:, j]
+     mask[:, :num_noisy_tokens, num_existing_tokens:] = cross_mask.unsqueeze(1)
+
+     # New reference tokens attending to noisy tokens: [N:N+M, 0:N_noisy]
+     # Each row i in this block gets cross_mask[:, i]
+     mask[:, num_existing_tokens:, :num_noisy_tokens] = cross_mask.unsqueeze(2)
+
+     # [N_noisy:N, N:N+M] and [N:N+M, N_noisy:N] remain 0 (no cross-ref attention)
+
+     return mask
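To make the block structure concrete, here is a minimal sketch (assuming `build_attention_mask` as defined above) with 2 noisy tokens, no prior reference tokens, and 2 new conditioning tokens weighted 1.0 and 0.5:

```python
import torch

cross = torch.tensor([[1.0, 0.5]])  # (B=1, M=2) per-token cross weights
mask = build_attention_mask(
    existing_mask=None,      # first conditioning: top-left block filled with 1s
    num_noisy_tokens=2,
    num_new_tokens=2,
    num_existing_tokens=2,   # N == N_noisy, no prior reference tokens
    cross_mask=cross,
    device=torch.device("cpu"),
    dtype=torch.float32,
)
# mask[0]:
# [[1.0, 1.0, 1.0, 0.5],   noisy rows attend to new tokens with the cross weights
#  [1.0, 1.0, 1.0, 0.5],
#  [1.0, 1.0, 1.0, 1.0],   new-token rows mirror the same weights toward noisy tokens
#  [0.5, 0.5, 1.0, 1.0]]
```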
packages/ltx-core/src/ltx_core/guidance/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """Guidance and perturbation utilities for attention manipulation."""
+
+ from ltx_core.guidance.perturbations import (
+     BatchedPerturbationConfig,
+     Perturbation,
+     PerturbationConfig,
+     PerturbationType,
+ )
+
+ __all__ = [
+     "BatchedPerturbationConfig",
+     "Perturbation",
+     "PerturbationConfig",
+     "PerturbationType",
+ ]
packages/ltx-core/src/ltx_core/guidance/perturbations.py ADDED
@@ -0,0 +1,79 @@
+ from dataclasses import dataclass
+ from enum import Enum
+
+ import torch
+ from torch._prims_common import DeviceLikeType
+
+
+ class PerturbationType(Enum):
+     """Types of attention perturbations for STG (Spatio-Temporal Guidance)."""
+
+     SKIP_A2V_CROSS_ATTN = "skip_a2v_cross_attn"
+     SKIP_V2A_CROSS_ATTN = "skip_v2a_cross_attn"
+     SKIP_VIDEO_SELF_ATTN = "skip_video_self_attn"
+     SKIP_AUDIO_SELF_ATTN = "skip_audio_self_attn"
+
+
+ @dataclass(frozen=True)
+ class Perturbation:
+     """A single perturbation specifying which attention type to skip and in which blocks."""
+
+     type: PerturbationType
+     blocks: list[int] | None  # None means all blocks
+
+     def is_perturbed(self, perturbation_type: PerturbationType, block: int) -> bool:
+         if self.type != perturbation_type:
+             return False
+
+         if self.blocks is None:
+             return True
+
+         return block in self.blocks
+
+
+ @dataclass(frozen=True)
+ class PerturbationConfig:
+     """Configuration holding a list of perturbations for a single sample."""
+
+     perturbations: list[Perturbation] | None
+
+     def is_perturbed(self, perturbation_type: PerturbationType, block: int) -> bool:
+         if self.perturbations is None:
+             return False
+
+         return any(perturbation.is_perturbed(perturbation_type, block) for perturbation in self.perturbations)
+
+     @staticmethod
+     def empty() -> "PerturbationConfig":
+         return PerturbationConfig([])
+
+
+ @dataclass(frozen=True)
+ class BatchedPerturbationConfig:
+     """Perturbation configurations for a batch, with utilities for generating attention masks."""
+
+     perturbations: list[PerturbationConfig]
+
+     def mask(
+         self, perturbation_type: PerturbationType, block: int, device: DeviceLikeType, dtype: torch.dtype
+     ) -> torch.Tensor:
+         mask = torch.ones((len(self.perturbations),), device=device, dtype=dtype)
+         for batch_idx, perturbation in enumerate(self.perturbations):
+             if perturbation.is_perturbed(perturbation_type, block):
+                 mask[batch_idx] = 0
+
+         return mask
+
+     def mask_like(self, perturbation_type: PerturbationType, block: int, values: torch.Tensor) -> torch.Tensor:
+         mask = self.mask(perturbation_type, block, values.device, values.dtype)
+         return mask.view(mask.numel(), *([1] * len(values.shape[1:])))
+
+     def any_in_batch(self, perturbation_type: PerturbationType, block: int) -> bool:
+         return any(perturbation.is_perturbed(perturbation_type, block) for perturbation in self.perturbations)
+
+     def all_in_batch(self, perturbation_type: PerturbationType, block: int) -> bool:
+         return all(perturbation.is_perturbed(perturbation_type, block) for perturbation in self.perturbations)
+
+     @staticmethod
+     def empty(batch_size: int) -> "BatchedPerturbationConfig":
+         return BatchedPerturbationConfig([PerturbationConfig.empty() for _ in range(batch_size)])
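A minimal sketch of how a transformer block might consult these configs; the attention-output tensor here is a hypothetical stand-in, but the `mask_like` broadcasting is as defined above:

```python
import torch
from ltx_core.guidance import BatchedPerturbationConfig, Perturbation, PerturbationConfig, PerturbationType

batch = BatchedPerturbationConfig([
    PerturbationConfig([Perturbation(PerturbationType.SKIP_VIDEO_SELF_ATTN, blocks=[0, 1])]),
    PerturbationConfig.empty(),  # second sample is unperturbed
])

attn_out = torch.randn(2, 128, 512)  # hypothetical (batch, tokens, dim) attention output
if batch.any_in_batch(PerturbationType.SKIP_VIDEO_SELF_ATTN, block=0):
    # mask_like returns shape (2, 1, 1): zeros out the contribution only for perturbed samples.
    attn_out = attn_out * batch.mask_like(PerturbationType.SKIP_VIDEO_SELF_ATTN, 0, attn_out)
```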
packages/ltx-core/src/ltx_core/loader/__init__.py ADDED
@@ -0,0 +1,48 @@
+ """Loader utilities for model weights, LoRAs, and safetensor operations."""
+
+ from ltx_core.loader.fuse_loras import apply_loras
+ from ltx_core.loader.module_ops import ModuleOps
+ from ltx_core.loader.primitives import (
+     LoRAAdaptableProtocol,
+     LoraPathStrengthAndSDOps,
+     LoraStateDictWithStrength,
+     ModelBuilderProtocol,
+     StateDict,
+     StateDictLoader,
+ )
+ from ltx_core.loader.registry import DummyRegistry, Registry, StateDictRegistry
+ from ltx_core.loader.sd_ops import (
+     LTXV_LORA_COMFY_RENAMING_MAP,
+     ContentMatching,
+     ContentReplacement,
+     KeyValueOperation,
+     KeyValueOperationResult,
+     SDKeyValueOperation,
+     SDOps,
+ )
+ from ltx_core.loader.sft_loader import SafetensorsModelStateDictLoader, SafetensorsStateDictLoader
+ from ltx_core.loader.single_gpu_model_builder import SingleGPUModelBuilder
+
+ __all__ = [
+     "LTXV_LORA_COMFY_RENAMING_MAP",
+     "ContentMatching",
+     "ContentReplacement",
+     "DummyRegistry",
+     "KeyValueOperation",
+     "KeyValueOperationResult",
+     "LoRAAdaptableProtocol",
+     "LoraPathStrengthAndSDOps",
+     "LoraStateDictWithStrength",
+     "ModelBuilderProtocol",
+     "ModuleOps",
+     "Registry",
+     "SDKeyValueOperation",
+     "SDOps",
+     "SafetensorsModelStateDictLoader",
+     "SafetensorsStateDictLoader",
+     "SingleGPUModelBuilder",
+     "StateDict",
+     "StateDictLoader",
+     "StateDictRegistry",
+     "apply_loras",
+ ]
packages/ltx-core/src/ltx_core/loader/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (1.33 kB).
packages/ltx-core/src/ltx_core/loader/__pycache__/fuse_loras.cpython-312.pyc ADDED
Binary file (7.41 kB).
packages/ltx-core/src/ltx_core/loader/__pycache__/module_ops.cpython-312.pyc ADDED
Binary file (955 Bytes).
packages/ltx-core/src/ltx_core/loader/__pycache__/primitives.cpython-312.pyc ADDED
Binary file (5.37 kB).
packages/ltx-core/src/ltx_core/loader/__pycache__/registry.cpython-312.pyc ADDED
Binary file (5.68 kB).
packages/ltx-core/src/ltx_core/loader/__pycache__/sd_ops.cpython-312.pyc ADDED
Binary file (6.81 kB).
packages/ltx-core/src/ltx_core/loader/__pycache__/sft_loader.cpython-312.pyc ADDED
Binary file (4.36 kB).
packages/ltx-core/src/ltx_core/loader/__pycache__/single_gpu_model_builder.cpython-312.pyc ADDED
Binary file (8.84 kB).
packages/ltx-core/src/ltx_core/loader/fuse_loras.py ADDED
@@ -0,0 +1,153 @@
+ import torch
+
+ from ltx_core.loader.primitives import LoraStateDictWithStrength, StateDict
+ from ltx_core.quantization.fp8_cast import calculate_weight_float8
+ from ltx_core.quantization.fp8_scaled_mm import quantize_weight_to_fp8_per_tensor
+
+
+ def apply_loras(
+     model_sd: StateDict,
+     lora_sd_and_strengths: list[LoraStateDictWithStrength],
+     dtype: torch.dtype | None = None,
+     destination_sd: StateDict | None = None,
+ ) -> StateDict:
+     """Fuse weighted LoRA deltas into *model_sd*, optionally casting to *dtype* and writing into *destination_sd*."""
+     sd = {}
+     if destination_sd is not None:
+         sd = destination_sd.sd
+     size = 0
+     device = torch.device("meta")
+     inner_dtypes = set()
+     for key, weight in model_sd.sd.items():
+         if weight is None:
+             continue
+         # Skip scale keys - they are handled together with their weight keys
+         if key.endswith(".weight_scale"):
+             continue
+         device = weight.device
+         target_dtype = dtype if dtype is not None else weight.dtype
+         deltas_dtype = target_dtype if target_dtype not in [torch.float8_e4m3fn, torch.float8_e5m2] else torch.bfloat16
+
+         scale_key = key.replace(".weight", ".weight_scale") if key.endswith(".weight") else None
+         is_scaled_fp8 = scale_key is not None and scale_key in model_sd.sd
+
+         deltas = _prepare_deltas(lora_sd_and_strengths, key, deltas_dtype, device)
+         fused = _fuse_deltas(deltas, weight, key, sd, target_dtype, device, is_scaled_fp8, scale_key, model_sd)
+
+         sd.update(fused)
+         for tensor in fused.values():
+             inner_dtypes.add(tensor.dtype)
+             size += tensor.nbytes
+
+     if destination_sd is not None:
+         return destination_sd
+     return StateDict(sd, device, size, inner_dtypes)
+
+
+ def _prepare_deltas(
+     lora_sd_and_strengths: list[LoraStateDictWithStrength], key: str, dtype: torch.dtype, device: torch.device
+ ) -> torch.Tensor | None:
+     deltas = []
+     prefix = key[: -len(".weight")]
+     key_a = f"{prefix}.lora_A.weight"
+     key_b = f"{prefix}.lora_B.weight"
+     for lsd, coef in lora_sd_and_strengths:
+         if key_a not in lsd.sd or key_b not in lsd.sd:
+             continue
+         a = lsd.sd[key_a].to(device=device)
+         b = lsd.sd[key_b].to(device=device)
+         product = torch.matmul(b * coef, a)
+         del a, b
+         deltas.append(product.to(dtype=dtype))
+     if len(deltas) == 0:
+         return None
+     elif len(deltas) == 1:
+         return deltas[0]
+     return torch.sum(torch.stack(deltas, dim=0), dim=0)
+
+
+ def _fuse_deltas(
+     deltas: torch.Tensor | None,
+     weight: torch.Tensor,
+     key: str,
+     sd: dict[str, torch.Tensor],
+     target_dtype: torch.dtype,
+     device: torch.device,
+     is_scaled_fp8: bool,
+     scale_key: str | None,
+     model_sd: StateDict,
+ ) -> dict[str, torch.Tensor]:
+     if deltas is None:
+         if key in sd:
+             return {}
+         fused = _copy_weight_without_lora(weight, key, target_dtype, device, is_scaled_fp8, scale_key, model_sd)
+     elif weight.dtype == torch.float8_e4m3fn:
+         if is_scaled_fp8:
+             fused = _fuse_delta_with_scaled_fp8(deltas, weight, key, scale_key, model_sd)
+         else:
+             fused = _fuse_delta_with_cast_fp8(deltas, weight, key, target_dtype, device)
+     elif weight.dtype == torch.bfloat16:
+         fused = _fuse_delta_with_bfloat16(deltas, weight, key, target_dtype)
+     else:
+         raise ValueError(f"Unsupported dtype: {weight.dtype}")
+
+     return fused
+
+
+ def _copy_weight_without_lora(
+     weight: torch.Tensor,
+     key: str,
+     target_dtype: torch.dtype,
+     device: torch.device,
+     is_scaled_fp8: bool,
+     scale_key: str | None,
+     model_sd: StateDict,
+ ) -> dict[str, torch.Tensor]:
+     """Copy original weight (and scale if applicable) when no LoRA affects this key."""
+     result = {key: weight.clone().to(dtype=target_dtype, device=device)}
+     if is_scaled_fp8:
+         result[scale_key] = model_sd.sd[scale_key].clone()
+     return result
+
+
+ def _fuse_delta_with_scaled_fp8(
+     deltas: torch.Tensor,
+     weight: torch.Tensor,
+     key: str,
+     scale_key: str,
+     model_sd: StateDict,
+ ) -> dict[str, torch.Tensor]:
+     """Dequantize scaled FP8 weight, add LoRA delta, and re-quantize."""
+     weight_scale = model_sd.sd[scale_key]
+
+     original_weight = weight.t().to(torch.float32) * weight_scale
+
+     new_weight = original_weight + deltas.to(torch.float32)
+
+     new_fp8_weight, new_weight_scale = quantize_weight_to_fp8_per_tensor(new_weight)
+     return {key: new_fp8_weight, scale_key: new_weight_scale}
+
+
+ def _fuse_delta_with_cast_fp8(
+     deltas: torch.Tensor,
+     weight: torch.Tensor,
+     key: str,
+     target_dtype: torch.dtype,
+     device: torch.device,
+ ) -> dict[str, torch.Tensor]:
+     """Fuse LoRA delta with cast-only FP8 weight (no scale factor)."""
+     if str(device).startswith("cuda"):
+         deltas = calculate_weight_float8(deltas, weight)
+     else:
+         deltas.add_(weight.to(dtype=deltas.dtype, device=device))
+     return {key: deltas.to(dtype=target_dtype)}
+
+
+ def _fuse_delta_with_bfloat16(
+     deltas: torch.Tensor,
+     weight: torch.Tensor,
+     key: str,
+     target_dtype: torch.dtype,
+ ) -> dict[str, torch.Tensor]:
+     """Fuse LoRA delta with bfloat16 weight."""
+     deltas.add_(weight)
+     return {key: deltas.to(dtype=target_dtype)}
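A minimal end-to-end sketch of `apply_loras` on a toy bfloat16 layer; the key names follow the `.lora_A`/`.lora_B` convention that `_prepare_deltas` expects:

```python
import torch
from ltx_core.loader import StateDict, apply_loras
from ltx_core.loader.primitives import LoraStateDictWithStrength

w = torch.zeros(4, 4, dtype=torch.bfloat16)
model_sd = StateDict({"proj.weight": w}, w.device, w.nbytes, {w.dtype})

lora_tensors = {
    "proj.lora_A.weight": torch.ones(1, 4, dtype=torch.bfloat16),  # rank-1 factors
    "proj.lora_B.weight": torch.ones(4, 1, dtype=torch.bfloat16),
}
lora_sd = StateDict(lora_tensors, w.device, sum(t.nbytes for t in lora_tensors.values()), {torch.bfloat16})

fused = apply_loras(model_sd, [LoraStateDictWithStrength(lora_sd, strength=0.5)])
# fused.sd["proj.weight"] == W + 0.5 * (B @ A): here, a 4x4 matrix of 0.5s.
```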
packages/ltx-core/src/ltx_core/loader/kernels.py ADDED
@@ -0,0 +1,72 @@
+ # ruff: noqa: ANN001, ANN201, ERA001, N803, N806
+ import triton
+ import triton.language as tl
+
+
+ @triton.jit
+ def fused_add_round_kernel(
+     x_ptr,
+     output_ptr,  # contents will be added to the output
+     seed,
+     n_elements,
+     EXPONENT_BIAS,
+     MANTISSA_BITS,
+     BLOCK_SIZE: tl.constexpr,
+ ):
+     """
+     A kernel that upcasts 8-bit quantized weights to bfloat16 with stochastic rounding
+     and adds them to bfloat16 output weights. It can be used to upcast original model
+     weights and add them to precomputed deltas coming from LoRAs.
+     """
+     # Get program ID and compute offsets
+     pid = tl.program_id(axis=0)
+     block_start = pid * BLOCK_SIZE
+     offsets = block_start + tl.arange(0, BLOCK_SIZE)
+     mask = offsets < n_elements
+
+     # Load data
+     x = tl.load(x_ptr + offsets, mask=mask)
+     rand_vals = tl.rand(seed, offsets) - 0.5
+
+     x = tl.cast(x, tl.float16)
+     delta = tl.load(output_ptr + offsets, mask=mask)
+     delta = tl.cast(delta, tl.float16)
+     x = x + delta
+
+     x_bits = tl.cast(x, tl.int16, bitcast=True)
+
+     # Calculate the exponent. Unbiased fp16 exponent is ((x_bits & 0x7C00) >> 10) - 15 for
+     # normal numbers and -14 for subnormals.
+     fp16_exponent_bits = (x_bits & 0x7C00) >> 10
+     fp16_normals = fp16_exponent_bits > 0
+     fp16_exponent = tl.where(fp16_normals, fp16_exponent_bits - 15, -14)
+
+     # Add the target dtype's exponent bias and clamp to the target dtype's exponent range.
+     exponent = fp16_exponent + EXPONENT_BIAS
+     MAX_EXPONENT = 2 * EXPONENT_BIAS + 1
+     exponent = tl.where(exponent > MAX_EXPONENT, MAX_EXPONENT, exponent)
+     exponent = tl.where(exponent < 0, 0, exponent)
+
+     # Normal ULP exponent, expressed as an fp16 exponent field:
+     #   (exponent - EXPONENT_BIAS - MANTISSA_BITS) + 15
+     # Simplifies to: fp16_exponent - MANTISSA_BITS + 15
+     # See https://en.wikipedia.org/wiki/Unit_in_the_last_place
+     eps_exp = tl.maximum(0, tl.minimum(31, exponent - EXPONENT_BIAS - MANTISSA_BITS + 15))
+
+     # Calculate epsilon in the target dtype
+     eps_normal = tl.cast(tl.cast(eps_exp << 10, tl.int16), tl.float16, bitcast=True)
+
+     # Subnormal ULP: 2^(1 - EXPONENT_BIAS - MANTISSA_BITS) ->
+     # fp16 exponent bits: (1 - EXPONENT_BIAS - MANTISSA_BITS) + 15 =
+     # 16 - EXPONENT_BIAS - MANTISSA_BITS
+     eps_subnormal = tl.cast(tl.cast((16 - EXPONENT_BIAS - MANTISSA_BITS) << 10, tl.int16), tl.float16, bitcast=True)
+     eps = tl.where(exponent > 0, eps_normal, eps_subnormal)
+
+     # Apply zero mask to epsilon
+     eps = tl.where(x == 0, 0.0, eps)
+
+     # Apply stochastic rounding
+     output = tl.cast(x + rand_vals * eps, tl.bfloat16)
+
+     # Store the result
+     tl.store(output_ptr + offsets, output, mask=mask)
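A hedged launch sketch for float8_e4m3fn inputs (exponent bias 7, 3 mantissa bits); the grid and block-size choices below are illustrative assumptions, not part of the diff:

```python
import torch
import triton

from ltx_core.loader.kernels import fused_add_round_kernel

x = torch.randn(1 << 20, device="cuda").to(torch.float8_e4m3fn)  # quantized weights
out = torch.randn(1 << 20, device="cuda", dtype=torch.bfloat16)  # precomputed LoRA deltas
n = x.numel()
BLOCK_SIZE = 1024
grid = (triton.cdiv(n, BLOCK_SIZE),)
# float8_e4m3fn: EXPONENT_BIAS=7, MANTISSA_BITS=3.
fused_add_round_kernel[grid](x, out, 1234, n, 7, 3, BLOCK_SIZE=BLOCK_SIZE)
# `out` now holds bfloat16(x + out), stochastically rounded at x's original precision.
```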
packages/ltx-core/src/ltx_core/loader/module_ops.py ADDED
@@ -0,0 +1,14 @@
+ from typing import Callable, NamedTuple
+
+ import torch
+
+
+ class ModuleOps(NamedTuple):
+     """
+     Defines a named operation for matching and mutating PyTorch modules.
+
+     Used to selectively transform modules in a model (e.g., replacing layers with quantized versions).
+     """
+
+     name: str
+     matcher: Callable[[torch.nn.Module], bool]
+     mutator: Callable[[torch.nn.Module], torch.nn.Module]
packages/ltx-core/src/ltx_core/loader/primitives.py ADDED
@@ -0,0 +1,109 @@
+ from dataclasses import dataclass
+ from typing import NamedTuple, Protocol
+
+ import torch
+
+ from ltx_core.loader.module_ops import ModuleOps
+ from ltx_core.loader.sd_ops import SDOps
+ from ltx_core.model.model_protocol import ModelType
+
+
+ @dataclass(frozen=True)
+ class StateDict:
+     """
+     Immutable container for a PyTorch state dictionary.
+
+     Contains:
+     - sd: Dictionary of tensors (weights, buffers, etc.)
+     - device: Device where tensors are stored
+     - size: Total memory footprint in bytes
+     - dtype: Set of tensor dtypes present
+     """
+
+     sd: dict
+     device: torch.device
+     size: int
+     dtype: set[torch.dtype]
+
+     def footprint(self) -> tuple[int, torch.device]:
+         return self.size, self.device
+
+
+ class StateDictLoader(Protocol):
+     """
+     Protocol for loading state dictionaries from various sources.
+
+     Implementations must provide:
+     - metadata: Extract model metadata from a single path
+     - load: Load state dict from path(s) and apply SDOps transformations
+     """
+
+     def metadata(self, path: str) -> dict:
+         """
+         Load metadata from path
+         """
+
+     def load(self, path: str | list[str], sd_ops: SDOps | None = None, device: torch.device | None = None) -> StateDict:
+         """
+         Load state dict from path or paths (for sharded model storage) and apply sd_ops
+         """
+
+
+ class ModelBuilderProtocol(Protocol[ModelType]):
+     """
+     Protocol for building PyTorch models from configuration dictionaries.
+
+     Implementations must provide:
+     - meta_model: Create a model from configuration dictionary and apply module operations
+     - build: Create and initialize a model from state dictionary and apply dtype transformations
+     """
+
+     def meta_model(self, config: dict, module_ops: list[ModuleOps] | None = None) -> ModelType:
+         """
+         Create a model on the meta device from a configuration dictionary.
+
+         This decouples model creation from weight loading, allowing the model
+         architecture to be instantiated without allocating memory for parameters.
+
+         Args:
+             config: Model configuration dictionary.
+             module_ops: Optional list of module operations to apply (e.g., quantization).
+
+         Returns:
+             Model instance on meta device (no actual memory allocated for parameters).
+         """
+         ...
+
+     def build(self, dtype: torch.dtype | None = None) -> ModelType:
+         """
+         Build the model
+
+         Args:
+             dtype: Target dtype for the model, if None, uses the dtype of the model_path model
+
+         Returns:
+             Model instance
+         """
+         ...
+
+
+ class LoRAAdaptableProtocol(Protocol):
+     """
+     Protocol for models that can be adapted with LoRAs.
+
+     Implementations must provide:
+     - lora: Add a LoRA to the model
+     """
+
+     def lora(self, lora_path: str, strength: float) -> "LoRAAdaptableProtocol":
+         pass
+
+
+ class LoraPathStrengthAndSDOps(NamedTuple):
+     """
+     Tuple containing a LoRA path, strength, and SDOps for applying to the LoRA state dict.
+     """
+
+     path: str
+     strength: float
+     sd_ops: SDOps
+
+
+ class LoraStateDictWithStrength(NamedTuple):
+     """
+     Tuple containing a LoRA state dict and strength for applying to the model.
+     """
+
+     state_dict: StateDict
+     strength: float
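A small illustration of the `StateDict` container (a sketch using the dataclass fields as defined above):

```python
import torch
from ltx_core.loader import StateDict

w = torch.zeros(2, 2, dtype=torch.bfloat16)
sd = StateDict(sd={"linear.weight": w}, device=w.device, size=w.nbytes, dtype={w.dtype})
size_bytes, device = sd.footprint()  # (8, device(type='cpu')) for this 2x2 bfloat16 tensor
```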
packages/ltx-core/src/ltx_core/loader/registry.py ADDED
@@ -0,0 +1,84 @@
+ import hashlib
+ import threading
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Protocol
+
+ from ltx_core.loader.primitives import StateDict
+ from ltx_core.loader.sd_ops import SDOps
+
+
+ class Registry(Protocol):
+     """
+     Protocol for managing state dictionaries in a registry.
+
+     It is used to store state dictionaries and reuse them later without loading them again.
+     Implementations must provide:
+     - add: Add a state dictionary to the registry
+     - pop: Remove a state dictionary from the registry
+     - get: Retrieve a state dictionary from the registry
+     - clear: Clear all state dictionaries from the registry
+     """
+
+     def add(self, paths: list[str], sd_ops: SDOps | None, state_dict: StateDict) -> None: ...
+
+     def pop(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None: ...
+
+     def get(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None: ...
+
+     def clear(self) -> None: ...
+
+
+ class DummyRegistry(Registry):
+     """
+     Dummy registry that does not store state dictionaries.
+     """
+
+     def add(self, paths: list[str], sd_ops: SDOps | None, state_dict: StateDict) -> None:
+         pass
+
+     def pop(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
+         pass
+
+     def get(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
+         pass
+
+     def clear(self) -> None:
+         pass
+
+
+ @dataclass
+ class StateDictRegistry(Registry):
+     """
+     Registry that stores state dictionaries in a dictionary.
+     """
+
+     _state_dicts: dict[str, StateDict] = field(default_factory=dict)
+     _lock: threading.Lock = field(default_factory=threading.Lock)
+
+     def _generate_id(self, paths: list[str], sd_ops: SDOps | None) -> str:
+         m = hashlib.sha256()
+         parts = [str(Path(p).resolve()) for p in paths]
+         if sd_ops is not None:
+             parts.append(sd_ops.name)
+         m.update("\0".join(parts).encode("utf-8"))
+         return m.hexdigest()
+
+     def add(self, paths: list[str], sd_ops: SDOps | None, state_dict: StateDict) -> str:
+         sd_id = self._generate_id(paths, sd_ops)
+         with self._lock:
+             if sd_id in self._state_dicts:
+                 raise ValueError(f"State dict retrieved from {paths} with {sd_ops} already added, check with get first")
+             self._state_dicts[sd_id] = state_dict
+         return sd_id
+
+     def pop(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
+         with self._lock:
+             return self._state_dicts.pop(self._generate_id(paths, sd_ops), None)
+
+     def get(self, paths: list[str], sd_ops: SDOps | None) -> StateDict | None:
+         with self._lock:
+             return self._state_dicts.get(self._generate_id(paths, sd_ops), None)
+
+     def clear(self) -> None:
+         with self._lock:
+             self._state_dicts.clear()
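A minimal caching sketch with the registry and loader as defined in this diff; the checkpoint path is a hypothetical placeholder:

```python
from ltx_core.loader import SafetensorsStateDictLoader, StateDictRegistry

registry = StateDictRegistry()
loader = SafetensorsStateDictLoader()

paths = ["/path/to/model.safetensors"]  # hypothetical checkpoint path
state_dict = registry.get(paths, sd_ops=None)
if state_dict is None:
    state_dict = loader.load(paths, sd_ops=None)
    registry.add(paths, sd_ops=None, state_dict=state_dict)  # cached for later reuse
```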
packages/ltx-core/src/ltx_core/loader/sd_ops.py ADDED
@@ -0,0 +1,127 @@
+ from dataclasses import dataclass, replace
+ from typing import NamedTuple, Protocol
+
+ import torch
+
+
+ @dataclass(frozen=True, slots=True)
+ class ContentReplacement:
+     """
+     Represents a content replacement operation.
+
+     Used to replace a specific content with a replacement in a state dict key.
+     """
+
+     content: str
+     replacement: str
+
+
+ @dataclass(frozen=True, slots=True)
+ class ContentMatching:
+     """
+     Represents a content matching operation.
+
+     Used to match a specific prefix and suffix in a state dict key.
+     """
+
+     prefix: str = ""
+     suffix: str = ""
+
+
+ class KeyValueOperationResult(NamedTuple):
+     """
+     Represents the result of a key-value operation.
+
+     Contains the new key and value after the operation has been applied.
+     """
+
+     new_key: str
+     new_value: torch.Tensor
+
+
+ class KeyValueOperation(Protocol):
+     """
+     Protocol for key-value operations.
+
+     Used to apply operations to a specific key and value in a state dict.
+     """
+
+     def __call__(self, tensor_key: str, tensor_value: torch.Tensor) -> list[KeyValueOperationResult]: ...
+
+
+ @dataclass(frozen=True, slots=True)
+ class SDKeyValueOperation:
+     """
+     Represents a key-value operation.
+
+     Used to apply operations to a specific key and value in a state dict.
+     """
+
+     key_matcher: ContentMatching
+     kv_operation: KeyValueOperation
+
+
+ @dataclass(frozen=True, slots=True)
+ class SDOps:
+     """Immutable class representing state dict key operations."""
+
+     name: str
+     mapping: tuple[
+         ContentReplacement | ContentMatching | SDKeyValueOperation, ...
+     ] = ()  # Immutable tuple of matching, replacement, and key-value operations
+
+     def with_replacement(self, content: str, replacement: str) -> "SDOps":
+         """Create a new SDOps instance with the specified replacement added to the mapping."""
+
+         new_mapping = (*self.mapping, ContentReplacement(content, replacement))
+         return replace(self, mapping=new_mapping)
+
+     def with_matching(self, prefix: str = "", suffix: str = "") -> "SDOps":
+         """Create a new SDOps instance with the specified prefix and suffix matching added to the mapping."""
+
+         new_mapping = (*self.mapping, ContentMatching(prefix, suffix))
+         return replace(self, mapping=new_mapping)
+
+     def with_kv_operation(
+         self,
+         operation: KeyValueOperation,
+         key_prefix: str = "",
+         key_suffix: str = "",
+     ) -> "SDOps":
+         """Create a new SDOps instance with the specified value operation added to the mapping."""
+         key_matcher = ContentMatching(key_prefix, key_suffix)
+         sd_kv_operation = SDKeyValueOperation(key_matcher, operation)
+         new_mapping = (*self.mapping, sd_kv_operation)
+         return replace(self, mapping=new_mapping)
+
+     def apply_to_key(self, key: str) -> str | None:
+         """Apply the mapping to the given key, or return None if no matcher accepts it."""
+         matchers = [content for content in self.mapping if isinstance(content, ContentMatching)]
+         valid = any(key.startswith(f.prefix) and key.endswith(f.suffix) for f in matchers)
+         if not valid:
+             return None
+
+         for replacement in self.mapping:
+             if not isinstance(replacement, ContentReplacement):
+                 continue
+             if replacement.content in key:
+                 key = key.replace(replacement.content, replacement.replacement)
+         return key
+
+     def apply_to_key_value(self, key: str, value: torch.Tensor) -> list[KeyValueOperationResult]:
+         """Apply the value operation to the given key and associated value."""
+         for operation in self.mapping:
+             if not isinstance(operation, SDKeyValueOperation):
+                 continue
+             if key.startswith(operation.key_matcher.prefix) and key.endswith(operation.key_matcher.suffix):
+                 return operation.kv_operation(key, value)
+         return [KeyValueOperationResult(key, value)]
+
+
+ # Predefined SDOps instances
+ LTXV_LORA_COMFY_RENAMING_MAP = (
+     SDOps("LTXV_LORA_COMFY_PREFIX_MAP").with_matching().with_replacement("diffusion_model.", "")
+ )
+
+ LTXV_LORA_COMFY_TARGET_MAP = (
+     SDOps("LTXV_LORA_COMFY_TARGET_MAP")
+     .with_matching()
+     .with_replacement("diffusion_model.", "")
+     .with_replacement(".lora_A.weight", ".weight")
+     .with_replacement(".lora_B.weight", ".weight")
+ )
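A short sketch of key renaming with `SDOps`, mirroring the predefined `LTXV_LORA_COMFY_RENAMING_MAP` above:

```python
from ltx_core.loader import SDOps

# Match every key (empty prefix/suffix), then strip a ComfyUI-style prefix.
ops = SDOps("EXAMPLE").with_matching().with_replacement("diffusion_model.", "")
ops.apply_to_key("diffusion_model.blocks.0.attn.q.weight")  # -> "blocks.0.attn.q.weight"
ops.apply_to_key("anything.else")  # -> "anything.else" (matcher accepts all, nothing to replace)
```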
packages/ltx-core/src/ltx_core/loader/sft_loader.py ADDED
@@ -0,0 +1,66 @@
+ import json
+
+ import safetensors
+ import torch
+
+ from ltx_core.loader.primitives import StateDict, StateDictLoader
+ from ltx_core.loader.sd_ops import SDOps
+
+
+ class SafetensorsStateDictLoader(StateDictLoader):
+     """
+     Loads weights from safetensors files without metadata support.
+
+     Use this for loading raw weight files. For model files that include
+     configuration metadata, use SafetensorsModelStateDictLoader instead.
+     """
+
+     def metadata(self, path: str) -> dict:
+         raise NotImplementedError("Not implemented")
+
+     def load(self, path: str | list[str], sd_ops: SDOps | None = None, device: torch.device | None = None) -> StateDict:
+         """
+         Load state dict from path or paths (for sharded model storage) and apply sd_ops
+         """
+         sd = {}
+         size = 0
+         dtype = set()
+         device = device or torch.device("cpu")
+         model_paths = path if isinstance(path, list) else [path]
+         for shard_path in model_paths:
+             with safetensors.safe_open(shard_path, framework="pt", device=str(device)) as f:
+                 safetensor_keys = f.keys()
+                 for name in safetensor_keys:
+                     expected_name = name if sd_ops is None else sd_ops.apply_to_key(name)
+                     if expected_name is None:
+                         continue
+                     value = f.get_tensor(name).to(device=device, non_blocking=True, copy=False)
+                     key_value_pairs = ((expected_name, value),)
+                     if sd_ops is not None:
+                         key_value_pairs = sd_ops.apply_to_key_value(expected_name, value)
+                     for key, value in key_value_pairs:
+                         size += value.nbytes
+                         dtype.add(value.dtype)
+                         sd[key] = value
+
+         return StateDict(sd=sd, device=device, size=size, dtype=dtype)
+
+
+ class SafetensorsModelStateDictLoader(StateDictLoader):
+     """
+     Loads weights and configuration metadata from safetensors model files.
+
+     Unlike SafetensorsStateDictLoader, this loader can read model configuration
+     from the safetensors file metadata via the metadata() method.
+     """
+
+     def __init__(self, weight_loader: SafetensorsStateDictLoader | None = None):
+         self.weight_loader = weight_loader if weight_loader is not None else SafetensorsStateDictLoader()
+
+     def metadata(self, path: str) -> dict:
+         with safetensors.safe_open(path, framework="pt") as f:
+             meta = f.metadata()
+             if meta is None or "config" not in meta:
+                 return {}
+             return json.loads(meta["config"])
+
+     def load(self, path: str | list[str], sd_ops: SDOps | None = None, device: torch.device | None = None) -> StateDict:
+         return self.weight_loader.load(path, sd_ops, device)
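A brief usage sketch of the metadata-aware loader; the checkpoint path is a hypothetical placeholder:

```python
import torch
from ltx_core.loader import SafetensorsModelStateDictLoader

loader = SafetensorsModelStateDictLoader()
config = loader.metadata("/path/to/model.safetensors")  # {} when no "config" metadata is embedded
state = loader.load("/path/to/model.safetensors", device=torch.device("cpu"))
print(state.size, state.dtype)  # total bytes loaded and the set of tensor dtypes seen
```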
packages/ltx-core/src/ltx_core/loader/single_gpu_model_builder.py ADDED
@@ -0,0 +1,116 @@
+ import logging
+ from dataclasses import dataclass, field, replace
+ from typing import Generic
+
+ import torch
+
+ from ltx_core.loader.fuse_loras import apply_loras
+ from ltx_core.loader.module_ops import ModuleOps
+ from ltx_core.loader.primitives import (
+     LoRAAdaptableProtocol,
+     LoraPathStrengthAndSDOps,
+     LoraStateDictWithStrength,
+     ModelBuilderProtocol,
+     StateDict,
+     StateDictLoader,
+ )
+ from ltx_core.loader.registry import DummyRegistry, Registry
+ from ltx_core.loader.sd_ops import SDOps
+ from ltx_core.loader.sft_loader import SafetensorsModelStateDictLoader
+ from ltx_core.model.model_protocol import ModelConfigurator, ModelType
+
+ logger: logging.Logger = logging.getLogger(__name__)
+
+
+ @dataclass(frozen=True)
+ class SingleGPUModelBuilder(Generic[ModelType], ModelBuilderProtocol[ModelType], LoRAAdaptableProtocol):
+     """
+     Builder for PyTorch models residing on a single GPU.
+
+     Attributes:
+         model_class_configurator: Class responsible for constructing the model from a config dict.
+         model_path: Path (or tuple of shard paths) to the model's `.safetensors` checkpoint(s).
+         model_sd_ops: Optional state-dict operations applied when loading the model weights.
+         module_ops: Sequence of module-level mutations applied to the meta model before weight loading.
+         loras: Sequence of LoRA adapters (path, strength, optional sd_ops) to fuse into the model.
+         model_loader: Strategy for loading state dicts from disk. Defaults to
+             :class:`SafetensorsModelStateDictLoader`.
+         registry: Cache for already-loaded state dicts. Defaults to :class:`DummyRegistry` (no caching).
+         lora_load_device: Device used when loading LoRA weight tensors from disk. Defaults to
+             ``torch.device("cpu")``, which keeps LoRA weights in CPU memory and transfers them to
+             the target GPU sequentially during fusion, reducing peak GPU memory usage compared to
+             loading all LoRA weights directly onto the GPU at once.
+     """
+
+     model_class_configurator: type[ModelConfigurator[ModelType]]
+     model_path: str | tuple[str, ...]
+     model_sd_ops: SDOps | None = None
+     module_ops: tuple[ModuleOps, ...] = field(default_factory=tuple)
+     loras: tuple[LoraPathStrengthAndSDOps, ...] = field(default_factory=tuple)
+     model_loader: StateDictLoader = field(default_factory=SafetensorsModelStateDictLoader)
+     registry: Registry = field(default_factory=DummyRegistry)
+     lora_load_device: torch.device = field(default_factory=lambda: torch.device("cpu"))
+
+     def lora(self, lora_path: str, strength: float = 1.0, sd_ops: SDOps | None = None) -> "SingleGPUModelBuilder":
+         return replace(self, loras=(*self.loras, LoraPathStrengthAndSDOps(lora_path, strength, sd_ops)))
+
+     def model_config(self) -> dict:
+         first_shard_path = self.model_path[0] if isinstance(self.model_path, tuple) else self.model_path
+         return self.model_loader.metadata(first_shard_path)
+
+     def meta_model(self, config: dict, module_ops: tuple[ModuleOps, ...]) -> ModelType:
+         with torch.device("meta"):
+             model = self.model_class_configurator.from_config(config)
+             for module_op in module_ops:
+                 if module_op.matcher(model):
+                     model = module_op.mutator(model)
+             return model
+
+     def load_sd(
+         self, paths: list[str], registry: Registry, device: torch.device | None, sd_ops: SDOps | None = None
+     ) -> StateDict:
+         state_dict = registry.get(paths, sd_ops)
+         if state_dict is None:
+             state_dict = self.model_loader.load(paths, sd_ops=sd_ops, device=device)
+             registry.add(paths, sd_ops=sd_ops, state_dict=state_dict)
+         return state_dict
+
+     def _return_model(self, meta_model: ModelType, device: torch.device) -> ModelType:
+         uninitialized_params = [name for name, param in meta_model.named_parameters() if str(param.device) == "meta"]
+         uninitialized_buffers = [name for name, buffer in meta_model.named_buffers() if str(buffer.device) == "meta"]
+         if uninitialized_params or uninitialized_buffers:
+             logger.warning(f"Uninitialized parameters or buffers: {uninitialized_params + uninitialized_buffers}")
+             return meta_model
+         retval = meta_model.to(device)
+         return retval
+
+     def build(self, device: torch.device | None = None, dtype: torch.dtype | None = None) -> ModelType:
+         device = torch.device("cuda") if device is None else device
+         config = self.model_config()
+         meta_model = self.meta_model(config, self.module_ops)
+         model_paths = list(self.model_path) if isinstance(self.model_path, tuple) else [self.model_path]
+         model_state_dict = self.load_sd(model_paths, sd_ops=self.model_sd_ops, registry=self.registry, device=device)
+
+         lora_strengths = [lora.strength for lora in self.loras]
+         if not lora_strengths or (min(lora_strengths) == 0 and max(lora_strengths) == 0):
+             sd = model_state_dict.sd
+             if dtype is not None:
+                 sd = {key: value.to(dtype=dtype) for key, value in model_state_dict.sd.items()}
+             meta_model.load_state_dict(sd, strict=False, assign=True)
+             return self._return_model(meta_model, device)
+
+         lora_state_dicts = [
+             self.load_sd([lora.path], sd_ops=lora.sd_ops, registry=self.registry, device=self.lora_load_device)
+             for lora in self.loras
+         ]
+         lora_sd_and_strengths = [
+             LoraStateDictWithStrength(sd, strength)
+             for sd, strength in zip(lora_state_dicts, lora_strengths, strict=True)
+         ]
+         final_sd = apply_loras(
+             model_sd=model_state_dict,
+             lora_sd_and_strengths=lora_sd_and_strengths,
+             dtype=dtype,
+             destination_sd=model_state_dict if isinstance(self.registry, DummyRegistry) else None,
+         )
+         meta_model.load_state_dict(final_sd.sd, strict=False, assign=True)
+         return self._return_model(meta_model, device)
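A hedged end-to-end sketch of the builder; `MyTransformerConfigurator` and both paths are hypothetical placeholders, while the builder API itself is as defined above:

```python
import torch
from ltx_core.loader import SingleGPUModelBuilder
from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP

builder = SingleGPUModelBuilder(
    model_class_configurator=MyTransformerConfigurator,  # any ModelConfigurator subclass (hypothetical)
    model_path="/path/to/model.safetensors",
).lora("/path/to/style_lora.safetensors", strength=0.8, sd_ops=LTXV_LORA_COMFY_RENAMING_MAP)

# Meta-model instantiation, weight loading, and LoRA fusion happen inside build().
model = builder.build(device=torch.device("cuda"), dtype=torch.bfloat16)
```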
packages/ltx-core/src/ltx_core/model/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """Model definitions for LTX-2."""
+
+ from ltx_core.model.model_protocol import ModelConfigurator, ModelType
+
+ __all__ = [
+     "ModelConfigurator",
+     "ModelType",
+ ]
packages/ltx-core/src/ltx_core/model/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (358 Bytes).
packages/ltx-core/src/ltx_core/model/__pycache__/model_protocol.cpython-312.pyc ADDED
Binary file (807 Bytes).
packages/ltx-core/src/ltx_core/model/audio_vae/__init__.py ADDED
@@ -0,0 +1,29 @@
+ """Audio VAE model components."""
+
+ from ltx_core.model.audio_vae.audio_vae import AudioDecoder, AudioEncoder, decode_audio, encode_audio
+ from ltx_core.model.audio_vae.model_configurator import (
+     AUDIO_VAE_DECODER_COMFY_KEYS_FILTER,
+     AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER,
+     VOCODER_COMFY_KEYS_FILTER,
+     AudioDecoderConfigurator,
+     AudioEncoderConfigurator,
+     VocoderConfigurator,
+ )
+ from ltx_core.model.audio_vae.ops import AudioProcessor
+ from ltx_core.model.audio_vae.vocoder import Vocoder, VocoderWithBWE
+
+ __all__ = [
+     "AUDIO_VAE_DECODER_COMFY_KEYS_FILTER",
+     "AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER",
+     "VOCODER_COMFY_KEYS_FILTER",
+     "AudioDecoder",
+     "AudioDecoderConfigurator",
+     "AudioEncoder",
+     "AudioEncoderConfigurator",
+     "AudioProcessor",
+     "Vocoder",
+     "VocoderConfigurator",
+     "VocoderWithBWE",
+     "decode_audio",
+     "encode_audio",
+ ]
packages/ltx-core/src/ltx_core/model/audio_vae/attention.py ADDED
@@ -0,0 +1,71 @@
+ from enum import Enum
+
+ import torch
+
+ from ltx_core.model.common.normalization import NormType, build_normalization_layer
+
+
+ class AttentionType(Enum):
+     """Enum for specifying the attention mechanism type."""
+
+     VANILLA = "vanilla"
+     LINEAR = "linear"
+     NONE = "none"
+
+
+ class AttnBlock(torch.nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         norm_type: NormType = NormType.GROUP,
+     ) -> None:
+         super().__init__()
+         self.in_channels = in_channels
+
+         self.norm = build_normalization_layer(in_channels, normtype=norm_type)
+         self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+         self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+         self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+         self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         h_ = x
+         h_ = self.norm(h_)
+         q = self.q(h_)
+         k = self.k(h_)
+         v = self.v(h_)
+
+         # compute attention
+         b, c, h, w = q.shape
+         q = q.reshape(b, c, h * w).contiguous()
+         q = q.permute(0, 2, 1).contiguous()  # b,hw,c
+         k = k.reshape(b, c, h * w).contiguous()  # b,c,hw
+         w_ = torch.bmm(q, k).contiguous()  # b,hw,hw    w[b,i,j] = sum_c q[b,i,c] k[b,c,j]
+         w_ = w_ * (int(c) ** (-0.5))
+         w_ = torch.nn.functional.softmax(w_, dim=2)
+
+         # attend to values
+         v = v.reshape(b, c, h * w).contiguous()
+         w_ = w_.permute(0, 2, 1).contiguous()  # b,hw,hw (first hw of k, second of q)
+         h_ = torch.bmm(v, w_).contiguous()  # b,c,hw (hw of q)    h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+         h_ = h_.reshape(b, c, h, w).contiguous()
+
+         h_ = self.proj_out(h_)
+
+         return x + h_
+
+
+ def make_attn(
+     in_channels: int,
+     attn_type: AttentionType = AttentionType.VANILLA,
+     norm_type: NormType = NormType.GROUP,
+ ) -> torch.nn.Module:
+     match attn_type:
+         case AttentionType.VANILLA:
+             return AttnBlock(in_channels, norm_type=norm_type)
+         case AttentionType.NONE:
+             return torch.nn.Identity()
+         case AttentionType.LINEAR:
+             raise NotImplementedError(f"Attention type {attn_type.value} is not supported yet.")
+         case _:
+             raise ValueError(f"Unknown attention type: {attn_type}")
packages/ltx-core/src/ltx_core/model/audio_vae/audio_vae.py ADDED
@@ -0,0 +1,508 @@
1
+ from typing import Set, Tuple
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from ltx_core.components.patchifiers import AudioPatchifier
7
+ from ltx_core.model.audio_vae.attention import AttentionType, make_attn
8
+ from ltx_core.model.audio_vae.causal_conv_2d import make_conv2d
9
+ from ltx_core.model.audio_vae.causality_axis import CausalityAxis
10
+ from ltx_core.model.audio_vae.downsample import build_downsampling_path
11
+ from ltx_core.model.audio_vae.ops import AudioProcessor, PerChannelStatistics
12
+ from ltx_core.model.audio_vae.resnet import ResnetBlock
13
+ from ltx_core.model.audio_vae.upsample import build_upsampling_path
14
+ from ltx_core.model.audio_vae.vocoder import Vocoder
15
+ from ltx_core.model.common.normalization import NormType, build_normalization_layer
16
+ from ltx_core.types import Audio, AudioLatentShape
17
+
18
+ LATENT_DOWNSAMPLE_FACTOR = 4
19
+
20
+
21
+ def build_mid_block(
22
+ channels: int,
23
+ temb_channels: int,
24
+ dropout: float,
25
+ norm_type: NormType,
26
+ causality_axis: CausalityAxis,
27
+ attn_type: AttentionType,
28
+ add_attention: bool,
29
+ ) -> torch.nn.Module:
30
+ """Build the middle block with two ResNet blocks and optional attention."""
31
+ mid = torch.nn.Module()
32
+ mid.block_1 = ResnetBlock(
33
+ in_channels=channels,
34
+ out_channels=channels,
35
+ temb_channels=temb_channels,
36
+ dropout=dropout,
37
+ norm_type=norm_type,
38
+ causality_axis=causality_axis,
39
+ )
40
+ mid.attn_1 = make_attn(channels, attn_type=attn_type, norm_type=norm_type) if add_attention else torch.nn.Identity()
41
+ mid.block_2 = ResnetBlock(
42
+ in_channels=channels,
43
+ out_channels=channels,
44
+ temb_channels=temb_channels,
45
+ dropout=dropout,
46
+ norm_type=norm_type,
47
+ causality_axis=causality_axis,
48
+ )
49
+ return mid
50
+
51
+
52
+ def run_mid_block(mid: torch.nn.Module, features: torch.Tensor) -> torch.Tensor:
53
+ """Run features through the middle block."""
54
+ features = mid.block_1(features, temb=None)
55
+ features = mid.attn_1(features)
56
+ return mid.block_2(features, temb=None)
57
+
58
+
59
+ class AudioEncoder(torch.nn.Module):
60
+ """
61
+ Encoder that compresses audio spectrograms into latent representations.
62
+ The encoder uses a series of downsampling blocks with residual connections,
63
+ attention mechanisms, and configurable causal convolutions.
64
+ """
65
+
66
+ def __init__( # noqa: PLR0913
67
+ self,
68
+ *,
69
+ ch: int,
70
+ ch_mult: Tuple[int, ...] = (1, 2, 4, 8),
71
+ num_res_blocks: int,
72
+ attn_resolutions: Set[int],
73
+ dropout: float = 0.0,
74
+ resamp_with_conv: bool = True,
75
+ in_channels: int,
76
+ resolution: int,
77
+ z_channels: int,
78
+ double_z: bool = True,
79
+ attn_type: AttentionType = AttentionType.VANILLA,
80
+ mid_block_add_attention: bool = True,
81
+ norm_type: NormType = NormType.GROUP,
82
+ causality_axis: CausalityAxis = CausalityAxis.WIDTH,
83
+ sample_rate: int = 16000,
84
+ mel_hop_length: int = 160,
85
+ n_fft: int = 1024,
86
+ is_causal: bool = True,
87
+ mel_bins: int = 64,
88
+ **_ignore_kwargs,
89
+ ) -> None:
90
+ """
91
+ Initialize the Encoder.
92
+ Args:
93
+ Arguments are configuration parameters, loaded from the audio VAE checkpoint config
94
+ (audio_vae.model.params.ddconfig):
95
+ ch: Base number of feature channels used in the first convolution layer.
96
+ ch_mult: Multiplicative factors for the number of channels at each resolution level.
97
+ num_res_blocks: Number of residual blocks to use at each resolution level.
98
+ attn_resolutions: Spatial resolutions (e.g., in time/frequency) at which to apply attention.
99
+ resolution: Input spatial resolution of the spectrogram (height, width).
100
+ z_channels: Number of channels in the latent representation.
101
+ norm_type: Normalization layer type to use within the network (e.g., group, batch).
102
+ causality_axis: Axis along which convolutions should be causal (e.g., time axis).
103
+ sample_rate: Audio sample rate in Hz for the input signals.
104
+ mel_hop_length: Hop length used when computing the mel spectrogram.
105
+ n_fft: FFT size used to compute the spectrogram.
106
+ mel_bins: Number of mel-frequency bins in the input spectrogram.
107
+ in_channels: Number of channels in the input spectrogram tensor.
108
+ double_z: If True, predict both mean and log-variance (doubling latent channels).
109
+ is_causal: If True, use causal convolutions suitable for streaming setups.
110
+ dropout: Dropout probability used in residual and mid blocks.
111
+ attn_type: Type of attention mechanism to use in attention blocks.
112
+ resamp_with_conv: If True, perform resolution changes using strided convolutions.
113
+ mid_block_add_attention: If True, add an attention block in the mid-level of the encoder.
114
+ """
115
+ super().__init__()
116
+
117
+ self.per_channel_statistics = PerChannelStatistics(latent_channels=ch)
118
+ self.sample_rate = sample_rate
119
+ self.mel_hop_length = mel_hop_length
120
+ self.n_fft = n_fft
121
+ self.is_causal = is_causal
122
+ self.mel_bins = mel_bins
123
+
124
+ self.patchifier = AudioPatchifier(
125
+ patch_size=1,
126
+ audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
127
+ sample_rate=sample_rate,
128
+ hop_length=mel_hop_length,
129
+ is_causal=is_causal,
130
+ )
131
+
132
+ self.ch = ch
133
+ self.temb_ch = 0
134
+ self.num_resolutions = len(ch_mult)
135
+ self.num_res_blocks = num_res_blocks
136
+ self.resolution = resolution
137
+ self.in_channels = in_channels
138
+ self.z_channels = z_channels
139
+ self.double_z = double_z
140
+ self.norm_type = norm_type
141
+ self.causality_axis = causality_axis
142
+ self.attn_type = attn_type
143
+
144
+ # downsampling
145
+ self.conv_in = make_conv2d(
146
+ in_channels,
147
+ self.ch,
148
+ kernel_size=3,
149
+ stride=1,
150
+ causality_axis=self.causality_axis,
151
+ )
152
+
153
+ self.non_linearity = torch.nn.SiLU()
154
+
155
+ self.down, block_in = build_downsampling_path(
156
+ ch=ch,
157
+ ch_mult=ch_mult,
158
+ num_resolutions=self.num_resolutions,
159
+ num_res_blocks=num_res_blocks,
160
+ resolution=resolution,
161
+ temb_channels=self.temb_ch,
162
+ dropout=dropout,
163
+ norm_type=self.norm_type,
164
+ causality_axis=self.causality_axis,
165
+ attn_type=self.attn_type,
166
+ attn_resolutions=attn_resolutions,
167
+ resamp_with_conv=resamp_with_conv,
168
+ )
169
+
170
+ self.mid = build_mid_block(
171
+ channels=block_in,
172
+ temb_channels=self.temb_ch,
173
+ dropout=dropout,
174
+ norm_type=self.norm_type,
175
+ causality_axis=self.causality_axis,
176
+ attn_type=self.attn_type,
177
+ add_attention=mid_block_add_attention,
178
+ )
179
+
180
+ self.norm_out = build_normalization_layer(block_in, normtype=self.norm_type)
181
+ self.conv_out = make_conv2d(
182
+ block_in,
183
+ 2 * z_channels if double_z else z_channels,
184
+ kernel_size=3,
185
+ stride=1,
186
+ causality_axis=self.causality_axis,
187
+ )
188
+
189
+ def forward(self, spectrogram: torch.Tensor) -> torch.Tensor:
190
+ """
191
+ Encode audio spectrogram into latent representations.
192
+ Args:
193
+ spectrogram: Input spectrogram of shape (batch, channels, time, frequency)
194
+ Returns:
195
+ Encoded latent representation of shape (batch, channels, frames, mel_bins)
196
+ """
197
+ h = self.conv_in(spectrogram)
198
+ h = self._run_downsampling_path(h)
199
+ h = run_mid_block(self.mid, h)
200
+ h = self._finalize_output(h)
201
+
202
+ return self._normalize_latents(h)
203
+
204
+ def _run_downsampling_path(self, h: torch.Tensor) -> torch.Tensor:
205
+ for level in range(self.num_resolutions):
206
+ stage = self.down[level]
207
+ for block_idx in range(self.num_res_blocks):
208
+ h = stage.block[block_idx](h, temb=None)
209
+ if stage.attn:
210
+ h = stage.attn[block_idx](h)
211
+
212
+ if level != self.num_resolutions - 1:
213
+ h = stage.downsample(h)
214
+
215
+ return h
216
+
217
+ def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
218
+ h = self.norm_out(h)
219
+ h = self.non_linearity(h)
220
+ return self.conv_out(h)
221
+
222
+ def _normalize_latents(self, latent_output: torch.Tensor) -> torch.Tensor:
223
+ """
224
+ Normalize encoder latents using per-channel statistics.
225
+ When the encoder is configured with ``double_z=True``, the final
226
+ convolution produces twice the number of latent channels, typically
227
+ interpreted as two concatenated tensors along the channel dimension
228
+ (e.g., mean and variance or other auxiliary parameters).
229
+ This method intentionally uses only the first half of the channels
230
+ (the "mean" component) as input to the patchifier and normalization
231
+ logic. The remaining channels are left unchanged by this method and
232
+ are expected to be consumed elsewhere in the VAE pipeline.
233
+ If ``double_z=False``, the encoder output already contains only the
234
+ mean latents and the chunking operation simply returns that tensor.
235
+ """
236
+ means = torch.chunk(latent_output, 2, dim=1)[0]
237
+ latent_shape = AudioLatentShape(
238
+ batch=means.shape[0],
239
+ channels=means.shape[1],
240
+ frames=means.shape[2],
241
+ mel_bins=means.shape[3],
242
+ )
243
+ latent_patched = self.patchifier.patchify(means)
244
+ latent_normalized = self.per_channel_statistics.normalize(latent_patched)
245
+ return self.patchifier.unpatchify(latent_normalized, latent_shape)
246
+
247
+
248
+ def encode_audio(
249
+ audio: Audio,
250
+ audio_encoder: AudioEncoder,
251
+ audio_processor: AudioProcessor | None = None,
252
+ ) -> torch.Tensor:
253
+ """Encode audio waveform into latent representation.
254
+ Args:
255
+ audio: Audio container with waveform tensor of shape (batch, channels, samples) and sampling rate.
256
+ audio_encoder: Audio encoder model
257
+ audio_processor: Audio processor model (optional, if not provided, it will be created from the audio encoder)
258
+ """
259
+ dtype = next(audio_encoder.parameters()).dtype
260
+ device = next(audio_encoder.parameters()).device
261
+
262
+ if audio_processor is None:
263
+ audio_processor = AudioProcessor(
264
+ target_sample_rate=audio_encoder.sample_rate,
265
+ mel_bins=audio_encoder.mel_bins,
266
+ mel_hop_length=audio_encoder.mel_hop_length,
267
+ n_fft=audio_encoder.n_fft,
268
+ ).to(device=device)
269
+
270
+ mel_spectrogram = audio_processor.waveform_to_mel(audio.to(device=device))
271
+
272
+ latent = audio_encoder(mel_spectrogram.to(dtype=dtype))
273
+ return latent
274
+
+
+ class AudioDecoder(torch.nn.Module):
+     """
+     Symmetric decoder that reconstructs audio spectrograms from latent features.
+
+     The decoder mirrors the encoder structure with configurable channel multipliers,
+     attention resolutions, and causal convolutions.
+     """
+
+     def __init__(  # noqa: PLR0913
+         self,
+         *,
+         ch: int,
+         out_ch: int,
+         ch_mult: Tuple[int, ...] = (1, 2, 4, 8),
+         num_res_blocks: int,
+         attn_resolutions: Set[int],
+         resolution: int,
+         z_channels: int,
+         norm_type: NormType = NormType.GROUP,
+         causality_axis: CausalityAxis = CausalityAxis.WIDTH,
+         dropout: float = 0.0,
+         mid_block_add_attention: bool = True,
+         sample_rate: int = 16000,
+         mel_hop_length: int = 160,
+         is_causal: bool = True,
+         mel_bins: int | None = None,
+     ) -> None:
+         """
+         Initialize the Decoder.
+
+         Args:
+             All arguments are configuration parameters loaded from the audio VAE checkpoint config
+             (audio_vae.model.params.ddconfig):
+             - ch, out_ch, ch_mult, num_res_blocks, attn_resolutions
+             - resolution, z_channels
+             - norm_type, causality_axis
+         """
+         super().__init__()
+
+         # Internal behavioural defaults that are not driven by the checkpoint.
+         resamp_with_conv = True
+         attn_type = AttentionType.VANILLA
+
+         # Per-channel statistics for denormalizing latents
+         self.per_channel_statistics = PerChannelStatistics(latent_channels=ch)
+         self.sample_rate = sample_rate
+         self.mel_hop_length = mel_hop_length
+         self.is_causal = is_causal
+         self.mel_bins = mel_bins
+         self.patchifier = AudioPatchifier(
+             patch_size=1,
+             audio_latent_downsample_factor=LATENT_DOWNSAMPLE_FACTOR,
+             sample_rate=sample_rate,
+             hop_length=mel_hop_length,
+             is_causal=is_causal,
+         )
+
+         self.ch = ch
+         self.temb_ch = 0
+         self.num_resolutions = len(ch_mult)
+         self.num_res_blocks = num_res_blocks
+         self.resolution = resolution
+         self.out_ch = out_ch
+         self.give_pre_end = False
+         self.tanh_out = False
+         self.norm_type = norm_type
+         self.z_channels = z_channels
+         self.channel_multipliers = ch_mult
+         self.attn_resolutions = attn_resolutions
+         self.causality_axis = causality_axis
+         self.attn_type = attn_type
+
+         base_block_channels = ch * self.channel_multipliers[-1]
+         base_resolution = resolution // (2 ** (self.num_resolutions - 1))
+         self.z_shape = (1, z_channels, base_resolution, base_resolution)
+
+         self.conv_in = make_conv2d(
+             z_channels, base_block_channels, kernel_size=3, stride=1, causality_axis=self.causality_axis
+         )
+         self.non_linearity = torch.nn.SiLU()
+         self.mid = build_mid_block(
+             channels=base_block_channels,
+             temb_channels=self.temb_ch,
+             dropout=dropout,
+             norm_type=self.norm_type,
+             causality_axis=self.causality_axis,
+             attn_type=self.attn_type,
+             add_attention=mid_block_add_attention,
+         )
+         self.up, final_block_channels = build_upsampling_path(
+             ch=ch,
+             ch_mult=ch_mult,
+             num_resolutions=self.num_resolutions,
+             num_res_blocks=num_res_blocks,
+             resolution=resolution,
+             temb_channels=self.temb_ch,
+             dropout=dropout,
+             norm_type=self.norm_type,
+             causality_axis=self.causality_axis,
+             attn_type=self.attn_type,
+             attn_resolutions=attn_resolutions,
+             resamp_with_conv=resamp_with_conv,
+             initial_block_channels=base_block_channels,
+         )
+
+         self.norm_out = build_normalization_layer(final_block_channels, normtype=self.norm_type)
+         self.conv_out = make_conv2d(
+             final_block_channels, out_ch, kernel_size=3, stride=1, causality_axis=self.causality_axis
+         )
+
+     def forward(self, sample: torch.Tensor) -> torch.Tensor:
+         """
+         Decode latent features back to audio spectrograms.
+
+         Args:
+             sample: Encoded latent representation of shape (batch, channels, frames, mel_bins)
+
+         Returns:
+             Reconstructed audio spectrogram of shape (batch, channels, time, frequency)
+         """
+         sample, target_shape = self._denormalize_latents(sample)
+
+         h = self.conv_in(sample)
+         h = run_mid_block(self.mid, h)
+         h = self._run_upsampling_path(h)
+         h = self._finalize_output(h)
+
+         return self._adjust_output_shape(h, target_shape)
+
+     def _denormalize_latents(self, sample: torch.Tensor) -> tuple[torch.Tensor, AudioLatentShape]:
+         latent_shape = AudioLatentShape(
+             batch=sample.shape[0],
+             channels=sample.shape[1],
+             frames=sample.shape[2],
+             mel_bins=sample.shape[3],
+         )
+
+         sample_patched = self.patchifier.patchify(sample)
+         sample_denormalized = self.per_channel_statistics.un_normalize(sample_patched)
+         sample = self.patchifier.unpatchify(sample_denormalized, latent_shape)
+
+         target_frames = latent_shape.frames * LATENT_DOWNSAMPLE_FACTOR
+         if self.causality_axis != CausalityAxis.NONE:
+             target_frames = max(target_frames - (LATENT_DOWNSAMPLE_FACTOR - 1), 1)
+
+         target_shape = AudioLatentShape(
+             batch=latent_shape.batch,
+             channels=self.out_ch,
+             frames=target_frames,
+             mel_bins=self.mel_bins if self.mel_bins is not None else latent_shape.mel_bins,
+         )
+
+         return sample, target_shape
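Worked example of the causal frame arithmetic above, assuming an illustrative LATENT_DOWNSAMPLE_FACTOR of 4: 10 latent frames map to 10 * 4 = 40 spectrogram frames, and the causal branch then trims the 4 - 1 = 3 frames that exist only because of the encoder's left padding, leaving max(40 - 3, 1) = 37 target frames.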
+
+     def _adjust_output_shape(
+         self,
+         decoded_output: torch.Tensor,
+         target_shape: AudioLatentShape,
+     ) -> torch.Tensor:
+         """
+         Adjust output shape to match target dimensions for variable-length audio.
+
+         This function handles the common case where decoded audio spectrograms need to be
+         resized to match a specific target shape.
+
+         Args:
+             decoded_output: Tensor of shape (batch, channels, time, frequency)
+             target_shape: AudioLatentShape describing (batch, channels, time, mel bins)
+
+         Returns:
+             Tensor adjusted to match target_shape exactly
+         """
+         # Current output shape: (batch, channels, time, frequency)
+         _, _, current_time, current_freq = decoded_output.shape
+         target_channels = target_shape.channels
+         target_time = target_shape.frames
+         target_freq = target_shape.mel_bins
+
+         # Step 1: Crop first to avoid exceeding target dimensions
+         decoded_output = decoded_output[
+             :, :target_channels, : min(current_time, target_time), : min(current_freq, target_freq)
+         ]
+
+         # Step 2: Calculate padding needed for time and frequency dimensions
+         time_padding_needed = target_time - decoded_output.shape[2]
+         freq_padding_needed = target_freq - decoded_output.shape[3]
+
+         # Step 3: Apply padding if needed
+         if time_padding_needed > 0 or freq_padding_needed > 0:
+             # PyTorch padding format: (pad_left, pad_right, pad_top, pad_bottom)
+             # For audio: pad_left/right = frequency, pad_top/bottom = time
+             padding = (
+                 0,
+                 max(freq_padding_needed, 0),  # frequency padding (left, right)
+                 0,
+                 max(time_padding_needed, 0),  # time padding (top, bottom)
+             )
+             decoded_output = F.pad(decoded_output, padding)
+
+         # Step 4: Final safety crop to ensure exact target shape
+         decoded_output = decoded_output[:, :target_channels, :target_time, :target_freq]
+
+         return decoded_output
+
+     def _run_upsampling_path(self, h: torch.Tensor) -> torch.Tensor:
+         for level in reversed(range(self.num_resolutions)):
+             stage = self.up[level]
+             for block_idx, block in enumerate(stage.block):
+                 h = block(h, temb=None)
+                 if stage.attn:
+                     h = stage.attn[block_idx](h)
+
+             if level != 0 and hasattr(stage, "upsample"):
+                 h = stage.upsample(h)
+
+         return h
+
+     def _finalize_output(self, h: torch.Tensor) -> torch.Tensor:
+         if self.give_pre_end:
+             return h
+
+         h = self.norm_out(h)
+         h = self.non_linearity(h)
+         h = self.conv_out(h)
+         return torch.tanh(h) if self.tanh_out else h
+
+
+ def decode_audio(latent: torch.Tensor, audio_decoder: "AudioDecoder", vocoder: "Vocoder") -> Audio:
+     """
+     Decode an audio latent representation using the provided audio decoder and vocoder.
+
+     Args:
+         latent: Input audio latent tensor.
+         audio_decoder: Model to decode the latent to waveform features.
+         vocoder: Model to convert decoded features to audio waveform.
+
+     Returns:
+         Decoded audio with waveform and sampling rate.
+     """
+     decoded_audio = audio_decoder(latent)
+     waveform = vocoder(decoded_audio).squeeze(0).float()
+     return Audio(waveform=waveform, sampling_rate=vocoder.output_sampling_rate)
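A matching decode-side sketch, with the same caveats: the latent shape below is a placeholder (the real latent comes from encode_audio and must match the checkpoint's z_channels and per-channel statistics), and the default-constructed models carry no trained weights:

    import torch

    from ltx_core.model.audio_vae.audio_vae import decode_audio
    from ltx_core.model.audio_vae.model_configurator import AudioDecoderConfigurator
    from ltx_core.model.audio_vae.vocoder import Vocoder

    decoder = AudioDecoderConfigurator.from_config({})  # defaults only; real use loads checkpoint weights
    vocoder = Vocoder()  # default pre-ltx-2.3 configuration, 24 kHz output
    latent = torch.randn(1, 8, 10, 16)  # illustrative placeholder shape
    audio = decode_audio(latent, decoder, vocoder)  # -> Audio(waveform, sampling_rate=24000)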
packages/ltx-core/src/ltx_core/model/audio_vae/causal_conv_2d.py ADDED
@@ -0,0 +1,110 @@
+ import torch
+ import torch.nn.functional as F
+
+ from ltx_core.model.audio_vae.causality_axis import CausalityAxis
+
+
+ class CausalConv2d(torch.nn.Module):
+     """
+     A causal 2D convolution.
+
+     This layer ensures that the output at time `t` only depends on inputs
+     at time `t` and earlier. It achieves this by applying asymmetric padding
+     to the time dimension (width) before the convolution.
+     """
+
+     def __init__(
+         self,
+         in_channels: int,
+         out_channels: int,
+         kernel_size: int | tuple[int, int],
+         stride: int = 1,
+         dilation: int | tuple[int, int] = 1,
+         groups: int = 1,
+         bias: bool = True,
+         causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
+     ) -> None:
+         super().__init__()
+
+         self.causality_axis = causality_axis
+
+         # Ensure kernel_size and dilation are tuples
+         kernel_size = torch.nn.modules.utils._pair(kernel_size)
+         dilation = torch.nn.modules.utils._pair(dilation)
+
+         # Calculate padding dimensions
+         pad_h = (kernel_size[0] - 1) * dilation[0]
+         pad_w = (kernel_size[1] - 1) * dilation[1]
+
+         # The padding tuple for F.pad is (pad_left, pad_right, pad_top, pad_bottom)
+         match self.causality_axis:
+             case CausalityAxis.NONE:
+                 self.padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)
+             case CausalityAxis.WIDTH | CausalityAxis.WIDTH_COMPATIBILITY:
+                 self.padding = (pad_w, 0, pad_h // 2, pad_h - pad_h // 2)
+             case CausalityAxis.HEIGHT:
+                 self.padding = (pad_w // 2, pad_w - pad_w // 2, pad_h, 0)
+             case _:
+                 raise ValueError(f"Invalid causality_axis: {causality_axis}")
+
+         # The internal convolution layer uses no padding, as we handle it manually
+         self.conv = torch.nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size,
+             stride=stride,
+             padding=0,
+             dilation=dilation,
+             groups=groups,
+             bias=bias,
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # Apply causal padding before convolution
+         x = F.pad(x, self.padding)
+         return self.conv(x)
+
+
+ def make_conv2d(
+     in_channels: int,
+     out_channels: int,
+     kernel_size: int | tuple[int, int],
+     stride: int = 1,
+     padding: tuple[int, int, int, int] | None = None,
+     dilation: int = 1,
+     groups: int = 1,
+     bias: bool = True,
+     causality_axis: CausalityAxis | None = None,
+ ) -> torch.nn.Module:
+     """
+     Create a 2D convolution layer that can be either causal or non-causal.
+
+     Args:
+         in_channels: Number of input channels
+         out_channels: Number of output channels
+         kernel_size: Size of the convolution kernel
+         stride: Convolution stride
+         padding: Padding (if None, it is derived from the kernel size; causal layers compute it internally)
+         dilation: Dilation rate
+         groups: Number of groups for grouped convolution
+         bias: Whether to use bias
+         causality_axis: Dimension along which to apply causality.
+
+     Returns:
+         Either a regular Conv2d or CausalConv2d layer
+     """
+     if causality_axis is not None:
+         # For causal convolution, padding is handled internally by CausalConv2d
+         return CausalConv2d(in_channels, out_channels, kernel_size, stride, dilation, groups, bias, causality_axis)
+     else:
+         # For non-causal convolution, use symmetric padding if not specified
+         if padding is None:
+             padding = kernel_size // 2 if isinstance(kernel_size, int) else tuple(k // 2 for k in kernel_size)
+
+         return torch.nn.Conv2d(
+             in_channels,
+             out_channels,
+             kernel_size,
+             stride,
+             padding,
+             dilation,
+             groups,
+             bias,
+         )
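A quick property check of the causal padding (a sketch, assuming the package import paths shown above): perturbing the last position along the causal axis must leave all earlier outputs unchanged.

    import torch

    from ltx_core.model.audio_vae.causal_conv_2d import CausalConv2d
    from ltx_core.model.audio_vae.causality_axis import CausalityAxis

    torch.manual_seed(0)
    conv = CausalConv2d(1, 1, kernel_size=3, causality_axis=CausalityAxis.WIDTH)
    x = torch.randn(1, 1, 4, 8)
    x_pert = x.clone()
    x_pert[..., -1] += 1.0  # change only the last step on the width (time) axis
    y, y_pert = conv(x), conv(x_pert)
    assert torch.allclose(y[..., :-1], y_pert[..., :-1])  # outputs before t are untouched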
packages/ltx-core/src/ltx_core/model/audio_vae/downsample.py ADDED
@@ -0,0 +1,110 @@
+ from typing import Set, Tuple
+
+ import torch
+
+ from ltx_core.model.audio_vae.attention import AttentionType, make_attn
+ from ltx_core.model.audio_vae.causality_axis import CausalityAxis
+ from ltx_core.model.audio_vae.resnet import ResnetBlock
+ from ltx_core.model.common.normalization import NormType
+
+
+ class Downsample(torch.nn.Module):
+     """
+     A downsampling layer that can use either a strided convolution
+     or average pooling. Supports standard and causal padding for the
+     convolutional mode.
+     """
+
+     def __init__(
+         self,
+         in_channels: int,
+         with_conv: bool,
+         causality_axis: CausalityAxis = CausalityAxis.WIDTH,
+     ) -> None:
+         super().__init__()
+         self.with_conv = with_conv
+         self.causality_axis = causality_axis
+
+         if self.causality_axis != CausalityAxis.NONE and not self.with_conv:
+             raise ValueError("causality is only supported when `with_conv=True`.")
+
+         if self.with_conv:
+             # Do time downsampling here. Torch convolutions have no asymmetric
+             # padding, so it is applied manually in forward().
+             self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         if self.with_conv:
+             # Padding tuple is in the order: (left, right, top, bottom).
+             match self.causality_axis:
+                 case CausalityAxis.NONE:
+                     pad = (0, 1, 0, 1)
+                 case CausalityAxis.WIDTH:
+                     pad = (2, 0, 0, 1)
+                 case CausalityAxis.HEIGHT:
+                     pad = (0, 1, 2, 0)
+                 case CausalityAxis.WIDTH_COMPATIBILITY:
+                     pad = (1, 0, 0, 1)
+                 case _:
+                     raise ValueError(f"Invalid causality_axis: {self.causality_axis}")
+
+             x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+             x = self.conv(x)
+         else:
+             # This branch is only taken if with_conv=False, which implies causality_axis is NONE.
+             x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+
+         return x
+
+
+ def build_downsampling_path(  # noqa: PLR0913
+     *,
+     ch: int,
+     ch_mult: Tuple[int, ...],
+     num_resolutions: int,
+     num_res_blocks: int,
+     resolution: int,
+     temb_channels: int,
+     dropout: float,
+     norm_type: NormType,
+     causality_axis: CausalityAxis,
+     attn_type: AttentionType,
+     attn_resolutions: Set[int],
+     resamp_with_conv: bool,
+ ) -> tuple[torch.nn.ModuleList, int]:
+     """Build the downsampling path with residual blocks, attention, and downsampling layers."""
+     down_modules = torch.nn.ModuleList()
+     curr_res = resolution
+     in_ch_mult = (1, *tuple(ch_mult))
+     block_in = ch
+
+     for i_level in range(num_resolutions):
+         block = torch.nn.ModuleList()
+         attn = torch.nn.ModuleList()
+         block_in = ch * in_ch_mult[i_level]
+         block_out = ch * ch_mult[i_level]
+
+         for _ in range(num_res_blocks):
+             block.append(
+                 ResnetBlock(
+                     in_channels=block_in,
+                     out_channels=block_out,
+                     temb_channels=temb_channels,
+                     dropout=dropout,
+                     norm_type=norm_type,
+                     causality_axis=causality_axis,
+                 )
+             )
+             block_in = block_out
+             if curr_res in attn_resolutions:
+                 attn.append(make_attn(block_in, attn_type=attn_type, norm_type=norm_type))
+
+         down = torch.nn.Module()
+         down.block = block
+         down.attn = attn
+         if i_level != num_resolutions - 1:
+             down.downsample = Downsample(block_in, resamp_with_conv, causality_axis=causality_axis)
+             curr_res = curr_res // 2
+         down_modules.append(down)
+
+     return down_modules, block_in
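Shape sketch for the causal strided downsample (assuming the import paths above): width is left-padded by 2 and height bottom-padded by 1 before the stride-2 conv, so both spatial dims land on floor((32 + pad - 3) / 2) + 1 = 16.

    import torch

    from ltx_core.model.audio_vae.causality_axis import CausalityAxis
    from ltx_core.model.audio_vae.downsample import Downsample

    down = Downsample(16, with_conv=True, causality_axis=CausalityAxis.WIDTH)
    x = torch.randn(1, 16, 32, 32)
    print(down(x).shape)  # torch.Size([1, 16, 16, 16])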
packages/ltx-core/src/ltx_core/model/audio_vae/model_configurator.py ADDED
@@ -0,0 +1,200 @@
+ import torch
+
+ from ltx_core.loader.sd_ops import KeyValueOperationResult, SDOps
+ from ltx_core.model.audio_vae.attention import AttentionType
+ from ltx_core.model.audio_vae.audio_vae import AudioDecoder, AudioEncoder
+ from ltx_core.model.audio_vae.causality_axis import CausalityAxis
+ from ltx_core.model.audio_vae.vocoder import MelSTFT, Vocoder, VocoderWithBWE
+ from ltx_core.model.common.normalization import NormType
+ from ltx_core.model.model_protocol import ModelConfigurator
+ from ltx_core.utils import check_config_value
+
+
+ def _vocoder_from_config(
+     cfg: dict,
+     apply_final_activation: bool = True,
+     output_sampling_rate: int | None = None,
+ ) -> Vocoder:
+     """Instantiate a Vocoder from a flat config dict.
+
+     Args:
+         cfg: Vocoder config dict (keys match Vocoder constructor args).
+         apply_final_activation: Whether to apply tanh/clamp at the output.
+         output_sampling_rate: Explicit override for the output sample rate.
+             When None, reads from cfg["output_sampling_rate"] (default 24000).
+     """
+     return Vocoder(
+         resblock_kernel_sizes=cfg.get("resblock_kernel_sizes", [3, 7, 11]),
+         upsample_rates=cfg.get("upsample_rates", [6, 5, 2, 2, 2]),
+         upsample_kernel_sizes=cfg.get("upsample_kernel_sizes", [16, 15, 8, 4, 4]),
+         resblock_dilation_sizes=cfg.get("resblock_dilation_sizes", [[1, 3, 5], [1, 3, 5], [1, 3, 5]]),
+         upsample_initial_channel=cfg.get("upsample_initial_channel", 1024),
+         resblock=cfg.get("resblock", "1"),
+         output_sampling_rate=(
+             output_sampling_rate if output_sampling_rate is not None else cfg.get("output_sampling_rate", 24000)
+         ),
+         activation=cfg.get("activation", "snake"),
+         use_tanh_at_final=cfg.get("use_tanh_at_final", True),
+         apply_final_activation=apply_final_activation,
+         use_bias_at_final=cfg.get("use_bias_at_final", True),
+     )
+
+
+ class VocoderConfigurator(ModelConfigurator[Vocoder]):
+     """Configurator that auto-detects the checkpoint format.
+
+     Returns a plain Vocoder for pre-ltx-2.3 checkpoints (flat config) or a
+     VocoderWithBWE for ltx-2.3+ checkpoints (nested "vocoder" + "bwe" config).
+     """
+
+     @classmethod
+     def from_config(cls: type[Vocoder], config: dict) -> Vocoder | VocoderWithBWE:
+         cfg = config.get("vocoder", {})
+
+         if "bwe" not in cfg:
+             check_config_value(cfg, "resblock", "1")
+             check_config_value(cfg, "stereo", True)
+             return _vocoder_from_config(cfg)
+
+         vocoder_cfg = cfg.get("vocoder", {})
+         bwe_cfg = cfg["bwe"]
+
+         check_config_value(vocoder_cfg, "resblock", "AMP1")
+         check_config_value(vocoder_cfg, "stereo", True)
+         check_config_value(vocoder_cfg, "activation", "snakebeta")
+         check_config_value(bwe_cfg, "resblock", "AMP1")
+         check_config_value(bwe_cfg, "stereo", True)
+         check_config_value(bwe_cfg, "activation", "snakebeta")
+
+         vocoder = _vocoder_from_config(
+             vocoder_cfg,
+             output_sampling_rate=bwe_cfg["input_sampling_rate"],
+         )
+         bwe_generator = _vocoder_from_config(
+             bwe_cfg,
+             apply_final_activation=False,
+             output_sampling_rate=bwe_cfg["output_sampling_rate"],
+         )
+         mel_stft = MelSTFT(
+             filter_length=bwe_cfg["n_fft"],
+             hop_length=bwe_cfg["hop_length"],
+             win_length=bwe_cfg["n_fft"],
+             n_mel_channels=bwe_cfg["num_mels"],
+         )
+         return VocoderWithBWE(
+             vocoder=vocoder,
+             bwe_generator=bwe_generator,
+             mel_stft=mel_stft,
+             input_sampling_rate=bwe_cfg["input_sampling_rate"],
+             output_sampling_rate=bwe_cfg["output_sampling_rate"],
+             hop_length=bwe_cfg["hop_length"],
+         )
+
+
+ def _strip_vocoder_prefix(key: str, value: torch.Tensor) -> list[KeyValueOperationResult]:
+     """Strip the leading 'vocoder.' prefix exactly once.
+
+     Uses removeprefix instead of str.replace so that BWE keys like
+     'vocoder.vocoder.conv_pre' become 'vocoder.conv_pre' (not 'conv_pre').
+     Works identically for legacy keys like 'vocoder.conv_pre' → 'conv_pre'.
+     """
+     return [KeyValueOperationResult(key.removeprefix("vocoder."), value)]
+
+
+ VOCODER_COMFY_KEYS_FILTER = (
+     SDOps("VOCODER_COMFY_KEYS_FILTER")
+     .with_matching(prefix="vocoder.")
+     .with_kv_operation(operation=_strip_vocoder_prefix, key_prefix="vocoder.")
+ )
+
+
+ class AudioDecoderConfigurator(ModelConfigurator[AudioDecoder]):
+     @classmethod
+     def from_config(cls: type[AudioDecoder], config: dict) -> AudioDecoder:
+         audio_vae_cfg = config.get("audio_vae", {})
+         model_cfg = audio_vae_cfg.get("model", {})
+         model_params = model_cfg.get("params", {})
+         ddconfig = model_params.get("ddconfig", {})
+         preprocessing_cfg = audio_vae_cfg.get("preprocessing", {})
+         stft_cfg = preprocessing_cfg.get("stft", {})
+         mel_cfg = preprocessing_cfg.get("mel", {})
+         variables_cfg = audio_vae_cfg.get("variables", {})
+
+         sample_rate = model_params.get("sampling_rate", 16000)
+         mel_hop_length = stft_cfg.get("hop_length", 160)
+         is_causal = stft_cfg.get("causal", True)
+         mel_bins = ddconfig.get("mel_bins") or mel_cfg.get("n_mel_channels") or variables_cfg.get("mel_bins")
+
+         return AudioDecoder(
+             ch=ddconfig.get("ch", 128),
+             out_ch=ddconfig.get("out_ch", 2),
+             ch_mult=tuple(ddconfig.get("ch_mult", (1, 2, 4))),
+             num_res_blocks=ddconfig.get("num_res_blocks", 2),
+             attn_resolutions=ddconfig.get("attn_resolutions", {8, 16, 32}),
+             resolution=ddconfig.get("resolution", 256),
+             z_channels=ddconfig.get("z_channels", 8),
+             norm_type=NormType(ddconfig.get("norm_type", "pixel")),
+             causality_axis=CausalityAxis(ddconfig.get("causality_axis", "height")),
+             dropout=ddconfig.get("dropout", 0.0),
+             mid_block_add_attention=ddconfig.get("mid_block_add_attention", True),
+             sample_rate=sample_rate,
+             mel_hop_length=mel_hop_length,
+             is_causal=is_causal,
+             mel_bins=mel_bins,
+         )
+
+
+ class AudioEncoderConfigurator(ModelConfigurator[AudioEncoder]):
+     @classmethod
+     def from_config(cls: type[AudioEncoder], config: dict) -> AudioEncoder:
+         audio_vae_cfg = config.get("audio_vae", {})
+         model_cfg = audio_vae_cfg.get("model", {})
+         model_params = model_cfg.get("params", {})
+         ddconfig = model_params.get("ddconfig", {})
+         preprocessing_cfg = audio_vae_cfg.get("preprocessing", {})
+         stft_cfg = preprocessing_cfg.get("stft", {})
+         mel_cfg = preprocessing_cfg.get("mel", {})
+         variables_cfg = audio_vae_cfg.get("variables", {})
+
+         sample_rate = model_params.get("sampling_rate", 16000)
+         mel_hop_length = stft_cfg.get("hop_length", 160)
+         n_fft = stft_cfg.get("filter_length", 1024)
+         is_causal = stft_cfg.get("causal", True)
+         mel_bins = ddconfig.get("mel_bins") or mel_cfg.get("n_mel_channels") or variables_cfg.get("mel_bins")
+
+         return AudioEncoder(
+             ch=ddconfig.get("ch", 128),
+             ch_mult=tuple(ddconfig.get("ch_mult", (1, 2, 4))),
+             num_res_blocks=ddconfig.get("num_res_blocks", 2),
+             attn_resolutions=ddconfig.get("attn_resolutions", {8, 16, 32}),
+             resolution=ddconfig.get("resolution", 256),
+             z_channels=ddconfig.get("z_channels", 8),
+             double_z=ddconfig.get("double_z", True),
+             dropout=ddconfig.get("dropout", 0.0),
+             resamp_with_conv=ddconfig.get("resamp_with_conv", True),
+             in_channels=ddconfig.get("in_channels", 2),
+             attn_type=AttentionType(ddconfig.get("attn_type", "vanilla")),
+             mid_block_add_attention=ddconfig.get("mid_block_add_attention", True),
+             norm_type=NormType(ddconfig.get("norm_type", "pixel")),
+             causality_axis=CausalityAxis(ddconfig.get("causality_axis", "height")),
+             sample_rate=sample_rate,
+             mel_hop_length=mel_hop_length,
+             n_fft=n_fft,
+             is_causal=is_causal,
+             mel_bins=mel_bins,
+         )
+
+
+ AUDIO_VAE_DECODER_COMFY_KEYS_FILTER = (
+     SDOps("AUDIO_VAE_DECODER_COMFY_KEYS_FILTER")
+     .with_matching(prefix="audio_vae.decoder.")
+     .with_matching(prefix="audio_vae.per_channel_statistics.")
+     .with_replacement("audio_vae.decoder.", "")
+     .with_replacement("audio_vae.per_channel_statistics.", "per_channel_statistics.")
+ )
+
+
+ AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER = (
+     SDOps("AUDIO_VAE_ENCODER_COMFY_KEYS_FILTER")
+     .with_matching(prefix="audio_vae.encoder.")
+     .with_matching(prefix="audio_vae.per_channel_statistics.")
+     .with_replacement("audio_vae.encoder.", "")
+     .with_replacement("audio_vae.per_channel_statistics.", "per_channel_statistics.")
+ )
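A from_config sketch; the config fragment below is hypothetical and only mirrors the keys read above (real values come from the audio VAE checkpoint, and weights are loaded separately via the SDOps filters):

    from ltx_core.model.audio_vae.model_configurator import AudioDecoderConfigurator

    config = {
        "audio_vae": {
            "model": {
                "params": {
                    "sampling_rate": 16000,
                    "ddconfig": {"ch": 128, "ch_mult": [1, 2, 4], "z_channels": 8, "norm_type": "pixel"},
                }
            },
            "preprocessing": {"stft": {"hop_length": 160, "causal": True}, "mel": {"n_mel_channels": 64}},
        }
    }
    decoder = AudioDecoderConfigurator.from_config(config)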
packages/ltx-core/src/ltx_core/model/audio_vae/ops.py ADDED
@@ -0,0 +1,73 @@
+ import torch
+ import torchaudio
+ from torch import nn
+
+ from ltx_core.types import Audio
+
+
+ class AudioProcessor(nn.Module):
+     """Converts audio waveforms to log-mel spectrograms with optional resampling."""
+
+     def __init__(
+         self,
+         target_sample_rate: int,
+         mel_bins: int,
+         mel_hop_length: int,
+         n_fft: int,
+     ) -> None:
+         super().__init__()
+         self.target_sample_rate = target_sample_rate
+         self.mel_transform = torchaudio.transforms.MelSpectrogram(
+             sample_rate=target_sample_rate,
+             n_fft=n_fft,
+             win_length=n_fft,
+             hop_length=mel_hop_length,
+             f_min=0.0,
+             f_max=target_sample_rate / 2.0,
+             n_mels=mel_bins,
+             window_fn=torch.hann_window,
+             center=True,
+             pad_mode="reflect",
+             power=1.0,
+             mel_scale="slaney",
+             norm="slaney",
+         )
+
+     def resample_audio(self, audio: Audio) -> Audio:
+         """Resample audio to the processor's target sample rate if needed."""
+         if audio.sampling_rate == self.target_sample_rate:
+             return audio
+         resampled = torchaudio.functional.resample(audio.waveform, audio.sampling_rate, self.target_sample_rate)
+         resampled = resampled.to(device=audio.waveform.device, dtype=audio.waveform.dtype)
+         return Audio(waveform=resampled, sampling_rate=self.target_sample_rate)
+
+     def waveform_to_mel(
+         self,
+         audio: Audio,
+     ) -> torch.Tensor:
+         """Convert waveform to log-mel spectrogram [batch, channels, time, n_mels]."""
+         waveform = self.resample_audio(audio).waveform
+
+         mel = self.mel_transform(waveform)
+         mel = torch.log(torch.clamp(mel, min=1e-5))
+
+         mel = mel.to(device=waveform.device, dtype=waveform.dtype)
+         return mel.permute(0, 1, 3, 2).contiguous()
+
+
+ class PerChannelStatistics(nn.Module):
+     """
+     Per-channel statistics for normalizing and denormalizing the latent representation.
+
+     These statistics are computed over the entire dataset and stored in the model's
+     checkpoint under the AudioVAE state_dict.
+     """
+
+     def __init__(self, latent_channels: int = 128) -> None:
+         super().__init__()
+         self.register_buffer("std-of-means", torch.empty(latent_channels))
+         self.register_buffer("mean-of-means", torch.empty(latent_channels))
+
+     def un_normalize(self, x: torch.Tensor) -> torch.Tensor:
+         return (x * self.get_buffer("std-of-means").to(x)) + self.get_buffer("mean-of-means").to(x)
+
+     def normalize(self, x: torch.Tensor) -> torch.Tensor:
+         return (x - self.get_buffer("mean-of-means").to(x)) / self.get_buffer("std-of-means").to(x)
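Shape sketch for the mel front end (assuming the import paths above): with center=True and hop 160, one second of 16 kHz audio yields 16000 // 160 + 1 = 101 frames.

    import torch

    from ltx_core.model.audio_vae.ops import AudioProcessor
    from ltx_core.types import Audio

    proc = AudioProcessor(target_sample_rate=16000, mel_bins=64, mel_hop_length=160, n_fft=1024)
    audio = Audio(waveform=torch.randn(1, 2, 16000), sampling_rate=16000)  # 1 s of stereo noise
    mel = proc.waveform_to_mel(audio)
    print(mel.shape)  # torch.Size([1, 2, 101, 64]): (batch, channels, frames, mel bins)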
packages/ltx-core/src/ltx_core/model/audio_vae/resnet.py ADDED
@@ -0,0 +1,176 @@
+ from typing import Tuple
+
+ import torch
+
+ from ltx_core.model.audio_vae.causal_conv_2d import make_conv2d
+ from ltx_core.model.audio_vae.causality_axis import CausalityAxis
+ from ltx_core.model.common.normalization import NormType, build_normalization_layer
+
+ LRELU_SLOPE = 0.1
+
+
+ class ResBlock1(torch.nn.Module):
+     def __init__(self, channels: int, kernel_size: int = 3, dilation: Tuple[int, int, int] = (1, 3, 5)):
+         super().__init__()
+         self.convs1 = torch.nn.ModuleList(
+             [
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=dilation[0],
+                     padding="same",
+                 ),
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=dilation[1],
+                     padding="same",
+                 ),
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=dilation[2],
+                     padding="same",
+                 ),
+             ]
+         )
+
+         self.convs2 = torch.nn.ModuleList(
+             [
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=1,
+                     padding="same",
+                 ),
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=1,
+                     padding="same",
+                 ),
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=1,
+                     padding="same",
+                 ),
+             ]
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         for conv1, conv2 in zip(self.convs1, self.convs2, strict=True):
+             xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
+             xt = conv1(xt)
+             xt = torch.nn.functional.leaky_relu(xt, LRELU_SLOPE)
+             xt = conv2(xt)
+             x = xt + x
+         return x
+
+
+ class ResBlock2(torch.nn.Module):
+     def __init__(self, channels: int, kernel_size: int = 3, dilation: Tuple[int, int] = (1, 3)):
+         super().__init__()
+         self.convs = torch.nn.ModuleList(
+             [
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=dilation[0],
+                     padding="same",
+                 ),
+                 torch.nn.Conv1d(
+                     channels,
+                     channels,
+                     kernel_size,
+                     1,
+                     dilation=dilation[1],
+                     padding="same",
+                 ),
+             ]
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         for conv in self.convs:
+             xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
+             xt = conv(xt)
+             x = xt + x
+         return x
+
+
+ class ResnetBlock(torch.nn.Module):
+     def __init__(
+         self,
+         *,
+         in_channels: int,
+         out_channels: int | None = None,
+         conv_shortcut: bool = False,
+         dropout: float = 0.0,
+         temb_channels: int = 512,
+         norm_type: NormType = NormType.GROUP,
+         causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
+     ) -> None:
+         super().__init__()
+         self.causality_axis = causality_axis
+
+         if self.causality_axis != CausalityAxis.NONE and norm_type == NormType.GROUP:
+             raise ValueError("Causal ResnetBlock with GroupNorm is not supported.")
+         self.in_channels = in_channels
+         out_channels = in_channels if out_channels is None else out_channels
+         self.out_channels = out_channels
+         self.use_conv_shortcut = conv_shortcut
+
+         self.norm1 = build_normalization_layer(in_channels, normtype=norm_type)
+         self.non_linearity = torch.nn.SiLU()
+         self.conv1 = make_conv2d(in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
+         if temb_channels > 0:
+             self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+         self.norm2 = build_normalization_layer(out_channels, normtype=norm_type)
+         self.dropout = torch.nn.Dropout(dropout)
+         self.conv2 = make_conv2d(out_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
+         if self.in_channels != self.out_channels:
+             if self.use_conv_shortcut:
+                 self.conv_shortcut = make_conv2d(
+                     in_channels, out_channels, kernel_size=3, stride=1, causality_axis=causality_axis
+                 )
+             else:
+                 self.nin_shortcut = make_conv2d(
+                     in_channels, out_channels, kernel_size=1, stride=1, causality_axis=causality_axis
+                 )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         temb: torch.Tensor | None = None,
+     ) -> torch.Tensor:
+         h = x
+         h = self.norm1(h)
+         h = self.non_linearity(h)
+         h = self.conv1(h)
+
+         if temb is not None:
+             h = h + self.temb_proj(self.non_linearity(temb))[:, :, None, None]
+
+         h = self.norm2(h)
+         h = self.non_linearity(h)
+         h = self.dropout(h)
+         h = self.conv2(h)
+
+         if self.in_channels != self.out_channels:
+             x = self.conv_shortcut(x) if self.use_conv_shortcut else self.nin_shortcut(x)
+
+         return x + h
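A ResnetBlock shape sketch (assumptions: the package is importable, and NormType.PIXEL is the enum member behind the "pixel" config value used by the configurators above): with differing channel counts, the 1x1 nin_shortcut aligns the skip path.

    import torch

    from ltx_core.model.audio_vae.causality_axis import CausalityAxis
    from ltx_core.model.audio_vae.resnet import ResnetBlock
    from ltx_core.model.common.normalization import NormType

    block = ResnetBlock(
        in_channels=32, out_channels=64, temb_channels=0,
        norm_type=NormType.PIXEL, causality_axis=CausalityAxis.HEIGHT,
    )
    x = torch.randn(1, 32, 16, 16)
    print(block(x).shape)  # torch.Size([1, 64, 16, 16])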
packages/ltx-core/src/ltx_core/model/audio_vae/upsample.py ADDED
@@ -0,0 +1,106 @@
+ from typing import Set, Tuple
+
+ import torch
+
+ from ltx_core.model.audio_vae.attention import AttentionType, make_attn
+ from ltx_core.model.audio_vae.causal_conv_2d import make_conv2d
+ from ltx_core.model.audio_vae.causality_axis import CausalityAxis
+ from ltx_core.model.audio_vae.resnet import ResnetBlock
+ from ltx_core.model.common.normalization import NormType
+
+
+ class Upsample(torch.nn.Module):
+     def __init__(
+         self,
+         in_channels: int,
+         with_conv: bool,
+         causality_axis: CausalityAxis = CausalityAxis.HEIGHT,
+     ) -> None:
+         super().__init__()
+         self.with_conv = with_conv
+         self.causality_axis = causality_axis
+         if self.with_conv:
+             self.conv = make_conv2d(in_channels, in_channels, kernel_size=3, stride=1, causality_axis=causality_axis)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+         if self.with_conv:
+             x = self.conv(x)
+         # Drop the FIRST element on the causal axis to undo the encoder's padding, keeping the length 1 + 2 * n.
+         # For example, if the input is [0, 1, 2], after interpolation the output is [0, 0, 1, 1, 2, 2].
+         # The causal convolution pads the left edge as [-, -, 0, 0, 1, 1, 2, 2],
+         # so the output elements rely on the following windows:
+         # 0: [-,-,0]
+         # 1: [-,0,0]
+         # 2: [0,0,1]
+         # 3: [0,1,1]
+         # 4: [1,1,2]
+         # 5: [1,2,2]
+         # Notice that the first and second elements in the output rely only on the first element in the input,
+         # while all other elements rely on two elements in the input.
+         # So we drop the first element (rather than the last) to undo the padding.
+         # This is a no-op for non-causal convolutions.
+         match self.causality_axis:
+             case CausalityAxis.NONE:
+                 pass  # x remains unchanged
+             case CausalityAxis.HEIGHT:
+                 x = x[:, :, 1:, :]
+             case CausalityAxis.WIDTH:
+                 x = x[:, :, :, 1:]
+             case CausalityAxis.WIDTH_COMPATIBILITY:
+                 pass  # x remains unchanged
+             case _:
+                 raise ValueError(f"Invalid causality_axis: {self.causality_axis}")
+
+         return x
+
+
+ def build_upsampling_path(  # noqa: PLR0913
+     *,
+     ch: int,
+     ch_mult: Tuple[int, ...],
+     num_resolutions: int,
+     num_res_blocks: int,
+     resolution: int,
+     temb_channels: int,
+     dropout: float,
+     norm_type: NormType,
+     causality_axis: CausalityAxis,
+     attn_type: AttentionType,
+     attn_resolutions: Set[int],
+     resamp_with_conv: bool,
+     initial_block_channels: int,
+ ) -> tuple[torch.nn.ModuleList, int]:
+     """Build the upsampling path with residual blocks, attention, and upsampling layers."""
+     up_modules = torch.nn.ModuleList()
+     block_in = initial_block_channels
+     curr_res = resolution // (2 ** (num_resolutions - 1))
+
+     for level in reversed(range(num_resolutions)):
+         stage = torch.nn.Module()
+         stage.block = torch.nn.ModuleList()
+         stage.attn = torch.nn.ModuleList()
+         block_out = ch * ch_mult[level]
+
+         for _ in range(num_res_blocks + 1):
+             stage.block.append(
+                 ResnetBlock(
+                     in_channels=block_in,
+                     out_channels=block_out,
+                     temb_channels=temb_channels,
+                     dropout=dropout,
+                     norm_type=norm_type,
+                     causality_axis=causality_axis,
+                 )
+             )
+             block_in = block_out
+             if curr_res in attn_resolutions:
+                 stage.attn.append(make_attn(block_in, attn_type=attn_type, norm_type=norm_type))
+
+         if level != 0:
+             stage.upsample = Upsample(block_in, resamp_with_conv, causality_axis=causality_axis)
+             curr_res *= 2
+
+         up_modules.insert(0, stage)
+
+     return up_modules, block_in
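Length sketch for the causal crop described in the comment above (assuming the import paths in this diff): the causal axis doubles under interpolation and then loses its first element, so a length of 1 + 2n maps to 2(1 + 2n) - 1.

    import torch

    from ltx_core.model.audio_vae.causality_axis import CausalityAxis
    from ltx_core.model.audio_vae.upsample import Upsample

    up = Upsample(8, with_conv=True, causality_axis=CausalityAxis.HEIGHT)
    x = torch.randn(1, 8, 5, 12)  # 5 = 1 + 2*2 frames on the causal (height) axis
    print(up(x).shape)  # torch.Size([1, 8, 9, 24]): height doubles to 10, then the first row is dropped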
packages/ltx-core/src/ltx_core/model/audio_vae/vocoder.py ADDED
@@ -0,0 +1,575 @@
+ import math
+ from typing import List
+
+ import einops
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+
+ from ltx_core.model.audio_vae.resnet import LRELU_SLOPE, ResBlock1
+
+
+ def get_padding(kernel_size: int, dilation: int = 1) -> int:
+     return int((kernel_size * dilation - dilation) / 2)
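For example, get_padding(7) = (7 - 1) // 2 = 3, matching the padding=3 used with the kernel-7 conv_pre and conv_post layers below, and get_padding(3, dilation=5) = (15 - 5) // 2 = 5 keeps the dilated resblock convolutions at "same" length.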
+
+
+ # ---------------------------------------------------------------------------
+ # Anti-aliased resampling helpers (kaiser-sinc filters) for BigVGAN v2
+ # Adopted from https://github.com/NVIDIA/BigVGAN
+ # ---------------------------------------------------------------------------
+
+
+ def _sinc(x: torch.Tensor) -> torch.Tensor:
+     return torch.where(
+         x == 0,
+         torch.tensor(1.0, device=x.device, dtype=x.dtype),
+         torch.sin(math.pi * x) / math.pi / x,
+     )
+
+
+ def kaiser_sinc_filter1d(cutoff: float, half_width: float, kernel_size: int) -> torch.Tensor:
+     even = kernel_size % 2 == 0
+     half_size = kernel_size // 2
+     delta_f = 4 * half_width
+     amplitude = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+     if amplitude > 50.0:
+         beta = 0.1102 * (amplitude - 8.7)
+     elif amplitude >= 21.0:
+         beta = 0.5842 * (amplitude - 21) ** 0.4 + 0.07886 * (amplitude - 21.0)
+     else:
+         beta = 0.0
+     window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+     time = torch.arange(-half_size, half_size) + 0.5 if even else torch.arange(kernel_size) - half_size
+     if cutoff == 0:
+         filter_ = torch.zeros_like(time)
+     else:
+         filter_ = 2 * cutoff * window * _sinc(2 * cutoff * time)
+         filter_ /= filter_.sum()
+     return filter_.view(1, 1, kernel_size)
+
+
+ class LowPassFilter1d(nn.Module):
+     def __init__(
+         self,
+         cutoff: float = 0.5,
+         half_width: float = 0.6,
+         stride: int = 1,
+         padding: bool = True,
+         padding_mode: str = "replicate",
+         kernel_size: int = 12,
+     ) -> None:
+         super().__init__()
+         if cutoff < 0.0:
+             raise ValueError("Minimum cutoff must be larger than zero.")
+         if cutoff > 0.5:
+             raise ValueError("A cutoff above 0.5 does not make sense.")
+         self.kernel_size = kernel_size
+         self.even = kernel_size % 2 == 0
+         self.pad_left = kernel_size // 2 - int(self.even)
+         self.pad_right = kernel_size // 2
+         self.stride = stride
+         self.padding = padding
+         self.padding_mode = padding_mode
+         self.register_buffer("filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         _, n_channels, _ = x.shape
+         if self.padding:
+             x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+         return F.conv1d(x, self.filter.expand(n_channels, -1, -1), stride=self.stride, groups=n_channels)
+
+
+ class UpSample1d(nn.Module):
+     def __init__(
+         self,
+         ratio: int = 2,
+         kernel_size: int | None = None,
+         persistent: bool = True,
+         window_type: str = "kaiser",
+     ) -> None:
+         super().__init__()
+         self.ratio = ratio
+         self.stride = ratio
+
+         if window_type == "hann":
+             # Hann-windowed sinc filter equivalent to torchaudio.functional.resample
+             rolloff = 0.99
+             lowpass_filter_width = 6
+             width = math.ceil(lowpass_filter_width / rolloff)
+             self.kernel_size = 2 * width * ratio + 1
+             self.pad = width
+             self.pad_left = 2 * width * ratio
+             self.pad_right = self.kernel_size - ratio
+             time_axis = (torch.arange(self.kernel_size) / ratio - width) * rolloff
+             time_clamped = time_axis.clamp(-lowpass_filter_width, lowpass_filter_width)
+             window = torch.cos(time_clamped * math.pi / lowpass_filter_width / 2) ** 2
+             sinc_filter = (torch.sinc(time_axis) * window * rolloff / ratio).view(1, 1, -1)
+         else:
+             # Kaiser-windowed sinc filter (BigVGAN default).
+             self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+             self.pad = self.kernel_size // ratio - 1
+             self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+             self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+             sinc_filter = kaiser_sinc_filter1d(
+                 cutoff=0.5 / ratio,
+                 half_width=0.6 / ratio,
+                 kernel_size=self.kernel_size,
+             )
+
+         self.register_buffer("filter", sinc_filter, persistent=persistent)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         _, n_channels, _ = x.shape
+         x = F.pad(x, (self.pad, self.pad), mode="replicate")
+         filt = self.filter.to(dtype=x.dtype, device=x.device).expand(n_channels, -1, -1)
+         x = self.ratio * F.conv_transpose1d(x, filt, stride=self.stride, groups=n_channels)
+         return x[..., self.pad_left : -self.pad_right]
+
+
+ class DownSample1d(nn.Module):
+     def __init__(self, ratio: int = 2, kernel_size: int | None = None) -> None:
+         super().__init__()
+         self.ratio = ratio
+         self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+         self.lowpass = LowPassFilter1d(
+             cutoff=0.5 / ratio,
+             half_width=0.6 / ratio,
+             stride=ratio,
+             kernel_size=self.kernel_size,
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.lowpass(x)
+
+
+ class Activation1d(nn.Module):
+     def __init__(
+         self,
+         activation: nn.Module,
+         up_ratio: int = 2,
+         down_ratio: int = 2,
+         up_kernel_size: int = 12,
+         down_kernel_size: int = 12,
+     ) -> None:
+         super().__init__()
+         self.act = activation
+         self.upsample = UpSample1d(up_ratio, up_kernel_size)
+         self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.upsample(x)
+         x = self.act(x)
+         return self.downsample(x)
+
+
+ class Snake(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         alpha: float = 1.0,
+         alpha_trainable: bool = True,
+         alpha_logscale: bool = True,
+     ) -> None:
+         super().__init__()
+         self.alpha_logscale = alpha_logscale
+         self.alpha = nn.Parameter(torch.zeros(in_features) if alpha_logscale else torch.ones(in_features) * alpha)
+         self.alpha.requires_grad = alpha_trainable
+         self.eps = 1e-9
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
+         if self.alpha_logscale:
+             alpha = torch.exp(alpha)
+         return x + (1.0 / (alpha + self.eps)) * torch.sin(x * alpha).pow(2)
+
+
+ class SnakeBeta(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         alpha: float = 1.0,
+         alpha_trainable: bool = True,
+         alpha_logscale: bool = True,
+     ) -> None:
+         super().__init__()
+         self.alpha_logscale = alpha_logscale
+         self.alpha = nn.Parameter(torch.zeros(in_features) if alpha_logscale else torch.ones(in_features) * alpha)
+         self.alpha.requires_grad = alpha_trainable
+         self.beta = nn.Parameter(torch.zeros(in_features) if alpha_logscale else torch.ones(in_features) * alpha)
+         self.beta.requires_grad = alpha_trainable
+         self.eps = 1e-9
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
+         beta = self.beta.unsqueeze(0).unsqueeze(-1)
+         if self.alpha_logscale:
+             alpha = torch.exp(alpha)
+             beta = torch.exp(beta)
+         return x + (1.0 / (beta + self.eps)) * torch.sin(x * alpha).pow(2)
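In closed form, Snake computes x + (1/alpha) * sin^2(alpha * x) with one alpha per channel, while SnakeBeta decouples frequency and magnitude as x + (1/beta) * sin^2(alpha * x). With alpha_logscale=True the parameters are stored as logarithms and exponentiated in the forward pass, so the zero initialization above corresponds to alpha = beta = 1.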
209
+
210
+
211
+ class AMPBlock1(nn.Module):
212
+ def __init__(
213
+ self,
214
+ channels: int,
215
+ kernel_size: int = 3,
216
+ dilation: tuple[int, int, int] = (1, 3, 5),
217
+ activation: str = "snake",
218
+ ) -> None:
219
+ super().__init__()
220
+ act_cls = SnakeBeta if activation == "snakebeta" else Snake
221
+ self.convs1 = nn.ModuleList(
222
+ [
223
+ nn.Conv1d(
224
+ channels,
225
+ channels,
226
+ kernel_size,
227
+ 1,
228
+ dilation=dilation[0],
229
+ padding=get_padding(kernel_size, dilation[0]),
230
+ ),
231
+ nn.Conv1d(
232
+ channels,
233
+ channels,
234
+ kernel_size,
235
+ 1,
236
+ dilation=dilation[1],
237
+ padding=get_padding(kernel_size, dilation[1]),
238
+ ),
239
+ nn.Conv1d(
240
+ channels,
241
+ channels,
242
+ kernel_size,
243
+ 1,
244
+ dilation=dilation[2],
245
+ padding=get_padding(kernel_size, dilation[2]),
246
+ ),
247
+ ]
248
+ )
249
+
250
+ self.convs2 = nn.ModuleList(
251
+ [
252
+ nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
253
+ nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
254
+ nn.Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)),
255
+ ]
256
+ )
257
+
258
+ self.acts1 = nn.ModuleList([Activation1d(act_cls(channels)) for _ in range(len(self.convs1))])
259
+ self.acts2 = nn.ModuleList([Activation1d(act_cls(channels)) for _ in range(len(self.convs2))])
260
+
261
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
262
+ for c1, c2, a1, a2 in zip(self.convs1, self.convs2, self.acts1, self.acts2, strict=True):
263
+ xt = a1(x)
264
+ xt = c1(xt)
265
+ xt = a2(xt)
266
+ xt = c2(xt)
267
+ x = x + xt
268
+ return x
269
+
270
+
271
+ class Vocoder(torch.nn.Module):
272
+ """
273
+ Vocoder model for synthesizing audio from Mel spectrograms.
274
+ Args:
275
+ resblock_kernel_sizes: List of kernel sizes for the residual blocks.
276
+ This value is read from the checkpoint at `config.vocoder.resblock_kernel_sizes`.
277
+ upsample_rates: List of upsampling rates.
278
+ This value is read from the checkpoint at `config.vocoder.upsample_rates`.
279
+ upsample_kernel_sizes: List of kernel sizes for the upsampling layers.
280
+ This value is read from the checkpoint at `config.vocoder.upsample_kernel_sizes`.
281
+ resblock_dilation_sizes: List of dilation sizes for the residual blocks.
282
+ This value is read from the checkpoint at `config.vocoder.resblock_dilation_sizes`.
283
+ upsample_initial_channel: Initial number of channels for the upsampling layers.
284
+ This value is read from the checkpoint at `config.vocoder.upsample_initial_channel`.
285
+ resblock: Type of residual block to use ("1", "2", or "AMP1").
286
+ This value is read from the checkpoint at `config.vocoder.resblock`.
287
+ output_sampling_rate: Waveform sample rate.
288
+ This value is read from the checkpoint at `config.vocoder.output_sampling_rate`.
289
+ activation: Activation type for BigVGAN v2 ("snake" or "snakebeta"). Only used when resblock="AMP1".
290
+ use_tanh_at_final: Apply tanh at the output (when apply_final_activation=True).
291
+ apply_final_activation: Whether to apply the final tanh/clamp activation.
292
+ use_bias_at_final: Whether to use bias in the final conv layer.
293
+ """
294
+
295
+ def __init__( # noqa: PLR0913
296
+ self,
297
+ resblock_kernel_sizes: List[int] | None = None,
298
+ upsample_rates: List[int] | None = None,
299
+ upsample_kernel_sizes: List[int] | None = None,
300
+ resblock_dilation_sizes: List[List[int]] | None = None,
301
+ upsample_initial_channel: int = 1024,
302
+ resblock: str = "1",
303
+ output_sampling_rate: int = 24000,
304
+ activation: str = "snake",
305
+ use_tanh_at_final: bool = True,
306
+ apply_final_activation: bool = True,
307
+ use_bias_at_final: bool = True,
308
+ ) -> None:
309
+ super().__init__()
310
+
311
+ # Mutable default values are not supported as default arguments.
312
+ if resblock_kernel_sizes is None:
313
+ resblock_kernel_sizes = [3, 7, 11]
314
+ if upsample_rates is None:
315
+ upsample_rates = [6, 5, 2, 2, 2]
316
+ if upsample_kernel_sizes is None:
317
+ upsample_kernel_sizes = [16, 15, 8, 4, 4]
318
+ if resblock_dilation_sizes is None:
319
+ resblock_dilation_sizes = [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
320
+
321
+ self.output_sampling_rate = output_sampling_rate
322
+ self.num_kernels = len(resblock_kernel_sizes)
323
+ self.num_upsamples = len(upsample_rates)
324
+ self.use_tanh_at_final = use_tanh_at_final
325
+ self.apply_final_activation = apply_final_activation
326
+ self.is_amp = resblock == "AMP1"
327
+
328
+ # All production checkpoints are stereo: 128 input channels (2 stereo channels x 64 mel
329
+ # bins each), 2 output channels.
330
+ self.conv_pre = nn.Conv1d(
331
+ in_channels=128,
332
+ out_channels=upsample_initial_channel,
333
+ kernel_size=7,
334
+ stride=1,
335
+ padding=3,
336
+ )
337
+ resblock_cls = ResBlock1 if resblock == "1" else AMPBlock1
338
+
339
+ self.ups = nn.ModuleList(
340
+ nn.ConvTranspose1d(
341
+ upsample_initial_channel // (2**i),
342
+ upsample_initial_channel // (2 ** (i + 1)),
343
+ kernel_size,
344
+ stride,
345
+ padding=(kernel_size - stride) // 2,
346
+ )
347
+ for i, (stride, kernel_size) in enumerate(zip(upsample_rates, upsample_kernel_sizes, strict=True))
348
+ )
349
+
350
+ final_channels = upsample_initial_channel // (2 ** len(upsample_rates))
351
+ self.resblocks = nn.ModuleList()
352
+
353
+ for i in range(len(upsample_rates)):
354
+ ch = upsample_initial_channel // (2 ** (i + 1))
355
+ for kernel_size, dilations in zip(resblock_kernel_sizes, resblock_dilation_sizes, strict=True):
356
+ if self.is_amp:
357
+ self.resblocks.append(resblock_cls(ch, kernel_size, dilations, activation=activation))
358
+ else:
359
+ self.resblocks.append(resblock_cls(ch, kernel_size, dilations))
360
+
361
+ if self.is_amp:
362
+ self.act_post: nn.Module = Activation1d(SnakeBeta(final_channels))
363
+ else:
364
+ self.act_post = nn.LeakyReLU()
365
+
366
+ # All production checkpoints are stereo: this final conv maps `final_channels` to 2 output channels (stereo).
367
+ self.conv_post = nn.Conv1d(
368
+ in_channels=final_channels,
369
+ out_channels=2,
370
+ kernel_size=7,
371
+ stride=1,
372
+ padding=3,
373
+ bias=use_bias_at_final,
374
+ )
375
+
376
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
377
+ """
378
+ Forward pass of the vocoder.
379
+ Args:
380
+ x: Input Mel spectrogram tensor. Can be either:
381
+ - 3D: (batch_size, time, mel_bins) for mono
382
+ - 4D: (batch_size, 2, time, mel_bins) for stereo
383
+ Returns:
384
+ Audio waveform tensor of shape (batch_size, out_channels, audio_length)
385
+ """
386
+ x = x.transpose(2, 3) # (batch, channels, time, mel_bins) -> (batch, channels, mel_bins, time)
387
+
388
+ if x.dim() == 4: # stereo
389
+ assert x.shape[1] == 2, "Input must have 2 channels for stereo"
390
+ x = einops.rearrange(x, "b s c t -> b (s c) t")
391
+
392
+ x = self.conv_pre(x)
393
+
394
+ for i in range(self.num_upsamples):
395
+ if not self.is_amp:
396
+ x = F.leaky_relu(x, LRELU_SLOPE)
397
+ x = self.ups[i](x)
398
+ start = i * self.num_kernels
399
+ end = start + self.num_kernels
400
+
401
+ # Evaluate all resblocks with the same input tensor so they can run
402
+ # independently (and thus in parallel on accelerator hardware) before
403
+ # aggregating their outputs via mean.
404
+ block_outputs = torch.stack(
405
+ [self.resblocks[idx](x) for idx in range(start, end)],
406
+ dim=0,
407
+ )
408
+ x = block_outputs.mean(dim=0)
409
+
410
+ x = self.act_post(x)
411
+ x = self.conv_post(x)
412
+
413
+ if self.apply_final_activation:
414
+ x = torch.tanh(x) if self.use_tanh_at_final else torch.clamp(x, -1, 1)
415
+
416
+ return x
417
+
418
+
419
+ class _STFTFn(nn.Module):
420
+ """Implements STFT as a convolution with precomputed DFT x Hann-window bases.
421
+ The DFT basis rows (real and imaginary parts interleaved) multiplied by the causal
422
+ Hann window are stored as buffers and loaded from the checkpoint. Using the exact
423
+ bfloat16 bases from training ensures the mel values fed to the BWE generator are
424
+ bit-identical to what it was trained on.
425
+ """
426
+
427
+ def __init__(self, filter_length: int, hop_length: int, win_length: int) -> None:
428
+ super().__init__()
429
+ self.hop_length = hop_length
430
+ self.win_length = win_length
431
+ n_freqs = filter_length // 2 + 1
432
+ self.register_buffer("forward_basis", torch.zeros(n_freqs * 2, 1, filter_length))
433
+ self.register_buffer("inverse_basis", torch.zeros(n_freqs * 2, 1, filter_length))
434
+
435
+ def forward(self, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
436
+ """Compute magnitude and phase spectrogram from a batch of waveforms.
437
+ Applies causal (left-only) padding of win_length - hop_length samples so that
438
+ each output frame depends only on past and present input — no lookahead.
439
+ Args:
440
+ y: Waveform tensor of shape (B, T).
441
+ Returns:
442
+ magnitude: Linear amplitude spectrogram, shape (B, n_freqs, T_frames).
443
+ phase: Phase spectrogram in radians, shape (B, n_freqs, T_frames).
444
+ """
445
+ if y.dim() == 2:
446
+ y = y.unsqueeze(1) # (B, 1, T)
447
+ left_pad = max(0, self.win_length - self.hop_length) # causal: left-only
448
+ y = F.pad(y, (left_pad, 0))
449
+ spec = F.conv1d(y, self.forward_basis, stride=self.hop_length, padding=0)
450
+ n_freqs = spec.shape[1] // 2
451
+ real, imag = spec[:, :n_freqs], spec[:, n_freqs:]
452
+ magnitude = torch.sqrt(real**2 + imag**2)
453
+ phase = torch.atan2(imag.float(), real.float()).to(real.dtype)
454
+ return magnitude, phase
455
+
456
+
457
+ class MelSTFT(nn.Module):
458
+ """Causal log-mel spectrogram module whose buffers are loaded from the checkpoint.
459
+ Computes a log-mel spectrogram by running the causal STFT (_STFTFn) on the input
460
+ waveform and projecting the linear magnitude spectrum onto the mel filterbank.
461
+ The module's state dict layout matches the 'mel_stft.*' keys stored in the checkpoint
462
+ (mel_basis, stft_fn.forward_basis, stft_fn.inverse_basis).
463
+ """
464
+
465
+ def __init__(
466
+ self,
467
+ filter_length: int,
468
+ hop_length: int,
469
+ win_length: int,
470
+ n_mel_channels: int,
471
+ ) -> None:
472
+ super().__init__()
473
+ self.stft_fn = _STFTFn(filter_length, hop_length, win_length)
474
+
475
+ # Initialized to zeros; load_state_dict overwrites with the checkpoint's
476
+ # exact bfloat16 filterbank (vocoder.mel_stft.mel_basis, shape [n_mels, n_freqs]).
477
+ n_freqs = filter_length // 2 + 1
478
+ self.register_buffer("mel_basis", torch.zeros(n_mel_channels, n_freqs))
479
+
480
+ def mel_spectrogram(self, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
481
+ """Compute log-mel spectrogram and auxiliary spectral quantities.
482
+ Args:
483
+ y: Waveform tensor of shape (B, T).
484
+ Returns:
485
+ log_mel: Log-compressed mel spectrogram, shape (B, n_mel_channels, T_frames).
486
+ magnitude: Linear amplitude spectrogram, shape (B, n_freqs, T_frames).
487
+ phase: Phase spectrogram in radians, shape (B, n_freqs, T_frames).
488
+ energy: Per-frame energy (L2 norm over frequency), shape (B, T_frames).
489
+ """
490
+ magnitude, phase = self.stft_fn(y)
491
+ energy = torch.norm(magnitude, dim=1)
492
+ mel = torch.matmul(self.mel_basis.to(magnitude.dtype), magnitude)
493
+ log_mel = torch.log(torch.clamp(mel, min=1e-5))
494
+ return log_mel, magnitude, phase, energy
495
+
+
+ class VocoderWithBWE(nn.Module):
+     """Vocoder with bandwidth extension (BWE) upsampling.
+
+     Chains a mel-to-wav vocoder with a BWE module that upsamples the output
+     to a higher sample rate. The BWE computes a mel spectrogram from the
+     vocoder output, runs it through a second generator to predict a residual,
+     and adds it to a sinc-resampled skip connection.
+     """
+
+     def __init__(
+         self,
+         vocoder: Vocoder,
+         bwe_generator: Vocoder,
+         mel_stft: MelSTFT,
+         input_sampling_rate: int,
+         output_sampling_rate: int,
+         hop_length: int,
+     ) -> None:
+         super().__init__()
+         self.vocoder = vocoder
+         self.bwe_generator = bwe_generator
+         self.mel_stft = mel_stft
+         self.input_sampling_rate = input_sampling_rate
+         self.output_sampling_rate = output_sampling_rate
+         self.hop_length = hop_length
+         # Compute the resampler on CPU so the sinc filter is materialized even when
+         # the model is constructed on meta device (SingleGPUModelBuilder pattern).
+         # The filter is not stored in the checkpoint (persistent=False).
+         with torch.device("cpu"):
+             self.resampler = UpSample1d(
+                 ratio=output_sampling_rate // input_sampling_rate, persistent=False, window_type="hann"
+             )
+
+     @property
+     def conv_pre(self) -> nn.Conv1d:
+         return self.vocoder.conv_pre
+
+     @property
+     def conv_post(self) -> nn.Conv1d:
+         return self.vocoder.conv_post
+
+     def _compute_mel(self, audio: torch.Tensor) -> torch.Tensor:
+         """Compute log-mel spectrogram from waveform using causal STFT bases.
+
+         Args:
+             audio: Waveform tensor of shape (B, C, T).
+
+         Returns:
+             mel: Log-mel spectrogram of shape (B, C, n_mels, T_frames).
+         """
+         batch, n_channels, _ = audio.shape
+         flat = audio.reshape(batch * n_channels, -1)  # (B*C, T)
+         mel, _, _, _ = self.mel_stft.mel_spectrogram(flat)  # (B*C, n_mels, T_frames)
+         return mel.reshape(batch, n_channels, mel.shape[1], mel.shape[2])  # (B, C, n_mels, T_frames)
+
+     def forward(self, mel_spec: torch.Tensor) -> torch.Tensor:
+         """Run the full vocoder + BWE forward pass.
+
+         Args:
+             mel_spec: Mel spectrogram of shape (B, 2, T, mel_bins) for stereo
+                 or (B, T, mel_bins) for mono. Same format as Vocoder.forward.
+
+         Returns:
+             Waveform tensor of shape (B, out_channels, T_out) clipped to [-1, 1].
+         """
+         x = self.vocoder(mel_spec)
+         _, _, length_low_rate = x.shape
+         output_length = length_low_rate * self.output_sampling_rate // self.input_sampling_rate
+
+         # Pad to a multiple of hop_length so the mel frame count is exact
+         remainder = length_low_rate % self.hop_length
+         if remainder != 0:
+             x = F.pad(x, (0, self.hop_length - remainder))
+
+         # Compute mel spectrogram from the vocoder output: (B, C, n_mels, T_frames)
+         mel = self._compute_mel(x)
+
+         # Vocoder.forward expects (B, C, T, mel_bins), so transpose before calling bwe_generator
+         mel_for_bwe = mel.transpose(2, 3)  # (B, C, T_frames, mel_bins)
+         residual = self.bwe_generator(mel_for_bwe)
+         skip = self.resampler(x)
+         assert residual.shape == skip.shape, f"residual {residual.shape} != skip {skip.shape}"
+
+         return torch.clamp(residual + skip, -1, 1)[..., :output_length]
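For orientation, a wiring sketch under assumed sample rates (24 kHz in, 48 kHz out, hop 256); `vocoder`, `bwe_generator`, and `mel_stft` stand for already-built, already-loaded modules and are not constructed here:

    # Hypothetical wiring; the real rates and components come from the model config.
    bwe = VocoderWithBWE(
        vocoder=vocoder,              # mel -> waveform at 24 kHz
        bwe_generator=bwe_generator,  # mel of that waveform -> 48 kHz residual
        mel_stft=mel_stft,
        input_sampling_rate=24_000,
        output_sampling_rate=48_000,  # must be an integer multiple (UpSample1d ratio)
        hop_length=256,
    )
    wav_48k = bwe(mel_spec)  # (B, out_channels, T_24k * 2), clamped to [-1, 1]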
packages/ltx-core/src/ltx_core/model/model_protocol.py ADDED
@@ -0,0 +1,10 @@
+ from typing import Protocol, TypeVar
+
+ ModelType = TypeVar("ModelType")
+
+
+ class ModelConfigurator(Protocol[ModelType]):
+     """Protocol for model loader classes that instantiate models from a configuration dictionary."""
+
+     @classmethod
+     def from_config(cls, config: dict) -> ModelType: ...
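Since this is a structural Protocol, any class with a matching from_config classmethod conforms; no inheritance is required. A minimal sketch with hypothetical names (ToyModel and ToyModelConfigurator are illustrations, not part of the package):

    from dataclasses import dataclass

    @dataclass
    class ToyModel:
        hidden_dim: int

    class ToyModelConfigurator:
        # Structurally satisfies ModelConfigurator[ToyModel]; no base class needed.
        @classmethod
        def from_config(cls, config: dict) -> ToyModel:
            return ToyModel(hidden_dim=config["hidden_dim"])

    model = ToyModelConfigurator.from_config({"hidden_dim": 64})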
packages/ltx-core/src/ltx_core/model/transformer/feed_forward.py ADDED
@@ -0,0 +1,15 @@
+ import torch
+
+ from ltx_core.model.transformer.gelu_approx import GELUApprox
+
+
+ class FeedForward(torch.nn.Module):
+     def __init__(self, dim: int, dim_out: int, mult: int = 4) -> None:
+         super().__init__()
+         inner_dim = int(dim * mult)
+         project_in = GELUApprox(dim, inner_dim)
+
+         # The Identity is a no-op slot (presumably a dropped dropout layer) that keeps
+         # the final Linear at index net.2, preserving the checkpoint's state dict keys.
+         self.net = torch.nn.Sequential(project_in, torch.nn.Identity(), torch.nn.Linear(inner_dim, dim_out))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.net(x)
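A quick shape check, assuming (as the Sequential above implies) that GELUApprox(dim, inner_dim) maps the last dimension from dim to inner_dim:

    import torch

    ff = FeedForward(dim=512, dim_out=512, mult=4)  # inner_dim = 2048
    x = torch.randn(2, 77, 512)  # (batch, tokens, dim)
    y = ff(x)
    assert y.shape == x.shape  # Linear(2048 -> 512) restores the model width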
packages/ltx-core/src/ltx_core/model/upsampler/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """Latent upsampler model components."""
+
+ from ltx_core.model.upsampler.model import LatentUpsampler, upsample_video
+ from ltx_core.model.upsampler.model_configurator import LatentUpsamplerConfigurator
+
+ __all__ = [
+     "LatentUpsampler",
+     "LatentUpsamplerConfigurator",
+     "upsample_video",
+ ]