data-archetype committed on
Commit
8f5592f
·
verified ·
1 Parent(s): 6199cd7

Remove old capacitor_diffae package (renamed to fcdm_diffae)

Browse files
capacitor_diffae/__init__.py DELETED
@@ -1,33 +0,0 @@
1
- """CapacitorDiffAE: Standalone diffusion autoencoder with FCDM blocks.
2
-
3
- Capacitor DiffAE — a fast diffusion autoencoder with a 128-channel spatial
4
- bottleneck and a VP-parameterized diagonal Gaussian posterior. Built on FCDM
5
- (Fully Convolutional Diffusion Model) blocks with GRN and scale+gate AdaLN.
6
-
7
- Usage::
8
-
9
- from capacitor_diffae import CapacitorDiffAE, CapacitorDiffAEInferenceConfig
10
-
11
- model = CapacitorDiffAE.from_pretrained("path/to/weights", device="cuda")
12
-
13
- # Encode (returns posterior mode by default)
14
- latents = model.encode(images) # images: [B,3,H,W] in [-1,1]
15
-
16
- # Decode — PSNR-optimal (1 step, default)
17
- recon = model.decode(latents, height=H, width=W)
18
-
19
- # Decode — perceptual sharpness (10 steps + path-drop PDG)
20
- cfg = CapacitorDiffAEInferenceConfig(num_steps=10, pdg=True, pdg_strength=2.0)
21
- recon = model.decode(latents, height=H, width=W, inference_config=cfg)
22
- """
23
-
24
- from .config import CapacitorDiffAEConfig, CapacitorDiffAEInferenceConfig
25
- from .encoder import EncoderPosterior
26
- from .model import CapacitorDiffAE
27
-
28
- __all__ = [
29
- "CapacitorDiffAE",
30
- "CapacitorDiffAEConfig",
31
- "CapacitorDiffAEInferenceConfig",
32
- "EncoderPosterior",
33
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/adaln.py DELETED
@@ -1,50 +0,0 @@
1
- """Scale+Gate AdaLN (2-way) for FCDM decoder blocks."""
2
-
3
- from __future__ import annotations
4
-
5
- from torch import Tensor, nn
6
-
7
-
8
- class AdaLNScaleGateZeroProjector(nn.Module):
9
- """Packed 2-way AdaLN projection (SiLU -> Linear), zero-initialized.
10
-
11
- Outputs [B, 2*d_model] packed as (scale, gate).
12
- """
13
-
14
- def __init__(self, d_model: int, d_cond: int) -> None:
15
- super().__init__()
16
- self.d_model: int = int(d_model)
17
- self.d_cond: int = int(d_cond)
18
- self.act: nn.SiLU = nn.SiLU()
19
- self.proj: nn.Linear = nn.Linear(self.d_cond, 2 * self.d_model)
20
- nn.init.zeros_(self.proj.weight)
21
- nn.init.zeros_(self.proj.bias)
22
-
23
- def forward_activated(self, act_cond: Tensor) -> Tensor:
24
- """Return packed modulation for a pre-activated conditioning vector."""
25
- return self.proj(act_cond)
26
-
27
- def forward(self, cond: Tensor) -> Tensor:
28
- """Return packed modulation [B, 2*d_model]."""
29
- return self.forward_activated(self.act(cond))
30
-
31
-
32
- class AdaLNScaleGateZeroLowRankDelta(nn.Module):
33
- """Low-rank delta for 2-way AdaLN: down(d_cond -> rank) -> up(rank -> 2*d_model).
34
-
35
- Zero-initialized up projection preserves zero-output semantics at init.
36
- """
37
-
38
- def __init__(self, *, d_model: int, d_cond: int, rank: int) -> None:
39
- super().__init__()
40
- self.d_model: int = int(d_model)
41
- self.d_cond: int = int(d_cond)
42
- self.rank: int = int(rank)
43
- self.down: nn.Linear = nn.Linear(self.d_cond, self.rank, bias=False)
44
- self.up: nn.Linear = nn.Linear(self.rank, 2 * self.d_model, bias=False)
45
- nn.init.normal_(self.down.weight, mean=0.0, std=0.02)
46
- nn.init.zeros_(self.up.weight)
47
-
48
- def forward(self, act_cond: Tensor) -> Tensor:
49
- """Return packed delta modulation [B, 2*d_model]."""
50
- return self.up(self.down(act_cond))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/config.py DELETED
@@ -1,62 +0,0 @@
1
- """Frozen model architecture and user-tunable inference configuration."""
2
-
3
- from __future__ import annotations
4
-
5
- import json
6
- from dataclasses import asdict, dataclass
7
- from pathlib import Path
8
-
9
-
10
- @dataclass(frozen=True)
11
- class CapacitorDiffAEConfig:
12
- """Frozen model architecture config. Stored alongside weights as config.json."""
13
-
14
- in_channels: int = 3
15
- patch_size: int = 16
16
- model_dim: int = 896
17
- encoder_depth: int = 4
18
- decoder_depth: int = 8
19
- decoder_start_blocks: int = 2
20
- decoder_end_blocks: int = 2
21
- bottleneck_dim: int = 128
22
- mlp_ratio: float = 4.0
23
- depthwise_kernel_size: int = 7
24
- adaln_low_rank_rank: int = 128
25
- # Encoder posterior kind: "diagonal_gaussian" or "deterministic"
26
- bottleneck_posterior_kind: str = "diagonal_gaussian"
27
- # Post-bottleneck normalization: "channel_wise" or "disabled"
28
- bottleneck_norm_mode: str = "disabled"
29
- # VP diffusion schedule endpoints
30
- logsnr_min: float = -10.0
31
- logsnr_max: float = 10.0
32
- # Pixel-space noise std for VP diffusion initialization
33
- pixel_noise_std: float = 0.558
34
-
35
- def save(self, path: str | Path) -> None:
36
- """Save config as JSON."""
37
- p = Path(path)
38
- p.parent.mkdir(parents=True, exist_ok=True)
39
- p.write_text(json.dumps(asdict(self), indent=2) + "\n")
40
-
41
- @classmethod
42
- def load(cls, path: str | Path) -> CapacitorDiffAEConfig:
43
- """Load config from JSON."""
44
- data = json.loads(Path(path).read_text())
45
- return cls(**data)
46
-
47
-
48
- @dataclass
49
- class CapacitorDiffAEInferenceConfig:
50
- """User-tunable inference parameters with sensible defaults.
51
-
52
- PDG (Path-Drop Guidance) sharpens reconstructions by degrading conditioning
53
- in one pass and amplifying the difference. When enabled, uses 2 NFE per step.
54
- Recommended: ``pdg=True, pdg_strength=2.0, num_steps=10``.
55
- """
56
-
57
- num_steps: int = 1 # number of denoising steps (NFE)
58
- sampler: str = "ddim" # "ddim" or "dpmpp_2m"
59
- schedule: str = "linear" # "linear" or "cosine"
60
- pdg: bool = False # enable PDG for perceptual sharpening
61
- pdg_strength: float = 2.0 # CFG-like strength when pdg=True
62
- seed: int | None = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/decoder.py DELETED
@@ -1,169 +0,0 @@
1
- """Capacitor decoder: skip-concat topology with FCDM blocks and dual PDG.
2
-
3
- No outer RMSNorms (use_other_outer_rms_norms=False during training):
4
- norm_in, latent_norm, and norm_out are all absent.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import torch
10
- from torch import Tensor, nn
11
-
12
- from .adaln import AdaLNScaleGateZeroLowRankDelta, AdaLNScaleGateZeroProjector
13
- from .fcdm_block import FCDMBlock
14
- from .straight_through_encoder import Patchify
15
- from .time_embed import SinusoidalTimeEmbeddingMLP
16
-
17
-
18
- class Decoder(nn.Module):
19
- """VP diffusion decoder conditioned on encoder latents and timestep.
20
-
21
- Architecture (skip-concat, 2+4+2 default):
22
- Patchify x_t -> Fuse with upsampled z
23
- -> Start blocks (2) -> Middle blocks (4) -> Skip fuse -> End blocks (2)
24
- -> Conv1x1 -> PixelShuffle
25
-
26
- Dual PDG at inference:
27
- - Path drop: replace middle block output with ``path_drop_mask_feature``.
28
- - Token mask: replace a fraction of upsampled latent tokens with
29
- ``latent_mask_feature`` before fusion.
30
- """
31
-
32
- def __init__(
33
- self,
34
- in_channels: int,
35
- patch_size: int,
36
- model_dim: int,
37
- depth: int,
38
- start_block_count: int,
39
- end_block_count: int,
40
- bottleneck_dim: int,
41
- mlp_ratio: float,
42
- depthwise_kernel_size: int,
43
- adaln_low_rank_rank: int,
44
- ) -> None:
45
- super().__init__()
46
- self.patch_size = int(patch_size)
47
- self.model_dim = int(model_dim)
48
-
49
- # Input processing (no norm_in)
50
- self.patchify = Patchify(in_channels, patch_size, model_dim)
51
-
52
- # Latent conditioning path (no latent_norm)
53
- self.latent_up = nn.Conv2d(bottleneck_dim, model_dim, kernel_size=1, bias=True)
54
- self.fuse_in = nn.Conv2d(2 * model_dim, model_dim, kernel_size=1, bias=True)
55
-
56
- # Time embedding
57
- self.time_embed = SinusoidalTimeEmbeddingMLP(model_dim)
58
-
59
- # 2-way AdaLN: shared base projector + per-block low-rank deltas
60
- self.adaln_base = AdaLNScaleGateZeroProjector(
61
- d_model=model_dim, d_cond=model_dim
62
- )
63
- self.adaln_deltas = nn.ModuleList(
64
- [
65
- AdaLNScaleGateZeroLowRankDelta(
66
- d_model=model_dim, d_cond=model_dim, rank=adaln_low_rank_rank
67
- )
68
- for _ in range(depth)
69
- ]
70
- )
71
-
72
- # Block layout: start + middle + end
73
- middle_count = depth - start_block_count - end_block_count
74
- self._middle_start_idx = start_block_count
75
- self._end_start_idx = start_block_count + middle_count
76
-
77
- def _make_blocks(count: int) -> nn.ModuleList:
78
- return nn.ModuleList(
79
- [
80
- FCDMBlock(
81
- model_dim,
82
- mlp_ratio,
83
- depthwise_kernel_size=depthwise_kernel_size,
84
- use_external_adaln=True,
85
- )
86
- for _ in range(count)
87
- ]
88
- )
89
-
90
- self.start_blocks = _make_blocks(start_block_count)
91
- self.middle_blocks = _make_blocks(middle_count)
92
- self.fuse_skip = nn.Conv2d(2 * model_dim, model_dim, kernel_size=1, bias=True)
93
- self.end_blocks = _make_blocks(end_block_count)
94
-
95
- # Learned mask feature for path-drop PDG
96
- self.path_drop_mask_feature = nn.Parameter(torch.zeros((1, model_dim, 1, 1)))
97
-
98
- # Output head (no norm_out)
99
- self.out_proj = nn.Conv2d(
100
- model_dim, in_channels * (patch_size**2), kernel_size=1, bias=True
101
- )
102
- self.unpatchify = nn.PixelShuffle(patch_size)
103
-
104
- def _adaln_m_for_layer(self, cond: Tensor, layer_idx: int) -> Tensor:
105
- """Compute packed AdaLN modulation = shared_base + per-layer delta."""
106
- act = self.adaln_base.act(cond)
107
- base_m = self.adaln_base.forward_activated(act)
108
- delta_m = self.adaln_deltas[layer_idx](act)
109
- return base_m + delta_m
110
-
111
- def _run_blocks(
112
- self, blocks: nn.ModuleList, x: Tensor, cond: Tensor, start_index: int
113
- ) -> Tensor:
114
- """Run a group of decoder blocks with per-block AdaLN modulation."""
115
- for local_idx, block in enumerate(blocks):
116
- adaln_m = self._adaln_m_for_layer(cond, layer_idx=start_index + local_idx)
117
- x = block(x, adaln_m=adaln_m)
118
- return x
119
-
120
- def forward(
121
- self,
122
- x_t: Tensor,
123
- t: Tensor,
124
- latents: Tensor,
125
- *,
126
- drop_middle_blocks: bool = False,
127
- ) -> Tensor:
128
- """Single decoder forward pass.
129
-
130
- Args:
131
- x_t: Noised image [B, C, H, W].
132
- t: Timestep [B] in [0, 1].
133
- latents: Encoder latents [B, bottleneck_dim, h, w].
134
- drop_middle_blocks: Replace middle block output with mask feature (PDG).
135
-
136
- Returns:
137
- x0 prediction [B, C, H, W].
138
- """
139
- x_feat = self.patchify(x_t)
140
- z_up = self.latent_up(latents)
141
-
142
- fused = torch.cat([x_feat, z_up], dim=1)
143
- fused = self.fuse_in(fused)
144
-
145
- cond = self.time_embed(t.to(torch.float32).to(device=x_t.device))
146
-
147
- start_out = self._run_blocks(self.start_blocks, fused, cond, start_index=0)
148
-
149
- if drop_middle_blocks:
150
- middle_out = self.path_drop_mask_feature.to(
151
- device=x_t.device, dtype=x_t.dtype
152
- ).expand_as(start_out)
153
- else:
154
- middle_out = self._run_blocks(
155
- self.middle_blocks,
156
- start_out,
157
- cond,
158
- start_index=self._middle_start_idx,
159
- )
160
-
161
- skip_fused = torch.cat([start_out, middle_out], dim=1)
162
- skip_fused = self.fuse_skip(skip_fused)
163
-
164
- end_out = self._run_blocks(
165
- self.end_blocks, skip_fused, cond, start_index=self._end_start_idx
166
- )
167
-
168
- patches = self.out_proj(end_out)
169
- return self.unpatchify(patches)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/encoder.py DELETED
@@ -1,129 +0,0 @@
1
- """Capacitor encoder: patchify -> FCDMBlocks -> diagonal Gaussian posterior.
2
-
3
- No input RMSNorm (use_other_outer_rms_norms=False during training).
4
- Post-bottleneck RMSNorm (affine=False) on the mean branch.
5
- Encoder outputs posterior mode by default: alpha * RMSNorm(mean).
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- from dataclasses import dataclass
11
-
12
- import torch
13
- from torch import Tensor, nn
14
-
15
- from .fcdm_block import FCDMBlock
16
- from .norms import ChannelWiseRMSNorm
17
- from .straight_through_encoder import Patchify
18
-
19
-
20
- @dataclass(frozen=True)
21
- class EncoderPosterior:
22
- """VP-parameterized diagonal Gaussian posterior.
23
-
24
- mean: Clean signal branch mu [B, bottleneck_dim, h, w]
25
- logsnr: Per-element log signal-to-noise ratio [B, bottleneck_dim, h, w]
26
- """
27
-
28
- mean: Tensor
29
- logsnr: Tensor
30
-
31
- @property
32
- def alpha(self) -> Tensor:
33
- """VP signal coefficient: sqrt(sigmoid(logsnr))."""
34
- return torch.sigmoid(self.logsnr).sqrt()
35
-
36
- @property
37
- def sigma(self) -> Tensor:
38
- """VP noise coefficient: sqrt(sigmoid(-logsnr))."""
39
- return torch.sigmoid(-self.logsnr).sqrt()
40
-
41
- def mode(self) -> Tensor:
42
- """Posterior mode in token space: alpha * mean."""
43
- return self.alpha.to(dtype=self.mean.dtype) * self.mean
44
-
45
- def sample(self, *, generator: torch.Generator | None = None) -> Tensor:
46
- """Sample from posterior: alpha * mean + sigma * eps."""
47
- eps = torch.randn_like(self.mean, generator=generator) # type: ignore[call-overload]
48
- alpha = self.alpha.to(dtype=self.mean.dtype)
49
- sigma = self.sigma.to(dtype=self.mean.dtype)
50
- return alpha * self.mean + sigma * eps
51
-
52
-
53
- class Encoder(nn.Module):
54
- """Encoder: Image [B,3,H,W] -> latents [B,bottleneck_dim,h,w].
55
-
56
- With diagonal_gaussian posterior, the to_bottleneck projection outputs
57
- 2 * bottleneck_dim channels, split into mean and logsnr. The default
58
- encode() returns the posterior mode: alpha * RMSNorm(mean).
59
- """
60
-
61
- def __init__(
62
- self,
63
- in_channels: int,
64
- patch_size: int,
65
- model_dim: int,
66
- depth: int,
67
- bottleneck_dim: int,
68
- mlp_ratio: float,
69
- depthwise_kernel_size: int,
70
- bottleneck_posterior_kind: str = "diagonal_gaussian",
71
- bottleneck_norm_mode: str = "disabled",
72
- ) -> None:
73
- super().__init__()
74
- self.bottleneck_dim = int(bottleneck_dim)
75
- self.bottleneck_posterior_kind = bottleneck_posterior_kind
76
- self.bottleneck_norm_mode = bottleneck_norm_mode
77
- self.patchify = Patchify(in_channels, patch_size, model_dim)
78
- self.blocks = nn.ModuleList(
79
- [
80
- FCDMBlock(
81
- model_dim,
82
- mlp_ratio,
83
- depthwise_kernel_size=depthwise_kernel_size,
84
- use_external_adaln=False,
85
- )
86
- for _ in range(depth)
87
- ]
88
- )
89
- out_dim = (
90
- 2 * bottleneck_dim
91
- if bottleneck_posterior_kind == "diagonal_gaussian"
92
- else bottleneck_dim
93
- )
94
- self.to_bottleneck = nn.Conv2d(model_dim, out_dim, kernel_size=1, bias=True)
95
- if bottleneck_norm_mode == "channel_wise":
96
- self.norm_out = ChannelWiseRMSNorm(bottleneck_dim, eps=1e-6, affine=False)
97
- else:
98
- self.norm_out = nn.Identity()
99
-
100
- def encode_posterior(self, images: Tensor) -> EncoderPosterior:
101
- """Encode images and return the full posterior (mean + logsnr).
102
-
103
- Only valid when bottleneck_posterior_kind == "diagonal_gaussian".
104
- """
105
- z = self.patchify(images)
106
- for block in self.blocks:
107
- z = block(z)
108
- projection = self.to_bottleneck(z)
109
- mean, logsnr = projection.chunk(2, dim=1)
110
- mean = self.norm_out(mean)
111
- return EncoderPosterior(mean=mean, logsnr=logsnr)
112
-
113
- def forward(self, images: Tensor) -> Tensor:
114
- """Encode images [B,3,H,W] in [-1,1] to latents [B,bottleneck_dim,h,w].
115
-
116
- Returns posterior mode (alpha * mean) for diagonal_gaussian,
117
- or deterministic latents otherwise.
118
- """
119
- z = self.patchify(images)
120
- for block in self.blocks:
121
- z = block(z)
122
- projection = self.to_bottleneck(z)
123
- if self.bottleneck_posterior_kind == "diagonal_gaussian":
124
- mean, logsnr = projection.chunk(2, dim=1)
125
- mean = self.norm_out(mean)
126
- alpha = torch.sigmoid(logsnr).sqrt().to(dtype=mean.dtype)
127
- return alpha * mean
128
- z = self.norm_out(projection)
129
- return z
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/fcdm_block.py DELETED
@@ -1,103 +0,0 @@
1
- """FCDM block: ConvNeXt-style conv block with GRN and scale+gate AdaLN."""
2
-
3
- from __future__ import annotations
4
-
5
- import torch
6
- import torch.nn.functional as F
7
- from torch import Tensor, nn
8
-
9
- from .norms import ChannelWiseRMSNorm
10
-
11
-
12
- class GRN(nn.Module):
13
- """Global Response Normalization for NCHW tensors."""
14
-
15
- def __init__(self, channels: int, *, eps: float = 1e-6) -> None:
16
- super().__init__()
17
- self.eps: float = float(eps)
18
- c = int(channels)
19
- self.gamma = nn.Parameter(torch.zeros((1, c, 1, 1), dtype=torch.float32))
20
- self.beta = nn.Parameter(torch.zeros((1, c, 1, 1), dtype=torch.float32))
21
-
22
- def forward(self, x: Tensor) -> Tensor:
23
- g = torch.linalg.vector_norm(x, ord=2, dim=(2, 3), keepdim=True)
24
- g_fp32 = g.to(dtype=torch.float32)
25
- n = (g_fp32 / (g_fp32.mean(dim=1, keepdim=True) + self.eps)).to(dtype=x.dtype)
26
- gamma = self.gamma.to(device=x.device, dtype=x.dtype)
27
- beta = self.beta.to(device=x.device, dtype=x.dtype)
28
- return gamma * (x * n) + beta + x
29
-
30
-
31
- class FCDMBlock(nn.Module):
32
- """ConvNeXt-style block with scale+gate AdaLN and GRN.
33
-
34
- Two modes:
35
- - Unconditioned (encoder): uses learned layer-scale for near-identity init.
36
- - External AdaLN (decoder): receives packed [B, 2*C] modulation (scale, gate).
37
- The gate is applied raw (no tanh).
38
- """
39
-
40
- def __init__(
41
- self,
42
- channels: int,
43
- mlp_ratio: float,
44
- *,
45
- depthwise_kernel_size: int = 7,
46
- use_external_adaln: bool = False,
47
- norm_eps: float = 1e-6,
48
- layer_scale_init: float = 1e-3,
49
- ) -> None:
50
- super().__init__()
51
- self.channels: int = int(channels)
52
- self.mlp_ratio: float = float(mlp_ratio)
53
-
54
- self.dwconv = nn.Conv2d(
55
- channels,
56
- channels,
57
- kernel_size=depthwise_kernel_size,
58
- padding=depthwise_kernel_size // 2,
59
- stride=1,
60
- groups=channels,
61
- bias=True,
62
- )
63
- self.norm = ChannelWiseRMSNorm(channels, eps=float(norm_eps), affine=False)
64
- hidden = max(int(float(channels) * float(mlp_ratio)), 1)
65
- self.pwconv1 = nn.Conv2d(channels, hidden, kernel_size=1, bias=True)
66
- self.grn = GRN(hidden, eps=1e-6)
67
- self.pwconv2 = nn.Conv2d(hidden, channels, kernel_size=1, bias=True)
68
-
69
- if not use_external_adaln:
70
- self.layer_scale = nn.Parameter(
71
- torch.full((channels,), float(layer_scale_init))
72
- )
73
- else:
74
- self.register_parameter("layer_scale", None)
75
-
76
- def forward(self, x: Tensor, *, adaln_m: Tensor | None = None) -> Tensor:
77
- b, c, _, _ = x.shape
78
-
79
- if adaln_m is not None:
80
- m = adaln_m.to(device=x.device, dtype=x.dtype)
81
- scale, gate = m.chunk(2, dim=-1)
82
- else:
83
- scale = gate = None
84
-
85
- h = self.dwconv(x)
86
- h = self.norm(h)
87
-
88
- if scale is not None:
89
- h = h * (1.0 + scale.view(b, c, 1, 1))
90
-
91
- h = self.pwconv1(h)
92
- h = F.gelu(h)
93
- h = self.grn(h)
94
- h = self.pwconv2(h)
95
-
96
- if gate is not None:
97
- gate_view = gate.view(b, c, 1, 1)
98
- else:
99
- gate_view = self.layer_scale.view(1, c, 1, 1).to( # type: ignore[union-attr]
100
- device=h.device, dtype=h.dtype
101
- )
102
-
103
- return x + gate_view * h
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/model.py DELETED
@@ -1,364 +0,0 @@
1
- """CapacitorDiffAE: standalone HuggingFace-compatible diffusion autoencoder."""
2
-
3
- from __future__ import annotations
4
-
5
- from pathlib import Path
6
-
7
- import torch
8
- from torch import Tensor, nn
9
-
10
- from .config import CapacitorDiffAEConfig, CapacitorDiffAEInferenceConfig
11
- from .decoder import Decoder
12
- from .encoder import Encoder, EncoderPosterior
13
- from .samplers import run_ddim, run_dpmpp_2m
14
- from .vp_diffusion import get_schedule, make_initial_state, sample_noise
15
-
16
-
17
- def _resolve_model_dir(
18
- path_or_repo_id: str | Path,
19
- *,
20
- revision: str | None,
21
- cache_dir: str | Path | None,
22
- ) -> Path:
23
- """Resolve a local path or HuggingFace Hub repo ID to a local directory."""
24
- local = Path(path_or_repo_id)
25
- if local.is_dir():
26
- return local
27
- repo_id = str(path_or_repo_id)
28
- try:
29
- from huggingface_hub import snapshot_download
30
- except ImportError:
31
- raise ImportError(
32
- f"'{repo_id}' is not an existing local directory. "
33
- "To download from HuggingFace Hub, install huggingface_hub: "
34
- "pip install huggingface_hub"
35
- )
36
- cache_dir_str = str(cache_dir) if cache_dir is not None else None
37
- local_dir = snapshot_download(
38
- repo_id,
39
- revision=revision,
40
- cache_dir=cache_dir_str,
41
- )
42
- return Path(local_dir)
43
-
44
-
45
- class CapacitorDiffAE(nn.Module):
46
- """Standalone Capacitor DiffAE model for HuggingFace distribution.
47
-
48
- A diffusion autoencoder built on FCDM (Fully Convolutional Diffusion Model)
49
- blocks. Encodes images to compact 128-channel spatial latents via a
50
- VP-parameterized diagonal Gaussian posterior, and decodes them back via
51
- iterative VP diffusion with a skip-concat decoder.
52
-
53
- Usage::
54
-
55
- model = CapacitorDiffAE.from_pretrained("path/to/weights")
56
- model = model.to("cuda", dtype=torch.bfloat16)
57
-
58
- # Encode (returns posterior mode by default)
59
- latents = model.encode(images) # images: [B,3,H,W] in [-1,1]
60
-
61
- # Decode (1 step by default — PSNR-optimal)
62
- recon = model.decode(latents, height=H, width=W)
63
-
64
- # Reconstruct (encode + 1-step decode)
65
- recon = model.reconstruct(images)
66
- """
67
-
68
- _LATENT_NORM_EPS: float = 1e-4
69
-
70
- def __init__(self, config: CapacitorDiffAEConfig) -> None:
71
- super().__init__()
72
- self.config = config
73
-
74
- # Latent running stats for whitening/dewhitening
75
- self.register_buffer(
76
- "latent_norm_running_mean",
77
- torch.zeros((config.bottleneck_dim,), dtype=torch.float32),
78
- )
79
- self.register_buffer(
80
- "latent_norm_running_var",
81
- torch.ones((config.bottleneck_dim,), dtype=torch.float32),
82
- )
83
-
84
- self.encoder = Encoder(
85
- in_channels=config.in_channels,
86
- patch_size=config.patch_size,
87
- model_dim=config.model_dim,
88
- depth=config.encoder_depth,
89
- bottleneck_dim=config.bottleneck_dim,
90
- mlp_ratio=config.mlp_ratio,
91
- depthwise_kernel_size=config.depthwise_kernel_size,
92
- bottleneck_posterior_kind=config.bottleneck_posterior_kind,
93
- bottleneck_norm_mode=config.bottleneck_norm_mode,
94
- )
95
-
96
- self.decoder = Decoder(
97
- in_channels=config.in_channels,
98
- patch_size=config.patch_size,
99
- model_dim=config.model_dim,
100
- depth=config.decoder_depth,
101
- start_block_count=config.decoder_start_blocks,
102
- end_block_count=config.decoder_end_blocks,
103
- bottleneck_dim=config.bottleneck_dim,
104
- mlp_ratio=config.mlp_ratio,
105
- depthwise_kernel_size=config.depthwise_kernel_size,
106
- adaln_low_rank_rank=config.adaln_low_rank_rank,
107
- )
108
-
109
- @classmethod
110
- def from_pretrained(
111
- cls,
112
- path_or_repo_id: str | Path,
113
- *,
114
- dtype: torch.dtype = torch.bfloat16,
115
- device: str | torch.device = "cpu",
116
- revision: str | None = None,
117
- cache_dir: str | Path | None = None,
118
- ) -> CapacitorDiffAE:
119
- """Load a pretrained model from a local directory or HuggingFace Hub.
120
-
121
- The directory (or repo) should contain:
122
- - config.json: Model architecture config.
123
- - model.safetensors (preferred) or model.pt: Model weights.
124
-
125
- Args:
126
- path_or_repo_id: Local directory path or HuggingFace Hub repo ID.
127
- dtype: Load weights in this dtype (float32 or bfloat16).
128
- device: Target device.
129
- revision: Git revision for Hub downloads.
130
- cache_dir: Where to cache Hub downloads.
131
-
132
- Returns:
133
- Loaded model in eval mode.
134
- """
135
- model_dir = _resolve_model_dir(
136
- path_or_repo_id, revision=revision, cache_dir=cache_dir
137
- )
138
- config = CapacitorDiffAEConfig.load(model_dir / "config.json")
139
- model = cls(config)
140
-
141
- safetensors_path = model_dir / "model.safetensors"
142
- pt_path = model_dir / "model.pt"
143
-
144
- if safetensors_path.exists():
145
- try:
146
- from safetensors.torch import load_file
147
-
148
- state_dict = load_file(str(safetensors_path), device=str(device))
149
- except ImportError:
150
- raise ImportError(
151
- "safetensors package required to load .safetensors files. "
152
- "Install with: pip install safetensors"
153
- )
154
- elif pt_path.exists():
155
- state_dict = torch.load(
156
- str(pt_path), map_location=device, weights_only=True
157
- )
158
- else:
159
- raise FileNotFoundError(
160
- f"No model weights found in {model_dir}. "
161
- "Expected model.safetensors or model.pt."
162
- )
163
-
164
- model.load_state_dict(state_dict)
165
- model = model.to(dtype=dtype, device=torch.device(device))
166
- model.eval()
167
- return model
168
-
169
- def _latent_norm_stats(self) -> tuple[Tensor, Tensor]:
170
- """Return (mean, std) tensors for latent whitening, shaped [1,C,1,1]."""
171
- mean = self.latent_norm_running_mean.view(1, -1, 1, 1)
172
- var = self.latent_norm_running_var.view(1, -1, 1, 1)
173
- std = torch.sqrt(var.to(torch.float32) + self._LATENT_NORM_EPS)
174
- return mean.to(torch.float32), std
175
-
176
- def whiten(self, latents: Tensor) -> Tensor:
177
- """Whiten encoder latents using per-channel running stats.
178
-
179
- Use this before passing latents to a downstream latent-space
180
- diffusion model. The whitened latents have approximately zero mean
181
- and unit variance per channel.
182
-
183
- Args:
184
- latents: [B, bottleneck_dim, h, w] raw encoder output.
185
-
186
- Returns:
187
- Whitened latents [B, bottleneck_dim, h, w] in float32.
188
- """
189
- z = latents.to(torch.float32)
190
- mean, std = self._latent_norm_stats()
191
- return (z - mean.to(device=z.device)) / std.to(device=z.device)
192
-
193
- def dewhiten(self, latents: Tensor) -> Tensor:
194
- """Undo whitening to recover raw encoder latent scale.
195
-
196
- Use this before passing whitened latents back to ``decode()``.
197
-
198
- Args:
199
- latents: [B, bottleneck_dim, h, w] whitened latents.
200
-
201
- Returns:
202
- Dewhitened latents [B, bottleneck_dim, h, w] in float32.
203
- """
204
- z = latents.to(torch.float32)
205
- mean, std = self._latent_norm_stats()
206
- return z * std.to(device=z.device) + mean.to(device=z.device)
207
-
208
- def encode(self, images: Tensor) -> Tensor:
209
- """Encode images to whitened latents (posterior mode).
210
-
211
- Returns latents whitened using per-channel running stats, ready for
212
- use by downstream latent-space diffusion models.
213
-
214
- Args:
215
- images: [B, 3, H, W] in [-1, 1], H and W divisible by patch_size.
216
-
217
- Returns:
218
- Whitened latents [B, bottleneck_dim, H/patch, W/patch].
219
- """
220
- try:
221
- model_dtype = next(self.parameters()).dtype
222
- except StopIteration:
223
- model_dtype = torch.float32
224
- z = self.encoder(images.to(dtype=model_dtype))
225
- return self.whiten(z).to(dtype=model_dtype)
226
-
227
- def encode_posterior(self, images: Tensor) -> EncoderPosterior:
228
- """Encode images and return the full posterior (mean + logsnr).
229
-
230
- Args:
231
- images: [B, 3, H, W] in [-1, 1], H and W divisible by patch_size.
232
-
233
- Returns:
234
- EncoderPosterior with mean and logsnr tensors.
235
- """
236
- try:
237
- model_dtype = next(self.parameters()).dtype
238
- except StopIteration:
239
- model_dtype = torch.float32
240
- return self.encoder.encode_posterior(images.to(dtype=model_dtype))
241
-
242
- @torch.no_grad()
243
- def decode(
244
- self,
245
- latents: Tensor,
246
- height: int,
247
- width: int,
248
- *,
249
- inference_config: CapacitorDiffAEInferenceConfig | None = None,
250
- ) -> Tensor:
251
- """Decode whitened latents to images via VP diffusion.
252
-
253
- Latents are dewhitened internally before being passed to the decoder.
254
-
255
- Args:
256
- latents: [B, bottleneck_dim, h, w] whitened encoder latents.
257
- height: Output image height (divisible by patch_size).
258
- width: Output image width (divisible by patch_size).
259
- inference_config: Optional inference parameters.
260
-
261
- Returns:
262
- Reconstructed images [B, 3, H, W] in float32.
263
- """
264
- cfg = inference_config or CapacitorDiffAEInferenceConfig()
265
- config = self.config
266
- batch = int(latents.shape[0])
267
- device = latents.device
268
-
269
- try:
270
- model_dtype = next(self.parameters()).dtype
271
- except StopIteration:
272
- model_dtype = torch.float32
273
-
274
- # Dewhiten back to raw encoder scale for the decoder
275
- latents = self.dewhiten(latents).to(dtype=model_dtype)
276
-
277
- if height % config.patch_size != 0 or width % config.patch_size != 0:
278
- raise ValueError(
279
- f"height={height} and width={width} must be divisible by "
280
- f"patch_size={config.patch_size}"
281
- )
282
-
283
- shape = (batch, config.in_channels, height, width)
284
- noise = sample_noise(
285
- shape,
286
- noise_std=config.pixel_noise_std,
287
- seed=cfg.seed,
288
- device=torch.device("cpu"),
289
- dtype=torch.float32,
290
- )
291
-
292
- schedule = get_schedule(cfg.schedule, cfg.num_steps).to(device=device)
293
- initial_state = make_initial_state(
294
- noise=noise.to(device=device),
295
- t_start=schedule[0:1],
296
- logsnr_min=config.logsnr_min,
297
- logsnr_max=config.logsnr_max,
298
- )
299
-
300
- device_type = "cuda" if device.type == "cuda" else "cpu"
301
- with torch.autocast(device_type=device_type, enabled=False):
302
- latents_in = latents.to(device=device)
303
-
304
- def _forward_fn(
305
- x_t: Tensor,
306
- t: Tensor,
307
- latents: Tensor,
308
- *,
309
- drop_middle_blocks: bool = False,
310
- mask_latent_tokens: bool = False,
311
- ) -> Tensor:
312
- return self.decoder(
313
- x_t.to(dtype=model_dtype),
314
- t,
315
- latents.to(dtype=model_dtype),
316
- drop_middle_blocks=drop_middle_blocks,
317
- )
318
-
319
- pdg_mode = "path_drop" if cfg.pdg else "disabled"
320
-
321
- if cfg.sampler == "ddim":
322
- sampler_fn = run_ddim
323
- elif cfg.sampler == "dpmpp_2m":
324
- sampler_fn = run_dpmpp_2m
325
- else:
326
- raise ValueError(
327
- f"Unsupported sampler: {cfg.sampler!r}. Use 'ddim' or 'dpmpp_2m'."
328
- )
329
-
330
- result = sampler_fn(
331
- forward_fn=_forward_fn,
332
- initial_state=initial_state,
333
- schedule=schedule,
334
- latents=latents_in,
335
- logsnr_min=config.logsnr_min,
336
- logsnr_max=config.logsnr_max,
337
- pdg_mode=pdg_mode,
338
- pdg_strength=cfg.pdg_strength,
339
- device=device,
340
- )
341
-
342
- return result
343
-
344
- @torch.no_grad()
345
- def reconstruct(
346
- self,
347
- images: Tensor,
348
- *,
349
- inference_config: CapacitorDiffAEInferenceConfig | None = None,
350
- ) -> Tensor:
351
- """Encode then decode. Convenience wrapper.
352
-
353
- Args:
354
- images: [B, 3, H, W] in [-1, 1].
355
- inference_config: Optional inference parameters.
356
-
357
- Returns:
358
- Reconstructed images [B, 3, H, W] in float32.
359
- """
360
- latents = self.encode(images)
361
- _, _, h, w = images.shape
362
- return self.decode(
363
- latents, height=h, width=w, inference_config=inference_config
364
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/norms.py DELETED
@@ -1,39 +0,0 @@
1
- """Channel-wise RMSNorm for NCHW tensors."""
2
-
3
- from __future__ import annotations
4
-
5
- import torch
6
- from torch import Tensor, nn
7
-
8
-
9
- class ChannelWiseRMSNorm(nn.Module):
10
- """Channel-wise RMSNorm with float32 reduction for numerical stability.
11
-
12
- Normalizes across channels per spatial position. Supports optional
13
- per-channel affine weight and bias.
14
- """
15
-
16
- def __init__(self, channels: int, eps: float = 1e-6, affine: bool = True) -> None:
17
- super().__init__()
18
- self.channels: int = int(channels)
19
- self._eps: float = float(eps)
20
- if affine:
21
- self.weight = nn.Parameter(torch.ones(self.channels))
22
- self.bias = nn.Parameter(torch.zeros(self.channels))
23
- else:
24
- self.register_parameter("weight", None)
25
- self.register_parameter("bias", None)
26
-
27
- def forward(self, x: Tensor) -> Tensor:
28
- if x.dim() < 2:
29
- return x
30
- # Float32 accumulation for stability
31
- ms = torch.mean(torch.square(x), dim=1, keepdim=True, dtype=torch.float32)
32
- inv_rms = torch.rsqrt(ms + self._eps)
33
- y = x * inv_rms
34
- if self.weight is not None:
35
- shape = (1, -1) + (1,) * (x.dim() - 2)
36
- y = y * self.weight.view(shape).to(dtype=y.dtype)
37
- if self.bias is not None:
38
- y = y + self.bias.view(shape).to(dtype=y.dtype)
39
- return y.to(dtype=x.dtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/samplers.py DELETED
@@ -1,263 +0,0 @@
1
- """DDIM and DPM++2M samplers for VP diffusion with dual PDG support."""
2
-
3
- from __future__ import annotations
4
-
5
- from typing import Protocol
6
-
7
- import torch
8
- from torch import Tensor
9
-
10
- from .vp_diffusion import (
11
- alpha_sigma_from_logsnr,
12
- broadcast_time_like,
13
- shifted_cosine_interpolated_logsnr_from_t,
14
- )
15
-
16
-
17
- class DecoderForwardFn(Protocol):
18
- """Callable that predicts x0 from (x_t, t, latents) with dual PDG flags."""
19
-
20
- def __call__(
21
- self,
22
- x_t: Tensor,
23
- t: Tensor,
24
- latents: Tensor,
25
- *,
26
- drop_middle_blocks: bool = False,
27
- mask_latent_tokens: bool = False,
28
- ) -> Tensor: ...
29
-
30
-
31
- def _reconstruct_eps_from_x0(
32
- *, x_t: Tensor, x0_hat: Tensor, alpha: Tensor, sigma: Tensor
33
- ) -> Tensor:
34
- """Reconstruct eps_hat from (x_t, x0_hat) under VP parameterization.
35
-
36
- eps_hat = (x_t - alpha * x0_hat) / sigma. All float32.
37
- """
38
- alpha_view = broadcast_time_like(alpha, x_t).to(dtype=torch.float32)
39
- sigma_view = broadcast_time_like(sigma, x_t).to(dtype=torch.float32)
40
- x_t_f32 = x_t.to(torch.float32)
41
- x0_f32 = x0_hat.to(torch.float32)
42
- return (x_t_f32 - alpha_view * x0_f32) / sigma_view
43
-
44
-
45
- def _ddim_step(
46
- *,
47
- x0_hat: Tensor,
48
- eps_hat: Tensor,
49
- alpha_next: Tensor,
50
- sigma_next: Tensor,
51
- ref: Tensor,
52
- ) -> Tensor:
53
- """DDIM step: x_next = alpha_next * x0_hat + sigma_next * eps_hat."""
54
- a = broadcast_time_like(alpha_next, ref).to(dtype=torch.float32)
55
- s = broadcast_time_like(sigma_next, ref).to(dtype=torch.float32)
56
- return a * x0_hat + s * eps_hat
57
-
58
-
59
- def _predict_with_pdg(
60
- forward_fn: DecoderForwardFn,
61
- state: Tensor,
62
- t_vec: Tensor,
63
- latents: Tensor,
64
- *,
65
- pdg_mode: str,
66
- pdg_strength: float,
67
- ) -> Tensor:
68
- """Run decoder forward with optional PDG guidance.
69
-
70
- Args:
71
- forward_fn: Decoder forward function.
72
- state: Current noised state [B, C, H, W].
73
- t_vec: Timestep vector [B].
74
- latents: Encoder latents.
75
- pdg_mode: "disabled", "path_drop", or "token_mask".
76
- pdg_strength: CFG-like strength for PDG.
77
-
78
- Returns:
79
- x0_hat prediction in float32.
80
- """
81
- if pdg_mode == "path_drop":
82
- x0_uncond = forward_fn(state, t_vec, latents, drop_middle_blocks=True).to(
83
- torch.float32
84
- )
85
- x0_cond = forward_fn(state, t_vec, latents, drop_middle_blocks=False).to(
86
- torch.float32
87
- )
88
- return x0_uncond + pdg_strength * (x0_cond - x0_uncond)
89
- elif pdg_mode == "token_mask":
90
- x0_uncond = forward_fn(state, t_vec, latents, mask_latent_tokens=True).to(
91
- torch.float32
92
- )
93
- x0_cond = forward_fn(state, t_vec, latents, mask_latent_tokens=False).to(
94
- torch.float32
95
- )
96
- return x0_uncond + pdg_strength * (x0_cond - x0_uncond)
97
- else:
98
- return forward_fn(state, t_vec, latents, drop_middle_blocks=False).to(
99
- torch.float32
100
- )
101
-
102
-
103
- def run_ddim(
104
- *,
105
- forward_fn: DecoderForwardFn,
106
- initial_state: Tensor,
107
- schedule: Tensor,
108
- latents: Tensor,
109
- logsnr_min: float,
110
- logsnr_max: float,
111
- log_change_high: float = 0.0,
112
- log_change_low: float = 0.0,
113
- pdg_mode: str = "disabled",
114
- pdg_strength: float = 1.5,
115
- device: torch.device | None = None,
116
- ) -> Tensor:
117
- """Run DDIM sampling loop with dual PDG support.
118
-
119
- Args:
120
- forward_fn: Decoder forward function (x_t, t, latents) -> x0_hat.
121
- initial_state: Starting noised state [B, C, H, W] in float32.
122
- schedule: Descending t-schedule [num_steps] in [0, 1].
123
- latents: Encoder latents [B, bottleneck_dim, h, w].
124
- logsnr_min, logsnr_max: VP schedule endpoints.
125
- log_change_high, log_change_low: Shifted-cosine schedule parameters.
126
- pdg_mode: "disabled", "path_drop", or "token_mask".
127
- pdg_strength: CFG-like strength for PDG.
128
- device: Target device.
129
-
130
- Returns:
131
- Denoised samples [B, C, H, W] in float32.
132
- """
133
- run_device = device or initial_state.device
134
- batch_size = int(initial_state.shape[0])
135
- state = initial_state.to(device=run_device, dtype=torch.float32)
136
-
137
- # Precompute logSNR, alpha, sigma for all schedule points
138
- lmb = shifted_cosine_interpolated_logsnr_from_t(
139
- schedule.to(device=run_device),
140
- logsnr_min=logsnr_min,
141
- logsnr_max=logsnr_max,
142
- log_change_high=log_change_high,
143
- log_change_low=log_change_low,
144
- )
145
- alpha_sched, sigma_sched = alpha_sigma_from_logsnr(lmb)
146
-
147
- for i in range(int(schedule.numel()) - 1):
148
- t_i = schedule[i]
149
- a_t = alpha_sched[i].expand(batch_size)
150
- s_t = sigma_sched[i].expand(batch_size)
151
- a_next = alpha_sched[i + 1].expand(batch_size)
152
- s_next = sigma_sched[i + 1].expand(batch_size)
153
-
154
- # Model prediction with optional PDG
155
- t_vec = t_i.expand(batch_size).to(device=run_device, dtype=torch.float32)
156
- x0_hat = _predict_with_pdg(
157
- forward_fn,
158
- state,
159
- t_vec,
160
- latents,
161
- pdg_mode=pdg_mode,
162
- pdg_strength=pdg_strength,
163
- )
164
-
165
- eps_hat = _reconstruct_eps_from_x0(
166
- x_t=state, x0_hat=x0_hat, alpha=a_t, sigma=s_t
167
- )
168
- state = _ddim_step(
169
- x0_hat=x0_hat,
170
- eps_hat=eps_hat,
171
- alpha_next=a_next,
172
- sigma_next=s_next,
173
- ref=state,
174
- )
175
-
176
- return state
177
-
178
-
179
- def run_dpmpp_2m(
180
- *,
181
- forward_fn: DecoderForwardFn,
182
- initial_state: Tensor,
183
- schedule: Tensor,
184
- latents: Tensor,
185
- logsnr_min: float,
186
- logsnr_max: float,
187
- log_change_high: float = 0.0,
188
- log_change_low: float = 0.0,
189
- pdg_mode: str = "disabled",
190
- pdg_strength: float = 1.5,
191
- device: torch.device | None = None,
192
- ) -> Tensor:
193
- """Run DPM++2M sampling loop with dual PDG support.
194
-
195
- Multi-step solver using exponential integrator formulation in half-lambda space.
196
- """
197
- run_device = device or initial_state.device
198
- batch_size = int(initial_state.shape[0])
199
- state = initial_state.to(device=run_device, dtype=torch.float32)
200
-
201
- # Precompute logSNR, alpha, sigma, half-lambda for all schedule points
202
- lmb = shifted_cosine_interpolated_logsnr_from_t(
203
- schedule.to(device=run_device),
204
- logsnr_min=logsnr_min,
205
- logsnr_max=logsnr_max,
206
- log_change_high=log_change_high,
207
- log_change_low=log_change_low,
208
- )
209
- alpha_sched, sigma_sched = alpha_sigma_from_logsnr(lmb)
210
- half_lambda = 0.5 * lmb.to(torch.float32)
211
-
212
- x0_prev: Tensor | None = None
213
-
214
- for i in range(int(schedule.numel()) - 1):
215
- t_i = schedule[i]
216
- s_t = sigma_sched[i].expand(batch_size)
217
- a_next = alpha_sched[i + 1].expand(batch_size)
218
- s_next = sigma_sched[i + 1].expand(batch_size)
219
-
220
- # Model prediction with optional PDG
221
- t_vec = t_i.expand(batch_size).to(device=run_device, dtype=torch.float32)
222
- x0_hat = _predict_with_pdg(
223
- forward_fn,
224
- state,
225
- t_vec,
226
- latents,
227
- pdg_mode=pdg_mode,
228
- pdg_strength=pdg_strength,
229
- )
230
-
231
- lam_t = half_lambda[i].expand(batch_size)
232
- lam_next = half_lambda[i + 1].expand(batch_size)
233
- h = (lam_next - lam_t).to(torch.float32)
234
- phi_1 = torch.expm1(-h)
235
-
236
- sigma_ratio = (s_next / s_t).to(torch.float32)
237
-
238
- if i == 0 or x0_prev is None:
239
- # First-order step
240
- state = (
241
- sigma_ratio.view(-1, *([1] * (state.dim() - 1))) * state
242
- - broadcast_time_like(a_next, state).to(torch.float32)
243
- * broadcast_time_like(phi_1, state).to(torch.float32)
244
- * x0_hat
245
- )
246
- else:
247
- # Second-order step
248
- lam_prev = half_lambda[i - 1].expand(batch_size)
249
- h_0 = (lam_t - lam_prev).to(torch.float32)
250
- r0 = h_0 / h
251
- d1_0 = (x0_hat - x0_prev) / broadcast_time_like(r0, x0_hat)
252
- common = broadcast_time_like(a_next, state).to(
253
- torch.float32
254
- ) * broadcast_time_like(phi_1, state).to(torch.float32)
255
- state = (
256
- sigma_ratio.view(-1, *([1] * (state.dim() - 1))) * state
257
- - common * x0_hat
258
- - 0.5 * common * d1_0
259
- )
260
-
261
- x0_prev = x0_hat
262
-
263
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/straight_through_encoder.py DELETED
@@ -1,27 +0,0 @@
1
- """PixelUnshuffle-based patchifier (no residual conv path)."""
2
-
3
- from __future__ import annotations
4
-
5
- from torch import Tensor, nn
6
-
7
-
8
- class Patchify(nn.Module):
9
- """PixelUnshuffle(patch) -> Conv2d 1x1 projection.
10
-
11
- Converts [B, C, H, W] images into [B, out_channels, H/patch, W/patch] features.
12
- """
13
-
14
- def __init__(self, in_channels: int, patch: int, out_channels: int) -> None:
15
- super().__init__()
16
- self.patch = int(patch)
17
- self.unshuffle = nn.PixelUnshuffle(self.patch)
18
- in_after = in_channels * (self.patch * self.patch)
19
- self.proj = nn.Conv2d(in_after, out_channels, kernel_size=1, bias=True)
20
-
21
- def forward(self, x: Tensor) -> Tensor:
22
- if x.shape[2] % self.patch != 0 or x.shape[3] % self.patch != 0:
23
- raise ValueError(
24
- f"Input H={x.shape[2]} and W={x.shape[3]} must be divisible by patch={self.patch}"
25
- )
26
- y = self.unshuffle(x)
27
- return self.proj(y)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/time_embed.py DELETED
@@ -1,83 +0,0 @@
1
- """Sinusoidal timestep embedding with MLP projection."""
2
-
3
- from __future__ import annotations
4
-
5
- import math
6
-
7
- import torch
8
- from torch import Tensor, nn
9
-
10
-
11
- def _log_spaced_frequencies(
12
- half: int, max_period: float, *, device: torch.device | None = None
13
- ) -> Tensor:
14
- """Log-spaced frequencies for sinusoidal embedding."""
15
- return torch.exp(
16
- -math.log(max_period)
17
- * torch.arange(half, device=device, dtype=torch.float32)
18
- / max(float(half - 1), 1.0)
19
- )
20
-
21
-
22
- def sinusoidal_time_embedding(
23
- t: Tensor,
24
- dim: int,
25
- *,
26
- max_period: float = 10000.0,
27
- scale: float | None = None,
28
- freqs: Tensor | None = None,
29
- ) -> Tensor:
30
- """Sinusoidal timestep embedding (DDPM/DiT-style). Always float32."""
31
- t32 = t.to(torch.float32)
32
- if scale is not None:
33
- t32 = t32 * float(scale)
34
- half = dim // 2
35
- if freqs is not None:
36
- freqs = freqs.to(device=t32.device, dtype=torch.float32)
37
- else:
38
- freqs = _log_spaced_frequencies(half, max_period, device=t32.device)
39
- angles = t32[:, None] * freqs[None, :]
40
- return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
41
-
42
-
43
- class SinusoidalTimeEmbeddingMLP(nn.Module):
44
- """Sinusoidal time embedding followed by Linear -> SiLU -> Linear."""
45
-
46
- def __init__(
47
- self,
48
- dim: int,
49
- *,
50
- freq_dim: int = 256,
51
- hidden_mult: float = 1.0,
52
- time_scale: float = 1000.0,
53
- max_period: float = 10000.0,
54
- ) -> None:
55
- super().__init__()
56
- self.dim = int(dim)
57
- self.freq_dim = int(freq_dim)
58
- self.time_scale = float(time_scale)
59
- self.max_period = float(max_period)
60
- hidden_dim = max(int(round(int(dim) * float(hidden_mult))), 1)
61
-
62
- freqs = _log_spaced_frequencies(self.freq_dim // 2, self.max_period)
63
- self.register_buffer("freqs", freqs, persistent=True)
64
-
65
- self.proj_in = nn.Linear(self.freq_dim, hidden_dim)
66
- self.act = nn.SiLU()
67
- self.proj_out = nn.Linear(hidden_dim, self.dim)
68
-
69
- def forward(self, t: Tensor) -> Tensor:
70
- freqs: Tensor = self.freqs # type: ignore[assignment]
71
- emb_freq = sinusoidal_time_embedding(
72
- t.to(torch.float32),
73
- self.freq_dim,
74
- max_period=self.max_period,
75
- scale=self.time_scale,
76
- freqs=freqs,
77
- )
78
- dtype_in = self.proj_in.weight.dtype
79
- hidden = self.proj_in(emb_freq.to(dtype_in))
80
- hidden = self.act(hidden)
81
- if hidden.dtype != self.proj_out.weight.dtype:
82
- hidden = hidden.to(self.proj_out.weight.dtype)
83
- return self.proj_out(hidden)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
capacitor_diffae/vp_diffusion.py DELETED
@@ -1,151 +0,0 @@
1
- """VP diffusion math: logSNR schedules, alpha/sigma computation, noise construction."""
2
-
3
- from __future__ import annotations
4
-
5
- import math
6
-
7
- import torch
8
- from torch import Tensor
9
-
10
-
11
- def alpha_sigma_from_logsnr(lmb: Tensor) -> tuple[Tensor, Tensor]:
12
- """Compute (alpha, sigma) from logSNR in float32.
13
-
14
- VP constraint: alpha^2 + sigma^2 = 1.
15
- """
16
- lmb32 = lmb.to(dtype=torch.float32)
17
- alpha = torch.sqrt(torch.sigmoid(lmb32))
18
- sigma = torch.sqrt(torch.sigmoid(-lmb32))
19
- return alpha, sigma
20
-
21
-
22
- def broadcast_time_like(coeff: Tensor, x: Tensor) -> Tensor:
23
- """Broadcast [B] coefficient to match x for per-sample scaling."""
24
- view_shape = (int(x.shape[0]),) + (1,) * (x.dim() - 1)
25
- return coeff.view(view_shape)
26
-
27
-
28
- def _cosine_interpolated_params(
29
- logsnr_min: float, logsnr_max: float
30
- ) -> tuple[float, float]:
31
- """Compute (a, b) for cosine-interpolated logSNR schedule.
32
-
33
- logsnr(t) = -2 * log(tan(a*t + b))
34
- logsnr(0) = logsnr_max, logsnr(1) = logsnr_min
35
- """
36
- b = math.atan(math.exp(-0.5 * logsnr_max))
37
- a = math.atan(math.exp(-0.5 * logsnr_min)) - b
38
- return a, b
39
-
40
-
41
- def cosine_interpolated_logsnr_from_t(
42
- t: Tensor, *, logsnr_min: float, logsnr_max: float
43
- ) -> Tensor:
44
- """Map t in [0,1] to logSNR via cosine-interpolated schedule. Always float32."""
45
- a, b = _cosine_interpolated_params(logsnr_min, logsnr_max)
46
- t32 = t.to(dtype=torch.float32)
47
- a_t = torch.tensor(a, device=t32.device, dtype=torch.float32)
48
- b_t = torch.tensor(b, device=t32.device, dtype=torch.float32)
49
- u = a_t * t32 + b_t
50
- return -2.0 * torch.log(torch.tan(u))
51
-
52
-
53
- def shifted_cosine_interpolated_logsnr_from_t(
54
- t: Tensor,
55
- *,
56
- logsnr_min: float,
57
- logsnr_max: float,
58
- log_change_high: float = 0.0,
59
- log_change_low: float = 0.0,
60
- ) -> Tensor:
61
- """SiD2 "shifted cosine" schedule: logSNR with resolution-dependent shifts.
62
-
63
- lambda(t) = (1-t) * (base(t) + log_change_high) + t * (base(t) + log_change_low)
64
- """
65
- base = cosine_interpolated_logsnr_from_t(
66
- t, logsnr_min=logsnr_min, logsnr_max=logsnr_max
67
- )
68
- t32 = t.to(dtype=torch.float32)
69
- high = base + float(log_change_high)
70
- low = base + float(log_change_low)
71
- return (1.0 - t32) * high + t32 * low
72
-
73
-
74
- def get_schedule(schedule_type: str, num_steps: int) -> Tensor:
75
- """Generate a descending t-schedule in [0, 1] for VP diffusion sampling.
76
-
77
- ``num_steps`` is the number of function evaluations (NFE = decoder forward
78
- passes). Internally the schedule has ``num_steps + 1`` time points
79
- (including both endpoints).
80
-
81
- Args:
82
- schedule_type: "linear" or "cosine".
83
- num_steps: Number of decoder forward passes (NFE), >= 1.
84
-
85
- Returns:
86
- Descending 1D tensor with ``num_steps + 1`` elements from ~1.0 to ~0.0.
87
- """
88
- # NOTE: the upstream training code (src/ode/time_schedules.py) uses a
89
- # different convention where num_steps counts schedule *points* (so NFE =
90
- # num_steps - 1). This export package corrects the off-by-one so that
91
- # num_steps means NFE directly. TODO: align the upstream convention.
92
- n = max(int(num_steps) + 1, 2)
93
- if schedule_type == "linear":
94
- base = torch.linspace(0.0, 1.0, n)
95
- elif schedule_type == "cosine":
96
- i = torch.arange(n, dtype=torch.float32)
97
- base = 0.5 * (1.0 - torch.cos(math.pi * (i / (n - 1))))
98
- else:
99
- raise ValueError(
100
- f"Unsupported schedule type: {schedule_type!r}. Use 'linear' or 'cosine'."
101
- )
102
- # Descending: high t (noisy) -> low t (clean)
103
- return torch.flip(base, dims=[0])
104
-
105
-
106
- def make_initial_state(
107
- *,
108
- noise: Tensor,
109
- t_start: Tensor,
110
- logsnr_min: float,
111
- logsnr_max: float,
112
- log_change_high: float = 0.0,
113
- log_change_low: float = 0.0,
114
- ) -> Tensor:
115
- """Construct VP initial state x_t0 = sigma_start * noise (since x0=0).
116
-
117
- All math in float32.
118
- """
119
- batch = int(noise.shape[0])
120
- lmb_start = shifted_cosine_interpolated_logsnr_from_t(
121
- t_start.expand(batch).to(dtype=torch.float32),
122
- logsnr_min=logsnr_min,
123
- logsnr_max=logsnr_max,
124
- log_change_high=log_change_high,
125
- log_change_low=log_change_low,
126
- )
127
- _alpha_start, sigma_start = alpha_sigma_from_logsnr(lmb_start)
128
- sigma_view = broadcast_time_like(sigma_start, noise)
129
- return sigma_view * noise.to(dtype=torch.float32)
130
-
131
-
132
- def sample_noise(
133
- shape: tuple[int, ...],
134
- *,
135
- noise_std: float = 1.0,
136
- seed: int | None = None,
137
- device: torch.device | None = None,
138
- dtype: torch.dtype = torch.float32,
139
- ) -> Tensor:
140
- """Sample Gaussian noise with optional seeding. CPU-seeded for reproducibility."""
141
- if seed is None:
142
- noise = torch.randn(
143
- shape, device=device or torch.device("cpu"), dtype=torch.float32
144
- )
145
- else:
146
- gen = torch.Generator(device="cpu")
147
- gen.manual_seed(int(seed))
148
- noise = torch.randn(shape, generator=gen, device="cpu", dtype=torch.float32)
149
- noise = noise.mul(float(noise_std))
150
- target_device = device if device is not None else torch.device("cpu")
151
- return noise.to(device=target_device, dtype=dtype)