Upload liquid_flow/liquid_flow_block.py

liquid_flow/liquid_flow_block.py (CHANGED: +104, -194)
Old version (removed lines are marked with "-"):

@@ -1,58 +1,34 @@
 """
 LiquidFlow Block — Hybrid CfC + Mamba-2 SSD architecture.
-
-The core innovation: combine Liquid Neural Network dynamics (CfC)
-with Mamba-2's efficient linear-time state space model.

 Architecture per block:
-    Input →
-      ↑                  ↑
-    Adaptive gating    Gated output
-
-The CfC provides:
-    - Time-continuous adaptive gating (what to process/ignore)
-    - State initialization for the SSM (the "liquid" memory)

-The
-
-    - Content-aware selection mechanism
-    - Parallelizable computation (no sequential bottleneck)
-
-Together they create a "Liquid State Space Model" (LSSM):
-    h_t = σ(-f(x_t;θ_f)·t) ⊙ SSM(x_t, h_{t-1}) + (1-σ(...)) ⊙ h(x_t;θ_h)
-
-Where SSM is the Mamba-2 selective state space model and the
-CfC time-gates control how much the SSM output influences state.
-
-This is inspired by:
-    - LNNs: adaptive time constants for state evolution
-    - Mamba-2: efficient selective state space models
-    - DiMSUM: multi-scan architecture for 2D images
-    - Gated SSM: gating mechanism from CfC applied to SSM
 """

 import torch
 import torch.nn as nn
 import torch.nn.functional as F

-from .cfc_cell import CfCCell
-from .mamba2_ssd import Mamba2SSD


 class LiquidMambaBlock(nn.Module):
     """
     LiquidMamba: CfC-gated Mamba-2 SSD block.

-
-
-
-
-    2. CfC cell receives the SSM output + original input
-    3. CfC produces a time-gated output: σ(f)·SSM_out + (1-σ(f))·input
-    4. The CfC's liquid dynamics adaptively mix SSM features with raw input

-
-
     """

     def __init__(self, dim, d_state=16, d_conv=4, expand=2, dropout=0.0):

@@ -60,15 +36,20 @@ class LiquidMambaBlock(nn.Module):
         self.dim = dim

         # LayerNorms
-        self.
-        self.
-        self.

-        # Mamba-2 SSD
-        self.

-        # CfC gate:
-        self.cfc_gate = CfCCell(dim=dim,

         # Feed-forward
         ff_dim = dim * expand

@@ -79,104 +60,52 @@ class LiquidMambaBlock(nn.Module):
             nn.Linear(ff_dim, dim),
             nn.Dropout(dropout),
         )
-
-        # Learnable mixing ratio init
-        self.gate_scale = nn.Parameter(torch.ones(1) * 0.5)

     def forward(self, x):
         """
         Args:
-            x: [B, C, H, W]
         Returns:
             Same shape as input
         """
         is_2d = x.dim() == 4
-
         if is_2d:
             B, C, H, W = x.shape
-
-            x_flat = x.flatten(2).transpose(1, 2)  # [B, HW, C]
-        else:
-            B, L, C = x.shape
-            x_flat = x

-
-

-        # Mamba-2
-
-
-
-            mamba_out = self._mamba_2d_scan(x_2d)
-            mamba_out = mamba_out.flatten(2).transpose(1, 2)  # [B, HW, C]
-        else:
-            mamba_out = self.mamba(x_norm)

-        # CfC
-

-        #
-
-        cfc_input = mamba_norm + residual
-        cfc_out = self.cfc_gate(cfc_input)

-        # Gated
-
-        mixed = gate * mamba_out + (1 - gate) * residual + cfc_out

-        # Feed-forward
-
-        out = mixed + self.ff(out_norm)

         if is_2d:
-
-
-        return out
-
-    def _mamba_2d_scan(self, x):
-        """
-        Multi-directional Mamba-2 scan for 2D images.
-
-        Scans in forward and backward raster directions, then merges.
-        This preserves 2D spatial structure better than single-direction scan.
-        """
-        B, C, H, W = x.shape
-        device = x.device
-
-        # Forward raster: left→right, top→bottom
-        fwd = x.flatten(2)  # [B, C, HW]
-        fwd_seq = fwd.transpose(1, 2)  # [B, HW, C]
-        fwd_out = self.mamba(fwd_seq)
-
-        # Backward raster: right→left, bottom→top
-        bwd = torch.flip(x.flatten(2), dims=[-1])  # [B, C, HW]
-        bwd_seq = bwd.transpose(1, 2)
-        bwd_out = self.mamba(bwd_seq)
-        bwd_out = torch.flip(bwd_out, dims=[1])  # Flip back
-
-        # Merge both directions
-        merged = (fwd_out + bwd_out) / 2
-        merged = merged.transpose(1, 2).reshape(B, C, H, W)
-
-        return merged


 class LiquidFlowStage(nn.Module):
-    """
-    A stage in LiquidFlow: multiple LiquidMamba blocks at the same resolution.
-
-    Architecture:
-        [LiquidMamba Block] × num_blocks
-        [Optional Downsample/Upsample]
-
-    This mirrors the hierarchical design from DiT/DiMSUM but with
-    liquid neural network dynamics in every block.
-    """

     def __init__(self, dim, num_blocks=4, d_state=16, expand=2, dropout=0.0):
         super().__init__()
-        self.dim = dim
-
         self.blocks = nn.ModuleList([
             LiquidMambaBlock(dim=dim, d_state=d_state, expand=expand, dropout=dropout)
             for _ in range(num_blocks)

@@ -190,27 +119,18 @@ class LiquidFlowStage(nn.Module):

 class LiquidFlowBackbone(nn.Module):
     """
-    Complete LiquidFlow backbone
-
-    Architecture:
-        Input (noisy latent) [B, C, H, W]
-              ↓
-        [Patch Embed + Positional Encoding]
-              ↓
-        [LiquidMamba Stages × N] (at uniform resolution)
-              ↓
-        [Output Head] → predicted noise

-

-
-
-
-
-
-
-
-
     """

     def __init__(

@@ -226,27 +146,30 @@ class LiquidFlowBackbone(nn.Module):
         super().__init__()
         self.in_channels = in_channels
         self.hidden_dim = hidden_dim
-        self.num_stages = num_stages

-        # Input
-        self.patch_size = 2  # Fixed patch size
         self.in_proj = nn.Conv2d(in_channels, hidden_dim, kernel_size=1)

-        #
         self.time_embed = nn.Sequential(
             nn.Linear(hidden_dim, hidden_dim * 4),
             nn.SiLU(),
             nn.Linear(hidden_dim * 4, hidden_dim),
         )

-        #
-
         self.pos_embed = nn.Parameter(torch.randn(1, 4096, hidden_dim) * 0.02)

         # LiquidFlow stages
         self.stages = nn.ModuleList([
             LiquidFlowStage(
-                dim=hidden_dim,
                 num_blocks=blocks_per_stage,
                 d_state=d_state,
                 expand=expand,

@@ -255,80 +178,67 @@ class LiquidFlowBackbone(nn.Module):
             for _ in range(num_stages)
         ])

-        # Output
         self.out_norm = nn.LayerNorm(hidden_dim)
-        self.out_proj = nn.
-            nn.Linear(hidden_dim, hidden_dim),
-            nn.GELU(),
-            nn.Linear(hidden_dim, in_channels * self.patch_size * self.patch_size),
-        )

-
-        self.t_conditioner = nn.Sequential(
-            nn.SiLU(),
-            nn.Linear(hidden_dim, hidden_dim * 2),  # scale, shift
-        )

-    def
-
         half = dim // 2
         freqs = torch.exp(
-            -math.log(
-        )
         args = timesteps.float().unsqueeze(-1) * freqs.unsqueeze(0)
-
         if dim % 2:
-
-        return

     def forward(self, x, t):
         """
         Args:
-            x:
-            t:
-
         Returns:
-
         """
         B, C, H, W = x.shape
-
-        L = (H // self.patch_size) * (W // self.patch_size)

-        #
         x = self.in_proj(x)  # [B, hidden_dim, H, W]

-        #
-
-
-        # Time embedding
-        t_emb = self._get_timestep_embedding(t, self.hidden_dim)
         t_emb = self.time_embed(t_emb)  # [B, hidden_dim]

-        #
-
-
-        x_flat = x_flat * (1 + t_scale.unsqueeze(1)) + t_shift.unsqueeze(1)
-
-        # Add positional encoding
-        x_flat = x_flat + self.pos_embed[:, :L, :]

-        # Reshape
-

         # Process through all stages
         for stage in self.stages:
-

         # Output head
-
-
-

-        # Reshape to image
-
-        x_out = x_out.permute(0, 3, 1, 4, 2, 5).reshape(B, C, H * self.patch_size, W * self.patch_size)

-        return
-
-
-import math
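For reference, the "Liquid State Space Model" update quoted in the removed docstring can be written out in full. Reading the σ(...) shorthand in its second term as the same sigmoid gate that appears in the first term (the standard CfC closed-form convention, which the docstring leaves implicit), the update in LaTeX is:

    h_t = \sigma\!\left(-f(x_t;\theta_f)\, t\right) \odot \mathrm{SSM}(x_t, h_{t-1}) + \left(1 - \sigma\!\left(-f(x_t;\theta_f)\, t\right)\right) \odot h(x_t;\theta_h)

that is, an input- and time-dependent gate interpolates between the Mamba-2 SSM output and the CfC head h(x_t; θ_h).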
New version (added lines are marked with "+"):

 """
 LiquidFlow Block — Hybrid CfC + Mamba-2 SSD architecture.
+CORRECTED VERSION: proper dimensions, no sequential loops.

 Architecture per block:
+    Input → Mamba-2 SSD (bidirectional) → CfC adaptive gate → Output

+The CfC provides adaptive gating that modulates the SSM output
+based on input-dependent "liquid" time constants.
 """

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import math

+from .cfc_cell import CfCCell, CfCBlock
+from .mamba2_ssd import Mamba2SSD, Mamba2Block


 class LiquidMambaBlock(nn.Module):
     """
     LiquidMamba: CfC-gated Mamba-2 SSD block.

+    Flow:
+        1. Input → LayerNorm → Mamba-2 SSD (bidirectional scan)
+        2. SSM output → CfC adaptive gate (parallel over all positions)
+        3. Gated output → residual + feed-forward

+    The CfC gate learns WHEN to trust the SSM output vs the raw input,
+    creating content-aware adaptive processing.
     """

     def __init__(self, dim, d_state=16, d_conv=4, expand=2, dropout=0.0):
         self.dim = dim

         # LayerNorms
+        self.norm_ssm = nn.LayerNorm(dim)
+        self.norm_gate = nn.LayerNorm(dim)
+        self.norm_ff = nn.LayerNorm(dim)

+        # Mamba-2 SSD: bidirectional scan
+        self.ssd_fwd = Mamba2SSD(dim=dim, d_state=d_state, d_conv=d_conv, expand=expand)
+        self.ssd_bwd = Mamba2SSD(dim=dim, d_state=d_state, d_conv=d_conv, expand=expand)
+        self.merge = nn.Linear(dim * 2, dim, bias=False)

+        # CfC gate: parallel adaptive gating
+        self.cfc_gate = CfCCell(dim=dim, dropout=dropout)
+
+        # Gate projection (learnable mixing)
+        self.gate_proj = nn.Linear(dim, dim)

         # Feed-forward
         ff_dim = dim * expand
             nn.Linear(ff_dim, dim),
             nn.Dropout(dropout),
         )

     def forward(self, x):
         """
         Args:
+            x: [B, C, H, W] or [B, L, C]
         Returns:
             Same shape as input
         """
         is_2d = x.dim() == 4
         if is_2d:
             B, C, H, W = x.shape
+            x = x.flatten(2).transpose(1, 2)  # [B, HW, C]

+        # === SSM branch ===
+        residual = x
+        x_norm = self.norm_ssm(x)

+        # Bidirectional Mamba-2 scan
+        fwd_out = self.ssd_fwd(x_norm)
+        bwd_out = torch.flip(self.ssd_bwd(torch.flip(x_norm, [1])), [1])
+        ssm_out = self.merge(torch.cat([fwd_out, bwd_out], dim=-1))

+        # === CfC gate ===
+        # CfC processes the SSM output and produces adaptive gate
+        gate_input = self.norm_gate(ssm_out)
+        cfc_out = self.cfc_gate(gate_input)  # [B, L, D] — parallel!

+        # Sigmoid gate: how much SSM output to use
+        gate = torch.sigmoid(self.gate_proj(cfc_out))

+        # Gated residual: blend SSM output with residual
+        x = residual + gate * ssm_out

+        # === Feed-forward ===
+        x = x + self.ff(self.norm_ff(x))

         if is_2d:
+            x = x.transpose(1, 2).reshape(B, C, H, W)
+        return x


 class LiquidFlowStage(nn.Module):
+    """Stack of LiquidMamba blocks at the same resolution."""

     def __init__(self, dim, num_blocks=4, d_state=16, expand=2, dropout=0.0):
         super().__init__()
         self.blocks = nn.ModuleList([
             LiquidMambaBlock(dim=dim, d_state=d_state, expand=expand, dropout=dropout)
             for _ in range(num_blocks)

 class LiquidFlowBackbone(nn.Module):
     """
+    Complete LiquidFlow backbone — DiT-style noise predictor.

+    FIXED: Output shape == Input shape (no patch_size confusion).

+    Architecture:
+        Input [B, in_ch, H, W]
+        → Conv2d projection to hidden_dim
+        → + sinusoidal timestep embedding (AdaLN-style)
+        → + learnable positional encoding
+        → N × LiquidMamba Stages
+        → Conv2d projection back to in_ch
+        → Output [B, in_ch, H, W]
     """

     def __init__(
         super().__init__()
         self.in_channels = in_channels
         self.hidden_dim = hidden_dim

+        # Input projection (pointwise conv)
         self.in_proj = nn.Conv2d(in_channels, hidden_dim, kernel_size=1)

+        # Timestep embedding
         self.time_embed = nn.Sequential(
             nn.Linear(hidden_dim, hidden_dim * 4),
             nn.SiLU(),
             nn.Linear(hidden_dim * 4, hidden_dim),
         )

+        # AdaLN-style conditioning: scale and shift
+        self.t_cond = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(hidden_dim, hidden_dim * 2),
+        )
+
+        # Positional encoding (learnable, supports up to 64×64 = 4096 positions)
         self.pos_embed = nn.Parameter(torch.randn(1, 4096, hidden_dim) * 0.02)

         # LiquidFlow stages
         self.stages = nn.ModuleList([
             LiquidFlowStage(
+                dim=hidden_dim,
                 num_blocks=blocks_per_stage,
                 d_state=d_state,
                 expand=expand,
             for _ in range(num_stages)
         ])

+        # Output projection
         self.out_norm = nn.LayerNorm(hidden_dim)
+        self.out_proj = nn.Linear(hidden_dim, in_channels)

+        self._init_weights()

+    def _init_weights(self):
+        # Zero-init output projection for residual learning
+        nn.init.zeros_(self.out_proj.weight)
+        nn.init.zeros_(self.out_proj.bias)
+
+    def _sinusoidal_embedding(self, timesteps, dim):
+        """Sinusoidal positional embedding for diffusion timesteps."""
         half = dim // 2
         freqs = torch.exp(
+            -math.log(10000.0) * torch.arange(half, device=timesteps.device).float() / half
+        )
         args = timesteps.float().unsqueeze(-1) * freqs.unsqueeze(0)
+        emb = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
         if dim % 2:
+            emb = F.pad(emb, (0, 1))
+        return emb

     def forward(self, x, t):
         """
         Args:
+            x: [B, in_channels, H, W] — noisy latent
+            t: [B] — diffusion timesteps (integers 0..T-1)
         Returns:
+            [B, in_channels, H, W] — predicted noise (same shape as input!)
         """
         B, C, H, W = x.shape
+        L = H * W

+        # Project to hidden dim
         x = self.in_proj(x)  # [B, hidden_dim, H, W]
+        x = x.flatten(2).transpose(1, 2)  # [B, HW, hidden_dim]

+        # Timestep conditioning (AdaLN)
+        t_emb = self._sinusoidal_embedding(t, self.hidden_dim)  # [B, hidden_dim]
         t_emb = self.time_embed(t_emb)  # [B, hidden_dim]
+        t_cond = self.t_cond(t_emb)  # [B, hidden_dim*2]
+        scale, shift = t_cond.chunk(2, dim=-1)  # each [B, hidden_dim]

+        # Apply conditioning + positional encoding
+        x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+        x = x + self.pos_embed[:, :L, :]

+        # Reshape to 2D for processing
+        x = x.transpose(1, 2).reshape(B, self.hidden_dim, H, W)

         # Process through all stages
         for stage in self.stages:
+            x = stage(x)

         # Output head
+        x = x.flatten(2).transpose(1, 2)  # [B, HW, hidden_dim]
+        x = self.out_norm(x)
+        x = self.out_proj(x)  # [B, HW, in_channels]

+        # Reshape back to image: [B, in_channels, H, W]
+        x = x.transpose(1, 2).reshape(B, self.in_channels, H, W)

+        return x
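To make the shape contracts of the new file concrete, here is a minimal smoke-test sketch. It assumes the package layout implied by the relative imports (a liquid_flow package containing this module). Because the backbone's __init__ signature is collapsed in this view, the keyword names in_channels, hidden_dim, num_stages, blocks_per_stage, d_state, and expand are inferred from the rest of the diff rather than confirmed; treat them, and the import path, as assumptions.

# Hypothetical smoke test; the import path and backbone keyword names are
# assumptions taken from this diff, not a verified API.
import torch

from liquid_flow.liquid_flow_block import (
    LiquidMambaBlock,
    LiquidFlowStage,
    LiquidFlowBackbone,
)

# Block: accepts [B, C, H, W] or [B, L, C] and preserves the shape.
block = LiquidMambaBlock(dim=64, d_state=16, d_conv=4, expand=2, dropout=0.0)
x_img = torch.randn(2, 64, 16, 16)   # [B, C, H, W]
x_seq = torch.randn(2, 256, 64)      # [B, L, C]
assert block(x_img).shape == x_img.shape
assert block(x_seq).shape == x_seq.shape

# Stage: a stack of blocks at one resolution; the backbone calls it as
# stage(x) on [B, hidden_dim, H, W], so the shape should be preserved here too.
stage = LiquidFlowStage(dim=64, num_blocks=2, d_state=16, expand=2, dropout=0.0)
assert stage(x_img).shape == x_img.shape

# Backbone: noisy latent plus integer timesteps in, predicted noise out
# (same shape). H * W must stay within the 4096-position pos_embed.
model = LiquidFlowBackbone(
    in_channels=4,
    hidden_dim=64,
    num_stages=2,
    blocks_per_stage=2,
    d_state=16,
    expand=2,
)
latent = torch.randn(2, 4, 16, 16)   # [B, in_channels, H, W]
t = torch.randint(0, 1000, (2,))     # [B] diffusion timesteps
assert model(latent, t).shape == latent.shape

Note that because out_proj is zero-initialized in _init_weights, a freshly constructed backbone returns exactly zeros on its first forward pass; the prediction only moves away from zero once training updates the output projection.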