krystv
/

LiquidFlow-Gen

Model card Files Files and versions

xet

Community

krystv committed 2 days ago

Commit

3798d56

verified ·

1 Parent(s): 68363aa

Upload liquid_flow/vae_wrapper.py

Browse files

Files changed (1) hide show

liquid_flow/vae_wrapper.py +28 -58

liquid_flow/vae_wrapper.py CHANGED Viewed

@@ -1,53 +1,33 @@
 """
-VAE Wrappers — compatible VAE interfaces for LiquidFlow.
-Supports two VAE backends:
-1. TAESD (Tiny AutoEncoder for SD): < 1M params, extremely fast, perfect for mobile
-2. SD-VAE (Stability AI VAE): Higher quality, 84M params, standard for SD pipelines
-TAESD is the DEFAULT for LiquidFlow — it's designed to be lightweight and
-fast enough for Colab/Kaggle free tier.
-Paper reference: "Tiny AutoEncoder for Stable Diffusion" (madebyollin/taesd)
-Model: madebyollin/taesd (335K downloads on HF)
 """
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Optional
 class TAESDWrapper:
     """
     Wrapper for Tiny AutoEncoder for Stable Diffusion (TAESD).
-    TAESD properties:
-    - ~1M parameters (vs 84M for SD VAE)
-    - Latent dim: 4 channels @ 8x compression
-    - Extremely fast encode/decode
-    - Works on CPU — no GPU needed
-    - Perfect for Colab/Kaggle free tier
-    Model on HF: madebyollin/taesd
     """
-    def __init__(self, device='cpu'):
-        self.device = device
-        self.model = None
-    @staticmethod
-    def is_available():
-        """Check if TAESD can be loaded."""
-        try:
-            from diffusers import AutoencoderTiny
-            return True
-        except ImportError:
-            return False
     @staticmethod
     def load(device='cpu'):
-        """Load TAESD model."""
         from diffusers import AutoencoderTiny
         model = AutoencoderTiny.from_pretrained(
             "madebyollin/taesd",
@@ -57,25 +37,19 @@ class TAESDWrapper:
         model.eval()
         return model
-    @staticmethod
-    def get_latent_shape(image_size):
-        """Get latent spatial size given image size (8x compression)."""
-        return image_size // 8
     @staticmethod
     def encode(vae, x):
         """
         Encode image to latent.
         Args:
-            vae: TAESD model
             x: [B, 3, H, W] images in [-1, 1]
         Returns:
-            z: [B, 4, H/8, W/8]
         """
         with torch.no_grad():
-            posterior = vae.encode(x).latent_dist
-            z = posterior.sample()
-            z = z * vae.config.scaling_factor
         return z
     @staticmethod
@@ -83,34 +57,30 @@ class TAESDWrapper:
         """
         Decode latent to image.
         Args:
-            vae: TAESD model
-            z: [B, 4, H/8, W/8]
         Returns:
             x: [B, 3, H, W] images in [-1, 1]
         """
         with torch.no_grad():
-            z = z / vae.config.scaling_factor
             x = vae.decode(z).sample
         return x
 class SDVAEWrapper:
     """
     Wrapper for Stability AI VAE (sd-vae-ft-mse).
-    Properties:
-    - ~84M parameters
-    - Latent dim: 4 channels @ 8x compression
-    - Higher quality reconstruction than TAESD
-    - Requires GPU for reasonable speed
-    Model on HF: stabilityai/sd-vae-ft-mse
     """
-    def __init__(self, device='cpu'):
-        self.device = device
-        self.model = None
     @staticmethod
     def load(device='cpu'):
         """Load SD VAE model."""
@@ -125,7 +95,7 @@ class SDVAEWrapper:
     @staticmethod
     def encode(vae, x):
-        """Encode image to latent."""
         with torch.no_grad():
             posterior = vae.encode(x).latent_dist
             z = posterior.sample()
@@ -134,7 +104,7 @@ class SDVAEWrapper:
     @staticmethod
     def decode(vae, z):
-        """Decode latent to image."""
         with torch.no_grad():
             z = z / vae.config.scaling_factor
             x = vae.decode(z).sample

 """
+VAE Wrappers — corrected for actual TAESD and SD-VAE APIs.
+TAESD (AutoencoderTiny):
+  - encode(x) returns AutoencoderTinyOutput with .latents (no sampling)
+  - scaling_factor = 1.0 (no scaling needed)
+  - decode(z) returns DecoderOutput with .sample
+SD-VAE (AutoencoderKL):
+  - encode(x) returns AutoencoderKLOutput with .latent_dist
+  - scaling_factor = 0.18215
+  - decode(z) returns DecoderOutput with .sample
 """
 import torch
 class TAESDWrapper:
     """
     Wrapper for Tiny AutoEncoder for Stable Diffusion (TAESD).
+    Key: TAESD uses .latents directly (deterministic encoder, no sampling).
+    scaling_factor = 1.0, so no scaling needed.
+    Model: madebyollin/taesd (~2.5M params, 9.8MB)
     """
     @staticmethod
     def load(device='cpu'):
+        """Load TAESD model from HuggingFace."""
         from diffusers import AutoencoderTiny
         model = AutoencoderTiny.from_pretrained(
             "madebyollin/taesd",
         model.eval()
         return model
     @staticmethod
     def encode(vae, x):
         """
         Encode image to latent.
         Args:
+            vae: AutoencoderTiny model
             x: [B, 3, H, W] images in [-1, 1]
         Returns:
+            z: [B, 4, H/8, W/8] latents
         """
         with torch.no_grad():
+            # TAESD returns .latents directly (no latent_dist)
+            z = vae.encode(x).latents
         return z
     @staticmethod
         """
         Decode latent to image.
         Args:
+            vae: AutoencoderTiny model
+            z: [B, 4, H/8, W/8] latents
         Returns:
             x: [B, 3, H, W] images in [-1, 1]
         """
         with torch.no_grad():
             x = vae.decode(z).sample
         return x
+    @staticmethod
+    def get_latent_shape(image_size):
+        """Get latent spatial size (8x compression)."""
+        return image_size // 8
 class SDVAEWrapper:
     """
     Wrapper for Stability AI VAE (sd-vae-ft-mse).
+    Key: Uses .latent_dist.sample() and scaling_factor=0.18215.
+    Model: stabilityai/sd-vae-ft-mse (~84M params)
     """
     @staticmethod
     def load(device='cpu'):
         """Load SD VAE model."""
     @staticmethod
     def encode(vae, x):
+        """Encode image to latent (with scaling)."""
         with torch.no_grad():
             posterior = vae.encode(x).latent_dist
             z = posterior.sample()
     @staticmethod
     def decode(vae, z):
+        """Decode latent to image (with unscaling)."""
         with torch.no_grad():
             z = z / vae.config.scaling_factor
             x = vae.decode(z).sample