Add offloading & improved fp8 inference.

Browse files

Files changed (11) hide show

configs/config-dev-eval.json +55 -0
configs/config-dev-offload.json +58 -0
configs/config-dev.json +2 -2
float8_quantize.py +288 -0
flux_pipeline.py +100 -48
image_encoder.py +71 -0
main.py +3 -1
modules/conditioner.py +40 -18
modules/flux_model.py +16 -19
turbojpeg_imgs.py +0 -134
util.py +6 -0

configs/config-dev-eval.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+  "version": "flux-dev",
+  "params": {
+    "in_channels": 64,
+    "vec_in_dim": 768,
+    "context_in_dim": 4096,
+    "hidden_size": 3072,
+    "mlp_ratio": 4.0,
+    "num_heads": 24,
+    "depth": 19,
+    "depth_single_blocks": 38,
+    "axes_dim": [
+      16,
+      56,
+      56
+    ],
+    "theta": 10000,
+    "qkv_bias": true,
+    "guidance_embed": true
+  },
+  "ae_params": {
+    "resolution": 256,
+    "in_channels": 3,
+    "ch": 128,
+    "out_ch": 3,
+    "ch_mult": [
+      1,
+      2,
+      4,
+      4
+    ],
+    "num_res_blocks": 2,
+    "z_channels": 16,
+    "scale_factor": 0.3611,
+    "shift_factor": 0.1159
+  },
+  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
+  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
+  "repo_id": "black-forest-labs/FLUX.1-dev",
+  "repo_flow": "flux1-dev.sft",
+  "repo_ae": "ae.sft",
+  "text_enc_max_length": 512,
+  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
+  "text_enc_device": "cuda:1",
+  "ae_device": "cuda:1",
+  "flux_device": "cuda:0",
+  "flow_dtype": "float16",
+  "ae_dtype": "bfloat16",
+  "text_enc_dtype": "bfloat16",
+  "flow_quantization_dtype": "qfloat8",
+  "text_enc_quantization_dtype": "qfloat8",
+  "num_to_quant": 22,
+  "compile_extras": false,
+  "compile_blocks": false
+}

configs/config-dev-offload.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "version": "flux-dev",
+  "params": {
+    "in_channels": 64,
+    "vec_in_dim": 768,
+    "context_in_dim": 4096,
+    "hidden_size": 3072,
+    "mlp_ratio": 4.0,
+    "num_heads": 24,
+    "depth": 19,
+    "depth_single_blocks": 38,
+    "axes_dim": [
+      16,
+      56,
+      56
+    ],
+    "theta": 10000,
+    "qkv_bias": true,
+    "guidance_embed": true
+  },
+  "ae_params": {
+    "resolution": 256,
+    "in_channels": 3,
+    "ch": 128,
+    "out_ch": 3,
+    "ch_mult": [
+      1,
+      2,
+      4,
+      4
+    ],
+    "num_res_blocks": 2,
+    "z_channels": 16,
+    "scale_factor": 0.3611,
+    "shift_factor": 0.1159
+  },
+  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
+  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
+  "repo_id": "black-forest-labs/FLUX.1-dev",
+  "repo_flow": "flux1-dev.sft",
+  "repo_ae": "ae.sft",
+  "text_enc_max_length": 512,
+  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
+  "text_enc_device": "cuda:0",
+  "ae_device": "cuda:0",
+  "flux_device": "cuda:0",
+  "flow_dtype": "float16",
+  "ae_dtype": "bfloat16",
+  "text_enc_dtype": "bfloat16",
+  "flow_quantization_dtype": "qfloat8",
+  "text_enc_quantization_dtype": "qint4",
+  "num_to_quant": 22,
+  "compile_extras": false,
+  "compile_blocks": false,
+  "offload_text_encoder": true,
+  "offload_vae": true,
+  "offload_flow": true
+}

configs/config-dev.json CHANGED Viewed

@@ -50,6 +50,6 @@
   "flow_quantization_dtype": "qfloat8",
   "text_enc_quantization_dtype": "qfloat8",
   "num_to_quant": 22,
-  "compile_extras": false,
-  "compile_blocks": false
 }

   "flow_quantization_dtype": "qfloat8",
   "text_enc_quantization_dtype": "qfloat8",
   "num_to_quant": 22,
+  "compile_extras": true,
+  "compile_blocks": true
 }

float8_quantize.py ADDED Viewed

	@@ -0,0 +1,288 @@

+import torch
+import torch.nn as nn
+from torchao.float8.float8_utils import (
+    amax_to_scale,
+    tensor_to_amax,
+    to_fp8_saturated,
+)
+from torch.nn import init
+import math
+from torch.compiler import is_compiling
+try:
+    from cublas_ops import CublasLinear
+except ImportError:
+    CublasLinear = type(None)
+class F8Linear(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+        float8_dtype=torch.float8_e4m3fn,
+        float_weight: torch.Tensor = None,
+        float_bias: torch.Tensor = None,
+        num_scale_trials: int = 24,
+        input_float8_dtype=torch.float8_e5m2,
+    ) -> None:
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.float8_dtype = float8_dtype
+        self.input_float8_dtype = input_float8_dtype
+        self.input_scale_initialized = False
+        self.weight_initialized = False
+        self.max_value = torch.finfo(self.float8_dtype).max
+        self.input_max_value = torch.finfo(self.input_float8_dtype).max
+        factory_kwargs = {"dtype": dtype, "device": device}
+        if float_weight is None:
+            self.weight = nn.Parameter(
+                torch.empty((out_features, in_features), **factory_kwargs)
+            )
+        else:
+            self.weight = nn.Parameter(
+                float_weight, requires_grad=float_weight.requires_grad
+            )
+        if float_bias is None:
+            if bias:
+                self.bias = nn.Parameter(
+                    torch.empty(out_features, **factory_kwargs),
+                    requires_grad=bias.requires_grad,
+                )
+            else:
+                self.register_parameter("bias", None)
+        else:
+            self.bias = nn.Parameter(float_bias, requires_grad=float_bias.requires_grad)
+        self.num_scale_trials = num_scale_trials
+        self.input_amax_trials = torch.zeros(
+            num_scale_trials, requires_grad=False, device=device, dtype=torch.float32
+        )
+        self.trial_index = 0
+        self.register_buffer("scale", None)
+        self.register_buffer(
+            "input_scale",
+            None,
+        )
+        self.register_buffer(
+            "float8_data",
+            None,
+        )
+        self.scale_reciprocal = self.register_buffer("scale_reciprocal", None)
+        self.input_scale_reciprocal = self.register_buffer(
+            "input_scale_reciprocal", None
+        )
+    def quantize_weight(self):
+        if self.weight_initialized:
+            return
+        amax = tensor_to_amax(self.weight.data)
+        scale = amax_to_scale(amax, self.float8_dtype, self.weight.dtype)
+        self.float8_data = to_fp8_saturated(self.weight.data * scale, self.float8_dtype)
+        self.scale = scale.float()
+        self.weight_initialized = True
+        self.scale_reciprocal = self.scale.reciprocal().float()
+        self.weight.data = torch.zeros(
+            1, dtype=self.weight.dtype, device=self.weight.device, requires_grad=False
+        )
+    def quantize_input(self, x: torch.Tensor):
+        if self.input_scale_initialized:
+            return to_fp8_saturated(x * self.input_scale, self.input_float8_dtype)
+        elif self.trial_index < self.num_scale_trials:
+            amax = tensor_to_amax(x)
+            self.input_amax_trials[self.trial_index] = amax
+            self.trial_index += 1
+            self.input_scale = amax_to_scale(
+                self.input_amax_trials[: self.trial_index].max(),
+                self.input_float8_dtype,
+                self.weight.dtype,
+            )
+            self.input_scale_reciprocal = self.input_scale.reciprocal()
+            return to_fp8_saturated(x * self.input_scale, self.input_float8_dtype)
+        else:
+            self.input_scale = amax_to_scale(
+                self.input_amax_trials.max(), self.input_float8_dtype, self.weight.dtype
+            )
+            self.input_scale_reciprocal = self.input_scale.reciprocal()
+            self.input_scale_initialized = True
+            return to_fp8_saturated(x * self.input_scale, self.input_float8_dtype)
+    def reset_parameters(self) -> None:
+        if self.weight_initialized:
+            self.weight = nn.Parameter(
+                torch.empty(
+                    (self.out_features, self.in_features),
+                    **{
+                        "dtype": self.weight.dtype,
+                        "device": self.weight.device,
+                    },
+                )
+            )
+            self.weight_initialized = False
+            self.input_scale_initialized = False
+            self.trial_index = 0
+            self.input_amax_trials.zero_()
+        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
+        if self.bias is not None:
+            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            init.uniform_(self.bias, -bound, bound)
+        self.quantize_weight()
+        self.max_value = torch.finfo(self.float8_dtype).max
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if self.input_scale_initialized or is_compiling():
+            x = (
+                x.mul(self.input_scale)
+                .clamp(min=-self.input_max_value, max=self.input_max_value)
+                .type(self.input_float8_dtype)
+            )
+        else:
+            x = self.quantize_input(x)
+        prev_dims = x.shape[:-1]
+        x = x.view(-1, self.in_features)
+        # float8 matmul, much faster than float16 matmul w/ float32 accumulate on ADA devices!
+        return torch._scaled_mm(
+            x,
+            self.float8_data.T,
+            self.input_scale_reciprocal,
+            self.scale_reciprocal,
+            bias=self.bias,
+            out_dtype=self.weight.dtype,
+            use_fast_accum=True,
+        ).view(*prev_dims, self.out_features)
+    @classmethod
+    def from_linear(
+        cls,
+        linear: nn.Linear,
+        float8_dtype=torch.float8_e4m3fn,
+        input_float8_dtype=torch.float8_e5m2,
+    ):
+        f8_lin = cls(
+            in_features=linear.in_features,
+            out_features=linear.out_features,
+            bias=linear.bias is not None,
+            device=linear.weight.device,
+            dtype=linear.weight.dtype,
+            float8_dtype=float8_dtype,
+            float_weight=linear.weight.data,
+            float_bias=(linear.bias.data if linear.bias is not None else None),
+            input_float8_dtype=input_float8_dtype,
+        )
+        f8_lin.quantize_weight()
+        return f8_lin
+def recursive_swap_linears(
+    model: nn.Module,
+    float8_dtype=torch.float8_e4m3fn,
+    input_float8_dtype=torch.float8_e5m2,
+):
+    """
+    Recursively swaps all nn.Linear modules in the given model with F8Linear modules.
+    This function traverses the model's structure and replaces each nn.Linear
+    instance with an F8Linear instance, which uses 8-bit floating point
+    quantization for weights. The original linear layer's weights are deleted
+    after conversion to save memory.
+    Args:
+        model (nn.Module): The PyTorch model to modify.
+    Note:
+        This function modifies the model in-place. After calling this function,
+        all linear layers in the model will be using 8-bit quantization.
+    """
+    for name, child in model.named_children():
+        if isinstance(child, nn.Linear) and not isinstance(
+            child, (F8Linear, CublasLinear)
+        ):
+            setattr(
+                model,
+                name,
+                F8Linear.from_linear(
+                    child,
+                    float8_dtype=float8_dtype,
+                    input_float8_dtype=input_float8_dtype,
+                ),
+            )
+            del child
+        else:
+            recursive_swap_linears(child)
+@torch.inference_mode()
+def quantize_flow_transformer_and_dispatch_float8(
+    flow_model: nn.Module,
+    device=torch.device("cuda"),
+    float8_dtype=torch.float8_e4m3fn,
+    input_float8_dtype=torch.float8_e5m2,
+    offload_flow=False,
+):
+    """
+    Quantize the flux flow transformer model (original BFL codebase version) and dispatch to the given device.
+    """
+    for i, module in enumerate(flow_model.double_blocks):
+        module.to(device)
+        module.eval()
+        recursive_swap_linears(
+            module, float8_dtype=float8_dtype, input_float8_dtype=input_float8_dtype
+        )
+        torch.cuda.empty_cache()
+    for i, module in enumerate(flow_model.single_blocks):
+        module.to(device)
+        module.eval()
+        recursive_swap_linears(
+            module, float8_dtype=float8_dtype, input_float8_dtype=input_float8_dtype
+        )
+        torch.cuda.empty_cache()
+    to_gpu_extras = [
+        "vector_in",
+        "img_in",
+        "txt_in",
+        "time_in",
+        "guidance_in",
+        "final_layer",
+        "pe_embedder",
+    ]
+    for module in to_gpu_extras:
+        m_extra = getattr(flow_model, module)
+        if m_extra is None:
+            continue
+        m_extra.to(device)
+        m_extra.eval()
+        if isinstance(m_extra, nn.Linear) and not isinstance(
+            m_extra, (F8Linear, CublasLinear)
+        ):
+            setattr(
+                flow_model,
+                module,
+                F8Linear.from_linear(
+                    m_extra,
+                    float8_dtype=float8_dtype,
+                    input_float8_dtype=input_float8_dtype,
+                ),
+            )
+            del m_extra
+        elif module != "final_layer":
+            recursive_swap_linears(
+                m_extra,
+                float8_dtype=float8_dtype,
+                input_float8_dtype=input_float8_dtype,
+            )
+        torch.cuda.empty_cache()
+    if offload_flow:
+        flow_model.to("cpu")
+        torch.cuda.empty_cache()
+    return flow_model

flux_pipeline.py CHANGED Viewed

@@ -1,13 +1,12 @@
-import base64
 import io
 import math
 from typing import TYPE_CHECKING, Callable, List
 from PIL import Image
-from einops import rearrange, repeat
 import numpy as np
 import torch
 from flux_emphasis import get_weighted_text_embeddings_flux
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -20,10 +19,9 @@ from torch._inductor import config as ind_config
 from pybase64 import standard_b64decode
 config.cache_size_limit = 10000000000
-ind_config.force_fuse_int_mm_with_mul = True
 from loguru import logger
-from turbojpeg_imgs import TurboImage
 from torchvision.transforms import functional as TF
 from tqdm import tqdm
 from util import (
@@ -50,7 +48,7 @@ class FluxPipeline:
         t5: "HFEmbedder" = None,
         model: "Flux" = None,
         ae: "AutoEncoder" = None,
-        dtype: torch.dtype = torch.bfloat16,
         verbose: bool = False,
         flux_device: torch.device | str = "cuda:0",
         ae_device: torch.device | str = "cuda:1",
@@ -87,10 +85,42 @@ class FluxPipeline:
         self.model: "Flux" = model
         self.ae: "AutoEncoder" = ae
         self.rng = torch.Generator(device="cpu")
-        self.turbojpeg = TurboImage()
         self.verbose = verbose
         self.ae_dtype = torch.bfloat16
         self.config = config
     @torch.inference_mode()
     def prepare(
@@ -126,6 +156,9 @@ class FluxPipeline:
         )
         img_ids = img_ids[None].repeat(bs, 1, 1, 1).flatten(1, 2)
         vec, txt, txt_ids = get_weighted_text_embeddings_flux(
             self,
             prompt,
@@ -134,6 +167,10 @@ class FluxPipeline:
             target_device=target_device,
             target_dtype=target_dtype,
         )
         return img, img_ids, vec, txt, txt_ids
     @torch.inference_mode()
@@ -196,29 +233,39 @@ class FluxPipeline:
     @torch.inference_mode()
     def into_bytes(self, x: torch.Tensor) -> io.BytesIO:
         # bring into PIL format and save
         x = x.clamp(-1, 1)
         num_images = x.shape[0]
         images: List[torch.Tensor] = []
         for i in range(num_images):
-            x = x[i].permute(1, 2, 0).add(1.0).mul(127.5).type(torch.uint8).contiguous()
             images.append(x)
         if len(images) == 1:
             im = images[0]
         else:
             im = torch.vstack(images)
-        im = self.turbojpeg.encode_torch(im, quality=95)
         images.clear()
         return io.BytesIO(im)
     @torch.inference_mode()
     def vae_decode(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
-        x = x.to(self.device_ae)
         x = self.unpack(x.float(), height, width)
         with torch.autocast(
             device_type=self.device_ae.type, dtype=torch.bfloat16, cache_enabled=False
         ):
             x = self.ae.decode(x)
         return x
     def unpack(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
@@ -269,11 +316,16 @@ class FluxPipeline:
                 dtype=torch.bfloat16,
                 cache_enabled=False,
             ):
                 init_image = (
                     self.ae.encode(init_image)
                     .to(dtype=self.dtype, device=self.device_flux)
                     .repeat(num_images, 1, 1, 1)
                 )
         x = self.get_noise(
             num_images,
@@ -338,11 +390,14 @@ class FluxPipeline:
             generator=generator,
             num_images=num_images,
         )
-        img, img_ids, vec, txt, txt_ids = self.prepare(
-            img=img,
-            prompt=prompt,
-            target_device=self.device_flux,
-            target_dtype=self.dtype,
         )
         # this is ignored for schnell
@@ -350,6 +405,8 @@ class FluxPipeline:
             (img.shape[0],), guidance, device=self.device_flux, dtype=self.dtype
         )
         t_vec = None
         for t_curr, t_prev in tqdm(
             zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1, disable=silent
         ):
@@ -374,6 +431,8 @@ class FluxPipeline:
             img = img + (t_prev - t_curr) * pred
         torch.cuda.empty_cache()
         # decode latents to pixel space
@@ -384,37 +443,35 @@ class FluxPipeline:
         return self.into_bytes(img)
     @classmethod
-    def load_pipeline_from_config_path(cls, path: str) -> "FluxPipeline":
         with torch.inference_mode():
             config = load_config_from_path(path)
             return cls.load_pipeline_from_config(config)
     @classmethod
     def load_pipeline_from_config(cls, config: ModelSpec) -> "FluxPipeline":
-        from quantize_swap_and_dispatch import quantize_and_dispatch_to_device
         with torch.inference_mode():
             print("flow_quantization_dtype", config.flow_quantization_dtype)
             models = load_models_from_config(config)
             config = models.config
-            num_layers_to_quantize = config.num_to_quant
             flux_device = into_device(config.flux_device)
             ae_device = into_device(config.ae_device)
             clip_device = into_device(config.text_enc_device)
             t5_device = into_device(config.text_enc_device)
             flux_dtype = into_dtype(config.flow_dtype)
-            flow_model = models.flow
-            flow_model = quantize_and_dispatch_to_device(
-                flow_model=flow_model,
-                flux_device=flux_device,
-                flux_dtype=flux_dtype,
-                num_layers_to_quantize=num_layers_to_quantize,
-                compile_extras=config.compile_extras,
-                compile_blocks=config.compile_blocks,
-                quantize_extras=config.quantize_extras,
-                quantization_dtype=config.flow_quantization_dtype,
             )
         return cls(
@@ -435,29 +492,24 @@ class FluxPipeline:
 if __name__ == "__main__":
     pipe = FluxPipeline.load_pipeline_from_config_path(
-        "configs/config-dev-gigaquant.json"
     )
     o = pipe.generate(
         prompt="Street photography portrait of a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
         height=1024,
-        width=1024,
         num_steps=24,
-        guidance=3.0,
     )
     open("out.jpg", "wb").write(o.read())
-    o = pipe.generate(
-        prompt="Street photography portrait of a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
-        height=1024,
-        width=1024,
-        num_steps=24,
-        guidance=3.0,
-    )
-    open("out2.jpg", "wb").write(o.read())
-    o = pipe.generate(
-        prompt="Street photography portrait of a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
-        height=1024,
-        width=1024,
-        num_steps=24,
-        guidance=3.0,
-    )
-    open("out3.jpg", "wb").write(o.read())

 import io
 import math
 from typing import TYPE_CHECKING, Callable, List
 from PIL import Image
 import numpy as np
 import torch
+from einops import rearrange
 from flux_emphasis import get_weighted_text_embeddings_flux
 torch.backends.cuda.matmul.allow_tf32 = True
 from pybase64 import standard_b64decode
 config.cache_size_limit = 10000000000
+ind_config.shape_padding = True
 from loguru import logger
+from image_encoder import ImageEncoder
 from torchvision.transforms import functional as TF
 from tqdm import tqdm
 from util import (
         t5: "HFEmbedder" = None,
         model: "Flux" = None,
         ae: "AutoEncoder" = None,
+        dtype: torch.dtype = torch.float16,
         verbose: bool = False,
         flux_device: torch.device | str = "cuda:0",
         ae_device: torch.device | str = "cuda:1",
         self.model: "Flux" = model
         self.ae: "AutoEncoder" = ae
         self.rng = torch.Generator(device="cpu")
+        self.img_encoder = ImageEncoder()
         self.verbose = verbose
         self.ae_dtype = torch.bfloat16
         self.config = config
+        self.offload_text_encoder = config.offload_text_encoder
+        self.offload_vae = config.offload_vae
+        self.offload_flow = config.offload_flow
+        if self.config.compile_blocks or self.config.compile_extras:
+            print("Warmups for compile...")
+            warmup_dict = dict(
+                prompt="Street photography portrait of a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
+                height=1024,
+                width=1024,
+                num_steps=30,
+                guidance=3.5,
+                seed=10,
+            )
+            self.generate(**warmup_dict)
+            to_gpu_extras = [
+                "vector_in",
+                "img_in",
+                "txt_in",
+                "time_in",
+                "guidance_in",
+                "final_layer",
+                "pe_embedder",
+            ]
+            if self.config.compile_blocks:
+                for block in self.model.double_blocks:
+                    block.compile()
+                for block in self.model.single_blocks:
+                    block.compile()
+            if self.config.compile_extras:
+                for extra in to_gpu_extras:
+                    getattr(self.model, extra).compile()
     @torch.inference_mode()
     def prepare(
         )
         img_ids = img_ids[None].repeat(bs, 1, 1, 1).flatten(1, 2)
+        if self.offload_text_encoder:
+            self.clip.to(self.device_clip)
+            self.t5.to(self.device_t5)
         vec, txt, txt_ids = get_weighted_text_embeddings_flux(
             self,
             prompt,
             target_device=target_device,
             target_dtype=target_dtype,
         )
+        if self.offload_text_encoder:
+            self.clip.to("cpu")
+            self.t5.to("cpu")
+            torch.cuda.empty_cache()
         return img, img_ids, vec, txt, txt_ids
     @torch.inference_mode()
     @torch.inference_mode()
     def into_bytes(self, x: torch.Tensor) -> io.BytesIO:
         # bring into PIL format and save
+        torch.cuda.synchronize()
+        x = x.contiguous()
         x = x.clamp(-1, 1)
         num_images = x.shape[0]
         images: List[torch.Tensor] = []
         for i in range(num_images):
+            x = x[i].add(1.0).mul(127.5).clamp(0, 255).contiguous().type(torch.uint8)
             images.append(x)
         if len(images) == 1:
             im = images[0]
         else:
             im = torch.vstack(images)
+        torch.cuda.synchronize()
+        im = self.turbojpeg.encode_torch(im, quality=99)
         images.clear()
         return io.BytesIO(im)
     @torch.inference_mode()
     def vae_decode(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        if self.offload_vae:
+            self.ae.to(self.device_ae)
+            x = x.to(self.device_ae)
+        else:
+            x = x.to(self.device_ae)
         x = self.unpack(x.float(), height, width)
         with torch.autocast(
             device_type=self.device_ae.type, dtype=torch.bfloat16, cache_enabled=False
         ):
             x = self.ae.decode(x)
+        if self.offload_vae:
+            self.ae.to("cpu")
+            torch.cuda.empty_cache()
         return x
     def unpack(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
                 dtype=torch.bfloat16,
                 cache_enabled=False,
             ):
+                if self.offload_vae:
+                    self.ae.to(self.device_ae)
                 init_image = (
                     self.ae.encode(init_image)
                     .to(dtype=self.dtype, device=self.device_flux)
                     .repeat(num_images, 1, 1, 1)
                 )
+                if self.offload_vae:
+                    self.ae.to("cpu")
+                    torch.cuda.empty_cache()
         x = self.get_noise(
             num_images,
             generator=generator,
             num_images=num_images,
         )
+        img, img_ids, vec, txt, txt_ids = map(
+            lambda x: x.contiguous(),
+            self.prepare(
+                img=img,
+                prompt=prompt,
+                target_device=self.device_flux,
+                target_dtype=self.dtype,
+            ),
         )
         # this is ignored for schnell
             (img.shape[0],), guidance, device=self.device_flux, dtype=self.dtype
         )
         t_vec = None
+        if self.offload_flow:
+            self.model.to(self.device_flux)
         for t_curr, t_prev in tqdm(
             zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1, disable=silent
         ):
             img = img + (t_prev - t_curr) * pred
+        if self.offload_flow:
+            self.model.to("cpu")
         torch.cuda.empty_cache()
         # decode latents to pixel space
         return self.into_bytes(img)
     @classmethod
+    def load_pipeline_from_config_path(
+        cls, path: str, flow_model_path: str = None
+    ) -> "FluxPipeline":
         with torch.inference_mode():
             config = load_config_from_path(path)
+            if flow_model_path:
+                config.ckpt_path = flow_model_path
             return cls.load_pipeline_from_config(config)
     @classmethod
     def load_pipeline_from_config(cls, config: ModelSpec) -> "FluxPipeline":
+        from float8_quantize import quantize_flow_transformer_and_dispatch_float8
         with torch.inference_mode():
             print("flow_quantization_dtype", config.flow_quantization_dtype)
             models = load_models_from_config(config)
             config = models.config
             flux_device = into_device(config.flux_device)
             ae_device = into_device(config.ae_device)
             clip_device = into_device(config.text_enc_device)
             t5_device = into_device(config.text_enc_device)
             flux_dtype = into_dtype(config.flow_dtype)
+            flow_model = models.flow.type(flux_dtype).to(
+                memory_format=torch.channels_last
+            )
+            flow_model = quantize_flow_transformer_and_dispatch_float8(
+                flow_model, flux_device
             )
         return cls(
 if __name__ == "__main__":
     pipe = FluxPipeline.load_pipeline_from_config_path(
+        "configs/config-dev-offload.json"
     )
     o = pipe.generate(
         prompt="Street photography portrait of a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
         height=1024,
+        width=576,
         num_steps=24,
+        guidance=3.5,
+        seed=10,
     )
     open("out.jpg", "wb").write(o.read())
+    for x in range(10):
+        o = pipe.generate(
+            prompt="Street photography portrait of a beautiful asian woman in traditional clothing with golden hairpin and blue eyes, wearing a red kimono with dragon patterns",
+            height=1024,
+            width=576,
+            num_steps=24,
+            guidance=3.5,
+        )
+        open(f"out{x}.jpg", "wb").write(o.read())

image_encoder.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import io
+from PIL import Image
+import numpy as np
+import torch
+class ImageEncoder:
+    @torch.inference_mode()
+    def encode_torch(self, img: torch.Tensor, quality=90):
+        if img.ndim == 2:
+            img = (
+                img[None]
+                .contiguous()
+                .repeat_interleave(3, dim=0)
+                .contiguous()
+                .clamp(0, 255)
+                .type(torch.uint8)
+            )
+            print(img.shape)
+        elif img.ndim == 3:
+            if img.shape[0] == 3:
+                img = img.contiguous().clamp(0, 255).type(torch.uint8)
+            elif img.shape[2] == 3:
+                img = img.permute(2, 0, 1).contiguous().clamp(0, 255).type(torch.uint8)
+            else:
+                raise ValueError(f"Unsupported image shape: {img.shape}")
+        else:
+            raise ValueError(f"Unsupported image num dims: {img.ndim}")
+        img = (
+            img.permute(1, 2, 0)
+            .contiguous()
+            .to(torch.uint8)
+            .cpu()
+            .numpy()
+            .astype(np.uint8)
+        )
+        im = Image.fromarray(img)
+        iob = io.BytesIO()
+        im.save(iob, format="JPEG", quality=95)
+        iob.seek(0)
+        return iob.getvalue()
+def test_real_img():
+    from PIL import Image
+    import numpy as np
+    im = "out.jpg"
+    im = Image.open(im)
+    im = np.array(im)
+    img_hwc = torch.from_numpy(im).cuda().type(torch.float32)
+    img_chw = img_hwc.permute(2, 0, 1).contiguous()
+    img_gray = img_hwc.mean(dim=2, keepdim=False).contiguous().clamp(0, 255)
+    tj = TurboImage()
+    o = tj.encode_torch(img_chw)
+    o2 = tj.encode_torch(img_hwc)
+    o3 = tj.encode_torch(img_gray)
+    with open("out_chw.jpg", "wb") as f:
+        f.write(o2)
+    with open("out_hwc.jpg", "wb") as f:
+        f.write(o)
+    with open("out_gray.jpg", "wb") as f:
+        f.write(o3)
+    # print(o)
+if __name__ == "__main__":
+    test_real_img()

main.py CHANGED Viewed

@@ -87,7 +87,9 @@ def main():
     args = parse_args()
     if args.config_path:
-        app.state.model = FluxPipeline.load_pipeline_from_config_path(args.config_path)
     else:
         model_version = (
             ModelVersion.flux_dev

     args = parse_args()
     if args.config_path:
+        app.state.model = FluxPipeline.load_pipeline_from_config_path(
+            args.config_path, flow_model_path=args.flow_model_path
+        )
     else:
         model_version = (
             ModelVersion.flux_dev

modules/conditioner.py CHANGED Viewed

@@ -1,10 +1,6 @@
 import os
 import torch
-from pydash import max_
-from quanto import freeze, qfloat8, qint2, qint4, qint8, quantize
-from quanto.nn.qmodule import _QMODULE_TABLE
-from safetensors.torch import load_file, load_model, save_model
 from torch import Tensor, nn
 from transformers import (
     CLIPTextModel,
@@ -13,7 +9,7 @@ from transformers import (
     T5Tokenizer,
     __version__,
 )
-from transformers.utils.quantization_config import QuantoConfig
 CACHE_DIR = os.environ.get("HF_HOME", "~/.cache/huggingface")
@@ -31,6 +27,25 @@ def into_quantization_name(quantization_dtype: str) -> str:
         raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")
 class HFEmbedder(nn.Module):
     def __init__(
         self,
@@ -38,15 +53,21 @@ class HFEmbedder(nn.Module):
         max_length: int,
         device: torch.device | int,
         quantization_dtype: str | None = None,
         **hf_kwargs,
     ):
         super().__init__()
         self.is_clip = version.startswith("openai")
         self.max_length = max_length
         self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
-        quant_name = (
-            into_quantization_name(quantization_dtype) if quantization_dtype else None
-        )
         if self.is_clip:
             self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
@@ -57,13 +78,10 @@ class HFEmbedder(nn.Module):
                 version,
                 **hf_kwargs,
                 quantization_config=(
-                    QuantoConfig(
-                        weights=quant_name,
-                    )
-                    if quant_name
                     else None
                 ),
-                device_map={"": device},
             )
         else:
@@ -72,17 +90,21 @@ class HFEmbedder(nn.Module):
             )
             self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
                 version,
-                device_map={"": device},
                 **hf_kwargs,
                 quantization_config=(
-                    QuantoConfig(
-                        weights=quant_name,
-                    )
-                    if quant_name
                     else None
                 ),
             )
     def forward(self, text: list[str]) -> Tensor:
         batch_encoding = self.tokenizer(
             text,

 import os
 import torch
 from torch import Tensor, nn
 from transformers import (
     CLIPTextModel,
     T5Tokenizer,
     __version__,
 )
+from transformers.utils.quantization_config import QuantoConfig, BitsAndBytesConfig
 CACHE_DIR = os.environ.get("HF_HOME", "~/.cache/huggingface")
         raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")
+def auto_quantization_config(
+    quantization_dtype: str,
+) -> QuantoConfig | BitsAndBytesConfig:
+    """Return the quantization config object matching ``quantization_dtype``.
+    "qfloat8" and "qint2" produce a ``QuantoConfig``; "qint4" and "qint8"
+    produce a ``BitsAndBytesConfig``. Any other value raises ``ValueError``.
+    """
+    if quantization_dtype == "qfloat8":
+        return QuantoConfig(weights="float8")
+    if quantization_dtype == "qint4":
+        # 4-bit loading uses the nf4 quant type with bfloat16 compute.
+        four_bit_options = {
+            "load_in_4bit": True,
+            "bnb_4bit_compute_dtype": torch.bfloat16,
+            "bnb_4bit_quant_type": "nf4",
+        }
+        return BitsAndBytesConfig(**four_bit_options)
+    if quantization_dtype == "qint8":
+        return BitsAndBytesConfig(load_in_8bit=True, llm_int8_has_fp16_weight=False)
+    if quantization_dtype == "qint2":
+        return QuantoConfig(weights="int2")
+    # Nothing matched: reject the name rather than loading unquantized.
+    raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")
 class HFEmbedder(nn.Module):
     def __init__(
         self,
         max_length: int,
         device: torch.device | int,
         quantization_dtype: str | None = None,
+        offloading_device: torch.device | int | None = torch.device("cpu"),
         **hf_kwargs,
     ):
         super().__init__()
+        self.offloading_device = (
+            offloading_device
+            if isinstance(offloading_device, torch.device)
+            else torch.device(offloading_device)
+        )
+        self.device = (
+            device if isinstance(device, torch.device) else torch.device(device)
+        )
         self.is_clip = version.startswith("openai")
         self.max_length = max_length
         self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
         if self.is_clip:
             self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
                 version,
                 **hf_kwargs,
                 quantization_config=(
+                    auto_quantization_config(quantization_dtype)
+                    if quantization_dtype
                     else None
                 ),
             )
         else:
             )
             self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
                 version,
                 **hf_kwargs,
                 quantization_config=(
+                    auto_quantization_config(quantization_dtype)
+                    if quantization_dtype
                     else None
                 ),
             )
+    def offload(self):
+        """Move ``hf_module`` to ``self.offloading_device``, then empty the CUDA cache."""
+        target_device = self.offloading_device
+        self.hf_module.to(device=target_device)
+        # Called after the move so the allocator can release what the module held.
+        torch.cuda.empty_cache()
+    def cuda(self):
+        """Move ``hf_module`` to ``self.device`` and return this module.
+        This overrides ``nn.Module.cuda``, which returns the module itself;
+        returning ``self`` keeps the ``module = module.cuda()`` idiom working.
+        """
+        self.hf_module.to(device=self.device)
+        return self
     def forward(self, text: list[str]) -> Tensor:
         batch_encoding = self.tokenizer(
             text,

modules/flux_model.py CHANGED Viewed

@@ -11,14 +11,13 @@ torch.set_float32_matmul_precision("high")
 import math
 from torch import Tensor, nn
-from torch._dynamo import config
-from torch._inductor import config as ind_config
 from pydantic import BaseModel
 from torch.nn import functional as F
-config.cache_size_limit = 10000000000
-ind_config.compile_threads = os.cpu_count()
-ind_config.shape_padding = True
 class FluxParams(BaseModel):
@@ -37,7 +36,7 @@ class FluxParams(BaseModel):
 # attention is always same shape each time it's called per H*W, so compile with fullgraph
-@torch.compile(mode="reduce-overhead", fullgraph=True, disable=DISABLE_COMPILE)
 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
     q, k = apply_rope(q, k, pe)
     x = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
@@ -45,7 +44,7 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
     return x
-@torch.compile(mode="reduce-overhead", disable=DISABLE_COMPILE)
 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
     scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
     omega = 1.0 / (theta**scale)
@@ -202,8 +201,7 @@ class DoubleStreamBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float,
         qkv_bias: bool = False,
-        dtype: torch.dtype = torch.bfloat16,
-        idx: int = 0,
     ):
         super().__init__()
         self.dtype = dtype
@@ -232,9 +230,9 @@ class DoubleStreamBlock(nn.Module):
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
-            (nn.Linear(hidden_size, mlp_hidden_dim, bias=True)),
             nn.GELU(approximate="tanh"),
-            (nn.Linear(mlp_hidden_dim, hidden_size, bias=True)),
         )
         self.K = 3
         self.H = self.num_heads
@@ -279,13 +277,13 @@ class DoubleStreamBlock(nn.Module):
         img = img + img_mod1.gate * self.img_attn.proj(img_attn)
         img = img + img_mod2.gate * self.img_mlp(
             (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
-        ).clamp(min=-384, max=384)
         # calculate the txt bloks
         txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
         txt = txt + txt_mod2.gate * self.txt_mlp(
             (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
-        ).clamp(min=-384, max=384)
         return img, txt
@@ -302,7 +300,7 @@ class SingleStreamBlock(nn.Module):
         num_heads: int,
         mlp_ratio: float = 4.0,
         qk_scale: float | None = None,
-        dtype: torch.dtype = torch.bfloat16,
     ):
         super().__init__()
         self.dtype = dtype
@@ -343,7 +341,7 @@ class SingleStreamBlock(nn.Module):
         q, k = self.norm(q, k, v)
         attn = attention(q, k, v, pe=pe)
         output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)).clamp(
-            min=-384, max=384
         )
         return x + mod.gate * output
@@ -352,11 +350,11 @@ class LastLayer(nn.Module):
     def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
         super().__init__()
         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-        self.linear = nn.Linear(
             hidden_size, patch_size * patch_size * out_channels, bias=True
         )
         self.adaLN_modulation = nn.Sequential(
-            nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
         )
     def forward(self, x: Tensor, vec: Tensor) -> Tensor:
@@ -413,9 +411,8 @@ class Flux(nn.Module):
                     mlp_ratio=params.mlp_ratio,
                     qkv_bias=params.qkv_bias,
                     dtype=self.dtype,
-                    idx=idx,
                 )
-                for idx in range(params.depth)
             ]
         )

 import math
 from torch import Tensor, nn
 from pydantic import BaseModel
 from torch.nn import functional as F
+try:
+    from cublas_ops import CublasLinear
+except ImportError:
+    CublasLinear = nn.Linear
 class FluxParams(BaseModel):
 # attention is always same shape each time it's called per H*W, so compile with fullgraph
+# @torch.compile(mode="reduce-overhead", fullgraph=True, disable=DISABLE_COMPILE)
 def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
     q, k = apply_rope(q, k, pe)
     x = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
     return x
+# @torch.compile(mode="reduce-overhead", disable=DISABLE_COMPILE)
 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
     scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
     omega = 1.0 / (theta**scale)
         num_heads: int,
         mlp_ratio: float,
         qkv_bias: bool = False,
+        dtype: torch.dtype = torch.float16,
     ):
         super().__init__()
         self.dtype = dtype
         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         self.txt_mlp = nn.Sequential(
+            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
             nn.GELU(approximate="tanh"),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
         )
         self.K = 3
         self.H = self.num_heads
         img = img + img_mod1.gate * self.img_attn.proj(img_attn)
         img = img + img_mod2.gate * self.img_mlp(
             (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
+        ).clamp(min=-384 * 2, max=384 * 2)
         # calculate the txt bloks
         txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
         txt = txt + txt_mod2.gate * self.txt_mlp(
             (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
+        ).clamp(min=-384 * 2, max=384 * 2)
         return img, txt
         num_heads: int,
         mlp_ratio: float = 4.0,
         qk_scale: float | None = None,
+        dtype: torch.dtype = torch.float16,
     ):
         super().__init__()
         self.dtype = dtype
         q, k = self.norm(q, k, v)
         attn = attention(q, k, v, pe=pe)
         output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)).clamp(
+            min=-384 * 4, max=384 * 4
         )
         return x + mod.gate * output
     def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
         super().__init__()
         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = CublasLinear(
             hidden_size, patch_size * patch_size * out_channels, bias=True
         )
         self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(), CublasLinear(hidden_size, 2 * hidden_size, bias=True)
         )
     def forward(self, x: Tensor, vec: Tensor) -> Tensor:
                     mlp_ratio=params.mlp_ratio,
                     qkv_bias=params.qkv_bias,
                     dtype=self.dtype,
                 )
+                for _ in range(params.depth)
             ]
         )

turbojpeg_imgs.py DELETED Viewed

@@ -1,134 +0,0 @@
-import numpy as np
-import torch
-from turbojpeg import (
-    TurboJPEG,
-    TJPF_GRAY,
-    TJFLAG_PROGRESSIVE,
-    TJFLAG_FASTUPSAMPLE,
-    TJFLAG_FASTDCT,
-    TJPF_RGB,
-    TJPF_BGR,
-    TJSAMP_GRAY,
-    TJSAMP_411,
-    TJSAMP_420,
-    TJSAMP_422,
-    TJSAMP_444,
-    TJSAMP_440,
-    TJSAMP_441,
-)
-class Subsampling:
-    S411 = TJSAMP_411
-    S420 = TJSAMP_420
-    S422 = TJSAMP_422
-    S444 = TJSAMP_444
-    S440 = TJSAMP_440
-    S441 = TJSAMP_441
-    GRAY = TJSAMP_GRAY
-class Flags:
-    PROGRESSIVE = TJFLAG_PROGRESSIVE
-    FASTUPSAMPLE = TJFLAG_FASTUPSAMPLE
-    FASTDCT = TJFLAG_FASTDCT
-class PixelFormat:
-    GRAY = TJPF_GRAY
-    RGB = TJPF_RGB
-    BGR = TJPF_BGR
-class TurboImage:
-    def __init__(self):
-        self.tj = TurboJPEG()
-        self.flags = Flags.PROGRESSIVE
-        self.subsampling_gray = Subsampling.GRAY
-        self.pixel_format_gray = PixelFormat.GRAY
-        self.subsampling_rgb = Subsampling.S420
-        self.pixel_format_rgb = PixelFormat.RGB
-    def set_subsampling_gray(self, subsampling):
-        self.subsampling_gray = subsampling
-    def set_subsampling_rgb(self, subsampling):
-        self.subsampling_rgb = subsampling
-    def set_pixel_format_gray(self, pixel_format):
-        self.pixel_format_gray = pixel_format
-    def set_pixel_format_rgb(self, pixel_format):
-        self.pixel_format_rgb = pixel_format
-    def set_flags(self, flags):
-        self.flags = flags
-    def encode(
-        self,
-        img,
-        subsampling,
-        pixel_format,
-        quality=90,
-    ):
-        return self.tj.encode(
-            img,
-            quality=quality,
-            flags=self.flags,
-            pixel_format=pixel_format,
-            jpeg_subsample=subsampling,
-        )
-    @torch.inference_mode()
-    def encode_torch(self, img: torch.Tensor, quality=90):
-        if img.ndim == 2:
-            subsampling = self.subsampling_gray
-            pixel_format = self.pixel_format_gray
-            img = img.clamp(0, 255).cpu().contiguous().numpy().astype(np.uint8)
-        elif img.ndim == 3:
-            subsampling = self.subsampling_rgb
-            pixel_format = self.pixel_format_rgb
-            if img.shape[0] == 3:
-                img = (
-                    img.permute(1, 2, 0)
-                    .clamp(0, 255)
-                    .cpu()
-                    .contiguous()
-                    .numpy()
-                    .astype(np.uint8)
-                )
-            elif img.shape[2] == 3:
-                img = img.clamp(0, 255).cpu().contiguous().numpy().astype(np.uint8)
-            else:
-                raise ValueError(f"Unsupported image shape: {img.shape}")
-        else:
-            raise ValueError(f"Unsupported image num dims: {img.ndim}")
-        return self.encode(
-            img,
-            quality=quality,
-            subsampling=subsampling,
-            pixel_format=pixel_format,
-        )
-    def encode_numpy(self, img: np.ndarray, quality=90):
-        if img.ndim == 2:
-            subsampling = self.subsampling_gray
-            pixel_format = self.pixel_format_gray
-        elif img.ndim == 3:
-            if img.shape[0] == 3:
-                img = np.ascontiguousarray(img.transpose(1, 2, 0))
-            elif img.shape[2] == 3:
-                img = np.ascontiguousarray(img)
-            else:
-                raise ValueError(f"Unsupported image shape: {img.shape}")
-            subsampling = self.subsampling_rgb
-            pixel_format = self.pixel_format_rgb
-        else:
-            raise ValueError(f"Unsupported image num dims: {img.ndim}")
-        img = img.clip(0, 255).astype(np.uint8)
-        return self.encode(
-            img, quality=quality, subsampling=subsampling, pixel_format=pixel_format
-        )

util.py CHANGED Viewed

@@ -50,6 +50,9 @@ class ModelSpec(BaseModel):
     text_enc_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
     ae_quantization_dtype: Optional[QuantizationDtype] = None
     clip_quantization_dtype: Optional[QuantizationDtype] = None
     model_config: ConfigDict = {
         "arbitrary_types_allowed": True,
@@ -242,6 +245,9 @@ def load_autoencoder(config: ModelSpec) -> AutoEncoder:
             current_quants=0,
             quantization_dtype=into_qtype(config.ae_quantization_dtype),
         )
     return ae

     text_enc_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
     ae_quantization_dtype: Optional[QuantizationDtype] = None
     clip_quantization_dtype: Optional[QuantizationDtype] = None
+    offload_text_encoder: bool = False
+    offload_vae: bool = False
+    offload_flow: bool = False
     model_config: ConfigDict = {
         "arbitrary_types_allowed": True,
             current_quants=0,
             quantization_dtype=into_qtype(config.ae_quantization_dtype),
         )
+        if config.offload_vae:
+            ae.to("cpu")
+            torch.cuda.empty_cache()
     return ae