Safetensors: aredden committed af20799 (unverified) · 2 parent(s): b84f35e 49f2076

Merge pull request #3 from aredden/improved_precision

Files changed (8)
  1. README.md +33 -28
  2. float8_quantize.py +85 -18
  3. flux_pipeline.py +95 -59
  4. main.py +18 -0
  5. modules/conditioner.py +10 -10
  6. modules/flux_model.py +234 -51
  7. modules/flux_model_f8.py +0 -491
  8. util.py +20 -8
README.md CHANGED
@@ -41,6 +41,19 @@ Note:
41
  - [Examples](#examples)
42
  - [License](#license)
43
 
 
44
  ## Installation
45
 
46
  This repo _requires_ at least PyTorch with CUDA 12.4 and an Ada-architecture GPU with fp8 support; otherwise `torch._scaled_mm` will throw a CUDA error saying it's not supported. To install with conda/mamba:
@@ -106,30 +119,8 @@ python main.py --config-path <path_to_config> --port <port_number> --host <host_
106
  - `--no-offload-ae`: Disable offloading the autoencoder to the CPU when not being used to increase e2e inference speed (default: True [implies it will offload, setting this flag sets it to False]).
107
  - `--no-offload-text-enc`: Disable offloading the text encoder to the CPU when not being used to increase e2e inference speed (default: True [implies it will offload, setting this flag sets it to False]).
108
  - `--prequantized-flow`: Load the flow model from a prequantized checkpoint, which reduces the size of the checkpoint by about 50% & reduces startup time (default: False).
109
-
110
- ## Examples
111
-
112
- ### Running the Server
113
-
114
- ```bash
115
- python main.py --config-path configs/config-dev-1-4090.json --port 8088 --host 0.0.0.0
116
- ```
117
-
118
- Or if you need more granular control over the all of the settings, you can run the server with something like this:
119
-
120
- ```bash
121
- python main.py --port 8088 --host 0.0.0.0 \
122
- --flow-model-path /path/to/your/flux1-dev.sft \
123
- --text-enc-path /path/to/your/t5-v1_1-xxl-encoder-bf16 \
124
- --autoencoder-path /path/to/your/ae.sft \
125
- --model-version flux-dev \
126
- --flux-device cuda:0 \
127
- --text-enc-device cuda:0 \
128
- --autoencoder-device cuda:0 \
129
- --compile \
130
- --quant-text-enc qfloat8 \
131
- --quant-ae
132
- ```
133
 
134
  ## Configuration
135
 
@@ -185,7 +176,10 @@ Example configuration file for a single 4090 (`configs/config-dev-offload-1-4090
185
  "compile_blocks": true, // compile the single-blocks and double-blocks
186
  "offload_text_encoder": true, // offload the text encoder to cpu when not in use
187
  "offload_vae": true, // offload the autoencoder to cpu when not in use
188
- "offload_flow": false // offload the flow transformer to cpu when not in use
 
189
  }
190
  ```
191
 
@@ -232,6 +226,17 @@ Other things to change can be the
232
  - `"ae_device": "cuda:0",`
233
  Device for the autoencoder (default: `cuda:0`). Set this to a different device, e.g. `"cuda:1"`, if you have multiple GPUs so that you can disable autoencoder offloading; it does not need to be the same as flux_device or text_enc_device.
234
 
 
235
  ## API Endpoints
236
 
237
  ### Generate Image
@@ -256,10 +261,10 @@ Other things to change can be the
256
  ### Running the Server
257
 
258
  ```bash
259
- python main.py --config-path configs/config-dev-offload-1-4090.json --port 8088 --host 0.0.0.0
260
  ```
261
 
262
- OR, if you need more granular control over the server, you can run the server with something like this:
263
 
264
  ```bash
265
  python main.py --port 8088 --host 0.0.0.0 \
@@ -275,7 +280,7 @@ python main.py --port 8088 --host 0.0.0.0 \
275
  --quant-ae
276
  ```
277
 
278
- ### Generating an Image
279
 
280
  Send a POST request to `http://<host>:<port>/generate` with the following JSON body:
281
 
 
41
  - [Examples](#examples)
42
  - [License](#license)
43
 
44
+ ### Updates 08/24/24
45
+
46
+ - Add config options for levels of quantization for the flow transformer:
47
+ - `quantize_modulation`: Quantize the modulation layers in the flow model. If false, adds ~2GB vram usage for moderate precision improvements `(default: true)`
48
+ - `quantize_flow_embedder_layers`: Quantize the flow embedder layers in the flow model. If false, adds ~512MB vram usage, but precision improves considerably. `(default: false)`
49
+ - Override default config values when loading FluxPipeline, e.g. `FluxPipeline.load_pipeline_from_config_path(config_path, **config_overrides)` (see the example below)
50
+
51
+ #### Fixes
52
+
53
+ - Fix a bug where loading the text encoder from HF with bitsandbytes (bnb) would error if the device was not set to cuda:0
54
+
55
+ **Note:** prequantized flow models only work with the same quantization levels that were used when they were created, e.g. if you create a prequantized flow model with `quantize_modulation` set to false, it will only work with `quantize_modulation` set to false; the same applies to `quantize_flow_embedder_layers`.
56
+
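For reference, here is a minimal, hedged sketch of the override mechanism mentioned above; the config path and the override values are illustrative and not part of this commit. Any keyword that matches a field on the loaded config is applied before the pipeline is built.

```python
# Minimal sketch, assuming the example config shipped with the repo is available.
from flux_pipeline import FluxPipeline

pipeline = FluxPipeline.load_pipeline_from_config_path(
    "configs/config-dev-1-4090.json",     # example config referenced in this README
    quantize_modulation=False,            # trade ~2GB of VRAM for better precision
    quantize_flow_embedder_layers=False,  # leave the embedder layers unquantized (default)
)
```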
57
  ## Installation
58
 
59
  This repo _requires_ at least PyTorch with CUDA 12.4 and an Ada-architecture GPU with fp8 support; otherwise `torch._scaled_mm` will throw a CUDA error saying it's not supported. To install with conda/mamba:
 
119
  - `--no-offload-ae`: Disable offloading the autoencoder to the CPU when not being used to increase e2e inference speed (default: True [implies it will offload, setting this flag sets it to False]).
120
  - `--no-offload-text-enc`: Disable offloading the text encoder to the CPU when not being used to increase e2e inference speed (default: True [implies it will offload, setting this flag sets it to False]).
121
  - `--prequantized-flow`: Load the flow model from a prequantized checkpoint, which reduces the size of the checkpoint by about 50% & reduces startup time (default: False).
122
+ - `--no-quantize-flow-modulation`: Disable quantization of the modulation layers in the flow transformer, which improves precision _moderately_ but adds ~2GB vram usage.
123
+ - `--quantize-flow-embedder-layers`: Quantize the flow embedder layers in the flow transformer, which reduces precision _considerably_ but saves ~512MB of VRAM.
 
124
 
125
  ## Configuration
126
 
 
176
  "compile_blocks": true, // compile the single-blocks and double-blocks
177
  "offload_text_encoder": true, // offload the text encoder to cpu when not in use
178
  "offload_vae": true, // offload the autoencoder to cpu when not in use
179
+ "offload_flow": false, // offload the flow transformer to cpu when not in use
180
+ "prequantized_flow": false, // load the flow transformer from a prequantized checkpoint, which reduces the size of the checkpoint by about 50% & reduces startup time (default: false)
181
+ "quantize_modulation": true, // quantize the modulation layers in the flow transformer, which reduces precision moderately but saves ~2GB vram usage (default: true)
182
+ "quantize_flow_embedder_layers": false, // quantize the flow embedder layers in the flow transformer, if false, improves precision considerably at the cost of adding ~512MB vram usage (default: false)
183
  }
184
  ```
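As a hedged aside (not part of this commit), the JSON above is parsed into the repo's ModelSpec config, so the new fields can be inspected on the loaded object before building the pipeline; the config path below is one of the example configs referenced in this README.

```python
# Minimal sketch: load a config and check the new quantization-related fields.
from util import load_config_from_path

config = load_config_from_path("configs/config-dev-offload-1-4090.json")
print(config.prequantized_flow, config.quantize_modulation, config.quantize_flow_embedder_layers)
```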
185
 
 
226
  - `"ae_device": "cuda:0",`
227
  Device for the autoencoder (default: `cuda:0`). Set this to a different device, e.g. `"cuda:1"`, if you have multiple GPUs so that you can disable autoencoder offloading; it does not need to be the same as flux_device or text_enc_device.
228
 
229
+ - `"prequantized_flow": false,`
230
+ load the flow transformer from a prequantized checkpoint, which reduces the size of the checkpoint by about 50% & reduces startup time (default: false)
231
+
232
+ - Note: MUST be a prequantized checkpoint created with the same quantization settings as the current config, and must have been quantized using this repo.
233
+
234
+ - `"quantize_modulation": true,`
235
+ quantize the modulation layers in the flow transformer (default: true); setting this to false improves precision moderately at the cost of ~2GB more VRAM
236
+
237
+ - `"quantize_flow_embedder_layers": false,`
238
+ quantize the flow embedder layers in the flow transformer (default: false); leaving this false improves precision considerably at the cost of ~512MB more VRAM
239
+
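To make the interaction with `prequantized_flow` concrete, here is a hedged sketch (not a script from this repo) of how such a checkpoint could be produced; the output filename is an assumption, and the function call mirrors the one made in flux_pipeline.py.

```python
# Sketch only: per main.py's help text, a prequantized checkpoint is the
# float8-quantized flow model's state_dict saved as a safetensors file.
import torch
from safetensors.torch import save_file

from float8_quantize import quantize_flow_transformer_and_dispatch_float8
from modules.flux_model import Flux

flow = Flux.from_pretrained("configs/config-dev-1-4090.json", dtype=torch.float16)
flow = quantize_flow_transformer_and_dispatch_float8(
    flow,
    torch.device("cuda:0"),
    swap_linears_with_cublaslinear=False,  # keep nn.Linear/F8Linear weights in the saved checkpoint
    quantize_modulation=True,              # must match the config used to load it later
    quantize_flow_embedder_layers=False,   # must match the config used to load it later
)
save_file(flow.state_dict(), "flux1-dev-fp8.safetensors")  # hypothetical filename
```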
240
  ## API Endpoints
241
 
242
  ### Generate Image
 
261
  ### Running the Server
262
 
263
  ```bash
264
+ python main.py --config-path configs/config-dev-1-4090.json --port 8088 --host 0.0.0.0
265
  ```
266
 
267
+ Or, if you need more granular control over all of the settings, you can run the server with something like this:
268
 
269
  ```bash
270
  python main.py --port 8088 --host 0.0.0.0 \
 
280
  --quant-ae
281
  ```
282
 
283
+ ### Generating an Image on a Client
284
 
285
  Send a POST request to `http://<host>:<port>/generate` with the following JSON body:
286
 
float8_quantize.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
  import torch.nn as nn
3
  from torchao.float8.float8_utils import (
@@ -10,7 +11,8 @@ import math
10
  from torch.compiler import is_compiling
11
  from torch import __version__
12
  from torch.version import cuda
13
- from typing import TypeVar
 
14
 
15
  IS_TORCH_2_4 = __version__ < (2, 4, 9)
16
  LT_TORCH_2_4 = __version__ < (2, 4)
@@ -42,7 +44,7 @@ class F8Linear(nn.Module):
42
  float8_dtype=torch.float8_e4m3fn,
43
  float_weight: torch.Tensor = None,
44
  float_bias: torch.Tensor = None,
45
- num_scale_trials: int = 24,
46
  input_float8_dtype=torch.float8_e5m2,
47
  ) -> None:
48
  super().__init__()
@@ -183,6 +185,11 @@ class F8Linear(nn.Module):
183
  1, dtype=self.weight.dtype, device=self.weight.device, requires_grad=False
184
  )
185
 
 
186
  def quantize_input(self, x: torch.Tensor):
187
  if self.input_scale_initialized:
188
  return to_fp8_saturated(x * self.input_scale, self.input_float8_dtype)
@@ -279,10 +286,12 @@ class F8Linear(nn.Module):
279
  return f8_lin
280
 
281
 
 
282
  def recursive_swap_linears(
283
  model: nn.Module,
284
  float8_dtype=torch.float8_e4m3fn,
285
  input_float8_dtype=torch.float8_e5m2,
 
286
  ) -> None:
287
  """
288
  Recursively swaps all nn.Linear modules in the given model with F8Linear modules.
@@ -300,6 +309,8 @@ def recursive_swap_linears(
300
  all linear layers in the model will be using 8-bit quantization.
301
  """
302
  for name, child in model.named_children():
 
 
303
  if isinstance(child, nn.Linear) and not isinstance(
304
  child, (F8Linear, CublasLinear)
305
  ):
@@ -315,7 +326,35 @@ def recursive_swap_linears(
315
  )
316
  del child
317
  else:
318
- recursive_swap_linears(child)
 
319
 
320
 
321
  @torch.inference_mode()
@@ -325,6 +364,10 @@ def quantize_flow_transformer_and_dispatch_float8(
325
  float8_dtype=torch.float8_e4m3fn,
326
  input_float8_dtype=torch.float8_e5m2,
327
  offload_flow=False,
 
328
  ) -> nn.Module:
329
  """
330
  Quantize the flux flow transformer model (original BFL codebase version) and dispatch to the given device.
@@ -334,19 +377,36 @@ def quantize_flow_transformer_and_dispatch_float8(
334
  Allows for fast dispatch to gpu & quantize without causing OOM on gpus with limited memory.
335
 
336
  After dispatching, if offload_flow is True, offloads the model to cpu.
 
337
  """
338
  for module in flow_model.double_blocks:
339
  module.to(device)
340
  module.eval()
341
  recursive_swap_linears(
342
- module, float8_dtype=float8_dtype, input_float8_dtype=input_float8_dtype
 
 
 
343
  )
344
  torch.cuda.empty_cache()
345
  for module in flow_model.single_blocks:
346
  module.to(device)
347
  module.eval()
348
  recursive_swap_linears(
349
- module, float8_dtype=float8_dtype, input_float8_dtype=input_float8_dtype
 
 
 
350
  )
351
  torch.cuda.empty_cache()
352
  to_gpu_extras = [
@@ -367,23 +427,30 @@ def quantize_flow_transformer_and_dispatch_float8(
367
  if isinstance(m_extra, nn.Linear) and not isinstance(
368
  m_extra, (F8Linear, CublasLinear)
369
  ):
370
- setattr(
371
- flow_model,
372
- module,
373
- F8Linear.from_linear(
 
374
  m_extra,
375
  float8_dtype=float8_dtype,
376
  input_float8_dtype=input_float8_dtype,
377
- ),
378
- )
379
- del m_extra
380
- elif module != "final_layer":
381
- recursive_swap_linears(
382
- m_extra,
383
- float8_dtype=float8_dtype,
384
- input_float8_dtype=input_float8_dtype,
385
- )
386
  torch.cuda.empty_cache()
 
387
  if offload_flow:
388
  flow_model.to("cpu")
389
  torch.cuda.empty_cache()
 
1
+ from loguru import logger
2
  import torch
3
  import torch.nn as nn
4
  from torchao.float8.float8_utils import (
 
11
  from torch.compiler import is_compiling
12
  from torch import __version__
13
  from torch.version import cuda
14
+
15
+ from modules.flux_model import Modulation
16
 
17
  IS_TORCH_2_4 = __version__ < (2, 4, 9)
18
  LT_TORCH_2_4 = __version__ < (2, 4)
 
44
  float8_dtype=torch.float8_e4m3fn,
45
  float_weight: torch.Tensor = None,
46
  float_bias: torch.Tensor = None,
47
+ num_scale_trials: int = 12,
48
  input_float8_dtype=torch.float8_e5m2,
49
  ) -> None:
50
  super().__init__()
 
185
  1, dtype=self.weight.dtype, device=self.weight.device, requires_grad=False
186
  )
187
 
188
+ def set_weight_tensor(self, tensor: torch.Tensor):
189
+ self.weight.data = tensor
190
+ self.weight_initialized = False
191
+ self.quantize_weight()
192
+
193
  def quantize_input(self, x: torch.Tensor):
194
  if self.input_scale_initialized:
195
  return to_fp8_saturated(x * self.input_scale, self.input_float8_dtype)
 
286
  return f8_lin
287
 
288
 
289
+ @torch.inference_mode()
290
  def recursive_swap_linears(
291
  model: nn.Module,
292
  float8_dtype=torch.float8_e4m3fn,
293
  input_float8_dtype=torch.float8_e5m2,
294
+ quantize_modulation: bool = True,
295
  ) -> None:
296
  """
297
  Recursively swaps all nn.Linear modules in the given model with F8Linear modules.
 
309
  all linear layers in the model will be using 8-bit quantization.
310
  """
311
  for name, child in model.named_children():
312
+ if isinstance(child, Modulation) and not quantize_modulation:
313
+ continue
314
  if isinstance(child, nn.Linear) and not isinstance(
315
  child, (F8Linear, CublasLinear)
316
  ):
 
326
  )
327
  del child
328
  else:
329
+ recursive_swap_linears(
330
+ child,
331
+ float8_dtype=float8_dtype,
332
+ input_float8_dtype=input_float8_dtype,
333
+ quantize_modulation=quantize_modulation,
334
+ )
335
+
336
+
337
+ @torch.inference_mode()
338
+ def swap_to_cublaslinear(model: nn.Module):
339
+ if not isinstance(CublasLinear, torch.nn.Module):
340
+ return
341
+ for name, child in model.named_children():
342
+ if isinstance(child, nn.Linear) and not isinstance(
343
+ child, (F8Linear, CublasLinear)
344
+ ):
345
+ cublas_lin = CublasLinear(
346
+ child.in_features,
347
+ child.out_features,
348
+ bias=child.bias is not None,
349
+ dtype=child.weight.dtype,
350
+ device=child.weight.device,
351
+ )
352
+ cublas_lin.weight.data = child.weight.clone().detach()
353
+ cublas_lin.bias.data = child.bias.clone().detach()
354
+ setattr(model, name, cublas_lin)
355
+ del child
356
+ else:
357
+ swap_to_cublaslinear(child)
358
 
359
 
360
  @torch.inference_mode()
 
364
  float8_dtype=torch.float8_e4m3fn,
365
  input_float8_dtype=torch.float8_e5m2,
366
  offload_flow=False,
367
+ swap_linears_with_cublaslinear=True,
368
+ flow_dtype=torch.float16,
369
+ quantize_modulation: bool = True,
370
+ quantize_flow_embedder_layers: bool = True,
371
  ) -> nn.Module:
372
  """
373
  Quantize the flux flow transformer model (original BFL codebase version) and dispatch to the given device.
 
377
  Allows for fast dispatch to gpu & quantize without causing OOM on gpus with limited memory.
378
 
379
  After dispatching, if offload_flow is True, offloads the model to cpu.
380
+
381
+ If swap_linears_with_cublaslinear is True and flow_dtype == torch.float16, swaps all linears with CublasLinear layers for a ~2x performance boost on consumer GPUs.
382
+ Otherwise will skip the cublaslinear swap.
383
+
384
+ For added extra precision, you can set quantize_flow_embedder_layers to False,
385
+ this helps maintain the output quality of the flow transformer better than fully quantizing,
386
+ at the expense of ~512MB more VRAM usage.
387
+
388
+ For added extra precision, you can set quantize_modulation to False,
389
+ this helps maintain the output quality of the flow transformer better than fully quantizing,
390
+ at the expense of ~2GB more VRAM usage, but it has a much higher impact on image quality than the embedder layers.
391
  """
392
  for module in flow_model.double_blocks:
393
  module.to(device)
394
  module.eval()
395
  recursive_swap_linears(
396
+ module,
397
+ float8_dtype=float8_dtype,
398
+ input_float8_dtype=input_float8_dtype,
399
+ quantize_modulation=quantize_modulation,
400
  )
401
  torch.cuda.empty_cache()
402
  for module in flow_model.single_blocks:
403
  module.to(device)
404
  module.eval()
405
  recursive_swap_linears(
406
+ module,
407
+ float8_dtype=float8_dtype,
408
+ input_float8_dtype=input_float8_dtype,
409
+ quantize_modulation=quantize_modulation,
410
  )
411
  torch.cuda.empty_cache()
412
  to_gpu_extras = [
 
427
  if isinstance(m_extra, nn.Linear) and not isinstance(
428
  m_extra, (F8Linear, CublasLinear)
429
  ):
430
+ if quantize_flow_embedder_layers:
431
+ setattr(
432
+ flow_model,
433
+ module,
434
+ F8Linear.from_linear(
435
+ m_extra,
436
+ float8_dtype=float8_dtype,
437
+ input_float8_dtype=input_float8_dtype,
438
+ ),
439
+ )
440
+ del m_extra
441
+ elif module != "final_layer":
442
+ if quantize_flow_embedder_layers:
443
+ recursive_swap_linears(
444
  m_extra,
445
  float8_dtype=float8_dtype,
446
  input_float8_dtype=input_float8_dtype,
447
+ quantize_modulation=quantize_modulation,
448
+ )
 
449
  torch.cuda.empty_cache()
450
+ if swap_linears_with_cublaslinear and flow_dtype == torch.float16:
451
+ swap_to_cublaslinear(flow_model)
452
+ elif swap_linears_with_cublaslinear and flow_dtype != torch.float16:
453
+ logger.warning("Skipping cublas linear swap because flow_dtype is not float16")
454
  if offload_flow:
455
  flow_model.to("cpu")
456
  torch.cuda.empty_cache()
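As a usage sketch of the helpers changed above (toy module and sizes are illustrative; per the README this requires a GPU with fp8 support):

```python
# Hedged sketch: swapping linears for F8Linear on a toy module. The dtypes
# mirror the defaults in this file; nothing here is part of the commit itself.
import torch
import torch.nn as nn

from float8_quantize import F8Linear, recursive_swap_linears


class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(64, 64)
        self.fc2 = nn.Linear(64, 64)


lin = nn.Linear(64, 64, dtype=torch.bfloat16, device="cuda")
f8 = F8Linear.from_linear(
    lin,
    float8_dtype=torch.float8_e4m3fn,
    input_float8_dtype=torch.float8_e5m2,
)  # weight stored as float8_e4m3fn; the input scale is tuned over the first forward passes

toy = Toy().cuda().bfloat16()
# Swaps every nn.Linear in place; with quantize_modulation=False, Modulation
# submodules (none in this toy model) are skipped.
recursive_swap_linears(toy, quantize_modulation=False)
```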
flux_pipeline.py CHANGED
@@ -31,6 +31,7 @@ from torchvision.transforms import functional as TF
31
  from tqdm import tqdm
32
  from util import (
33
  ModelSpec,
 
34
  into_device,
35
  into_dtype,
36
  load_config_from_path,
@@ -80,29 +81,17 @@ class FluxPipeline:
80
  This class is responsible for preparing input tensors for the Flux model, generating
81
  timesteps and noise, and handling device management for model offloading.
82
  """
 
83
  self.debug = debug
84
  self.name = name
85
- self.device_flux = (
86
- flux_device
87
- if isinstance(flux_device, torch.device)
88
- else torch.device(flux_device)
89
- )
90
- self.device_ae = (
91
- ae_device
92
- if isinstance(ae_device, torch.device)
93
- else torch.device(ae_device)
94
- )
95
- self.device_clip = (
96
- clip_device
97
- if isinstance(clip_device, torch.device)
98
- else torch.device(clip_device)
99
- )
100
- self.device_t5 = (
101
- t5_device
102
- if isinstance(t5_device, torch.device)
103
- else torch.device(t5_device)
104
- )
105
- self.dtype = dtype
106
  self.offload = offload
107
  self.clip: "HFEmbedder" = clip
108
  self.t5: "HFEmbedder" = t5
@@ -116,6 +105,8 @@ class FluxPipeline:
116
  self.offload_text_encoder = config.offload_text_encoder
117
  self.offload_vae = config.offload_vae
118
  self.offload_flow = config.offload_flow
 
 
119
  if not self.offload_flow:
120
  self.model.to(self.device_flux)
121
  if not self.offload_vae:
@@ -124,40 +115,16 @@ class FluxPipeline:
124
  self.clip.to(self.device_clip)
125
  self.t5.to(self.device_t5)
126
 
127
- if self.config.compile_blocks or self.config.compile_extras:
128
- if not self.config.prequantized_flow:
129
- logger.info("Running warmups for compile...")
130
- warmup_dict = dict(
131
- prompt="A beautiful test image used to solidify the fp8 nn.Linear input scales prior to compilation 😉",
132
- height=768,
133
- width=768,
134
- num_steps=25,
135
- guidance=3.5,
136
- seed=10,
137
- )
138
- self.generate(**warmup_dict)
139
- to_gpu_extras = [
140
- "vector_in",
141
- "img_in",
142
- "txt_in",
143
- "time_in",
144
- "guidance_in",
145
- "final_layer",
146
- "pe_embedder",
147
- ]
148
- if self.config.compile_blocks:
149
- for block in self.model.double_blocks:
150
- block.compile()
151
- for block in self.model.single_blocks:
152
- block.compile()
153
- if self.config.compile_extras:
154
- for extra in to_gpu_extras:
155
- getattr(self.model, extra).compile()
156
-
157
- def set_seed(self, seed: int | None = None) -> torch.Generator:
158
  if isinstance(seed, (int, float)):
159
  seed = int(abs(seed)) % MAX_RAND
160
- self.rng = torch.manual_seed(seed)
161
  elif isinstance(seed, str):
162
  try:
163
  seed = abs(int(seed)) % MAX_RAND
@@ -166,14 +133,71 @@ class FluxPipeline:
166
  f"Recieved string representation of seed, but was not able to convert to int: {seed}, using random seed"
167
  )
168
  seed = abs(self.rng.seed()) % MAX_RAND
 
169
  else:
170
  seed = abs(self.rng.seed()) % MAX_RAND
171
- torch.cuda.manual_seed_all(seed)
172
- np.random.seed(seed)
173
- random.seed(seed)
174
- cuda_generator = torch.Generator("cuda").manual_seed(seed)
 
 
175
  return cuda_generator, seed
176
 
 
177
  @torch.inference_mode()
178
  def prepare(
179
  self,
@@ -608,12 +632,18 @@ class FluxPipeline:
608
 
609
  @classmethod
610
  def load_pipeline_from_config_path(
611
- cls, path: str, flow_model_path: str = None, debug: bool = False
612
  ) -> "FluxPipeline":
613
  with torch.inference_mode():
614
  config = load_config_from_path(path)
615
  if flow_model_path:
616
  config.ckpt_path = flow_model_path
 
617
  return cls.load_pipeline_from_config(config, debug=debug)
618
 
619
  @classmethod
@@ -639,7 +669,13 @@ class FluxPipeline:
639
 
640
  if not config.prequantized_flow:
641
  flow_model = quantize_flow_transformer_and_dispatch_float8(
642
- flow_model, flux_device, offload_flow=config.offload_flow
 
643
  )
644
  else:
645
  flow_model.eval().requires_grad_(False)
 
31
  from tqdm import tqdm
32
  from util import (
33
  ModelSpec,
34
+ ModelVersion,
35
  into_device,
36
  into_dtype,
37
  load_config_from_path,
 
81
  This class is responsible for preparing input tensors for the Flux model, generating
82
  timesteps and noise, and handling device management for model offloading.
83
  """
84
+
85
+ if config is None:
86
+ raise ValueError("ModelSpec config is required!")
87
+
88
  self.debug = debug
89
  self.name = name
90
+ self.device_flux = into_device(flux_device)
91
+ self.device_ae = into_device(ae_device)
92
+ self.device_clip = into_device(clip_device)
93
+ self.device_t5 = into_device(t5_device)
94
+ self.dtype = into_dtype(dtype)
 
95
  self.offload = offload
96
  self.clip: "HFEmbedder" = clip
97
  self.t5: "HFEmbedder" = t5
 
105
  self.offload_text_encoder = config.offload_text_encoder
106
  self.offload_vae = config.offload_vae
107
  self.offload_flow = config.offload_flow
108
+ # If models are not offloaded, move them to the appropriate devices
109
+
110
  if not self.offload_flow:
111
  self.model.to(self.device_flux)
112
  if not self.offload_vae:
 
115
  self.clip.to(self.device_clip)
116
  self.t5.to(self.device_t5)
117
 
118
+ # compile the model if needed
119
+ if config.compile_blocks or config.compile_extras:
120
+ self.compile()
121
+
122
+ def set_seed(
123
+ self, seed: int | None = None, seed_globally: bool = False
124
+ ) -> torch.Generator:
 
125
  if isinstance(seed, (int, float)):
126
  seed = int(abs(seed)) % MAX_RAND
127
+ cuda_generator = torch.Generator("cuda").manual_seed(seed)
128
  elif isinstance(seed, str):
129
  try:
130
  seed = abs(int(seed)) % MAX_RAND
 
133
  f"Recieved string representation of seed, but was not able to convert to int: {seed}, using random seed"
134
  )
135
  seed = abs(self.rng.seed()) % MAX_RAND
136
+ cuda_generator = torch.Generator("cuda").manual_seed(seed)
137
  else:
138
  seed = abs(self.rng.seed()) % MAX_RAND
139
+ cuda_generator = torch.Generator("cuda").manual_seed(seed)
140
+
141
+ if seed_globally:
142
+ torch.cuda.manual_seed_all(seed)
143
+ np.random.seed(seed)
144
+ random.seed(seed)
145
  return cuda_generator, seed
146
 
147
+ @torch.inference_mode()
148
+ def compile(self):
149
+ """
150
+ Compiles the model and extras.
151
+
152
+ There are two cases:
153
+
154
+ - A) Checkpoint which already has float8 quantized weights and tuned input scales.
155
+ In which case, it will not run warmups since it assumes the input scales are already tuned.
156
+
157
+ - B) Checkpoint which has not been quantized, in which case it will be quantized
158
+ and the input scales will be tuned by running a warmup loop.
159
+ - If the model is flux-schnell, it will run 3 warmup loops since each loop is 4 steps.
160
+ - If the model is flux-dev, it will run 1 warmup loop for 12 steps.
161
+
162
+ """
163
+
164
+ # Run warmups if the checkpoint is not prequantized
165
+ if not self.config.prequantized_flow:
166
+ logger.info("Running warmups for compile...")
167
+ warmup_dict = dict(
168
+ prompt="A beautiful test image used to solidify the fp8 nn.Linear input scales prior to compilation 😉",
169
+ height=768,
170
+ width=768,
171
+ num_steps=12,
172
+ guidance=3.5,
173
+ seed=10,
174
+ )
175
+ if self.config.version == ModelVersion.flux_schnell:
176
+ warmup_dict["num_steps"] = 4
177
+ for _ in range(3):
178
+ self.generate(**warmup_dict)
179
+ else:
180
+ self.generate(**warmup_dict)
181
+
182
+ # Compile the model and extras
183
+ to_gpu_extras = [
184
+ "vector_in",
185
+ "img_in",
186
+ "txt_in",
187
+ "time_in",
188
+ "guidance_in",
189
+ "final_layer",
190
+ "pe_embedder",
191
+ ]
192
+ if self.config.compile_blocks:
193
+ for block in self.model.double_blocks:
194
+ block.compile()
195
+ for block in self.model.single_blocks:
196
+ block.compile()
197
+ if self.config.compile_extras:
198
+ for extra in to_gpu_extras:
199
+ getattr(self.model, extra).compile()
200
+
201
  @torch.inference_mode()
202
  def prepare(
203
  self,
 
632
 
633
  @classmethod
634
  def load_pipeline_from_config_path(
635
+ cls, path: str, flow_model_path: str = None, debug: bool = False, **kwargs
636
  ) -> "FluxPipeline":
637
  with torch.inference_mode():
638
  config = load_config_from_path(path)
639
  if flow_model_path:
640
  config.ckpt_path = flow_model_path
641
+ for k, v in kwargs.items():
642
+ if hasattr(config, k):
643
+ logger.info(
644
+ f"Overriding config {k}:{getattr(config, k)} with value {v}"
645
+ )
646
+ setattr(config, k, v)
647
  return cls.load_pipeline_from_config(config, debug=debug)
648
 
649
  @classmethod
 
669
 
670
  if not config.prequantized_flow:
671
  flow_model = quantize_flow_transformer_and_dispatch_float8(
672
+ flow_model,
673
+ flux_device,
674
+ offload_flow=config.offload_flow,
675
+ swap_linears_with_cublaslinear=flux_dtype == torch.float16,
676
+ flow_dtype=flux_dtype,
677
+ quantize_modulation=config.quantize_modulation,
678
+ quantize_flow_embedder_layers=config.quantize_flow_embedder_layers,
679
  )
680
  else:
681
  flow_model.eval().requires_grad_(False)
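A short, hedged usage sketch tying the pipeline entry points above together; the prompt and output handling are illustrative, and the generate() keyword arguments mirror the warmup_dict used in compile().

```python
# Sketch only: load the pipeline from an example config and generate an image.
from flux_pipeline import FluxPipeline

pipe = FluxPipeline.load_pipeline_from_config_path(
    "configs/config-dev-offload-1-4090.json",
    debug=False,
)
result = pipe.generate(
    prompt="a misty forest at sunrise, light rays between the trees",
    height=768,
    width=768,
    num_steps=24,
    guidance=3.5,
    seed=10,
)
# `result` is the encoded image payload returned by the pipeline; its exact type
# is not shown in this diff, so handling it is left out of this sketch.
```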
main.py CHANGED
@@ -129,6 +129,22 @@ def parse_args():
129
  + "and then saving the state_dict as a safetensors file), "
130
  + "which reduces the size of the checkpoint by about 50% & reduces startup time",
131
  )
 
132
  return parser.parse_args()
133
 
134
 
@@ -171,6 +187,8 @@ def main():
171
  offload_ae=args.offload_ae,
172
  offload_text_enc=args.offload_text_enc,
173
  prequantized_flow=args.prequantized_flow,
 
 
174
  )
175
  app.state.model = FluxPipeline.load_pipeline_from_config(config)
176
 
 
129
  + "and then saving the state_dict as a safetensors file), "
130
  + "which reduces the size of the checkpoint by about 50% & reduces startup time",
131
  )
132
+ parser.add_argument(
133
+ "-nqfm",
134
+ "--no-quantize-flow-modulation",
135
+ action="store_false",
136
+ default=True,
137
+ dest="quantize_modulation",
138
+ help="Disable quantization of the modulation layers in the flow model, adds ~2GB vram usage for moderate precision improvements",
139
+ )
140
+ parser.add_argument(
141
+ "-qfl",
142
+ "--quantize-flow-embedder-layers",
143
+ action="store_true",
144
+ default=False,
145
+ dest="quantize_flow_embedder_layers",
146
+ help="Quantize the flow embedder layers in the flow model, saves ~512MB vram usage, but precision loss is very noticeable",
147
+ )
148
  return parser.parse_args()
149
 
150
 
 
187
  offload_ae=args.offload_ae,
188
  offload_text_enc=args.offload_text_enc,
189
  prequantized_flow=args.prequantized_flow,
190
+ quantize_modulation=args.quantize_modulation,
191
+ quantize_flow_embedder_layers=args.quantize_flow_embedder_layers,
192
  )
193
  app.state.model = FluxPipeline.load_pipeline_from_config(config)
194
 
modules/conditioner.py CHANGED
@@ -56,6 +56,16 @@ class HFEmbedder(nn.Module):
56
  self.max_length = max_length
57
  self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
58
 
 
59
  if self.is_clip:
60
  self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
61
  version, max_length=max_length
@@ -64,11 +74,6 @@ class HFEmbedder(nn.Module):
64
  self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
65
  version,
66
  **hf_kwargs,
67
- quantization_config=(
68
- auto_quantization_config(quantization_dtype)
69
- if quantization_dtype
70
- else None
71
- ),
72
  )
73
 
74
  else:
@@ -78,11 +83,6 @@ class HFEmbedder(nn.Module):
78
  self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
79
  version,
80
  **hf_kwargs,
81
- quantization_config=(
82
- auto_quantization_config(quantization_dtype)
83
- if quantization_dtype
84
- else None
85
- ),
86
  )
87
 
88
  def offload(self):
 
56
  self.max_length = max_length
57
  self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
58
 
59
+ auto_quant_config = (
60
+ auto_quantization_config(quantization_dtype) if quantization_dtype else None
61
+ )
62
+
63
+ # BNB will move to cuda:0 by default if not specified
64
+ if isinstance(auto_quant_config, BitsAndBytesConfig):
65
+ hf_kwargs["device_map"] = {"": self.device.index}
66
+ if auto_quant_config is not None:
67
+ hf_kwargs["quantization_config"] = auto_quant_config
68
+
69
  if self.is_clip:
70
  self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
71
  version, max_length=max_length
 
74
  self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
75
  version,
76
  **hf_kwargs,
 
 
 
 
 
77
  )
78
 
79
  else:
 
83
  self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
84
  version,
85
  **hf_kwargs,
 
 
 
 
 
86
  )
87
 
88
  def offload(self):
modules/flux_model.py CHANGED
@@ -1,7 +1,11 @@
1
  from collections import namedtuple
2
  import os
 
3
  import torch
4
 
 
 
 
5
  DISABLE_COMPILE = os.getenv("DISABLE_COMPILE", "0") == "1"
6
  torch.backends.cuda.matmul.allow_tf32 = True
7
  torch.backends.cudnn.allow_tf32 = True
@@ -14,11 +18,6 @@ from torch import Tensor, nn
14
  from pydantic import BaseModel
15
  from torch.nn import functional as F
16
 
17
- try:
18
- from cublas_ops import CublasLinear
19
- except ImportError:
20
- CublasLinear = nn.Linear
21
-
22
 
23
  class FluxParams(BaseModel):
24
  in_channels: int
@@ -116,11 +115,39 @@ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 10
116
 
117
 
118
  class MLPEmbedder(nn.Module):
119
- def __init__(self, in_dim: int, hidden_dim: int):
 
 
 
 
120
  super().__init__()
121
- self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
 
 
 
 
122
  self.silu = nn.SiLU()
123
- self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
 
 
 
 
 
124
 
125
  def forward(self, x: Tensor) -> Tensor:
126
  return self.out_layer(self.silu(self.in_layer(x)))
@@ -148,14 +175,38 @@ class QKNorm(torch.nn.Module):
148
 
149
 
150
  class SelfAttention(nn.Module):
151
- def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
 
 
 
 
 
 
152
  super().__init__()
 
 
153
  self.num_heads = num_heads
154
  head_dim = dim // num_heads
155
 
156
- self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
 
 
 
 
 
 
 
 
157
  self.norm = QKNorm(head_dim)
158
- self.proj = nn.Linear(dim, dim)
 
 
 
 
 
 
 
 
159
  self.K = 3
160
  self.H = self.num_heads
161
  self.KH = self.K * self.H
@@ -178,11 +229,21 @@ ModulationOut = namedtuple("ModulationOut", ["shift", "scale", "gate"])
178
 
179
 
180
  class Modulation(nn.Module):
181
- def __init__(self, dim: int, double: bool):
182
  super().__init__()
 
 
183
  self.is_double = double
184
  self.multiplier = 6 if double else 3
185
- self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
 
 
 
 
 
 
 
 
186
  self.act = nn.SiLU()
187
 
188
  def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
@@ -202,37 +263,83 @@ class DoubleStreamBlock(nn.Module):
202
  mlp_ratio: float,
203
  qkv_bias: bool = False,
204
  dtype: torch.dtype = torch.float16,
 
 
205
  ):
206
  super().__init__()
 
 
207
  self.dtype = dtype
208
 
209
  mlp_hidden_dim = int(hidden_size * mlp_ratio)
210
  self.num_heads = num_heads
211
  self.hidden_size = hidden_size
212
- self.img_mod = Modulation(hidden_size, double=True)
 
 
213
  self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
214
  self.img_attn = SelfAttention(
215
- dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
 
 
 
216
  )
217
 
218
  self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
219
  self.img_mlp = nn.Sequential(
220
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
 
 
 
 
 
 
 
 
221
  nn.GELU(approximate="tanh"),
222
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
 
 
 
 
 
 
 
 
223
  )
224
 
225
- self.txt_mod = Modulation(hidden_size, double=True)
 
 
226
  self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
227
  self.txt_attn = SelfAttention(
228
- dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
 
 
 
229
  )
230
 
231
  self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
232
  self.txt_mlp = nn.Sequential(
233
- nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
 
 
 
 
 
 
 
 
234
  nn.GELU(approximate="tanh"),
235
- nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
 
 
 
 
 
 
 
 
236
  )
237
  self.K = 3
238
  self.H = self.num_heads
@@ -301,8 +408,12 @@ class SingleStreamBlock(nn.Module):
301
  mlp_ratio: float = 4.0,
302
  qk_scale: float | None = None,
303
  dtype: torch.dtype = torch.float16,
 
 
304
  ):
305
  super().__init__()
 
 
306
  self.dtype = dtype
307
  self.hidden_dim = hidden_size
308
  self.num_heads = num_heads
@@ -311,9 +422,25 @@ class SingleStreamBlock(nn.Module):
311
 
312
  self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
313
  # qkv and mlp_in
314
- self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
 
 
 
 
 
 
 
 
315
  # proj and mlp_out
316
- self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
 
 
 
 
 
 
 
 
317
 
318
  self.norm = QKNorm(head_dim)
319
 
@@ -321,7 +448,11 @@ class SingleStreamBlock(nn.Module):
321
  self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
322
 
323
  self.mlp_act = nn.GELU(approximate="tanh")
324
- self.modulation = Modulation(hidden_size, double=False)
 
 
 
 
325
 
326
  self.K = 3
327
  self.H = self.num_heads
@@ -350,11 +481,11 @@ class LastLayer(nn.Module):
350
  def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
351
  super().__init__()
352
  self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
353
- self.linear = CublasLinear(
354
  hidden_size, patch_size * patch_size * out_channels, bias=True
355
  )
356
  self.adaLN_modulation = nn.Sequential(
357
- nn.SiLU(), CublasLinear(hidden_size, 2 * hidden_size, bias=True)
358
  )
359
 
360
  def forward(self, x: Tensor, vec: Tensor) -> Tensor:
@@ -369,50 +500,96 @@ class Flux(nn.Module):
369
  Transformer model for flow matching on sequences.
370
  """
371
 
372
- def __init__(self, params: FluxParams, dtype: torch.dtype = torch.float16):
373
  super().__init__()
374
 
375
  self.dtype = dtype
376
- self.params = params
377
- self.in_channels = params.in_channels
378
  self.out_channels = self.in_channels
379
- if params.hidden_size % params.num_heads != 0:
 
 
 
 
 
380
  raise ValueError(
381
- f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
382
  )
383
- pe_dim = params.hidden_size // params.num_heads
384
- if sum(params.axes_dim) != pe_dim:
385
  raise ValueError(
386
- f"Got {params.axes_dim} but expected positional dim {pe_dim}"
387
  )
388
- self.hidden_size = params.hidden_size
389
- self.num_heads = params.num_heads
390
  self.pe_embedder = EmbedND(
391
  dim=pe_dim,
392
- theta=params.theta,
393
- axes_dim=params.axes_dim,
394
  dtype=self.dtype,
395
  )
396
- self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
397
- self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
398
- self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
 
 
 
 
 
399
  self.guidance_in = (
400
- MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
401
- if params.guidance_embed
 
 
 
 
 
402
  else nn.Identity()
403
  )
404
- self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
 
 
 
405
 
406
  self.double_blocks = nn.ModuleList(
407
  [
408
  DoubleStreamBlock(
409
  self.hidden_size,
410
  self.num_heads,
411
- mlp_ratio=params.mlp_ratio,
412
- qkv_bias=params.qkv_bias,
413
  dtype=self.dtype,
 
 
414
  )
415
- for _ in range(params.depth)
416
  ]
417
  )
418
 
@@ -421,10 +598,12 @@ class Flux(nn.Module):
421
  SingleStreamBlock(
422
  self.hidden_size,
423
  self.num_heads,
424
- mlp_ratio=params.mlp_ratio,
425
  dtype=self.dtype,
 
 
426
  )
427
- for _ in range(params.depth_single_blocks)
428
  ]
429
  )
430
 
@@ -477,13 +656,17 @@ class Flux(nn.Module):
477
  return img
478
 
479
  @classmethod
480
- def from_pretrained(cls, path: str, dtype: torch.dtype = torch.bfloat16) -> "Flux":
 
 
481
  from util import load_config_from_path
482
  from safetensors.torch import load_file
483
 
484
  config = load_config_from_path(path)
485
  with torch.device("meta"):
486
- klass = cls(params=config.params, dtype=dtype).type(dtype)
 
 
487
 
488
  ckpt = load_file(config.ckpt_path, device="cpu")
489
  klass.load_state_dict(ckpt, assign=True)
 
1
  from collections import namedtuple
2
  import os
3
+ from typing import TYPE_CHECKING
4
  import torch
5
 
6
+ if TYPE_CHECKING:
7
+ from util import ModelSpec
8
+
9
  DISABLE_COMPILE = os.getenv("DISABLE_COMPILE", "0") == "1"
10
  torch.backends.cuda.matmul.allow_tf32 = True
11
  torch.backends.cudnn.allow_tf32 = True
 
18
  from pydantic import BaseModel
19
  from torch.nn import functional as F
20
 
 
 
 
 
 
21
 
22
  class FluxParams(BaseModel):
23
  in_channels: int
 
115
 
116
 
117
  class MLPEmbedder(nn.Module):
118
+ def __init__(
119
+ self, in_dim: int, hidden_dim: int, prequantized: bool = False, quantized=False
120
+ ):
121
+ from float8_quantize import F8Linear
122
+
123
  super().__init__()
124
+ self.in_layer = (
125
+ nn.Linear(in_dim, hidden_dim, bias=True)
126
+ if not prequantized
127
+ else (
128
+ F8Linear(
129
+ in_features=in_dim,
130
+ out_features=hidden_dim,
131
+ bias=True,
132
+ )
133
+ if quantized
134
+ else nn.Linear(in_dim, hidden_dim, bias=True)
135
+ )
136
+ )
137
  self.silu = nn.SiLU()
138
+ self.out_layer = (
139
+ nn.Linear(hidden_dim, hidden_dim, bias=True)
140
+ if not prequantized
141
+ else (
142
+ F8Linear(
143
+ in_features=hidden_dim,
144
+ out_features=hidden_dim,
145
+ bias=True,
146
+ )
147
+ if quantized
148
+ else nn.Linear(hidden_dim, hidden_dim, bias=True)
149
+ )
150
+ )
151
 
152
  def forward(self, x: Tensor) -> Tensor:
153
  return self.out_layer(self.silu(self.in_layer(x)))
 
175
 
176
 
177
  class SelfAttention(nn.Module):
178
+ def __init__(
179
+ self,
180
+ dim: int,
181
+ num_heads: int = 8,
182
+ qkv_bias: bool = False,
183
+ prequantized: bool = False,
184
+ ):
185
  super().__init__()
186
+ from float8_quantize import F8Linear
187
+
188
  self.num_heads = num_heads
189
  head_dim = dim // num_heads
190
 
191
+ self.qkv = (
192
+ nn.Linear(dim, dim * 3, bias=qkv_bias)
193
+ if not prequantized
194
+ else F8Linear(
195
+ in_features=dim,
196
+ out_features=dim * 3,
197
+ bias=qkv_bias,
198
+ )
199
+ )
200
  self.norm = QKNorm(head_dim)
201
+ self.proj = (
202
+ nn.Linear(dim, dim)
203
+ if not prequantized
204
+ else F8Linear(
205
+ in_features=dim,
206
+ out_features=dim,
207
+ bias=True,
208
+ )
209
+ )
210
  self.K = 3
211
  self.H = self.num_heads
212
  self.KH = self.K * self.H
 
229
 
230
 
231
  class Modulation(nn.Module):
232
+ def __init__(self, dim: int, double: bool, quantized_modulation: bool = False):
233
  super().__init__()
234
+ from float8_quantize import F8Linear
235
+
236
  self.is_double = double
237
  self.multiplier = 6 if double else 3
238
+ self.lin = (
239
+ nn.Linear(dim, self.multiplier * dim, bias=True)
240
+ if not quantized_modulation
241
+ else F8Linear(
242
+ in_features=dim,
243
+ out_features=self.multiplier * dim,
244
+ bias=True,
245
+ )
246
+ )
247
  self.act = nn.SiLU()
248
 
249
  def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
 
263
  mlp_ratio: float,
264
  qkv_bias: bool = False,
265
  dtype: torch.dtype = torch.float16,
266
+ quantized_modulation: bool = False,
267
+ prequantized: bool = False,
268
  ):
269
  super().__init__()
270
+ from float8_quantize import F8Linear
271
+
272
  self.dtype = dtype
273
 
274
  mlp_hidden_dim = int(hidden_size * mlp_ratio)
275
  self.num_heads = num_heads
276
  self.hidden_size = hidden_size
277
+ self.img_mod = Modulation(
278
+ hidden_size, double=True, quantized_modulation=quantized_modulation
279
+ )
280
  self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
281
  self.img_attn = SelfAttention(
282
+ dim=hidden_size,
283
+ num_heads=num_heads,
284
+ qkv_bias=qkv_bias,
285
+ prequantized=prequantized,
286
  )
287
 
288
  self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
289
  self.img_mlp = nn.Sequential(
290
+ (
291
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True)
292
+ if not prequantized
293
+ else F8Linear(
294
+ in_features=hidden_size,
295
+ out_features=mlp_hidden_dim,
296
+ bias=True,
297
+ )
298
+ ),
299
  nn.GELU(approximate="tanh"),
300
+ (
301
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True)
302
+ if not prequantized
303
+ else F8Linear(
304
+ in_features=mlp_hidden_dim,
305
+ out_features=hidden_size,
306
+ bias=True,
307
+ )
308
+ ),
309
  )
310
 
311
+ self.txt_mod = Modulation(
312
+ hidden_size, double=True, quantized_modulation=quantized_modulation
313
+ )
314
  self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
315
  self.txt_attn = SelfAttention(
316
+ dim=hidden_size,
317
+ num_heads=num_heads,
318
+ qkv_bias=qkv_bias,
319
+ prequantized=prequantized,
320
  )
321
 
322
  self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
323
  self.txt_mlp = nn.Sequential(
324
+ (
325
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True)
326
+ if not prequantized
327
+ else F8Linear(
328
+ in_features=hidden_size,
329
+ out_features=mlp_hidden_dim,
330
+ bias=True,
331
+ )
332
+ ),
333
  nn.GELU(approximate="tanh"),
334
+ (
335
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True)
336
+ if not prequantized
337
+ else F8Linear(
338
+ in_features=mlp_hidden_dim,
339
+ out_features=hidden_size,
340
+ bias=True,
341
+ )
342
+ ),
343
  )
344
  self.K = 3
345
  self.H = self.num_heads
 
408
  mlp_ratio: float = 4.0,
409
  qk_scale: float | None = None,
410
  dtype: torch.dtype = torch.float16,
411
+ quantized_modulation: bool = False,
412
+ prequantized: bool = False,
413
  ):
414
  super().__init__()
415
+ from float8_quantize import F8Linear
416
+
417
  self.dtype = dtype
418
  self.hidden_dim = hidden_size
419
  self.num_heads = num_heads
 
422
 
423
  self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
424
  # qkv and mlp_in
425
+ self.linear1 = (
426
+ nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
427
+ if not prequantized
428
+ else F8Linear(
429
+ in_features=hidden_size,
430
+ out_features=hidden_size * 3 + self.mlp_hidden_dim,
431
+ bias=True,
432
+ )
433
+ )
434
  # proj and mlp_out
435
+ self.linear2 = (
436
+ nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
437
+ if not prequantized
438
+ else F8Linear(
439
+ in_features=hidden_size + self.mlp_hidden_dim,
440
+ out_features=hidden_size,
441
+ bias=True,
442
+ )
443
+ )
444
 
445
  self.norm = QKNorm(head_dim)
446
 
 
448
  self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
449
 
450
  self.mlp_act = nn.GELU(approximate="tanh")
451
+ self.modulation = Modulation(
452
+ hidden_size,
453
+ double=False,
454
+ quantized_modulation=quantized_modulation and prequantized,
455
+ )
456
 
457
  self.K = 3
458
  self.H = self.num_heads
 
481
  def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
482
  super().__init__()
483
  self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
484
+ self.linear = nn.Linear(
485
  hidden_size, patch_size * patch_size * out_channels, bias=True
486
  )
487
  self.adaLN_modulation = nn.Sequential(
488
+ nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
489
  )
490
 
491
  def forward(self, x: Tensor, vec: Tensor) -> Tensor:
 
500
  Transformer model for flow matching on sequences.
501
  """
502
 
503
+ def __init__(self, config: "ModelSpec", dtype: torch.dtype = torch.float16):
504
  super().__init__()
505
 
506
  self.dtype = dtype
507
+ self.params = config.params
508
+ self.in_channels = config.params.in_channels
509
  self.out_channels = self.in_channels
510
+ prequantized_flow = config.prequantized_flow
511
+ quantized_embedders = config.quantize_flow_embedder_layers and prequantized_flow
512
+ quantized_modulation = config.quantize_modulation and prequantized_flow
513
+ from float8_quantize import F8Linear
514
+
515
+ if config.params.hidden_size % config.params.num_heads != 0:
516
  raise ValueError(
517
+ f"Hidden size {config.params.hidden_size} must be divisible by num_heads {config.params.num_heads}"
518
  )
519
+ pe_dim = config.params.hidden_size // config.params.num_heads
520
+ if sum(config.params.axes_dim) != pe_dim:
521
  raise ValueError(
522
+ f"Got {config.params.axes_dim} but expected positional dim {pe_dim}"
523
  )
524
+ self.hidden_size = config.params.hidden_size
525
+ self.num_heads = config.params.num_heads
526
  self.pe_embedder = EmbedND(
527
  dim=pe_dim,
528
+ theta=config.params.theta,
529
+ axes_dim=config.params.axes_dim,
530
  dtype=self.dtype,
531
  )
532
+ self.img_in = (
533
+ nn.Linear(self.in_channels, self.hidden_size, bias=True)
534
+ if not prequantized_flow
535
+ else (
536
+ F8Linear(
537
+ in_features=self.in_channels,
538
+ out_features=self.hidden_size,
539
+ bias=True,
540
+ )
541
+ if quantized_embedders
542
+ else nn.Linear(self.in_channels, self.hidden_size, bias=True)
543
+ )
544
+ )
545
+ self.time_in = MLPEmbedder(
546
+ in_dim=256,
547
+ hidden_dim=self.hidden_size,
548
+ prequantized=prequantized_flow,
549
+ quantized=quantized_embedders,
550
+ )
551
+ self.vector_in = MLPEmbedder(
552
+ config.params.vec_in_dim,
553
+ self.hidden_size,
554
+ prequantized=prequantized_flow,
555
+ quantized=quantized_embedders,
556
+ )
557
  self.guidance_in = (
558
+ MLPEmbedder(
559
+ in_dim=256,
560
+ hidden_dim=self.hidden_size,
561
+ prequantized=prequantized_flow,
562
+ quantized=quantized_embedders,
563
+ )
564
+ if config.params.guidance_embed
565
  else nn.Identity()
566
  )
567
+ self.txt_in = (
568
+ nn.Linear(config.params.context_in_dim, self.hidden_size)
569
+ if not quantized_embedders
570
+ else (
571
+ F8Linear(
572
+ in_features=config.params.context_in_dim,
573
+ out_features=self.hidden_size,
574
+ bias=True,
575
+ )
576
+ if quantized_embedders
577
+ else nn.Linear(config.params.context_in_dim, self.hidden_size)
578
+ )
579
+ )
580
 
581
  self.double_blocks = nn.ModuleList(
582
  [
583
  DoubleStreamBlock(
584
  self.hidden_size,
585
  self.num_heads,
586
+ mlp_ratio=config.params.mlp_ratio,
587
+ qkv_bias=config.params.qkv_bias,
588
  dtype=self.dtype,
589
+ quantized_modulation=quantized_modulation,
590
+ prequantized=prequantized_flow,
591
  )
592
+ for _ in range(config.params.depth)
593
  ]
594
  )
595
 
 
598
  SingleStreamBlock(
599
  self.hidden_size,
600
  self.num_heads,
601
+ mlp_ratio=config.params.mlp_ratio,
602
  dtype=self.dtype,
603
+ quantized_modulation=quantized_modulation,
604
+ prequantized=prequantized_flow,
605
  )
606
+ for _ in range(config.params.depth_single_blocks)
607
  ]
608
  )
609
 
 
656
  return img
657
 
658
  @classmethod
659
+ def from_pretrained(
660
+ cls: "Flux", path: str, dtype: torch.dtype = torch.float16
661
+ ) -> "Flux":
662
  from util import load_config_from_path
663
  from safetensors.torch import load_file
664
 
665
  config = load_config_from_path(path)
666
  with torch.device("meta"):
667
+ klass = cls(config=config, dtype=dtype)
668
+ if not config.prequantized_flow:
669
+ klass.type(dtype)
670
 
671
  ckpt = load_file(config.ckpt_path, device="cpu")
672
  klass.load_state_dict(ckpt, assign=True)
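A brief, hedged usage sketch of the refactored constructor path; the config path is one of the example configs referenced in the README, and the checkpoint it points to is assumed to exist.

```python
# The flow transformer is now built from a ModelSpec config, so a prequantized
# config constructs F8Linear layers in place and the float8 state_dict can be
# loaded directly with assign=True.
import torch
from modules.flux_model import Flux

flow = Flux.from_pretrained("configs/config-dev-1-4090.json", dtype=torch.float16)
flow.eval().requires_grad_(False)
```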
modules/flux_model_f8.py DELETED
@@ -1,491 +0,0 @@
1
- from collections import namedtuple
2
- import os
3
- import torch
4
-
5
- DISABLE_COMPILE = os.getenv("DISABLE_COMPILE", "0") == "1"
6
- torch.backends.cuda.matmul.allow_tf32 = True
7
- torch.backends.cudnn.allow_tf32 = True
8
- torch.backends.cudnn.benchmark = True
9
- torch.backends.cudnn.benchmark_limit = 20
10
- torch.set_float32_matmul_precision("high")
11
- import math
12
-
13
- from torch import Tensor, nn
14
- from pydantic import BaseModel
15
- from torch.nn import functional as F
16
- from float8_quantize import F8Linear
17
-
18
- try:
19
- from cublas_ops import CublasLinear
20
- except ImportError:
21
- CublasLinear = nn.Linear
22
-
23
-
24
- class FluxParams(BaseModel):
25
- in_channels: int
26
- vec_in_dim: int
27
- context_in_dim: int
28
- hidden_size: int
29
- mlp_ratio: float
30
- num_heads: int
31
- depth: int
32
- depth_single_blocks: int
33
- axes_dim: list[int]
34
- theta: int
35
- qkv_bias: bool
36
- guidance_embed: bool
37
-
38
-
39
- # attention is always same shape each time it's called per H*W, so compile with fullgraph
40
- # @torch.compile(mode="reduce-overhead", fullgraph=True, disable=DISABLE_COMPILE)
41
- def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
42
- q, k = apply_rope(q, k, pe)
43
- x = F.scaled_dot_product_attention(q, k, v).transpose(1, 2)
44
- x = x.reshape(*x.shape[:-2], -1)
45
- return x
46
-
47
-
48
- # @torch.compile(mode="reduce-overhead", disable=DISABLE_COMPILE)
49
- def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
50
- scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
51
- omega = 1.0 / (theta**scale)
52
- out = torch.einsum("...n,d->...nd", pos, omega)
53
- out = torch.stack(
54
- [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1
55
- )
56
- out = out.reshape(*out.shape[:-1], 2, 2)
57
- return out
58
-
59
-
60
- def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
61
- xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2)
62
- xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2)
63
- xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
64
- xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
65
- return xq_out.reshape(*xq.shape), xk_out.reshape(*xk.shape)
66
-
67
-
68
- class EmbedND(nn.Module):
69
- def __init__(
70
- self,
71
- dim: int,
72
- theta: int,
73
- axes_dim: list[int],
74
- dtype: torch.dtype = torch.bfloat16,
75
- ):
76
- super().__init__()
77
- self.dim = dim
78
- self.theta = theta
79
- self.axes_dim = axes_dim
80
- self.dtype = dtype
81
-
82
- def forward(self, ids: Tensor) -> Tensor:
83
- n_axes = ids.shape[-1]
84
- emb = torch.cat(
85
- [
86
- rope(ids[..., i], self.axes_dim[i], self.theta).type(self.dtype)
87
- for i in range(n_axes)
88
- ],
89
- dim=-3,
90
- )
91
-
92
- return emb.unsqueeze(1)
93
-
94
-
95
- def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
96
- """
97
- Create sinusoidal timestep embeddings.
98
- :param t: a 1-D Tensor of N indices, one per batch element.
99
- These may be fractional.
100
- :param dim: the dimension of the output.
101
- :param max_period: controls the minimum frequency of the embeddings.
102
- :return: an (N, D) Tensor of positional embeddings.
103
- """
104
- t = time_factor * t
105
- half = dim // 2
106
- freqs = torch.exp(
107
- -math.log(max_period)
108
- * torch.arange(start=0, end=half, dtype=torch.float32, device=t.device)
109
- / half
110
- )
111
-
112
- args = t[:, None].float() * freqs[None]
113
-     embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-     if dim % 2:
-         embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-     return embedding
-
-
- class MLPEmbedder(nn.Module):
-     def __init__(self, in_dim: int, hidden_dim: int):
-         super().__init__()
-         self.in_layer = F8Linear(in_dim, hidden_dim, bias=True)
-         self.silu = nn.SiLU()
-         self.out_layer = F8Linear(hidden_dim, hidden_dim, bias=True)
-
-     def forward(self, x: Tensor) -> Tensor:
-         return self.out_layer(self.silu(self.in_layer(x)))
-
-
- class RMSNorm(torch.nn.Module):
-     def __init__(self, dim: int):
-         super().__init__()
-         self.scale = nn.Parameter(torch.ones(dim))
-
-     def forward(self, x: Tensor):
-         return F.rms_norm(x, self.scale.shape, self.scale, eps=1e-6)
-
-
- class QKNorm(torch.nn.Module):
-     def __init__(self, dim: int):
-         super().__init__()
-         self.query_norm = RMSNorm(dim)
-         self.key_norm = RMSNorm(dim)
-
-     def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
-         q = self.query_norm(q)
-         k = self.key_norm(k)
-         return q, k
-
-
- class SelfAttention(nn.Module):
-     def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
-         super().__init__()
-         self.num_heads = num_heads
-         head_dim = dim // num_heads
-
-         self.qkv = F8Linear(dim, dim * 3, bias=qkv_bias)
-         self.norm = QKNorm(head_dim)
-         self.proj = F8Linear(dim, dim)
-         self.K = 3
-         self.H = self.num_heads
-         self.KH = self.K * self.H
-
-     def rearrange_for_norm(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
-         B, L, D = x.shape
-         q, k, v = x.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
-         return q, k, v
-
-     def forward(self, x: Tensor, pe: Tensor) -> Tensor:
-         qkv = self.qkv(x)
-         q, k, v = self.rearrange_for_norm(qkv)
-         q, k = self.norm(q, k, v)
-         x = attention(q, k, v, pe=pe)
-         x = self.proj(x)
-         return x
-
-
- ModulationOut = namedtuple("ModulationOut", ["shift", "scale", "gate"])
-
-
- class Modulation(nn.Module):
-     def __init__(self, dim: int, double: bool):
-         super().__init__()
-         self.is_double = double
-         self.multiplier = 6 if double else 3
-         self.lin = F8Linear(dim, self.multiplier * dim, bias=True)
-         self.act = nn.SiLU()
-
-     def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
-         out = self.lin(self.act(vec))[:, None, :].chunk(self.multiplier, dim=-1)
-
-         return (
-             ModulationOut(*out[:3]),
-             ModulationOut(*out[3:]) if self.is_double else None,
-         )
-
-
- class DoubleStreamBlock(nn.Module):
-     def __init__(
-         self,
-         hidden_size: int,
-         num_heads: int,
-         mlp_ratio: float,
-         qkv_bias: bool = False,
-         dtype: torch.dtype = torch.float16,
-     ):
-         super().__init__()
-         self.dtype = dtype
-
-         mlp_hidden_dim = int(hidden_size * mlp_ratio)
-         self.num_heads = num_heads
-         self.hidden_size = hidden_size
-         self.img_mod = Modulation(hidden_size, double=True)
-         self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-         self.img_attn = SelfAttention(
-             dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
-         )
-
-         self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-         self.img_mlp = nn.Sequential(
-             F8Linear(hidden_size, mlp_hidden_dim, bias=True),
-             nn.GELU(approximate="tanh"),
-             F8Linear(mlp_hidden_dim, hidden_size, bias=True),
-         )
-
-         self.txt_mod = Modulation(hidden_size, double=True)
-         self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-         self.txt_attn = SelfAttention(
-             dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
-         )
-
-         self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-         self.txt_mlp = nn.Sequential(
-             F8Linear(hidden_size, mlp_hidden_dim, bias=True),
-             nn.GELU(approximate="tanh"),
-             F8Linear(mlp_hidden_dim, hidden_size, bias=True),
-         )
-         self.K = 3
-         self.H = self.num_heads
-         self.KH = self.K * self.H
-
-     def rearrange_for_norm(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
-         B, L, D = x.shape
-         q, k, v = x.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
-         return q, k, v
-
-     def forward(
-         self,
-         img: Tensor,
-         txt: Tensor,
-         vec: Tensor,
-         pe: Tensor,
-     ) -> tuple[Tensor, Tensor]:
-         img_mod1, img_mod2 = self.img_mod(vec)
-         txt_mod1, txt_mod2 = self.txt_mod(vec)
-
-         # prepare image for attention
-         img_modulated = self.img_norm1(img)
-         img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
-         img_qkv = self.img_attn.qkv(img_modulated)
-         img_q, img_k, img_v = self.rearrange_for_norm(img_qkv)
-         img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
-
-         # prepare txt for attention
-         txt_modulated = self.txt_norm1(txt)
-         txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
-         txt_qkv = self.txt_attn.qkv(txt_modulated)
-         txt_q, txt_k, txt_v = self.rearrange_for_norm(txt_qkv)
-         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
-
-         q = torch.cat((txt_q, img_q), dim=2)
-         k = torch.cat((txt_k, img_k), dim=2)
-         v = torch.cat((txt_v, img_v), dim=2)
-
-         attn = attention(q, k, v, pe=pe)
-         txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
-         # calculate the img bloks
-         img = img + img_mod1.gate * self.img_attn.proj(img_attn)
-         img = img + img_mod2.gate * self.img_mlp(
-             (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
-         ).clamp(min=-384 * 2, max=384 * 2)
-
-         # calculate the txt bloks
-         txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
-         txt = txt + txt_mod2.gate * self.txt_mlp(
-             (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
-         ).clamp(min=-384 * 2, max=384 * 2)
-
-         return img, txt
-
-
- class SingleStreamBlock(nn.Module):
-     """
-     A DiT block with parallel linear layers as described in
-     https://arxiv.org/abs/2302.05442 and adapted modulation interface.
-     """
-
-     def __init__(
-         self,
-         hidden_size: int,
-         num_heads: int,
-         mlp_ratio: float = 4.0,
-         qk_scale: float | None = None,
-         dtype: torch.dtype = torch.float16,
-     ):
-         super().__init__()
-         self.dtype = dtype
-         self.hidden_dim = hidden_size
-         self.num_heads = num_heads
-         head_dim = hidden_size // num_heads
-         self.scale = qk_scale or head_dim**-0.5
-
-         self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
-         # qkv and mlp_in
-         self.linear1 = F8Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
-         # proj and mlp_out
-         self.linear2 = F8Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
-
-         self.norm = QKNorm(head_dim)
-
-         self.hidden_size = hidden_size
-         self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-
-         self.mlp_act = nn.GELU(approximate="tanh")
-         self.modulation = Modulation(hidden_size, double=False)
-
-         self.K = 3
-         self.H = self.num_heads
-         self.KH = self.K * self.H
-
-     def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
-         mod = self.modulation(vec)[0]
-         pre_norm = self.pre_norm(x)
-         x_mod = (1 + mod.scale) * pre_norm + mod.shift
-         qkv, mlp = torch.split(
-             self.linear1(x_mod),
-             [3 * self.hidden_size, self.mlp_hidden_dim],
-             dim=-1,
-         )
-         B, L, D = qkv.shape
-         q, k, v = qkv.reshape(B, L, self.K, self.H, D // self.KH).permute(2, 0, 3, 1, 4)
-         q, k = self.norm(q, k, v)
-         attn = attention(q, k, v, pe=pe)
-         output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2)).clamp(
-             min=-384 * 4, max=384 * 4
-         )
-         return x + mod.gate * output
-
-
- class LastLayer(nn.Module):
-     def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
-         super().__init__()
-         self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
-         self.linear = CublasLinear(
-             hidden_size, patch_size * patch_size * out_channels, bias=True
-         )
-         self.adaLN_modulation = nn.Sequential(
-             nn.SiLU(), CublasLinear(hidden_size, 2 * hidden_size, bias=True)
-         )
-
-     def forward(self, x: Tensor, vec: Tensor) -> Tensor:
-         shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
-         x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
-         x = self.linear(x)
-         return x
-
-
- class Flux(nn.Module):
-     """
-     Transformer model for flow matching on sequences.
-     """
-
-     def __init__(self, params: FluxParams, dtype: torch.dtype = torch.float16):
-         super().__init__()
-
-         self.dtype = dtype
-         self.params = params
-         self.in_channels = params.in_channels
-         self.out_channels = self.in_channels
-         if params.hidden_size % params.num_heads != 0:
-             raise ValueError(
-                 f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
-             )
-         pe_dim = params.hidden_size // params.num_heads
-         if sum(params.axes_dim) != pe_dim:
-             raise ValueError(
-                 f"Got {params.axes_dim} but expected positional dim {pe_dim}"
-             )
-         self.hidden_size = params.hidden_size
-         self.num_heads = params.num_heads
-         self.pe_embedder = EmbedND(
-             dim=pe_dim,
-             theta=params.theta,
-             axes_dim=params.axes_dim,
-             dtype=self.dtype,
-         )
-         self.img_in = F8Linear(self.in_channels, self.hidden_size, bias=True)
-         self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
-         self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
-         self.guidance_in = (
-             MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
-             if params.guidance_embed
-             else nn.Identity()
-         )
-         self.txt_in = F8Linear(params.context_in_dim, self.hidden_size)
-
-         self.double_blocks = nn.ModuleList(
-             [
-                 DoubleStreamBlock(
-                     self.hidden_size,
-                     self.num_heads,
-                     mlp_ratio=params.mlp_ratio,
-                     qkv_bias=params.qkv_bias,
-                     dtype=self.dtype,
-                 )
-                 for _ in range(params.depth)
-             ]
-         )
-
-         self.single_blocks = nn.ModuleList(
-             [
-                 SingleStreamBlock(
-                     self.hidden_size,
-                     self.num_heads,
-                     mlp_ratio=params.mlp_ratio,
-                     dtype=self.dtype,
-                 )
-                 for _ in range(params.depth_single_blocks)
-             ]
-         )
-
-         self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
-
-     def forward(
-         self,
-         img: Tensor,
-         img_ids: Tensor,
-         txt: Tensor,
-         txt_ids: Tensor,
-         timesteps: Tensor,
-         y: Tensor,
-         guidance: Tensor | None = None,
-     ) -> Tensor:
-         if img.ndim != 3 or txt.ndim != 3:
-             raise ValueError("Input img and txt tensors must have 3 dimensions.")
-
-         # running on sequences img
-         img = self.img_in(img)
-         vec = self.time_in(timestep_embedding(timesteps, 256).type(self.dtype))
-
-         if self.params.guidance_embed:
-             if guidance is None:
-                 raise ValueError(
-                     "Didn't get guidance strength for guidance distilled model."
-                 )
-             vec = vec + self.guidance_in(
-                 timestep_embedding(guidance, 256).type(self.dtype)
-             )
-         vec = vec + self.vector_in(y)
-
-         txt = self.txt_in(txt)
-
-         ids = torch.cat((txt_ids, img_ids), dim=1)
-         pe = self.pe_embedder(ids)
-
-         # double stream blocks
-         for block in self.double_blocks:
-             img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
-
-         img = torch.cat((txt, img), 1)
-
-         # single stream blocks
-         for block in self.single_blocks:
-             img = block(img, vec=vec, pe=pe)
-
-         img = img[:, txt.shape[1] :, ...]
-         img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
-         return img
-
-     @classmethod
-     def from_pretrained(cls, path: str, dtype: torch.dtype = torch.bfloat16) -> "Flux":
-         from util import load_config_from_path
-         from safetensors.torch import load_file
-
-         config = load_config_from_path(path)
-         with torch.device("meta"):
-             klass = cls(params=config.params, dtype=dtype).type(dtype)
-
-         ckpt = load_file(config.ckpt_path, device="cpu")
-         klass.load_state_dict(ckpt, assign=True)
-         return klass.to("cpu")
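
The `rearrange_for_norm` helpers in the deleted blocks above are just a reshape/permute that splits a fused qkv projection into per-head query, key, and value tensors. A minimal standalone sketch of that shape logic (the sizes below are illustrative, not the model's real dimensions):

```python
import torch

# Illustrative sizes only: batch 2, sequence length 16, 4 heads of width 8.
B, L, H, head_dim = 2, 16, 4, 8
K = 3  # fused query / key / value
D = K * H * head_dim

qkv = torch.randn(B, L, D)  # stand-in for the output of a fused qkv linear layer

# Same reshape/permute as rearrange_for_norm: result has shape (K, B, H, L, head_dim),
# so unpacking along the first axis yields q, k, v of shape (B, H, L, head_dim).
q, k, v = qkv.reshape(B, L, K, H, D // (K * H)).permute(2, 0, 3, 1, 4)

assert q.shape == k.shape == v.shape == (B, H, L, head_dim)
```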
util.py CHANGED
@@ -6,14 +6,17 @@ import torch
  from modules.autoencoder import AutoEncoder, AutoEncoderParams
  from modules.conditioner import HFEmbedder
  from modules.flux_model import Flux, FluxParams
- from modules.flux_model_f8 import Flux as FluxF8
  from safetensors.torch import load_file as load_sft
+
  try:
      from enum import StrEnum
  except:
      from enum import Enum
+
      class StrEnum(str, Enum):
          pass
+
+
  from pydantic import BaseModel, ConfigDict
  from loguru import logger

@@ -61,6 +64,11 @@ class ModelSpec(BaseModel):
      offload_flow: bool = False
      prequantized_flow: bool = False

+     # Improved precision via not quanitzing the modulation linear layers
+     quantize_modulation: bool = True
+     # Improved precision via not quanitzing the flow embedder layers
+     quantize_flow_embedder_layers: bool = False
+
      model_config: ConfigDict = {
          "arbitrary_types_allowed": True,
          "use_enum_values": True,
@@ -84,6 +92,8 @@ def parse_device(device: str | torch.device | None) -> torch.device:


  def into_dtype(dtype: str) -> torch.dtype:
+     if isinstance(dtype, torch.dtype):
+         return dtype
      if dtype == "float16":
          return torch.float16
      elif dtype == "bfloat16":
@@ -125,6 +135,8 @@ def load_config(
      quant_text_enc: Optional[Literal["float8", "qint2", "qint4", "qint8"]] = None,
      quant_ae: bool = False,
      prequantized_flow: bool = False,
+     quantize_modulation: bool = True,
+     quantize_flow_embedder_layers: bool = False,
  ) -> ModelSpec:
      """
      Load a model configuration using the passed arguments.
@@ -192,6 +204,8 @@ def load_config(
          }.get(quant_text_enc, None),
          ae_quantization_dtype=QuantizationDtype.qfloat8 if quant_ae else None,
          prequantized_flow=prequantized_flow,
+         quantize_modulation=quantize_modulation,
+         quantize_flow_embedder_layers=quantize_flow_embedder_layers,
      )


@@ -219,16 +233,14 @@ def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
          )


- def load_flow_model(config: ModelSpec) -> Flux | FluxF8:
+ def load_flow_model(config: ModelSpec) -> Flux:
      ckpt_path = config.ckpt_path
      FluxClass = Flux
-     if config.prequantized_flow:
-         FluxClass = FluxF8

      with torch.device("meta"):
-         model = FluxClass(config.params, dtype=into_dtype(config.flow_dtype)).type(
-             into_dtype(config.flow_dtype)
-         )
+         model = FluxClass(config, dtype=into_dtype(config.flow_dtype))
+         if not config.prequantized_flow:
+             model.type(into_dtype(config.flow_dtype))

      if ckpt_path is not None:
          # load_sft doesn't support torch.device
@@ -279,7 +291,7 @@ def load_autoencoder(config: ModelSpec) -> AutoEncoder:


  class LoadedModels(BaseModel):
-     flow: Flux | FluxF8
+     flow: Flux
      ae: AutoEncoder
      clip: HFEmbedder
      t5: HFEmbedder
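
The reworked `load_flow_model` above (like the `Flux.from_pretrained` classmethod in the removed module) relies on the same PyTorch idiom: build the module under `torch.device("meta")` so no weight memory is allocated, then hand the checkpoint tensors in with `load_state_dict(..., assign=True)` so they are adopted rather than copied. A minimal sketch of that idiom, independent of this repo's classes (`TinyModel` is a hypothetical stand-in; `assign=True` needs torch >= 2.1):

```python
import torch
from torch import nn


class TinyModel(nn.Module):
    """Toy stand-in for a large transformer."""

    def __init__(self, dim: int = 8):
        super().__init__()
        self.proj = nn.Linear(dim, dim)


# 1. Instantiate on the meta device: parameters carry shape/dtype but no storage.
with torch.device("meta"):
    model = TinyModel()

# 2. Obtain real tensors from somewhere else (here a fresh CPU state dict stands in
#    for a safetensors checkpoint loaded via load_file / load_sft).
state_dict = TinyModel().state_dict()

# 3. assign=True replaces the meta parameters with the loaded tensors outright,
#    so the weights are never allocated twice.
model.load_state_dict(state_dict, assign=True)

print(model.proj.weight.device)  # cpu
```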