Updates

- configs/config-dev-gigaquant.json +9 -3
- configs/config-dev.json +2 -0
- flux_pipeline.py +2 -0
- modules/conditioner.py +68 -9
- modules/flux_model.py +0 -1
- quantize_swap_and_dispatch.py +48 -15
- util.py +25 -2
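These changes thread per-model quantization dtypes (qfloat8 / qint8 / qint4 / qint2) from the JSON configs through HFEmbedder, quantize_and_dispatch_to_device, and load_autoencoder, so the flow model, text encoders, CLIP, and autoencoder can each be quantized independently.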
configs/config-dev-gigaquant.json CHANGED
@@ -41,12 +41,18 @@
     "repo_ae": "ae.sft",
     "text_enc_max_length": 512,
     "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
-    "text_enc_device": "cuda:
-    "ae_device": "cuda:
+    "text_enc_device": "cuda:0",
+    "ae_device": "cuda:0",
     "flux_device": "cuda:0",
     "flow_dtype": "float16",
     "ae_dtype": "bfloat16",
     "text_enc_dtype": "bfloat16",
-    "num_to_quant":
+    "num_to_quant": 220,
+    "flow_quantization_dtype": "qint4",
+    "text_enc_quantization_dtype": "qint4",
+    "ae_quantization_dtype": "qint4",
+    "clip_quantization_dtype": "qint4",
+    "compile_extras": false,
+    "compile_blocks": false,
     "quantize_extras": true
 }
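With num_to_quant raised to 220 and every *_quantization_dtype set to qint4, this config quantizes essentially the entire pipeline to int4 (presumably the reason for the "gigaquant" name).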
configs/config-dev.json CHANGED
@@ -47,6 +47,8 @@
     "flow_dtype": "float16",
     "ae_dtype": "bfloat16",
     "text_enc_dtype": "bfloat16",
+    "flow_quantization_dtype": "qfloat8",
+    "text_enc_quantization_dtype": "qfloat8",
     "num_to_quant": 22,
     "compile_extras": false,
     "compile_blocks": false
flux_pipeline.py CHANGED
@@ -394,6 +394,7 @@ class FluxPipeline:
         from quantize_swap_and_dispatch import quantize_and_dispatch_to_device

         with torch.inference_mode():
+            print("flow_quantization_dtype", config.flow_quantization_dtype)

             models = load_models_from_config(config)
             config = models.config
@@ -413,6 +414,7 @@ class FluxPipeline:
                 compile_extras=config.compile_extras,
                 compile_blocks=config.compile_blocks,
                 quantize_extras=config.quantize_extras,
+                quantization_dtype=config.flow_quantization_dtype,
             )

             return cls(
modules/conditioner.py CHANGED
@@ -1,37 +1,85 @@
-
+import os
+
 import torch
-from
+from pydash import max_
+from quanto import freeze, qfloat8, qint2, qint4, qint8, quantize
+from quanto.nn.qmodule import _QMODULE_TABLE
+from safetensors.torch import load_file, load_model, save_model
+from torch import Tensor, nn
+from transformers import (
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5Tokenizer,
+    __version__,
+)
+from transformers.utils.quantization_config import QuantoConfig

-
+CACHE_DIR = os.environ.get("HF_HOME", "~/.cache/huggingface")

+
+def into_quantization_name(quantization_dtype: str) -> str:
+    if quantization_dtype == "qfloat8":
+        return "float8"
+    elif quantization_dtype == "qint4":
+        return "int4"
+    elif quantization_dtype == "qint8":
+        return "int8"
+    elif quantization_dtype == "qint2":
+        return "int2"
+    else:
+        raise ValueError(f"Unsupported quantization dtype: {quantization_dtype}")


 class HFEmbedder(nn.Module):
     def __init__(
-        self,
+        self,
+        version: str,
+        max_length: int,
+        device: torch.device | int,
+        quantization_dtype: str | None = None,
+        **hf_kwargs,
     ):
         super().__init__()
         self.is_clip = version.startswith("openai")
         self.max_length = max_length
         self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
+        quant_name = (
+            into_quantization_name(quantization_dtype) if quantization_dtype else None
+        )

         if self.is_clip:
             self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
                 version, max_length=max_length
             )
+
             self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
-                version,
+                version,
+                **hf_kwargs,
+                quantization_config=(
+                    QuantoConfig(
+                        weights=quant_name,
+                    )
+                    if quant_name
+                    else None
+                ),
+                device_map={"": device},
             )
-
+
         else:
             self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(
                 version, max_length=max_length
             )
             self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
                 version,
-                **hf_kwargs,
                 device_map={"": device},
-
-
+                **hf_kwargs,
+                quantization_config=(
+                    QuantoConfig(
+                        weights=quant_name,
+                    )
+                    if quant_name
+                    else None
                 ),
             )

@@ -51,3 +99,14 @@ class HFEmbedder(nn.Module):
             output_hidden_states=False,
         )
         return outputs[self.output_key]
+
+
+if __name__ == "__main__":
+    model = HFEmbedder(
+        "city96/t5-v1_1-xxl-encoder-bf16",
+        max_length=512,
+        device=0,
+        quantization_dtype="qfloat8",
+    )
+    o = model(["hello"])
+    print(o)
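One consequence of this rewrite is that quantization now happens at load time through transformers' QuantoConfig rather than after the weights are materialized. A minimal sketch of the equivalent direct call (model name taken from the diff; "float8" is what into_quantization_name returns for "qfloat8"; assumes a transformers build with quanto support):

import torch
from transformers import T5EncoderModel
from transformers.utils.quantization_config import QuantoConfig

# Load the T5 encoder with quanto float8 weights applied during load,
# mirroring HFEmbedder(..., quantization_dtype="qfloat8").
t5 = T5EncoderModel.from_pretrained(
    "city96/t5-v1_1-xxl-encoder-bf16",
    torch_dtype=torch.bfloat16,
    quantization_config=QuantoConfig(weights="float8"),
    device_map={"": 0},
)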
modules/flux_model.py CHANGED
@@ -13,7 +13,6 @@ import math
 from torch import Tensor, nn
 from torch._dynamo import config
 from torch._inductor import config as ind_config
-from xformers.ops import memory_efficient_attention_forward
 from pydantic import BaseModel
 from torch.nn import functional as F

quantize_swap_and_dispatch.py CHANGED
@@ -5,11 +5,35 @@ import torch
 from click import secho
 from cublas_ops import CublasLinear

-from quanto
-
+from quanto import (
+    QModuleMixin,
+    quantize_module,
+    QLinear,
+    QConv2d,
+    QLayerNorm,
+)
+from quanto.tensor import Optimizer, qtype, qfloat8, qint4, qint8
 from torch import nn


+class QuantizationDtype:
+    qfloat8 = "qfloat8"
+    qint2 = "qint2"
+    qint4 = "qint4"
+    qint8 = "qint8"
+
+
+def into_qtype(qtype: QuantizationDtype) -> qtype:
+    if qtype == QuantizationDtype.qfloat8:
+        return qfloat8
+    elif qtype == QuantizationDtype.qint4:
+        return qint4
+    elif qtype == QuantizationDtype.qint8:
+        return qint8
+    else:
+        raise ValueError(f"Unknown qtype: {qtype}")
+
+
 def _set_module_by_name(parent_module, name, child_module):
     module_names = name.split(".")
     if len(module_names) == 1:
@@ -121,7 +145,12 @@ def _is_block_compilable(module: nn.Module) -> bool:

 def _simple_swap_linears(model: nn.Module, root_name: str = ""):
     for name, module in model.named_children():
-        if
+        if (
+            _is_linear(module)
+            and hasattr(module, "weight")
+            and module.weight is not None
+            and module.weight.data is not None
+        ):
             weights = module.weight.data
             bias = None
             if module.bias is not None:
@@ -155,7 +184,7 @@ def _full_quant(
     if current_quants < max_quants:
         current_quants += _quantize(model, quantization_dtype)
         _freeze(model)
-        print(f"Quantized {current_quants} modules")
+        print(f"Quantized {current_quants} modules with {quantization_dtype}")
     return current_quants


@@ -174,11 +203,13 @@ def quantize_and_dispatch_to_device(
     flux_device: torch.device = torch.device("cuda"),
     flux_dtype: torch.dtype = torch.float16,
     num_layers_to_quantize: int = 20,
-    quantization_dtype:
+    quantization_dtype: QuantizationDtype = QuantizationDtype.qfloat8,
     compile_blocks: bool = True,
     compile_extras: bool = True,
     quantize_extras: bool = False,
+    replace_linears: bool = True,
 ):
+    quant_type = into_qtype(quantization_dtype)
     num_quanted = 0
     flow_model = flow_model.requires_grad_(False).eval().type(flux_dtype)
     for block in flow_model.single_blocks:
@@ -188,7 +219,7 @@
             block,
             num_layers_to_quantize,
             num_quanted,
-            quantization_dtype=
+            quantization_dtype=quant_type,
         )

     for block in flow_model.double_blocks:
@@ -198,7 +229,7 @@
             block,
             num_layers_to_quantize,
             num_quanted,
-            quantization_dtype=
+            quantization_dtype=quant_type,
         )

     to_gpu_extras = [
@@ -221,10 +252,11 @@
             block.compile()
             secho(f"Compiled block {i}", fg="green")

-
+    if replace_linears:
+        _simple_swap_linears(flow_model)
     for extra in to_gpu_extras:
         m_extra = getattr(flow_model, extra).cuda(flux_device).type(flux_dtype)
-        if
+        if compile_extras:
             if extra in ["time_in", "vector_in", "guidance_in", "final_layer"]:
                 m_extra.compile()
                 secho(
@@ -232,10 +264,11 @@
                     fg="green",
                 )
             elif quantize_extras:
-
-
-
-
-
-
+                if not isinstance(m_extra, nn.Linear):
+                    _full_quant(
+                        m_extra,
+                        current_quants=num_quanted,
+                        max_quants=num_layers_to_quantize,
+                        quantization_dtype=quantization_dtype,
+                    )
     return flow_model
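Two details worth flagging here: this module defines its own plain QuantizationDtype class whose members are the same strings as util.py's StrEnum, so either spelling reaches into_qtype intact; and into_qtype has no branch for qint2, so passing "qint2" raises ValueError even though the constant is declared. A quick sketch of the mapping (assumes quanto is installed and this module imports):

from quantize_swap_and_dispatch import QuantizationDtype, into_qtype

# String constants resolve to quanto's qtype objects used by _quantize/_full_quant.
qt = into_qtype(QuantizationDtype.qint4)
print(qt)

try:
    into_qtype(QuantizationDtype.qint2)
except ValueError as e:
    print(e)  # "Unknown qtype: qint2" -- qint2 is declared but never mapped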
util.py CHANGED
@@ -18,6 +18,13 @@ class ModelVersion(StrEnum):
     flux_schnell = "flux-schnell"


+class QuantizationDtype(StrEnum):
+    qfloat8 = "qfloat8"
+    qint2 = "qint2"
+    qint4 = "qint4"
+    qint8 = "qint8"
+
+
 class ModelSpec(BaseModel):
     version: ModelVersion
     params: FluxParams
@@ -39,6 +46,10 @@ class ModelSpec(BaseModel):
     quantize_extras: bool = False
     compile_extras: bool = False
     compile_blocks: bool = False
+    flow_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
+    text_enc_quantization_dtype: Optional[QuantizationDtype] = QuantizationDtype.qfloat8
+    ae_quantization_dtype: Optional[QuantizationDtype] = None
+    clip_quantization_dtype: Optional[QuantizationDtype] = None

     model_config: ConfigDict = {
         "arbitrary_types_allowed": True,
@@ -199,13 +210,15 @@ def load_text_encoders(config: ModelSpec) -> tuple[HFEmbedder, HFEmbedder]:
         "openai/clip-vit-large-patch14",
         max_length=77,
         torch_dtype=into_dtype(config.text_enc_dtype),
-        device=into_device(config.text_enc_device),
+        device=into_device(config.text_enc_device).index or 0,
+        quantization_dtype=config.clip_quantization_dtype,
     )
     t5 = HFEmbedder(
         config.text_enc_path,
         max_length=config.text_enc_max_length,
         torch_dtype=into_dtype(config.text_enc_dtype),
         device=into_device(config.text_enc_device).index or 0,
+        quantization_dtype=config.text_enc_quantization_dtype,
     )
     return clip, t5

@@ -213,12 +226,22 @@ def load_text_encoders(config: ModelSpec) -> tuple[HFEmbedder, HFEmbedder]:
 def load_autoencoder(config: ModelSpec) -> AutoEncoder:
     ckpt_path = config.ae_path
     with torch.device("meta" if ckpt_path is not None else config.ae_device):
-        ae = AutoEncoder(config.ae_params)
+        ae = AutoEncoder(config.ae_params).to(into_dtype(config.ae_dtype))

     if ckpt_path is not None:
         sd = load_sft(ckpt_path, device=str(config.ae_device))
         missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
         print_load_warning(missing, unexpected)
+    if config.ae_quantization_dtype is not None:
+        from quantize_swap_and_dispatch import _full_quant, into_qtype
+
+        ae.to(into_device(config.ae_device))
+        _full_quant(
+            ae,
+            max_quants=8000,
+            current_quants=0,
+            quantization_dtype=into_qtype(config.ae_quantization_dtype),
+        )
     return ae

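Because QuantizationDtype here is a StrEnum, the bare strings in the JSON configs coerce directly into the enum when pydantic validates ModelSpec. A small sketch of that round-trip (field defaults as in the diff above):

from util import QuantizationDtype

# Plain config strings resolve to enum members.
assert QuantizationDtype("qint4") is QuantizationDtype.qint4

# Per the ModelSpec defaults: flow and text encoder quantize to qfloat8
# unless overridden, while ae/clip default to None (unquantized).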