Add quantize embedders/modulation to argparse options
main.py
CHANGED
@@ -129,6 +129,22 @@ def parse_args():
+        "and then saving the state_dict as a safetensors file), "
+        "which reduces the size of the checkpoint by about 50% & reduces startup time",
     )
+    parser.add_argument(
+        "-nqfm",
+        "--no-quantize-flow-modulation",
+        action="store_false",
+        default=True,
+        dest="quantize_modulation",
+        help="Disable quantization of the modulation layers in the flow model, adds ~2GB vram usage for moderate precision improvements",
+    )
+    parser.add_argument(
+        "-qfl",
+        "--quantize-flow-embedder-layers",
+        action="store_true",
+        default=False,
+        dest="quantize_flow_embedder_layers",
+        help="Quantize the flow embedder layers in the flow model, saves ~512MB vram usage, but precision loss is very noticeable",
+    )
     return parser.parse_args()
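A note on the flag semantics, since the inverted option is easy to misread: --no-quantize-flow-modulation uses action="store_false" with default=True, so modulation quantization stays enabled unless the flag is passed, while --quantize-flow-embedder-layers is a plain opt-in. A minimal, self-contained sketch of how the two flags parse (the flag names and dests are copied from the diff; the rest is illustrative):

import argparse

# Minimal reproduction of the two new flags, showing the
# store_false / store_true + dest pattern used in the hunk above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-nqfm",
    "--no-quantize-flow-modulation",
    action="store_false",  # passing the flag stores False
    default=True,          # omitting it leaves the value True
    dest="quantize_modulation",
)
parser.add_argument(
    "-qfl",
    "--quantize-flow-embedder-layers",
    action="store_true",   # opt-in: passing the flag stores True
    default=False,
    dest="quantize_flow_embedder_layers",
)

print(parser.parse_args([]))
# Namespace(quantize_flow_embedder_layers=False, quantize_modulation=True)
print(parser.parse_args(["-nqfm", "-qfl"]))
# Namespace(quantize_flow_embedder_layers=True, quantize_modulation=False)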
@@ -171,6 +187,8 @@ def main():
         offload_ae=args.offload_ae,
         offload_text_enc=args.offload_text_enc,
         prequantized_flow=args.prequantized_flow,
+        quantize_modulation=args.quantize_modulation,
+        quantize_flow_embedder_layers=args.quantize_flow_embedder_layers,
     )
     app.state.model = FluxPipeline.load_pipeline_from_config(config)
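Taken together, the defaults quantize the modulation layers and leave the embedder layers unquantized. Per the help strings above, running python main.py -nqfm spends roughly 2GB of additional VRAM for moderate precision improvements, while python main.py -qfl reclaims about 512MB of VRAM at a very noticeable precision cost.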
util.py
CHANGED
@@ -135,6 +135,8 @@ def load_config(
     quant_text_enc: Optional[Literal["float8", "qint2", "qint4", "qint8"]] = None,
     quant_ae: bool = False,
     prequantized_flow: bool = False,
+    quantize_modulation: bool = True,
+    quantize_flow_embedder_layers: bool = False,
 ) -> ModelSpec:
     """
     Load a model configuration using the passed arguments.

@@ -202,6 +204,8 @@ def load_config(
         }.get(quant_text_enc, None),
         ae_quantization_dtype=QuantizationDtype.qfloat8 if quant_ae else None,
         prequantized_flow=prequantized_flow,
+        quantize_modulation=quantize_modulation,
+        quantize_flow_embedder_layers=quantize_flow_embedder_layers,
     )
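For completeness, a sketch of how the new keyword arguments reach the pipeline. Only the parameters visible in this diff are shown, the flux_pipeline import path is an assumption, and load_config takes further arguments not pictured here (assumed to have usable defaults), so treat this as illustrative rather than a verbatim call:

from util import load_config
from flux_pipeline import FluxPipeline  # import path is an assumption

# Flip both new options away from their defaults: keep the modulation
# layers in full precision (adds ~2GB VRAM, moderate precision gain)
# and quantize the embedder layers (saves ~512MB VRAM, noticeably lossier).
config = load_config(
    quant_text_enc="qint4",
    quant_ae=True,
    prequantized_flow=False,
    quantize_modulation=False,
    quantize_flow_embedder_layers=True,
)
model = FluxPipeline.load_pipeline_from_config(config)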