nickyreinert-vml committed
Commit e4a20eb
1 Parent(s): ddd360b

adding pre-compile feature

Files changed (2):
  1. app.py +13 -1
  2. config.py +4 -1
app.py CHANGED
@@ -131,6 +131,12 @@ def attention_slicing_change(attention_slicing, config):
     config = set_config(config, 'attention_slicing', attention_slicing)
 
     return config, config, assemble_code(config)
+
+def pre_compile_unet_change(pre_compile_unet, config):
+
+    config = set_config(config, 'pre_compile_unet', pre_compile_unet)
+
+    return config, config, assemble_code(config)
 
 def safety_checker_change(safety_checker, config):
 
@@ -298,6 +304,9 @@ def run_inference(config, config_history, pipeline, progress=gr.Progress(track_tqdm=True)):
     # ATTENTION SLICING
     if str(config["attention_slicing"]).lower() == 'true': pipeline.enable_attention_slicing()
 
+    # PRE COMPILE UNET
+    if str(config["pre_compile_unet"]).lower() == 'true': pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
+
     # AUTO ENCODER
     if str(config["auto_encoder"]).lower() != 'none' and str(config["auto_encoder"]).lower() != 'null' and str(config["auto_encoder"]).lower() != '':
         pipeline.vae = AutoencoderKL.from_pretrained(config["auto_encoder"], torch_dtype=get_data_type(config["data_type"])).to(config["device"])
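For context, the torch.compile call this hunk adds is the standard diffusers recipe for speeding up the UNet's forward pass. A minimal, self-contained sketch of the same technique outside the app (the model id and CUDA device are placeholder assumptions; requires PyTorch 2.x):

    import torch
    from diffusers import DiffusionPipeline

    # Example model id, not necessarily the one the app loads.
    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # Compile the UNet once up front: the first call pays the compilation
    # cost, every later call reuses the compiled graph.
    pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

    image = pipe("an astronaut riding a horse").images[0]

This is also why the new UI option warns to enable pre-compilation only once settings are final: changing inputs in ways that alter shapes can trigger recompilation, which costs time again.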
@@ -445,12 +454,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
             gr.Column("")
             with gr.Accordion("Device specific settings", open=False):
                 with gr.Row():
-                    in_cpu_offload = gr.Radio(label="CPU Offload:", value=config.value["cpu_offload"], choices=["True", "False"], info="This may increase performance by offloading computations from the GPU to the CPU, but it can also lead to slower execution and lower effectiveness. Compare running time and outputs to make sure this setting helps you.")
+                    in_cpu_offload = gr.Radio(label="CPU Offload:", value=config.value["cpu_offload"], choices=["True", "False"], info="This may increase performance by offloading computations from the GPU to the CPU, but it can also lead to slower execution and lower effectiveness. Compare running time and outputs to make sure this setting helps you; not supported on MPS.")
                     in_data_type = gr.Radio(label="Data Type:", value=config.value["data_type"], choices=["bfloat16", "float16", "float32"], info="`bfloat16` is not supported on MPS devices right now; `float16` may also not be supported on all devices. Half-precision weights save GPU memory, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                     in_allow_tensorfloat32 = gr.Radio(label="Allow TensorFloat32:", value=config.value["allow_tensorfloat32"], choices=["True", "False"], info="Not supported on MPS devices right now; TensorFloat-32 is faster but results in slightly less accurate computations, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                 with gr.Row():
                     in_variant = gr.Radio(label="Variant:", value=config.value["variant"], choices=["fp16", None], info="Using half-precision weights saves GPU memory, but not all models support it, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                     in_attention_slicing = gr.Radio(label="Attention slicing:", value=config.value["attention_slicing"], choices=["True", "False"], info="The attention operation will be cut into multiple steps, see https://huggingface.co/docs/diffusers/optimization/mps")
+                    in_pre_compile_unet = gr.Radio(label="Pre-Compile UNet:", value=config.value["pre_compile_unet"], choices=["True", "False"], info="Can speed up inference, but compilation itself takes time, so only enable this once you finalize your settings; does not work on MPS, see https://huggingface.co/docs/diffusers/optimization/torch2.0")
             gr.Column("")
 
             gr.Markdown("### Model")
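The Data Type radio above ends up in get_data_type(config["data_type"]), which run_inference uses for torch_dtype. A hypothetical stand-in for that helper (the real implementation lives elsewhere in the repo; this mapping is an assumption):

    import torch

    # Hypothetical stand-in for the get_data_type() helper referenced
    # in run_inference(); maps the radio's string choice to a torch dtype.
    def get_data_type(name: str) -> torch.dtype:
        return {
            "bfloat16": torch.bfloat16,  # not supported on MPS right now
            "float16": torch.float16,    # half precision, saves GPU memory
            "float32": torch.float32,    # full-precision default
        }[name]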
@@ -553,6 +563,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
     in_allow_tensorfloat32.change(tensorfloat32_change, inputs=[in_allow_tensorfloat32, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('allow_tensorfloat32', value, config)")
     in_variant.change(variant_change, inputs=[in_variant, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('variant', value, config)")
     in_attention_slicing.change(attention_slicing_change, inputs=[in_attention_slicing, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('attention_slicing', value, config)")
+    in_pre_compile_unet.change(pre_compile_unet_change, inputs=[in_pre_compile_unet, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('pre_compile_unet', value, config)")
     in_model_refiner.change(model_refiner_change, inputs=[in_model_refiner, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('model_refiner', value, config)")
     in_cpu_offload.change(cpu_offload_change, inputs=[in_cpu_offload, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('cpu_offload', value, config)")
     in_safety_checker.change(safety_checker_change, inputs=[in_safety_checker, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('safety_checker', value, config)")
@@ -586,6 +597,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         in_model_refiner,
         in_variant,
         in_attention_slicing,
+        in_pre_compile_unet,
         in_safety_checker,
         in_requires_safety_checker,
         in_auto_encoders,
config.py CHANGED
@@ -41,6 +41,7 @@ def get_initial_config():
         "scheduler": None,
         "variant": None,
         "attention_slicing": "False",
+        "pre_compile_unet": "False",
         "allow_tensorfloat32": allow_tensorfloat32,
         "use_safetensors": "False",
         "data_type": data_type,
@@ -104,6 +105,7 @@ def get_config_from_url(initial_config, request: Request):
         return_config['refiner'],
         return_config['variant'],
         return_config['attention_slicing'],
+        return_config['pre_compile_unet'],
         return_config['safety_checker'],
         return_config['requires_safety_checker'],
         return_config['auto_encoder'],
@@ -175,6 +177,7 @@ def assemble_code(str_config):
         variant=variant).to(device)''')
 
     if str(config["attention_slicing"]).lower() != 'false': code.append("pipeline.enable_attention_slicing()")
+    if str(config["pre_compile_unet"]).lower() != 'false': code.append('pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)')
 
     if str(config["cpu_offload"]).lower() != 'false': code.append("pipeline.enable_model_cpu_offload()")
 
@@ -191,7 +194,7 @@ def assemble_code(str_config):
         "{config['refiner']}",
         text_encoder_2 = base.text_encoder_2,
         vae = base.vae,
-        torch_dtype = data_type,
+        torch_dtype = data_type,
         use_safetensors = use_safetensors,
         variant=variant,
     ).to(device)''')
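One detail worth noting in assemble_code: since it emits Python source as plain strings, the torch.compile line is appended with single quotes on the outside so that the double quotes in mode="reduce-overhead" survive intact. A short sketch of the emitted snippet (the code list and join are simplified stand-ins for what assemble_code actually builds):

    code = []
    code.append("pipeline.enable_attention_slicing()")
    code.append('pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)')

    # The generated script is just the joined lines:
    print("\n".join(code))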