nickyreinert-vml committed
Commit e4a20eb
1 Parent(s): ddd360b

adding pre-compile feature

Files changed (2):
  1. app.py +13 -1
  2. config.py +4 -1
app.py CHANGED
@@ -131,6 +131,12 @@ def attention_slicing_change(attention_slicing, config):
     config = set_config(config, 'attention_slicing', attention_slicing)
 
     return config, config, assemble_code(config)
+
+def pre_compile_unet_change(pre_compile_unet, config):
+
+    config = set_config(config, 'pre_compile_unet', pre_compile_unet)
+
+    return config, config, assemble_code(config)
 
 def safety_checker_change(safety_checker, config):
 
@@ -298,6 +304,9 @@ def run_inference(config, config_history, pipeline, progress=gr.Progress(track_tqdm=True)):
     # ATTENTION SLICING
     if str(config["attention_slicing"]).lower() == 'true': pipeline.enable_attention_slicing()
 
+    # PRE COMPILE UNET
+    if str(config["pre_compile_unet"]).lower() == 'true': pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
+
     # AUTO ENCODER
     if str(config["auto_encoder"]).lower() != 'none' and str(config["auto_encoder"]).lower() != 'null' and str(config["auto_encoder"]).lower() != '':
         pipeline.vae = AutoencoderKL.from_pretrained(config["auto_encoder"], torch_dtype=get_data_type(config["data_type"])).to(config["device"])
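For context, the torch.compile call this hunk adds is the standard diffusers recipe for speeding up the UNet's forward pass. A minimal, self-contained sketch of the same technique outside the app (the model id and CUDA device are placeholder assumptions; requires PyTorch 2.x):

    import torch
    from diffusers import DiffusionPipeline

    # Example model id, not necessarily the one the app loads.
    pipe = DiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
    ).to("cuda")

    # Compile the UNet once up front: the first call pays the compilation
    # cost, every later call reuses the compiled graph.
    pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

    image = pipe("an astronaut riding a horse").images[0]

This is also why the new UI option warns to enable pre-compilation only once settings are final: changing inputs in ways that alter shapes can trigger recompilation, which costs time again.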
@@ -445,12 +454,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
             gr.Column("")
             with gr.Accordion("Device specific settings", open=False):
                 with gr.Row():
-                    in_cpu_offload = gr.Radio(label="CPU Offload:", value=config.value["cpu_offload"], choices=["True", "False"], info="This may increase performance by offloading computations from the GPU to the CPU, but it can also lead to slower execution and lower effectiveness. Compare running time and outputs to make sure this setting helps you.")
+                    in_cpu_offload = gr.Radio(label="CPU Offload:", value=config.value["cpu_offload"], choices=["True", "False"], info="This may increase performance by offloading computations from the GPU to the CPU, but it can also lead to slower execution and lower effectiveness. Compare running time and outputs to make sure this setting helps you; not supported on MPS.")
                     in_data_type = gr.Radio(label="Data Type:", value=config.value["data_type"], choices=["bfloat16", "float16", "float32"], info="`bfloat16` is not supported on MPS devices right now; `float16` may also not be supported on all devices. Half-precision weights save GPU memory, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                     in_allow_tensorfloat32 = gr.Radio(label="Allow TensorFloat32:", value=config.value["allow_tensorfloat32"], choices=["True", "False"], info="Not supported on MPS devices right now; TensorFloat-32 is faster but results in slightly less accurate computations, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                 with gr.Row():
                     in_variant = gr.Radio(label="Variant:", value=config.value["variant"], choices=["fp16", None], info="Using half-precision weights saves GPU memory, but not all models support it, see https://huggingface.co/docs/diffusers/main/en/optimization/fp16")
                     in_attention_slicing = gr.Radio(label="Attention slicing:", value=config.value["attention_slicing"], choices=["True", "False"], info="The attention operation will be cut into multiple steps, see https://huggingface.co/docs/diffusers/optimization/mps")
+                    in_pre_compile_unet = gr.Radio(label="Pre-Compile UNet:", value=config.value["pre_compile_unet"], choices=["True", "False"], info="Can speed up inference, but compilation itself takes time, so only enable this once you finalize your settings; does not work on MPS, see https://huggingface.co/docs/diffusers/optimization/torch2.0")
             gr.Column("")
 
             gr.Markdown("### Model")
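The Data Type radio above ends up in get_data_type(config["data_type"]), which run_inference uses for torch_dtype. A hypothetical stand-in for that helper (the real implementation lives elsewhere in the repo; this mapping is an assumption):

    import torch

    # Hypothetical stand-in for the get_data_type() helper referenced
    # in run_inference(); maps the radio's string choice to a torch dtype.
    def get_data_type(name: str) -> torch.dtype:
        return {
            "bfloat16": torch.bfloat16,  # not supported on MPS right now
            "float16": torch.float16,    # half precision, saves GPU memory
            "float32": torch.float32,    # full-precision default
        }[name]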
@@ -553,6 +563,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
     in_allow_tensorfloat32.change(tensorfloat32_change, inputs=[in_allow_tensorfloat32, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('allow_tensorfloat32', value, config)")
     in_variant.change(variant_change, inputs=[in_variant, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('variant', value, config)")
     in_attention_slicing.change(attention_slicing_change, inputs=[in_attention_slicing, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('attention_slicing', value, config)")
+    in_pre_compile_unet.change(pre_compile_unet_change, inputs=[in_pre_compile_unet, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('pre_compile_unet', value, config)")
     in_model_refiner.change(model_refiner_change, inputs=[in_model_refiner, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('model_refiner', value, config)")
     in_cpu_offload.change(cpu_offload_change, inputs=[in_cpu_offload, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('cpu_offload', value, config)")
     in_safety_checker.change(safety_checker_change, inputs=[in_safety_checker, config], outputs=[config, out_config, out_code], js="(value, config) => set_cookie('safety_checker', value, config)")
@@ -586,6 +597,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         in_model_refiner,
         in_variant,
         in_attention_slicing,
+        in_pre_compile_unet,
         in_safety_checker,
         in_requires_safety_checker,
         in_auto_encoders,
config.py CHANGED
@@ -41,6 +41,7 @@ def get_initial_config():
         "scheduler": None,
         "variant": None,
         "attention_slicing": "False",
+        "pre_compile_unet": "False",
         "allow_tensorfloat32": allow_tensorfloat32,
         "use_safetensors": "False",
         "data_type": data_type,
@@ -104,6 +105,7 @@ def get_config_from_url(initial_config, request: Request):
         return_config['refiner'],
         return_config['variant'],
         return_config['attention_slicing'],
+        return_config['pre_compile_unet'],
         return_config['safety_checker'],
         return_config['requires_safety_checker'],
         return_config['auto_encoder'],
@@ -175,6 +177,7 @@ def assemble_code(str_config):
         variant=variant).to(device)''')
 
     if str(config["attention_slicing"]).lower() != 'false': code.append("pipeline.enable_attention_slicing()")
+    if str(config["pre_compile_unet"]).lower() != 'false': code.append('pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)')
 
     if str(config["cpu_offload"]).lower() != 'false': code.append("pipeline.enable_model_cpu_offload()")
 
@@ -191,7 +194,7 @@ def assemble_code(str_config):
         "{config['refiner']}",
         text_encoder_2 = base.text_encoder_2,
         vae = base.vae,
-        torch_dtype = data_type,
+        torch_dtype = data_type,
         use_safetensors = use_safetensors,
         variant=variant,
     ).to(device)''')
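One detail worth noting in assemble_code: since it emits Python source as plain strings, the torch.compile line is appended with single quotes on the outside so that the double quotes in mode="reduce-overhead" survive intact. A short sketch of the emitted snippet (the code list and join are simplified stand-ins for what assemble_code actually builds):

    code = []
    code.append("pipeline.enable_attention_slicing()")
    code.append('pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)')

    # The generated script is just the joined lines:
    print("\n".join(code))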