import gradio as gr
import torch
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline, AutoencoderKL
from diffusers.utils import load_image
from transformers import DPTImageProcessor, DPTForDepthEstimation

import sa_handler
import pipeline_calls

# Initialize models
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
feature_processor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-depth-sdxl-1.0",
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")
vae = AutoencoderKL.from_pretrained(
    "madebyollin/sdxl-vae-fp16-fix",
    torch_dtype=torch.float16,
).to("cuda")
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=controlnet,
    vae=vae,
    variant="fp16",
    use_safetensors=True,
    torch_dtype=torch.float16,
).to("cuda")

# Configure pipeline for CPU offloading and VAE slicing
# NOTE(review): the pipeline is moved to CUDA above and then switched to model
# CPU offload here; the explicit `.to("cuda")` is probably redundant -- confirm
# against the installed diffusers version before removing it.
pipeline.enable_model_cpu_offload()
pipeline.enable_vae_slicing()

# Initialize style-aligned handler
sa_args = sa_handler.StyleAlignedArgs(
    share_group_norm=False,
    share_layer_norm=False,
    share_attention=True,
    adain_queries=True,
    adain_keys=True,
    adain_values=False,
)
handler = sa_handler.Handler(pipeline)
handler.register(sa_args)


# Function to run ControlNet depth with StyleAligned
def style_aligned_controlnet(ref_style_prompt, depth_map, ref_image, img_generation_prompt):
    """Run the ControlNet pipeline on a reference prompt and a generation prompt.

    Args:
        ref_style_prompt: First prompt passed to the pipeline (the UI labels
            it as the prompt that generates the reference image).
        depth_map: When truthy, a depth map is computed from ``ref_image`` and
            used as the conditioning image; otherwise ``ref_image`` itself,
            resized to 1024x1024, is used.
        ref_image: Image source understood by ``diffusers.utils.load_image``.
        img_generation_prompt: Second prompt passed to the pipeline.

    Returns:
        A 2-tuple: a list ``[images[0], conditioning_image, *images[1:]]`` and
        a visible ``gr.Image`` showing ``images[0]``.

    Raises:
        gr.Error: If any step fails; the original exception is chained as the
            cause so the traceback is preserved.
    """
    try:
        if depth_map:
            image = load_image(ref_image)
            depth_image = pipeline_calls.get_depth_map(image, feature_processor, depth_estimator)
        else:
            depth_image = load_image(ref_image).resize((1024, 1024))

        controlnet_conditioning_scale = 0.8
        num_images_per_prompt = 3  # adjust according to VRAM size

        # One latent for the first (reference) slot plus one per generated image.
        # NOTE(review): 4x128x128 presumably matches the latent shape of a
        # 1024x1024 SDXL image -- confirm against the pipeline configuration.
        latents = torch.randn(1 + num_images_per_prompt, 4, 128, 128).to(pipeline.unet.dtype)
        # Re-draws the non-reference latents; kept as in the original code.
        latents[1:] = torch.randn(num_images_per_prompt, 4, 128, 128).to(pipeline.unet.dtype)

        images = pipeline_calls.controlnet_call(
            pipeline,
            [ref_style_prompt, img_generation_prompt],
            image=depth_image,
            num_inference_steps=50,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            num_images_per_prompt=num_images_per_prompt,
            latents=latents,
        )
        # presumably `images` is a list (it is sliced and concatenated to one).
        return [images[0], depth_image] + images[1:], gr.Image(value=images[0], visible=True)
    except Exception as e:
        # Surface the failure in the Gradio UI while keeping the original cause.
        raise gr.Error(f"Error in generating images:{e}") from e


# Create a Gradio UI with gr.Blocks() as demo: gr.HTML('

Style-aligned with ControlNet Depth

') with gr.Row(): with gr.Column(variant='panel'): # Textbox for reference style prompt ref_style_prompt = gr.Textbox( label='Reference style prompt', info="Enter a Prompt to generate the reference image", placeholder='a poster in