"""Gradio demo that re-renders an input image at a higher resolution.

The input is padded to a square, resized to 1024x1024, turned into a Canny
edge map, and fed to an SDXL ControlNet img2img pipeline (patched by
``apply_hidiffusion``) that renders at ``1024 * scale`` pixels per side.
"""
import os
import time

# NOTE(review): `spaces` stays ahead of torch/diffusers, as in the original
# import order; presumably required by the Spaces GPU runtime - confirm
# before re-sorting these imports.
import spaces
import cv2
import gradio as gr
import numpy as np
import torch
from compel import Compel, ReturnedEmbeddingsType
from diffusers import (
    ControlNetModel,
    DDIMScheduler,
    StableDiffusionXLControlNetImg2ImgPipeline,
)
from gradio_imageslider import ImageSlider
from hidiffusion import apply_hidiffusion
from PIL import Image

# Side length (pixels) of the square working image before magnification.
BASE_SIDE = 1024
# Largest seed offered by the UI; seeds are wrapped into [0, MAX_SEED].
MAX_SEED = 2**64 - 1

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"

print(f"device: {device}")
print(f"dtype: {dtype}")
print(f"low memory: {LOW_MEMORY}")

model = "stabilityai/stable-diffusion-xl-base-1.0"
# Disabled alternative VAE (would also require importing AutoencoderKL):
# vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=dtype)
scheduler = DDIMScheduler.from_pretrained(model, subfolder="scheduler")

controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
)
pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
    model,
    controlnet=controlnet,
    torch_dtype=dtype,
    variant="fp16",
    use_safetensors=True,
    scheduler=scheduler,
)
pipe.enable_xformers_memory_efficient_attention()
# pipe.enable_model_cpu_offload()
pipe.enable_vae_tiling()
apply_hidiffusion(pipe)

# Compel turns weighted prompt syntax into SDXL embeddings; only the second
# text encoder is asked for a pooled output.
compel = Compel(
    tokenizer=[pipe.tokenizer, pipe.tokenizer_2],
    text_encoder=[pipe.text_encoder, pipe.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)
pipe = pipe.to(device)


def pad_image(image):
    """Return *image* centered on a black square canvas.

    Args:
        image: A PIL image.

    Returns:
        The same object if it is already square; otherwise a new image of the
        same mode whose side equals the longer edge of *image*, with the
        original pasted in the middle.
    """
    w, h = image.size
    if w == h:
        return image
    side = max(w, h)
    canvas = Image.new(image.mode, (side, side), (0, 0, 0))
    # One of the two offsets is always zero (the longer edge fills the canvas).
    canvas.paste(image, ((side - w) // 2, (side - h) // 2))
    return canvas


def _canny_control_image(image):
    """Build the 3-channel Canny edge map used as the ControlNet input."""
    edges = cv2.Canny(np.array(image), 100, 200)
    # Replicate the single edge channel three times to get an H x W x 3 image.
    edges = np.repeat(edges[:, :, None], 3, axis=2)
    return Image.fromarray(edges)


def _target_side(scale):
    """Return the output side length for *scale* as an int multiple of 8.

    The shipped examples include a fractional scale (4.64); multiplying it
    directly would hand the pipeline a non-integer width/height.
    """
    return max(8, int(round(BASE_SIDE * float(scale) / 8)) * 8)


@spaces.GPU
def predict(
    input_image,
    prompt,
    negative_prompt,
    seed,
    controlnet_conditioning_scale,
    guidance_scale=8.5,
    scale=2,
    strength=1.0,
    controlnet_start=0.0,
    controlnet_end=1.0,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate an enhanced version of *input_image*.

    Args:
        input_image: PIL image from the UI, or ``None`` if nothing was uploaded.
        prompt: Positive prompt (passed through Compel).
        negative_prompt: Negative prompt (passed through Compel).
        seed: Seed for the torch random generator.
        controlnet_conditioning_scale: Weight of the ControlNet conditioning.
        guidance_scale: Guidance scale forwarded to the pipeline.
        scale: Magnification factor applied to the 1024px working size.
        strength: img2img strength; clamped to [0, 1].
        controlnet_start: Start of the ControlNet guidance window.
        controlnet_end: End of the ControlNet guidance window.
        progress: Gradio progress tracker (mirrors tqdm output).

    Returns:
        A ``(padded_input, generated)`` tuple of PIL images for the slider.

    Raises:
        gr.Error: If no image was uploaded or the ControlNet window is empty.
    """
    if input_image is None:
        raise gr.Error("Please upload an image.")

    controlnet_start = float(controlnet_start)
    controlnet_end = float(controlnet_end)
    if controlnet_start >= controlnet_end:
        raise gr.Error("ControlNet Start must be smaller than ControlNet End.")

    # img2img strength is only meaningful in [0, 1].
    strength = min(max(float(strength), 0.0), 1.0)
    side = _target_side(scale)

    padded_image = pad_image(input_image).resize((BASE_SIDE, BASE_SIDE)).convert("RGB")
    conditioning, pooled = compel([prompt, negative_prompt])
    # Slider values may arrive as floats; wrap into the 64-bit seed range so a
    # value rounded up to 2**64 cannot overflow the generator.
    generator = torch.manual_seed(int(seed) % (MAX_SEED + 1))
    last_time = time.time()

    canny_image = _canny_control_image(padded_image)
    images = pipe(
        image=padded_image,
        control_image=canny_image,
        strength=strength,
        prompt_embeds=conditioning[0:1],
        pooled_prompt_embeds=pooled[0:1],
        negative_prompt_embeds=conditioning[1:2],
        negative_pooled_prompt_embeds=pooled[1:2],
        width=side,
        height=side,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
        # The diffusers pipeline names these control_guidance_start/_end; the
        # previous controlnet_start/controlnet_end keywords are not pipeline
        # parameters, so the sliders likely had no effect.
        # NOTE(review): confirm against the installed diffusers/hidiffusion.
        control_guidance_start=controlnet_start,
        control_guidance_end=controlnet_end,
        generator=generator,
        num_inference_steps=40,
        guidance_scale=guidance_scale,
        eta=1.0,
    )
    print(f"Time taken: {time.time() - last_time}")
    return (padded_image, images.images[0])


css = """
#intro{
    # max-width: 32rem;
    # text-align: center;
    # margin: 0 auto;
}
"""

with gr.Blocks(css=css) as demo:
    # NOTE(review): the intro text and links mention DemoFusion, while this
    # script applies HiDiffusion; the wording is left unchanged - confirm.
    gr.Markdown(
        """
# Enhance This
### DemoFusion SDXL

[DemoFusion](https://ruoyidu.github.io/demofusion/demofusion.html) enables higher-resolution image generation.
You can upload an initial image and prompt to generate an enhanced version.
[Duplicate Space](https://huggingface.co/spaces/radames/Enhance-This-DemoFusion-SDXL?duplicate=true) to avoid the queue.

GPU Time Comparison: T4: ~276s - A10G: ~113.6s A100: ~43.5s RTX 4090: ~48.1s

Notes The author advises against the term "super resolution" because it's more like image-to-image generation than enhancement, but it's still a lot of fun!
""",
        elem_id="intro",
    )
    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Input Image")
            prompt = gr.Textbox(
                label="Prompt",
                info="The prompt is very important to get the desired results. Please try to describe the image as best as you can. Accepts Compel Syntax",
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="blurry, ugly, duplicate, poorly drawn, deformed, mosaic",
            )
            seed = gr.Slider(
                minimum=0,
                maximum=2**64 - 1,
                value=1415926535897932,
                step=1,
                label="Seed",
                randomize=True,
            )
            with gr.Accordion(label="Advanced", open=False):
                guidance_scale = gr.Slider(
                    minimum=0,
                    maximum=50,
                    value=8.5,
                    step=0.001,
                    label="Guidance Scale",
                )
                scale = gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=2,
                    step=1,
                    label="Magnification Scale",
                    # interactive=False,
                )
                controlnet_conditioning_scale = gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.001,
                    value=0.5,
                    label="ControlNet Conditioning Scale",
                )
                # img2img strength is only valid in [0, 1]; the former
                # maximum of 2 allowed values the pipeline cannot use.
                strength = gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.001,
                    value=1,
                    label="Strength",
                )
                controlnet_start = gr.Slider(
                    minimum=0,
                    maximum=1,
                    step=0.001,
                    value=0.0,
                    label="ControlNet Start",
                )
                controlnet_end = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.001,
                    value=1.0,
                    label="ControlNet End",
                )

            btn = gr.Button()
        with gr.Column(scale=2):
            image_slider = ImageSlider(position=0.5)

    # Order must match the positional parameters of predict().
    inputs = [
        image_input,
        prompt,
        negative_prompt,
        seed,
        controlnet_conditioning_scale,
        guidance_scale,
        scale,
        strength,
        controlnet_start,
        controlnet_end,
    ]
    outputs = [image_slider]
    btn.click(predict, inputs=inputs, outputs=outputs, concurrency_limit=1)
    gr.Examples(
        fn=predict,
        examples=[
            [
                "./examples/lara.jpeg",
                "photography of lara croft 8k high definition award winning",
                "blurry, ugly, duplicate, poorly drawn, deformed, mosaic",
                5436236241,
                0.5,
                8.5,
                3,
                0.8,
                0.0,
                1.0,
            ],
            [
                "./examples/cybetruck.jpeg",
                "photo of tesla cybertruck futuristic car 8k high definition on a sand dune in mars, future",
                "blurry, ugly, duplicate, poorly drawn, deformed, mosaic",
                383472451451,
                0.5,
                8.5,
                3,
                0.8,
                0.0,
                1.0,
            ],
            [
                "./examples/jesus.png",
                "a photorealistic painting of Jesus Christ, 4k high definition",
                "blurry, ugly, duplicate, poorly drawn, deformed, mosaic",
                13317204146129588000,
                0.5,
                8.5,
                3,
                0.8,
                0.0,
                1.0,
            ],
            [
                "./examples/anna-sullivan-DioLM8ViiO8-unsplash.jpg",
                "A crowded stadium with enthusiastic fans watching a daytime sporting event, the stands filled with colorful attire and the sun casting a warm glow",
                "blurry, ugly, duplicate, poorly drawn, deformed, mosaic",
                5623124123512,
                0.5,
                8.5,
                3,
                0.8,
                0.0,
                1.0,
            ],
            [
                "./examples/img_aef651cb-2919-499d-aa49-6d4e2e21a56e_1024.jpg",
                "a large red flower on a black background 4k high definition",
                "blurry, ugly, duplicate, poorly drawn, deformed, mosaic",
                23123412341234,
                0.5,
                8.5,
                3,
                0.8,
                0.0,
                1.0,
            ],
            [
                "./examples/huggingface.jpg",
                "photo realistic huggingface human+++ emoji costume, round, yellow, skin+++ texture+++",
                "blurry, ugly, duplicate, poorly drawn, deformed, mosaic, emoji cartoon, drawing, pixelated",
                5532144938416372000,
                0.101,
                25.206,
                4.64,
                0.8,
                0.0,
                1.0,
            ],
        ],
        inputs=inputs,
        outputs=outputs,
        cache_examples="lazy",
    )

demo.queue(api_open=False)
demo.launch(show_api=False)