import cv2
import torch
import random
import tempfile
import numpy as np
from pathlib import Path
from PIL import Image
from diffusers import (
    ControlNetModel,
    StableDiffusionXLControlNetPipeline,
    UNet2DConditionModel,
    EulerDiscreteScheduler,
)
import spaces
import gradio as gr
from huggingface_hub import hf_hub_download, snapshot_download
from ip_adapter import IPAdapterXL
from safetensors.torch import load_file

# Fetch the SDXL IP-Adapter weights (image encoder + adapter checkpoint) into
# the working directory so the relative "sdxl_models/..." paths below resolve.
snapshot_download(
    repo_id="h94/IP-Adapter", allow_patterns="sdxl_models/*", local_dir="."
)

# CPU fallback & pipeline definition.
MAX_SEED = np.iinfo(np.int32).max
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision only on CUDA; the CPU fallback runs in float32.
dtype = torch.float16 if "cuda" in device else torch.float32

# Model locations: SDXL base, IP-Adapter image encoder/checkpoint and the
# Canny ControlNet (TODO: test which ControlNet variant works best).
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
image_encoder_path = "sdxl_models/image_encoder"
ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
controlnet_path = "diffusers/controlnet-canny-sdxl-1.0"

# Use the dtype selected above instead of a hard-coded float16 so the CPU
# fallback actually gets float32 weights.
controlnet = ControlNetModel.from_pretrained(
    controlnet_path, use_safetensors=False, torch_dtype=dtype
).to(device)

# Load SDXL with the ControlNet attached; the Lightning UNet weights are
# swapped in below (put Turbo here if falling back to Comfy, @Litto).
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    base_model_path,
    controlnet=controlnet,
    torch_dtype=dtype,
    variant="fp16",
    add_watermark=False,
).to(device)
pipe.set_progress_bar_config(disable=True)

# Euler scheduler with trailing timestep spacing and epsilon prediction.
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config, timestep_spacing="trailing", prediction_type="epsilon"
)

# Replace the base UNet weights with the 2-step SDXL-Lightning checkpoint.
# The tensors are loaded onto the selected device rather than a hard-coded
# "cuda", which would fail on CPU-only hosts.
pipe.unet.load_state_dict(
    load_file(
        hf_hub_download(
            "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
        ),
        device=device,
    )
)

# load ip-adapter with specific target blocks for style transfer and layout preservation. Should be better than Comfy! Test this!
# target_blocks=["block"] for original IP-Adapter
# target_blocks=["up_blocks.0.attentions.1"] for style blocks only
# target_blocks = ["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"] # for style+layout blocks
ip_model = IPAdapterXL(
    pipe,
    image_encoder_path,
    ip_ckpt,
    device,
    target_blocks=["up_blocks.0.attentions.1"],
)

# Maps the UI mode label to the `target_blocks` list handed to IPAdapterXL.
TARGET_BLOCKS = {
    "Load original IP-Adapter": ["blocks"],
    "Load only style blocks": ["up_blocks.0.attentions.1"],
    "Load style+layout block": [
        "up_blocks.0.attentions.1",
        "down_blocks.2.attentions.1",
    ],
}
# Mode used when an unknown label is passed; matches create_image's default.
DEFAULT_TARGET = "Load only style blocks"


# Resizing the input image (OpenCV goes here).
# TODO: test with a smaller side length for faster inference.
def resize_img(
    input_image,
    max_side=1280,
    min_side=1024,
    size=None,
    pad_to_max_side=False,
    mode=Image.BILINEAR,
    base_pixel_number=64,
):
    """Resize a PIL image, optionally padding it onto a white square canvas.

    Args:
        input_image: PIL image to resize.
        max_side: Upper bound for the longer side after scaling; also the
            edge length of the canvas when ``pad_to_max_side`` is set.
        min_side: Target length the shorter side is scaled to before the
            ``max_side`` bound is applied.
        size: Optional explicit ``(width, height)``. When given, the
            ratio-based computation is skipped entirely.
        pad_to_max_side: If true, centre the result on a white
            ``max_side`` x ``max_side`` RGB canvas.
        mode: PIL resampling filter.
        base_pixel_number: The computed width/height are rounded down to a
            multiple of this value.

    Returns:
        The resized (and optionally padded) PIL image.
    """
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        # Scale the shorter side to min_side, then shrink so that the longer
        # side fits max_side.
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        # max(1, ...) keeps extreme aspect ratios from producing a 0-px side.
        scaled_w = max(1, round(ratio * w))
        scaled_h = max(1, round(ratio * h))
        input_image = input_image.resize([scaled_w, scaled_h], mode)
        # Snap to a multiple of base_pixel_number, never below one unit.
        w_resize_new = max(
            base_pixel_number, (scaled_w // base_pixel_number) * base_pixel_number
        )
        h_resize_new = max(
            base_pixel_number, (scaled_h // base_pixel_number) * base_pixel_number
        )
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        # The canvas has three channels, so force RGB before pasting.
        res[
            offset_y : offset_y + h_resize_new, offset_x : offset_x + w_resize_new
        ] = np.array(input_image.convert("RGB"))
        input_image = Image.fromarray(res)
    return input_image


# TODO: expand example images for endpoints and tell Johannes/Jascha what to expect.
# Each row: style image, source image, prompt, scale, control_scale.
examples = [
    [
        "./asset/0.jpg",
        None,
        "3D model, cute monster, test prompt",
        1.0,
        0.0,
    ],
    [
        "./asset/2.jpg",
        "./asset/house.jpg",
        "3D model, cute, kawai, house, another test prompt",
        1.0,
        0.6,
    ],
]


def run_for_examples(style_image, source_image, prompt, scale, control_scale):
    """Run ``create_image`` for a gallery example with fixed sampling settings.

    Uses a fixed negative prompt, guidance scale 0.0, 2 inference steps,
    seed 42 and the style-only mode; returns whatever ``create_image`` returns.
    """
    return create_image(
        image_pil=style_image,
        input_image=source_image,
        prompt=prompt,
        n_prompt="text, watermark, low res, low quality, worst quality, deformed, blurry",
        scale=scale,
        control_scale=control_scale,
        guidance_scale=0.0,
        num_inference_steps=2,
        seed=42,
        target="Load only style blocks",
        neg_content_prompt="",
        neg_content_scale=0,
    )


# Main function for image synthesis (input -> run_for_examples)
@spaces.GPU(enable_queue=True)
def create_image(
    image_pil,
    input_image,
    prompt,
    n_prompt,
    scale,
    control_scale,
    guidance_scale,
    num_inference_steps,
    seed,
    target="Load only style blocks",
    neg_content_prompt=None,
    neg_content_scale=0,
):
    """Generate one stylised image and return the path of the saved JPEG.

    Args:
        image_pil: Style reference image (PIL). Required.
        input_image: Optional guidance image (PIL); its Canny edge map is fed
            to the ControlNet. When ``None`` a blank map is used and the
            ControlNet scale is forced to 0.
        prompt: Text prompt.
        n_prompt: Negative text prompt.
        scale: IP-Adapter scale.
        control_scale: ControlNet conditioning scale.
        guidance_scale: Guidance scale passed to the generator.
        num_inference_steps: Number of sampling steps.
        seed: Seed; ``-1`` picks a random one in ``[0, MAX_SEED]``.
        target: One of the ``TARGET_BLOCKS`` keys; unknown values fall back
            to ``DEFAULT_TARGET``.
        neg_content_prompt: Optional negative content prompt.
        neg_content_scale: Strength of the negative content prompt; 0 disables it.

    Returns:
        ``pathlib.Path`` of a temporary JPEG file holding the result.

    Raises:
        gr.Error: If no style image was supplied.
    """
    if image_pil is None:
        raise gr.Error("Please provide a style image.")

    # Sliders may deliver floats; the seed must be an integer.
    seed = random.randint(0, MAX_SEED) if seed == -1 else int(seed)

    # Re-create the adapter for the requested mode. Falling back to the
    # default avoids an unbound `ip_model` when the label is not recognised.
    blocks = TARGET_BLOCKS.get(target, TARGET_BLOCKS[DEFAULT_TARGET])
    ip_model = IPAdapterXL(
        pipe, image_encoder_path, ip_ckpt, device, target_blocks=blocks
    )

    if input_image is not None:
        input_image = resize_img(input_image, max_side=1024)
        cv_input_image = pil_to_cv2(input_image)
        detected_map = cv2.Canny(cv_input_image, 50, 200)
        # Canny yields a single-channel edge map; expand it to 3-channel RGB.
        canny_map = Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_GRAY2RGB))
    else:
        canny_map = Image.new("RGB", (1024, 1024), color=(255, 255, 255))
        control_scale = 0

    if float(control_scale) == 0:
        canny_map = canny_map.resize((1024, 1024))

    # Forward the negative content prompt only when it is set and enabled.
    # NOTE(review): assumes the installed ip_adapter's IPAdapterXL.generate
    # accepts neg_content_prompt / neg_content_scale - confirm.
    neg_content_kwargs = {}
    if neg_content_prompt and neg_content_scale != 0:
        neg_content_kwargs = {
            "neg_content_prompt": neg_content_prompt,
            "neg_content_scale": neg_content_scale,
        }

    images = ip_model.generate(
        pil_image=image_pil,
        prompt=prompt,
        negative_prompt=n_prompt,
        scale=scale,
        guidance_scale=guidance_scale,
        num_samples=1,
        num_inference_steps=int(num_inference_steps),
        seed=seed,
        image=canny_map,
        controlnet_conditioning_scale=float(control_scale),
        **neg_content_kwargs,
    )
    image = images[0]

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmpfile:
        # TODO: check what happens to the images when these settings change.
        image.save(tmpfile, "JPEG", quality=80, optimize=True, progressive=True)
    return Path(tmpfile.name)


def pil_to_cv2(image_pil):
    """Convert a PIL image to a BGR NumPy array as used by OpenCV."""
    # Force RGB first so other PIL modes do not break the colour conversion.
    image_np = np.array(image_pil.convert("RGB"))
    image_cv2 = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
    return image_cv2


# Gradio description & frontend for the Space (remove this for the endpoint).
title = r"""

MewMewMew: Simsalabim!

"""

description = r"""
Let's test this! ARM <3 GoldExtra
SDXL-Lightning && IP-Adapter
"""

article = r"""
Ask Hidéo if something breaks: Hidéo's Mail
"""

block = gr.Blocks()
with block:
    # description
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Tabs():
        with gr.Row():
            # Left column: all inputs.
            with gr.Column():
                with gr.Row():
                    with gr.Column():
                        image_pil = gr.Image(label="Style Image", type="pil")
                    with gr.Column():
                        prompt = gr.Textbox(
                            label="Prompt",
                            value="mewmewmew, kitty cats, unicorns, uWu",
                        )
                        scale = gr.Slider(
                            minimum=0,
                            maximum=2.0,
                            step=0.01,
                            value=1.0,
                            label="Maßstab // scale",
                        )

                with gr.Accordion(open=False, label="Für Details erweitern!"):
                    target = gr.Radio(
                        [
                            "Load only style blocks",
                            "Load style+layout block",
                            "Load original IP-Adapter",
                        ],
                        value="Load only style blocks",
                        label="Modus für IP-Adapter auswählen",
                    )
                    with gr.Column():
                        src_image_pil = gr.Image(
                            label="Guidance Image (optional)", type="pil"
                        )
                    control_scale = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        step=0.1,
                        value=0.5,
                        label="ControlNet-Stärke // control_scale",
                    )
                    n_prompt = gr.Textbox(
                        label="Negative Prompts",
                        value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
                    )
                    neg_content_prompt = gr.Textbox(
                        label="Negative Content Prompt (optional)", value=""
                    )
                    neg_content_scale = gr.Slider(
                        minimum=0,
                        maximum=1.0,
                        step=0.1,
                        value=0.5,
                        label="Negative Content Stärke // neg_content_scale",
                    )
                    guidance_scale = gr.Slider(
                        minimum=0,
                        maximum=10.0,
                        step=0.01,
                        value=0.0,
                        label="guidance-scale",
                    )
                    num_inference_steps = gr.Slider(
                        minimum=2,
                        maximum=50.0,
                        step=1.0,
                        value=2,
                        label="Anzahl der Inference Steps (optional) // num_inference_steps",
                    )
                    seed = gr.Slider(
                        minimum=-1,
                        maximum=MAX_SEED,
                        value=-1,
                        step=1,
                        label="Seed Value // -1 = random // Seed-Proof=True",
                    )

                generate_button = gr.Button("Simsalabim")

            # Right column: the generated result.
            with gr.Column():
                generated_image = gr.Image(label="MewMewMagix uWu")

    # Order must match the positional parameters of create_image.
    inputs = [
        image_pil,
        src_image_pil,
        prompt,
        n_prompt,
        scale,
        control_scale,
        guidance_scale,
        num_inference_steps,
        seed,
        target,
        neg_content_prompt,
        neg_content_scale,
    ]
    outputs = [generated_image]

    # Regenerate on button click and on live edits of the main controls.
    gr.on(
        triggers=[
            prompt.input,
            generate_button.click,
            guidance_scale.input,
            scale.input,
            control_scale.input,
            seed.input,
        ],
        fn=create_image,
        inputs=inputs,
        outputs=outputs,
        show_progress="minimal",
        show_api=False,
        trigger_mode="always_last",
    )

    gr.Examples(
        examples=examples,
        inputs=[image_pil, src_image_pil, prompt, scale, control_scale],
        fn=run_for_examples,
        outputs=[generated_image],
        cache_examples=False,
    )

    gr.Markdown(article)

block.queue(api_open=False)
block.launch(show_api=False)