hideosnes committed
Commit d51b2d2
1 Parent(s): 73b7db1

Update app.py

Files changed (1)
  1. app.py +160 -99
app.py CHANGED
@@ -21,12 +21,12 @@ snapshot_download(
     repo_id="h94/IP-Adapter", allow_patterns="sdxl_models/*", local_dir="."
 )
 
-# CPU fallback & pipeline-definition
+# global variable
 MAX_SEED = np.iinfo(np.int32).max
 device = "cuda" if torch.cuda.is_available() else "cpu"
 dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
 
-# load models & scheduler (==>EULER) & CN (==>canny > test what's better!!!)
+# initialization
 base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
 image_encoder_path = "sdxl_models/image_encoder"
 ip_ckpt = "sdxl_models/ip-adapter_sdxl.bin"
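The `dtype` line still derives the precision from a substring test on the device string. The intent is simply fp16 on GPU, fp32 on CPU; a minimal sketch of the same fallback with a plain equality check (assumes only `torch`):

```python
import torch

# Pick the device first, then derive the compute dtype from it:
# float16 on CUDA, float32 on CPU (half precision is poorly supported on CPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
```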
@@ -36,14 +36,14 @@ controlnet = ControlNetModel.from_pretrained(
     controlnet_path, use_safetensors=False, torch_dtype=torch.float16
 ).to(device)
 
-# load SDXL lightning >> put Turbo here if fallback to Comfy @Litto
+# load SDXL lightnining
 
 pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
     base_model_path,
-    controlnet = controlnet,
+    controlnet=controlnet,
     torch_dtype=torch.float16,
     variant="fp16",
-    add_watermark=False,
+    add_watermarker=False,
 ).to(device)
 pipe.set_progress_bar_config(disable=True)
 pipe.scheduler = EulerDiscreteScheduler.from_config(
@@ -51,14 +51,14 @@ pipe.scheduler = EulerDiscreteScheduler.from_config(
 )
 pipe.unet.load_state_dict(
     load_file(
-        hf_hub_download(
-            "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
-        ),
-        device="cuda",
-    )
+        hf_hub_download(
+            "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
+        ),
+        device="cuda",
+    )
 )
 
-# load ip-adapter with specific target blocks for style transfer and layout preservation. Should be better than Comfy! Test this!
+# load ip-adapter
 # target_blocks=["block"] for original IP-Adapter
 # target_blocks=["up_blocks.0.attentions.1"] for style blocks only
 # target_blocks = ["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"] # for style+layout blocks
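The `from_config` arguments fall outside this hunk, so only the call site is visible. For context, the ByteDance/SDXL-Lightning model card loads the distilled UNet the same way and configures the Euler scheduler with trailing timestep spacing; a sketch under that assumption, reusing the `pipe` defined above:

```python
from diffusers import EulerDiscreteScheduler
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# Swap the base SDXL UNet weights for the distilled 2-step checkpoint.
ckpt = hf_hub_download(
    "ByteDance/SDXL-Lightning", "sdxl_lightning_2step_unet.safetensors"
)
pipe.unet.load_state_dict(load_file(ckpt, device="cuda"))

# Lightning checkpoints are trained with trailing timestep spacing and are
# typically run with guidance disabled, matching this app's guidance_scale=0.0
# default further down.
pipe.scheduler = EulerDiscreteScheduler.from_config(
    pipe.scheduler.config, timestep_spacing="trailing"
)
```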
@@ -67,12 +67,9 @@ ip_model = IPAdapterXL(
     image_encoder_path,
     ip_ckpt,
     device,
-    target_blocks=["up_blocks.0.attentions.1"]
+    target_blocks=["up_blocks.0.attentions.1"],
 )
 
-# Resizing the input image
-# OpenCV goes here!!!
-# Test this with smaller side-no for faster infr
 
 def resize_img(
     input_image,
@@ -91,9 +88,8 @@ def resize_img(
     w, h = round(ratio * w), round(ratio * h)
     ratio = max_side / max(h, w)
     input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
-    w = (round(ratio * w) // base_pixel_number) * base_pixel_number
-    w = (round(ratio * h) // base_pixel_number) * base_pixel_number
-    nput_image.resize([w_resize_new, h_resize_new], mode)
+    w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
+    h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
     input_image = input_image.resize([w_resize_new, h_resize_new], mode)
 
     if pad_to_max_side:
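This hunk fixes a real bug: the old code assigned both rounded sides to `w`, referenced the then-undefined names `w_resize_new`/`h_resize_new`, and had a truncated `nput_image.resize(...)` call whose result was discarded. The fixed version floors each side to a multiple of `base_pixel_number`. A self-contained sketch of just that rounding step (`base=64` is an assumption about the function's default; diffusion pipelines expect latent-aligned sizes):

```python
def snap_to_multiple(w: int, h: int, base: int = 64) -> tuple[int, int]:
    # Floor each side to the nearest multiple of `base` so the model's
    # latent grid divides the image evenly.
    return (w // base) * base, (h // base) * base

# Example: a 1000x775 input is snapped to 960x768 before inference.
assert snap_to_multiple(1000, 775) == (960, 768)
```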
@@ -106,31 +102,52 @@ def resize_img(
     input_image = Image.fromarray(res)
     return input_image
 
-# expand example images for endpoints --> info an Johannes/Jascha what to expect
 
 examples = [
     [
-        "./asset/0.jpg",
+        "./assets/0.jpg",
         None,
-        "3D model, cute monster, test prompt",
+        "a cat, masterpiece, best quality, high quality",
         1.0,
         0.0,
     ],
+    [
+        "./assets/1.jpg",
+        None,
+        "a cat, masterpiece, best quality, high quality",
+        1.0,
+        0.0,
+    ],
+    [
+        "./assets/2.jpg",
+        None,
+        "a cat, masterpiece, best quality, high quality",
+        1.0,
+        0.0,
+    ],
+    [
+        "./assets/3.jpg",
+        None,
+        "a cat, masterpiece, best quality, high quality",
+        1.0,
+        0.0,
+    ],
     [
-        "./asset/2.jpg",
-        "./asset/house.jpg",
-        "3D model, cute, kawai, house, another test prompt",
+        "./assets/2.jpg",
+        "./assets/yann-lecun.jpg",
+        "a man, masterpiece, best quality, high quality",
         1.0,
         0.6,
     ],
 ]
 
+
 def run_for_examples(style_image, source_image, prompt, scale, control_scale):
     return create_image(
         image_pil=style_image,
         input_image=source_image,
         prompt=prompt,
-        n_prompt="text, watermark, low res, low quality, worst quality, deformed, blurry",
+        n_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
         scale=scale,
         control_scale=control_scale,
         guidance_scale=0.0,
@@ -141,7 +158,6 @@ def run_for_examples(style_image, source_image, prompt, scale, control_scale):
         neg_content_scale=0,
     )
 
-# Main function for image synthesis (input -> run_for_examples)
 
 @spaces.GPU(enable_queue=True)
 def create_image(
@@ -167,12 +183,20 @@ def create_image(
     elif target == "Load only style blocks":
         # target_blocks=["up_blocks.0.attentions.1"] for style blocks only
         ip_model = IPAdapterXL(
-            pipe, image_encoder_path, ip_ckpt, device, target_blocks=["up_blocks.0.attentions.1"],
+            pipe,
+            image_encoder_path,
+            ip_ckpt,
+            device,
+            target_blocks=["up_blocks.0.attentions.1"],
         )
     elif target == "Load style+layout block":
         # target_blocks = ["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"] # for style+layout blocks
         ip_model = IPAdapterXL(
-            pipe, image_encoder_path, ip_ckpt, device, target_blocks=["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"],
+            pipe,
+            image_encoder_path,
+            ip_ckpt,
+            device,
+            target_blocks=["up_blocks.0.attentions.1", "down_blocks.2.attentions.1"],
         )
 
     if input_image is not None:
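The three radio options differ only in which attention blocks the adapter hooks, per the comments earlier in the file. A sketch of the same dispatch as a lookup table; it reuses the module's `pipe`, `image_encoder_path`, `ip_ckpt`, and `device` globals, and `build_ip_model` is a hypothetical helper, not part of this commit:

```python
# UI mode -> target_blocks, per the comments near the IPAdapterXL setup.
TARGET_BLOCKS = {
    "Load original IP-Adapter": ["block"],
    "Load only style blocks": ["up_blocks.0.attentions.1"],
    "Load style+layout block": [
        "up_blocks.0.attentions.1",
        "down_blocks.2.attentions.1",
    ],
}

def build_ip_model(target: str):
    # Note: instantiating per request re-hooks the adapter on every call;
    # caching one instance per mode would avoid the repeated setup.
    return IPAdapterXL(
        pipe, image_encoder_path, ip_ckpt, device,
        target_blocks=TARGET_BLOCKS[target],
    )
```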
@@ -181,7 +205,7 @@ def create_image(
         detected_map = cv2.Canny(cv_input_image, 50, 200)
         canny_map = Image.fromarray(cv2.cvtColor(detected_map, cv2.COLOR_BGR2RGB))
     else:
-        canny_map = Image.new("RGB", (1024, 1024), color=(255,255,255))
+        canny_map = Image.new("RGB", (1024, 1024), color=(255, 255, 255))
         control_scale = 0
 
     if float(control_scale) == 0:
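When a source image is supplied, the ControlNet condition is a Canny edge map; otherwise a blank white canvas is used and `control_scale` is zeroed so the ControlNet has no effect. A standalone sketch of the edge-map step; this version converts the single-channel Canny output with `COLOR_GRAY2RGB`, which is what a one-channel array calls for:

```python
import cv2
import numpy as np
from PIL import Image

def canny_condition(image_pil: Image.Image, low: int = 50, high: int = 200) -> Image.Image:
    # PIL gives RGB; OpenCV expects BGR. Canny then returns a single-channel
    # uint8 edge map, which is lifted back to 3-channel RGB for ControlNet.
    bgr = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
    edges = cv2.Canny(bgr, low, high)
    return Image.fromarray(cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB))
```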
@@ -189,7 +213,22 @@
 
     if len(neg_content_prompt) > 0 and neg_content_scale != 0:
         images = ip_model.generate(
-            pil_image_image_pil,
+            pil_image=image_pil,
+            prompt=prompt,
+            negative_prompt=n_prompt,
+            scale=scale,
+            guidance_scale=guidance_scale,
+            num_samples=1,
+            num_inference_steps=num_inference_steps,
+            seed=seed,
+            image=canny_map,
+            controlnet_conditioning_scale=float(control_scale),
+            neg_content_prompt=neg_content_prompt,
+            neg_content_scale=neg_content_scale,
+        )
+    else:
+        images = ip_model.generate(
+            pil_image=image_pil,
             prompt=prompt,
             negative_prompt=n_prompt,
             scale=scale,
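The new `else` branch repeats every argument except the two `neg_content_*` ones. A behavior-preserving alternative (a sketch, not what the commit does) builds the shared kwargs once so the branches cannot drift apart:

```python
gen_kwargs = dict(
    pil_image=image_pil,
    prompt=prompt,
    negative_prompt=n_prompt,
    scale=scale,
    guidance_scale=guidance_scale,
    num_samples=1,
    num_inference_steps=num_inference_steps,
    seed=seed,
    image=canny_map,
    controlnet_conditioning_scale=float(control_scale),
)
if len(neg_content_prompt) > 0 and neg_content_scale != 0:
    # Only the negative-content pair differs between the two branches.
    gen_kwargs.update(
        neg_content_prompt=neg_content_prompt,
        neg_content_scale=neg_content_scale,
    )
images = ip_model.generate(**gen_kwargs)
```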
@@ -202,31 +241,47 @@
     )
     image = images[0]
     with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmpfile:
-        image.save(tmpfile, "JPEG", quality=80, optimize=True, progressive=True) # check what happens to imgs when this changes!!!
+        image.save(tmpfile, "JPEG", quality=80, optimize=True, progressive=True)
     return Path(tmpfile.name)
-
+
+
 def pil_to_cv2(image_pil):
     image_np = np.array(image_pil)
     image_cv2 = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
     return image_cv2
 
-# Gradio Description & Frontend Stuff for Space (remove this for Endpoint)
+
+# Description
 title = r"""
-<h1 align="center">MewMewMew: Simsalabim!</h1>
+<h1 align="center">InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation</h1>
 """
 
 description = r"""
-<b>Let's test this! ARM <3 GoldExtra</b><br>
-<b>SDXL-Lightning && IP-Adapter</b>
+<b>Forked from <a href='https://github.com/InstantStyle/InstantStyle' target='_blank'>InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation</a>.<br>
+<b>Model by <a href='https://huggingface.co/ByteDance/SDXL-Lightning' target='_blank'>SDXL Lightning</a> and <a href='https://huggingface.co/h94/IP-Adapter' target='_blank'>IP-Adapter</a>.</b><br>
 """
 
 article = r"""
-Ask Hidéo if something breaks: <a href="mailto:hideo@artificialmuseum.com">Hidéo's Mail</a>
+---
+📝 **Citation**
+<br>
+If our work is helpful for your research or applications, please cite us via:
+```bibtex
+@article{wang2024instantstyle,
+    title={InstantStyle: Free Lunch towards Style-Preserving in Text-to-Image Generation},
+    author={Wang, Haofan and Wang, Qixun and Bai, Xu and Qin, Zekui and Chen, Anthony},
+    journal={arXiv preprint arXiv:2404.02733},
+    year={2024}
+}
+```
+📧 **Contact**
+<br>
+If you have any questions, please feel free to open an issue or directly reach us out at <b>haofanwang.ai@gmail.com</b>.
 """
 
 block = gr.Blocks()
 with block:
-    #description
+    # description
     gr.Markdown(title)
     gr.Markdown(description)
 
@@ -239,71 +294,77 @@ with block:
         with gr.Column():
             prompt = gr.Textbox(
                 label="Prompt",
-                value="mewmewmew, kitty cats, unicorns, uWu",
+                value="a cat, masterpiece, best quality, high quality",
             )
-
+
             scale = gr.Slider(
-                minimum=0, maximum=2.0, step=0.01, value=1.0, label="Maßstab // scale"
+                minimum=0, maximum=2.0, step=0.01, value=1.0, label="Scale"
             )
-            with gr.Accordion(open=False, label="Für Details erweitern!"):
-                target = gr.Radio(
-                    [
-                        "Load only style blocks",
-                        "Load style+layout block",
-                        "Load original IP-Adapter",
-                    ],
-                    value="Load only style blocks",
-                    label="Modus für IP-Adapter auswählen"
-                )
-
-            with gr.Column():
-                src_image_pil = gr.Image(
-                    label="Guidance Image (optional)", type="pil"
-                )
-                control_scale = gr.Slider(
-                    minimum=0, maximum=1.0, step=0.1, value=0.5,
-                    label="ControlNet-Stärke // control_scale",
-                )
-                n_prompt = gr.Textbox(
-                    label="Negative Prompts",
-                    value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
-                )
-                neg_content_prompt = gr.Textbox(
-                    label="Negative Content Prompt (optional)", value=""
-                )
-                neg_content_scale = gr.Slider(
-                    minimum=0,
-                    maximum=1.0,
-                    step=0.1,
-                    value=0.5,
-                    label="Negative Content Stärke // neg_content_scale"
-                )
-                guidance_scale = gr.Slider(
-                    minimum=0,
-                    maximum=10.0,
-                    step=0.01,
-                    value=0.0,
-                    label="guidance-scale"
-                )
-                num_inference_steps = gr.Slider(
-                    minimum=2,
-                    maximum=50.0,
-                    step=1.0,
-                    value=2,
-                    label="Anzahl der Inference Steps (optional) // num_inference_steps"
-                )
-                seed = gr.Slider(
-                    minimum=-1,
-                    maximum=MAX_SEED,
-                    value=-1,
-                    step=1,
-                    label="Seed Value // -1 = random // Seed-Proof=True"
-                )
+
+            with gr.Accordion(open=False, label="Advanced Options"):
+                target = gr.Radio(
+                    [
+                        "Load only style blocks",
+                        "Load style+layout block",
+                        "Load original IP-Adapter",
+                    ],
+                    value="Load only style blocks",
+                    label="Style mode",
+                )
+                with gr.Column():
+                    src_image_pil = gr.Image(
+                        label="Source Image (optional)", type="pil"
+                    )
+                    control_scale = gr.Slider(
+                        minimum=0,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.5,
+                        label="Controlnet conditioning scale",
+                    )
+
+                    n_prompt = gr.Textbox(
+                        label="Neg Prompt",
+                        value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+                    )
+
+                    neg_content_prompt = gr.Textbox(
+                        label="Neg Content Prompt", value=""
+                    )
+                    neg_content_scale = gr.Slider(
+                        minimum=0,
+                        maximum=1.0,
+                        step=0.01,
+                        value=0.5,
+                        label="Neg Content Scale",
+                    )
+
+                    guidance_scale = gr.Slider(
+                        minimum=0,
+                        maximum=10.0,
+                        step=0.01,
+                        value=0.0,
+                        label="guidance scale",
+                    )
+                    num_inference_steps = gr.Slider(
+                        minimum=2,
+                        maximum=50.0,
+                        step=1.0,
+                        value=2,
+                        label="num inference steps",
+                    )
+                    seed = gr.Slider(
+                        minimum=-1,
+                        maximum=MAX_SEED,
+                        value=-1,
+                        step=1,
+                        label="Seed Value",
+                    )
 
-            generate_button = gr.Button("Simsalabim")
+            generate_button = gr.Button("Generate Image")
 
         with gr.Column():
-            generated_image = gr.Image(label="MewMewMagix uWu")
+            generated_image = gr.Image(label="Generated Image")
 
     inputs = [
         image_pil,
@@ -343,10 +404,10 @@ with block:
         inputs=[image_pil, src_image_pil, prompt, scale, control_scale],
         fn=run_for_examples,
         outputs=[generated_image],
-        cache_examples=False,
+        cache_examples=True,
    )
 
     gr.Markdown(article)
 
-block.queue(api_open=False)
-block.launch(show_api=False)
+block.queue(api_open=False)
+block.launch(show_api=False)