silentchen committed on
Commit 72c1946 · 1 Parent(s): 6ae5687

Upload app.py

Files changed (1)
  1. app.py +250 -452
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import torch
3
  from omegaconf import OmegaConf
4
- # from layout_guidance.inference import inference
5
  from transformers import CLIPTextModel, CLIPTokenizer
6
  from diffusers import AutoencoderKL, LMSDiscreteScheduler
7
  from my_model import unet_2d_condition
@@ -9,151 +8,17 @@ import json
9
  import numpy as np
10
  from PIL import Image, ImageDraw, ImageFont
11
  from functools import partial
12
- from collections import Counter
13
  import math
14
- import gc
15
  from utils import compute_ca_loss
16
  from gradio import processing_utils
17
  from typing import Optional
18
 
19
  import warnings
20
 
21
- from datetime import datetime
22
-
23
- from huggingface_hub import hf_hub_download
24
-
25
- hf_hub_download = partial(hf_hub_download, library_name="gligen_demo")
26
-
27
  import sys
28
 
29
  sys.tracebacklimit = 0
30
 
31
-
32
- def load_from_hf(repo_id, filename='diffusion_pytorch_model.bin', subfolder=None):
33
- cache_file = hf_hub_download(repo_id=repo_id, filename=filename, subfolder=subfolder)
34
- return torch.load(cache_file, map_location='cpu')
35
-
36
-
37
- def load_ckpt_config_from_hf(modality):
38
- ckpt = load_from_hf('gligen/demo_ckpts_legacy', filename=f'{modality}.pth', subfolder='model')
39
- config = load_from_hf('gligen/demo_ckpts_legacy', filename=f'{modality}.pth', subfolder='config')
40
- return ckpt, config
41
-
42
-
43
- def ckpt_load_helper(modality, is_inpaint, is_style, common_instances=None):
44
- pretrained_ckpt_gligen, config = load_ckpt_config_from_hf(modality)
45
- config = OmegaConf.create(config["_content"]) # config used in training
46
- config.alpha_scale = 1.0
47
- config.model['params']['is_inpaint'] = is_inpaint
48
- config.model['params']['is_style'] = is_style
49
-
50
- if common_instances is None:
51
- common_ckpt = load_from_hf('gligen/demo_ckpts_legacy', filename=f'common.pth', subfolder='model')
52
- common_instances = load_common_ckpt(config, common_ckpt)
53
-
54
- loaded_model_list = load_ckpt(config, pretrained_ckpt_gligen, common_instances)
55
-
56
- return loaded_model_list, common_instances
57
-
58
-
59
- class Instance:
60
- def __init__(self, capacity=2):
61
- self.model_type = 'base'
62
- self.loaded_model_list = {}
63
- self.counter = Counter()
64
- self.global_counter = Counter()
65
- self.loaded_model_list['base'], self.common_instances = ckpt_load_helper(
66
- 'gligen-generation-text-box',
67
- is_inpaint=False, is_style=False, common_instances=None
68
- )
69
- self.capacity = capacity
70
-
71
- def _log(self, model_type, batch_size, instruction, phrase_list):
72
- self.counter[model_type] += 1
73
- self.global_counter[model_type] += 1
74
- current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
75
- print('[{}] Current: {}, All: {}. Samples: {}, prompt: {}, phrases: {}'.format(
76
- current_time, dict(self.counter), dict(self.global_counter), batch_size, instruction, phrase_list
77
- ))
78
-
79
- def get_model(self, model_type, batch_size, instruction, phrase_list):
80
- if model_type in self.loaded_model_list:
81
- self._log(model_type, batch_size, instruction, phrase_list)
82
- return self.loaded_model_list[model_type]
83
-
84
- if self.capacity == len(self.loaded_model_list):
85
- least_used_type = self.counter.most_common()[-1][0]
86
- del self.loaded_model_list[least_used_type]
87
- del self.counter[least_used_type]
88
- gc.collect()
89
- torch.cuda.empty_cache()
90
-
91
- self.loaded_model_list[model_type] = self._get_model(model_type)
92
- self._log(model_type, batch_size, instruction, phrase_list)
93
- return self.loaded_model_list[model_type]
94
-
95
- def _get_model(self, model_type):
96
- if model_type == 'base':
97
- return ckpt_load_helper(
98
- 'gligen-generation-text-box',
99
- is_inpaint=False, is_style=False, common_instances=self.common_instances
100
- )[0]
101
- elif model_type == 'inpaint':
102
- return ckpt_load_helper(
103
- 'gligen-inpainting-text-box',
104
- is_inpaint=True, is_style=False, common_instances=self.common_instances
105
- )[0]
106
- elif model_type == 'style':
107
- return ckpt_load_helper(
108
- 'gligen-generation-text-image-box',
109
- is_inpaint=False, is_style=True, common_instances=self.common_instances
110
- )[0]
111
-
112
- assert False
113
-
114
-
115
- # instance = Instance()
116
-
117
-
118
- def load_clip_model():
119
- from transformers import CLIPProcessor, CLIPModel
120
- version = "openai/clip-vit-large-patch14"
121
- model = CLIPModel.from_pretrained(version).cuda()
122
- processor = CLIPProcessor.from_pretrained(version)
123
-
124
- return {
125
- 'version': version,
126
- 'model': model,
127
- 'processor': processor,
128
- }
129
-
130
-
131
- # clip_model = load_clip_model()
132
-
133
-
134
- class ImageMask(gr.components.Image):
135
- """
136
- Sets: source="canvas", tool="sketch"
137
- """
138
-
139
- is_template = True
140
-
141
- def __init__(self, **kwargs):
142
- super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)
143
-
144
- def preprocess(self, x):
145
- if x is None:
146
- return x
147
- if self.tool == "sketch" and self.source in ["upload", "webcam"] and type(x) != dict:
148
- decode_image = processing_utils.decode_base64_to_image(x)
149
- width, height = decode_image.size
150
- mask = np.zeros((height, width, 4), dtype=np.uint8)
151
- mask[..., -1] = 255
152
- mask = self.postprocess(mask)
153
- x = {'image': x, 'mask': mask}
154
- return super().preprocess(x)
155
-
156
-
157
  class Blocks(gr.Blocks):
158
 
159
  def __init__(
@@ -206,19 +71,7 @@ def draw_box(boxes=[], texts=[], img=None):
206
  fill=(255, 255, 255))
207
  return img
208
 
209
- with open('./conf/unet/config.json') as f:
210
- unet_config = json.load(f)
211
-
212
- unet = unet_2d_condition.UNet2DConditionModel(**unet_config).from_pretrained('runwayml/stable-diffusion-v1-5', subfolder="unet")
213
- tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
214
- text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")
215
- vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
216
- attn_map = None
217
- cfg = OmegaConf.load('./conf/net_conf.yaml')
218
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
219
- unet.to(device)
220
- text_encoder.to(device)
221
- vae.to(device)
222
  def inference(device, unet, vae, tokenizer, text_encoder, prompt, cfg,attn_map, bboxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_index_step, rand_seed, guidance_scale):
223
  uncond_input = tokenizer(
224
  [""] * 1, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
@@ -328,7 +181,7 @@ def auto_append_grounding(language_instruction, grounding_texts):
328
  return language_instruction
329
 
330
 
331
- def generate(language_instruction, grounding_texts, sketch_pad,
332
  loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
333
  state):
334
  if 'boxes' not in state:
@@ -406,26 +259,16 @@ def center_crop(img, HW=None, tgt_size=(512, 512)):
406
 
407
 
408
  def draw(input, grounding_texts, new_image_trigger, state):
409
-
410
  if type(input) == dict:
411
  image = input['image']
412
  mask = input['mask']
413
  else:
414
  mask = input
415
  if mask.ndim == 3:
416
- mask = mask[..., 0]
417
 
418
  image_scale = 1.0
419
 
420
- mask = binarize(mask)
421
- if mask.shape != (512, 512):
422
- # assert False, "should not receive any non- 512x512 masks."
423
- if 'original_image' in state and state['original_image'].shape[:2] == mask.shape:
424
- mask = center_crop(mask, state['inpaint_hw'])
425
- image = center_crop(state['original_image'], state['inpaint_hw'])
426
- else:
427
- mask = np.zeros((512, 512), dtype=np.uint8)
428
- # mask = center_crop(mask)
429
  mask = binarize(mask)
430
 
431
  if type(mask) != np.ndarray:
@@ -464,14 +307,8 @@ def draw(input, grounding_texts, new_image_trigger, state):
464
  grounding_texts = [x for x in grounding_texts if len(x) > 0]
465
  if len(grounding_texts) < len(state['boxes']):
466
  grounding_texts += [f'Obj. {bid + 1}' for bid in range(len(grounding_texts), len(state['boxes']))]
467
- print("state", state)
468
  box_image = draw_box(state['boxes'], grounding_texts, image)
469
 
470
- if box_image is not None and state.get('inpaint_hw', None):
471
- inpaint_hw = state['inpaint_hw']
472
- box_image_resize = np.array(box_image.resize((inpaint_hw, inpaint_hw)))
473
- original_image = state['original_image'].copy()
474
- box_image = sized_center_fill(original_image, box_image_resize, inpaint_hw, inpaint_hw)
475
  return [box_image, new_image_trigger, image_scale, state]
476
 
477
 
@@ -479,291 +316,252 @@ def clear(task, sketch_pad_trigger, batch_size, state, switch_task=False):
479
  if task != 'Grounded Inpainting':
480
  sketch_pad_trigger = sketch_pad_trigger + 1
481
  blank_samples = batch_size % 2 if batch_size > 1 else 0
482
- out_images = [gr.Image.update(value=None, visible=True) for i in range(batch_size)] \
483
- + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \
484
- + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)]
485
- state = {}
486
- return [None, sketch_pad_trigger, None, 1.0] + out_images + [state]
487
-
488
-
489
- css = """
490
- #img2img_image, #img2img_image > .fixed-height, #img2img_image > .fixed-height > div, #img2img_image > .fixed-height > div > img
491
- {
492
- height: var(--height) !important;
493
- max-height: var(--height) !important;
494
- min-height: var(--height) !important;
495
- }
496
- #paper-info a {
497
- color:#008AD7;
498
- text-decoration: none;
499
- }
500
- #paper-info a:hover {
501
- cursor: pointer;
502
- text-decoration: none;
503
- }
504
-
505
- .tooltip {
506
- color: #555;
507
- position: relative;
508
- display: inline-block;
509
- cursor: pointer;
510
- }
511
-
512
- .tooltip .tooltiptext {
513
- visibility: hidden;
514
- width: 400px;
515
- background-color: #555;
516
- color: #fff;
517
- text-align: center;
518
- padding: 5px;
519
- border-radius: 5px;
520
- position: absolute;
521
- z-index: 1; /* Set z-index to 1 */
522
- left: 10px;
523
- top: 100%;
524
- opacity: 0;
525
- transition: opacity 0.3s;
526
- }
527
-
528
- .tooltip:hover .tooltiptext {
529
- visibility: visible;
530
- opacity: 1;
531
- z-index: 9999; /* Set a high z-index value when hovering */
532
- }
533
-
534
-
535
- """
536
-
537
- rescale_js = """
538
- function(x) {
539
- const root = document.querySelector('gradio-app').shadowRoot || document.querySelector('gradio-app');
540
- let image_scale = parseFloat(root.querySelector('#image_scale input').value) || 1.0;
541
- const image_width = root.querySelector('#img2img_image').clientWidth;
542
- const target_height = parseInt(image_width * image_scale);
543
- document.body.style.setProperty('--height', `${target_height}px`);
544
- root.querySelectorAll('button.justify-center.rounded')[0].style.display='none';
545
- root.querySelectorAll('button.justify-center.rounded')[1].style.display='none';
546
- return x;
547
- }
548
- """
549
-
550
- with Blocks(
551
- css=css,
552
- analytics_enabled=False,
553
- title="Layout-Guidance demo",
554
- ) as main:
555
- description = """<p style="text-align: center; font-weight: bold;">
556
- <span style="font-size: 28px">Layout Guidance</span>
557
- <br>
558
- <span style="font-size: 18px" id="paper-info">
559
- [<a href=" " target="_blank">Project Page</a>]
560
- [<a href=" " target="_blank">Paper</a>]
561
- [<a href=" " target="_blank">GitHub</a>]
562
- </span>
563
- </p>
564
- """
565
- gr.HTML(description)
566
- with gr.Column():
567
- language_instruction = gr.Textbox(
568
- label="Text Prompt",
569
- )
570
- grounding_instruction = gr.Textbox(
571
- label="Grounding instruction (Separated by semicolon)",
572
- )
573
- sketch_pad_trigger = gr.Number(value=0, visible=False)
574
- sketch_pad_resize_trigger = gr.Number(value=0, visible=False)
575
- init_white_trigger = gr.Number(value=0, visible=False)
576
- image_scale = gr.Number(value=0, elem_id="image_scale", visible=False)
577
- new_image_trigger = gr.Number(value=0, visible=False)
578
-
579
-
580
-
581
- with gr.Row():
582
- sketch_pad = ImageMask(label="Sketch Pad", elem_id="img2img_image")
583
- out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
584
- out_gen_1 = gr.Image(type="pil", visible=True, label="Generated Image")
585
- # out_gen_2 = gr.Image(type="pil", visible=True, label="Generated Image")
586
- # out_gen_3 = gr.Image(type="pil", visible=True, show_label=False)
587
- # out_gen_4 = gr.Image(type="pil", visible=True, show_label=False)
588
-
589
- with gr.Row():
590
- clear_btn = gr.Button(value='Clear')
591
- gen_btn = gr.Button(value='Generate')
592
- # clear_btn = gr.Button(value='Clear')
593
- # clear_btn = gr.Button(value='Clear')
594
-
595
- with gr.Accordion("Advanced Options", open=False):
596
- with gr.Column():
597
- description = """<div class="tooltip">Loss Scale Factor &#9432
598
- <span class="tooltiptext">The scale factor of the backward guidance loss. The larger it is, the better control we get while it sometimes losses fidelity. </span>
599
- </div>
600
- <div class="tooltip">Guidance Scale &#9432
601
- <span class="tooltiptext">The scale factor of classifier-free guidance. </span>
602
- </div>
603
- <div class="tooltip" >Max Iteration per Step &#9432
604
- <span class="tooltiptext">The max iterations of backward guidance in each diffusion inference process.</span>
605
- </div>
606
- <div class="tooltip" >Loss Threshold &#9432
607
- <span class="tooltiptext">The threshold of loss. If the loss computed by cross-attention map is smaller then the threshold, the backward guidance is stopped. </span>
608
- </div>
609
- <div class="tooltip" >Max Step of Backward Guidance &#9432
610
- <span class="tooltiptext">The max steps of backward guidance in diffusion inference process.</span>
611
- </div>
612
- """
613
- gr.HTML(description)
614
- Loss_scale = gr.Slider(minimum=0, maximum=500, step=5, value=30,label="Loss Scale Factor")
615
- guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Guidance Scale")
616
- batch_size = gr.Slider(minimum=1, maximum=4, step=1, value=1, label="Number of Samples", visible=False)
617
- max_iter = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Max Iteration per Step")
618
- loss_threshold = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2, label="Loss Threshold")
619
- max_step = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Max Step of Backward Guidance")
620
- # fix_seed = gr.Checkbox(value=True, label="Fixed seed")
621
- rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=445, label="Random Seed")
622
-
623
- # with gr.Column(scale=4):
624
- # gr.HTML('<span style="font-size: 20px; font-weight: bold">Generated Images</span>')
625
- # with gr.Row():
626
- # out_gen_1 = gr.Image(type="pil", visible=True, show_label=False, label="Generated Image")
627
- # out_gen_2 = gr.Image(type="pil", visible=True, show_label=False)
628
- # with gr.Row():
629
- # out_gen_3 = gr.Image(type="pil", visible=False, show_label=False)
630
- # out_gen_4 = gr.Image(type="pil", visible=False, show_label=False)
631
-
632
- state = gr.State({})
633
-
634
-
635
- class Controller:
636
- def __init__(self):
637
- self.calls = 0
638
- self.tracks = 0
639
- self.resizes = 0
640
- self.scales = 0
641
-
642
- def init_white(self, init_white_trigger):
643
- self.calls += 1
644
- return np.ones((512, 512), dtype='uint8') * 255, 1.0, init_white_trigger + 1
645
-
646
- def change_n_samples(self, n_samples):
647
- blank_samples = n_samples % 2 if n_samples > 1 else 0
648
- return [gr.Image.update(visible=True) for _ in range(n_samples + blank_samples)] \
649
- + [gr.Image.update(visible=False) for _ in range(4 - n_samples - blank_samples)]
650
-
651
- def resize_centercrop(self, state):
652
- self.resizes += 1
653
- image = state['original_image'].copy()
654
- inpaint_hw = int(0.9 * min(*image.shape[:2]))
655
- state['inpaint_hw'] = inpaint_hw
656
- image_cc = center_crop(image, inpaint_hw)
657
- # print(f'resize triggered {self.resizes}', image.shape, '->', image_cc.shape)
658
- return image_cc, state
659
-
660
- def resize_masked(self, state):
661
- self.resizes += 1
662
- image = state['original_image'].copy()
663
- inpaint_hw = int(0.9 * min(*image.shape[:2]))
664
- state['inpaint_hw'] = inpaint_hw
665
- image_mask = sized_center_mask(image, inpaint_hw, inpaint_hw)
666
- state['masked_image'] = image_mask.copy()
667
- # print(f'mask triggered {self.resizes}')
668
- return image_mask, state
669
-
670
- def switch_task_hide_cond(self, task):
671
- cond = False
672
- if task == "Grounded Generation":
673
- cond = True
674
-
675
- return gr.Checkbox.update(visible=cond, value=False), gr.Image.update(value=None,
676
- visible=False), gr.Slider.update(
677
- visible=cond), gr.Checkbox.update(visible=(not cond), value=False)
678
-
679
-
680
- controller = Controller()
681
- main.load(
682
- lambda x: x + 1,
683
- inputs=sketch_pad_trigger,
684
- outputs=sketch_pad_trigger,
685
- queue=False)
686
- sketch_pad.edit(
687
- draw,
688
- inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
689
- outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
690
- queue=False,
691
- )
692
- grounding_instruction.change(
693
- draw,
694
- inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
695
- outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
696
- queue=False,
697
- )
698
- clear_btn.click(
699
- clear,
700
- inputs=[sketch_pad_trigger, sketch_pad_trigger, batch_size, state],
701
- outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, state],
702
- queue=False)
703
-
704
- sketch_pad_trigger.change(
705
- controller.init_white,
706
- inputs=[init_white_trigger],
707
- outputs=[sketch_pad, image_scale, init_white_trigger],
708
- queue=False)
709
- sketch_pad_resize_trigger.change(
710
- controller.resize_masked,
711
- inputs=[state],
712
- outputs=[sketch_pad, state],
713
- queue=False)
714
- # batch_size.change(
715
- # controller.change_n_samples,
716
- # inputs=[batch_size],
717
- # outputs=[out_gen_1, out_gen_2],
718
- # queue=False)
719
-
720
- # batch_size.change(
721
- # controller.change_n_samples,
722
- # inputs=[batch_size],
723
- # outputs=[out_gen_1, out_gen_2],
724
- # queue=False)
725
-
726
- gen_btn.click(
727
- generate,
728
- inputs=[
729
- language_instruction, grounding_instruction, sketch_pad,
730
- loss_threshold, guidance_scale, batch_size, rand_seed,
731
- max_step,
732
- Loss_scale, max_iter,
733
- state,
734
- ],
735
- outputs=[out_gen_1, state],
736
- queue=True
737
- )
738
- sketch_pad_resize_trigger.change(
739
- None,
740
- None,
741
- sketch_pad_resize_trigger,
742
- _js=rescale_js,
743
- queue=False)
744
- init_white_trigger.change(
745
- None,
746
- None,
747
- init_white_trigger,
748
- _js=rescale_js,
749
- queue=False)
750
-
751
- with gr.Column():
752
- gr.Examples(
753
- examples=[
754
- [
755
- # "images/input.png",
756
- "A hello kitty toy is playing with a purple ball.",
757
- "hello kitty;ball",
758
- "images/hello_kitty_results.png"
759
- ],
760
- ],
761
- inputs=[language_instruction, grounding_instruction, out_gen_1],
762
- outputs=None,
763
- fn=None,
764
- cache_examples=False,
765
- )
766
 
767
- main.queue(concurrency_count=1, api_open=False)
768
- main.launch(share=False, show_api=False, show_error=True)
 
1
  import gradio as gr
2
  import torch
3
  from omegaconf import OmegaConf
 
4
  from transformers import CLIPTextModel, CLIPTokenizer
5
  from diffusers import AutoencoderKL, LMSDiscreteScheduler
6
  from my_model import unet_2d_condition
 
8
  import numpy as np
9
  from PIL import Image, ImageDraw, ImageFont
10
  from functools import partial
 
11
  import math
 
12
  from utils import compute_ca_loss
13
  from gradio import processing_utils
14
  from typing import Optional
15
 
16
  import warnings
17
 
18
  import sys
19
 
20
  sys.tracebacklimit = 0
21
 
22
  class Blocks(gr.Blocks):
23
 
24
  def __init__(
 
71
  fill=(255, 255, 255))
72
  return img
73
 
74
+
75
  def inference(device, unet, vae, tokenizer, text_encoder, prompt, cfg,attn_map, bboxes, object_positions, batch_size, loss_scale, loss_threshold, max_iter, max_index_step, rand_seed, guidance_scale):
76
  uncond_input = tokenizer(
77
  [""] * 1, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt"
 
181
  return language_instruction
182
 
183
 
184
+ def generate(unet, vae, tokenizer, text_encoder, cfg, attn_map, language_instruction, grounding_texts, sketch_pad,
185
  loss_threshold, guidance_scale, batch_size, rand_seed, max_step, loss_scale, max_iter,
186
  state):
187
  if 'boxes' not in state:
 
259
 
260
 
261
  def draw(input, grounding_texts, new_image_trigger, state):
 
262
  if type(input) == dict:
263
  image = input['image']
264
  mask = input['mask']
265
  else:
266
  mask = input
267
  if mask.ndim == 3:
268
+ mask = 255 - mask[..., 0]
269
 
270
  image_scale = 1.0
271
 
272
  mask = binarize(mask)
273
 
274
  if type(mask) != np.ndarray:
 
307
  grounding_texts = [x for x in grounding_texts if len(x) > 0]
308
  if len(grounding_texts) < len(state['boxes']):
309
  grounding_texts += [f'Obj. {bid + 1}' for bid in range(len(grounding_texts), len(state['boxes']))]
 
310
  box_image = draw_box(state['boxes'], grounding_texts, image)
311
 
312
  return [box_image, new_image_trigger, image_scale, state]
313
 
314
 
 
316
  if task != 'Grounded Inpainting':
317
  sketch_pad_trigger = sketch_pad_trigger + 1
318
  blank_samples = batch_size % 2 if batch_size > 1 else 0
319
+ out_images = [gr.Image.update(value=None, visible=True) for i in range(batch_size)]
320
+ # state = {}
321
+ return [None, sketch_pad_trigger, None, 1.0] + out_images + [{}]
322
+
323
+
324
+ def main():
325
 
326
+ css = """
327
+ #img2img_image, #img2img_image > .fixed-height, #img2img_image > .fixed-height > div, #img2img_image > .fixed-height > div > img
328
+ {
329
+ height: var(--height) !important;
330
+ max-height: var(--height) !important;
331
+ min-height: var(--height) !important;
332
+ }
333
+ #paper-info a {
334
+ color:#008AD7;
335
+ text-decoration: none;
336
+ }
337
+ #paper-info a:hover {
338
+ cursor: pointer;
339
+ text-decoration: none;
340
+ }
341
 
342
+ .tooltip {
343
+ color: #555;
344
+ position: relative;
345
+ display: inline-block;
346
+ cursor: pointer;
347
+ }
348
+
349
+ .tooltip .tooltiptext {
350
+ visibility: hidden;
351
+ width: 400px;
352
+ background-color: #555;
353
+ color: #fff;
354
+ text-align: center;
355
+ padding: 5px;
356
+ border-radius: 5px;
357
+ position: absolute;
358
+ z-index: 1; /* Set z-index to 1 */
359
+ left: 10px;
360
+ top: 100%;
361
+ opacity: 0;
362
+ transition: opacity 0.3s;
363
+ }
364
+
365
+ .tooltip:hover .tooltiptext {
366
+ visibility: visible;
367
+ opacity: 1;
368
+ z-index: 9999; /* Set a high z-index value when hovering */
369
+ }
370
+
371
+
372
+ """
373
+
374
+ rescale_js = """
375
+ function(x) {
376
+ const root = document.querySelector('gradio-app').shadowRoot || document.querySelector('gradio-app');
377
+ let image_scale = parseFloat(root.querySelector('#image_scale input').value) || 1.0;
378
+ const image_width = root.querySelector('#img2img_image').clientWidth;
379
+ const target_height = parseInt(image_width * image_scale);
380
+ document.body.style.setProperty('--height', `${target_height}px`);
381
+ root.querySelectorAll('button.justify-center.rounded')[0].style.display='none';
382
+ root.querySelectorAll('button.justify-center.rounded')[1].style.display='none';
383
+ return x;
384
+ }
385
+ """
386
+ with open('./conf/unet/config.json') as f:
387
+ unet_config = json.load(f)
388
+
389
+ unet = unet_2d_condition.UNet2DConditionModel(**unet_config).from_pretrained('runwayml/stable-diffusion-v1-5',
390
+ subfolder="unet")
391
+ tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
392
+ text_encoder = CLIPTextModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="text_encoder")
393
+ vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae")
394
+ attn_map = None
395
+ cfg = OmegaConf.load('./conf/net_conf.yaml')
396
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
397
+ unet.to(device)
398
+ text_encoder.to(device)
399
+ vae.to(device)
400
+
401
+ with Blocks(
402
+ css=css,
403
+ analytics_enabled=False,
404
+ title="Layout-Guidance demo",
405
+ ) as demo:
406
+ description = """<p style="text-align: center; font-weight: bold;">
407
+ <span style="font-size: 28px">Layout Guidance</span>
408
+ <br>
409
+ <span style="font-size: 18px" id="paper-info">
410
+ [<a href=" " target="_blank">Project Page</a>]
411
+ [<a href=" " target="_blank">Paper</a>]
412
+ [<a href=" " target="_blank">GitHub</a>]
413
+ </span>
414
+ </p>
415
+ """
416
+ gr.HTML(description)
417
+ with gr.Column():
418
+ language_instruction = gr.Textbox(
419
+ label="Text Prompt",
420
+ )
421
+ grounding_instruction = gr.Textbox(
422
+ label="Grounding instruction (Separated by semicolon)",
423
+ )
424
+ sketch_pad_trigger = gr.Number(value=0, visible=False)
425
+ sketch_pad_resize_trigger = gr.Number(value=0, visible=False)
426
+ init_white_trigger = gr.Number(value=0, visible=False)
427
+ image_scale = gr.Number(value=0, elem_id="image_scale", visible=False)
428
+ new_image_trigger = gr.Number(value=0, visible=False)
429
+
430
+
431
+
432
+ with gr.Row():
433
+ sketch_pad = gr.Paint(label="Sketch Pad", elem_id="img2img_image", source='canvas', shape=(512, 512))
434
+
435
+ out_imagebox = gr.Image(type="pil", label="Parsed Sketch Pad")
436
+ out_gen_1 = gr.Image(type="pil", visible=True, label="Generated Image")
437
+
438
+ with gr.Row():
439
+ clear_btn = gr.Button(value='Clear')
440
+ gen_btn = gr.Button(value='Generate')
441
+
442
+ with gr.Accordion("Advanced Options", open=False):
443
+ with gr.Column():
444
+ description = """<div class="tooltip">Loss Scale Factor &#9432
445
+ <span class="tooltiptext">The scale factor of the backward guidance loss. The larger it is, the better control we get while it sometimes losses fidelity. </span>
446
+ </div>
447
+ <div class="tooltip">Guidance Scale &#9432
448
+ <span class="tooltiptext">The scale factor of classifier-free guidance. </span>
449
+ </div>
450
+ <div class="tooltip" >Max Iteration per Step &#9432
451
+ <span class="tooltiptext">The max iterations of backward guidance in each diffusion inference process.</span>
452
+ </div>
453
+ <div class="tooltip" >Loss Threshold &#9432
454
+ <span class="tooltiptext">The threshold of loss. If the loss computed by cross-attention map is smaller then the threshold, the backward guidance is stopped. </span>
455
+ </div>
456
+ <div class="tooltip" >Max Step of Backward Guidance &#9432
457
+ <span class="tooltiptext">The max steps of backward guidance in diffusion inference process.</span>
458
+ </div>
459
+ """
460
+ gr.HTML(description)
461
+ Loss_scale = gr.Slider(minimum=0, maximum=500, step=5, value=30,label="Loss Scale Factor")
462
+ guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Guidance Scale")
463
+ batch_size = gr.Slider(minimum=1, maximum=4, step=1, value=1, label="Number of Samples", visible=False)
464
+ max_iter = gr.Slider(minimum=0, maximum=10, step=1, value=5, label="Max Iteration per Step")
465
+ loss_threshold = gr.Slider(minimum=0, maximum=1, step=0.1, value=0.2, label="Loss Threshold")
466
+ max_step = gr.Slider(minimum=0, maximum=50, step=1, value=10, label="Max Step of Backward Guidance")
467
+ # fix_seed = gr.Checkbox(value=True, label="Fixed seed")
468
+ rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=445, label="Random Seed")
469
+
470
+ state = gr.State({})
471
+
472
+
473
+ class Controller:
474
+ def __init__(self):
475
+ self.calls = 0
476
+ self.tracks = 0
477
+ self.resizes = 0
478
+ self.scales = 0
479
+
480
+ def init_white(self, init_white_trigger):
481
+ self.calls += 1
482
+ return np.ones((512, 512), dtype='uint8') * 255, 1.0, init_white_trigger + 1
483
+
484
+ def change_n_samples(self, n_samples):
485
+ blank_samples = n_samples % 2 if n_samples > 1 else 0
486
+ return [gr.Image.update(visible=True) for _ in range(n_samples + blank_samples)] \
487
+ + [gr.Image.update(visible=False) for _ in range(4 - n_samples - blank_samples)]
488
+
489
+
490
+ controller = Controller()
491
+ demo.load(
492
+ lambda x: x + 1,
493
+ inputs=sketch_pad_trigger,
494
+ outputs=sketch_pad_trigger,
495
+ queue=False)
496
+ sketch_pad.edit(
497
+ draw,
498
+ inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
499
+ outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
500
+ queue=False,
501
+ )
502
+ grounding_instruction.change(
503
+ draw,
504
+ inputs=[sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state],
505
+ outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state],
506
+ queue=False,
507
+ )
508
+ clear_btn.click(
509
+ clear,
510
+ inputs=[sketch_pad_trigger, sketch_pad_trigger, batch_size, state],
511
+ outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, state],
512
+ queue=False)
513
+
514
+ sketch_pad_trigger.change(
515
+ controller.init_white,
516
+ inputs=[init_white_trigger],
517
+ outputs=[sketch_pad, image_scale, init_white_trigger],
518
+ queue=False)
519
+
520
+ gen_btn.click(
521
+ fn=partial(generate, unet, vae, tokenizer, text_encoder, cfg, attn_map),
522
+ inputs=[
523
+ language_instruction, grounding_instruction, sketch_pad,
524
+ loss_threshold, guidance_scale, batch_size, rand_seed,
525
+ max_step,
526
+ Loss_scale, max_iter,
527
+ state,
528
+ ],
529
+ outputs=[out_gen_1, state],
530
+ queue=True
531
+ )
532
+ sketch_pad_resize_trigger.change(
533
+ None,
534
+ None,
535
+ sketch_pad_resize_trigger,
536
+ _js=rescale_js,
537
+ queue=False)
538
+ init_white_trigger.change(
539
+ None,
540
+ None,
541
+ init_white_trigger,
542
+ _js=rescale_js,
543
+ queue=False)
544
+
545
+ with gr.Column():
546
+ gr.Examples(
547
+ examples=[
548
+ [
549
+ # "images/input.png",
550
+ "A hello kitty toy is playing with a purple ball.",
551
+ "hello kitty;ball",
552
+ "images/hello_kitty_results.png"
553
+ ],
554
+ ],
555
+ inputs=[language_instruction, grounding_instruction, out_gen_1],
556
+ outputs=None,
557
+ fn=None,
558
+ cache_examples=False,
559
+ )
560
+ description = """<p> The source codes of the demo are modified based on the <a href="https://huggingface.co/spaces/gligen/demo/tree/main">GlIGen</a>. Thanks! </p>"""
561
+ gr.HTML(description)
562
+
563
+ demo.queue(concurrency_count=1, api_open=False)
564
+ demo.launch(share=False, show_api=False, show_error=True)
565
+
566
+ if __name__ == '__main__':
567
+ main()
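
Note on the Advanced Options sliders wired up above: `inference` performs backward guidance by stepping the noisy latents along the gradient of a cross-attention layout loss (`compute_ca_loss`). The loop below is a minimal, self-contained sketch of that update, with a toy `box_loss` standing in for the real cross-attention loss and an illustrative function name that does not come from app.py:

import torch

def backward_guidance_step(latents, loss_fn, loss_scale=30.0, loss_threshold=0.2, max_iter=5):
    # Mirrors the Loss Scale Factor / Loss Threshold / Max Iteration per Step knobs:
    # repeatedly push the latents down the gradient of a layout loss.
    for _ in range(max_iter):
        latents = latents.detach().requires_grad_(True)
        loss = loss_fn(latents)                    # stand-in for compute_ca_loss(...)
        if loss.item() < loss_threshold:
            break                                  # guidance considered converged for this step
        grad = torch.autograd.grad(loss * loss_scale, latents)[0]
        latents = latents - grad                   # gradient step on the noisy latents
    return latents.detach()

if __name__ == "__main__":
    # Toy usage: drain latent energy inside a 32x32 "box" region.
    lat = torch.randn(1, 4, 64, 64)
    box_loss = lambda z: z[:, :, 16:48, 16:48].pow(2).mean()
    print(backward_guidance_step(lat, box_loss).shape)

Max Step of Backward Guidance (passed into `inference` as `max_index_step`) would then cap how many diffusion timesteps apply this extra update at all.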