import gradio as gr import torch from omegaconf import OmegaConf from gligen.task_grounded_generation import grounded_generation_box, load_ckpt, load_common_ckpt import json import numpy as np from PIL import Image, ImageDraw, ImageFont from functools import partial from collections import Counter import math import gc from gradio import processing_utils from typing import Optional import warnings from datetime import datetime from example_component import create_examples from huggingface_hub import hf_hub_download hf_hub_download = partial(hf_hub_download, library_name="gligen_demo") import cv2 import sys sys.tracebacklimit = 0 def load_from_hf(repo_id, filename='diffusion_pytorch_model.bin', subfolder=None): cache_file = hf_hub_download(repo_id=repo_id, filename=filename, subfolder=subfolder) return torch.load(cache_file, map_location='cpu') def load_ckpt_config_from_hf(modality): ckpt = load_from_hf('gligen/demo_ckpts_legacy', filename=f'{modality}.pth', subfolder='model') config = load_from_hf('gligen/demo_ckpts_legacy', filename=f'{modality}.pth', subfolder='config') return ckpt, config def ckpt_load_helper(modality, is_inpaint, is_style, common_instances=None): pretrained_ckpt_gligen, config = load_ckpt_config_from_hf(modality) config = OmegaConf.create( config["_content"] ) # config used in training config.alpha_scale = 1.0 if common_instances is None: common_ckpt = load_from_hf('gligen/demo_ckpts_legacy', filename=f'common.pth', subfolder='model') common_instances = load_common_ckpt(config, common_ckpt) loaded_model_list = load_ckpt(config, pretrained_ckpt_gligen, common_instances) return loaded_model_list, common_instances class Instance: def __init__(self, capacity = 2): self.model_type = 'base' self.loaded_model_list = {} self.counter = Counter() self.global_counter = Counter() self.loaded_model_list['base'], self.common_instances = ckpt_load_helper( 'gligen-generation-text-box', is_inpaint=False, is_style=False, common_instances=None ) self.capacity = capacity def _log(self, model_type, batch_size, instruction, phrase_list): self.counter[model_type] += 1 self.global_counter[model_type] += 1 current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") print('[{}] Current: {}, All: {}. Samples: {}, prompt: {}, phrases: {}'.format( current_time, dict(self.counter), dict(self.global_counter), batch_size, instruction, phrase_list )) def get_model(self, model_type, batch_size, instruction, phrase_list): if model_type in self.loaded_model_list: self._log(model_type, batch_size, instruction, phrase_list) return self.loaded_model_list[model_type] if self.capacity == len(self.loaded_model_list): least_used_type = self.counter.most_common()[-1][0] del self.loaded_model_list[least_used_type] del self.counter[least_used_type] gc.collect() torch.cuda.empty_cache() self.loaded_model_list[model_type] = self._get_model(model_type) self._log(model_type, batch_size, instruction, phrase_list) return self.loaded_model_list[model_type] def _get_model(self, model_type): if model_type == 'base': return ckpt_load_helper( 'gligen-generation-text-box', is_inpaint=False, is_style=False, common_instances=self.common_instances )[0] elif model_type == 'inpaint': return ckpt_load_helper( 'gligen-inpainting-text-box', is_inpaint=True, is_style=False, common_instances=self.common_instances )[0] elif model_type == 'style': return ckpt_load_helper( 'gligen-generation-text-image-box', is_inpaint=False, is_style=True, common_instances=self.common_instances )[0] assert False instance = Instance() def load_clip_model(): from transformers import CLIPProcessor, CLIPModel version = "openai/clip-vit-large-patch14" model = CLIPModel.from_pretrained(version).cuda() processor = CLIPProcessor.from_pretrained(version) return { 'version': version, 'model': model, 'processor': processor, } clip_model = load_clip_model() class ImageMask(gr.components.Image): """ Sets: source="canvas", tool="sketch" """ is_template = True def __init__(self, **kwargs): super().__init__(source="upload", tool="sketch", interactive=True, **kwargs) def preprocess(self, x): if x is None: return x if self.tool == "sketch" and self.source in ["upload", "webcam"] and type(x) != dict: decode_image = processing_utils.decode_base64_to_image(x) width, height = decode_image.size mask = np.zeros((height, width, 4), dtype=np.uint8) mask[..., -1] = 255 mask = self.postprocess(mask) x = {'image': x, 'mask': mask} print('vao preprocess-------------------------') hh = super().preprocess(x) return hh class Blocks(gr.Blocks): def __init__( self, theme: str = "default", analytics_enabled: Optional[bool] = None, mode: str = "blocks", title: str = "Gradio", css: Optional[str] = None, **kwargs, ): self.extra_configs = { 'thumbnail': kwargs.pop('thumbnail', ''), 'url': kwargs.pop('url', 'https://gradio.app/'), 'creator': kwargs.pop('creator', '@teamGradio'), } super(Blocks, self).__init__(theme, analytics_enabled, mode, title, css, **kwargs) warnings.filterwarnings("ignore") def get_config_file(self): config = super(Blocks, self).get_config_file() for k, v in self.extra_configs.items(): config[k] = v return config ''' inference model ''' # @torch.no_grad() def inference(task, language_instruction, phrase_list, location_list, inpainting_boxes_nodrop, image, alpha_sample, guidance_scale, batch_size, fix_seed, rand_seed, actual_mask, style_image, *args, **kwargs): # import pdb; pdb.set_trace() # grounding_instruction = json.loads(grounding_instruction) # phrase_list, location_list = [], [] # for k, v in grounding_instruction.items(): # phrase_list.append(k) # location_list.append(v) placeholder_image = Image.open('images/teddy.jpg').convert("RGB") image_list = [placeholder_image] * len(phrase_list) # placeholder input for visual prompt, which is disabled batch_size = int(batch_size) if not 1 <= batch_size <= 4: batch_size = 1 if style_image == None: has_text_mask = 1 has_image_mask = 0 # then we hack above 'image_list' else: valid_phrase_len = len(phrase_list) phrase_list += ['placeholder'] has_text_mask = [1]*valid_phrase_len + [0] image_list = [placeholder_image]*valid_phrase_len + [style_image] has_image_mask = [0]*valid_phrase_len + [1] location_list += [ [0.0, 0.0, 1, 0.01] ] # style image grounding location instruction = dict( prompt = language_instruction, phrases = phrase_list, images = image_list, locations = location_list, alpha_type = [alpha_sample, 0, 1.0 - alpha_sample], has_text_mask = has_text_mask, has_image_mask = has_image_mask, save_folder_name = language_instruction, guidance_scale = guidance_scale, batch_size = batch_size, fix_seed = bool(fix_seed), rand_seed = int(rand_seed), actual_mask = actual_mask, inpainting_boxes_nodrop = inpainting_boxes_nodrop, ) get_model = partial(instance.get_model, batch_size=batch_size, instruction=language_instruction, phrase_list=phrase_list) with torch.autocast(device_type='cuda', dtype=torch.float16): if task == 'User provide boxes' or 'Available boxes': if style_image == None: result = grounded_generation_box(get_model('base'), instruction, *args, **kwargs) torch.cuda.empty_cache() return result else: return grounded_generation_box(get_model('style'), instruction, *args, **kwargs) def draw_box(boxes=[], texts=[], img=None): if len(boxes) == 0 and img is None: return None if img is None: img = Image.new('RGB', (512, 512), (255, 255, 255)) colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"] draw = ImageDraw.Draw(img) font = ImageFont.truetype("DejaVuSansMono.ttf", size=18) for bid, box in enumerate(boxes): draw.rectangle([box[0], box[1], box[2], box[3]], outline=colors[bid % len(colors)], width=4) anno_text = texts[bid] draw.rectangle([box[0], box[3] - int(font.size * 1.2), box[0] + int((len(anno_text) + 0.8) * font.size * 0.6), box[3]], outline=colors[bid % len(colors)], fill=colors[bid % len(colors)], width=4) draw.text([box[0] + int(font.size * 0.2), box[3] - int(font.size*1.2)], anno_text, font=font, fill=(255,255,255)) return img def get_concat(ims): if len(ims) == 1: n_col = 1 else: n_col = 2 n_row = math.ceil(len(ims) / 2) dst = Image.new('RGB', (ims[0].width * n_col, ims[0].height * n_row), color="white") for i, im in enumerate(ims): row_id = i // n_col col_id = i % n_col dst.paste(im, (im.width * col_id, im.height * row_id)) return dst def auto_append_grounding(language_instruction, grounding_texts): for grounding_text in grounding_texts: if grounding_text.lower() not in language_instruction.lower() and grounding_text != 'auto': language_instruction += "; " + grounding_text return language_instruction def generate(task, language_instruction, grounding_texts, sketch_pad, alpha_sample, guidance_scale, batch_size, fix_seed, rand_seed, use_actual_mask, append_grounding, style_cond_image, state): if 'boxes' not in state: state['boxes'] = [] boxes = state['boxes'] grounding_texts = [x.strip() for x in grounding_texts.split(';')] # assert len(boxes) == len(grounding_texts) if len(boxes) != len(grounding_texts): if len(boxes) < len(grounding_texts): raise ValueError("""The number of boxes should be equal to the number of grounding objects. Number of boxes drawn: {}, number of grounding tokens: {}. Please draw boxes accordingly on the sketch pad.""".format(len(boxes), len(grounding_texts))) grounding_texts = grounding_texts + [""] * (len(boxes) - len(grounding_texts)) boxes = (np.asarray(boxes) / 512).tolist() grounding_instruction = json.dumps({obj: box for obj,box in zip(grounding_texts, boxes)}) image = None actual_mask = None if append_grounding: language_instruction = auto_append_grounding(language_instruction, grounding_texts) gen_images, gen_overlays = inference( task, language_instruction, grounding_texts,boxes, boxes, image, alpha_sample, guidance_scale, batch_size, fix_seed, rand_seed, actual_mask, style_cond_image, clip_model=clip_model, ) blank_samples = batch_size % 2 if batch_size > 1 else 0 gen_images = [gr.Image.update(value=x, visible=True) for i,x in enumerate(gen_images)] \ + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \ + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)] return gen_images + [state] def binarize(x): return (x != 0).astype('uint8') * 255 def binarize_2(x): gray_image = cv2.cvtColor(x, cv2.COLOR_BGR2GRAY) return (gray_image!=255).astype('uint8') * 255 def sized_center_crop(img, cropx, cropy): y, x = img.shape[:2] startx = x // 2 - (cropx // 2) starty = y // 2 - (cropy // 2) return img[starty:starty+cropy, startx:startx+cropx] def sized_center_fill(img, fill, cropx, cropy): y, x = img.shape[:2] startx = x // 2 - (cropx // 2) starty = y // 2 - (cropy // 2) img[starty:starty+cropy, startx:startx+cropx] = fill return img def sized_center_mask(img, cropx, cropy): y, x = img.shape[:2] startx = x // 2 - (cropx // 2) starty = y // 2 - (cropy // 2) center_region = img[starty:starty+cropy, startx:startx+cropx].copy() img = (img * 0.2).astype('uint8') img[starty:starty+cropy, startx:startx+cropx] = center_region return img def center_crop(img, HW=None, tgt_size=(512, 512)): if HW is None: H, W = img.shape[:2] HW = min(H, W) img = sized_center_crop(img, HW, HW) img = Image.fromarray(img) img = img.resize(tgt_size) return np.array(img) def draw(task, input, grounding_texts, new_image_trigger, state, generate_parsed, box_image): print('input', generate_parsed) if type(input) == dict: image = input['image'] mask = input['mask'] if generate_parsed==1: generate_parsed = 0 # import pdb; pdb.set_trace() print('do nothing') print('boxes', box_image.size) return [box_image, new_image_trigger, 1., state, generate_parsed] else: mask = input if mask.ndim == 3: mask = mask[..., 0] image_scale = 1.0 print('vao draw--------------------') mask = binarize(mask) if mask.shape != (512, 512): # assert False, "should not receive any non- 512x512 masks." if 'original_image' in state and state['original_image'].shape[:2] == mask.shape: mask = center_crop(mask, state['inpaint_hw']) image = center_crop(state['original_image'], state['inpaint_hw']) else: mask = np.zeros((512, 512), dtype=np.uint8) mask = binarize(mask) if type(mask) != np.ndarray: mask = np.array(mask) # if mask.sum() == 0: state = {} print('delete state') if True: image = None else: image = Image.fromarray(image) if 'boxes' not in state: state['boxes'] = [] if 'masks' not in state or len(state['masks']) == 0 : state['masks'] = [] last_mask = np.zeros_like(mask) else: last_mask = state['masks'][-1] if type(mask) == np.ndarray and mask.size > 1 : diff_mask = mask - last_mask else: diff_mask = np.zeros([]) if diff_mask.sum() > 0: x1x2 = np.where(diff_mask.max(0) > 1)[0] y1y2 = np.where(diff_mask.max(1) > 1)[0] y1, y2 = y1y2.min(), y1y2.max() x1, x2 = x1x2.min(), x1x2.max() if (x2 - x1 > 5) and (y2 - y1 > 5): state['masks'].append(mask.copy()) state['boxes'].append((x1, y1, x2, y2)) grounding_texts = [x.strip() for x in grounding_texts.split(';')] grounding_texts = [x for x in grounding_texts if len(x) > 0] if len(grounding_texts) < len(state['boxes']): grounding_texts += [f'Obj. {bid+1}' for bid in range(len(grounding_texts), len(state['boxes']))] if box_image is not None: print('boxes', box_image.size) box_image = draw_box(state['boxes'], grounding_texts, image) generate_parsed = 0 print('img scale',image_scale) return [box_image, new_image_trigger, image_scale, state, generate_parsed] def change_state(bboxes,layout, state, instruction): # mask = state['boxes'] = [] state['masks'] = [] image = None list_boxes = bboxes.split('/') result =[] for b in list_boxes: ints = b[1:-1].split(',') l = [] for i in ints: l.append(int(i)) result.append(l) print('run change state') for box in result: state['boxes'].append(box) grounding_texts = [x.strip() for x in instruction.split(';')] grounding_texts = [x for x in grounding_texts if len(x) > 0] if len(grounding_texts) < len(result): grounding_texts += [f'Obj. {bid+1}' for bid in range(len(grounding_texts), len(result))] box_image = draw_box(result, grounding_texts) mask= np.asarray(box_image) mask = binarize_2(mask) state['masks'].append(mask.copy()) # print('done change state', state) print('done change state') # import pdb; pdb.set_trace() return [box_image,state] def example_click(name, grounding_instruction, instruction, bboxes,generate_parsed, trigger_parsed): list_boxes = bboxes.split('/') result =[] for b in list_boxes: ints = b[1:-1].split(',') l = [] for i in ints: l.append(int(i)) result.append(l) print('run change state') box_image = draw_box(result, instruction) trigger_parsed += 1 print('done the example click') return [box_image, trigger_parsed] def clear(task, sketch_pad_trigger, batch_size, state, switch_task=False): sketch_pad_trigger = sketch_pad_trigger + 1 blank_samples = batch_size % 2 if batch_size > 1 else 0 out_images = [gr.Image.update(value=None, visible=True) for i in range(batch_size)] \ + [gr.Image.update(value=None, visible=True) for _ in range(blank_samples)] \ + [gr.Image.update(value=None, visible=False) for _ in range(4 - batch_size - blank_samples)] state = {} return [None, sketch_pad_trigger, None, 1.0] + out_images + [state] css = """ #img2img_image, #img2img_image > .fixed-height, #img2img_image > .fixed-height > div, #img2img_image > .fixed-height > div > img { height: var(--height) !important; max-height: var(--height) !important; min-height: var(--height) !important; } #paper-info a { color:#008AD7; text-decoration: none; } #paper-info a:hover { cursor: pointer; text-decoration: none; } #my_image > div.fixed-height { height: var(--height) !important; } """ rescale_js = """ function(x) { const root = document.querySelector('gradio-app').shadowRoot || document.querySelector('gradio-app'); let image_scale = parseFloat(root.querySelector('#image_scale input').value) || 1.0; const image_width = root.querySelector('#img2img_image').clientWidth; const target_height = parseInt(image_width * image_scale); document.body.style.setProperty('--height', `${target_height}px`); root.querySelectorAll('button.justify-center.rounded')[0].style.display='none'; root.querySelectorAll('button.justify-center.rounded')[1].style.display='none'; return x; } """ # [Paper] with Blocks( css=css, analytics_enabled=False, title="Attention-refocusing demo", ) as main: description = """

Grounded Text-to-Image Synthesis with Attention Refocusing
[Project Page] [GitHub]

To identify the areas of interest based on specific spatial parameters, you need to (1) ⌨️ input the names of the concepts you're interested in Grounding Instruction, and (2) 🖱️ draw their corresponding bounding boxes using Sketch Pad -- the parsed boxes will automatically be showed up once you've drawn them.
For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. Duplicate Space

""" gr.HTML(description) with gr.Row(): with gr.Column(scale=4): sketch_pad_trigger = gr.Number(value=0, visible=False) sketch_pad_resize_trigger = gr.Number(value=0, visible=False) trigger_stage = gr.Number(value=0, visible=False) init_white_trigger = gr.Number(value=0, visible=False) image_scale = gr.Number(value=1.0, elem_id="image_scale", visible=False) new_image_trigger = gr.Number(value=0, visible=False) text_box = gr.Textbox(visible=False) generate_parsed = gr.Number(value=0, visible=False) task = gr.Radio( choices=["Available boxes", 'User provide boxes'], type="value", value="User provide boxes", label="Task", visible=False ) language_instruction = gr.Textbox( label="Language instruction", ) grounding_instruction = gr.Textbox( label="Grounding instruction (Separated by semicolon)", ) with gr.Row(): sketch_pad = ImageMask(label="Sketch Pad", elem_id="img2img_image") out_imagebox = gr.Image(type="pil",elem_id="my_image" ,label="Parsed Sketch Pad", shape=(512,512)) with gr.Row(): clear_btn = gr.Button(value='Clear') gen_btn = gr.Button(value='Generate') with gr.Row(): parsed_btn = gr.Button(value='generate parsed boxes', visible=False) with gr.Accordion("Advanced Options", open=False): with gr.Column(): alpha_sample = gr.Slider(minimum=0, maximum=1.0, step=0.1, value=0.3, label="Scheduled Sampling (τ)") guidance_scale = gr.Slider(minimum=0, maximum=50, step=0.5, value=7.5, label="Guidance Scale") batch_size = gr.Slider(minimum=1, maximum=4,visible=False, step=1, value=1, label="Number of Samples") append_grounding = gr.Checkbox(value=True, label="Append grounding instructions to the caption") use_actual_mask = gr.Checkbox(value=False, label="Use actual mask for inpainting", visible=False) with gr.Row(): fix_seed = gr.Checkbox(value=True, label="Fixed seed") rand_seed = gr.Slider(minimum=0, maximum=1000, step=1, value=0, label="Seed") with gr.Row(): use_style_cond = gr.Checkbox(value=False,visible=False, label="Enable Style Condition") style_cond_image = gr.Image(type="pil",visible=False, label="Style Condition", interactive=True) with gr.Column(scale=4): gr.HTML('Generated Images') with gr.Row(): out_gen_1 = gr.Image(type="pil", visible=True, show_label=False) out_gen_2 = gr.Image(type="pil", visible=False, show_label=False) with gr.Row(): out_gen_3 = gr.Image(type="pil", visible=False, show_label=False) out_gen_4 = gr.Image(type="pil", visible=False, show_label=False) state = gr.State({}) class Controller: def __init__(self): self.calls = 0 self.tracks = 0 self.resizes = 0 self.scales = 0 def init_white(self, init_white_trigger): self.calls += 1 return np.ones((512, 512), dtype='uint8') * 255, 1.0, init_white_trigger+1 def change_n_samples(self, n_samples): blank_samples = n_samples % 2 if n_samples > 1 else 0 return [gr.Image.update(visible=True) for _ in range(n_samples + blank_samples)] \ + [gr.Image.update(visible=False) for _ in range(4 - n_samples - blank_samples)] controller = Controller() main.load( lambda x:x+1, inputs=sketch_pad_trigger, outputs=sketch_pad_trigger, queue=False) sketch_pad.edit( draw, inputs=[task, sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state, generate_parsed, out_imagebox], outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state, generate_parsed], queue=False, ) trigger_stage.change( change_state, inputs=[text_box,sketch_pad, state, grounding_instruction], outputs=[out_imagebox,state], queue=True ) grounding_instruction.change( draw, inputs=[task, sketch_pad, grounding_instruction, sketch_pad_resize_trigger, state, generate_parsed,out_imagebox], outputs=[out_imagebox, sketch_pad_resize_trigger, image_scale, state, generate_parsed], queue=False, ) clear_btn.click( clear, inputs=[task, sketch_pad_trigger, batch_size, state], outputs=[sketch_pad, sketch_pad_trigger, out_imagebox, image_scale, out_gen_1, out_gen_2, out_gen_3, out_gen_4, state], queue=False) sketch_pad_trigger.change( controller.init_white, inputs=[init_white_trigger], outputs=[sketch_pad, image_scale, init_white_trigger], queue=False) gen_btn.click( generate, inputs=[ task, language_instruction, grounding_instruction, sketch_pad, alpha_sample, guidance_scale, batch_size, fix_seed, rand_seed, use_actual_mask, append_grounding, style_cond_image, state, ], outputs=[out_gen_1, out_gen_2, out_gen_3, out_gen_4, state], queue=True ) init_white_trigger.change( None, None, init_white_trigger, _js=rescale_js, queue=False) examples = [ [ 'guide_imgs/0_a_cat_on_the_right_of_a_dog.jpg', "a cat;a dog", "a cat on the right of a dog", '(291, 88, 481, 301)/(25, 64, 260, 391)', 1, 1 ], [ 'guide_imgs/0_a_bus_on_the_left_of_a_car.jpg',#'guide_imgs/0_a_bus_on_the_left_of_a_car.jpg', "a bus;a car", "a bus and a car", '(8,128,266,384)/(300,196,502,316)', #'(8,128,266,384)', #/(300,196,502,316) 1, 2 ], [ 'guide_imgs/1_Two_cars_on_the_street..jpg', "a car;a car", "Two cars on the street.", '(34, 98, 247, 264)/(271, 122, 481, 293)', 1, 3 ], [ 'guide_imgs/80_two_apples_lay_side_by_side_on_a_wooden_table,_their_glossy_red_and_green_skins_glinting_in_the_sunlight..jpg', "an apple;an apple", "two apples lay side by side on a wooden table, their glossy red and green skins glinting in the sunlight.", '(40, 210, 235, 450)/(275, 210, 470, 450)', 1, 4 ], [ 'guide_imgs/10_A_banana_on_the_left_of_an_apple..jpg', "a banana;an apple", "A banana on the left of an apple.", '(62, 193, 225, 354)/(300, 184, 432, 329)', 1, 5 ], [ 'guide_imgs/15_A_pizza_on_the_right_of_a_suitcase..jpg', "a pizza ;a suitcase", "A pizza on the right of a suitcase.", '(307, 112, 490, 280)/(41, 120, 244, 270)', 1, 6 ], [ 'guide_imgs/1_A_wine_glass_on_top_of_a_dog..jpg', "a wine glass;a dog", "A wine glass on top of a dog.", '(206, 78, 306, 214)/(137, 222, 367, 432)', 1, 7 ] , [ 'guide_imgs/2_A_bicycle_on_top_of_a_boat..jpg', "a bicycle;a boat", "A bicycle on top of a boat.", '(185, 110, 335, 205)/(111, 228, 401, 373)', 1, 8 ] , [ 'guide_imgs/4_A_laptop_on_top_of_a_teddy_bear..jpg', "a laptop;a teddy bear", "A laptop on top of a teddy bear.", '(180, 70, 332, 210)/(150, 240, 362, 420)', 1, 9 ] , [ 'guide_imgs/0_A_train_on_top_of_a_surfboard..jpg', "a train;a surfboard", "A train on top of a surfboard.", '(130, 80, 385, 240)/(75, 260, 440, 450)', 1, 10 ] ] with gr.Column(): create_examples( examples=examples, inputs=[sketch_pad, grounding_instruction,language_instruction , text_box, generate_parsed, trigger_stage], outputs=None, fn=None, cache_examples=False, ) main.queue(concurrency_count=1, api_open=False) main.launch(share=False, show_api=False, show_error=True, debug=False, server_name="0.0.0.0")