# -*- coding: utf-8 -*-
"""Copy of compose_glide.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
"""

import gradio as gr
import torch as th
from torch import autocast

from composable_diffusion.download import download_model
from composable_diffusion.model_creation import create_model_and_diffusion as create_model_and_diffusion_for_clevr
from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr
from composable_stable_diffusion_pipeline import ComposableStableDiffusionPipeline

# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.
has_cuda = th.cuda.is_available()
device = th.device('cpu' if not has_cuda else 'cuda')
print(device)

# Initialize the Stable Diffusion pipeline.
# The personal access token originally hard-coded here is redacted;
# `use_auth_token=True` reads your token from the local `huggingface-cli login` cache.
pipe = ComposableStableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=True
).to(device)


def dummy(images, **kwargs):
    # Bypass the safety checker: return the images unchanged, flagged as safe.
    return images, False


pipe.safety_checker = dummy

# Create the model for CLEVR Objects.
clevr_options = model_and_diffusion_defaults_for_clevr()

flags = {
    "image_size": 128,
    "num_channels": 192,
    "num_res_blocks": 2,
    "learn_sigma": True,
    "use_scale_shift_norm": False,
    "raw_unet": True,
    "noise_schedule": "squaredcos_cap_v2",
    "rescale_learned_sigmas": False,
    "rescale_timesteps": False,
    "num_classes": '2',
    "dataset": "clevr_pos",
    "use_fp16": has_cuda,
    "timestep_respacing": '100'
}

for key, val in flags.items():
    clevr_options[key] = val

clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
clevr_model.eval()
if has_cuda:
    clevr_model.convert_to_fp16()
clevr_model.to(device)
clevr_model.load_state_dict(th.load(download_model('clevr_pos'), map_location=device))
print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))


def compose_clevr_objects(prompt, guidance_scale, steps):
    # Parse "x, y | x, y | ..." into a list of [x, y] coordinates.
    coordinates = [[float(x.split(',')[0].strip()), float(x.split(',')[1].strip())]
                   for x in prompt.split('|')]
    coordinates += [[-1, -1]]  # add unconditional score label
    batch_size = 1

    # Rebuild the diffusion process with the requested number of steps.
    clevr_options['timestep_respacing'] = str(int(steps))
    _, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)

    def model_fn(x_t, ts, **kwargs):
        half = x_t[:1]
        combined = th.cat([half] * kwargs['y'].size(0), dim=0)
        model_out = clevr_model(combined, ts, **kwargs)
        eps, rest = model_out[:, :3], model_out[:, 3:]
        masks = kwargs.get('masks')
        # Compose scores: average the conditional eps over all conditions,
        # then apply classifier-free guidance away from the unconditional eps.
        cond_eps = eps[masks].mean(dim=0, keepdim=True)
        uncond_eps = eps[~masks].mean(dim=0, keepdim=True)
        half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
        eps = th.cat([half_eps] * x_t.size(0), dim=0)
        return th.cat([eps, rest], dim=1)

    def sample(coordinates):
        masks = [True] * (len(coordinates) - 1) + [False]
        model_kwargs = dict(
            y=th.tensor(coordinates, dtype=th.float, device=device),
            masks=th.tensor(masks, dtype=th.bool, device=device)
        )
        samples = clevr_diffusion.p_sample_loop(
            model_fn,
            (len(coordinates), 3, clevr_options["image_size"], clevr_options["image_size"]),
            device=device,
            clip_denoised=True,
            progress=True,
            model_kwargs=model_kwargs,
            cond_fn=None,
        )[:batch_size]
        return samples

    samples = sample(coordinates)

    # Map the sample from [-1, 1] to a uint8 HWC image.
    out_img = samples[0].permute(1, 2, 0)
    out_img = (out_img + 1) / 2
    out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
    return out_img.numpy()
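# A minimal usage sketch for the function above (values taken from the CLEVR
# example further down): each "x, y" pair becomes one object position, so this
# call would place five objects along a horizontal line. It is left commented
# out because calling it eagerly runs a full sampling loop at import time:
#
#   img = compose_clevr_objects('0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5',
#                               guidance_scale=10, steps=100)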
def stable_diffusion_compose(prompt, scale, steps, weights, seed):
    # Seed on the active device so CPU-only runs also work.
    generator = th.Generator(device.type).manual_seed(int(seed))
    with autocast('cpu' if not th.cuda.is_available() else 'cuda'):
        image = pipe(prompt, guidance_scale=scale, num_inference_steps=steps,
                     weights=weights, generator=generator)["sample"][0]
        image.save(f'{"_".join(prompt.split())}.png')
    return image


def compose(prompt, weights, version, guidance_scale, steps, seed):
    try:
        with th.no_grad():
            if version == 'Stable_Diffusion_1v_4':
                return stable_diffusion_compose(prompt, guidance_scale, steps, weights, seed)
            else:
                return compose_clevr_objects(prompt, guidance_scale, steps)
    except Exception as e:
        print(e)
        return None


examples_1 = "A castle in a forest | grainy, fog"
examples_2 = 'A blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain'
examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
examples_4 = 'a photo of Obama | a photo of Biden'
examples_5 = 'a white church | lightning in the background'
examples_6 = 'a camel | arctic'
examples_7 = 'A lake | A mountain | Cherry Blossoms next to the lake'

# [prompt, weights, version, guidance_scale, steps, seed]
examples = [
    [examples_1, "1 | -1", 'Stable_Diffusion_1v_4', 15, 50, 0],
    [examples_4, "1 | 1", 'Stable_Diffusion_1v_4', 15, 50, 0],
    [examples_7, "1 | 1 | 1", 'Stable_Diffusion_1v_4', 15, 50, 0],
    [examples_5, "1 | 1", 'Stable_Diffusion_1v_4', 15, 50, 0],
    [examples_6, "1 | 1", 'Stable_Diffusion_1v_4', 15, 50, 0],
    [examples_6, "1 | -1", 'Stable_Diffusion_1v_4', 15, 50, 0],
    [examples_3, "1 | 1 | 1 | 1 | 1", 'CLEVR Objects', 10, 100, 0]
]

title = 'Compositional Visual Generation with Composable Diffusion Models'

description = '''

Our conjunction and negation operators have also been added to the Stable Diffusion web UI (as Conjunction and Negation)!

See our Project Page for more information.

When composing multiple prompts, use `|` as the delimiter, as in the examples below.
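For instance, `a white church | lightning in the background` composes both concepts into a single image.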

You can also specify a weight for each prompt, again using `|` as the delimiter. A negative weight applies the negation operator (NOT); a non-negative weight applies the conjunction operator (AND).
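For example, `a camel | arctic` with weights `1 | 1` composes a camel AND an arctic scene, while weights `1 | -1` composes a camel AND NOT an arctic scene.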

Only the conjunction operator is enabled for CLEVR Objects.
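For CLEVR Objects, each prompt segment is an `x, y` object position (the examples use values in [0, 1]), e.g. `0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5` places three objects along a horizontal line.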

Note: when using Stable Diffusion, black images may be returned if the given prompt is detected as problematic. For composing the GLIDE model, we recommend the Colab demo on our Project Page.

'''

iface = gr.Interface(
    compose,
    inputs=[
        gr.Textbox(label='prompt', value='a photo of Obama | a photo of Biden'),
        gr.Textbox(label='weights', value='1 | 1'),
        gr.Radio(['Stable_Diffusion_1v_4', 'CLEVR Objects'], type="value",
                 label='version', value='Stable_Diffusion_1v_4'),
        gr.Slider(2, 30, value=15, label='guidance_scale'),
        gr.Slider(10, 200, value=50, label='steps'),
        gr.Number(0, label='seed')
    ],
    outputs='image',
    cache_examples=False,
    title=title,
    description=description,
    examples=examples
)

iface.launch()
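# A sketch of programmatic use, bypassing the UI: `compose` takes the same
# arguments, in the same order, as the `inputs` list above, e.g.
#
#   out = compose('a white church | lightning in the background',
#                 '1 | 1', 'Stable_Diffusion_1v_4', 15, 50, 0)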