# -*- coding: utf-8 -*-
"""Copy of compose_glide.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/19xx6Nu4FeiGj-TzTUFxBf-15IkeuFx_F
"""

import traceback

import streamlit as st
import gradio as gr
import torch as th

from composable_diffusion.download import download_model
from composable_diffusion.model_creation import create_model_and_diffusion as create_model_and_diffusion_for_clevr
from composable_diffusion.model_creation import model_and_diffusion_defaults as model_and_diffusion_defaults_for_clevr
from composable_diffusion.composable_stable_diffusion.pipeline_composable_stable_diffusion import ComposableStableDiffusionPipeline

# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.

has_cuda = th.cuda.is_available()
device = th.device('cuda' if has_cuda else 'cpu')

# init stable diffusion model
pipe = ComposableStableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
).to(device)

pipe.safety_checker = None

# create model for CLEVR Objects
clevr_options = model_and_diffusion_defaults_for_clevr()

flags = {
    "image_size": 128,
    "num_channels": 192,
    "num_res_blocks": 2,
    "learn_sigma": True,
    "use_scale_shift_norm": False,
    "raw_unet": True,
    "noise_schedule": "squaredcos_cap_v2",
    "rescale_learned_sigmas": False,
    "rescale_timesteps": False,
    "num_classes": '2',
    "dataset": "clevr_pos",
    "use_fp16": has_cuda,
    "timestep_respacing": '100'
}

# Overlay the demo-specific settings on top of the library defaults.
clevr_options.update(flags)

clevr_model, clevr_diffusion = create_model_and_diffusion_for_clevr(**clevr_options)
clevr_model.eval()
if has_cuda:
    clevr_model.convert_to_fp16()

clevr_model.to(device)
clevr_model.load_state_dict(th.load(download_model('clevr_pos'), device))
print('total clevr_pos parameters', sum(x.numel() for x in clevr_model.parameters()))

# Upper bound on the length of the file name stem derived from a prompt.
_MAX_FILENAME_CHARS = 200


def _safe_filename(prompt):
    """Turn a free-form prompt into a file name stem that is safe to write.

    Whitespace runs become single underscores; every character that is not
    alphanumeric, ``-`` or ``_`` (path separators, ``|``, dots, ...) is
    replaced by ``_`` so that user input cannot escape the working directory
    or produce a name the file system rejects. The result is truncated to
    ``_MAX_FILENAME_CHARS`` characters and falls back to ``"image"`` when the
    prompt contains nothing usable.
    """
    stem = "_".join(prompt.split())
    cleaned = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in stem)
    return cleaned[:_MAX_FILENAME_CHARS] or "image"


def compose_clevr_objects(prompt, weights, steps):
    """Sample one CLEVR image by composing several position conditions.

    Args:
        prompt: ``|``-separated list of ``x, y`` coordinate pairs, e.g.
            ``'0.1, 0.5 | 0.3, 0.5'``.
        weights: ``|``-separated list of floats, one guidance weight per
            coordinate pair.
        steps: number of diffusion steps; converted with ``int(...)`` and
            used as the ``timestep_respacing`` option.

    Returns:
        The first generated sample as a ``uint8`` NumPy array in
        height x width x channel layout.

    Raises:
        ValueError: if a weight or coordinate cannot be parsed as a float.
        IndexError: if a coordinate entry has no ``,`` separator.
    """
    weight_values = [float(item.strip()) for item in weights.split('|')]
    # Shape (n, 1, 1, 1) so each weight scales one whole eps image below.
    weight_tensor = th.tensor(weight_values, device=device).reshape(-1, 1, 1, 1)

    coordinates = []
    for entry in prompt.split('|'):
        parts = entry.split(',')
        coordinates.append([float(parts[0].strip()), float(parts[1].strip())])
    coordinates.append([-1, -1])  # add unconditional score label

    batch_size = 1

    # Use a per-call copy so concurrent or later requests never observe
    # another request's step count in the shared module-level options.
    sampling_options = dict(clevr_options, timestep_respacing=str(int(steps)))
    _, diffusion = create_model_and_diffusion_for_clevr(**sampling_options)

    def model_fn(x_t, ts, **kwargs):
        """Combine per-label eps predictions into a single guided eps."""
        # Evaluate the same noisy image once per label (conditions + uncond).
        half = x_t[:1]
        combined = th.cat([half] * kwargs['y'].size(0), dim=0)
        model_out = clevr_model(combined, ts, **kwargs)
        # First three channels are eps; the remainder is passed through
        # unchanged (presumably the learned-sigma output — TODO confirm).
        eps, rest = model_out[:, :3], model_out[:, 3:]
        masks = kwargs.get('masks')
        cond_eps = eps[masks]
        uncond_eps = eps[~masks]
        half_eps = uncond_eps + (weight_tensor * (cond_eps - uncond_eps)).sum(dim=0, keepdims=True)
        # Broadcast the composed eps back to every element of the batch.
        eps = th.cat([half_eps] * x_t.size(0), dim=0)
        return th.cat([eps, rest], dim=1)

    # True marks a conditional label; the trailing False marks the
    # unconditional label appended above.
    masks = [True] * (len(coordinates) - 1) + [False]
    model_kwargs = dict(
        y=th.tensor(coordinates, dtype=th.float, device=device),
        masks=th.tensor(masks, dtype=th.bool, device=device)
    )
    image_size = clevr_options["image_size"]
    samples = diffusion.p_sample_loop(
        model_fn,
        (len(coordinates), 3, image_size, image_size),
        device=device,
        clip_denoised=True,
        progress=True,
        model_kwargs=model_kwargs,
        cond_fn=None,
    )[:batch_size]

    # Channel-first [-1, 1] tensor -> channel-last uint8 array.
    out_img = samples[0].permute(1, 2, 0)
    out_img = (out_img + 1) / 2
    out_img = (out_img.detach().cpu() * 255.).to(th.uint8)
    return out_img.numpy()


def stable_diffusion_compose(prompt, steps, weights, seed):
    """Generate one image with the composable Stable Diffusion pipeline.

    Args:
        prompt: ``|``-separated text prompts.
        steps: number of inference steps; converted with ``int(...)`` because
            UI sliders may deliver floats.
        weights: ``|``-separated weights string, forwarded to the pipeline.
        seed: random seed; converted with ``int(...)``.

    Returns:
        The first image produced by the pipeline. The image is also saved as
        a PNG in the current working directory under a sanitised file name
        derived from the prompt.
    """
    # Create the generator on the device actually in use; a hard-coded
    # "cuda" generator fails on CPU-only hosts.
    generator = th.Generator(device=device).manual_seed(int(seed))
    image = pipe(prompt, guidance_scale=7.5, num_inference_steps=int(steps),
                 weights=weights, generator=generator).images[0]
    image.save(f'{_safe_filename(prompt)}.png')
    return image


def compose(prompt, weights, version, steps, seed):
    """Gradio entry point: dispatch to the selected model.

    Args:
        prompt: ``|``-separated prompts (text for Stable Diffusion,
            ``x, y`` pairs for CLEVR).
        weights: ``|``-separated weights string.
        version: ``'Stable_Diffusion_1v_4'`` selects Stable Diffusion; any
            other value selects the CLEVR model.
        steps: number of sampling steps.
        seed: random seed (only used by the Stable Diffusion path).

    Returns:
        The generated image, or ``None`` if generation failed. Failures are
        reported with a full traceback on stderr instead of being raised, so
        the UI keeps running.
    """
    try:
        with th.no_grad():
            if version == 'Stable_Diffusion_1v_4':
                return stable_diffusion_compose(prompt, steps, weights, seed)
            return compose_clevr_objects(prompt, weights, steps)
    except Exception:
        # Top-level UI boundary: keep the app alive but keep the traceback.
        traceback.print_exc()
        return None


examples_1 = "A castle in a forest | grainy, fog"
examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
examples_5 = 'a white church | lightning in the background'
examples_6 = 'mystical trees | A dark magical pond | dark'
examples_7 = 'A lake | A mountain | Cherry Blossoms next to the lake'

examples = [
    [examples_6, "7.5 | 7.5 | -7.5", 'Stable_Diffusion_1v_4', 50, 8],
    [examples_6, "7.5 | 7.5 | 7.5", 'Stable_Diffusion_1v_4', 50, 8],
    [examples_1, "7.5 | -7.5", 'Stable_Diffusion_1v_4', 50, 0],
    [examples_7, "7.5 | 7.5 | 7.5", 'Stable_Diffusion_1v_4', 50, 3],
    [examples_5, "7.5 | 7.5", 'Stable_Diffusion_1v_4', 50, 0],
    [examples_3, "7.5 | 7.5 | 7.5 | 7.5 | 7.5", 'CLEVR Objects', 100, 0]
]

title = 'Compositional Visual Generation with Composable Diffusion Models'
description = '''

Our conjunction and negation (a.k.a. negative prompts) operators are also added into stable diffusion webui! (Negation and Conjunction)

See more information from our Project Page.

When composing multiple sentences, use `|` as the delimiter, see given examples below.

You can also specify the weight of each text by using `|` as the delimiter. When the weight is negative, it will use Negation Operator (NOT), which indicates the corresponding prompt is a negative prompt. Otherwise it will use Conjunction operator (AND).

Only Conjunction operator is enabled for CLEVR Object.

Note: When using Stable Diffusion, black images will be returned if the given prompt is detected as problematic. For composing GLIDE model, we recommend using the Colab demo in our Project Page.

'''

iface = gr.Interface(
    compose,
    inputs=[
        gr.Textbox(label='prompt', value='mystical trees | A dark magical pond | dark'),
        gr.Textbox(label='weights', value='7.5 | 7.5 | -7.5'),
        gr.Radio(['Stable_Diffusion_1v_4', 'CLEVR Objects'], type="value", label='version', value='Stable_Diffusion_1v_4'),
        gr.Slider(10, 200, value=50),
        gr.Number(8)
    ],
    outputs='image',
    cache_examples=False,
    title=title,
    description=description,
    examples=examples)

iface.launch()