import os
import sys

os.system('pip install gradio==2.3.0a0')

import gradio as gr

os.system('git clone https://github.com/openai/CLIP')
os.system('git clone https://github.com/openai/guided-diffusion')
os.system('pip install -e ./CLIP')
os.system('pip install -e ./guided-diffusion')
os.system('pip install kornia')
os.system("curl -OL 'https://openaipublic.blob.core.windows.net/diffusion/jul-2021/256x256_diffusion_uncond.pt'")

# Imports

import math

#from IPython import display
from kornia import augmentation, filters
from PIL import Image
import torch
from torch import nn
from torch.nn import functional as F
from torchvision import transforms
from torchvision.transforms import functional as TF
from tqdm.notebook import tqdm

sys.path.append('./CLIP')
sys.path.append('./guided-diffusion')

import clip
from guided_diffusion.script_util import create_model_and_diffusion, model_and_diffusion_defaults

import numpy as np
import imageio

# Model settings

model_config = model_and_diffusion_defaults()
model_config.update({
    'attention_resolutions': '32, 16, 8',
    'class_cond': False,
    'diffusion_steps': 1000,
    'rescale_timesteps': False,
    'timestep_respacing': '500',
    'image_size': 256,
    'learn_sigma': True,
    'noise_schedule': 'linear',
    'num_channels': 256,
    'num_head_channels': 64,
    'num_res_blocks': 2,
    'resblock_updown': True,
    'use_fp16': True,
    'use_scale_shift_norm': True,
})

# Load models and define necessary functions

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

model, diffusion = create_model_and_diffusion(**model_config)
model.load_state_dict(torch.load('256x256_diffusion_uncond.pt', map_location='cpu'))
model.eval().requires_grad_(False).to(device)
if model_config['use_fp16']:
    model.convert_to_fp16()

clip_model = clip.load('ViT-B/16', jit=False)[0].eval().requires_grad_(False).to(device)
clip_size = clip_model.visual.input_resolution
normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                 std=[0.26862954, 0.26130258, 0.27577711])


def spherical_dist_loss(x, y):
    # Squared great-circle distance between the L2-normalized embeddings (up to a constant factor).
    x = F.normalize(x, dim=-1)
    y = F.normalize(y, dim=-1)
    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)


def inference(text):
    all_frames = []
    prompt = text
    batch_size = 1
    clip_guidance_scale = 2750
    seed = 0

    if seed is not None:
        torch.manual_seed(seed)

    text_embed = clip_model.encode_text(clip.tokenize(prompt).to(device)).float()

    # Small random translation augmentation applied before the image is encoded with CLIP.
    translate_by = 8 / clip_size
    if translate_by:
        aug = augmentation.RandomAffine(0, (translate_by, translate_by),
                                        padding_mode='border', p=1)
    else:
        aug = nn.Identity()

    cur_t = diffusion.num_timesteps - 1

    def cond_fn(x, t, y=None):
        # Guidance function: returns the gradient of the negated CLIP loss w.r.t. the
        # current sample, scaled by clip_guidance_scale.
        with torch.enable_grad():
            x_in = x.detach().requires_grad_()
            # Blur the sample with a noise-level-dependent Gaussian kernel before feeding it to CLIP.
            sigma = min(24, diffusion.sqrt_recipm1_alphas_cumprod[cur_t] / 4)
            kernel_size = max(math.ceil((sigma * 6 + 1) / 2) * 2 - 1, 3)
            x_blur = filters.gaussian_blur2d(x_in, (kernel_size, kernel_size), (sigma, sigma))
            clip_in = F.interpolate(aug(x_blur.add(1).div(2)), (clip_size, clip_size),
                                    mode='bilinear', align_corners=False)
            image_embed = clip_model.encode_image(normalize(clip_in)).float()
            losses = spherical_dist_loss(image_embed, text_embed)
            grad = -torch.autograd.grad(losses.sum(), x_in)[0]
            return grad * clip_guidance_scale

    samples = diffusion.p_sample_loop_progressive(
        model,
        (batch_size, 3, model_config['image_size'], model_config['image_size']),
        clip_denoised=True,
        model_kwargs={},
        cond_fn=cond_fn,
        progress=True,
    )

    for i, sample in enumerate(samples):
        cur_t -= 1
        if i % 1 == 0 or cur_t == -1:
            print()
            for j, image in enumerate(sample['pred_xstart']):
                # Collect the current denoised prediction as a video frame.
                #filename = f'progress_{j:05}.png'
                img = TF.to_pil_image(image.add(1).div(2).clamp(0, 1))
                all_frames.append(img)
                tqdm.write(f'Step {i}, output {j}:')
                #display.display(display.Image(filename))

    # Write all collected frames to an MP4.
    writer = imageio.get_writer('video.mp4', fps=20)
    for im in all_frames:
        writer.append_data(np.array(im))
    writer.close()
    return img, 'video.mp4'


title = "CLIP Guided Diffusion"
description = "Gradio demo for CLIP Guided Diffusion. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."

article = "By Katherine Crowson (https://github.com/crowsonkb, https://twitter.com/RiversHaveWings). It uses OpenAI's 256x256 unconditional ImageNet diffusion model (https://github.com/openai/guided-diffusion) together with CLIP (https://github.com/openai/CLIP) to connect text prompts with images. | Colab"

iface = gr.Interface(inference,
                     inputs="text",
                     outputs=["image", "video"],
                     title=title,
                     description=description,
                     article=article,
                     examples=[["coral reef city by artstation artists"]],
                     enable_queue=True)
iface.launch()