# stylegan3_clip/app.py - StyleGAN3 + CLIP Gradio demo
import os
os.system("git clone https://github.com/openai/CLIP")
os.system("pip install -e ./CLIP")
os.system("pip install gradio==2.3.7")
import sys
sys.path.append('./CLIP')
import io
import time
import pickle
import shutil
import numpy as np
from PIL import Image
import torch
import torch.nn.functional as F
import requests
import torchvision.transforms as transforms
import torchvision.transforms.functional as TF
import clip
from tqdm import tqdm  # plain tqdm; this script runs outside a notebook
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from einops import rearrange
import gradio as gr
import imageio
# Requires a CUDA GPU; fails fast here if none is available.
print(torch.cuda.get_device_name(0))
device = torch.device('cuda:0')
def fetch(url_or_path):
    """Return a binary file-like object for a URL or a local path."""
    if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'):
        r = requests.get(url_or_path)
        r.raise_for_status()
        fd = io.BytesIO()
        fd.write(r.content)
        fd.seek(0)
        return fd
    return open(url_or_path, 'rb')
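# Hypothetical usage: Image.open(fetch("https://example.com/photo.jpg")) and
# Image.open(fetch("local.jpg")) both yield a PIL image.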
def fetch_model(url_or_path):
    """Download a file with wget (resumable) unless it already exists; return its basename."""
    basename = os.path.basename(url_or_path)
    if os.path.exists(basename):
        return basename
    else:
        os.system(f"wget -c '{url_or_path}'")
        return basename
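# e.g. fetch_model(network_url) (defined below) downloads the pickle once and
# returns just its filename for local loading.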
def norm1(prompt):
    "Normalize to the unit sphere."
    return prompt / prompt.square().sum(dim=-1, keepdim=True).sqrt()
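# Worked example: norm1(torch.tensor([3., 4.])) -> tensor([0.6, 0.8]), since
# 3-4-5 is a right triangle; equivalent to F.normalize(x, dim=-1).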
def spherical_dist_loss(x, y):
    x = F.normalize(x, dim=-1)
    y = F.normalize(y, dim=-1)
    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)
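# For unit vectors, ||x - y|| = 2*sin(theta/2) where theta is the angle between
# them, so this returns 2*(theta/2)**2 = theta**2 / 2, half the squared
# great-circle distance; a smooth alternative to cosine distance for CLIP embeddings.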
class MakeCutouts(torch.nn.Module):
    def __init__(self, cut_size, cutn, cut_pow=1.):
        super().__init__()
        self.cut_size = cut_size
        self.cutn = cutn
        self.cut_pow = cut_pow

    def forward(self, input):
        sideY, sideX = input.shape[2:4]
        max_size = min(sideX, sideY)
        min_size = min(sideX, sideY, self.cut_size)
        cutouts = []
        for _ in range(self.cutn):
            # cut_pow < 1 biases the random crop sizes toward larger crops.
            size = int(torch.rand([])**self.cut_pow * (max_size - min_size) + min_size)
            offsetx = torch.randint(0, sideX - size + 1, ())
            offsety = torch.randint(0, sideY - size + 1, ())
            cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]
            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
        return torch.cat(cutouts)
make_cutouts = MakeCutouts(224, 32, 0.5)
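# Shape example: for input [n, 3, 1024, 1024], make_cutouts returns
# [32 * n, 3, 224, 224]: 32 random crops per image, pooled to CLIP's 224x224 input.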
def embed_image(image):
    n = image.shape[0]
    cutouts = make_cutouts(image)
    embeds = clip_model.embed_cutout(cutouts)
    embeds = rearrange(embeds, '(cc n) c -> cc n c', n=n)
    return embeds
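# Output shape is [cutn, n, 512] for ViT-B/32: one embedding per cutout. Callers
# either average embeddings over cutouts (for a target) or average per-cutout losses.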
def embed_url(url):
    image = Image.open(fetch(url)).convert('RGB')
    return embed_image(TF.to_tensor(image).to(device).unsqueeze(0)).mean(0).squeeze(0)
class CLIP(object):
    def __init__(self):
        clip_model = "ViT-B/32"
        self.model, _ = clip.load(clip_model)
        self.model = self.model.requires_grad_(False)
        self.normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                                              std=[0.26862954, 0.26130258, 0.27577711])

    @torch.no_grad()
    def embed_text(self, prompt):
        "Normalized clip text embedding."
        return norm1(self.model.encode_text(clip.tokenize(prompt).to(device)).float())

    def embed_cutout(self, image):
        "Normalized clip image embedding."
        return norm1(self.model.encode_image(self.normalize(image)))
clip_model = CLIP()
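# Hypothetical usage: clip_model.embed_text("a portrait of Mario") returns a
# [1, 512] unit-norm tensor that serves as the optimization target below.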
# Load the StyleGAN3 generator (exponential-moving-average weights, G_ema).
base_url = "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/"
model_name = "stylegan3-t-ffhqu-1024x1024.pkl"
#model_name = "stylegan3-r-metfacesu-1024x1024.pkl"
#model_name = "stylegan3-t-afhqv2-512x512.pkl"
network_url = base_url + model_name
fetch_model(network_url)
with open(model_name, 'rb') as fp:
    G = pickle.load(fp)['G_ema'].to(device)
# Estimate the per-dimension standard deviation of W space from 10,000 mapped
# samples; the optimizer below works in these normalized units.
zs = torch.randn([10000, G.mapping.z_dim], device=device)
w_stds = G.mapping(zs, None).std(0)
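# The reparameterization q = (w - w_avg) / w_stds whitens W space, so a single
# AdamW learning rate behaves comparably across dimensions; w is recovered as
# q * w_stds + w_avg wherever the generator is called.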
def inference(text, steps, image, mode, seed):
    if mode == "CLIP+StyleGAN3":
        all_frames = []
        # An uploaded image takes priority over the text prompt as the CLIP target.
        if image is not None:
            target = embed_image(TF.to_tensor(image).to(device).unsqueeze(0)).mean(0).squeeze(0)
        else:
            target = clip_model.embed_text(text)
        seed = -1  # CLIP mode ignores the seed slider; -1 requests a random seed below.
        if seed == -1:
            seed = np.random.randint(0, 2**32 - 1)
        torch.manual_seed(seed)
        timestring = time.strftime('%Y%m%d%H%M%S')
        with torch.no_grad():
            # Seed search: draw 8 batches of 4 candidate latents, keep the best of
            # each batch by CLIP loss, then start optimization from the overall best.
            qs = []
            losses = []
            for _ in range(8):
                q = (G.mapping(torch.randn([4, G.mapping.z_dim], device=device), None, truncation_psi=0.7) - G.mapping.w_avg) / w_stds
                images = G.synthesis(q * w_stds + G.mapping.w_avg)
                embeds = embed_image(images.add(1).div(2))
                loss = spherical_dist_loss(embeds, target).mean(0)
                i = torch.argmin(loss)
                qs.append(q[i])
                losses.append(loss[i])
            qs = torch.stack(qs)
            losses = torch.stack(losses)
            print(losses)
            print(losses.shape, qs.shape)
        i = torch.argmin(losses)
        q = qs[i].unsqueeze(0)
        q.requires_grad_()
        q_ema = q
        # betas=(0.0, 0.999): no momentum, only RMS gradient scaling.
        opt = torch.optim.AdamW([q], lr=0.03, betas=(0.0, 0.999))
        loop = tqdm(range(steps))
        for i in loop:
            opt.zero_grad()
            w = q * w_stds
            image = G.synthesis(w + G.mapping.w_avg, noise_mode='const')
            embed = embed_image(image.add(1).div(2))
            loss = spherical_dist_loss(embed, target).mean()
            loss.backward()
            opt.step()
            loop.set_postfix(loss=loss.item(), q_magnitude=q.std().item())

            # Render each frame from an exponential moving average of q for a smoother video.
            q_ema = q_ema * 0.9 + q * 0.1
            image = G.synthesis(q_ema * w_stds + G.mapping.w_avg, noise_mode='const')
            pil_image = TF.to_pil_image(image[0].add(1).div(2).clamp(0, 1))
            all_frames.append(pil_image)
            #os.makedirs(f'samples/{timestring}', exist_ok=True)
            #pil_image.save(f'samples/{timestring}/{i:04}.jpg')

        writer = imageio.get_writer('test.mp4', fps=15)
        for im in all_frames:
            writer.append_data(np.array(im))
        writer.close()
        return pil_image, "test.mp4"
    else:
        # Plain StyleGAN3 sampling: gen_images.py and gen_video.py are the utility
        # scripts from the NVlabs/stylegan3 repository, assumed present in the
        # working directory.
        os.system("python gen_images.py --outdir=out --trunc=1 --seeds=" + str(seed) +
                  " --network=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhq-1024x1024.pkl")
        os.system("python gen_video.py --output=lerp.mp4 --trunc=1 --seeds=0-" + str(seed) + " --grid=1x1" +
                  " --network=https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/stylegan3-r-ffhq-1024x1024.pkl")
        out = Image.open(f'out/seed{seed:04d}.png')
        return out, "lerp.mp4"
title = "StyleGAN3+CLIP"
description = "Gradio demo for StyleGAN3+CLIP: Generates images (mostly faces) using StyleGAN3 with CLIP guidance. To use it, simply add your text, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://colab.research.google.com/drive/1eYlenR1GHPZXt-YuvXabzO9wfh9CWY36' target='_blank'>Colab</a> Written by nshepperd (https://twitter.com/nshepperd1, https://github.com/nshepperd). Thanks to Katherine Crowson (https://twitter.com/RiversHaveWings, https://github.com/crowsonkb) for coming up with many improved sampling tricks, as well as some of the code | <a href='https://github.com/NVlabs/stylegan3' target='_blank'>StyleGAN3 Gihub</a> | <a href='https://github.com/openai/CLIP' target='_blank'>CLIP Github</a></p>"
examples = [['mario', 150, None, "CLIP+StyleGAN3", 5], ['', 150, None, "Stylegan3", 10]]
gr.Interface(
    inference,
    [
        "text",
        gr.inputs.Slider(minimum=50, maximum=200, step=1, default=150, label="steps"),
        gr.inputs.Image(type="pil", label="Image (Optional)", optional=True),
        gr.inputs.Radio(["CLIP+StyleGAN3", "Stylegan3"], type="value", default="CLIP+StyleGAN3", label="mode"),
        gr.inputs.Slider(minimum=5, maximum=10, step=1, default=5, label="seed (for stylegan3)"),
    ],
    [gr.outputs.Image(type="pil", label="Output"), "playable_video"],
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)