Ahsen Khaliq committed
Commit 1dca94e
1 Parent(s): a7aebee

Update app.py

Files changed (1)
  1. app.py +166 -115
app.py CHANGED
@@ -1,135 +1,186 @@
 import os
-os.system("""pip install --upgrade https://github.com/podgorskiy/dnnlib/releases/download/0.0.1/dnnlib-0.0.1-py3-none-any.whl numpy tqdm Pillow torch-utils==0.0.7 torch==1.9.0+cu111 torchvision==0.10.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html ftfy regex git+https://github.com/openai/CLIP.git ninja git+https://github.com/geoopt/geoopt.git gdown exrex torchtext==0.10.0""")
-os.system("nvidia-smi")
-import gradio as gr
 import pickle
 import numpy as np
-import PIL
 import torch
-import dnnlib
 import clip
-import exrex
 from tqdm.notebook import tqdm
-from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

-network_pkl = "https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/ffhq.pkl"
-if not os.path.isfile(os.path.basename(network_pkl)):
-    os.system("wget https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/ffhq.pkl")

-cuda_available = torch.cuda.is_available()
-device = torch.device('cuda' if cuda_available else 'cpu')

-# Load StyleGAN
-with open(os.path.basename(network_pkl), 'rb') as f:
-    # If legacy pkl then convert before loading.
-    try:
-        G = pickle.load(f)['G_ema'].to(device)
-    except ModuleNotFoundError:
-        import legacy
-        G = legacy.load_network_pkl(f)['G_ema'].to(device)

-clip_model = "ViT-B/32"
-model, preprocess = clip.load(clip_model)

-os.system("pwd")

-if not os.path.exists('CLIP_vecs.npy'):
-    os.system("wget https://www.dropbox.com/s/seqev3lvy6e6dz6/CLIP_vecs.npy")

-os.system("ls")
-if os.path.exists('CLIP_vecs.npy'):
-    CLIP_vecs = torch.from_numpy(np.load('CLIP_vecs.npy'))
-    seeded_z = torch.from_numpy(np.stack([np.random.RandomState(seed).randn(G.w_dim) for seed in range(CLIP_vecs.shape[0])]))
-
-os.system("ls")

-def spherical_avg(p, w=None, tol=1e-6):
-    """Applies a weighted spherical average as described in the paper
-    `Spherical Averages and Applications to Spherical Splines and
-    Interpolation <http://math.ucsd.edu/~sbuss/ResearchWeb/spheremean>`__ .
-
-    Args:
-        p (torch.Tensor): Input vectors
-        w (torch.Tensor, optional): Weights for averaging.
-        tol (float, optional): The desired tolerance of the output.
-            Default: 1e-6
-    """
-    from geoopt import Sphere
-    sphere = Sphere()
-    if w is None:
-        w = p.new_ones([p.shape[0]])
-    assert p.ndim == 2 and w.ndim == 1 and len(p) == len(w)
-    w = w / w.sum()
-    p = sphere.projx(p)
-    q = sphere.projx(p.mul(w.unsqueeze(1)).sum(dim=0))
-    while True:
-        q_new = sphere.retr(q, sphere.logmap(q, p).mul(w.unsqueeze(1)).sum(dim=0))
-        norm = torch.linalg.vector_norm(q.sub(q_new))
-        q = q_new
-        if norm <= tol:
-            break
-    return q


 def inference(text):
-    prompt = text
-    prompt_preview = False
-    continue_opt = False
-    iterations = 20
-    k = 18
-
-    if not continue_opt:
-        augmented_prompts = list(exrex.generate(prompt))
-        if not len(augmented_prompts)<=32:
-            return PIL.Image.new(mode="RGB", size=(200, 200),color = (255,255,255))
-        augmented_prompts, polarities = list(map(lambda x: x.replace('~',''), augmented_prompts)), list(map(lambda x: x.__contains__('~'), augmented_prompts))
-
-    with torch.no_grad():
-        # Encode strings to features
-        text_features = model.encode_text(clip.tokenize(augmented_prompts).to(device)).cpu().to(torch.float32)*torch.tensor(list(map(lambda x: -1 if x else 1,polarities))).unsqueeze(1).expand(-1,512)
-
-        # If we have more than one feature vector use their spherical average instead
-        if text_features.shape[0]>1:
-            text_features = spherical_avg(text_features).unsqueeze(0)
-
-        # Use the vector table if it exists, fallback on w_avg if not
-        if os.path.exists('CLIP_vecs.npy'):
-            tmp = torch.nn.functional.cosine_similarity(CLIP_vecs,text_features.cpu())
-            tmp, indexes = torch.topk(tmp,k,dim=0)
-            tmp = torch.softmax(tmp/0.01,dim=-1)
-            ws = G.mapping((seeded_z[indexes]).reshape(-1,G.w_dim).to(device), c=None).cpu()
-            found_w = torch.sum(ws*tmp.unsqueeze(1).unsqueeze(2),dim=0).unsqueeze(0)
-            found_w = found_w.to(device)-G.mapping.w_avg
-        else:
-            found_w = torch.zeros(1,18,512, device=device)
-
-    # Prepare for gradient decent
-    text_features = text_features.to(device)
-    found_w.requires_grad = True
-
-    # Adapted preprocessing routine for connecting StyleGAN to CLIP
-    stylegan_transform = Compose([
-        Resize(224),
-        lambda x: torch.clamp((x+1)/2,min=0,max=1),
-        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
-    ])
-
-    if not continue_opt:
-        optimizer = torch.optim.Adam((found_w,),0.01,betas=(0,0.999))
-
-    for i in tqdm(range(iterations)):
-        optimizer.zero_grad()
-        img = G.synthesis(found_w+G.mapping.w_avg, noise_mode='const', force_fp32=not cuda_available)
-        img = stylegan_transform(img)
-        image_features = model.encode_image(img)
-        loss = -torch.nn.functional.cosine_similarity(image_features,text_features)
-        loss.backward()
-        optimizer.step()
-
-    img = G.synthesis(found_w+G.mapping.w_avg, noise_mode='const', force_fp32=not cuda_available)
-    return PIL.Image.fromarray((img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)[0].cpu().numpy(), 'RGB')
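
For context on the removed version: inference treated the prompt as a regular expression, expanded it with exrex, and used a leading '~' to mark text to steer away from. A minimal sketch of that expansion, with a hypothetical prompt and assuming the exrex package is installed:

import exrex

# Hypothetical prompt; exrex.generate yields one string per regex alternative.
prompts = list(exrex.generate("a photo of a (smiling|~frowning) face"))
# e.g. ['a photo of a smiling face', 'a photo of a ~frowning face']

# The removed code strips the '~' and flips the sign of that prompt's CLIP
# text feature, so '~'-marked text pushes the optimization away from it.
polarities = ['~' in p for p in prompts]
prompts = [p.replace('~', '') for p in prompts]
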
 import os
+os.system("pip install --upgrade torch==1.9.1+cu111 torchvision==0.10.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html")
+os.system("git clone https://github.com/NVlabs/stylegan3")
+os.system("git clone https://github.com/openai/CLIP")
+os.system("pip install -e ./CLIP")
+os.system("pip install einops ninja")
+
+import sys
+sys.path.append('./CLIP')
+sys.path.append('./stylegan3')
+
+import io
+import os, time
 import pickle
+import shutil
 import numpy as np
+from PIL import Image
 import torch
+import torch.nn.functional as F
+import requests
+import torchvision.transforms as transforms
+import torchvision.transforms.functional as TF
 import clip
 from tqdm.notebook import tqdm
+from torchvision.transforms import Compose, Resize, ToTensor, Normalize
+from einops import rearrange

+device = torch.device('cuda:0')


+def fetch(url_or_path):
+    if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'):
+        r = requests.get(url_or_path)
+        r.raise_for_status()
+        fd = io.BytesIO()
+        fd.write(r.content)
+        fd.seek(0)
+        return fd
+    return open(url_or_path, 'rb')

+def fetch_model(url_or_path):
+    basename = os.path.basename(url_or_path)
+    if os.path.exists(basename):
+        return basename
+    else:
+        os.system(f"wget -c '{url_or_path}'")  # was the notebook-only `!wget -c '{url_or_path}'`, a syntax error in a plain script
+        return basename

+def norm1(prompt):
+    "Normalize to the unit sphere."
+    return prompt / prompt.square().sum(dim=-1,keepdim=True).sqrt()

+def spherical_dist_loss(x, y):
+    x = F.normalize(x, dim=-1)
+    y = F.normalize(y, dim=-1)
+    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)

+class MakeCutouts(torch.nn.Module):
+    def __init__(self, cut_size, cutn, cut_pow=1.):
+        super().__init__()
+        self.cut_size = cut_size
+        self.cutn = cutn
+        self.cut_pow = cut_pow
+
+    def forward(self, input):
+        sideY, sideX = input.shape[2:4]
+        max_size = min(sideX, sideY)
+        min_size = min(sideX, sideY, self.cut_size)
+        cutouts = []
+        for _ in range(self.cutn):
+            size = int(torch.rand([])**self.cut_pow * (max_size - min_size) + min_size)
+            offsetx = torch.randint(0, sideX - size + 1, ())
+            offsety = torch.randint(0, sideY - size + 1, ())
+            cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]
+            cutouts.append(F.adaptive_avg_pool2d(cutout, self.cut_size))
+        return torch.cat(cutouts)
+
+make_cutouts = MakeCutouts(224, 32, 0.5)
+
+def embed_image(image):
+    n = image.shape[0]
+    cutouts = make_cutouts(image)
+    embeds = clip_model.embed_cutout(cutouts)
+    embeds = rearrange(embeds, '(cc n) c -> cc n c', n=n)
+    return embeds
+
+def embed_url(url):
+    image = Image.open(fetch(url)).convert('RGB')
+    return embed_image(TF.to_tensor(image).to(device).unsqueeze(0)).mean(0).squeeze(0)
+
+class CLIP(object):
+    def __init__(self):
+        clip_model = "ViT-B/32"
+        self.model, _ = clip.load(clip_model)
+        self.model = self.model.requires_grad_(False)
+        self.normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
+                                              std=[0.26862954, 0.26130258, 0.27577711])
+
+    @torch.no_grad()
+    def embed_text(self, prompt):
+        "Normalized clip text embedding."
+        return norm1(self.model.encode_text(clip.tokenize(prompt).to(device)).float())
+
+    def embed_cutout(self, image):
+        "Normalized clip image embedding."
+        return norm1(self.model.encode_image(self.normalize(image)))
+
+clip_model = CLIP()
+
+# Load stylegan model
+
+base_url = "https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan3/versions/1/files/"
+model_name = "stylegan3-t-ffhqu-1024x1024.pkl"
+#model_name = "stylegan3-r-metfacesu-1024x1024.pkl"
+#model_name = "stylegan3-t-afhqv2-512x512.pkl"
+network_url = base_url + model_name
+
+with open(fetch_model(network_url), 'rb') as fp:
+    G = pickle.load(fp)['G_ema'].to(device)
+
+zs = torch.randn([10000, G.mapping.z_dim], device=device)
+w_stds = G.mapping(zs, None).std(0)


 def inference(text):
+    target = clip_model.embed_text(text)
+
+    steps = 600
+    seed = 2
+
+    tf = Compose([
+        Resize(224),
+        lambda x: torch.clamp((x+1)/2,min=0,max=1),
+    ])
+
+    torch.manual_seed(seed)
+    timestring = time.strftime('%Y%m%d%H%M%S')
+
+    with torch.no_grad():
+        qs = []
+        losses = []
+        for _ in range(8):
+            q = (G.mapping(torch.randn([4,G.mapping.z_dim], device=device), None, truncation_psi=0.7) - G.mapping.w_avg) / w_stds
+            images = G.synthesis(q * w_stds + G.mapping.w_avg)
+            embeds = embed_image(images.add(1).div(2))
+            loss = spherical_dist_loss(embeds, target).mean(0)
+            i = torch.argmin(loss)
+            qs.append(q[i])
+            losses.append(loss[i])
+        qs = torch.stack(qs)
+        losses = torch.stack(losses)
+        print(losses)
+        print(losses.shape, qs.shape)
+        i = torch.argmin(losses)
+        q = qs[i].unsqueeze(0)
+
+    q.requires_grad_()
+
+    q_ema = q
+    opt = torch.optim.AdamW([q], lr=0.03, betas=(0.0,0.999))
+    loop = tqdm(range(steps))
+    for i in loop:
+        opt.zero_grad()
+        w = q * w_stds
+        image = G.synthesis(w + G.mapping.w_avg, noise_mode='const')
+        embed = embed_image(image.add(1).div(2))
+        loss = spherical_dist_loss(embed, target).mean()
+        loss.backward()
+        opt.step()
+        loop.set_postfix(loss=loss.item(), q_magnitude=q.std().item())
+
+        q_ema = q_ema * 0.9 + q * 0.1
+        image = G.synthesis(q_ema * w_stds + G.mapping.w_avg, noise_mode='const')
+
+        if i % 10 == 0:
+            # display(TF.to_pil_image(tf(image)[0]))  # notebook-only preview; `display` is undefined in a script
+            pil_image = TF.to_pil_image(image[0].add(1).div(2).clamp(0,1))
+            #os.makedirs(f'samples/{timestring}', exist_ok=True)
+            #pil_image.save(f'samples/{timestring}/{i:04}.jpg')
+
+    return pil_image
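
A note on the added objective: inference first picks the best of 32 random starting latents (8 batches of 4, keeping each batch's argmin), then optimizes q so that CLIP embeddings of random cutouts of the synthesized image stay close to the text embedding under spherical_dist_loss. For unit vectors separated by an angle theta, the chordal distance is 2*sin(theta/2), so the expression above evaluates to theta**2 / 2. A quick standalone check, not part of the app:

import math
import torch
import torch.nn.functional as F

def spherical_dist_loss(x, y):  # same formula as in app.py above
    x, y = F.normalize(x, dim=-1), F.normalize(y, dim=-1)
    return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)

theta = math.pi / 3
x = torch.tensor([[1.0, 0.0]])
y = torch.tensor([[math.cos(theta), math.sin(theta)]])
print(spherical_dist_loss(x, y).item())  # ~0.5483
print(theta ** 2 / 2)                    # 0.5483..., i.e. theta**2 / 2
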
 
 
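The hunk does not show how inference is exposed as the Space's UI; the old version imported gradio and that import is dropped here. If the app still runs as a Gradio demo, the wiring would look roughly like the sketch below (hypothetical, not part of this commit; the "text"/"image" shortcuts are standard Gradio component names):

import gradio as gr

# Hypothetical wiring: expose inference() as a text-to-image demo.
gr.Interface(fn=inference, inputs="text", outputs="image").launch()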