Yannic Kilcher committed on
Commit 5c824be
1 Parent(s): 6f160b3

added interfaces for interpolation and projection

Files changed (6)
  1. .gitignore +1 -0
  2. README.md +5 -0
  3. interface.py +70 -0
  4. interface_projector.py +126 -0
  5. interpolate.py +10 -0
  6. projector.py +54 -5
.gitignore CHANGED
@@ -1,2 +1,3 @@
 __pycache__/
 .cache/
+proj.mp4
README.md CHANGED
@@ -1,3 +1,8 @@
+## Project repo for apes by ykilcher
+
+Note: most of the code is taken from nvlabs/stylegan2-ada-pytorch (original readme below).
+I added gradio interfaces and CLIP projection.
+
 ## StyleGAN2-ADA — Official PyTorch implementation
 
 ![Teaser image](./docs/stylegan2-ada-teaser-1024x252.png)
interface.py ADDED
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

import gradio as gr

import numpy as np
import torch
import pickle
import types

from huggingface_hub import hf_hub_url, cached_download

# with open('../models/gamma500/network-snapshot-010000.pkl', 'rb') as f:
with open(cached_download(hf_hub_url('ykilcher/apes', 'gamma500/network-snapshot-010000.pkl')), 'rb') as f:
    G = pickle.load(f)['G_ema']  # torch.nn.Module

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    G = G.to(device)
else:
    _old_forward = G.forward  # bound method, wrapped below to force fp32 on CPU

    def _new_forward(self, *args, **kwargs):
        kwargs["force_fp32"] = True
        return _old_forward(*args, **kwargs)  # already bound to G, so self is not passed again

    G.forward = types.MethodType(_new_forward, G)

    _old_synthesis_forward = G.synthesis.forward

    def _new_synthesis_forward(self, *args, **kwargs):
        kwargs["force_fp32"] = True
        return _old_synthesis_forward(*args, **kwargs)  # already bound to G.synthesis

    G.synthesis.forward = types.MethodType(_new_synthesis_forward, G.synthesis)


def generate(num_images, interpolate):
    if interpolate:
        z1 = torch.randn([1, G.z_dim])  # latent codes
        z2 = torch.randn([1, G.z_dim])  # latent codes
        zs = torch.cat([z1 + (z2 - z1) * i / max(num_images - 1, 1) for i in range(num_images)], 0)  # avoid /0 for a single image
    else:
        zs = torch.randn([num_images, G.z_dim])  # latent codes
    with torch.no_grad():
        zs = zs.to(device)
        img = G(zs, None, force_fp32=True, truncation_psi=1, noise_mode='const')
        img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
    return img.cpu().numpy()

def greet(num_images, interpolate):
    img = generate(round(num_images), interpolate)
    imgs = list(img)
    if len(imgs) == 1:
        return imgs[0]
    grid_len = int(np.ceil(np.sqrt(len(imgs)))) * 2  # grid is roughly twice as wide as it is tall
    grid_height = int(np.ceil(len(imgs) / grid_len))
    grid = np.zeros((grid_height * imgs[0].shape[0], grid_len * imgs[0].shape[1], 3), dtype=np.uint8)
    for i, img in enumerate(imgs):
        y = (i // grid_len) * img.shape[0]
        x = (i % grid_len) * img.shape[1]
        grid[y:y+img.shape[0], x:x+img.shape[1], :] = img
    return grid


iface = gr.Interface(fn=greet, inputs=[
    gr.inputs.Number(default=1, label="Num Images"),
    gr.inputs.Checkbox(default=False, label="Interpolate")
], outputs="image")
iface.launch()
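
For quick testing without launching the Gradio UI, `generate` can also be called directly. A minimal sketch (not part of this commit; the `PIL` import and the output filenames are illustrative) that saves an interpolation strip frame by frame:

# Sketch only: exercise generate() directly and save each frame (filenames are hypothetical).
import PIL.Image

frames = generate(num_images=8, interpolate=True)   # (8, H, W, 3) uint8
for i, frame in enumerate(frames):
    PIL.Image.fromarray(frame, 'RGB').save(f'interp_{i:02d}.png')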
interface_projector.py ADDED
@@ -0,0 +1,126 @@
#!/usr/bin/env python3

import gradio as gr

import numpy as np
import torch
import pickle
import PIL.Image
import types

from projector import project, imageio, _MODELS

from huggingface_hub import hf_hub_url, cached_download

# with open("../models/gamma500/network-snapshot-010000.pkl", "rb") as f:
# with open("../models/gamma400/network-snapshot-010600.pkl", "rb") as f:
# with open("../models/gamma400/network-snapshot-019600.pkl", "rb") as f:
with open(cached_download(hf_hub_url('ykilcher/apes', 'gamma500/network-snapshot-010000.pkl')), 'rb') as f:
    G = pickle.load(f)["G_ema"]  # torch.nn.Module

device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    G = G.to(device)
else:
    _old_forward = G.forward  # bound method, wrapped below to force fp32 on CPU

    def _new_forward(self, *args, **kwargs):
        kwargs["force_fp32"] = True
        return _old_forward(*args, **kwargs)  # already bound to G, so self is not passed again

    G.forward = types.MethodType(_new_forward, G)

    _old_synthesis_forward = G.synthesis.forward

    def _new_synthesis_forward(self, *args, **kwargs):
        kwargs["force_fp32"] = True
        return _old_synthesis_forward(*args, **kwargs)  # already bound to G.synthesis

    G.synthesis.forward = types.MethodType(_new_synthesis_forward, G.synthesis)


def generate(
    target_image_upload,
    # target_image_webcam,
    num_steps,
    seed,
    learning_rate,
    model_name,
    normalize_for_clip,
    loss_type,
    regularize_noise_weight,
    initial_noise_factor,
):
    seed = round(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    target_image = target_image_upload
    # if target_image is None:
    #     target_image = target_image_webcam
    num_steps = round(num_steps)
    print(type(target_image))
    print(target_image.dtype)
    print(target_image.max())
    print(target_image.min())
    print(target_image.shape)
    target_pil = PIL.Image.fromarray(target_image).convert("RGB")  # center-crop to a square, then resize to the generator resolution
    w, h = target_pil.size
    s = min(w, h)
    target_pil = target_pil.crop(
        ((w - s) // 2, (h - s) // 2, (w + s) // 2, (h + s) // 2)
    )
    target_pil = target_pil.resize(
        (G.img_resolution, G.img_resolution), PIL.Image.LANCZOS
    )
    target_uint8 = np.array(target_pil, dtype=np.uint8)
    target_image = torch.from_numpy(target_uint8.transpose([2, 0, 1])).to(device)
    projected_w_steps = project(
        G,
        target=target_image,
        num_steps=num_steps,
        device=device,
        verbose=True,
        initial_learning_rate=learning_rate,
        model_name=model_name,
        normalize_for_clip=normalize_for_clip,
        loss_type=loss_type,
        regularize_noise_weight=regularize_noise_weight,
        initial_noise_factor=initial_noise_factor,
    )
    with torch.no_grad():
        video = imageio.get_writer('proj.mp4', mode='I', fps=10, codec='libx264', bitrate='16M')  # target and synthesis side by side
        for w in projected_w_steps:
            synth_image = G.synthesis(w.to(device).unsqueeze(0), noise_mode="const")
            synth_image = (synth_image + 1) * (255 / 2)
            synth_image = (
                synth_image.permute(0, 2, 3, 1)
                .clamp(0, 255)
                .to(torch.uint8)[0]
                .cpu()
                .numpy()
            )
            video.append_data(np.concatenate([target_uint8, synth_image], axis=1))
        video.close()
    return synth_image, "proj.mp4"


iface = gr.Interface(
    fn=generate,
    inputs=[
        gr.inputs.Image(source="upload", optional=True),
        # gr.inputs.Image(source="webcam", optional=True),
        gr.inputs.Number(default=250, label="steps"),
        gr.inputs.Number(default=69420, label="seed"),
        gr.inputs.Number(default=0.05, label="learning_rate"),
        gr.inputs.Dropdown(default='RN50', label="model_name", choices=['vgg16', *_MODELS.keys()]),
        gr.inputs.Checkbox(default=True, label="normalize_for_clip"),
        gr.inputs.Dropdown(
            default="l2", label="loss_type", choices=["l2", "l1", "cosine"]
        ),
        gr.inputs.Number(default=1e5, label="regularize_noise_weight"),
        gr.inputs.Number(default=0.05, label="initial_noise_factor"),
    ],
    outputs=["image", "video"],
)
iface.launch(inbrowser=True)
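
The interface returns only the final frame and the proj.mp4 video; the optimized latents are discarded when `generate` returns. A small, hedged extension (not part of this commit; the .npz filename is illustrative) could persist the last latent right after the video loop so it can be re-synthesized later, similar to what the upstream projection script does with its result:

# Sketch only: inside generate(), after video.close(), keep the final projected latent.
final_w = projected_w_steps[-1]                                    # [num_ws, w_dim]
np.savez('projected_w.npz', w=final_w.unsqueeze(0).cpu().numpy())  # hypothetical output file

# Later, the saved latent can be reloaded and re-rendered:
w = torch.from_numpy(np.load('projected_w.npz')['w']).to(device)
img = G.synthesis(w, noise_mode='const')                           # NCHW, dynamic range [-1, +1]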
interpolate.py ADDED
@@ -0,0 +1,10 @@
#!/usr/bin/env python3

import torch
import pickle

with open('../models/gamma500/network-snapshot-010000.pkl', 'rb') as f:
    G = pickle.load(f)['G_ema']  # torch.nn.Module
z = torch.randn([1, G.z_dim])  # latent codes
c = None  # class labels (not used in this example)
img = G(z, c, force_fp32=True)  # NCHW, float32, dynamic range [-1, +1]
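
The script stops at the raw generator output. A short follow-up sketch (not part of this commit; the `PIL` import and the filename are illustrative) converts the [-1, +1] NCHW tensor to uint8 the same way interface.py does and writes it to disk:

# Sketch only: convert the generator output to a uint8 HWC image and save it (filename is hypothetical).
import PIL.Image

img_uint8 = (img.detach().permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
PIL.Image.fromarray(img_uint8[0].cpu().numpy(), 'RGB').save('sample.png')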
projector.py CHANGED
@@ -22,6 +22,18 @@ import torch.nn.functional as F
 import dnnlib
 import legacy
 
+_MODELS = {
+    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
+    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
+    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
+    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
+    "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
+    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
+    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
+    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
+    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
+}
+
 def project(
     G,
     target: torch.Tensor, # [C,H,W] and dynamic range [0,255], W & H must match G output resolution
@@ -35,6 +47,9 @@ def project(
     noise_ramp_length = 0.75,
     regularize_noise_weight = 1e5,
     verbose = False,
+    model_name = 'vgg16',
+    loss_type = 'l2',
+    normalize_for_clip = True,
     device: torch.device
 ):
     assert target.shape == (G.img_channels, G.img_resolution, G.img_resolution)
@@ -56,16 +71,38 @@ def project(
     # Setup noise inputs.
     noise_bufs = { name: buf for (name, buf) in G.synthesis.named_buffers() if 'noise_const' in name }
 
+    USE_CLIP = model_name != 'vgg16'
     # Load VGG16 feature detector.
     url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
+    if USE_CLIP:
+        # url = 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt'
+        # url = 'https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt'
+        # url = 'https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt'
+        # url = 'https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt'
+        url = _MODELS[model_name]
     with dnnlib.util.open_url(url) as f:
         vgg16 = torch.jit.load(f).eval().to(device)
 
     # Features for target image.
     target_images = target.unsqueeze(0).to(device).to(torch.float32)
-    if target_images.shape[2] > 256:
-        target_images = F.interpolate(target_images, size=(256, 256), mode='area')
-    target_features = vgg16(target_images, resize_images=False, return_lpips=True)
+    if USE_CLIP:
+        image_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).to(device)[:, None, None]
+        image_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).to(device)[:, None, None]
+        # target_images = F.interpolate(target_images, size=(224, 224), mode='area')
+        target_images = F.interpolate(target_images, size=(vgg16.input_resolution.item(), vgg16.input_resolution.item()), mode='area')
+        print("target_images.shape:", target_images.shape)
+        def _encode_image(image):
+            image = image / 255.
+            # image = torch.sigmoid(image)
+            if normalize_for_clip:
+                image = (image - image_mean) / image_std
+            return vgg16.encode_image(image)
+        target_features = _encode_image(target_images.clamp(0, 255))
+        target_features = target_features.detach()
+    else:
+        if target_images.shape[2] > 256:
+            target_images = F.interpolate(target_images, size=(256, 256), mode='area')
+        target_features = vgg16(target_images, resize_images=False, return_lpips=True)
 
     w_opt = torch.tensor(w_avg, dtype=torch.float32, device=device, requires_grad=True) # pylint: disable=not-callable
     w_out = torch.zeros([num_steps] + list(w_opt.shape[1:]), dtype=torch.float32, device=device)
@@ -98,8 +135,20 @@ def project(
             synth_images = F.interpolate(synth_images, size=(256, 256), mode='area')
 
         # Features for synth images.
-        synth_features = vgg16(synth_images, resize_images=False, return_lpips=True)
-        dist = (target_features - synth_features).square().sum()
+        if USE_CLIP:
+            synth_images = F.interpolate(synth_images, size=(vgg16.input_resolution.item(), vgg16.input_resolution.item()), mode='area')
+            synth_features = _encode_image(synth_images)
+            if loss_type == 'cosine':
+                target_features_normalized = target_features / target_features.norm(dim=-1, keepdim=True).detach()
+                synth_features_normalized = synth_features / synth_features.norm(dim=-1, keepdim=True).detach()
+                dist = 1.0 - torch.sum(synth_features_normalized * target_features_normalized)
+            elif loss_type == 'l1':
+                dist = (target_features - synth_features).abs().sum()
+            else:
+                dist = (target_features - synth_features).square().sum()
+        else:
+            synth_features = vgg16(synth_images, resize_images=False, return_lpips=True)
+            dist = (target_features - synth_features).square().sum()
 
         # Noise regularization.
         reg_loss = 0.0
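
With these additions, `project` can be driven either by the original VGG16/LPIPS objective or by a CLIP image encoder chosen from `_MODELS`. A minimal sketch of calling it directly, outside the Gradio interface (the snapshot path and target filename are placeholders, not part of this commit):

# Sketch only: CLIP-guided projection from a script (file paths are hypothetical).
import pickle
import numpy as np
import PIL.Image
import torch
from projector import project

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with open('network-snapshot.pkl', 'rb') as f:
    G = pickle.load(f)['G_ema'].to(device)

# Preprocess the target the same way interface_projector.py does: square RGB at G.img_resolution.
pil = PIL.Image.open('target.png').convert('RGB').resize(
    (G.img_resolution, G.img_resolution), PIL.Image.LANCZOS)
target = torch.from_numpy(np.array(pil, dtype=np.uint8).transpose([2, 0, 1])).to(device)

w_steps = project(G, target=target, num_steps=250, device=device,
                  model_name='RN50', loss_type='cosine', normalize_for_clip=True, verbose=True)
final_w = w_steps[-1].unsqueeze(0)                 # [1, num_ws, w_dim]
synth = G.synthesis(final_w, noise_mode='const')   # NCHW, dynamic range [-1, +1]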