rinong committed
Commit fcf0449
1 Parent(s): 6104a4e

Added StyleCLIP support

Files changed (3)
  1. app.py +32 -6
  2. generate_videos.py +2 -2
  3. styleclip/styleclip_global.py +158 -0
app.py CHANGED
@@ -19,12 +19,16 @@ from torchvision import utils
 
 from model.sg2_model import Generator
 from generate_videos import generate_frames, video_from_interpolations, project_code_by_edit_name
+from styleclip.styleclip_global import project_code_with_styleclip, style_tensor_to_style_dict
+
+import clip
 
 model_dir = "models"
 os.makedirs(model_dir, exist_ok=True)
 
 model_repos = {"e4e": ("akhaliq/JoJoGAN_e4e_ffhq_encode", "e4e_ffhq_encode.pt"),
                "dlib": ("akhaliq/jojogan_dlib", "shape_predictor_68_face_landmarks.dat"),
+               "sc_fs3": ("rinong/stylegan-nada-models", "fs3.npy"),
                "base": ("akhaliq/jojogan-stylegan2-ffhq-config-f", "stylegan2-ffhq-config-f.pt"),
                "anime": ("rinong/stylegan-nada-models", "anime.pt"),
                "joker": ("rinong/stylegan-nada-models", "joker.pt"),
@@ -70,7 +74,7 @@ class ImageEditor(object):
 
         self.generators = {}
 
-        self.model_list = [name for name in model_paths.keys() if name not in ["e4e", "dlib"]]
+        self.model_list = [name for name in model_paths.keys() if name not in ["e4e", "dlib", "sc_fs3"]]
 
         for model in self.model_list:
             g_ema = Generator(
@@ -108,6 +112,10 @@ class ImageEditor(object):
             model_paths["dlib"]
         )
 
+        self.styleclip_fs3 = torch.from_numpy(np.load(model_paths["sc_fs3"])).to(self.device)
+
+        self.clip_model, _ = clip.load("ViT-B/32", device=self.device)
+
         print("setup complete")
 
     def get_style_list(self):
@@ -186,7 +194,15 @@ class ImageEditor(object):
                 target_latents.append(project_code_by_edit_name(np_source_latent, attribute_name, strength))
 
         elif edit_choices["edit_type"] == "StyleCLIP":
-            pass
+            source_s_dict = generators[0].get_s_code(source_latent, input_is_latent=True)
+            target_latents.append(project_code_with_styleclip(source_s_dict,
+                                                              edit_choices["src_text"],
+                                                              edit_choices["tar_text"],
+                                                              edit_choices["alpha"],
+                                                              edit_choices["beta"],
+                                                              generators[0],
+                                                              self.styleclip_fs3,
+                                                              self.clip_model))
 
         # if edit type is none or if all slides were set to 0
         if not target_latents:
@@ -228,9 +244,13 @@ class ImageEditor(object):
         with torch.no_grad():
             for g_ema in generators:
                 latent_for_gen = random.choice(target_latents)
-                latent_for_gen = [torch.from_numpy(latent_for_gen).float().to(self.device)]
 
-                img, _ = g_ema(latent_for_gen, input_is_latent=True, truncation=1, randomize_noise=False)
+                if edit_choices["edit_type"] == "StyleCLIP":
+                    latent_for_gen = style_tensor_to_style_dict(latent_for_gen, g_ema)
+                    img, _ = g_ema(latent_for_gen, input_is_s_code=True, input_is_latent=True, truncation=1, randomize_noise=False)
+                else:
+                    latent_for_gen = [torch.from_numpy(latent_for_gen).float().to(self.device)]
+                    img, _ = g_ema(latent_for_gen, input_is_latent=True, truncation=1, randomize_noise=False)
 
                 output_path = os.path.join(out_dir, f"out_{len(output_paths)}.jpg")
                 utils.save_image(img, output_path, nrow=1, normalize=True, range=(-1, 1))
@@ -294,6 +314,9 @@ with blocks:
     gr.Markdown(
         "For more information about the paper and code for training your own models (with examples OR text), see below."
     )
+
+
+    gr.Markdown("<h4 style='font-size: 110%;margin-top:.5em'>On biases</h4><div>This model relies on StyleGAN and CLIP, both of which are prone to biases, such as poor representation of minorities or the reinforcement of societal stereotypes like gender norms.</div>")
 
     with gr.Row():
         input_img = gr.inputs.Image(type="filepath", label="Input image")
@@ -306,7 +329,8 @@ with blocks:
             with gr.Tabs():
                 with gr.TabItem("InterFaceGAN Editing Options"):
                     gr.Markdown("Move the sliders to make the chosen attribute stronger (e.g. the person older) or leave at 0 to disable editing.")
-                    gr.Markdown("If multiple options are provided, they will be used randomly between images (or sequentially for a video), <u>not</u> together")
+                    gr.Markdown("If multiple options are provided, they will be used randomly between images (or sequentially for a video), <u>not</u> together.")
+                    gr.Markdown("Please note that some directions may be entangled. For example, hair length adjustments are likely to also modify the perceived gender.")
 
                     pose_slider = gr.Slider(label="Pose", minimum=-1, maximum=1, value=0, step=0.05)
                     smile_slider = gr.Slider(label="Smile", minimum=-1, maximum=1, value=0, step=0.05)
@@ -343,7 +367,9 @@ with blocks:
             with gr.Row():
                 vid_button = gr.Button("Generate Video")
                 loop_styles = gr.inputs.Checkbox(default=True, label="Loop video back to the initial style?")
-
+            with gr.Row():
+                gr.Markdown("Warning: Video generation requires the synthesis of hundreds of frames and is expected to take several minutes.")
+                gr.Markdown("To reduce queue times, we significantly reduced the number of video frames. Using more than 3 styles will further reduce the frames per style, leading to quicker transitions. For better control, we recommend cloning the gradio app, adjusting `num_alphas` in `generate_videos.py`, and running the code locally.")
         with gr.Column():
             vid_output = gr.outputs.Video(label="Output Video")
 
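Taken together, the app.py changes route StyleCLIP edits through the generator's S space: the source latent is converted to a per-layer style dict, shifted along a text-derived global direction, and decoded with `input_is_s_code=True`. Below is a minimal standalone sketch of that path; it assumes `g_ema` is this repo's `Generator` (exposing `get_s_code` and `modulation_layers`) with weights already loaded, that `w` is an inverted W+ latent on `device`, and that the prompts, `alpha`, `beta`, and the `fs3.npy` location are illustrative placeholders.

```python
# Hypothetical standalone sketch of the StyleCLIP path wired up in app.py above.
import clip
import numpy as np
import torch

from styleclip.styleclip_global import project_code_with_styleclip, style_tensor_to_style_dict

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, _ = clip.load("ViT-B/32", device=device)
fs3 = torch.from_numpy(np.load("models/fs3.npy")).to(device)  # global channel-relevance matrix (path illustrative)

# g_ema: this repo's Generator with weights loaded; w: a [1, 18, 512] W+ latent (e.g. an e4e inversion).
source_s_dict = g_ema.get_s_code(w, input_is_latent=True)      # per-layer S-space code of the source
edited_s = project_code_with_styleclip(source_s_dict,
                                       "face", "smiling face",  # neutral / target prompts (illustrative)
                                       4.0, 0.13,               # alpha (edit strength), beta (sparsity threshold)
                                       g_ema, fs3, clip_model)

# Convert the flat style tensor back into a per-layer dict and decode directly from S space.
edited_s_dict = style_tensor_to_style_dict(edited_s, g_ema)
img, _ = g_ema(edited_s_dict, input_is_s_code=True, input_is_latent=True,
               truncation=1, randomize_noise=False)
```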
generate_videos.py CHANGED
@@ -62,14 +62,14 @@ def generate_frames(source_latent, target_latents, g_ema_list, output_dir):
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
 
-    num_alphas = min(20, 60 // len(target_latents))
+    num_alphas = min(10, 30 // len(target_latents))
 
     alphas = np.linspace(0, 1, num=num_alphas)
 
     latents = interpolate_with_target_latents(source_latent, target_latents, alphas)
 
     segments = len(g_ema_list) - 1
-
+
     if segments:
         segment_length = len(latents) / segments
 
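The lower cap on `num_alphas` is what the new frame-count warning in app.py refers to: each interpolation segment now gets at most 10 steps, and the budget shrinks further as more target latents are queued. A quick, purely illustrative comparison of the old and new formulas:

```python
# Interpolation steps per transition under the old and new caps (illustrative target counts).
for n_targets in (1, 2, 3, 4, 6):
    old = min(20, 60 // n_targets)
    new = min(10, 30 // n_targets)
    print(f"{n_targets} target latent(s): {old} -> {new} steps per transition")

# 1 target latent(s): 20 -> 10 steps per transition
# 2 target latent(s): 20 -> 10 steps per transition
# 3 target latent(s): 20 -> 10 steps per transition
# 4 target latent(s): 15 -> 7 steps per transition
# 6 target latent(s): 10 -> 5 steps per transition
```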
styleclip/styleclip_global.py ADDED
@@ -0,0 +1,158 @@
+import numpy as np
+import torch
+from tqdm import tqdm
+from pathlib import Path
+import os
+
+import clip
+
+imagenet_templates = [
+    'a bad photo of a {}.',
+    'a photo of many {}.',
+    'a sculpture of a {}.',
+    'a photo of the hard to see {}.',
+    'a low resolution photo of the {}.',
+    'a rendering of a {}.',
+    'graffiti of a {}.',
+    'a bad photo of the {}.',
+    'a cropped photo of the {}.',
+    'a tattoo of a {}.',
+    'the embroidered {}.',
+    'a photo of a hard to see {}.',
+    'a bright photo of a {}.',
+    'a photo of a clean {}.',
+    'a photo of a dirty {}.',
+    'a dark photo of the {}.',
+    'a drawing of a {}.',
+    'a photo of my {}.',
+    'the plastic {}.',
+    'a photo of the cool {}.',
+    'a close-up photo of a {}.',
+    'a black and white photo of the {}.',
+    'a painting of the {}.',
+    'a painting of a {}.',
+    'a pixelated photo of the {}.',
+    'a sculpture of the {}.',
+    'a bright photo of the {}.',
+    'a cropped photo of a {}.',
+    'a plastic {}.',
+    'a photo of the dirty {}.',
+    'a jpeg corrupted photo of a {}.',
+    'a blurry photo of the {}.',
+    'a photo of the {}.',
+    'a good photo of the {}.',
+    'a rendering of the {}.',
+    'a {} in a video game.',
+    'a photo of one {}.',
+    'a doodle of a {}.',
+    'a close-up photo of the {}.',
+    'a photo of a {}.',
+    'the origami {}.',
+    'the {} in a video game.',
+    'a sketch of a {}.',
+    'a doodle of the {}.',
+    'a origami {}.',
+    'a low resolution photo of a {}.',
+    'the toy {}.',
+    'a rendition of the {}.',
+    'a photo of the clean {}.',
+    'a photo of a large {}.',
+    'a rendition of a {}.',
+    'a photo of a nice {}.',
+    'a photo of a weird {}.',
+    'a blurry photo of a {}.',
+    'a cartoon {}.',
+    'art of a {}.',
+    'a sketch of the {}.',
+    'a embroidered {}.',
+    'a pixelated photo of a {}.',
+    'itap of the {}.',
+    'a jpeg corrupted photo of the {}.',
+    'a good photo of a {}.',
+    'a plushie {}.',
+    'a photo of the nice {}.',
+    'a photo of the small {}.',
+    'a photo of the weird {}.',
+    'the cartoon {}.',
+    'art of the {}.',
+    'a drawing of the {}.',
+    'a photo of the large {}.',
+    'a black and white photo of a {}.',
+    'the plushie {}.',
+    'a dark photo of a {}.',
+    'itap of a {}.',
+    'graffiti of the {}.',
+    'a toy {}.',
+    'itap of my {}.',
+    'a photo of a cool {}.',
+    'a photo of a small {}.',
+    'a tattoo of the {}.',
+]
+
+FFHQ_CODE_INDICES = [(0, 512), (512, 1024), (1024, 1536), (1536, 2048), (2560, 3072), (3072, 3584), (4096, 4608), (4608, 5120), (5632, 6144), (6144, 6656), (7168, 7680), (7680, 7936), (8192, 8448), (8448, 8576), (8704, 8832), (8832, 8896), (8960, 9024), (9024, 9056)] + \
+                    [(2048, 2560), (3584, 4096), (5120, 5632), (6656, 7168), (7936, 8192), (8576, 8704), (8896, 8960), (9056, 9088)]
+
+def zeroshot_classifier(model, classnames, templates, device):
+
+    with torch.no_grad():
+        zeroshot_weights = []
+        for classname in tqdm(classnames):
+            texts = [template.format(classname) for template in templates]  # format with class
+            texts = clip.tokenize(texts).to(device)  # tokenize
+            class_embeddings = model.encode_text(texts)  # embed with text encoder
+            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
+            class_embedding = class_embeddings.mean(dim=0)
+            class_embedding /= class_embedding.norm()
+            zeroshot_weights.append(class_embedding)
+        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
+    return zeroshot_weights
+
+
+def get_direction(neutral_class, target_class, beta, di, clip_model=None):
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if clip_model is None:
+        clip_model, _ = clip.load("ViT-B/32", device=device)
+
+    class_names = [neutral_class, target_class]
+    class_weights = zeroshot_classifier(clip_model, class_names, imagenet_templates, device)
+
+    dt = class_weights[:, 1] - class_weights[:, 0]
+    dt = dt / dt.norm()
+    relevance = di @ dt
+    mask = relevance.abs() > beta
+    direction = relevance * mask
+    direction_max = direction.abs().max()
+    if direction_max > 0:
+        direction = direction / direction_max
+    else:
+        raise ValueError(f'Beta value {beta} is too high for mapping from {neutral_class} to {target_class},'
+                         f' try setting it to a lower value')
+    return direction
+
+def style_tensor_to_style_dict(style_tensor, reference_generator):
+    style_layers = reference_generator.modulation_layers
+
+    style_dict = {}
+    for layer_idx, layer in enumerate(style_layers):
+        style_dict[layer] = style_tensor[:, FFHQ_CODE_INDICES[layer_idx][0]:FFHQ_CODE_INDICES[layer_idx][1]]
+
+    return style_dict
+
+def style_dict_to_style_tensor(style_dict, reference_generator):
+    style_layers = reference_generator.modulation_layers
+
+    style_tensor = torch.zeros(1, 9088)
+    for layer in style_dict:
+        layer_idx = style_layers.index(layer)
+        style_tensor[:, FFHQ_CODE_INDICES[layer_idx][0]:FFHQ_CODE_INDICES[layer_idx][1]] = style_dict[layer]
+
+    return style_tensor
+
+def project_code_with_styleclip(source_latent, source_class, target_class, alpha, beta, reference_generator, di, clip_model=None):
+    edit_direction = get_direction(source_class, target_class, beta, di, clip_model)
+
+    source_s = style_dict_to_style_tensor(source_latent, reference_generator)
+
+    return source_s.to(edit_direction.device) + alpha * edit_direction
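`get_direction` is the core of the new file: the two prompts are embedded with CLIP's prompt-ensembling templates, their normalized difference `dt` is projected onto the precomputed channel-relevance matrix `di` (the `fs3.npy` tensor, one row per S-space channel), and channels whose relevance magnitude falls below `beta` are zeroed. Larger `beta` therefore yields sparser, more disentangled edits, up to the point where every channel is masked and a ValueError is raised. A small sketch of that trade-off; the prompts, `beta` values, and the `fs3.npy` location are illustrative:

```python
# Sketch: how beta controls the sparsity of a StyleCLIP global direction.
import clip
import numpy as np
import torch

from styleclip.styleclip_global import get_direction

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, _ = clip.load("ViT-B/32", device=device)
fs3 = torch.from_numpy(np.load("models/fs3.npy")).to(device)  # one row per S-space channel (path illustrative)

for beta in (0.10, 0.15, 0.20):
    try:
        direction = get_direction("face", "face with glasses", beta, fs3, clip_model)
        active = int((direction != 0).sum())
        print(f"beta={beta}: {active} / {direction.numel()} style channels affected")
    except ValueError as err:  # raised when beta masks out every channel for this prompt pair
        print(f"beta={beta}: {err}")
```

In `project_code_with_styleclip`, `alpha` then scales this normalized direction before it is added to the flattened S code of the source image.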