import argparse
import copy
import os
import time
from tqdm import tqdm

import numpy as np
import PIL.Image
import torch

import clip
from wrapper import (FaceLandmarksDetector, Generator_wrapper, 
                     VGGFeatExtractor, e4eEncoder, PivotTuning)
from projector import project 

class Manipulator():
    """Manipulator for style editing

    in paper, use 100 image pairs to estimate the mean for alpha(magnitude of the perturbation) [-5, 5]

    *** Args ***
    G : Genertor wrapper for synthesis styles
    device : torch.device
    lst_alpha : magnitude of the perturbation
    num_images : num images to process

    *** Attributes ***
    S :  List[dict(str, torch.Tensor)] # length 2,000
    styles : List[dict(str, torch.Tensor)] # length of num_images
                (num_images, style)
    lst_alpha : List[int]
    boundary : (num_images, len_alpha)
    edited_styles : List[styles]
    edited_images : List[(num_images, 3, 1024, 1024)]
    """
    def __init__(
        self, 
        G, 
        device, 
        lst_alpha=[0], 
        num_images=1, 
        start_ind=0, 
        face_preprocess=True,
        dataset_name=''
    ):
        """Initialize 
        - use pre-saved generated latent/style from random Z
        - to use projection, used method "set_real_img_projection"
        """
        assert start_ind + num_images <= 2000
        self.W = torch.load(f'tensor/W{dataset_name}.pt')
        self.S = torch.load(f'tensor/S{dataset_name}.pt')
        self.S_mean = torch.load(f'tensor/S_mean{dataset_name}.pt')
        self.S_std = torch.load(f'tensor/S_std{dataset_name}.pt')

        self.S = {layer: self.S[layer].to(device) for layer in G.style_layers}
        self.styles = {layer: self.S[layer][start_ind:start_ind+num_images] for layer in G.style_layers}
        self.latent = self.W[start_ind:start_ind+num_images]
        self.latent = self.latent.to(device)
        del self.W
        del self.S

        # S_mean, S_std for extracting global style direction
        self.S_mean = {layer: self.S_mean[layer].to(device) for layer in G.style_layers}
        self.S_std = {layer: self.S_std[layer].to(device) for layer in G.style_layers}

        # setting
        self.face_preprocess = face_preprocess
        if face_preprocess:
            self.landmarks_detector = FaceLandmarksDetector()
        self.vgg16 = VGGFeatExtractor(device).module
        self.W_projector_steps = 200
        self.G = G
        self.device = device
        self.num_images = num_images
        self.lst_alpha = lst_alpha
        self.manipulate_layers = [layer for layer in G.style_layers if 'torgb' not in layer] 

    def set_alpha(self, lst_alpha):
        """Setter for alpha
        """
        self.lst_alpha = lst_alpha

    def set_real_img_projection(self, img, inv_mode='w', pti_mode=None):
        """Set real img instead of pre-saved styles
        Args : 
        - img : img directory or img file path to manipulate
            - face aligned if self.face_preprocess == True
            - set self.num_images
        - inv_mode : inversion mode, setting self.latent, self.styles
            - w : use W projector (projector.project)
            - w+ : use e4e encoder (wrapper.e4eEncoder)
        - pti_mode : pivot tuning inversion mode (wrapper.PivotTuning)
            - None
            - w : W latent pivot tuning
            - s : S style pivot tuning
        """
        assert inv_mode in ['w', 'w+']
        assert pti_mode in [None, 'w', 's']
        allowed_extensions = ['jpg', 'JPG', 'jpeg', 'JPEG', 'png', 'PNG']

        # img directory input
        if os.path.isdir(img):
            imgpaths = sorted(os.listdir(img))
            imgpaths = [os.path.join(img, imgpath) 
                        for imgpath in imgpaths 
                        if imgpath.split('.')[-1] in allowed_extensions]
        # img file path input
        else:
            imgpaths = [img]

        self.num_images = len(imgpaths)
        if inv_mode == 'w':
            targets = list()
            target_pils = list()
            for imgpath in imgpaths:
                if self.face_preprocess:
                    target_pil = self.landmarks_detector(imgpath)
                else:
                    target_pil = PIL.Image.open(imgpath).convert('RGB')
                target_pils.append(target_pil)
                w, h = target_pil.size
                s = min(w, h)
                target_pil = target_pil.crop(((w - s) // 2, (h - s) // 2, (w + s) // 2, (h + s) // 2))
                target_pil = target_pil.resize((self.G.G.img_resolution, self.G.G.img_resolution), 
                                                PIL.Image.LANCZOS)
                target_uint8 = np.array(target_pil, dtype=np.uint8)
                targets.append(torch.Tensor(target_uint8.transpose([2,0,1])).to(self.device))

            self.latent = list()
            for target in tqdm(targets, total=len(targets)):
                projected_w_steps = project(
                    self.G.G,
                    self.vgg16,
                    target=target,
                    num_steps=self.W_projector_steps, # TODO get projector steps from configs
                    device=self.device,
                    verbose=False,
                )
                self.latent.append(projected_w_steps[-1])
            self.latent = torch.stack(self.latent)
            self.styles = self.G.mapping_stylespace(self.latent)

        else: # inv_mode == 'w+'
            # use e4e encoder
            target_pils = list()
            for imgpath in imgpaths:
                if self.face_preprocess:
                    target_pil = self.landmarks_detector(imgpath)
                else:
                    target_pil = PIL.Image.open(imgpath).convert('RGB')
                target_pils.append(target_pil)

            self.encoder = e4eEncoder(self.device)
            self.latent = self.encoder(target_pils)
            self.styles = self.G.mapping_stylespace(self.latent)

        if pti_mode is not None: # w or s
            # pivot tuning inversion 
            pti = PivotTuning(self.device, self.G.G, mode=pti_mode)
            new_G = pti(self.latent, target_pils)
            self.G.G = new_G

    def manipulate(self, delta_s):
        """Edit style by given delta_style
        - use perturbation (delta s) * (alpha) as a boundary
        """
        styles = [copy.deepcopy(self.styles) for _ in range(len(self.lst_alpha))]

        for (alpha, style) in zip(self.lst_alpha, styles):
            for layer in self.G.style_layers:
                perturbation = delta_s[layer] * alpha
                style[layer] += perturbation
        return styles

    def manipulate_one_channel(self, layer, channel_ind:int):
        """Edit style from given layer, channel index
        - use mean value of pre-saved style
        - use perturbation (pre-saved style std) * (alpha) as a boundary
        """
        assert layer in self.G.style_layers
        assert 0 <= channel_ind < self.styles[layer].shape[1]
        boundary = self.S_std[layer][channel_ind].item()
        # apply self.S_mean value for given layer, channel_ind
        for ind in range(self.num_images):
            self.styles[layer][ind][channel_ind] = self.S_mean[layer][channel_ind]
        styles = [copy.deepcopy(self.styles) for _ in range(len(self.lst_alpha))]
        
        perturbation = (torch.Tensor(self.lst_alpha) * boundary).numpy().tolist()
       
        # apply one channel manipulation
        for img_ind in range(self.num_images):
            for edit_ind, delta in enumerate(perturbation):
                styles[edit_ind][layer][img_ind][channel_ind] += delta

        return styles

    def synthesis_from_styles(self, styles, start_ind, end_ind):
        """Synthesis edited styles from styles, lst_alpha
        """
        styles_ = list()
        for style in styles:
            style_ = dict()
            for layer in self.G.style_layers:
                style_[layer] = style[layer][start_ind:end_ind].to(self.device)
            styles_.append(style_)
        print("synthesis_from_styles", type(style_))
        imgs = [self.G.synthesis_from_stylespace(self.latent[start_ind:end_ind], style_).cpu() 
                for style_ in styles_]
        return imgs
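
# A minimal usage sketch of Manipulator (illustrative only; the image path and the
# delta_s placeholder below are assumptions, not assets shipped with this file):
#
#   device = torch.device('cuda:0')
#   G = Generator_wrapper('pretrained/ffhq.pkl', device)
#   manipulator = Manipulator(G, device, lst_alpha=[-5, 0, 5], num_images=1)
#   manipulator.set_real_img_projection('sample.png', inv_mode='w+', pti_mode=None)
#   delta_s = ...  # dict of per-layer style offsets, e.g. built from the saved fs3 direction
#   styles = manipulator.manipulate(delta_s)
#   imgs = manipulator.synthesis_from_styles(styles, 0, manipulator.num_images)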


def extract_global_direction(G, device, lst_alpha, num_images, dataset_name=''):
    """Extract global style direction in 100 images
    """
    assert len(lst_alpha) == 2
    model, preprocess = clip.load("ViT-B/32", device=device)
    
    # corresponds to "lindex" in the original TensorFlow implementation
    manipulate_layers = [layer for layer in G.style_layers if 'torgb' not in layer] 

    # total channel: 6048 (1024 resolution)
    resolution = G.G.img_resolution
    latent = torch.randn([1,G.to_w_idx[f'G.synthesis.b{resolution}.torgb.affine']+1,512]).to(device) # 1024 -> 18, 512 -> 16, 256 -> 14
    style = G.mapping_stylespace(latent)
    cnt = 0
    for layer in manipulate_layers:
        cnt += style[layer].shape[1]
    del latent
    del style

    # 1024 -> 6048 channels, 256 -> 4928 channels
    print(f"total channels to manipulate: {cnt}")
    
    manipulator = Manipulator(G, device, lst_alpha, num_images, face_preprocess=False, dataset_name=dataset_name)

    all_feats = list()

    for layer in manipulate_layers:
        print(f'\nStyle manipulation in layer "{layer}"')
        channel_num = manipulator.styles[layer].shape[1]

        for channel_ind in tqdm(range(channel_num), total=channel_num):
            styles = manipulator.manipulate_one_channel(layer, channel_ind)
            # 2 alphas * num_images images, processed in batches of batchsize
            batchsize = 10
            nbatch = num_images // batchsize
            feats = list()
            for batch_ind in range(nbatch): # 2 * batchsize images per batch
                start = batch_ind * batchsize
                end = start + batchsize
                synth_imgs = manipulator.synthesis_from_styles(styles, start, end)
                synth_imgs = [(synth_img.permute(0,2,3,1)*127.5+128).clamp(0,255).to(torch.uint8).numpy()
                            for synth_img in synth_imgs]
                imgs = list()
                for i in range(batchsize):
                    img0 = PIL.Image.fromarray(synth_imgs[0][i])
                    img1 = PIL.Image.fromarray(synth_imgs[1][i])
                    imgs.append(preprocess(img0).unsqueeze(0).to(device))
                    imgs.append(preprocess(img1).unsqueeze(0).to(device))
                with torch.no_grad():
                    feat = model.encode_image(torch.cat(imgs))
                feats.append(feat)
            all_feats.append(torch.cat(feats).view([-1, 2, 512]).cpu())

    all_feats = torch.stack(all_feats).numpy()
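
    # all_feats has shape (total_channels, num_images, 2, 512): for each manipulated
    # channel, the CLIP features of every image rendered at alpha = lst_alpha[0]
    # (index 0) and alpha = lst_alpha[1] (index 1). The global direction fs3 is the
    # per-channel difference of the normalized features, averaged over images and
    # re-normalized.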

    fs = all_feats
    fs1 = fs / np.linalg.norm(fs, axis=-1)[:, :, :, None]
    fs2 = fs1[:, :, 1, :] - fs1[:, :, 0, :] # 5*sigma - (-5)*sigma
    fs3 = fs2 / np.linalg.norm(fs2, axis=-1)[:, :, None]
    fs3 = fs3.mean(axis=1)
    fs3 = fs3 / np.linalg.norm(fs3, axis=-1)[:, None]

    np.save(f'tensor/fs3{dataset_name}.npy', fs3) # global style direction


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('runtype', type=str, nargs='?', default='test')
    parser.add_argument('--ckpt', type=str, default='pretrained/ffhq.pkl')
    # argparse's type=bool treats any non-empty string (e.g. 'False') as True, so parse the flag explicitly
    parser.add_argument('--face_preprocess', type=lambda x: str(x).lower() in ('true', '1', 'yes'), default=True)
    parser.add_argument('--dataset_name', type=str, default='')
    args = parser.parse_args()
    
    runtype = args.runtype
    assert runtype in ['test', 'extract'] 

    device = torch.device('cuda:0')
    ckpt = args.ckpt
    G = Generator_wrapper(ckpt, device)

    face_preprocess = args.face_preprocess
    dataset_name = args.dataset_name

    if runtype == 'test': # test manipulator
        num_images = 100
        lst_alpha = [-5, 0, 5]
        layer = G.style_layers[6]
        channel_ind = 501
        manipulator = Manipulator(G, device, lst_alpha, num_images, face_preprocess=face_preprocess, dataset_name=dataset_name)
        styles = manipulator.manipulate_one_channel(layer, channel_ind)
        start_ind, end_ind = 0, 10
        imgs = manipulator.synthesis_from_styles(styles, start_ind, end_ind)
        print(len(imgs), imgs[0].shape)

    elif runtype == 'extract': # extract global style direction from "tensor/S.pt"
        num_images = 100
        lst_alpha = [-5, 5]
        extract_global_direction(G, device, lst_alpha, num_images, dataset_name=dataset_name)