# inversion_testing/ris/manipulator.py
import argparse
import copy
import os
from tqdm import tqdm
import numpy as np
import PIL.Image
import torch
import clip
from wrapper import (FaceLandmarksDetector, Generator_wrapper,
VGGFeatExtractor, e4eEncoder, PivotTuning)
from projector import project
class Manipulator():
    """Manipulator for style editing

    As in the paper, 100 image pairs are used to estimate the edit direction,
    with alpha (the magnitude of the perturbation) in [-5, 5].

    *** Args ***
    G : Generator wrapper for synthesizing from styles
    device : torch.device
    lst_alpha : magnitudes of the perturbation
    num_images : number of images to process

    *** Attributes ***
    S : Dict[str, torch.Tensor]       # per layer, 2,000 pre-saved styles
    styles : Dict[str, torch.Tensor]  # per layer, (num_images, style_dim)
    lst_alpha : List[int]
    boundary : (num_images, len_alpha)
    edited_styles : List[styles]
    edited_images : List[(num_images, 3, 1024, 1024)]
    """
def __init__(
self,
G,
device,
lst_alpha=[0],
num_images=1,
start_ind=0,
face_preprocess=True,
dataset_name=''
):
"""Initialize
- use pre-saved generated latent/style from random Z
- to use projection, used method "set_real_img_projection"
"""
assert start_ind + num_images < 2000
self.W = torch.load(f'tensor/W{dataset_name}.pt')
self.S = torch.load(f'tensor/S{dataset_name}.pt')
self.S_mean = torch.load(f'tensor/S_mean{dataset_name}.pt')
self.S_std = torch.load(f'tensor/S_std{dataset_name}.pt')
self.S = {layer: self.S[layer].to(device) for layer in G.style_layers}
        # clone the slices so the full 2,000-entry tensors can actually be
        # freed by the dels below (plain slices would keep the storage alive as views)
        self.styles = {layer: self.S[layer][start_ind:start_ind + num_images].clone()
                       for layer in G.style_layers}
        self.latent = self.W[start_ind:start_ind + num_images].clone().to(device)
        del self.W
        del self.S
# S_mean, S_std for extracting global style direction
self.S_mean = {layer: self.S_mean[layer].to(device) for layer in G.style_layers}
self.S_std = {layer: self.S_std[layer].to(device) for layer in G.style_layers}
# setting
self.face_preprocess = face_preprocess
if face_preprocess:
self.landmarks_detector = FaceLandmarksDetector()
self.vgg16 = VGGFeatExtractor(device).module
self.W_projector_steps = 200
self.G = G
self.device = device
self.num_images = num_images
self.lst_alpha = lst_alpha
self.manipulate_layers = [layer for layer in G.style_layers if 'torgb' not in layer]
def set_alpha(self, lst_alpha):
"""Setter for alpha
"""
self.lst_alpha = lst_alpha
    def set_real_img_projection(self, img, inv_mode='w', pti_mode=None):
        """Set real image(s) to manipulate instead of the pre-saved styles

        Args :
            - img : image directory or single image file path to manipulate
                - face-aligned first if self.face_preprocess == True
                - sets self.num_images
            - inv_mode : inversion mode, sets self.latent and self.styles
                - 'w'  : W projector (projector.project)
                - 'w+' : e4e encoder (wrapper.e4eEncoder)
            - pti_mode : pivot tuning inversion mode (wrapper.PivotTuning)
                - None
                - 'w' : W latent pivot tuning
                - 's' : S style pivot tuning
        """
assert inv_mode in ['w', 'w+']
assert pti_mode in [None, 'w', 's']
allowed_extensions = ['jpg', 'JPG', 'jpeg', 'JPEG', 'png', 'PNG']
# img directory input
if os.path.isdir(img):
imgpaths = sorted(os.listdir(img))
imgpaths = [os.path.join(img, imgpath)
for imgpath in imgpaths
if imgpath.split('.')[-1] in allowed_extensions]
# img file path input
else:
imgpaths = [img]
self.num_images = len(imgpaths)
if inv_mode == 'w':
targets = list()
target_pils = list()
for imgpath in imgpaths:
if self.face_preprocess:
target_pil = self.landmarks_detector(imgpath)
else:
target_pil = PIL.Image.open(imgpath).convert('RGB')
target_pils.append(target_pil)
w, h = target_pil.size
s = min(w, h)
target_pil = target_pil.crop(((w - s) // 2, (h - s) // 2, (w + s) // 2, (h + s) // 2))
target_pil = target_pil.resize((self.G.G.img_resolution, self.G.G.img_resolution),
PIL.Image.LANCZOS)
target_uint8 = np.array(target_pil, dtype=np.uint8)
targets.append(torch.Tensor(target_uint8.transpose([2,0,1])).to(self.device))
self.latent = list()
for target in tqdm(targets, total=len(targets)):
projected_w_steps = project(
self.G.G,
self.vgg16,
target=target,
num_steps=self.W_projector_steps, # TODO get projector steps from configs
device=self.device,
verbose=False,
)
self.latent.append(projected_w_steps[-1])
self.latent = torch.stack(self.latent)
self.styles = self.G.mapping_stylespace(self.latent)
else: # inv_mode == 'w+'
# use e4e encoder
target_pils = list()
for imgpath in imgpaths:
if self.face_preprocess:
target_pil = self.landmarks_detector(imgpath)
else:
target_pil = PIL.Image.open(imgpath).convert('RGB')
target_pils.append(target_pil)
self.encoder = e4eEncoder(self.device)
self.latent = self.encoder(target_pils)
self.styles = self.G.mapping_stylespace(self.latent)
if pti_mode is not None: # w or s
# pivot tuning inversion
pti = PivotTuning(self.device, self.G.G, mode=pti_mode)
new_G = pti(self.latent, target_pils)
self.G.G = new_G
    def manipulate(self, delta_s):
        """Edit styles with a given per-layer delta
        - applies the perturbation delta_s[layer] * alpha for every alpha in lst_alpha
        """
styles = [copy.deepcopy(self.styles) for _ in range(len(self.lst_alpha))]
for (alpha, style) in zip(self.lst_alpha, styles):
for layer in self.G.style_layers:
perturbation = delta_s[layer] * alpha
style[layer] += perturbation
return styles
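    # Example (sketch): delta_s is expected to share keys and per-layer shapes
    # with self.styles, e.g. a text-conditioned direction derived from the
    # saved fs3 array; one edited style dict is returned per alpha:
    #
    #   edited = manipulator.manipulate(delta_s)
    #   assert len(edited) == len(manipulator.lst_alpha)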
    def manipulate_one_channel(self, layer, channel_ind: int):
        """Edit a single channel of the given layer
        - sets the channel to the mean of the pre-saved styles
        - applies the perturbation (pre-saved style std) * alpha as the boundary
        """
assert layer in self.G.style_layers
assert 0 <= channel_ind < self.styles[layer].shape[1]
boundary = self.S_std[layer][channel_ind].item()
# apply self.S_mean value for given layer, channel_ind
for ind in range(self.num_images):
self.styles[layer][ind][channel_ind] = self.S_mean[layer][channel_ind]
styles = [copy.deepcopy(self.styles) for _ in range(len(self.lst_alpha))]
perturbation = (torch.Tensor(self.lst_alpha) * boundary).numpy().tolist()
# apply one channel manipulation
for img_ind in range(self.num_images):
for edit_ind, delta in enumerate(perturbation):
styles[edit_ind][layer][img_ind][channel_ind] += delta
return styles
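    # Note (sketch): the perturbation is alpha * S_std[layer][channel_ind]
    # applied around S_mean[layer][channel_ind], so lst_alpha is measured in
    # standard deviations of the pre-saved style statistics, e.g.:
    #
    #   manipulator.set_alpha([-5, 5])
    #   styles = manipulator.manipulate_one_channel(G.style_layers[6], 501)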
def synthesis_from_styles(self, styles, start_ind, end_ind):
"""Synthesis edited styles from styles, lst_alpha
"""
styles_ = list()
for style in styles:
style_ = dict()
for layer in self.G.style_layers:
style_[layer] = style[layer][start_ind:end_ind].to(self.device)
styles_.append(style_)
print("synthesis_from_styles", type(style_))
imgs = [self.G.synthesis_from_stylespace(self.latent[start_ind:end_ind], style_).cpu()
for style_ in styles_]
return imgs
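    # Example (sketch): render the first ten edits per alpha and convert one
    # batch to uint8 HWC, mirroring the conversion used in
    # extract_global_direction below:
    #
    #   imgs = manipulator.synthesis_from_styles(styles, 0, 10)
    #   arr = (imgs[0].permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8).numpy()
    #   PIL.Image.fromarray(arr[0]).save('edit_0.png')  # hypothetical output path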
def extract_global_direction(G, device, lst_alpha, num_images, dataset_name=''):
    """Extract the global style direction from num_images images (100 in the paper)
    """
assert len(lst_alpha) == 2
model, preprocess = clip.load("ViT-B/32", device=device)
# lindex in original tf version
manipulate_layers = [layer for layer in G.style_layers if 'torgb' not in layer]
# total channel: 6048 (1024 resolution)
    resolution = G.G.img_resolution
    # number of W layers: 1024 -> 18, 512 -> 16, 256 -> 14
    num_ws = G.to_w_idx[f'G.synthesis.b{resolution}.torgb.affine'] + 1
    latent = torch.randn([1, num_ws, 512]).to(device)
style = G.mapping_stylespace(latent)
cnt = 0
for layer in manipulate_layers:
cnt += style[layer].shape[1]
del latent
del style
# 1024 -> 6048 channels, 256 -> 4928 channels
print(f"total channels to manipulate: {cnt}")
manipulator = Manipulator(G, device, lst_alpha, num_images, face_preprocess=False, dataset_name=dataset_name)
all_feats = list()
for layer in manipulate_layers:
print(f'\nStyle manipulation in layer "{layer}"')
channel_num = manipulator.styles[layer].shape[1]
for channel_ind in tqdm(range(channel_num), total=channel_num):
styles = manipulator.manipulate_one_channel(layer, channel_ind)
            # 2 alphas * num_images images, encoded in batches
            # (num_images is assumed divisible by batchsize)
            batchsize = 10
            nbatch = num_images // batchsize
            feats = list()
            for batch_ind in range(nbatch):  # each batch covers batchsize images * 2 alphas
                start = batch_ind * batchsize
                end = start + batchsize
synth_imgs = manipulator.synthesis_from_styles(styles, start, end)
synth_imgs = [(synth_img.permute(0,2,3,1)*127.5+128).clamp(0,255).to(torch.uint8).numpy()
for synth_img in synth_imgs]
imgs = list()
for i in range(batchsize):
img0 = PIL.Image.fromarray(synth_imgs[0][i])
img1 = PIL.Image.fromarray(synth_imgs[1][i])
imgs.append(preprocess(img0).unsqueeze(0).to(device))
imgs.append(preprocess(img1).unsqueeze(0).to(device))
with torch.no_grad():
feat = model.encode_image(torch.cat(imgs))
feats.append(feat)
all_feats.append(torch.cat(feats).view([-1, 2, 512]).cpu())
    all_feats = torch.stack(all_feats).numpy()  # (num_channels, num_images, 2, 512)
    fs = all_feats
    fs1 = fs / np.linalg.norm(fs, axis=-1)[:, :, :, None]  # unit-normalize CLIP features
    fs2 = fs1[:, :, 1, :] - fs1[:, :, 0, :]                # 5*sigma - (-5)*sigma
    fs3 = fs2 / np.linalg.norm(fs2, axis=-1)[:, :, None]   # unit-normalize each difference
    fs3 = fs3.mean(axis=1)                                 # average over images
    fs3 = fs3 / np.linalg.norm(fs3, axis=-1)[:, None]      # renormalize
    np.save(f'tensor/fs3{dataset_name}.npy', fs3)          # global style direction
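# The saved array can then be combined with CLIP text features to pick relevant
# channels, as in StyleCLIP's global-direction approach (sketch):
#
#   fs3 = np.load(f'tensor/fs3{dataset_name}.npy')  # (num_channels, 512), unit-norm rows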
if __name__ == '__main__':
parser = argparse.ArgumentParser()
    parser.add_argument('runtype', type=str, nargs='?', default='test', choices=['test', 'extract'])
    parser.add_argument('--ckpt', type=str, default='pretrained/ffhq.pkl')
    # note: argparse's type=bool would treat any non-empty string as True
    parser.add_argument('--face_preprocess', type=lambda x: str(x).lower() in ('true', '1'), default=True)
parser.add_argument('--dataset_name', type=str, default='')
args = parser.parse_args()
runtype = args.runtype
assert runtype in ['test', 'extract']
device = torch.device('cuda:0')
ckpt = args.ckpt
G = Generator_wrapper(ckpt, device)
face_preprocess = args.face_preprocess
dataset_name = args.dataset_name
if runtype == 'test': # test manipulator
num_images = 100
lst_alpha = [-5, 0, 5]
layer = G.style_layers[6]
channel_ind = 501
manipulator = Manipulator(G, device, lst_alpha, num_images, face_preprocess=face_preprocess, dataset_name=dataset_name)
styles = manipulator.manipulate_one_channel(layer, channel_ind)
        start_ind, end_ind = 0, 10
imgs = manipulator.synthesis_from_styles(styles, start_ind, end_ind)
print(len(imgs), imgs[0].shape)
elif runtype == 'extract': # extract global style direction from "tensor/S.pt"
num_images = 100
lst_alpha = [-5, 5]
extract_global_direction(G, device, lst_alpha, num_images, dataset_name=dataset_name)