import gradio as gr
from PIL import Image
import os
import random
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from models.modules.stylegan2.model import StyledConv, ToRGB, EqualLinear, ResBlock, ConvLayer, PixelNorm
from utils.util import *
from utils.data_utils import Transforms
from data import CustomDataLoader
from data.super_dataset import SuperDataset
from configs import parse_config
from utils.augmentation import ImagePathToImage
import clip
from torchvision.transforms import Compose, Resize, ToTensor, Normalize, InterpolationMode
from models.style_based_pix2pixII_model import CLIPFeats2Wplus
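
# Stylizer: a StyleGAN2-style image-to-image generator.
# A convolutional encoder downsamples the 512x512 input to a 32x32 feature map,
# an 8-layer MLP maps a latent z (or, in phase 3, CLIP features) to style codes,
# and a style-modulated decoder upsamples back to 512x512 with skip-to-RGB outputs.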
class Stylizer(nn.Module):
def __init__(self, ngf=64, phase=2, model_weights=None):
super(Stylizer, self).__init__()
# encoder
self.encoder = nn.Sequential(
ConvLayer(3, ngf, 3), # 512
ResBlock(ngf * 1, ngf * 1), # 256
ResBlock(ngf * 1, ngf * 2), # 128
ResBlock(ngf * 2, ngf * 4), # 64
ResBlock(ngf * 4, ngf * 8), # 32
ConvLayer(ngf * 8, ngf * 8, 3) # 32
)
# mapping network
self.mapping_z = nn.Sequential(*([ PixelNorm() ] + [ EqualLinear(512, 512, activation='fused_lrelu', lr_mul=0.01) for _ in range(8) ]))
# style-based decoder
channels = {
32 : ngf * 8,
64 : ngf * 8,
128: ngf * 4,
256: ngf * 2,
512: ngf * 1
}
self.decoder0 = StyledConv(channels[32], channels[32], 3, 512)
self.to_rgb0 = ToRGB(channels[32], 512, upsample=False)
for i in range(4):
ichan = channels[2 ** (i + 5)]
ochan = channels[2 ** (i + 6)]
setattr(self, f'decoder{i + 1}a', StyledConv(ichan, ochan, 3, 512, upsample=True))
setattr(self, f'decoder{i + 1}b', StyledConv(ochan, ochan, 3, 512))
setattr(self, f'to_rgb{i + 1}', ToRGB(ochan, 512))
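        # forward() consumes 10 style codes in total (indices 0-9): decoder0/to_rgb0 plus the four upsampling stages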
self.n_latent = 10
# random style for testing
self.test_z = torch.randn(1, 512)
# load pretrained model weights
if phase == 2:
# load pretrained encoder and stylegan2 decoder
self.load_state_dict(model_weights)
if phase == 3:
self.clip_mapper = CLIPFeats2Wplus(n_tokens=self.n_latent)
            # load pretrained base model and freeze all params except clip mapper
self.load_state_dict(model_weights, strict=False)
params = dict(self.named_parameters())
for k in params.keys():
if 'clip_mapper' in k:
                    print(f'{k} not frozen!')
continue
params[k].requires_grad = False
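
    # Style-code routing:
    #   no kwargs            -> fixed test latent (deterministic output)
    #   mixing=True          -> style mixing between two random latents
    #   z=<tensor>           -> user-provided latent
    #   clip_feats=<tensor>  -> phase-3 CLIP-to-style mapper
    #   anything else        -> a freshly sampled random latent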
def get_styles(self, x, **kwargs):
if len(kwargs) == 0:
return self.mapping_z(self.test_z.to(x.device).repeat(x.shape[0], 1)).repeat(self.n_latent, 1, 1)
elif 'mixing' in kwargs and kwargs['mixing']:
w0 = self.mapping_z(torch.randn(x.shape[0], 512, device=x.device))
w1 = self.mapping_z(torch.randn(x.shape[0], 512, device=x.device))
inject_index = random.randint(1, self.n_latent - 1)
return torch.cat([
w0.repeat(inject_index, 1, 1),
w1.repeat(self.n_latent - inject_index, 1, 1)
])
elif 'z' in kwargs:
return self.mapping_z(kwargs['z']).repeat(self.n_latent, 1, 1)
elif 'clip_feats' in kwargs:
return self.clip_mapper(kwargs['clip_feats'])
else:
z = torch.randn(x.shape[0], 512, device=x.device)
return self.mapping_z(z).repeat(self.n_latent, 1, 1)
def forward(self, x, **kwargs):
# encode
feat = self.encoder(x)
# get style code
styles = self.get_styles(x, **kwargs)
# style-based generate
feat = self.decoder0(feat, styles[0])
out = self.to_rgb0(feat, styles[1])
for i in range(4):
feat = getattr(self, f'decoder{i + 1}a')(feat, styles[i * 2 + 1])
feat = getattr(self, f'decoder{i + 1}b')(feat, styles[i * 2 + 2])
out = getattr(self, f'to_rgb{i + 1}')(feat, styles[i * 2 + 3], out)
return F.hardtanh(out)
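
# Convert a generator output tensor (NCHW, values in [-1, 1]) or a numpy array to a PIL image.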
def tensor2file(input_image):
if not isinstance(input_image, np.ndarray):
if isinstance(input_image, torch.Tensor): # get the data from a variable
image_tensor = input_image.data
else:
return input_image
image_numpy = image_tensor[0].cpu().float().numpy() # convert it into a numpy array
if image_numpy.shape[0] == 1: # grayscale to RGB
image_numpy = np.tile(image_numpy, (3, 1, 1))
        image_numpy = (np.transpose(image_numpy, (1, 2, 0)) + 1) / 2.0 * 255.0  # post-processing: transpose and scaling
else: # if it is a numpy array, do nothing
image_numpy = input_image
    if image_numpy.shape[2] > 3:  # drop any extra channels (e.g. alpha)
        image_numpy = image_numpy[:, :, :3]
    image_numpy = image_numpy.astype(np.uint8)
    image_pil = Image.fromarray(image_numpy)
    return image_pil
device = "cuda"
def generate_multi_model(input_img):
# parse config
config = parse_config("./exp/sp2pII-phase2.yaml")
# hard-code some parameters for test
config['common']['phase'] = "test"
config['dataset']['n_threads'] = 0 # test code only supports num_threads = 0
config['dataset']['batch_size'] = 1 # test code only supports batch_size = 1
config['dataset']['serial_batches'] = True # disable data shuffling; comment this line if results on randomly chosen images are needed.
config['dataset']['no_flip'] = True # no flip; comment this line if results on flipped images are needed.
# override data augmentation
config['dataset']['load_size'] = config['testing']['load_size']
config['dataset']['crop_size'] = config['testing']['crop_size']
config['dataset']['preprocess'] = config['testing']['preprocess']
config['training']['pretrained_model'] = "./pretrained_models/phase2_pretrain_90000.pth"
# add testing path
config['testing']['test_img'] = input_img
config['testing']['test_video'] = None
config['testing']['test_folder'] = None
dataset = SuperDataset(config)
dataloader = CustomDataLoader(config, dataset)
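    # load the EMA generator weights ('G_ema_model') from the phase-2 checkpoint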
model_dict = torch.load("./pretrained_models/phase2_pretrain_90000.pth", map_location='cpu')
    # init netG
    model = Stylizer(ngf=config['model']['ngf'], phase=2, model_weights=model_dict['G_ema_model']).to(device)
    model.eval()
    # the test dataset holds a single image, so return after the first batch
    with torch.no_grad():
        for data in dataloader:
            real_A = data['test_A'].to(device)
            fake_B = model(real_A, mixing=False)
            output_img = tensor2file(fake_B)  # get image results
            return output_img
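
# One-shot tab: the selected reference name (e.g. 'ref01') picks both a per-style checkpoint
# and a reference image; its CLIP image embedding drives the phase-3 CLIP mapper.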
def generate_one_shot(src_img, img_prompt):
# init model
state_dict = torch.load(f"./checkpoints/{img_prompt[-2:]}/epoch_latest.pth", map_location='cpu')
model = Stylizer(ngf=64, phase=3, model_weights=state_dict['G_ema_model'])
model.to(device)
model.eval()
model.requires_grad_(False)
clip_model, img_preprocess = clip.load('ViT-B/32', device=device)
clip_model.eval()
clip_model.requires_grad_(False)
# image transform for stylizer
img_transform = Compose([
Resize((512, 512), interpolation=InterpolationMode.LANCZOS),
ToTensor(),
Normalize([0.5], [0.5])
])
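    # resize to the stylizer's 512x512 input resolution and map pixel values to [-1, 1]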
# get clip features
with torch.no_grad():
img = img_preprocess(Image.open(f"./example/reference/{img_prompt[-2:]}.png")).unsqueeze(0).to(device)
clip_feats = clip_model.encode_image(img)
clip_feats /= clip_feats.norm(dim=1, keepdim=True)
# load image & to tensor
img = Image.open(src_img)
    if img.mode != 'RGB':
img = img.convert('RGB')
img = img_transform(img).unsqueeze(0).to(device)
# stylize it !
with torch.no_grad():
res = model(img, clip_feats=clip_feats)
output_img = tensor2file(res) # get image results
return output_img
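
# Zero-shot tab: same as one-shot, but the style is described by a text prompt; the prompt
# (with spaces replaced by underscores) names the checkpoint directory, and its CLIP text
# embedding drives the phase-3 CLIP mapper.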
def generate_zero_shot(src_img, txt_prompt):
# init model
state_dict = torch.load(f"./checkpoints/{txt_prompt.replace(' ', '_')}/epoch_latest.pth", map_location='cpu')
model = Stylizer(ngf=64, phase=3, model_weights=state_dict['G_ema_model'])
model.to(device)
model.eval()
model.requires_grad_(False)
clip_model, img_preprocess = clip.load('ViT-B/32', device=device)
clip_model.eval()
clip_model.requires_grad_(False)
# image transform for stylizer
img_transform = Compose([
Resize((512, 512), interpolation=InterpolationMode.LANCZOS),
ToTensor(),
Normalize([0.5], [0.5])
])
# get clip features
with torch.no_grad():
text = clip.tokenize(txt_prompt).to(device)
clip_feats = clip_model.encode_text(text)
clip_feats /= clip_feats.norm(dim=1, keepdim=True)
# load image & to tensor
img = Image.open(src_img)
    if img.mode != 'RGB':
img = img.convert('RGB')
img = img_transform(img).unsqueeze(0).to(device)
# stylize it !
with torch.no_grad():
res = model(img, clip_feats=clip_feats)
output_img = tensor2file(res) # get image results
return output_img
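
# Gradio UI: three tabs (Multi-Model, One-Shot, Zero-Shot), each wiring its inputs to the
# corresponding generate_* function above.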
with gr.Blocks() as demo:
    # header text
gr.Markdown("# MMFS")
    # multiple tabs
with gr.Tabs():
with gr.TabItem("Multi-Model"):
multi_input_img = gr.Image(label="Upload Input Face Image", type='filepath', height=400)
gr.Examples(examples=["./example/source/01.png", "./example/source/02.png", "./example/source/03.png", "./example/source/04.png"], inputs=multi_input_img)
multi_model_button = gr.Button("Random Stylize")
multi_output_img = gr.Image(label="Output Image", height=400)
with gr.TabItem("One-Shot"):
one_shot_src_img = gr.Image(label="Upload Input Face Image", type='filepath', height=400)
gr.Examples(examples=["./example/source/01.png", "./example/source/02.png", "./example/source/03.png", "./example/source/04.png"], inputs=one_shot_src_img)
with gr.Row():
                gr.Image(shape=(100, 100), value=Image.open("example/reference/01.png"), type='pil', label="ref01")
                gr.Image(shape=(100, 100), value=Image.open("example/reference/02.png"), type='pil', label="ref02")
                gr.Image(shape=(100, 100), value=Image.open("example/reference/03.png"), type='pil', label="ref03")
                gr.Image(shape=(100, 100), value=Image.open("example/reference/04.png"), type='pil', label="ref04")
one_shot_ref_img = gr.Radio(['ref01','ref02','ref03','ref04'],value="ref01", label="Select a reference style image")
one_shot_test_button = gr.Button("Stylize Image")
one_shot_output_img = gr.Image(label="Output Image", height=400)
with gr.TabItem("Zero-Shot"):
zero_shot_src_img = gr.Image(label="Upload Input Face Image", type='filepath', height=400)
gr.Examples(examples=["./example/source/01.png", "./example/source/02.png", "./example/source/03.png", "./example/source/04.png"], inputs=zero_shot_src_img)
zero_shot_ref_prompt = gr.Dropdown(
label="Txt Prompt",
info="Select a reference style prompt",
choices=[
"pop art",
"watercolor painting",
],
max_choices=1,
value="pop art",
)
zero_shot_test_button = gr.Button("Stylize Image")
zero_shot_output_img = gr.Image(label="Output Image", height=400)
multi_model_button.click(fn=generate_multi_model, inputs=multi_input_img, outputs=multi_output_img)
one_shot_test_button.click(fn=generate_one_shot, inputs=[one_shot_src_img, one_shot_ref_img], outputs=one_shot_output_img)
zero_shot_test_button.click(fn=generate_zero_shot, inputs=[zero_shot_src_img, zero_shot_ref_prompt], outputs=zero_shot_output_img)
demo.queue(max_size=20)
demo.launch()