Spaces:

ldkong
/

TranSVAE

Build error

App Files Files Community

TranSVAE / app.py

ldkong

Update app.py

32e426d about 1 year ago

raw history blame contribute delete

No virus

15.1 kB

	import gradio as gr

	import cv2
	import imageio
	import math
	from math import ceil
	import matplotlib.pyplot as plt
	import numpy as np
	from PIL import Image
	import subprocess
	import torch
	import torch.nn as nn
	import torch.nn.functional as F


	class RelationModuleMultiScale(torch.nn.Module):
	    """Multi-scale relation module over per-frame feature vectors.

	    For every relation size from ``num_frames`` down to 2 there is one
	    ReLU -> Linear -> ReLU head that embeds the concatenated features of a
	    tuple of frames of that size. ``forward`` yields one embedding per
	    relation size, stacked along dimension 1.
	    """

	    def __init__(self, img_feature_dim, num_bottleneck, num_frames):
	        """Enumerate the frame tuples and build one fusion head per size.

	        Args:
	            img_feature_dim: length of a single frame's feature vector.
	            num_bottleneck: output width of every fusion head.
	            num_frames: number of frames in each input sequence.
	        """
	        super().__init__()
	        # Upper bound on how many frame tuples are evaluated per relation size.
	        self.subsample_num = 3
	        self.img_feature_dim = img_feature_dim
	        # Relation sizes, largest first: num_frames, num_frames - 1, ..., 2.
	        self.scales = list(range(num_frames, 1, -1))

	        # All frame-index tuples for each relation size, and how many of
	        # them are actually used in forward().
	        self.relations_scales = [
	            self.return_relationset(num_frames, size) for size in self.scales
	        ]
	        self.subsample_scales = [
	            min(self.subsample_num, len(tuples)) for tuples in self.relations_scales
	        ]
	        self.num_frames = num_frames

	        # One head per relation size; its input is `size` frame features
	        # laid side by side.
	        self.fc_fusion_scales = nn.ModuleList()
	        for size in self.scales:
	            self.fc_fusion_scales.append(
	                nn.Sequential(
	                    nn.ReLU(),
	                    nn.Linear(size * self.img_feature_dim, num_bottleneck),
	                    nn.ReLU(),
	                )
	            )

	    def forward(self, input):
	        """Embed the frame relations of ``input`` at every relation size.

	        Args:
	            input: tensor indexed as ``input[:, frame_indices, :]`` whose
	                last dimension holds ``img_feature_dim`` values.

	        Returns:
	            Tensor of shape ``(batch, len(self.scales), num_bottleneck)``.
	        """

	        def embed(scale_idx, frame_ids):
	            # Gather the chosen frames, flatten them into one vector per
	            # sample and run the head that belongs to this relation size.
	            picked = input[:, frame_ids, :]
	            flat = picked.view(
	                picked.size(0), self.scales[scale_idx] * self.img_feature_dim
	            )
	            return self.fc_fusion_scales[scale_idx](flat).unsqueeze(1)

	        # The largest relation size spans all frames, so only its first
	        # (and only) tuple is used.
	        full_relation = embed(0, self.relations_scales[0][0])
	        per_scale = [full_relation.clone()]

	        for scale_idx in range(1, len(self.scales)):
	            tuples = self.relations_scales[scale_idx]
	            n_pick = self.subsample_scales[scale_idx]
	            accumulated = torch.zeros_like(full_relation)
	            # Evenly spaced picks over the tuple list; their embeddings are summed.
	            for k in range(n_pick):
	                chosen = int(ceil(k * len(tuples) / n_pick))
	                accumulated += embed(scale_idx, tuples[chosen])
	            per_scale.append(accumulated)

	        return torch.cat(per_scale, 1)

	    def return_relationset(self, num_frames, num_frames_relation):
	        """Return all ascending index tuples of length ``num_frames_relation`` drawn from ``range(num_frames)``."""
	        from itertools import combinations
	        return list(combinations(range(num_frames), num_frames_relation))


	class TransferVAE_Video(nn.Module):
	    """Sequence autoencoder producing one clip-level code and per-frame codes.

	    Only the deterministic inference path is implemented in this class:
	    a clip is encoded into a single code ``f`` and one code ``z`` per frame,
	    the two are concatenated frame by frame, and the result is decoded back
	    to frames. Several layers created in ``__init__`` are never called by
	    the methods below.
	    NOTE(review): presumably those layers exist so the trained checkpoint
	    loads without missing or unexpected keys -- confirm against the
	    training code.
	    """

	    def __init__(self):
	        """Create every layer using the fixed sizes hard-coded below."""
	        super().__init__()

	        # --- fixed configuration ---
	        self.f_dim = 512      # width of the clip-level code f
	        self.z_dim = 512      # width of each per-frame code z
	        self.fc_dim = 1024    # first argument handed to the frame encoder
	        self.channels = 3
	        self.frames = 8       # clip length assumed by the indexing/expansion below
	        self.batch_size = 128
	        self.dropout_rate = 0.5
	        self.num_class = 15
	        self.prior_sample = 'random'

	        # --- frame encoder / decoder, taken from the dcgan_64 module ---
	        import dcgan_64
	        self.encoder = dcgan_64.encoder(self.fc_dim, self.channels)
	        self.decoder = dcgan_64.decoder_woSkip(self.z_dim + self.f_dim, self.channels)
	        self.fc_output_dim = self.fc_dim

	        self.relu = nn.LeakyReLU(0.1)
	        self.dropout_f = nn.Dropout(p=self.dropout_rate)
	        self.dropout_v = nn.Dropout(p=self.dropout_rate)

	        self.hidden_dim = 512
	        self.f_rnn_layers = 1

	        # --- prior network for z (not called in this file) ---
	        self.z_prior_lstm_ly1 = nn.LSTMCell(self.z_dim, self.hidden_dim)
	        self.z_prior_lstm_ly2 = nn.LSTMCell(self.hidden_dim, self.hidden_dim)
	        self.z_prior_mean = nn.Linear(self.hidden_dim, self.z_dim)
	        self.z_prior_logvar = nn.Linear(self.hidden_dim, self.z_dim)

	        # --- posterior for f: bidirectional LSTM over the frame features ---
	        self.z_lstm = nn.LSTM(
	            self.fc_output_dim,
	            self.hidden_dim,
	            self.f_rnn_layers,
	            bidirectional=True,
	            batch_first=True,
	        )
	        self.f_mean = nn.Linear(self.hidden_dim * 2, self.f_dim)
	        self.f_logvar = nn.Linear(self.hidden_dim * 2, self.f_dim)

	        # --- posterior for z: RNN stacked on the LSTM outputs ---
	        self.z_rnn = nn.RNN(self.hidden_dim * 2, self.hidden_dim, batch_first=True)
	        self.z_mean = nn.Linear(self.hidden_dim, self.z_dim)
	        self.z_logvar = nn.Linear(self.hidden_dim, self.z_dim)

	        # --- frame-level domain head (not called in this file) ---
	        self.fc_feature_domain_frame = nn.Linear(self.z_dim, self.z_dim)
	        self.fc_classifier_domain_frame = nn.Linear(self.z_dim, 2)

	        # --- relation module and video-level heads (not called in this file) ---
	        self.num_bottleneck = 256
	        self.TRN = RelationModuleMultiScale(self.z_dim, self.num_bottleneck, self.frames)
	        self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
	        self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
	        self.feat_aggregated_dim = self.num_bottleneck

	        self.fc_feature_domain_video = nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim)
	        self.fc_classifier_domain_video = nn.Linear(self.feat_aggregated_dim, 2)

	        # One two-way classifier per relation size (frames - 1 of them).
	        self.relation_domain_classifier_all = nn.ModuleList()
	        for _ in range(self.frames - 1):
	            self.relation_domain_classifier_all.append(
	                nn.Sequential(
	                    nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
	                    nn.ReLU(),
	                    nn.Linear(self.feat_aggregated_dim, 2),
	                )
	            )

	        self.pred_classifier_video = nn.Linear(self.feat_aggregated_dim, self.num_class)
	        self.fc_feature_domain_latent = nn.Linear(self.f_dim, self.f_dim)
	        # NOTE(review): "doamin" is misspelled, but attribute names become
	        # state_dict key prefixes, so renaming would presumably break loading
	        # the saved checkpoint -- confirm before changing.
	        self.fc_classifier_doamin_latent = nn.Linear(self.f_dim, 2)

	    def encoder_frame(self, x):
	        """Encode every frame of ``x`` and regroup the features per clip.

	        The last three dimensions of ``x`` are treated as one frame; the
	        first two are restored on the output, whose last dimension is
	        whatever remains of the first element returned by ``self.encoder``.
	        """
	        shape = x.shape
	        frames = x.view(-1, shape[-3], shape[-2], shape[-1])
	        embedded = self.encoder(frames)[0]
	        return embedded.view(shape[0], shape[1], -1)

	    def decoder_frame(self, zf):
	        """Decode concatenated ``[z, f]`` codes with ``self.decoder``."""
	        return self.decoder(zf)

	    def encode_and_sample_post(self, x):
	        """Encode a clip and return ``(f_post, z_post)``.

	        Despite the name, nothing is sampled here: both outputs are the
	        mean projections themselves.
	        """
	        frame_feats = self.encoder_frame(x)
	        lstm_out, _ = self.z_lstm(frame_feats)

	        h = self.hidden_dim
	        # Summarise the clip with the second half of the first step's output
	        # and the first half of the output at step ``frames - 1``.
	        backward = lstm_out[:, 0, h:2 * h]
	        frontal = lstm_out[:, self.frames - 1, 0:h]
	        f_post = self.f_mean(torch.cat((frontal, backward), dim=1))

	        rnn_out, _ = self.z_rnn(lstm_out)
	        z_post = self.z_mean(rnn_out)

	        return f_post, z_post

	    def forward(self, x, beta):
	        """Encode ``x`` and reconstruct it from its codes.

	        Args:
	            x: clip batch accepted by ``encoder_frame``.
	            beta: accepted for interface compatibility; not used here.

	        Returns:
	            ``(f_post, z_post, recon_x)``.
	        """
	        f_post, z_post = self.encode_and_sample_post(x)
	        # A list-valued f_post contributes its first element.
	        f_code = f_post[0] if isinstance(f_post, list) else f_post
	        # Repeat the clip-level code for every frame before concatenating.
	        f_expand = f_code.unsqueeze(1).expand(-1, self.frames, self.f_dim)
	        recon_x = self.decoder_frame(torch.cat((z_post, f_expand), dim=2))
	        return f_post, z_post, recon_x


	def name2seq(file_name):
	    """Load eight PNG frames into a single clip tensor.

	    Frame ``i`` (0 through 7) is read from ``file_name + str(i) + '.png'``
	    and only its first three channels are kept.

	    Returns:
	        Tensor of shape ``(1, 8, 3, H, W)`` with pixel values divided by 256.
	    """
	    frames = [
	        imageio.imread(f'{file_name}{index}.png')[:, :, :3]
	        for index in range(8)
	    ]
	    # (frame, H, W, C) float array -> (frame, C, H, W), then add a batch axis.
	    scaled = np.asarray(frames, dtype='f') / 256.0
	    channels_first = scaled.transpose((0, 3, 1, 2))
	    return torch.Tensor(channels_first).unsqueeze(dim=0)


	def concat(file_name, num_frames=8, gif_filename='demo.gif'):
	    """Assemble numbered PNG frames into an animated GIF.

	    Args:
	        file_name: path prefix; frame ``i`` is read from
	            ``file_name + str(i) + '.png'``.
	        num_frames: how many consecutive frames, starting at 0, to read.
	        gif_filename: path of the GIF to write.

	    Returns:
	        ``gif_filename``, so callers can hand the path on directly. The
	        previous version returned the result of ``imageio.mimsave``, which
	        carries no information for a file target.
	    """
	    frames = [
	        imageio.imread(f'{file_name}{index}.png')
	        for index in range(num_frames)
	    ]
	    imageio.mimsave(gif_filename, frames)
	    return gif_filename


	def MyPlot(frame_id, src_orig, tar_orig, src_recon, tar_recon, src_Zt, tar_Zt, src_Zf_tar_Zt, tar_Zf_src_Zt):
	    """Draw one frame's 2x4 comparison grid and save it as ``MyPlot_<frame_id>.png``.

	    The top row shows the source-side images and the bottom row the
	    target-side images. Columns, left to right: original input,
	    reconstruction, output with Zt only, and output after exchanging Zt
	    and Zf (``tar_Zf_src_Zt`` on top, ``src_Zf_tar_Zt`` below).

	    Args:
	        frame_id: value substituted into the output file name.
	        src_orig, tar_orig, src_recon, tar_recon, src_Zt, tar_Zt,
	        src_Zf_tar_Zt, tar_Zf_src_Zt: images accepted by ``Axes.imshow``.
	    """
	    # (column title, top-row image, bottom-row image)
	    columns = (
	        ("\n\n\nOriginal\nInput", src_orig, tar_orig),
	        ("\n\n\nReconstructed\nOutput", src_recon, tar_recon),
	        ("\n\n\nOutput\nw/ Zt", src_Zt, tar_Zt),
	        ("\n\n\nExchange\nZt and Zf", tar_Zf_src_Zt, src_Zf_tar_Zt),
	    )

	    fig, axs = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(10, 5))
	    try:
	        for col, (title, top_image, bottom_image) in enumerate(columns):
	            axs[0, col].imshow(top_image)
	            axs[0, col].set_title(title)
	            axs[0, col].axis('off')

	            axs[1, col].imshow(bottom_image)
	            axs[1, col].axis('off')

	        fig.subplots_adjust(hspace=0.0125, wspace=0.0)

	        save_name = 'MyPlot_{}.png'.format(frame_id)
	        fig.savefig(save_name, dpi=200, format='png', bbox_inches='tight', pad_inches=0.0)
	    finally:
	        # pyplot keeps every figure alive until it is closed explicitly;
	        # without this each call leaks one figure.
	        plt.close(fig)


	# == Load Model ==
	# Build the network, restore the weights stored under the 'state_dict' key
	# of the checkpoint (mapped onto the CPU), and switch to evaluation mode.
	model = TransferVAE_Video()
	model.load_state_dict(
	    torch.load('TransferVAE.pth.tar', map_location='cpu')['state_dict']
	)
	model.eval()


	# Radio-button labels -> attribute codes used in the sprite file names.
	_SOURCE_CODES = {
	    'hair': {"green": '0', "yellow": '2', "rose": '4', "red": '7', "wine": '8'},
	    'top': {"brown": '0', "blue": '1', "white": '2'},
	    'bottom': {"white": '0', "golden": '1', "red": '2', "silver": '3'},
	}
	_TARGET_CODES = {
	    'hair': {"violet": '1', "silver": '3', "purple": '5', "grey": '6', "golden": '9'},
	    'top': {"grey": '3', "khaki": '4', "linen": '5', "ocre": '6'},
	    'bottom': {"denim": '4', "olive": '5', "brown": '6'},
	}


	def _sprite_prefix(domain_dir, action, body, codes, hair, top, bottom):
	    """Build the path prefix shared by all frame files of one avatar clip."""
	    # Labels missing from the table are inserted unchanged.
	    hair = codes['hair'].get(hair, hair)
	    top = codes['top'].get(top, top)
	    bottom = codes['bottom'].get(bottom, bottom)
	    return f'./Sprite/frames/{domain_dir}/{action}/front_{body}{bottom}{top}{hair}_'


	def _frame_to_image(video, frame):
	    """Return frame ``frame`` of a (frames, C, H, W) tensor as an (H, W, C) array."""
	    return video[frame, :, :, :].detach().numpy().transpose((1, 2, 0))


	def run(source, action_source, hair_source, top_source, bottom_source, target, action_target, hair_target, top_target, bottom_target):
	    """Render the comparison GIF for one source avatar and one target avatar.

	    Args:
	        source, target: values of the two Markdown header components that
	            sit in the interface's input list; not used here.
	        action_source, action_target: action folder names inside
	            ``./Sprite/frames/domain_1`` and ``./Sprite/frames/domain_2``.
	        hair_source, top_source, bottom_source: colour labels of the source
	            avatar.
	        hair_target, top_target, bottom_target: colour labels of the target
	            avatar.

	    Returns:
	        ``'demo.gif'``, the file written by ``concat``.
	    """
	    # == Source / Target Avatar == (body code is '0' for source, '1' for target)
	    file_name_source = _sprite_prefix(
	        'domain_1', action_source, '0', _SOURCE_CODES,
	        hair_source, top_source, bottom_source,
	    )
	    file_name_target = _sprite_prefix(
	        'domain_2', action_target, '1', _TARGET_CODES,
	        hair_target, top_target, bottom_target,
	    )

	    # == Load Input == (row 0 is the source clip, row 1 the target clip)
	    x = torch.cat((name2seq(file_name_source), name2seq(file_name_target)), dim=0)
	    num_frames = x.size(1)

	    # == Forward ==
	    # Every decoder call happens once, here, under no_grad. They do not
	    # depend on the frame index, so repeating them for each plotted frame
	    # only multiplied the inference cost.
	    with torch.no_grad():
	        f_post, z_post, recon_x = model(x, [0]*3)

	        src_f_post = f_post[0, :].unsqueeze(0)
	        src_z_post = z_post[0, :, :].unsqueeze(0)
	        tar_f_post = f_post[1, :].unsqueeze(0)
	        tar_z_post = z_post[1, :, :].unsqueeze(0)

	        def decode_clip(z_seq, f_code, use_f):
	            # Repeat f for every frame, optionally zero it, then decode [z, f].
	            f_expand = f_code.unsqueeze(1).expand(-1, num_frames, -1)
	            if not use_f:
	                f_expand = 0 * f_expand
	            return model.decoder_frame(torch.cat((z_seq, f_expand), dim=2)).squeeze()

	        # Zt only (f zeroed out)
	        src_Zt_clip = decode_clip(src_z_post, src_f_post, use_f=False)
	        tar_Zt_clip = decode_clip(tar_z_post, tar_f_post, use_f=False)
	        # Exchanged Zf / Zt
	        src_Zf_tar_Zt_clip = decode_clip(tar_z_post, src_f_post, use_f=True)
	        tar_Zf_src_Zt_clip = decode_clip(src_z_post, tar_f_post, use_f=True)

	    # == Visualize ==
	    for frame in range(num_frames):
	        MyPlot(
	            frame,
	            _frame_to_image(x[0], frame),
	            _frame_to_image(x[1], frame),
	            _frame_to_image(recon_x[0], frame),
	            _frame_to_image(recon_x[1], frame),
	            _frame_to_image(src_Zt_clip, frame),
	            _frame_to_image(tar_Zt_clip, frame),
	            _frame_to_image(src_Zf_tar_Zt_clip, frame),
	            _frame_to_image(tar_Zf_src_Zt_clip, frame),
	        )

	    # Stitch the saved MyPlot_<i>.png files into demo.gif.
	    concat('MyPlot_')

	    return 'demo.gif'


	desc = """
	Welcome to the demo page of TranSVAE, a disentanglement framework designed for unsupervised video domain adaptation. In this live demo, you are able to:

	- Explore domain disentanglement and transfer in TranSVAE with Sprites avatars;
	- Customize the Sprites avatars by yourself via changing their actions, hair colors, top wears, and bottom wears.

	For more details, read the [TranSVAE paper](https://arxiv.org/abs/2208.07365) and visit our [project page](https://ldkong.com/TranSVAE). The training and testing code is available at our [GitHub Repo](https://github.com/ldkong1205/TranSVAE). Have fun!
	"""

	# The two Markdown headers are part of the input list, so `run` receives
	# ten positional values: header, action, hair, top, bottom -- once for the
	# source avatar and once for the target avatar.
	source_controls = [
	    gr.Markdown(
	        """
	👦🏻 Human - Source Avatar
	"""
	    ),
	    gr.Radio(choices=["slash", "spellcard", "walk"], value="slash"),
	    gr.Radio(choices=["green", "yellow", "rose", "red", "wine"], value="green"),
	    gr.Radio(choices=["brown", "blue", "white"], value="brown"),
	    gr.Radio(choices=["white", "golden", "red", "silver"], value="white"),
	]
	target_controls = [
	    gr.Markdown(
	        """
	👽 Alien - Target Avatar
	"""
	    ),
	    gr.Radio(choices=["slash", "spellcard", "walk"], value="walk"),
	    gr.Radio(choices=["violet", "silver", "purple", "grey", "golden"], value="golden"),
	    gr.Radio(choices=["grey", "khaki", "linen", "ocre"], value="ocre"),
	    gr.Radio(choices=["denim", "olive", "brown"], value="brown"),
	]
	result_view = gr.components.Image(type="file", label="Domain Disentanglement")

	# Assemble the interface around `run` and start serving it.
	demo = gr.Interface(
	    fn=run,
	    inputs=source_controls + target_controls,
	    outputs=[result_view],
	    live=False,
	    cache_examples=True,
	    title="TranSVAE for Unsupervised Video Domain Adaptation",
	    description=desc
	)
	demo.launch(share=True)