"""Gradio demo for TranSVAE: domain disentanglement on Sprites avatars.

The script builds the TransferVAE video model, loads its checkpoint on CPU,
and serves a Gradio interface whose callback (`run`) encodes one source and
one target sprite sequence, decodes several latent combinations, and returns
an animated GIF comparing them.
"""
import itertools
import math
import subprocess
from math import ceil

import gradio as gr
import cv2
import imageio
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F


class RelationModuleMultiScale(torch.nn.Module):
    """Multi-scale relation module over per-frame features.

    For every relation size from ``num_frames`` down to 2, all index
    combinations of that size are enumerated.  In `forward`, the features of
    a few evenly spaced combinations per size are concatenated, projected to
    ``num_bottleneck`` dimensions by a per-size MLP, and summed.
    """

    def __init__(self, img_feature_dim, num_bottleneck, num_frames):
        super().__init__()
        # At most this many relations are sampled for each relation size.
        self.subsample_num = 3
        self.img_feature_dim = img_feature_dim
        # Relation sizes, largest first: num_frames, num_frames - 1, ..., 2.
        self.scales = list(range(num_frames, 1, -1))

        self.relations_scales = []
        self.subsample_scales = []
        for scale in self.scales:
            relations_scale = self.return_relationset(num_frames, scale)
            self.relations_scales.append(relations_scale)
            self.subsample_scales.append(min(self.subsample_num, len(relations_scale)))

        self.num_frames = num_frames

        # One fusion MLP per relation size; its input width is scale * feature dim.
        self.fc_fusion_scales = nn.ModuleList()
        for scale in self.scales:
            fc_fusion = nn.Sequential(
                nn.ReLU(),
                nn.Linear(scale * self.img_feature_dim, num_bottleneck),
                nn.ReLU(),
            )
            self.fc_fusion_scales.append(fc_fusion)

    def forward(self, input):
        """Return the fused relation features, one slot per relation size.

        ``input`` is indexed as ``input[:, frame_indices, :]`` and flattened
        to ``scale * img_feature_dim`` per sample.  The result concatenates
        one ``num_bottleneck``-wide feature per relation size along dim 1.
        """
        # Largest relation size: only its first combination is used.
        act_scale_1 = input[:, self.relations_scales[0][0], :]
        act_scale_1 = act_scale_1.view(act_scale_1.size(0), self.scales[0] * self.img_feature_dim)
        act_scale_1 = self.fc_fusion_scales[0](act_scale_1)
        act_scale_1 = act_scale_1.unsqueeze(1)
        act_all = act_scale_1.clone()

        for scaleID in range(1, len(self.scales)):
            act_relation_all = torch.zeros_like(act_scale_1)

            # Pick `num_select_relations` evenly spaced combinations.
            num_total_relations = len(self.relations_scales[scaleID])
            num_select_relations = self.subsample_scales[scaleID]
            idx_relations_evensample = [
                int(ceil(i * num_total_relations / num_select_relations))
                for i in range(num_select_relations)
            ]

            for idx in idx_relations_evensample:
                act_relation = input[:, self.relations_scales[scaleID][idx], :]
                act_relation = act_relation.view(
                    act_relation.size(0), self.scales[scaleID] * self.img_feature_dim
                )
                act_relation = self.fc_fusion_scales[scaleID](act_relation)
                act_relation = act_relation.unsqueeze(1)
                act_relation_all += act_relation

            act_all = torch.cat((act_all, act_relation_all), 1)

        return act_all

    def return_relationset(self, num_frames, num_frames_relation):
        """Return all ``num_frames_relation``-sized index combinations of ``range(num_frames)``."""
        return list(itertools.combinations(range(num_frames), num_frames_relation))


class TransferVAE_Video(nn.Module):
    """TransferVAE video model as used by this demo.

    NOTE: attribute names of sub-modules become ``state_dict`` keys, so they
    must stay exactly as they are (including the misspelt
    ``fc_classifier_doamin_latent``) for the checkpoint to load.  Several
    sub-modules are not used by the demo's forward pass but are still
    constructed here.
    """

    def __init__(self):
        super().__init__()
        self.f_dim = 512
        self.z_dim = 512
        self.fc_dim = 1024
        self.channels = 3
        self.frames = 8
        self.batch_size = 128
        self.dropout_rate = 0.5
        self.num_class = 15
        self.prior_sample = 'random'

        # Project-local module providing the frame encoder/decoder.
        import dcgan_64
        self.encoder = dcgan_64.encoder(self.fc_dim, self.channels)
        self.decoder = dcgan_64.decoder_woSkip(self.z_dim + self.f_dim, self.channels)
        self.fc_output_dim = self.fc_dim

        self.relu = nn.LeakyReLU(0.1)
        self.dropout_f = nn.Dropout(p=self.dropout_rate)
        self.dropout_v = nn.Dropout(p=self.dropout_rate)

        self.hidden_dim = 512
        self.f_rnn_layers = 1

        self.z_prior_lstm_ly1 = nn.LSTMCell(self.z_dim, self.hidden_dim)
        self.z_prior_lstm_ly2 = nn.LSTMCell(self.hidden_dim, self.hidden_dim)
        self.z_prior_mean = nn.Linear(self.hidden_dim, self.z_dim)
        self.z_prior_logvar = nn.Linear(self.hidden_dim, self.z_dim)

        self.z_lstm = nn.LSTM(
            self.fc_output_dim,
            self.hidden_dim,
            self.f_rnn_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.f_mean = nn.Linear(self.hidden_dim * 2, self.f_dim)
        self.f_logvar = nn.Linear(self.hidden_dim * 2, self.f_dim)

        self.z_rnn = nn.RNN(self.hidden_dim * 2, self.hidden_dim, batch_first=True)
        self.z_mean = nn.Linear(self.hidden_dim, self.z_dim)
        self.z_logvar = nn.Linear(self.hidden_dim, self.z_dim)

        self.fc_feature_domain_frame = nn.Linear(self.z_dim, self.z_dim)
        self.fc_classifier_domain_frame = nn.Linear(self.z_dim, 2)

        self.num_bottleneck = 256
        self.TRN = RelationModuleMultiScale(self.z_dim, self.num_bottleneck, self.frames)
        self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
        self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
        self.feat_aggregated_dim = self.num_bottleneck

        self.fc_feature_domain_video = nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim)
        self.fc_classifier_domain_video = nn.Linear(self.feat_aggregated_dim, 2)

        self.relation_domain_classifier_all = nn.ModuleList()
        for _ in range(self.frames - 1):
            relation_domain_classifier = nn.Sequential(
                nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
                nn.ReLU(),
                nn.Linear(self.feat_aggregated_dim, 2),
            )
            self.relation_domain_classifier_all.append(relation_domain_classifier)

        self.pred_classifier_video = nn.Linear(self.feat_aggregated_dim, self.num_class)

        self.fc_feature_domain_latent = nn.Linear(self.f_dim, self.f_dim)
        # Misspelt on purpose: the name is a checkpoint key (see class docstring).
        self.fc_classifier_doamin_latent = nn.Linear(self.f_dim, 2)

    def encode_and_sample_post(self, x):
        """Encode ``x`` and return ``(f_post, z_post)`` using posterior means only.

        ``f_post`` is derived from the last forward and first backward LSTM
        outputs; ``z_post`` is derived per frame from the RNN over the LSTM
        outputs.  No sampling is performed here.
        """
        conv_x = self.encoder_frame(x)
        lstm_out, _ = self.z_lstm(conv_x)

        # Backward direction summarised at the first step, forward at the last.
        backward = lstm_out[:, 0, self.hidden_dim:2 * self.hidden_dim]
        frontal = lstm_out[:, self.frames - 1, 0:self.hidden_dim]
        lstm_out_f = torch.cat((frontal, backward), dim=1)
        f_mean = self.f_mean(lstm_out_f)
        f_post = f_mean

        features, _ = self.z_rnn(lstm_out)
        z_mean = self.z_mean(features)
        z_post = z_mean

        return f_post, z_post

    def decoder_frame(self, zf):
        """Decode the concatenated ``[z, f]`` latent ``zf`` into frames."""
        recon_x = self.decoder(zf)
        return recon_x

    def encoder_frame(self, x):
        """Encode every frame of ``x`` and return features of shape (x.shape[0], x.shape[1], -1)."""
        x_shape = x.shape
        # Fold the leading dims so the encoder sees a batch of single frames.
        x = x.view(-1, x_shape[-3], x_shape[-2], x_shape[-1])
        x_embed = self.encoder(x)[0]
        return x_embed.view(x_shape[0], x_shape[1], -1)

    def forward(self, x, beta):
        """Return ``(f_post, z_post, recon_x)`` for ``x``; ``beta`` is accepted but unused."""
        f_post, z_post = self.encode_and_sample_post(x)

        if isinstance(f_post, list):
            f_expand = f_post[0].unsqueeze(1).expand(-1, self.frames, self.f_dim)
        else:
            f_expand = f_post.unsqueeze(1).expand(-1, self.frames, self.f_dim)
        zf = torch.cat((z_post, f_expand), dim=2)
        recon_x = self.decoder_frame(zf)

        return f_post, z_post, recon_x


def name2seq(file_name):
    """Load the 8 PNG frames ``<file_name>0.png`` .. ``<file_name>7.png`` as one tensor.

    Only the first three channels of each image are kept and pixel values are
    divided by 256.  The result has a leading batch dimension of 1 followed by
    (frame, channel, height, width).
    """
    images = []
    for frame in range(8):
        frame_name = '%d' % (frame)
        image_filename = file_name + frame_name + '.png'
        image = imageio.imread(image_filename)
        images.append(image[:, :, :3])

    images = np.asarray(images, dtype='f') / 256.0
    images = images.transpose((0, 3, 1, 2))
    images = torch.Tensor(images).unsqueeze(dim=0)
    return images


def concat(file_name):
    """Assemble ``<file_name>0.png`` .. ``<file_name>7.png`` into ``demo.gif``.

    Returns whatever ``imageio.mimsave`` returns.
    """
    images = []
    for frame in range(8):
        frame_name = '%d' % (frame)
        image_filename = file_name + frame_name + '.png'
        image = imageio.imread(image_filename)
        images.append(image)

    gif_filename = 'demo.gif'
    return imageio.mimsave(gif_filename, images)


def MyPlot(frame_id, src_orig, tar_orig, src_recon, tar_recon, src_Zt, tar_Zt, src_Zf_tar_Zt, tar_Zf_src_Zt):
    """Render one 2x4 comparison panel and save it as ``MyPlot_<frame_id>.png``.

    The top row shows the source-side images and the bottom row the
    target-side images; in the last column the top panel is
    ``tar_Zf_src_Zt`` and the bottom panel is ``src_Zf_tar_Zt``.
    """
    fig, axs = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(10, 5))
    try:
        columns = (
            ("\n\n\nOriginal\nInput", src_orig, tar_orig),
            ("\n\n\nReconstructed\nOutput", src_recon, tar_recon),
            ("\n\n\nOutput\nw/ Zt", src_Zt, tar_Zt),
            ("\n\n\nExchange\nZt and Zf", tar_Zf_src_Zt, src_Zf_tar_Zt),
        )
        for col, (title, top_image, bottom_image) in enumerate(columns):
            axs[0, col].imshow(top_image)
            axs[0, col].set_title(title)
            axs[0, col].axis('off')
            axs[1, col].imshow(bottom_image)
            axs[1, col].axis('off')

        fig.subplots_adjust(hspace=0.0125, wspace=0.0)

        save_name = 'MyPlot_{}.png'.format(frame_id)
        fig.savefig(save_name, dpi=200, format='png', bbox_inches='tight', pad_inches=0.0)
    finally:
        # pyplot keeps every figure alive until it is closed; without this a
        # long-running server leaks 8 figures per request.
        plt.close(fig)


# == Load Model ==
model = TransferVAE_Video()
# NOTE(review): torch.load unpickles the file; only load a trusted checkpoint.
model.load_state_dict(torch.load('TransferVAE.pth.tar', map_location=torch.device('cpu'))['state_dict'])
model.eval()


# Colour name -> digit used in the sprite file names.  Unknown names are
# passed through unchanged, matching the behaviour of the former if/elif chains.
_SOURCE_HAIR = {"green": '0', "yellow": '2', "rose": '4', "red": '7', "wine": '8'}
_SOURCE_TOP = {"brown": '0', "blue": '1', "white": '2'}
_SOURCE_BOTTOM = {"white": '0', "golden": '1', "red": '2', "silver": '3'}
_TARGET_HAIR = {"violet": '1', "silver": '3', "purple": '5', "grey": '6', "golden": '9'}
_TARGET_TOP = {"grey": '3', "khaki": '4', "linen": '5', "ocre": '6'}
_TARGET_BOTTOM = {"denim": '4', "olive": '5', "brown": '6'}


def _sprite_prefix(domain_dir, action, body, bottom, top, hair):
    """Build the frame-file prefix for one avatar (the frame index and '.png' are appended later)."""
    return (
        './Sprite/frames/' + domain_dir + '/' + action + '/'
        + 'front' + '_' + str(body) + str(bottom) + str(top) + str(hair) + '_'
    )


def _decode_sequence(z_post, f_post, use_f=True):
    """Decode ``z_post`` together with ``f_post`` (zeroed when ``use_f`` is false)."""
    f_expand = f_post.unsqueeze(1).expand(-1, model.frames, model.f_dim)
    if not use_f:
        f_expand = 0 * f_expand
    zf = torch.cat((z_post, f_expand), dim=2)
    return model.decoder_frame(zf).squeeze()


def _to_hwc(frame):
    """Convert one (C, H, W) tensor frame to an (H, W, C) numpy array."""
    return frame.detach().numpy().transpose((1, 2, 0))


def run(source, action_source, hair_source, top_source, bottom_source, target, action_target, hair_target, top_target, bottom_target):
    """Gradio callback: build the comparison GIF for the chosen avatars.

    ``source`` and ``target`` receive the values of the two Markdown header
    components and are not used.  Returns the path of the generated GIF,
    ``'demo.gif'``.

    NOTE: the intermediate PNGs and the GIF are written to fixed file names
    in the working directory.
    """
    # == Source Avatar ==
    body_source = '0'
    hair_source = _SOURCE_HAIR.get(hair_source, hair_source)
    top_source = _SOURCE_TOP.get(top_source, top_source)
    bottom_source = _SOURCE_BOTTOM.get(bottom_source, bottom_source)
    file_name_source = _sprite_prefix(
        'domain_1', action_source, body_source, bottom_source, top_source, hair_source
    )

    # == Target Avatar ==
    body_target = '1'
    hair_target = _TARGET_HAIR.get(hair_target, hair_target)
    top_target = _TARGET_TOP.get(top_target, top_target)
    bottom_target = _TARGET_BOTTOM.get(bottom_target, bottom_target)
    file_name_target = _sprite_prefix(
        'domain_2', action_target, body_target, bottom_target, top_target, hair_target
    )

    # == Load Input ==
    images_source = name2seq(file_name_source)
    images_target = name2seq(file_name_target)
    x = torch.cat((images_source, images_target), dim=0)

    # == Forward ==
    # Every decoder pass is independent of the frame index, so all of them are
    # computed once here (under no_grad) instead of once per plotted frame.
    with torch.no_grad():
        f_post, z_post, recon_x = model(x, [0] * 3)

        src_orig_sample = x[0, :, :, :, :]
        src_recon_sample = recon_x[0, :, :, :, :]
        src_f_post = f_post[0, :].unsqueeze(0)
        src_z_post = z_post[0, :, :].unsqueeze(0)

        tar_orig_sample = x[1, :, :, :, :]
        tar_recon_sample = recon_x[1, :, :, :, :]
        tar_f_post = f_post[1, :].unsqueeze(0)
        tar_z_post = z_post[1, :, :].unsqueeze(0)

        # Zt only: the f latent is zeroed out.
        src_Zt_frames = _decode_sequence(src_z_post, src_f_post, use_f=False)
        tar_Zt_frames = _decode_sequence(tar_z_post, tar_f_post, use_f=False)
        # Exchange: f of one avatar combined with z of the other.
        src_Zf_tar_Zt_frames = _decode_sequence(tar_z_post, src_f_post)
        tar_Zf_src_Zt_frames = _decode_sequence(src_z_post, tar_f_post)

    # == Visualize ==
    for frame in range(8):
        MyPlot(
            frame,
            _to_hwc(src_orig_sample[frame, :, :, :]),
            _to_hwc(tar_orig_sample[frame, :, :, :]),
            _to_hwc(src_recon_sample[frame, :, :, :]),
            _to_hwc(tar_recon_sample[frame, :, :, :]),
            _to_hwc(src_Zt_frames[frame, :, :, :]),
            _to_hwc(tar_Zt_frames[frame, :, :, :]),
            _to_hwc(src_Zf_tar_Zt_frames[frame, :, :, :]),
            _to_hwc(tar_Zf_src_Zt_frames[frame, :, :, :]),
        )

    concat('MyPlot_')

    return 'demo.gif'


desc = """
Welcome to the demo page of TranSVAE, a disentanglement framework designed for unsupervised video domain adaptation. In this live demo, you are able to:
- Explore domain disentanglement and transfer in TranSVAE with Sprites avatars;
- Customize the Sprites avatars by yourself via changing their actions, hair colors, top wears, and bottom wears.

For more details, read the [TranSVAE paper](https://arxiv.org/abs/2208.07365) and visit our [project page](https://ldkong.com/TranSVAE). The training and testing code is available at our [GitHub Repo](https://github.com/ldkong1205/TranSVAE). Have fun!
"""


gr.Interface(
    fn=run,
    inputs=[
        gr.Markdown(
            """
            👦🏻 Human - Source Avatar
            """
        ),
        gr.Radio(choices=["slash", "spellcard", "walk"], value="slash"),
        gr.Radio(choices=["green", "yellow", "rose", "red", "wine"], value="green"),
        gr.Radio(choices=["brown", "blue", "white"], value="brown"),
        gr.Radio(choices=["white", "golden", "red", "silver"], value="white"),
        gr.Markdown(
            """
            👽 Alien - Target Avatar
            """
        ),
        gr.Radio(choices=["slash", "spellcard", "walk"], value="walk"),
        gr.Radio(choices=["violet", "silver", "purple", "grey", "golden"], value="golden"),
        gr.Radio(choices=["grey", "khaki", "linen", "ocre"], value="ocre"),
        gr.Radio(choices=["denim", "olive", "brown"], value="brown"),
    ],
    outputs=[
        gr.components.Image(type="file", label="Domain Disentanglement"),
    ],
    live=False,
    cache_examples=True,
    title="TranSVAE for Unsupervised Video Domain Adaptation",
    description=desc
).launch(share=True)