Update app.py
app.py
CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 
-import argparse
 import cv2
 import imageio
 import math
@@ -29,7 +28,7 @@ class RelationModuleMultiScale(torch.nn.Module):
         self.relations_scales.append(relations_scale)
         self.subsample_scales.append(min(self.subsample_num, len(relations_scale)))
         self.num_frames = num_frames
-        self.fc_fusion_scales = nn.ModuleList()
+        self.fc_fusion_scales = nn.ModuleList()
         for i in range(len(self.scales)):
             scale = self.scales[i]
             fc_fusion = nn.Sequential(nn.ReLU(), nn.Linear(scale * self.img_feature_dim, num_bottleneck), nn.ReLU())
@@ -60,31 +59,6 @@ class RelationModuleMultiScale(torch.nn.Module):
         return list(itertools.combinations([i for i in range(num_frames)], num_frames_relation))
 
 
-parser = argparse.ArgumentParser()
-parser.add_argument('--dataset', default='Sprite', help='datasets')
-parser.add_argument('--data_root', default='dataset', help='root directory for data')
-parser.add_argument('--num_class', type=int, default=15, help='the number of classes for the jester dataset')
-parser.add_argument('--input_type', default='image', choices=['feature', 'image'], help='the type of input')
-parser.add_argument('--src', default='domain_1', help='source domain')
-parser.add_argument('--tar', default='domain_2', help='target domain')
-parser.add_argument('--num_segments', type=int, default=8, help='the number of frame segments')
-parser.add_argument('--backbone', type=str, default="dcgan", choices=['dcgan', 'resnet101', 'I3Dpretrain', 'I3Dfinetune'], help='backbone')
-parser.add_argument('--channels', default=3, type=int, help='input channels for image inputs')
-parser.add_argument('--add_fc', default=1, type=int, metavar='M', help='number of additional fc layers (excluding the last fc layer) (e.g. 0, 1, 2)')
-parser.add_argument('--fc_dim', type=int, default=1024, help='dimension of added fc')
-parser.add_argument('--frame_aggregation', type=str, default='trn', choices=['rnn', 'trn'], help='aggregation of frame features (none if baseline_type is not video)')
-parser.add_argument('--dropout_rate', default=0.5, type=float, help='dropout ratio for frame-level feature (default: 0.5)')
-parser.add_argument('--f_dim', type=int, default=512, help='dim of f')
-parser.add_argument('--z_dim', type=int, default=512, help='dimensionality of z_t')
-parser.add_argument('--f_rnn_layers', type=int, default=1, help='number of layers (content lstm)')
-parser.add_argument('--use_bn', type=str, default='none', choices=['none', 'AdaBN', 'AutoDIAL'], help='normalization-based methods')
-parser.add_argument('--prior_sample', type=str, default='random', choices=['random', 'post'], help='how to sample the prior')
-parser.add_argument('--batch_size', default=128, type=int, help='batch size')
-parser.add_argument('--use_attn', type=str, default='TransAttn', choices=['none', 'TransAttn', 'general'], help='attention mechanism')
-parser.add_argument('--data_threads', type=int, default=5, help='number of data loading threads')
-opt = parser.parse_args(args=[])
-
-
 class GradReverse(Function):
     @staticmethod
     def forward(ctx, x, beta):
@@ -99,157 +73,70 @@ class GradReverse(Function):
 
 class TransferVAE_Video(nn.Module):
 
-    def __init__(self, opt):
+    def __init__(self):
         super(TransferVAE_Video, self).__init__()
-        self.f_dim = opt.f_dim
-        self.z_dim = opt.z_dim
-        self.fc_dim = opt.fc_dim
-        self.channels = opt.channels
-        self.frames = opt.num_segments
-        self.batch_size = opt.batch_size
-        self.input_type = opt.input_type
-        self.frame_aggregation = opt.frame_aggregation
-        self.use_bn = opt.use_bn
-        self.use_attn = opt.use_attn
-        self.dropout_rate = opt.dropout_rate
-        self.num_class = opt.num_class
-        self.prior_sample = opt.prior_sample
+        self.f_dim = 512
+        self.z_dim = 512
+        self.fc_dim = 1024
+        self.channels = 3
+        self.frames = 8
+        self.batch_size = 128
+        self.dropout_rate = 0.5
+        self.num_class = 15
+        self.prior_sample = 'random'
 
-        if self.input_type == 'image':
-            import dcgan_64
-            self.encoder = dcgan_64.encoder(self.fc_dim, self.channels)
-            self.decoder = dcgan_64.decoder_woSkip(self.z_dim + self.f_dim, self.channels)
-            self.fc_output_dim = self.fc_dim
-        elif self.input_type == 'feature':
-            if opt.backbone == 'resnet101':
-                model_backnone = getattr(torchvision.models, opt.backbone)(True) # model_test is only used for getting the dim #
-                self.input_dim = model_backnone.fc.in_features
-            elif opt.backbone == 'I3Dpretrain':
-                self.input_dim = 2048
-            elif opt.backbone == 'I3Dfinetune':
-                self.input_dim = 2048
-            self.add_fc = opt.add_fc
-            self.enc_fc_layer1 = nn.Linear(self.input_dim, self.fc_dim)
-            self.dec_fc_layer1 = nn.Linear(self.fc_dim, self.input_dim)
-            self.fc_output_dim = self.fc_dim
-
-            if self.use_bn == 'shared':
-                self.bn_enc_layer1 = nn.BatchNorm1d(self.fc_output_dim)
-                self.bn_dec_layer1 = nn.BatchNorm1d(self.input_dim)
-            elif self.use_bn == 'separated':
-                self.bn_S_enc_layer1 = nn.BatchNorm1d(self.fc_output_dim)
-                self.bn_T_enc_layer1 = nn.BatchNorm1d(self.fc_output_dim)
-                self.bn_S_dec_layer1 = nn.BatchNorm1d(self.input_dim)
-                self.bn_T_dec_layer1 = nn.BatchNorm1d(self.input_dim)
-
-            if self.add_fc > 1:
-                self.enc_fc_layer2 = nn.Linear(self.fc_dim, self.fc_dim)
-                self.dec_fc_layer2 = nn.Linear(self.fc_dim, self.fc_dim)
-                self.fc_output_dim = self.fc_dim
-                ## use batch normalization or not (if yes, whether source and target share the same batch normalization)
-                if self.use_bn == 'shared':
-                    self.bn_enc_layer2 = nn.BatchNorm1d(self.fc_output_dim)
-                    self.bn_dec_layer2 = nn.BatchNorm1d(self.fc_dim)
-                elif self.use_bn == 'separated':
-                    self.bn_S_enc_layer2 = nn.BatchNorm1d(self.fc_output_dim)
-                    self.bn_T_enc_layer2 = nn.BatchNorm1d(self.fc_output_dim)
-                    self.bn_S_dec_layer2 = nn.BatchNorm1d(self.fc_dim)
-                    self.bn_T_dec_layer2 = nn.BatchNorm1d(self.fc_dim)
-
-            if self.add_fc > 2:
-                self.enc_fc_layer3 = nn.Linear(self.fc_dim, self.fc_dim)
-                self.dec_fc_layer3 = nn.Linear(self.fc_dim, self.fc_dim)
-                self.fc_output_dim = self.fc_dim
-                ## use batch normalization or not (if yes, whether source and target share the same batch normalization)
-                if self.use_bn == 'shared':
-                    self.bn_enc_layer3 = nn.BatchNorm1d(self.fc_output_dim)
-                    self.bn_dec_layer3 = nn.BatchNorm1d(self.fc_dim)
-                elif self.use_bn == 'separated':
-                    self.bn_S_enc_layer3 = nn.BatchNorm1d(self.fc_output_dim)
-                    self.bn_T_enc_layer3 = nn.BatchNorm1d(self.fc_output_dim)
-                    self.bn_S_dec_layer3 = nn.BatchNorm1d(self.fc_dim)
-                    self.bn_T_dec_layer3 = nn.BatchNorm1d(self.fc_dim)
-
-            self.z_2_out = nn.Linear(self.z_dim + self.f_dim, self.fc_output_dim)
+        import dcgan_64
+        self.encoder = dcgan_64.encoder(self.fc_dim, self.channels)
+        self.decoder = dcgan_64.decoder_woSkip(self.z_dim + self.f_dim, self.channels)
+        self.fc_output_dim = self.fc_dim
 
-
-        ## nonlinearity and dropout
         self.relu = nn.LeakyReLU(0.1)
         self.dropout_f = nn.Dropout(p=self.dropout_rate)
         self.dropout_v = nn.Dropout(p=self.dropout_rate)
-        # -------------------------------
 
-
-
-        #self.hidden_dim = opt.rnn_size
-        self.hidden_dim = opt.z_dim
-        self.f_rnn_layers = opt.f_rnn_layers
+        self.hidden_dim = 512
+        self.f_rnn_layers = 1
 
-        # Prior of content is a uniform Gaussian and prior of the dynamics is an LSTM
         self.z_prior_lstm_ly1 = nn.LSTMCell(self.z_dim, self.hidden_dim)
         self.z_prior_lstm_ly2 = nn.LSTMCell(self.hidden_dim, self.hidden_dim)
 
         self.z_prior_mean = nn.Linear(self.hidden_dim, self.z_dim)
         self.z_prior_logvar = nn.Linear(self.hidden_dim, self.z_dim)
 
-        # POSTERIOR DISTRIBUTION NETWORKS
-        # content and motion features share one lstm
         self.z_lstm = nn.LSTM(self.fc_output_dim, self.hidden_dim, self.f_rnn_layers, bidirectional=True, batch_first=True)
         self.f_mean = nn.Linear(self.hidden_dim * 2, self.f_dim)
         self.f_logvar = nn.Linear(self.hidden_dim * 2, self.f_dim)
 
         self.z_rnn = nn.RNN(self.hidden_dim * 2, self.hidden_dim, batch_first=True)
-        # Each timestep is for each z so no reshaping and feature mixing
         self.z_mean = nn.Linear(self.hidden_dim, self.z_dim)
         self.z_logvar = nn.Linear(self.hidden_dim, self.z_dim)
-        # -------------------------------
 
-        ## z_t constraints
-        # -------------------------------
-        ## adversarial loss for frame features z_t
         self.fc_feature_domain_frame = nn.Linear(self.z_dim, self.z_dim)
         self.fc_classifier_domain_frame = nn.Linear(self.z_dim, 2)
 
-        ## aggregate frame-based features to video-based features
-        if self.frame_aggregation == 'rnn':
-            self.bilstm = nn.LSTM(self.z_dim, self.z_dim, self.f_rnn_layers, bidirectional=True, batch_first=True)
-            self.feat_aggregated_dim = 2 * self.z_dim
-        elif self.frame_aggregation == 'trn':
-            self.num_bottleneck = 256 # 256
-            self.TRN = RelationModuleMultiScale(self.z_dim, self.num_bottleneck, self.frames)
-            self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
-            self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
-            self.feat_aggregated_dim = self.num_bottleneck
+        self.num_bottleneck = 256
+        self.TRN = RelationModuleMultiScale(self.z_dim, self.num_bottleneck, self.frames)
+        self.bn_trn_S = nn.BatchNorm1d(self.num_bottleneck)
+        self.bn_trn_T = nn.BatchNorm1d(self.num_bottleneck)
+        self.feat_aggregated_dim = self.num_bottleneck
 
-        ## adversarial loss for video features
         self.fc_feature_domain_video = nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim)
         self.fc_classifier_domain_video = nn.Linear(self.feat_aggregated_dim, 2)
 
-        ## adversarial loss for each relation
-        if self.frame_aggregation == 'trn':
-            self.relation_domain_classifier_all = nn.ModuleList()
-            for i in range(self.frames-1):
-                relation_domain_classifier = nn.Sequential(
-                    nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
-                    nn.ReLU(),
-                    nn.Linear(self.feat_aggregated_dim, 2)
-                )
-                self.relation_domain_classifier_all += [relation_domain_classifier]
+        self.relation_domain_classifier_all = nn.ModuleList()
+        for i in range(self.frames-1):
+            relation_domain_classifier = nn.Sequential(
+                nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
+                nn.ReLU(),
+                nn.Linear(self.feat_aggregated_dim, 2)
+            )
+            self.relation_domain_classifier_all += [relation_domain_classifier]
 
-        ## classifier for action prediction task
         self.pred_classifier_video = nn.Linear(self.feat_aggregated_dim, self.num_class)
-
-        ## classifier for prediction domains
+
         self.fc_feature_domain_latent = nn.Linear(self.f_dim, self.f_dim)
         self.fc_classifier_doamin_latent = nn.Linear(self.f_dim, 2)
-
-        ## attention option
-        if self.use_attn == 'general':
-            self.attn_layer = nn.Sequential(
-                nn.Linear(self.feat_aggregated_dim, self.feat_aggregated_dim),
-                nn.Tanh(),
-                nn.Linear(self.feat_aggregated_dim, 1)
-            )
+
 
     def domain_classifier_frame(self, feat, beta):
         feat_fc_domain_frame = GradReverse.apply(feat, beta)
@@ -258,6 +145,7 @@ class TransferVAE_Video(nn.Module):
         pred_fc_domain_frame = self.fc_classifier_domain_frame(feat_fc_domain_frame)
         return pred_fc_domain_frame
 
+
     def domain_classifier_video(self, feat_video, beta):
         feat_fc_domain_video = GradReverse.apply(feat_video, beta)
         feat_fc_domain_video = self.fc_feature_domain_video(feat_fc_domain_video)
@@ -265,17 +153,19 @@ class TransferVAE_Video(nn.Module):
         pred_fc_domain_video = self.fc_classifier_domain_video(feat_fc_domain_video)
         return pred_fc_domain_video
 
+
     def domain_classifier_latent(self, f):
         feat_fc_domain_latent = self.fc_feature_domain_latent(f)
         feat_fc_domain_latent = self.relu(feat_fc_domain_latent)
         pred_fc_domain_latent = self.fc_classifier_doamin_latent(feat_fc_domain_latent)
         return pred_fc_domain_latent
 
+
     def domain_classifier_relation(self, feat_relation, beta):
         pred_fc_domain_relation_video = None
         for i in range(len(self.relation_domain_classifier_all)):
-            feat_relation_single = feat_relation[:,i,:].squeeze(1)
-            feat_fc_domain_relation_single = GradReverse.apply(feat_relation_single, beta)
+            feat_relation_single = feat_relation[:,i,:].squeeze(1)
+            feat_fc_domain_relation_single = GradReverse.apply(feat_relation_single, beta)
 
             pred_fc_domain_relation_single = self.relation_domain_classifier_all[i](feat_fc_domain_relation_single)
 
@@ -288,6 +178,7 @@ class TransferVAE_Video(nn.Module):
 
         return pred_fc_domain_relation_video
 
+
     def get_trans_attn(self, pred_domain):
         softmax = nn.Softmax(dim=1)
         logsoftmax = nn.LogSoftmax(dim=1)
@@ -295,6 +186,7 @@ class TransferVAE_Video(nn.Module):
         weights = 1 - entropy
         return weights
 
+
     def get_general_attn(self, feat):
         num_segments = feat.size()[1]
         feat = feat.view(-1, feat.size()[-1]) # reshape features: 128x4x256 --> (128x4)x256
@@ -303,15 +195,11 @@ class TransferVAE_Video(nn.Module):
         weights = F.softmax(weights, dim=1) # softmax over segments ==> 128x4x1
         return weights
 
+
     def get_attn_feat_relation(self, feat_fc, pred_domain, num_segments):
-        if self.use_attn == 'TransAttn':
-            weights_attn = self.get_trans_attn(pred_domain)
-        elif self.use_attn == 'general':
-            weights_attn = self.get_general_attn(feat_fc)
-
+        weights_attn = self.get_trans_attn(pred_domain)
         weights_attn = weights_attn.view(-1, num_segments-1, 1).repeat(1,1,feat_fc.size()[-1]) # reshape & repeat weights (e.g. 16 x 4 x 256)
         feat_fc_attn = (weights_attn+1) * feat_fc
-
         return feat_fc_attn, weights_attn[:,:,0]
 
 
@@ -357,94 +245,18 @@ class TransferVAE_Video(nn.Module):
             f_post = f_post_list
         # f_mean and f_post are list if triple else not
         return f_mean, f_logvar, f_post, z_mean, z_logvar, z_post
+
 
     def decoder_frame(self,zf):
-        if self.input_type == 'image':
-            recon_x = self.decoder(zf)
-            return recon_x
-
-        if self.input_type == 'feature':
-            zf = self.z_2_out(zf) # batch,frames,(z_dim+f_dim) -> batch,frames,fc_output_dim
-            zf = self.relu(zf)
-
-            if self.add_fc > 2:
-                zf = self.dec_fc_layer3(zf)
-                if self.use_bn == 'shared':
-                    zf = self.bn_dec_layer3(zf)
-                elif self.use_bn == 'separated':
-                    zf_src = self.bn_S_dec_layer3(zf[:self.batchsize,:,:])
-                    zf_tar = self.bn_T_dec_layer3(zf[self.batchsize:,:,:])
-                    zf = torch.cat([zf_src,zf_tar],axis=0)
-                zf = self.relu(zf)
-
-            if self.add_fc > 1:
-                zf = self.dec_fc_layer2(zf)
-                if self.use_bn == 'shared':
-                    zf = self.bn_dec_layer2(zf)
-                elif self.use_bn == 'separated':
-                    zf_src = self.bn_S_dec_layer2(zf[:self.batchsize,:,:])
-                    zf_tar = self.bn_T_dec_layer2(zf[self.batchsize:,:,:])
-                    zf = torch.cat([zf_src,zf_tar],axis=0)
-                zf = self.relu(zf)
-
-
-            zf = self.dec_fc_layer1(zf)
-            if self.use_bn == 'shared':
-                zf = self.bn_dec_layer2(zf)
-            elif self.use_bn == 'separated':
-                zf_src = self.bn_S_dec_layer2(zf[:self.batchsize,:,:])
-                zf_tar = self.bn_T_dec_layer2(zf[self.batchsize:,:,:])
-                zf = torch.cat([zf_src,zf_tar],axis=0)
-            recon_x = self.relu(zf)
-            return recon_x
+        recon_x = self.decoder(zf)
+        return recon_x
+
 
     def encoder_frame(self, x):
-
-        # input x is [batch_size, frames, channels, size, size]
-        if self.input_type == 'image':
-            # reshape to [batch_size*frames, channels, size, size]
-            x_shape = x.shape
-            x = x.view(-1, x_shape[-3], x_shape[-2], x_shape[-1])
-            x_embed = self.encoder(x)[0]
-            # to [batch_size,frames,embed_dim]
-
-            return x_embed.view(x_shape[0], x_shape[1], -1)
-
-
-        if self.input_type == 'feature':
-            # input is [batchsize, frames, input_dim]
-            x_embed = self.enc_fc_layer1(x)
-            ## use batch normalization or not (if yes, whether source and target share the same batch normalization)
-            if self.use_bn == 'shared':
-                x_embed = self.bn_enc_layer1(x_embed)
-            elif self.use_bn == 'separated':
-                x_embed_src = self.bn_S_enc_layer1(x_embed[:self.batchsize,:,:])
-                x_embed_tar = self.bn_T_enc_layer1(x_embed[self.batchsize:,:,:])
-                x_embed = torch.cat([x_embed_src,x_embed_tar],axis=0)
-            x_embed = self.relu(x_embed)
-
-            if self.add_fc > 1:
-                x_embed = self.enc_fc_layer2(x_embed)
-                if self.use_bn == 'shared':
-                    x_embed = self.bn_enc_layer2(x_embed)
-                elif self.use_bn == 'separated':
-                    x_embed_src = self.bn_S_enc_layer2(x_embed[:self.batchsize,:,:])
-                    x_embed_tar = self.bn_T_enc_layer2(x_embed[self.batchsize:,:,:])
-                    x_embed = torch.cat([x_embed_src,x_embed_tar],axis=0)
-                x_embed = self.relu(x_embed)
-
-            if self.add_fc > 2:
-                x_embed = self.enc_fc_layer3(x_embed)
-                if self.use_bn == 'shared':
-                    x_embed = self.bn_enc_layer3(x_embed)
-                elif self.use_bn == 'separated':
-                    x_embed_src = self.bn_S_enc_layer3(x_embed[:self.batchsize,:,:])
-                    x_embed_tar = self.bn_T_enc_layer3(x_embed[self.batchsize:,:,:])
-                    x_embed = torch.cat([x_embed_src,x_embed_tar],axis=0)
-                x_embed = self.relu(x_embed)
-
-            ## [batchsize, frame, output_dim]
-            return x_embed
+        x_shape = x.shape
+        x = x.view(-1, x_shape[-3], x_shape[-2], x_shape[-1])
+        x_embed = self.encoder(x)[0]
+        return x_embed.view(x_shape[0], x_shape[1], -1)
 
 
     def reparameterize(self, mean, logvar, random_sampling=True):
@@ -458,7 +270,7 @@ class TransferVAE_Video(nn.Module):
         return mean
 
     def sample_z_prior_train(self, z_post, random_sampling=True):
-        z_out = None
+        z_out = None
        z_means = None
         z_logvars = None
         batch_size = z_post.shape[0]
@@ -526,77 +338,17 @@ class TransferVAE_Video(nn.Module):
         return z_means, z_logvars, z_out
 
     def forward(self, x, beta):
-
-        f_mean, f_logvar, f_post, z_mean_post, z_logvar_post, z_post = self.encode_and_sample_post(x)
-        if self.prior_sample == 'random':
-            z_mean_prior, z_logvar_prior, z_prior = self.sample_z(z_post.size(0),random_sampling=False)
-        elif self.prior_sample == 'post':
-            z_mean_prior, z_logvar_prior, z_prior = self.sample_z_prior_train(z_post, random_sampling=False)
-
+        _, _, f_post, _, _, z_post = self.encode_and_sample_post(x)
 
         if isinstance(f_post, list):
             f_expand = f_post[0].unsqueeze(1).expand(-1, self.frames, self.f_dim)
         else:
             f_expand = f_post.unsqueeze(1).expand(-1, self.frames, self.f_dim)
-        zf = torch.cat((z_post, f_expand), dim=2)
+        zf = torch.cat((z_post, f_expand), dim=2)
 
-        ## reconstruct x
         recon_x = self.decoder_frame(zf)
 
-
-        pred_domain_all = [] # list save domain predictions (1) z_post (frame level) (2) each z_post_relation (if trn) (3) z_post (video level) (4) f_post
-
-        #1. adversarial on z_post (frame level)
-        z_post_feat = z_post.view(-1, z_post.size()[-1]) # e.g. 32 x 5 x 2048 --> 160 x 2048
-        z_post_feat = self.dropout_f(z_post_feat)
-        pred_fc_domain_frame = self.domain_classifier_frame(z_post_feat, beta[2])
-        pred_fc_domain_frame = pred_fc_domain_frame.view((z_post.size(0), self.frames) + pred_fc_domain_frame.size()[-1:])
-        pred_domain_all.append(pred_fc_domain_frame)
-
-        #2. adversarial on z_post (video level, relation level if trn is used)
-
-        if self.frame_aggregation == 'rnn':
-            self.bilstm.flatten_parameters()
-            z_post_video_feat, _ = self.bilstm(z_post)
-            backward = z_post_video_feat[:, 0, self.z_dim:2 * self.z_dim]
-            frontal = z_post_video_feat[:, self.frames - 1, 0:self.z_dim]
-            z_post_video_feat = torch.cat((frontal, backward), dim=1)
-            pred_fc_domain_relation = []
-            pred_domain_all.append(pred_fc_domain_relation)
-
-        elif self.frame_aggregation == 'trn':
-            z_post_video_relation = self.TRN(z_post) ## [batch, frame-1, self.feat_aggregated_dim]
-
-            # adversarial branch for each relation
-            pred_fc_domain_relation = self.domain_classifier_relation(z_post_video_relation, beta[0])
-            pred_domain_all.append(pred_fc_domain_relation.view((z_post.size(0), z_post_video_relation.size()[1]) + pred_fc_domain_relation.size()[-1:]))
-
-            # transferable attention
-            if self.use_attn != 'none': # get the attention weighting
-                z_post_video_relation_attn, _ = self.get_attn_feat_relation(z_post_video_relation, pred_fc_domain_relation, self.frames)
-
-            # sum up relation features (ignore 1-relation)
-            z_post_video_feat = torch.sum(z_post_video_relation_attn, 1)
-
-
-        z_post_video_feat = self.dropout_v(z_post_video_feat)
-
-        pred_fc_domain_video = self.domain_classifier_video(z_post_video_feat, beta[1])
-        pred_fc_domain_video = pred_fc_domain_video.view((z_post.size(0),) + pred_fc_domain_video.size()[-1:])
-        pred_domain_all.append(pred_fc_domain_video)
-
-
-        #3. video prediction
-        pred_video_class = self.pred_classifier_video(z_post_video_feat)
-
-        #4. domain prediction on f
-        if isinstance(f_post, list):
-            pred_fc_domain_latent = self.domain_classifier_latent(f_post[0])
-        else:
-            pred_fc_domain_latent = self.domain_classifier_latent(f_post)
-        pred_domain_all.append(pred_fc_domain_latent)
-
-        return f_mean, f_logvar, f_post, z_mean_post, z_logvar_post, z_post, z_mean_prior, z_logvar_prior, z_prior, recon_x, pred_domain_all, pred_video_class
+        return f_post, z_post, recon_x
 
 
 def name2seq(file_name):
@@ -700,6 +452,12 @@ def MyPlot(frame_id, src_orig, tar_orig, src_recon, tar_recon, src_Zt, tar_Zt, s
     plt.savefig(save_name, dpi=200, format='png', bbox_inches='tight', pad_inches=0.0)
 
 
+# == Load Model ==
+model = TransferVAE_Video()
+model.load_state_dict(torch.load('TransferVAE.pth.tar', map_location=torch.device('cpu'))['state_dict'])
+model.eval()
+
+
 def run(domain_source, action_source, hair_source, top_source, bottom_source, domain_target, action_target, hair_target, top_target, bottom_target):
 
     # == Source Avatar ==
@@ -760,15 +518,9 @@ def run(domain_source, action_source, hair_source, top_source, bottom_source, do
     x = torch.cat((images_source, images_target), dim=0)
 
 
-    # == Load Model ==
-    model = TransferVAE_Video(opt)
-    model.load_state_dict(torch.load('TransferVAE.pth.tar', map_location=torch.device('cpu'))['state_dict'])
-    model.eval()
-
-
     # == Forward ==
     with torch.no_grad():
-        f_mean, f_logvar, f_post, z_mean_post, z_logvar_post, z_post, z_mean_prior, z_logvar_prior, z_prior, recon_x, pred_domain_all, pred_video_class = model(x, [0]*3)
+        f_post, z_post, recon_x = model(x, [0]*3)
 
     src_orig_sample = x[0, :, :, :, :]
     src_recon_sample = recon_x[0, :, :, :, :]
@@ -824,12 +576,12 @@ def run(domain_source, action_source, hair_source, top_source, bottom_source, do
 gr.Interface(
     run,
     inputs=[
-        gr.Textbox(value="Source Avatar - Human", interactive=False),
+        gr.Textbox(value="Source Avatar - Human", show_label=False, interactive=False),
         gr.Radio(choices=["slash", "spellcard", "walk"], value="slash"),
         gr.Radio(choices=["green", "yellow", "rose", "red", "wine"], value="green"),
         gr.Radio(choices=["brown", "blue", "white"], value="brown"),
         gr.Radio(choices=["white", "golden", "red", "silver"], value="white"),
-        gr.Textbox(value="Target Avatar - Alien", interactive=False),
+        gr.Textbox(value="Target Avatar - Alien", show_label=False, interactive=False),
         gr.Radio(choices=["slash", "spellcard", "walk"], value="walk"),
         gr.Radio(choices=["violet", "silver", "purple", "grey", "golden"], value="golden"),
         gr.Radio(choices=["grey", "khaki", "linen", "ocre"], value="ocre"),
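For quick sanity-checking of this commit, the refactored forward path can be exercised end to end with the short sketch below. It is illustrative and not part of the diff: it assumes dcgan_64.py and the TransferVAE.pth.tar checkpoint sit next to app.py (as in this Space), that inputs are 8-frame clips of 3x64x64 images matching the hardcoded hyperparameters, and that strict=False is acceptable if the checkpoint still carries keys for layers the slimmed-down module no longer defines. The random input tensor is a stand-in for the sprite frames the app actually loads.

# Smoke test for the slimmed-down TransferVAE_Video (illustrative sketch).
import torch

model = TransferVAE_Video()  # hyperparameters are now hardcoded in __init__
state = torch.load('TransferVAE.pth.tar', map_location=torch.device('cpu'))
# strict=False is an assumption: the checkpoint may contain keys for layers
# (e.g. the feature-input fc stack) that the refactored module dropped.
model.load_state_dict(state['state_dict'], strict=False)
model.eval()

# Dummy source/target pair: 2 clips x 8 frames x 3 channels x 64x64 pixels.
x = torch.randn(2, 8, 3, 64, 64)
with torch.no_grad():
    f_post, z_post, recon_x = model(x, [0] * 3)  # beta is unused after the refactor

print(recon_x.shape)  # expected to mirror x: torch.Size([2, 8, 3, 64, 64])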