shadowcun committed
Commit
cdf3959
•
1 Parent(s): 9ab094a

new version of sadtalker

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. src/__pycache__/generate_batch.cpython-38.pyc +0 -0
  2. src/__pycache__/generate_facerender_batch.cpython-38.pyc +0 -0
  3. src/__pycache__/test_audio2coeff.cpython-38.pyc +0 -0
  4. src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc +0 -0
  5. src/audio2exp_models/__pycache__/networks.cpython-38.pyc +0 -0
  6. src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc +0 -0
  7. src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc +0 -0
  8. src/audio2pose_models/__pycache__/cvae.cpython-38.pyc +0 -0
  9. src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc +0 -0
  10. src/audio2pose_models/__pycache__/networks.cpython-38.pyc +0 -0
  11. src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc +0 -0
  12. src/{src/config → config}/facerender_pirender.yaml +0 -0
  13. src/face3d/models/__pycache__/__init__.cpython-38.pyc +0 -0
  14. src/face3d/models/__pycache__/base_model.cpython-38.pyc +0 -0
  15. src/face3d/models/__pycache__/networks.cpython-38.pyc +0 -0
  16. src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc +0 -0
  17. src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc +0 -0
  18. src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc +0 -0
  19. src/face3d/util/__pycache__/__init__.cpython-38.pyc +0 -0
  20. src/face3d/util/__pycache__/load_mats.cpython-38.pyc +0 -0
  21. src/face3d/util/__pycache__/preprocess.cpython-38.pyc +0 -0
  22. src/facerender/__pycache__/animate.cpython-38.pyc +0 -0
  23. src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc +0 -0
  24. src/facerender/modules/__pycache__/generator.cpython-38.pyc +0 -0
  25. src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc +0 -0
  26. src/facerender/modules/__pycache__/make_animation.cpython-38.pyc +0 -0
  27. src/facerender/modules/__pycache__/mapping.cpython-38.pyc +0 -0
  28. src/facerender/modules/__pycache__/util.cpython-38.pyc +0 -0
  29. src/{src/facerender → facerender}/pirender/base_function.py +0 -0
  30. src/{src/facerender → facerender}/pirender/config.py +0 -0
  31. src/{src/facerender → facerender}/pirender/face_model.py +0 -0
  32. src/{src/facerender → facerender}/pirender_animate.py +0 -0
  33. src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc +0 -0
  34. src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc +0 -0
  35. src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc +0 -0
  36. src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc +0 -0
  37. src/generate_facerender_batch.py +3 -4
  38. src/gradio_demo.py +21 -6
  39. src/src/audio2exp_models/audio2exp.py +0 -41
  40. src/src/audio2exp_models/networks.py +0 -74
  41. src/src/audio2pose_models/audio2pose.py +0 -94
  42. src/src/audio2pose_models/audio_encoder.py +0 -64
  43. src/src/audio2pose_models/cvae.py +0 -149
  44. src/src/audio2pose_models/discriminator.py +0 -76
  45. src/src/audio2pose_models/networks.py +0 -140
  46. src/src/audio2pose_models/res_unet.py +0 -65
  47. src/src/config/auido2exp.yaml +0 -58
  48. src/src/config/auido2pose.yaml +0 -49
  49. src/src/config/facerender.yaml +0 -45
  50. src/src/config/facerender_still.yaml +0 -45
src/__pycache__/generate_batch.cpython-38.pyc DELETED
Binary file (3.49 kB)
src/__pycache__/generate_facerender_batch.cpython-38.pyc DELETED
Binary file (4.06 kB)
src/__pycache__/test_audio2coeff.cpython-38.pyc DELETED
Binary file (3.91 kB)
src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc DELETED
Binary file (1.28 kB)
src/audio2exp_models/__pycache__/networks.cpython-38.pyc DELETED
Binary file (2.14 kB)
src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc DELETED
Binary file (2.86 kB)
src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc DELETED
Binary file (2.17 kB)
src/audio2pose_models/__pycache__/cvae.cpython-38.pyc DELETED
Binary file (4.69 kB)
src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc DELETED
Binary file (2.45 kB)
src/audio2pose_models/__pycache__/networks.cpython-38.pyc DELETED
Binary file (4.74 kB)
src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc DELETED
Binary file (1.91 kB)
src/{src/config → config}/facerender_pirender.yaml RENAMED
File without changes
src/face3d/models/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (3.27 kB)
src/face3d/models/__pycache__/base_model.cpython-38.pyc DELETED
Binary file (12.5 kB)
src/face3d/models/__pycache__/networks.cpython-38.pyc DELETED
Binary file (17.1 kB)
src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (861 Bytes)
src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc DELETED
Binary file (5.43 kB)
src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc DELETED
Binary file (5.49 kB)
src/face3d/util/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (294 Bytes)
src/face3d/util/__pycache__/load_mats.cpython-38.pyc DELETED
Binary file (2.95 kB)
src/face3d/util/__pycache__/preprocess.cpython-38.pyc DELETED
Binary file (3.34 kB)
src/facerender/__pycache__/animate.cpython-38.pyc DELETED
Binary file (6.91 kB)
src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc DELETED
Binary file (3.92 kB)
src/facerender/modules/__pycache__/generator.cpython-38.pyc DELETED
Binary file (6.59 kB)
src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc DELETED
Binary file (4.83 kB)
src/facerender/modules/__pycache__/make_animation.cpython-38.pyc DELETED
Binary file (4.76 kB)
src/facerender/modules/__pycache__/mapping.cpython-38.pyc DELETED
Binary file (1.69 kB)
src/facerender/modules/__pycache__/util.cpython-38.pyc DELETED
Binary file (17.2 kB)
src/{src/facerender → facerender}/pirender/base_function.py RENAMED
File without changes
src/{src/facerender → facerender}/pirender/config.py RENAMED
File without changes
src/{src/facerender → facerender}/pirender/face_model.py RENAMED
File without changes
src/{src/facerender → facerender}/pirender_animate.py RENAMED
File without changes
src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (403 Bytes)
src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc DELETED
Binary file (12.9 kB)
src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc DELETED
Binary file (4.84 kB)
src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc DELETED
Binary file (3.49 kB)
src/generate_facerender_batch.py CHANGED
@@ -7,7 +7,7 @@ import scipy.io as scio
 
 def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path,
                         batch_size, input_yaw_list=None, input_pitch_list=None, input_roll_list=None,
-                        expression_scale=1.0, still_mode = False, preprocess='crop', size = 256):
+                        expression_scale=1.0, still_mode = False, preprocess='crop', size = 256, facemodel='facevid2vid'):
 
     semantic_radius = 13
     video_name = os.path.splitext(os.path.split(coeff_path)[-1])[0]
@@ -27,10 +27,9 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path,
     source_semantics_dict = scio.loadmat(first_coeff_path)
     generated_dict = scio.loadmat(coeff_path)
 
-    if 'full' not in preprocess.lower():
+    if 'full' not in preprocess.lower() and facemodel != 'pirender':
         source_semantics = source_semantics_dict['coeff_3dmm'][:1,:70]  #1 70
         generated_3dmm = generated_dict['coeff_3dmm'][:,:70]
-
     else:
         source_semantics = source_semantics_dict['coeff_3dmm'][:1,:73]  #1 70
         generated_3dmm = generated_dict['coeff_3dmm'][:,:70]
@@ -43,7 +42,7 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path,
     # target
     generated_3dmm[:, :64] = generated_3dmm[:, :64] * expression_scale
 
-    if 'full' in preprocess.lower():
+    if 'full' in preprocess.lower() or facemodel == 'pirender':
         generated_3dmm = np.concatenate([generated_3dmm, np.repeat(source_semantics[:,70:], generated_3dmm.shape[0], axis=0)], axis=1)
 
     if still_mode:
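A condensed, illustrative restatement of the change above (not part of the commit; names mirror the function body): the new facemodel flag now joins the preprocess mode in deciding whether the wider 73-column coefficient slice is used, so PIRender takes the same path that 'full' preprocessing already did.

# Illustrative only: equivalent condition for selecting the wider coefficient slice.
use_73 = ('full' in preprocess.lower()) or (facemodel == 'pirender')
source_semantics = source_semantics_dict['coeff_3dmm'][:1, :73 if use_73 else 70]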
src/gradio_demo.py CHANGED
@@ -1,5 +1,6 @@
 import torch, uuid
-import os, sys, shutil
+import os, sys, shutil, platform
+from src.facerender.pirender_animate import AnimateFromCoeff_PIRender
 from src.utils.preprocess import CropAndExtract
 from src.test_audio2coeff import Audio2Coeff
 from src.facerender.animate import AnimateFromCoeff
@@ -20,8 +21,10 @@ class SadTalker():
 
     def __init__(self, checkpoint_path='checkpoints', config_path='src/config', lazy_load=False):
 
-        if torch.cuda.is_available() :
+        if torch.cuda.is_available():
            device = "cuda"
+        elif platform.system() == 'Darwin': # macos
+            device = "mps"
         else:
             device = "cpu"
 
@@ -35,7 +38,9 @@ class SadTalker():
 
     def test(self, source_image, driven_audio, preprocess='crop',
         still_mode=False, use_enhancer=False, batch_size=1, size=256,
-        pose_style = 0, exp_scale=1.0,
+        pose_style = 0,
+        facerender='facevid2vid',
+        exp_scale=1.0,
         use_ref_video = False,
         ref_video = None,
         ref_info = None,
@@ -48,7 +53,15 @@ class SadTalker():
 
         self.audio_to_coeff = Audio2Coeff(self.sadtalker_paths, self.device)
         self.preprocess_model = CropAndExtract(self.sadtalker_paths, self.device)
-        self.animate_from_coeff = AnimateFromCoeff(self.sadtalker_paths, self.device)
+
+        if facerender == 'facevid2vid' and self.device != 'mps':
+            self.animate_from_coeff = AnimateFromCoeff(self.sadtalker_paths, self.device)
+        elif facerender == 'pirender' or self.device == 'mps':
+            self.animate_from_coeff = AnimateFromCoeff_PIRender(self.sadtalker_paths, self.device)
+            facerender = 'pirender'
+        else:
+            raise(RuntimeError('Unknown model: {}'.format(facerender)))
+
 
         time_tag = str(uuid.uuid4())
         save_dir = os.path.join(result_dir, time_tag)
@@ -131,11 +144,13 @@ class SadTalker():
         if use_ref_video and ref_info == 'all':
             coeff_path = ref_video_coeff_path # self.audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
         else:
-            batch = get_data(first_coeff_path, audio_path, self.device, ref_eyeblink_coeff_path=ref_eyeblink_coeff_path, still=still_mode, idlemode=use_idle_mode, length_of_audio=length_of_audio, use_blink=use_blink) # longer audio?
+            batch = get_data(first_coeff_path, audio_path, self.device, ref_eyeblink_coeff_path=ref_eyeblink_coeff_path, still=still_mode, \
+                             idlemode=use_idle_mode, length_of_audio=length_of_audio, use_blink=use_blink) # longer audio?
             coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
 
         #coeff2video
-        data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode, preprocess=preprocess, size=size, expression_scale = exp_scale)
+        data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode, \
+                                   preprocess=preprocess, size=size, expression_scale = exp_scale, facemodel=facerender)
         return_path = self.animate_from_coeff.generate(data, save_dir, pic_path, crop_info, enhancer='gfpgan' if use_enhancer else None, preprocess=preprocess, img_size=size)
         video_name = data['video_name']
         print(f'The generated video is named {video_name} in {save_dir}')
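For context, a hypothetical call into the updated demo API showing how the new facerender switch is passed through; the file paths below are placeholders, not taken from this commit.

# Hypothetical usage sketch; paths are placeholders.
from src.gradio_demo import SadTalker

sad_talker = SadTalker(checkpoint_path='checkpoints', config_path='src/config')
sad_talker.test(
    source_image='path/to/portrait.png',   # placeholder input image
    driven_audio='path/to/speech.wav',     # placeholder driving audio
    preprocess='crop',
    still_mode=False,
    use_enhancer=False,
    batch_size=1,
    size=256,
    pose_style=0,
    facerender='pirender',                 # new argument; 'facevid2vid' remains the default
    exp_scale=1.0,
)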
src/src/audio2exp_models/audio2exp.py DELETED
@@ -1,41 +0,0 @@
-from tqdm import tqdm
-import torch
-from torch import nn
-
-
-class Audio2Exp(nn.Module):
-    def __init__(self, netG, cfg, device, prepare_training_loss=False):
-        super(Audio2Exp, self).__init__()
-        self.cfg = cfg
-        self.device = device
-        self.netG = netG.to(device)
-
-    def test(self, batch):
-
-        mel_input = batch['indiv_mels']  # bs T 1 80 16
-        bs = mel_input.shape[0]
-        T = mel_input.shape[1]
-
-        exp_coeff_pred = []
-
-        for i in tqdm(range(0, T, 10),'audio2exp:'):  # every 10 frames
-
-            current_mel_input = mel_input[:,i:i+10]
-
-            #ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1))  #bs T 64
-            ref = batch['ref'][:, :, :64][:, i:i+10]
-            ratio = batch['ratio_gt'][:, i:i+10]  #bs T
-
-            audiox = current_mel_input.view(-1, 1, 80, 16)  # bs*T 1 80 16
-
-            curr_exp_coeff_pred = self.netG(audiox, ref, ratio)  # bs T 64
-
-            exp_coeff_pred += [curr_exp_coeff_pred]
-
-        # BS x T x 64
-        results_dict = {
-            'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1)
-            }
-        return results_dict
-
-
src/src/audio2exp_models/networks.py DELETED
@@ -1,74 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-class Conv2d(nn.Module):
-    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, use_act = True, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.conv_block = nn.Sequential(
-            nn.Conv2d(cin, cout, kernel_size, stride, padding),
-            nn.BatchNorm2d(cout)
-        )
-        self.act = nn.ReLU()
-        self.residual = residual
-        self.use_act = use_act
-
-    def forward(self, x):
-        out = self.conv_block(x)
-        if self.residual:
-            out += x
-
-        if self.use_act:
-            return self.act(out)
-        else:
-            return out
-
-class SimpleWrapperV2(nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-        self.audio_encoder = nn.Sequential(
-            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
-            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
-            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
-            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
-            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
-            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
-            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
-            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
-            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),
-        )
-
-        #### load the pre-trained audio_encoder
-        #self.audio_encoder = self.audio_encoder.to(device)
-        '''
-        wav2lip_state_dict = torch.load('/apdcephfs_cq2/share_1290939/wenxuazhang/checkpoints/wav2lip.pth')['state_dict']
-        state_dict = self.audio_encoder.state_dict()
-
-        for k,v in wav2lip_state_dict.items():
-            if 'audio_encoder' in k:
-                print('init:', k)
-                state_dict[k.replace('module.audio_encoder.', '')] = v
-        self.audio_encoder.load_state_dict(state_dict)
-        '''
-
-        self.mapping1 = nn.Linear(512+64+1, 64)
-        #self.mapping2 = nn.Linear(30, 64)
-        #nn.init.constant_(self.mapping1.weight, 0.)
-        nn.init.constant_(self.mapping1.bias, 0.)
-
-    def forward(self, x, ref, ratio):
-        x = self.audio_encoder(x).view(x.size(0), -1)
-        ref_reshape = ref.reshape(x.size(0), -1)
-        ratio = ratio.reshape(x.size(0), -1)
-
-        y = self.mapping1(torch.cat([x, ref_reshape, ratio], dim=1))
-        out = y.reshape(ref.shape[0], ref.shape[1], -1) #+ ref # resudial
-        return out
src/src/audio2pose_models/audio2pose.py DELETED
@@ -1,94 +0,0 @@
-import torch
-from torch import nn
-from src.audio2pose_models.cvae import CVAE
-from src.audio2pose_models.discriminator import PoseSequenceDiscriminator
-from src.audio2pose_models.audio_encoder import AudioEncoder
-
-class Audio2Pose(nn.Module):
-    def __init__(self, cfg, wav2lip_checkpoint, device='cuda'):
-        super().__init__()
-        self.cfg = cfg
-        self.seq_len = cfg.MODEL.CVAE.SEQ_LEN
-        self.latent_dim = cfg.MODEL.CVAE.LATENT_SIZE
-        self.device = device
-
-        self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device)
-        self.audio_encoder.eval()
-        for param in self.audio_encoder.parameters():
-            param.requires_grad = False
-
-        self.netG = CVAE(cfg)
-        self.netD_motion = PoseSequenceDiscriminator(cfg)
-
-
-    def forward(self, x):
-
-        batch = {}
-        coeff_gt = x['gt'].cuda().squeeze(0)  #bs frame_len+1 73
-        batch['pose_motion_gt'] = coeff_gt[:, 1:, 64:70] - coeff_gt[:, :1, 64:70]  #bs frame_len 6
-        batch['ref'] = coeff_gt[:, 0, 64:70]  #bs 6
-        batch['class'] = x['class'].squeeze(0).cuda()  # bs
-        indiv_mels= x['indiv_mels'].cuda().squeeze(0)  # bs seq_len+1 80 16
-
-        # forward
-        audio_emb_list = []
-        audio_emb = self.audio_encoder(indiv_mels[:, 1:, :, :].unsqueeze(2))  #bs seq_len 512
-        batch['audio_emb'] = audio_emb
-        batch = self.netG(batch)
-
-        pose_motion_pred = batch['pose_motion_pred']  # bs frame_len 6
-        pose_gt = coeff_gt[:, 1:, 64:70].clone()  # bs frame_len 6
-        pose_pred = coeff_gt[:, :1, 64:70] + pose_motion_pred  # bs frame_len 6
-
-        batch['pose_pred'] = pose_pred
-        batch['pose_gt'] = pose_gt
-
-        return batch
-
-    def test(self, x):
-
-        batch = {}
-        ref = x['ref']  #bs 1 70
-        batch['ref'] = x['ref'][:,0,-6:]
-        batch['class'] = x['class']
-        bs = ref.shape[0]
-
-        indiv_mels= x['indiv_mels']  # bs T 1 80 16
-        indiv_mels_use = indiv_mels[:, 1:]  # we regard the ref as the first frame
-        num_frames = x['num_frames']
-        num_frames = int(num_frames) - 1
-
-        #
-        div = num_frames//self.seq_len
-        re = num_frames%self.seq_len
-        audio_emb_list = []
-        pose_motion_pred_list = [torch.zeros(batch['ref'].unsqueeze(1).shape, dtype=batch['ref'].dtype,
-                                             device=batch['ref'].device)]
-
-        for i in range(div):
-            z = torch.randn(bs, self.latent_dim).to(ref.device)
-            batch['z'] = z
-            audio_emb = self.audio_encoder(indiv_mels_use[:, i*self.seq_len:(i+1)*self.seq_len,:,:,:])  #bs seq_len 512
-            batch['audio_emb'] = audio_emb
-            batch = self.netG.test(batch)
-            pose_motion_pred_list.append(batch['pose_motion_pred'])  #list of bs seq_len 6
-
-        if re != 0:
-            z = torch.randn(bs, self.latent_dim).to(ref.device)
-            batch['z'] = z
-            audio_emb = self.audio_encoder(indiv_mels_use[:, -1*self.seq_len:,:,:,:])  #bs seq_len 512
-            if audio_emb.shape[1] != self.seq_len:
-                pad_dim = self.seq_len-audio_emb.shape[1]
-                pad_audio_emb = audio_emb[:, :1].repeat(1, pad_dim, 1)
-                audio_emb = torch.cat([pad_audio_emb, audio_emb], 1)
-            batch['audio_emb'] = audio_emb
-            batch = self.netG.test(batch)
-            pose_motion_pred_list.append(batch['pose_motion_pred'][:,-1*re:,:])
-
-        pose_motion_pred = torch.cat(pose_motion_pred_list, dim = 1)
-        batch['pose_motion_pred'] = pose_motion_pred
-
-        pose_pred = ref[:, :1, -6:] + pose_motion_pred  # bs T 6
-
-        batch['pose_pred'] = pose_pred
-        return batch
src/src/audio2pose_models/audio_encoder.py DELETED
@@ -1,64 +0,0 @@
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-class Conv2d(nn.Module):
-    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.conv_block = nn.Sequential(
-            nn.Conv2d(cin, cout, kernel_size, stride, padding),
-            nn.BatchNorm2d(cout)
-        )
-        self.act = nn.ReLU()
-        self.residual = residual
-
-    def forward(self, x):
-        out = self.conv_block(x)
-        if self.residual:
-            out += x
-        return self.act(out)
-
-class AudioEncoder(nn.Module):
-    def __init__(self, wav2lip_checkpoint, device):
-        super(AudioEncoder, self).__init__()
-
-        self.audio_encoder = nn.Sequential(
-            Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
-            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
-            Conv2d(32, 32, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(32, 64, kernel_size=3, stride=(3, 1), padding=1),
-            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
-            Conv2d(64, 64, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(64, 128, kernel_size=3, stride=3, padding=1),
-            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
-            Conv2d(128, 128, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(128, 256, kernel_size=3, stride=(3, 2), padding=1),
-            Conv2d(256, 256, kernel_size=3, stride=1, padding=1, residual=True),
-
-            Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
-            Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
-
-        #### load the pre-trained audio_encoder, we do not need to load wav2lip model here.
-        # wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=torch.device(device))['state_dict']
-        # state_dict = self.audio_encoder.state_dict()
-
-        # for k,v in wav2lip_state_dict.items():
-        #     if 'audio_encoder' in k:
-        #         state_dict[k.replace('module.audio_encoder.', '')] = v
-        # self.audio_encoder.load_state_dict(state_dict)
-
-
-    def forward(self, audio_sequences):
-        # audio_sequences = (B, T, 1, 80, 16)
-        B = audio_sequences.size(0)
-
-        audio_sequences = torch.cat([audio_sequences[:, i] for i in range(audio_sequences.size(1))], dim=0)
-
-        audio_embedding = self.audio_encoder(audio_sequences) # B, 512, 1, 1
-        dim = audio_embedding.shape[1]
-        audio_embedding = audio_embedding.reshape((B, -1, dim, 1, 1))
-
-        return audio_embedding.squeeze(-1).squeeze(-1)  #B seq_len+1 512
src/src/audio2pose_models/cvae.py DELETED
@@ -1,149 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-from src.audio2pose_models.res_unet import ResUnet
-
-def class2onehot(idx, class_num):
-
-    assert torch.max(idx).item() < class_num
-    onehot = torch.zeros(idx.size(0), class_num).to(idx.device)
-    onehot.scatter_(1, idx, 1)
-    return onehot
-
-class CVAE(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        encoder_layer_sizes = cfg.MODEL.CVAE.ENCODER_LAYER_SIZES
-        decoder_layer_sizes = cfg.MODEL.CVAE.DECODER_LAYER_SIZES
-        latent_size = cfg.MODEL.CVAE.LATENT_SIZE
-        num_classes = cfg.DATASET.NUM_CLASSES
-        audio_emb_in_size = cfg.MODEL.CVAE.AUDIO_EMB_IN_SIZE
-        audio_emb_out_size = cfg.MODEL.CVAE.AUDIO_EMB_OUT_SIZE
-        seq_len = cfg.MODEL.CVAE.SEQ_LEN
-
-        self.latent_size = latent_size
-
-        self.encoder = ENCODER(encoder_layer_sizes, latent_size, num_classes,
-                               audio_emb_in_size, audio_emb_out_size, seq_len)
-        self.decoder = DECODER(decoder_layer_sizes, latent_size, num_classes,
-                               audio_emb_in_size, audio_emb_out_size, seq_len)
-    def reparameterize(self, mu, logvar):
-        std = torch.exp(0.5 * logvar)
-        eps = torch.randn_like(std)
-        return mu + eps * std
-
-    def forward(self, batch):
-        batch = self.encoder(batch)
-        mu = batch['mu']
-        logvar = batch['logvar']
-        z = self.reparameterize(mu, logvar)
-        batch['z'] = z
-        return self.decoder(batch)
-
-    def test(self, batch):
-        '''
-        class_id = batch['class']
-        z = torch.randn([class_id.size(0), self.latent_size]).to(class_id.device)
-        batch['z'] = z
-        '''
-        return self.decoder(batch)
-
-class ENCODER(nn.Module):
-    def __init__(self, layer_sizes, latent_size, num_classes,
-                 audio_emb_in_size, audio_emb_out_size, seq_len):
-        super().__init__()
-
-        self.resunet = ResUnet()
-        self.num_classes = num_classes
-        self.seq_len = seq_len
-
-        self.MLP = nn.Sequential()
-        layer_sizes[0] += latent_size + seq_len*audio_emb_out_size + 6
-        for i, (in_size, out_size) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
-            self.MLP.add_module(
-                name="L{:d}".format(i), module=nn.Linear(in_size, out_size))
-            self.MLP.add_module(name="A{:d}".format(i), module=nn.ReLU())
-
-        self.linear_means = nn.Linear(layer_sizes[-1], latent_size)
-        self.linear_logvar = nn.Linear(layer_sizes[-1], latent_size)
-        self.linear_audio = nn.Linear(audio_emb_in_size, audio_emb_out_size)
-
-        self.classbias = nn.Parameter(torch.randn(self.num_classes, latent_size))
-
-    def forward(self, batch):
-        class_id = batch['class']
-        pose_motion_gt = batch['pose_motion_gt']  #bs seq_len 6
-        ref = batch['ref']  #bs 6
-        bs = pose_motion_gt.shape[0]
-        audio_in = batch['audio_emb']  # bs seq_len audio_emb_in_size
-
-        #pose encode
-        pose_emb = self.resunet(pose_motion_gt.unsqueeze(1))  #bs 1 seq_len 6
-        pose_emb = pose_emb.reshape(bs, -1)  #bs seq_len*6
-
-        #audio mapping
-        print(audio_in.shape)
-        audio_out = self.linear_audio(audio_in)  # bs seq_len audio_emb_out_size
-        audio_out = audio_out.reshape(bs, -1)
-
-        class_bias = self.classbias[class_id]  #bs latent_size
-        x_in = torch.cat([ref, pose_emb, audio_out, class_bias], dim=-1)  #bs seq_len*(audio_emb_out_size+6)+latent_size
-        x_out = self.MLP(x_in)
-
-        mu = self.linear_means(x_out)
-        logvar = self.linear_means(x_out)  #bs latent_size
-
-        batch.update({'mu':mu, 'logvar':logvar})
-        return batch
-
-class DECODER(nn.Module):
-    def __init__(self, layer_sizes, latent_size, num_classes,
-                 audio_emb_in_size, audio_emb_out_size, seq_len):
-        super().__init__()
-
-        self.resunet = ResUnet()
-        self.num_classes = num_classes
-        self.seq_len = seq_len
-
-        self.MLP = nn.Sequential()
-        input_size = latent_size + seq_len*audio_emb_out_size + 6
-        for i, (in_size, out_size) in enumerate(zip([input_size]+layer_sizes[:-1], layer_sizes)):
-            self.MLP.add_module(
-                name="L{:d}".format(i), module=nn.Linear(in_size, out_size))
-            if i+1 < len(layer_sizes):
-                self.MLP.add_module(name="A{:d}".format(i), module=nn.ReLU())
-            else:
-                self.MLP.add_module(name="sigmoid", module=nn.Sigmoid())
-
-        self.pose_linear = nn.Linear(6, 6)
-        self.linear_audio = nn.Linear(audio_emb_in_size, audio_emb_out_size)
-
-        self.classbias = nn.Parameter(torch.randn(self.num_classes, latent_size))
-
-    def forward(self, batch):
-
-        z = batch['z']  #bs latent_size
-        bs = z.shape[0]
-        class_id = batch['class']
-        ref = batch['ref']  #bs 6
-        audio_in = batch['audio_emb']  # bs seq_len audio_emb_in_size
-        #print('audio_in: ', audio_in[:, :, :10])
-
-        audio_out = self.linear_audio(audio_in)  # bs seq_len audio_emb_out_size
-        #print('audio_out: ', audio_out[:, :, :10])
-        audio_out = audio_out.reshape([bs, -1])  # bs seq_len*audio_emb_out_size
-        class_bias = self.classbias[class_id]  #bs latent_size
-
-        z = z + class_bias
-        x_in = torch.cat([ref, z, audio_out], dim=-1)
-        x_out = self.MLP(x_in)  # bs layer_sizes[-1]
-        x_out = x_out.reshape((bs, self.seq_len, -1))
-
-        #print('x_out: ', x_out)
-
-        pose_emb = self.resunet(x_out.unsqueeze(1))  #bs 1 seq_len 6
-
-        pose_motion_pred = self.pose_linear(pose_emb.squeeze(1))  #bs seq_len 6
-
-        batch.update({'pose_motion_pred':pose_motion_pred})
-        return batch
src/src/audio2pose_models/discriminator.py DELETED
@@ -1,76 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch import nn
-
-class ConvNormRelu(nn.Module):
-    def __init__(self, conv_type='1d', in_channels=3, out_channels=64, downsample=False,
-                 kernel_size=None, stride=None, padding=None, norm='BN', leaky=False):
-        super().__init__()
-        if kernel_size is None:
-            if downsample:
-                kernel_size, stride, padding = 4, 2, 1
-            else:
-                kernel_size, stride, padding = 3, 1, 1
-
-        if conv_type == '2d':
-            self.conv = nn.Conv2d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding,
-                bias=False,
-            )
-            if norm == 'BN':
-                self.norm = nn.BatchNorm2d(out_channels)
-            elif norm == 'IN':
-                self.norm = nn.InstanceNorm2d(out_channels)
-            else:
-                raise NotImplementedError
-        elif conv_type == '1d':
-            self.conv = nn.Conv1d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride,
-                padding,
-                bias=False,
-            )
-            if norm == 'BN':
-                self.norm = nn.BatchNorm1d(out_channels)
-            elif norm == 'IN':
-                self.norm = nn.InstanceNorm1d(out_channels)
-            else:
-                raise NotImplementedError
-        nn.init.kaiming_normal_(self.conv.weight)
-
-        self.act = nn.LeakyReLU(negative_slope=0.2, inplace=False) if leaky else nn.ReLU(inplace=True)
-
-    def forward(self, x):
-        x = self.conv(x)
-        if isinstance(self.norm, nn.InstanceNorm1d):
-            x = self.norm(x.permute((0, 2, 1))).permute((0, 2, 1))  # normalize on [C]
-        else:
-            x = self.norm(x)
-        x = self.act(x)
-        return x
-
-
-class PoseSequenceDiscriminator(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        self.cfg = cfg
-        leaky = self.cfg.MODEL.DISCRIMINATOR.LEAKY_RELU
-
-        self.seq = nn.Sequential(
-            ConvNormRelu('1d', cfg.MODEL.DISCRIMINATOR.INPUT_CHANNELS, 256, downsample=True, leaky=leaky),  # B, 256, 64
-            ConvNormRelu('1d', 256, 512, downsample=True, leaky=leaky),  # B, 512, 32
-            ConvNormRelu('1d', 512, 1024, kernel_size=3, stride=1, padding=1, leaky=leaky),  # B, 1024, 16
-            nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=1, bias=True)  # B, 1, 16
-        )
-
-    def forward(self, x):
-        x = x.reshape(x.size(0), x.size(1), -1).transpose(1, 2)
-        x = self.seq(x)
-        x = x.squeeze(1)
-        return x
src/src/audio2pose_models/networks.py DELETED
@@ -1,140 +0,0 @@
-import torch.nn as nn
-import torch
-
-
-class ResidualConv(nn.Module):
-    def __init__(self, input_dim, output_dim, stride, padding):
-        super(ResidualConv, self).__init__()
-
-        self.conv_block = nn.Sequential(
-            nn.BatchNorm2d(input_dim),
-            nn.ReLU(),
-            nn.Conv2d(
-                input_dim, output_dim, kernel_size=3, stride=stride, padding=padding
-            ),
-            nn.BatchNorm2d(output_dim),
-            nn.ReLU(),
-            nn.Conv2d(output_dim, output_dim, kernel_size=3, padding=1),
-        )
-        self.conv_skip = nn.Sequential(
-            nn.Conv2d(input_dim, output_dim, kernel_size=3, stride=stride, padding=1),
-            nn.BatchNorm2d(output_dim),
-        )
-
-    def forward(self, x):
-
-        return self.conv_block(x) + self.conv_skip(x)
-
-
-class Upsample(nn.Module):
-    def __init__(self, input_dim, output_dim, kernel, stride):
-        super(Upsample, self).__init__()
-
-        self.upsample = nn.ConvTranspose2d(
-            input_dim, output_dim, kernel_size=kernel, stride=stride
-        )
-
-    def forward(self, x):
-        return self.upsample(x)
-
-
-class Squeeze_Excite_Block(nn.Module):
-    def __init__(self, channel, reduction=16):
-        super(Squeeze_Excite_Block, self).__init__()
-        self.avg_pool = nn.AdaptiveAvgPool2d(1)
-        self.fc = nn.Sequential(
-            nn.Linear(channel, channel // reduction, bias=False),
-            nn.ReLU(inplace=True),
-            nn.Linear(channel // reduction, channel, bias=False),
-            nn.Sigmoid(),
-        )
-
-    def forward(self, x):
-        b, c, _, _ = x.size()
-        y = self.avg_pool(x).view(b, c)
-        y = self.fc(y).view(b, c, 1, 1)
-        return x * y.expand_as(x)
-
-
-class ASPP(nn.Module):
-    def __init__(self, in_dims, out_dims, rate=[6, 12, 18]):
-        super(ASPP, self).__init__()
-
-        self.aspp_block1 = nn.Sequential(
-            nn.Conv2d(
-                in_dims, out_dims, 3, stride=1, padding=rate[0], dilation=rate[0]
-            ),
-            nn.ReLU(inplace=True),
-            nn.BatchNorm2d(out_dims),
-        )
-        self.aspp_block2 = nn.Sequential(
-            nn.Conv2d(
-                in_dims, out_dims, 3, stride=1, padding=rate[1], dilation=rate[1]
-            ),
-            nn.ReLU(inplace=True),
-            nn.BatchNorm2d(out_dims),
-        )
-        self.aspp_block3 = nn.Sequential(
-            nn.Conv2d(
-                in_dims, out_dims, 3, stride=1, padding=rate[2], dilation=rate[2]
-            ),
-            nn.ReLU(inplace=True),
-            nn.BatchNorm2d(out_dims),
-        )
-
-        self.output = nn.Conv2d(len(rate) * out_dims, out_dims, 1)
-        self._init_weights()
-
-    def forward(self, x):
-        x1 = self.aspp_block1(x)
-        x2 = self.aspp_block2(x)
-        x3 = self.aspp_block3(x)
-        out = torch.cat([x1, x2, x3], dim=1)
-        return self.output(out)
-
-    def _init_weights(self):
-        for m in self.modules():
-            if isinstance(m, nn.Conv2d):
-                nn.init.kaiming_normal_(m.weight)
-            elif isinstance(m, nn.BatchNorm2d):
-                m.weight.data.fill_(1)
-                m.bias.data.zero_()
-
-
-class Upsample_(nn.Module):
-    def __init__(self, scale=2):
-        super(Upsample_, self).__init__()
-
-        self.upsample = nn.Upsample(mode="bilinear", scale_factor=scale)
-
-    def forward(self, x):
-        return self.upsample(x)
-
-
-class AttentionBlock(nn.Module):
-    def __init__(self, input_encoder, input_decoder, output_dim):
-        super(AttentionBlock, self).__init__()
-
-        self.conv_encoder = nn.Sequential(
-            nn.BatchNorm2d(input_encoder),
-            nn.ReLU(),
-            nn.Conv2d(input_encoder, output_dim, 3, padding=1),
-            nn.MaxPool2d(2, 2),
-        )
-
-        self.conv_decoder = nn.Sequential(
-            nn.BatchNorm2d(input_decoder),
-            nn.ReLU(),
-            nn.Conv2d(input_decoder, output_dim, 3, padding=1),
-        )
-
-        self.conv_attn = nn.Sequential(
-            nn.BatchNorm2d(output_dim),
-            nn.ReLU(),
-            nn.Conv2d(output_dim, 1, 1),
-        )
-
-    def forward(self, x1, x2):
-        out = self.conv_encoder(x1) + self.conv_decoder(x2)
-        out = self.conv_attn(out)
-        return out * x2
src/src/audio2pose_models/res_unet.py DELETED
@@ -1,65 +0,0 @@
-import torch
-import torch.nn as nn
-from src.audio2pose_models.networks import ResidualConv, Upsample
-
-
-class ResUnet(nn.Module):
-    def __init__(self, channel=1, filters=[32, 64, 128, 256]):
-        super(ResUnet, self).__init__()
-
-        self.input_layer = nn.Sequential(
-            nn.Conv2d(channel, filters[0], kernel_size=3, padding=1),
-            nn.BatchNorm2d(filters[0]),
-            nn.ReLU(),
-            nn.Conv2d(filters[0], filters[0], kernel_size=3, padding=1),
-        )
-        self.input_skip = nn.Sequential(
-            nn.Conv2d(channel, filters[0], kernel_size=3, padding=1)
-        )
-
-        self.residual_conv_1 = ResidualConv(filters[0], filters[1], stride=(2,1), padding=1)
-        self.residual_conv_2 = ResidualConv(filters[1], filters[2], stride=(2,1), padding=1)
-
-        self.bridge = ResidualConv(filters[2], filters[3], stride=(2,1), padding=1)
-
-        self.upsample_1 = Upsample(filters[3], filters[3], kernel=(2,1), stride=(2,1))
-        self.up_residual_conv1 = ResidualConv(filters[3] + filters[2], filters[2], stride=1, padding=1)
-
-        self.upsample_2 = Upsample(filters[2], filters[2], kernel=(2,1), stride=(2,1))
-        self.up_residual_conv2 = ResidualConv(filters[2] + filters[1], filters[1], stride=1, padding=1)
-
-        self.upsample_3 = Upsample(filters[1], filters[1], kernel=(2,1), stride=(2,1))
-        self.up_residual_conv3 = ResidualConv(filters[1] + filters[0], filters[0], stride=1, padding=1)
-
-        self.output_layer = nn.Sequential(
-            nn.Conv2d(filters[0], 1, 1, 1),
-            nn.Sigmoid(),
-        )
-
-    def forward(self, x):
-        # Encode
-        x1 = self.input_layer(x) + self.input_skip(x)
-        x2 = self.residual_conv_1(x1)
-        x3 = self.residual_conv_2(x2)
-        # Bridge
-        x4 = self.bridge(x3)
-
-        # Decode
-        x4 = self.upsample_1(x4)
-        x5 = torch.cat([x4, x3], dim=1)
-
-        x6 = self.up_residual_conv1(x5)
-
-        x6 = self.upsample_2(x6)
-        x7 = torch.cat([x6, x2], dim=1)
-
-        x8 = self.up_residual_conv2(x7)
-
-        x8 = self.upsample_3(x8)
-        x9 = torch.cat([x8, x1], dim=1)
-
-        x10 = self.up_residual_conv3(x9)
-
-        output = self.output_layer(x10)
-
-        return output
src/src/config/auido2exp.yaml DELETED
@@ -1,58 +0,0 @@
-DATASET:
-  TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/train.txt
-  EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/file_list/val.txt
-  TRAIN_BATCH_SIZE: 32
-  EVAL_BATCH_SIZE: 32
-  EXP: True
-  EXP_DIM: 64
-  FRAME_LEN: 32
-  COEFF_LEN: 73
-  NUM_CLASSES: 46
-  AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav
-  COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav2lip_3dmm
-  LMDB_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb
-  DEBUG: True
-  NUM_REPEATS: 2
-  T: 40
-
-
-MODEL:
-  FRAMEWORK: V2
-  AUDIOENCODER:
-    LEAKY_RELU: True
-    NORM: 'IN'
-  DISCRIMINATOR:
-    LEAKY_RELU: False
-    INPUT_CHANNELS: 6
-  CVAE:
-    AUDIO_EMB_IN_SIZE: 512
-    AUDIO_EMB_OUT_SIZE: 128
-    SEQ_LEN: 32
-    LATENT_SIZE: 256
-    ENCODER_LAYER_SIZES: [192, 1024]
-    DECODER_LAYER_SIZES: [1024, 192]
-
-
-TRAIN:
-  MAX_EPOCH: 300
-  GENERATOR:
-    LR: 2.0e-5
-  DISCRIMINATOR:
-    LR: 1.0e-5
-  LOSS:
-    W_FEAT: 0
-    W_COEFF_EXP: 2
-    W_LM: 1.0e-2
-    W_LM_MOUTH: 0
-    W_REG: 0
-    W_SYNC: 0
-    W_COLOR: 0
-    W_EXPRESSION: 0
-    W_LIPREADING: 0.01
-    W_LIPREADING_VV: 0
-    W_EYE_BLINK: 4
-
-TAG:
-  NAME: small_dataset
-
-
src/src/config/auido2pose.yaml DELETED
@@ -1,49 +0,0 @@
-DATASET:
-  TRAIN_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/train_33.txt
-  EVAL_FILE_LIST: /apdcephfs_cq2/share_1290939/wenxuazhang/code/audio2pose_unet_noAudio/dataset/val.txt
-  TRAIN_BATCH_SIZE: 64
-  EVAL_BATCH_SIZE: 1
-  EXP: True
-  EXP_DIM: 64
-  FRAME_LEN: 32
-  COEFF_LEN: 73
-  NUM_CLASSES: 46
-  AUDIO_ROOT_PATH: /apdcephfs_cq2/share_1290939/wenxuazhang/voxceleb1/wav
-  COEFF_ROOT_PATH: /apdcephfs_cq2/share_1290939/shadowcun/datasets/VoxCeleb/v1/imdb
-  DEBUG: True
-
-
-MODEL:
-  AUDIOENCODER:
-    LEAKY_RELU: True
-    NORM: 'IN'
-  DISCRIMINATOR:
-    LEAKY_RELU: False
-    INPUT_CHANNELS: 6
-  CVAE:
-    AUDIO_EMB_IN_SIZE: 512
-    AUDIO_EMB_OUT_SIZE: 6
-    SEQ_LEN: 32
-    LATENT_SIZE: 64
-    ENCODER_LAYER_SIZES: [192, 128]
-    DECODER_LAYER_SIZES: [128, 192]
-
-
-TRAIN:
-  MAX_EPOCH: 150
-  GENERATOR:
-    LR: 1.0e-4
-  DISCRIMINATOR:
-    LR: 1.0e-4
-  LOSS:
-    LAMBDA_REG: 1
-    LAMBDA_LANDMARKS: 0
-    LAMBDA_VERTICES: 0
-    LAMBDA_GAN_MOTION: 0.7
-    LAMBDA_GAN_COEFF: 0
-    LAMBDA_KL: 1
-
-TAG:
-  NAME: cvae_UNET_useAudio_usewav2lipAudioEncoder
-
-
src/src/config/facerender.yaml DELETED
@@ -1,45 +0,0 @@
-model_params:
-  common_params:
-    num_kp: 15
-    image_channel: 3
-    feature_channel: 32
-    estimate_jacobian: False   # True
-  kp_detector_params:
-    temperature: 0.1
-    block_expansion: 32
-    max_features: 1024
-    scale_factor: 0.25         # 0.25
-    num_blocks: 5
-    reshape_channel: 16384     # 16384 = 1024 * 16
-    reshape_depth: 16
-  he_estimator_params:
-    block_expansion: 64
-    max_features: 2048
-    num_bins: 66
-  generator_params:
-    block_expansion: 64
-    max_features: 512
-    num_down_blocks: 2
-    reshape_channel: 32
-    reshape_depth: 16          # 512 = 32 * 16
-    num_resblocks: 6
-    estimate_occlusion_map: True
-    dense_motion_params:
-      block_expansion: 32
-      max_features: 1024
-      num_blocks: 5
-      reshape_depth: 16
-      compress: 4
-  discriminator_params:
-    scales: [1]
-    block_expansion: 32
-    max_features: 512
-    num_blocks: 4
-    sn: True
-  mapping_params:
-    coeff_nc: 70
-    descriptor_nc: 1024
-    layer: 3
-    num_kp: 15
-    num_bins: 66
-
src/src/config/facerender_still.yaml DELETED
@@ -1,45 +0,0 @@
-model_params:
-  common_params:
-    num_kp: 15
-    image_channel: 3
-    feature_channel: 32
-    estimate_jacobian: False   # True
-  kp_detector_params:
-    temperature: 0.1
-    block_expansion: 32
-    max_features: 1024
-    scale_factor: 0.25         # 0.25
-    num_blocks: 5
-    reshape_channel: 16384     # 16384 = 1024 * 16
-    reshape_depth: 16
-  he_estimator_params:
-    block_expansion: 64
-    max_features: 2048
-    num_bins: 66
-  generator_params:
-    block_expansion: 64
-    max_features: 512
-    num_down_blocks: 2
-    reshape_channel: 32
-    reshape_depth: 16          # 512 = 32 * 16
-    num_resblocks: 6
-    estimate_occlusion_map: True
-    dense_motion_params:
-      block_expansion: 32
-      max_features: 1024
-      num_blocks: 5
-      reshape_depth: 16
-      compress: 4
-  discriminator_params:
-    scales: [1]
-    block_expansion: 32
-    max_features: 512
-    num_blocks: 4
-    sn: True
-  mapping_params:
-    coeff_nc: 73
-    descriptor_nc: 1024
-    layer: 3
-    num_kp: 15
-    num_bins: 66
-