diff --git a/app.py b/app.py
index 44ffb9b5bfe538cbbabacf93652e5aaa45be50f5..e2ad7c6463381f98b4379a41b7cccebb57614033 100644
--- a/app.py
+++ b/app.py
@@ -1,90 +1,88 @@
 import os, sys
 import tempfile
 import gradio as gr
-from modules.text2speech import text2speech 
-from modules.sadtalker_test import SadTalker  
-
-def get_driven_audio(audio):  
-    if os.path.isfile(audio):
-        return audio
-    else:
-        save_path = tempfile.NamedTemporaryFile(
-                delete=False,
-                suffix=("." + "wav"),
-            )
-        gen_audio = text2speech(audio, save_path.name)
-        return gen_audio, gen_audio 
+from src.gradio_demo import SadTalker  
+from src.utils.text2speech import TTSTalker
 
 def get_source_image(image):   
         return image
 
-def sadtalker_demo(result_dir='./tmp/'):
+
+
+def sadtalker_demo():
 
     sad_talker = SadTalker()
+    tts_talker = TTSTalker()
+
     with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
-        gr.Markdown("<div align='center'> <h3> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </h3> \
+        gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </h2> \
                     <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
                     <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a>  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
-                     <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </a> </div>")
+                     <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </a> </div>")
         
-        with gr.Row():
+        with gr.Row().style(equal_height=False):
             with gr.Column(variant='panel'):
                 with gr.Tabs(elem_id="sadtalker_source_image"):
                     with gr.TabItem('Upload image'):
                         with gr.Row():
-                            source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256)
+                            source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256,width=256)
  
                 with gr.Tabs(elem_id="sadtalker_driven_audio"):
-                    with gr.TabItem('Upload audio(wav/mp3 only currently)'):
+                    with gr.TabItem('Upload OR TTS'):
                         with gr.Column(variant='panel'):
                             driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
+                    
+                        with gr.Column(variant='panel'):
+                            input_text = gr.Textbox(label="Generating audio from text", lines=5, placeholder="Alternatively, you can generate the audio from text using @Coqui.ai TTS.")
+                            tts = gr.Button('Generate audio',elem_id="sadtalker_audio_generate", variant='primary')
+                            tts.click(fn=tts_talker.test, inputs=[input_text], outputs=[driven_audio])
+                        
 
             with gr.Column(variant='panel'): 
                 with gr.Tabs(elem_id="sadtalker_checkbox"):
                     with gr.TabItem('Settings'):
                         with gr.Column(variant='panel'):
-                            is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion)").style(container=True)
-                            is_resize_mode = gr.Checkbox(label="Resize Mode (⚠️ Resize mode need manually crop the image firstly, can handle larger image crop)").style(container=True)
-                            is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality )").style(container=True)
+                            is_still_mode = gr.Checkbox(label="w/ Still Mode (fewer head motion, works on full body)")
+                            enhancer = gr.Checkbox(label="w/ GFPGAN as Face enhancer")
                             submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
 
                 with gr.Tabs(elem_id="sadtalker_genearted"):
                         gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
-                        gen_text = gr.Textbox(visible=False)
-                    
+
         with gr.Row():
             examples = [
                 [
-                    'examples/source_image/art_10.png',
-                    'examples/driven_audio/deyu.wav',
+                    'examples/source_image/full_body_1.png',
+                    'examples/driven_audio/bus_chinese.wav',
                     True,
-                    False,
                     False
                 ],
                 [
-                    'examples/source_image/art_1.png',
-                    'examples/driven_audio/fayu.wav',
+                    'examples/source_image/full_body_2.png',
+                    'examples/driven_audio/itosinger1.wav',
                     True,
+                    False
+                ],
+                [
+                    'examples/source_image/art_13.png',
+                    'examples/driven_audio/fayu.wav',
                     True,
                     False
                 ],
                 [
-                    'examples/source_image/art_9.png',
-                    'examples/driven_audio/itosinger1.wav',
+                    'examples/source_image/art_5.png',
+                    'examples/driven_audio/chinese_news.wav',
                     True,
-                    False,
-                    True
-                ]
+                    False
+                ],
             ]
             gr.Examples(examples=examples,
                         inputs=[
                             source_image,
                             driven_audio,
                             is_still_mode,
-                            is_resize_mode,
-                            is_enhance_mode,
-                            gr.Textbox(value=result_dir, visible=False)], 
-                        outputs=[gen_video, gen_text],
+                            enhancer], 
+                        outputs=[gen_video],
                         fn=sad_talker.test,
                         cache_examples=os.getenv('SYSTEM') == 'spaces')
 
@@ -93,10 +91,8 @@ def sadtalker_demo(result_dir='./tmp/'):
                     inputs=[source_image,
                             driven_audio,
                             is_still_mode,
-                            is_resize_mode,
-                            is_enhance_mode,
-                            gr.Textbox(value=result_dir, visible=False)], 
-                    outputs=[gen_video, gen_text]
+                            enhancer], 
+                    outputs=[gen_video]
                     )
 
     return sadtalker_interface
@@ -104,8 +100,7 @@ def sadtalker_demo(result_dir='./tmp/'):
 
 if __name__ == "__main__":
 
-    sadtalker_result_dir = os.path.join('./', 'results')
-    demo = sadtalker_demo(sadtalker_result_dir)
+    demo = sadtalker_demo()
     demo.launch()
 
 
diff --git a/examples/driven_audio/bus_chinese.wav b/examples/driven_audio/bus_chinese.wav
new file mode 100644
index 0000000000000000000000000000000000000000..888647738d72dfaee99b8d40bb0ddf6f7a1872e7
Binary files /dev/null and b/examples/driven_audio/bus_chinese.wav differ
diff --git a/examples/source_image/full_body_1.png b/examples/source_image/full_body_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..4fca65c949b7c7e7f7ed9459c473314a38be791f
Binary files /dev/null and b/examples/source_image/full_body_1.png differ
diff --git a/examples/source_image/full_body_2.png b/examples/source_image/full_body_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bc6228cb2f4e8c01af8d2f52bbbf62540e2412
Binary files /dev/null and b/examples/source_image/full_body_2.png differ
diff --git a/examples/source_image/happy.png b/examples/source_image/happy.png
new file mode 100644
index 0000000000000000000000000000000000000000..9d194ba9a03dfda0867703d54ea6233819c46a73
Binary files /dev/null and b/examples/source_image/happy.png differ
diff --git a/examples/source_image/happy1.png b/examples/source_image/happy1.png
new file mode 100644
index 0000000000000000000000000000000000000000..b702974cca1a648ec70efee776e484284b527c90
Binary files /dev/null and b/examples/source_image/happy1.png differ
diff --git a/examples/source_image/people_0.png b/examples/source_image/people_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..8895eeb07a3e300b9bcfa3bb53e7a6a552182bc3
Binary files /dev/null and b/examples/source_image/people_0.png differ
diff --git a/examples/source_image/sad.png b/examples/source_image/sad.png
new file mode 100644
index 0000000000000000000000000000000000000000..6584467fdac971207883cdcd84b31da1dbc4dfa6
Binary files /dev/null and b/examples/source_image/sad.png differ
diff --git a/examples/source_image/sad1.png b/examples/source_image/sad1.png
new file mode 100644
index 0000000000000000000000000000000000000000..341e0cb70886995ecf72eebb4b8a4474ab7d287b
Binary files /dev/null and b/examples/source_image/sad1.png differ
diff --git a/modules/__pycache__/sadtalker_test.cpython-38.pyc b/modules/__pycache__/sadtalker_test.cpython-38.pyc
index c54ce9b8728a52636f9cb9f9c47616709d04cfe4..a96311c6eee958b442fec8776d088b74e7b8b3a2 100644
Binary files a/modules/__pycache__/sadtalker_test.cpython-38.pyc and b/modules/__pycache__/sadtalker_test.cpython-38.pyc differ
diff --git a/src/__pycache__/generate_batch.cpython-38.pyc b/src/__pycache__/generate_batch.cpython-38.pyc
index c68dd09e49933b52115307195bf3aa446d924922..dc3eb4726e9835d34c08362da995941fef530b8f 100644
Binary files a/src/__pycache__/generate_batch.cpython-38.pyc and b/src/__pycache__/generate_batch.cpython-38.pyc differ
diff --git a/src/__pycache__/generate_facerender_batch.cpython-38.pyc b/src/__pycache__/generate_facerender_batch.cpython-38.pyc
index 6a30615ed3eaa5902a2fa553ed3ed17a9ae92a51..cc944270498549b70e901f5b1c764d1d832eb49e 100644
Binary files a/src/__pycache__/generate_facerender_batch.cpython-38.pyc and b/src/__pycache__/generate_facerender_batch.cpython-38.pyc differ
diff --git a/src/__pycache__/test_audio2coeff.cpython-38.pyc b/src/__pycache__/test_audio2coeff.cpython-38.pyc
index c2553cc97f50096d7c7005ad39274a8653cb6ad4..a6d261868c02b57145618adcd583481cf623e391 100644
Binary files a/src/__pycache__/test_audio2coeff.cpython-38.pyc and b/src/__pycache__/test_audio2coeff.cpython-38.pyc differ
diff --git a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc
index 460563d74a990c40a3c5bd6f3209acca6d86b550..de88551314f6c19ad1f5b5b33704f1303f51e029 100644
Binary files a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc and b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc differ
diff --git a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc
index 766660615f22f94c740dd420ccef83ed442c4fac..d703bd9e8f3d0c27c16fa713bba3d0969e984ad3 100644
Binary files a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc and b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc differ
diff --git a/src/audio2exp_models/audio2exp.py b/src/audio2exp_models/audio2exp.py
index 5f6e6b77b0ceb2089539caa440f7106c7b1e8aa2..9e79a929560592687a505e13188796e2b0ca8772 100644
--- a/src/audio2exp_models/audio2exp.py
+++ b/src/audio2exp_models/audio2exp.py
@@ -22,7 +22,8 @@ class Audio2Exp(nn.Module):
             
             current_mel_input = mel_input[:,i:i+10]
 
-            ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1))           #bs T 64
+            #ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1))           #bs T 64
+            ref = batch['ref'][:, :, :64][:, i:i+10]
             ratio = batch['ratio_gt'][:, i:i+10]                               #bs T
 
             audiox = current_mel_input.view(-1, 1, 80, 16)                  # bs*T 1 80 16
diff --git a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc
index 20fa93168344012f0bdb77727b5b5669fac8a10b..5b2dcc996a73224e972148e252fb4e2deedd69a5 100644
Binary files a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc
index 97d9bdf072c5bd356cc312357646c6eae2b798d0..b0f11a59fea18ee93c30da5cd4c94d04897ea010 100644
Binary files a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc
index 0d9aaee3ad4caa8afc40f723d224eb5b25e8afcd..1aa0e494be950e6ca972390b27f2dddc8be6d193 100644
Binary files a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc and b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc
index c7ebfcd0dd3538cedeb7eba984f94d9763b392c6..817b8836123ed1a3b5795d912d84c3ff54d7accc 100644
Binary files a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc and b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc
index 239626089b91321b1c00cfba2dfe0a3ba1ccb0b9..d18f56064377373a8f4f400c59379b0b79d9f649 100644
Binary files a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc and b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc differ
diff --git a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc
index 0e6b40591fd932ddb2cf686b72afd08c90de1a44..5aa2863a646a6eb8b44e0ebdebc5c21b562c2f39 100644
Binary files a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc and b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc differ
diff --git a/src/audio2pose_models/audio2pose.py b/src/audio2pose_models/audio2pose.py
index 3a37179e221340662a817628df3d01ae9e34404f..1a8410d6ee7f7f1d50305f61332bfbdb9dc8bf0e 100644
--- a/src/audio2pose_models/audio2pose.py
+++ b/src/audio2pose_models/audio2pose.py
@@ -12,7 +12,7 @@ class Audio2Pose(nn.Module):
         self.latent_dim = cfg.MODEL.CVAE.LATENT_SIZE
         self.device = device
 
-        self.audio_encoder = AudioEncoder(wav2lip_checkpoint)
+        self.audio_encoder = AudioEncoder(wav2lip_checkpoint, device)
         self.audio_encoder.eval()
         for param in self.audio_encoder.parameters():
             param.requires_grad = False
@@ -20,10 +20,6 @@ class Audio2Pose(nn.Module):
         self.netG = CVAE(cfg)
         self.netD_motion = PoseSequenceDiscriminator(cfg)
         
-        self.gan_criterion = nn.MSELoss()
-        self.reg_criterion = nn.L1Loss(reduction='none')
-        self.pair_criterion = nn.PairwiseDistance()
-        self.cosine_loss = nn.CosineSimilarity(dim=1)
         
     def forward(self, x):
 
@@ -81,6 +77,10 @@ class Audio2Pose(nn.Module):
             z = torch.randn(bs, self.latent_dim).to(ref.device)
             batch['z'] = z
             audio_emb = self.audio_encoder(indiv_mels_use[:, -1*self.seq_len:,:,:,:]) #bs seq_len  512
+            if audio_emb.shape[1] != self.seq_len:
+                pad_dim = self.seq_len-audio_emb.shape[1]
+                pad_audio_emb = audio_emb[:, :1].repeat(1, pad_dim, 1) 
+                audio_emb = torch.cat([pad_audio_emb, audio_emb], 1) 
             batch['audio_emb'] = audio_emb
             batch = self.netG.test(batch)
             pose_motion_pred_list.append(batch['pose_motion_pred'][:,-1*re:,:])   
diff --git a/src/audio2pose_models/audio_encoder.py b/src/audio2pose_models/audio_encoder.py
index 0ce036df119f86ef28c3ac8d6c834264571c309a..ea9095ad762caf48ff0f97abf4a086f6f7fee7e7 100644
--- a/src/audio2pose_models/audio_encoder.py
+++ b/src/audio2pose_models/audio_encoder.py
@@ -19,7 +19,7 @@ class Conv2d(nn.Module):
         return self.act(out)
 
 class AudioEncoder(nn.Module):
-    def __init__(self, wav2lip_checkpoint):
+    def __init__(self, wav2lip_checkpoint, device):
         super(AudioEncoder, self).__init__()
 
         self.audio_encoder = nn.Sequential(
@@ -41,8 +41,8 @@ class AudioEncoder(nn.Module):
             Conv2d(256, 512, kernel_size=3, stride=1, padding=0),
             Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
 
-        #### load the pre-trained audio_encoder\
-        wav2lip_state_dict = torch.load(wav2lip_checkpoint)['state_dict']
+        #### load the pre-trained audio_encoder
+        wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=torch.device(device))['state_dict']
         state_dict = self.audio_encoder.state_dict()
 
         for k,v in wav2lip_state_dict.items():
diff --git a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc
index 0469c877400338fae921f4aedf1159b03abbb101..25b9b1377b35ea7231f4d3b44d81aab8d44f4b5b 100644
Binary files a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc and b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc differ
diff --git a/src/face3d/__pycache__/visualize.cpython-38.pyc b/src/face3d/__pycache__/visualize.cpython-38.pyc
deleted file mode 100644
index a666447a57777ba5a4c6ed6642f234b79c45d372..0000000000000000000000000000000000000000
Binary files a/src/face3d/__pycache__/visualize.cpython-38.pyc and /dev/null differ
diff --git a/src/face3d/models/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/__pycache__/__init__.cpython-38.pyc
index 886f0b184346c5530d0bf8d6f4b2300079511225..023f4afb376ad418cc6e3cdd9e821cfa0bcd33f3 100644
Binary files a/src/face3d/models/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/face3d/models/__pycache__/base_model.cpython-38.pyc b/src/face3d/models/__pycache__/base_model.cpython-38.pyc
index e42691ec8e26c5c38baf6bd0172dff8110754da1..1076d15ca87eb8922a4fb3706a3aff777187b612 100644
Binary files a/src/face3d/models/__pycache__/base_model.cpython-38.pyc and b/src/face3d/models/__pycache__/base_model.cpython-38.pyc differ
diff --git a/src/face3d/models/__pycache__/bfm.cpython-38.pyc b/src/face3d/models/__pycache__/bfm.cpython-38.pyc
deleted file mode 100644
index 088a48bf9f0cabeb667c11c21000f0254c63ec81..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/__pycache__/bfm.cpython-38.pyc and /dev/null differ
diff --git a/src/face3d/models/__pycache__/facerecon_model.cpython-38.pyc b/src/face3d/models/__pycache__/facerecon_model.cpython-38.pyc
deleted file mode 100644
index 3e8de7975dee1099cb3e7698227df4e4062f86ee..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/__pycache__/facerecon_model.cpython-38.pyc and /dev/null differ
diff --git a/src/face3d/models/__pycache__/losses.cpython-38.pyc b/src/face3d/models/__pycache__/losses.cpython-38.pyc
deleted file mode 100644
index ffbf94d1f1e09d5ba0653c588b0cfaeb3df7b920..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/__pycache__/losses.cpython-38.pyc and /dev/null differ
diff --git a/src/face3d/models/__pycache__/networks.cpython-38.pyc b/src/face3d/models/__pycache__/networks.cpython-38.pyc
index 1a97b5cd3309786e87448c4478ae2d19a18e096b..e52b5dac3ce0e017ed844aed711ddfb94223be98 100644
Binary files a/src/face3d/models/__pycache__/networks.cpython-38.pyc and b/src/face3d/models/__pycache__/networks.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-36.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-36.pyc
deleted file mode 100644
index c49397797cf06eaa01ef1327d25f0c145a511994..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-36.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-37.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index 82f8ed2b49d5c718fe15c47d620156600f776765..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc
index 83f6ad3ed4af3cc3d3cfa9067e345cdffb058638..a891077dd80e455e762875f37b16ff11e58441e7 100644
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-39.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-39.pyc
deleted file mode 100644
index b1291676de1f08eaba633f000d015eab672e0036..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-39.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-36.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-36.pyc
deleted file mode 100644
index 6be617e2ecf266f566e6e5d4972465fcd0379ac5..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-36.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-37.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-37.pyc
deleted file mode 100644
index 0a085d7cb2aa24dabc85966931e3aa9db54310e3..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-37.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc
index f59247d26d9210b5fd2960df842753a903a90b3d..e7d3278234555217f1055e02d930d1cd8731afa1 100644
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-39.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-39.pyc
deleted file mode 100644
index d8a633135905cc3c5fe7673c6d6ab584e0692ce7..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-39.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-36.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-36.pyc
deleted file mode 100644
index 6d9748f002ee2f953efa2391054329b6d32f9016..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-36.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-37.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-37.pyc
deleted file mode 100644
index 50b9f06989f4ca4f6f5bd7a1fdf1952f2035e974..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-37.pyc and /dev/null differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc
index d8edc64d28aa3e3fb8c26ba795d04a8ef35b1540..db57e8b41e4fe5bdbee04db62986c15c0e4bffb1 100644
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc differ
diff --git a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-39.pyc b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-39.pyc
deleted file mode 100644
index 24ebbc749bfa90340e389e2c88bd1f8218c3e338..0000000000000000000000000000000000000000
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-39.pyc and /dev/null differ
diff --git a/src/face3d/util/__pycache__/__init__.cpython-38.pyc b/src/face3d/util/__pycache__/__init__.cpython-38.pyc
index 22771f3169f2da9a37c1bd619a0e5d05003492b9..2671705d02bed0a099b4a375070d0949c1450b7b 100644
Binary files a/src/face3d/util/__pycache__/__init__.cpython-38.pyc and b/src/face3d/util/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc
index 8a48b59ca078ef709825d54c069f518c15103c4e..f44224c0f7c12afc3590f10b9f5ac570b6b668bb 100644
Binary files a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc and b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc differ
diff --git a/src/face3d/util/__pycache__/nvdiffrast.cpython-38.pyc b/src/face3d/util/__pycache__/nvdiffrast.cpython-38.pyc
deleted file mode 100644
index 0ac5cc3eb7c6fd3141005a9cd53f604c49036717..0000000000000000000000000000000000000000
Binary files a/src/face3d/util/__pycache__/nvdiffrast.cpython-38.pyc and /dev/null differ
diff --git a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc
index 7900dafbd8b74629c391eb8972f615650d4461df..90eb37261ae38ab925f149db62d91a1d0078bfcf 100644
Binary files a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc and b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc differ
diff --git a/src/face3d/util/__pycache__/util.cpython-38.pyc b/src/face3d/util/__pycache__/util.cpython-38.pyc
deleted file mode 100644
index 56d6f9217276ff22306a567df4861f802e61a82a..0000000000000000000000000000000000000000
Binary files a/src/face3d/util/__pycache__/util.cpython-38.pyc and /dev/null differ
diff --git a/src/facerender/__pycache__/animate.cpython-38.pyc b/src/facerender/__pycache__/animate.cpython-38.pyc
index 11fb3d0ee467093c0cb318003c52eb4c78f11cc9..1f8003ddb550fc6e235abccfb5f8481ee8c16afa 100644
Binary files a/src/facerender/__pycache__/animate.cpython-38.pyc and b/src/facerender/__pycache__/animate.cpython-38.pyc differ
diff --git a/src/facerender/animate.py b/src/facerender/animate.py
index be2d62ebaeffe06a8dee1e268d832690b1937320..1bd221ad4c99d911222fdf1eb087ebb626afc867 100644
--- a/src/facerender/animate.py
+++ b/src/facerender/animate.py
@@ -16,6 +16,8 @@ from src.facerender.modules.make_animation import make_animation
 
 from pydub import AudioSegment 
 from src.utils.face_enhancer import enhancer as face_enhancer
+from src.utils.paste_pic import paste_pic
+
 
 
 class AnimateFromCoeff():
@@ -30,21 +32,26 @@ class AnimateFromCoeff():
                                                     **config['model_params']['common_params'])
         kp_extractor = KPDetector(**config['model_params']['kp_detector_params'],
                                     **config['model_params']['common_params'])
+        he_estimator = HEEstimator(**config['model_params']['he_estimator_params'],
+                               **config['model_params']['common_params'])
         mapping = MappingNet(**config['model_params']['mapping_params'])
 
 
         generator.to(device)
         kp_extractor.to(device)
+        he_estimator.to(device)
         mapping.to(device)
         for param in generator.parameters():
             param.requires_grad = False
         for param in kp_extractor.parameters():
             param.requires_grad = False 
+        for param in he_estimator.parameters():
+            param.requires_grad = False
         for param in mapping.parameters():
             param.requires_grad = False
 
         if free_view_checkpoint is not None:
-            self.load_cpk_facevid2vid(free_view_checkpoint, kp_detector=kp_extractor, generator=generator)
+            self.load_cpk_facevid2vid(free_view_checkpoint, kp_detector=kp_extractor, generator=generator, he_estimator=he_estimator)
         else:
             raise AttributeError("Checkpoint should be specified for video head pose estimator.")
 
@@ -55,10 +62,12 @@ class AnimateFromCoeff():
 
         self.kp_extractor = kp_extractor
         self.generator = generator
+        self.he_estimator = he_estimator
         self.mapping = mapping
 
         self.kp_extractor.eval()
         self.generator.eval()
+        self.he_estimator.eval()
         self.mapping.eval()
          
         self.device = device
@@ -107,26 +116,35 @@ class AnimateFromCoeff():
 
         return checkpoint['epoch']
 
-    def generate(self, x, video_save_dir, enhancer=None, original_size=None):
+    def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, full_img_enhancer=None):
 
         source_image=x['source_image'].type(torch.FloatTensor)
         source_semantics=x['source_semantics'].type(torch.FloatTensor)
-        target_semantics=x['target_semantics_list'].type(torch.FloatTensor)
-        yaw_c_seq = x['yaw_c_seq'].type(torch.FloatTensor)
-        pitch_c_seq = x['pitch_c_seq'].type(torch.FloatTensor)
-        roll_c_seq = x['roll_c_seq'].type(torch.FloatTensor)
+        target_semantics=x['target_semantics_list'].type(torch.FloatTensor) 
         source_image=source_image.to(self.device)
         source_semantics=source_semantics.to(self.device)
         target_semantics=target_semantics.to(self.device)
-        yaw_c_seq = x['yaw_c_seq'].to(self.device)
-        pitch_c_seq = x['pitch_c_seq'].to(self.device)
-        roll_c_seq = x['roll_c_seq'].to(self.device)
+        if 'yaw_c_seq' in x:
+            yaw_c_seq = x['yaw_c_seq'].type(torch.FloatTensor)  # optional yaw control; cast to FloatTensor first
+            yaw_c_seq = yaw_c_seq.to(self.device)  # move the cast tensor (not the raw input) so the cast is kept
+        else:
+            yaw_c_seq = None  # key absent: None is forwarded to make_animation
+        if 'pitch_c_seq' in x:
+            pitch_c_seq = x['pitch_c_seq'].type(torch.FloatTensor)  # optional pitch control; cast to FloatTensor first
+            pitch_c_seq = pitch_c_seq.to(self.device)  # move the cast tensor (not the raw input) so the cast is kept
+        else:
+            pitch_c_seq = None  # key absent: None is forwarded to make_animation
+        if 'roll_c_seq' in x:
+            roll_c_seq = x['roll_c_seq'].type(torch.FloatTensor)  # optional roll control; cast to FloatTensor first
+            roll_c_seq = roll_c_seq.to(self.device)  # move the cast tensor (not the raw input) so the cast is kept
+        else:
+            roll_c_seq = None  # key absent: None is forwarded to make_animation
 
         frame_num = x['frame_num']
 
         predictions_video = make_animation(source_image, source_semantics, target_semantics,
-                                        self.generator, self.kp_extractor, self.mapping, 
-                                        yaw_c_seq, pitch_c_seq, roll_c_seq, use_exp = True,)
+                                        self.generator, self.kp_extractor, self.he_estimator, self.mapping, 
+                                        yaw_c_seq, pitch_c_seq, roll_c_seq, use_exp = True)
 
         predictions_video = predictions_video.reshape((-1,)+predictions_video.shape[2:])
         predictions_video = predictions_video[:frame_num]
@@ -139,6 +157,7 @@ class AnimateFromCoeff():
         result = img_as_ubyte(video)
 
         ### the generated video is 256x256, so we  keep the aspect ratio, 
+        original_size = crop_info[0]
         if original_size:
             result = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in result ]
         
@@ -157,7 +176,9 @@ class AnimateFromCoeff():
 
             imageio.mimsave(enhanced_path, enhanced_images, fps=float(25))
 
-        av_path = os.path.join(video_save_dir, video_name) 
+        av_path = os.path.join(video_save_dir, video_name)
+        return_path = av_path 
+        
         audio_path =  x['audio_path'] 
         audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0]
         new_audio_path = os.path.join(video_save_dir, audio_name+'.wav')
@@ -171,12 +192,28 @@ class AnimateFromCoeff():
 
         cmd = r'ffmpeg -y -i "%s" -i "%s" -vcodec copy "%s"' % (path, new_audio_path, av_path)
         os.system(cmd)
+        print(f'The generated video is named {video_name} in {video_save_dir}')
 
         if enhancer:
+            return_path = av_path_enhancer
             cmd = r'ffmpeg -y -i "%s" -i "%s" -vcodec copy "%s"' % (enhanced_path, new_audio_path, av_path_enhancer)
             os.system(cmd)
             os.remove(enhanced_path)
+            print(f'The generated video is named {video_name_enhancer} in {video_save_dir}')
+
+        if len(crop_info) == 3:
+            video_name_full = x['video_name']  + '_full.mp4'
+            full_video_path = os.path.join(video_save_dir, video_name_full)
+            return_path = full_video_path
+            if enhancer:
+                paste_pic(av_path_enhancer, pic_path, crop_info, new_audio_path, full_video_path)
+            else:
+                paste_pic(path, pic_path, crop_info, new_audio_path, full_video_path)
+            print(f'The generated video is named {video_name_full} in {video_save_dir}') 
+
 
         os.remove(path)
         os.remove(new_audio_path)
 
+        return return_path
+
diff --git a/src/facerender/modules/__pycache__/animate_model.cpython-38.pyc b/src/facerender/modules/__pycache__/animate_model.cpython-38.pyc
deleted file mode 100644
index 1ecb83e033911eb82d582e097c513ea0fd4cb69a..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/animate_model.cpython-38.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/animate_model.cpython-39.pyc b/src/facerender/modules/__pycache__/animate_model.cpython-39.pyc
deleted file mode 100644
index 8e9a594ddff05d41ed7fea66e42b37558869332a..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/animate_model.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc
index 5178c3763bc9f6fcff3a8a410deff7d3c30060db..7558dbc6512fceb2147fd1fae031212d07e4449d 100644
Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc and b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/dense_motion.cpython-39.pyc b/src/facerender/modules/__pycache__/dense_motion.cpython-39.pyc
deleted file mode 100644
index 9a6cec5db6525ef350d0fcd52efe814b0d3f1e6d..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/generator.cpython-38.pyc b/src/facerender/modules/__pycache__/generator.cpython-38.pyc
index 8d132f05d36e505f21c864d4c95931472ba58051..11aa36c10f79820e84d8a275234b85b0371cc050 100644
Binary files a/src/facerender/modules/__pycache__/generator.cpython-38.pyc and b/src/facerender/modules/__pycache__/generator.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/generator.cpython-39.pyc b/src/facerender/modules/__pycache__/generator.cpython-39.pyc
deleted file mode 100644
index ac9587fe99d8905d8ac99d60025ed1a8d5bacf1b..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/generator.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc
index ccc5d4543365bfc022a06a72d6ed9d388249279a..e0bd1dcd3e98a316628449370f08dc8bd2dde4b9 100644
Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc and b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/keypoint_detector.cpython-39.pyc b/src/facerender/modules/__pycache__/keypoint_detector.cpython-39.pyc
deleted file mode 100644
index e609a2ce2bea049dcc08e711684347032da88e1a..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc
index 1b54bcc293d742f70db165849b9764666b0f9a8b..76e338a936f0354c81abaa5fc677c5622db16eb3 100644
Binary files a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc and b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc
index 7e1a2baa2bfab28fe7e3904f94a644633124b56c..b464c917a4d3feb94fa629b3390c000af89ceb9a 100644
Binary files a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc and b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/mapping5.cpython-38.pyc b/src/facerender/modules/__pycache__/mapping5.cpython-38.pyc
deleted file mode 100644
index ae35fb77f8552d2aa9cb263cba6ca9d37bbee9a7..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/mapping5.cpython-38.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/mapping5.cpython-39.pyc b/src/facerender/modules/__pycache__/mapping5.cpython-39.pyc
deleted file mode 100644
index fa6b6db40007f95fca648909a638810273b2c050..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/mapping5.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/modules/__pycache__/util.cpython-38.pyc b/src/facerender/modules/__pycache__/util.cpython-38.pyc
index 1e1c92955be38c880c52cc70b8051fd8ef4fa63a..4f4d1a6d0e3797390e942821e1e2c238e1c8a8d2 100644
Binary files a/src/facerender/modules/__pycache__/util.cpython-38.pyc and b/src/facerender/modules/__pycache__/util.cpython-38.pyc differ
diff --git a/src/facerender/modules/__pycache__/util.cpython-39.pyc b/src/facerender/modules/__pycache__/util.cpython-39.pyc
deleted file mode 100644
index 8764b93cb4e5964b831caf9ff376b70105f3dc5d..0000000000000000000000000000000000000000
Binary files a/src/facerender/modules/__pycache__/util.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/modules/dense_motion.py b/src/facerender/modules/dense_motion.py
index 30c13060be8e82979771514b4ec51e5de23f49fa..a286ead2e84ed1961335d34a3b50ab38f25e4495 100644
--- a/src/facerender/modules/dense_motion.py
+++ b/src/facerender/modules/dense_motion.py
@@ -102,6 +102,10 @@ class DenseMotionNetwork(nn.Module):
         mask = F.softmax(mask, dim=1)
         out_dict['mask'] = mask
         mask = mask.unsqueeze(2)                                   # (bs, num_kp+1, 1, d, h, w)
+        
+        zeros_mask = torch.zeros_like(mask)   
+        mask = torch.where(mask < 1e-3, zeros_mask, mask) 
+
         sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4)    # (bs, num_kp+1, 3, d, h, w)
         deformation = (sparse_motion * mask).sum(dim=1)            # (bs, 3, d, h, w)
         deformation = deformation.permute(0, 2, 3, 4, 1)           # (bs, d, h, w, 3)
diff --git a/src/facerender/modules/make_animation.py b/src/facerender/modules/make_animation.py
index 2b2382d82d26043145184b339103aac64abdaa62..e7887a3fed50d294948dd0a7d4c4956583b5f705 100644
--- a/src/facerender/modules/make_animation.py
+++ b/src/facerender/modules/make_animation.py
@@ -62,29 +62,33 @@ def get_rotation_matrix(yaw, pitch, roll):
 
     return rot_mat
 
-def keypoint_transformation(kp_canonical, he):
+def keypoint_transformation(kp_canonical, he, wo_exp=False):
     kp = kp_canonical['value']    # (bs, k, 3) 
     yaw, pitch, roll= he['yaw'], he['pitch'], he['roll']      
     yaw = headpose_pred_to_degree(yaw) 
     pitch = headpose_pred_to_degree(pitch)
     roll = headpose_pred_to_degree(roll)
 
-    if 'yaw_c' in he: 
-        yaw = yaw + he['yaw_c']
-    if 'pitch_c' in he: 
-        pitch = pitch + he['pitch_c']
-    if 'roll_c' in he: 
-        roll = roll + he['roll_c'] 
+    if 'yaw_in' in he:
+        yaw = he['yaw_in']
+    if 'pitch_in' in he:
+        pitch = he['pitch_in']
+    if 'roll_in' in he:
+        roll = he['roll_in']
 
     rot_mat = get_rotation_matrix(yaw, pitch, roll)    # (bs, 3, 3)
 
     t, exp = he['t'], he['exp']
+    if wo_exp:
+        exp =  exp*0  
     
     # keypoint rotation
     kp_rotated = torch.einsum('bmp,bkp->bkm', rot_mat, kp)
 
     # keypoint translation
-    t = t.unsqueeze_(1).repeat(1, kp.shape[1], 1)
+    t[:, 0] = t[:, 0]*0
+    t[:, 2] = t[:, 2]*0
+    t = t.unsqueeze(1).repeat(1, kp.shape[1], 1)
     kp_t = kp_rotated + t
 
     # add expression deviation 
@@ -96,7 +100,7 @@ def keypoint_transformation(kp_canonical, he):
 
 
 def make_animation(source_image, source_semantics, target_semantics,
-                            generator, kp_detector, mapping, 
+                            generator, kp_detector, he_estimator, mapping, 
                             yaw_c_seq=None, pitch_c_seq=None, roll_c_seq=None,
                             use_exp=True):
     with torch.no_grad():
@@ -109,14 +113,12 @@ def make_animation(source_image, source_semantics, target_semantics,
         for frame_idx in tqdm(range(target_semantics.shape[1]), 'Face Renderer:'):
             target_semantics_frame = target_semantics[:, frame_idx]
             he_driving = mapping(target_semantics_frame)
-            if not use_exp:
-                he_driving['exp'] = he_driving['exp']*0
             if yaw_c_seq is not None:
-                he_driving['yaw_c'] = yaw_c_seq[:, frame_idx]
+                he_driving['yaw_in'] = yaw_c_seq[:, frame_idx]
             if pitch_c_seq is not None:
-                he_driving['pitch_c'] = pitch_c_seq[:, frame_idx]
+                he_driving['pitch_in'] = pitch_c_seq[:, frame_idx] 
             if roll_c_seq is not None:
-                he_driving['roll_c'] = roll_c_seq[:, frame_idx]
+                he_driving['roll_in'] = roll_c_seq[:, frame_idx] 
             
             kp_driving = keypoint_transformation(kp_canonical, he_driving)
                 
@@ -124,6 +126,14 @@ def make_animation(source_image, source_semantics, target_semantics,
                                    #kp_driving_initial=kp_driving_initial)
             kp_norm = kp_driving
             out = generator(source_image, kp_source=kp_source, kp_driving=kp_norm)
+            '''
+            source_image_new = out['prediction'].squeeze(1)
+            kp_canonical_new =  kp_detector(source_image_new)
+            he_source_new = he_estimator(source_image_new) 
+            kp_source_new = keypoint_transformation(kp_canonical_new, he_source_new, wo_exp=True)
+            kp_driving_new = keypoint_transformation(kp_canonical_new, he_driving, wo_exp=True)
+            out = generator(source_image_new, kp_source=kp_source_new, kp_driving=kp_driving_new)
+            '''
             predictions.append(out['prediction'])
         predictions_ts = torch.stack(predictions, dim=1)
     return predictions_ts
diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-36.pyc b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-36.pyc
deleted file mode 100644
index 8327a281a1c119814499648bdec814cf753ba0ba..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-36.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-37.pyc b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index 4e9c9671abd49037eb51d66e7bb6046177433a27..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc
index 03d5fdb5ff0e14c08894b394b8c1cae7e1f324c4..a08f1284e68bb6251119739bc46a2dab9f5a171b 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-39.pyc b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-39.pyc
deleted file mode 100644
index 9c0d18c3cec16bbeccbc825186b14c60550563a1..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-36.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-36.pyc
deleted file mode 100644
index 24a89a661e425c0b49c5d616759928e701eab005..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-36.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-37.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-37.pyc
deleted file mode 100644
index d7658dccf719cd85ac0c6e6f6b190ffe6f32c5ed..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-37.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc
index 20a4560fc425087d5d63c70cc08fd12c2d8a7ea1..f1a96eace36b537e5cfc85be1be94616151aca85 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-39.pyc b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-39.pyc
deleted file mode 100644
index d1c07e4d0f03cd52a105f009d16f079559a5f97e..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-36.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-36.pyc
deleted file mode 100644
index 7602415a703e1bd2b6008a9bf6dde9778d4349ae..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-36.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-37.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-37.pyc
deleted file mode 100644
index 1ce98838a834f854dbbc7a8d2f4f1295802e97f3..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-37.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc
index eb7252b8ad1b6aec2f5566979db0494f71a63d91..e6578b03a7060d9b9b31681e6f7ef27e4251f52e 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-39.pyc b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-39.pyc
deleted file mode 100644
index b84f093a8aef9c2b92f0beead2318296163c9e1f..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-39.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-36.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-36.pyc
deleted file mode 100644
index 4a53e2cdf5b5c2d0f7fc9f6c928fe116d629a6c8..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-36.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-37.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-37.pyc
deleted file mode 100644
index b91c03d671fb5a9334bd4791f6e1f55d397f2e62..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-37.pyc and /dev/null differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc
index 30c9811579d75333db1b60fe4622f682013f719b..90f775d27997dc8659edde9eb763d0f8b4007ace 100644
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc differ
diff --git a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-39.pyc b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-39.pyc
deleted file mode 100644
index 561b184da4d393c548f7eb0b3076c765d4bf3745..0000000000000000000000000000000000000000
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-39.pyc and /dev/null differ
diff --git a/src/generate_batch.py b/src/generate_batch.py
index 2d9e19b6aa4c19c13caf0a208e1189cd6c19f796..8bf580e49427527bfd1c2ff533de45ee91e3872e 100644
--- a/src/generate_batch.py
+++ b/src/generate_batch.py
@@ -48,7 +48,7 @@ def generate_blink_seq_randomly(num_frames):
             break
     return ratio
 
-def get_data(first_coeff_path, audio_path, device):
+def get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path):
 
     syncnet_mel_step_size = 16
     fps = 25
@@ -56,10 +56,6 @@ def get_data(first_coeff_path, audio_path, device):
     pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0]
     audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0]
 
-    source_semantics_path = first_coeff_path
-    source_semantics_dict = scio.loadmat(source_semantics_path)
-    ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70]         #1 70
-
     wav = audio.load_wav(audio_path, 16000) 
     wav_length, num_frames = parse_audio_length(len(wav), 16000, 25)
     wav = crop_pad_audio(wav, wav_length)
@@ -76,7 +72,27 @@ def get_data(first_coeff_path, audio_path, device):
         m = spec[seq, :]
         indiv_mels.append(m.T)
     indiv_mels = np.asarray(indiv_mels)         # T 80 16
+
     ratio = generate_blink_seq_randomly(num_frames)      # T
+    source_semantics_path = first_coeff_path
+    source_semantics_dict = scio.loadmat(source_semantics_path)
+    ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70]         #1 70
+    ref_coeff = np.repeat(ref_coeff, num_frames, axis=0)
+
+    if ref_eyeblink_coeff_path is not None:
+        ratio[:num_frames] = 0
+        refeyeblink_coeff_dict = scio.loadmat(ref_eyeblink_coeff_path)
+        refeyeblink_coeff = refeyeblink_coeff_dict['coeff_3dmm'][:,:64]
+        refeyeblink_num_frames = refeyeblink_coeff.shape[0]
+        if refeyeblink_num_frames<num_frames:
+            div = num_frames//refeyeblink_num_frames
+            re = num_frames%refeyeblink_num_frames
+            refeyeblink_coeff_list = [refeyeblink_coeff for i in range(div)]
+            refeyeblink_coeff_list.append(refeyeblink_coeff[:re, :64])
+            refeyeblink_coeff = np.concatenate(refeyeblink_coeff_list, axis=0)
+            print(refeyeblink_coeff.shape[0])
+
+        ref_coeff[:, :64] = refeyeblink_coeff[:num_frames, :64] 
     
     indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16
     ratio = torch.FloatTensor(ratio).unsqueeze(0)                        # bs T
diff --git a/src/generate_facerender_batch.py b/src/generate_facerender_batch.py
index fc737cffc8e960828fb6e59ab1c22e7541a307f9..53b8cf7ada396907a77702c264616c3a8cdc05ab 100644
--- a/src/generate_facerender_batch.py
+++ b/src/generate_facerender_batch.py
@@ -6,7 +6,7 @@ import torch
 import scipy.io as scio
 
 def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path, 
-                        batch_size, camera_yaw_list=[0], camera_pitch_list=[0], camera_roll_list=[0], 
+                        batch_size, input_yaw_list=None, input_pitch_list=None, input_roll_list=None, 
                         expression_scale=1.0, still_mode = False):
 
     semantic_radius = 13
@@ -63,13 +63,16 @@ def get_facerender_data(coeff_path, pic_path, first_coeff_path, audio_path,
     data['video_name'] = video_name
     data['audio_path'] = audio_path
     
-    yaw_c_seq = gen_camera_pose(camera_yaw_list, frame_num, batch_size)
-    pitch_c_seq = gen_camera_pose(camera_pitch_list, frame_num, batch_size)
-    roll_c_seq = gen_camera_pose(camera_roll_list, frame_num, batch_size) 
-
-    data['yaw_c_seq'] = torch.FloatTensor(yaw_c_seq)
-    data['pitch_c_seq'] = torch.FloatTensor(pitch_c_seq)
-    data['roll_c_seq'] = torch.FloatTensor(roll_c_seq)
+    if input_yaw_list is not None:
+        yaw_c_seq = gen_camera_pose(input_yaw_list, frame_num, batch_size)
+        data['yaw_c_seq'] = torch.FloatTensor(yaw_c_seq)
+    if input_pitch_list is not None:
+        pitch_c_seq = gen_camera_pose(input_pitch_list, frame_num, batch_size)
+        data['pitch_c_seq'] = torch.FloatTensor(pitch_c_seq)
+    if input_roll_list is not None:
+        roll_c_seq = gen_camera_pose(input_roll_list, frame_num, batch_size) 
+        data['roll_c_seq'] = torch.FloatTensor(roll_c_seq)
+ 
     return data
 
 def transform_semantic_1(semantic, semantic_radius):
diff --git a/src/gradio_demo.py b/src/gradio_demo.py
index 4f78c97349652e23cf463c49527191fcec795564..d2310d7323b05f8ef08eccbeeb64c329f2072d01 100644
--- a/src/gradio_demo.py
+++ b/src/gradio_demo.py
@@ -1,12 +1,10 @@
 import torch, uuid
-from time import gmtime, strftime
 import os, sys, shutil
 from src.utils.preprocess import CropAndExtract
 from src.test_audio2coeff import Audio2Coeff  
 from src.facerender.animate import AnimateFromCoeff
 from src.generate_batch import get_data
 from src.generate_facerender_batch import get_facerender_data
-from src.utils.text2speech import text2speech
 
 from pydub import AudioSegment
 
@@ -53,7 +51,7 @@ class SadTalker():
                                             facerender_yaml_path, device)
         self.device = device
 
-    def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
+    def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./results/'):
 
         time_tag = str(uuid.uuid4())
         save_dir = os.path.join(result_dir, time_tag)
@@ -76,7 +74,7 @@ class SadTalker():
             else:
                 shutil.move(driven_audio, input_dir)
         else:
-            text2speech
+            raise AttributeError("error audio")
 
 
         os.makedirs(save_dir, exist_ok=True)
@@ -84,18 +82,18 @@ class SadTalker():
         #crop image and extract 3dmm from image
         first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
         os.makedirs(first_frame_dir, exist_ok=True)
-        first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir)
+        first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(pic_path, first_frame_dir)
         
         if first_coeff_path is None:
             raise AttributeError("No face is detected")
 
         #audio2ceoff
-        batch = get_data(first_coeff_path, audio_path, self.device) # longer audio?
+        batch = get_data(first_coeff_path, audio_path, self.device, None) # longer audio?
         coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style)
         #coeff2video
-        batch_size = 4
+        batch_size = 2
         data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
-        self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
+        return_path = self.animate_from_coeff.generate(data, save_dir,  pic_path, crop_info, enhancer='gfpgan' if use_enhancer else None)
         video_name = data['video_name']
         print(f'The generated video is named {video_name} in {save_dir}')
 
@@ -103,11 +101,6 @@ class SadTalker():
         torch.cuda.synchronize()
         import gc; gc.collect()
         
-        if use_enhancer:
-            return os.path.join(save_dir, video_name+'_enhanced.mp4'), os.path.join(save_dir, video_name+'_enhanced.mp4')
-
-        else:
-            return os.path.join(save_dir, video_name+'.mp4'), os.path.join(save_dir, video_name+'.mp4')
-        
+        return return_path    
 
     
\ No newline at end of file
diff --git a/src/test_audio2coeff.py b/src/test_audio2coeff.py
index 3db6be3af59b0319c50106d9a92c903118f28410..c3c6abcfbbf024244210328e12a4436864949e10 100644
--- a/src/test_audio2coeff.py
+++ b/src/test_audio2coeff.py
@@ -1,7 +1,7 @@
 import os 
 import torch
 import numpy as np
-from scipy.io import savemat
+from scipy.io import savemat, loadmat
 from yacs.config import CfgNode as CN
 from scipy.signal import savgol_filter
 
@@ -60,7 +60,7 @@ class Audio2Coeff():
  
         self.device = device
 
-    def generate(self, batch, coeff_save_dir, pose_style):
+    def generate(self, batch, coeff_save_dir, pose_style, ref_pose_coeff_path=None):
 
         with torch.no_grad():
             #test
@@ -74,14 +74,39 @@ class Audio2Coeff():
             results_dict_pose = self.audio2pose_model.test(batch) 
             pose_pred = results_dict_pose['pose_pred']                        #bs T 6
 
-            pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), 13, 2, axis=1)).to(self.device)
+            pose_len = pose_pred.shape[1]
+            if pose_len<13: 
+                pose_len = int((pose_len-1)/2)*2+1
+                pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), pose_len, 2, axis=1)).to(self.device)
+            else:
+                pose_pred = torch.Tensor(savgol_filter(np.array(pose_pred.cpu()), 13, 2, axis=1)).to(self.device) 
+            
             coeffs_pred = torch.cat((exp_pred, pose_pred), dim=-1)            #bs T 70
 
             coeffs_pred_numpy = coeffs_pred[0].clone().detach().cpu().numpy() 
+
+            
+            if ref_pose_coeff_path is not None: 
+                 coeffs_pred_numpy = self.using_refpose(coeffs_pred_numpy, ref_pose_coeff_path)
         
             savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])),  
                     {'coeff_3dmm': coeffs_pred_numpy})
 
             return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name']))
+    
+    def using_refpose(self, coeffs_pred_numpy, ref_pose_coeff_path):
+        """Replace pose columns 64:70 of the prediction with a reference pose track.
+
+        The track is the 'coeff_3dmm' array of the .mat file at ref_pose_coeff_path,
+        looped from its start when it has fewer rows than the prediction. The input
+        array is modified in place and returned.
+        """
+        total = coeffs_pred_numpy.shape[0]
+        ref_pose = loadmat(ref_pose_coeff_path)['coeff_3dmm'][:, 64:70]
+        ref_len = ref_pose.shape[0]
+        if ref_len < total:
+            ref_pose = np.concatenate([ref_pose] * (total // ref_len) + [ref_pose[:total % ref_len]], axis=0)
+        coeffs_pred_numpy[:, 64:70] = ref_pose[:total, :]
+        return coeffs_pred_numpy
 
 
diff --git a/src/utils/__pycache__/audio.cpython-38.pyc b/src/utils/__pycache__/audio.cpython-38.pyc
index c9037ed6e9b29bf1f5ba29b25ed9c067103bb361..9f4fe6227f50165dfd5ef7458765d3c806e571c9 100644
Binary files a/src/utils/__pycache__/audio.cpython-38.pyc and b/src/utils/__pycache__/audio.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/croper.cpython-38.pyc b/src/utils/__pycache__/croper.cpython-38.pyc
index addfae662741dd661426427e2f29d506c399adba..e9eacf099f5dd124f8d00eb7fa9076dddde74df7 100644
Binary files a/src/utils/__pycache__/croper.cpython-38.pyc and b/src/utils/__pycache__/croper.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/face_enhancer.cpython-38.pyc b/src/utils/__pycache__/face_enhancer.cpython-38.pyc
index 51b465795f49c49c741a7fb510d02564337deb28..a46dc8b648c6ed407e9486a5452677166f6ed6ea 100644
Binary files a/src/utils/__pycache__/face_enhancer.cpython-38.pyc and b/src/utils/__pycache__/face_enhancer.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/hparams.cpython-38.pyc b/src/utils/__pycache__/hparams.cpython-38.pyc
index 29278c1421204d040aa03f77ed43e18f9b60dad8..bab0f32bce885e5d2fb8061dd81b4767fb987df8 100644
Binary files a/src/utils/__pycache__/hparams.cpython-38.pyc and b/src/utils/__pycache__/hparams.cpython-38.pyc differ
diff --git a/src/utils/__pycache__/preprocess.cpython-38.pyc b/src/utils/__pycache__/preprocess.cpython-38.pyc
index e5e0b7f2a4c29050bfbb30405816311acd3060f0..6a7c7064a0cf9edb8e4a8af6a41ba3020daecd88 100644
Binary files a/src/utils/__pycache__/preprocess.cpython-38.pyc and b/src/utils/__pycache__/preprocess.cpython-38.pyc differ
diff --git a/src/utils/face_enhancer.py b/src/utils/face_enhancer.py
index 6192649d7141f2cd05f1302f7c954bfb8fa612fa..ecf3587ced450273f05388e6cb9d8d6e5b11a51f 100644
--- a/src/utils/face_enhancer.py
+++ b/src/utils/face_enhancer.py
@@ -52,8 +52,7 @@ def enhancer(images, method='gfpgan'):
             images[idx],
             has_aligned=True,
             only_center_face=False,
-            paste_back=True,
-            weight=0.5)
+            paste_back=True)
         
         restored_img += restored_faces
        
diff --git a/src/utils/paste_pic.py b/src/utils/paste_pic.py
new file mode 100644
index 0000000000000000000000000000000000000000..508aa88d5f12aa81ab9a40b71913f6fabfc6332f
--- /dev/null
+++ b/src/utils/paste_pic.py
@@ -0,0 +1,65 @@
+import cv2, os
+import subprocess
+import numpy as np
+from tqdm import tqdm
+import uuid 
+
+def paste_pic(video_path, pic_path, crop_info, new_audio_path, full_video_path):
+    """Blend a generated face video back into the full source picture and mux audio.
+
+    Every frame of ``video_path`` is resized to the crop size stored in
+    ``crop_info[0]`` and seamlessly cloned onto the image at ``pic_path``; the
+    composited frames are written to a temporary mp4, which ffmpeg then
+    combines with ``new_audio_path`` into ``full_video_path``.
+
+    ``crop_info`` must be a 3-item sequence ``((w, h), crop_box, quad)``;
+    anything else prints a notice and returns without writing output.
+
+    Raises ``ValueError`` when ``pic_path`` cannot be decoded as an image.
+    """
+    # Validate up front so a bad crop_info costs no image/video decoding.
+    if len(crop_info) != 3:
+        print("you didn't crop the image")
+        return
+
+    full_img = cv2.imread(pic_path)
+    if full_img is None:
+        # cv2.imread signals failure by returning None rather than raising.
+        raise ValueError('cannot read image: %s' % pic_path)
+    frame_h, frame_w = full_img.shape[:2]
+
+    video_stream = cv2.VideoCapture(video_path)
+    fps = video_stream.get(cv2.CAP_PROP_FPS)
+    crop_frames = []
+    try:
+        while True:
+            still_reading, frame = video_stream.read()
+            if not still_reading:
+                break
+            crop_frames.append(frame)
+    finally:
+        video_stream.release()
+
+    r_w, r_h = (int(v) for v in crop_info[0])
+    clx, cly, _, _ = crop_info[1]
+    lx, ly, rx, ry = (int(v) for v in crop_info[2])
+    oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx
+    # The paste centre is identical for every frame, so compute it once.
+    location = (int((ox1+ox2) // 2), int((oy1+oy2) // 2))
+
+    tmp_path = str(uuid.uuid4())+'.mp4'
+    out_tmp = cv2.VideoWriter(tmp_path, cv2.VideoWriter_fourcc(*'MP4V'), fps, (frame_w, frame_h))
+    try:
+        try:
+            for crop_frame in tqdm(crop_frames, 'seamlessClone:'):
+                p = cv2.resize(crop_frame.astype(np.uint8), (r_w, r_h))
+                mask = 255*np.ones(p.shape, p.dtype)
+                out_tmp.write(cv2.seamlessClone(p, full_img, mask, location, cv2.NORMAL_CLONE))
+        finally:
+            out_tmp.release()
+        # Argument list (no shell) so quotes or spaces in a path cannot alter the command.
+        subprocess.run(['ffmpeg', '-y', '-i', tmp_path, '-i', new_audio_path, full_video_path])
+    finally:
+        # Remove the intermediate file even when cloning or muxing fails.
+        if os.path.exists(tmp_path):
+            os.remove(tmp_path)
diff --git a/src/utils/preprocess.py b/src/utils/preprocess.py
index 4e3dad8d4a49080a3300f672965a11a8a2054fa2..d12cada70ccefcc905bc6727a304a6e974c4f25c 100644
--- a/src/utils/preprocess.py
+++ b/src/utils/preprocess.py
@@ -77,9 +77,8 @@ class CropAndExtract():
                 if not still_reading:
                     video_stream.release()
                     break 
-                full_frames.append(frame)
-                break
-        x_full_frames = [cv2.cvtColor(full_frames[0], cv2.COLOR_BGR2RGB) ] 
+                full_frames.append(frame) 
+        x_full_frames = [cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  for frame in full_frames] 
 
         if crop_or_resize.lower() == 'crop': # default crop
             x_full_frames, crop, quad = self.croper.crop(x_full_frames, xsize=pic_size)
@@ -87,10 +86,10 @@ class CropAndExtract():
             lx, ly, rx, ry = quad
             lx, ly, rx, ry = int(lx), int(ly), int(rx), int(ry)
             oy1, oy2, ox1, ox2 = cly+ly, cly+ry, clx+lx, clx+rx
-            original_size = (ox2 - ox1, oy2 - oy1)
+            crop_info = ((ox2 - ox1, oy2 - oy1), crop, quad)
         else:
             oy1, oy2, ox1, ox2 = 0, x_full_frames[0].shape[0], 0, x_full_frames[0].shape[1] 
-            original_size = (ox2 - ox1, oy2 - oy1)
+            crop_info = ((ox2 - ox1, oy2 - oy1))
 
         frames_pil = [Image.fromarray(cv2.resize(frame,(pic_size, pic_size))) for frame in x_full_frames]
         if len(frames_pil) == 0:
@@ -149,4 +148,4 @@ class CropAndExtract():
 
             savemat(coeff_path, {'coeff_3dmm': semantic_npy, 'full_3dmm': np.array(full_coeffs)[0]})
 
-        return coeff_path, png_path, original_size
\ No newline at end of file
+        return coeff_path, png_path, crop_info
\ No newline at end of file
diff --git a/src/utils/text2speech.py b/src/utils/text2speech.py
index 3ecaef36961494c8b2b1f5771a70b997efa04ffd..6948edf1e96c78b534882aa003f7b71e6eb9c323 100644
--- a/src/utils/text2speech.py
+++ b/src/utils/text2speech.py
@@ -1,12 +1,21 @@
 import os
+import tempfile
+from TTS.api import TTS
 
-def text2speech(txt, audio_path):
-    print(txt)
-    cmd = f'tts --text "{txt}" --out_path {audio_path}'
-    print(cmd)
-    try:
-        os.system(cmd)
-        return audio_path
-    except:
-        print("Error: Failed convert txt to audio")
-        return None
\ No newline at end of file
+
+
+class TTSTalker():
+    """Text-to-speech helper using the first model from ``TTS.list_models()`` and its first speaker."""
+    def __init__(self) -> None:
+        model_name = TTS.list_models()[0]
+        self.tts = TTS(model_name)
+
+    def test(self, text, language='en'):
+        """Synthesize ``text`` into a new ``.wav`` temp file (never auto-deleted); return its path."""
+        # Only the file name is needed: close our handle at once so it is not leaked and the
+        # TTS engine can reopen the path for writing on platforms that lock open files.
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tempf:
+            wav_path = tempf.name
+
+        self.tts.tts_to_file(text, speaker=self.tts.speakers[0], language=language, file_path=wav_path)
+        return wav_path
\ No newline at end of file