vinthony committed
Commit
a86a2b8
1 Parent(s): 74a9811
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitignore +153 -0
  2. app.py +26 -8
  3. modules/__pycache__/sadtalker_test.cpython-38.pyc +0 -0
  4. modules/__pycache__/text2speech.cpython-38.pyc +0 -0
  5. modules/sadtalker_test.py +3 -3
  6. src/__pycache__/generate_batch.cpython-38.pyc +0 -0
  7. src/__pycache__/generate_facerender_batch.cpython-38.pyc +0 -0
  8. src/__pycache__/test_audio2coeff.cpython-38.pyc +0 -0
  9. src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc +0 -0
  10. src/audio2exp_models/__pycache__/networks.cpython-38.pyc +0 -0
  11. src/audio2exp_models/audio2exp.py +15 -5
  12. src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc +0 -0
  13. src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc +0 -0
  14. src/audio2pose_models/__pycache__/cvae.cpython-38.pyc +0 -0
  15. src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc +0 -0
  16. src/audio2pose_models/__pycache__/networks.cpython-38.pyc +0 -0
  17. src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc +0 -0
  18. src/audio2pose_models/audio2pose.py +1 -0
  19. src/audio2pose_models/audio_encoder.py +2 -2
  20. src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc +0 -0
  21. src/face3d/extract_kp_videos.py +1 -1
  22. src/face3d/models/__pycache__/__init__.cpython-38.pyc +0 -0
  23. src/face3d/models/__pycache__/base_model.cpython-38.pyc +0 -0
  24. src/face3d/models/__pycache__/networks.cpython-38.pyc +0 -0
  25. src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc +0 -0
  26. src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc +0 -0
  27. src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc +0 -0
  28. src/face3d/util/__pycache__/__init__.cpython-38.pyc +0 -0
  29. src/face3d/util/__pycache__/load_mats.cpython-38.pyc +0 -0
  30. src/face3d/util/__pycache__/preprocess.cpython-38.pyc +0 -0
  31. src/facerender/__pycache__/animate.cpython-38.pyc +0 -0
  32. src/facerender/animate.py +10 -1
  33. src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc +0 -0
  34. src/facerender/modules/__pycache__/generator.cpython-38.pyc +0 -0
  35. src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc +0 -0
  36. src/facerender/modules/__pycache__/make_animation.cpython-38.pyc +0 -0
  37. src/facerender/modules/__pycache__/mapping.cpython-38.pyc +0 -0
  38. src/facerender/modules/__pycache__/util.cpython-38.pyc +0 -0
  39. src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc +0 -0
  40. src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc +0 -0
  41. src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc +0 -0
  42. src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc +0 -0
  43. src/generate_batch.py +4 -25
  44. src/gradio_demo.py +113 -0
  45. src/test_audio2coeff.py +1 -1
  46. src/utils/__pycache__/audio.cpython-38.pyc +0 -0
  47. src/utils/__pycache__/croper.cpython-38.pyc +0 -0
  48. src/utils/__pycache__/face_enhancer.cpython-38.pyc +0 -0
  49. src/utils/__pycache__/hparams.cpython-38.pyc +0 -0
  50. src/utils/__pycache__/preprocess.cpython-38.pyc +0 -0
.gitignore ADDED
@@ -0,0 +1,153 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
app.py CHANGED
@@ -27,15 +27,15 @@ def sadtalker_demo(result_dir='./tmp/'):
  <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
  <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")

- with gr.Row().style(equal_height=False):
+ with gr.Row():
  with gr.Column(variant='panel'):
  with gr.Tabs(elem_id="sadtalker_source_image"):
  with gr.TabItem('Upload image'):
  with gr.Row():
- source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256,width=256)
+ source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256)

  with gr.Tabs(elem_id="sadtalker_driven_audio"):
- with gr.TabItem('Upload audio(wav only currently)'):
+ with gr.TabItem('Upload audio(wav/mp3 only currently)'):
  with gr.Column(variant='panel'):
  driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")

@@ -43,12 +43,13 @@ def sadtalker_demo(result_dir='./tmp/'):
  with gr.Tabs(elem_id="sadtalker_checkbox"):
  with gr.TabItem('Settings'):
  with gr.Column(variant='panel'):
- is_still_mode = gr.Checkbox(label="w/ Still Mode (fewer head motion)")
- enhancer = gr.Checkbox(label="w/ GFPGAN as Face enhancer")
+ is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion)").style(container=True)
+ is_resize_mode = gr.Checkbox(label="Resize Mode (⚠️ Resize mode need manually crop the image firstly, can handle larger image crop)").style(container=True)
+ is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality )").style(container=True)
  submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')

  with gr.Tabs(elem_id="sadtalker_genearted"):
- gen_video = gr.Video(label="Generated video", format="mp4").style(height=256,width=256)
+ gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
  gen_text = gr.Textbox(visible=False)

  with gr.Row():
@@ -57,7 +58,22 @@ def sadtalker_demo(result_dir='./tmp/'):
  'examples/source_image/art_10.png',
  'examples/driven_audio/deyu.wav',
  True,
+ False,
  False
+ ],
+ [
+ 'examples/source_image/art_1.png',
+ 'examples/driven_audio/fayu.wav',
+ True,
+ True,
+ False
+ ],
+ [
+ 'examples/source_image/art_9.png',
+ 'examples/driven_audio/itosinger1.wav',
+ True,
+ False,
+ True
  ]
  ]
  gr.Examples(examples=examples,
@@ -65,7 +81,8 @@ def sadtalker_demo(result_dir='./tmp/'):
  source_image,
  driven_audio,
  is_still_mode,
- enhancer,
+ is_resize_mode,
+ is_enhance_mode,
  gr.Textbox(value=result_dir, visible=False)],
  outputs=[gen_video, gen_text],
  fn=sad_talker.test,
@@ -76,7 +93,8 @@ def sadtalker_demo(result_dir='./tmp/'):
  inputs=[source_image,
  driven_audio,
  is_still_mode,
- enhancer,
+ is_resize_mode,
+ is_enhance_mode,
  gr.Textbox(value=result_dir, visible=False)],
  outputs=[gen_video, gen_text]
  )
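
The revised layout adds resize and enhance checkboxes and threads all three flags through both the example rows and the submit handler. A stripped-down sketch of the same Gradio Blocks wiring pattern, with a stand-in function instead of SadTalker's pipeline:

import gradio as gr

def process(image_path, audio_path, still, resize, enhance):
    # Stand-in for sad_talker.test: just echo the chosen flags.
    return f"still={still}, resize={resize}, enhance={enhance}"

with gr.Blocks() as demo:
    source_image = gr.Image(label="Source image", type="filepath")
    driven_audio = gr.Audio(label="Input audio", type="filepath")
    is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion)")
    is_resize_mode = gr.Checkbox(label="Resize Mode")
    is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality)")
    submit = gr.Button("Generate")
    result = gr.Textbox(label="Result")
    submit.click(fn=process,
                 inputs=[source_image, driven_audio, is_still_mode, is_resize_mode, is_enhance_mode],
                 outputs=[result])

# demo.launch()  # uncomment to serve the interface locally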
modules/__pycache__/sadtalker_test.cpython-38.pyc CHANGED
Binary files a/modules/__pycache__/sadtalker_test.cpython-38.pyc and b/modules/__pycache__/sadtalker_test.cpython-38.pyc differ
 
modules/__pycache__/text2speech.cpython-38.pyc CHANGED
Binary files a/modules/__pycache__/text2speech.cpython-38.pyc and b/modules/__pycache__/text2speech.cpython-38.pyc differ
 
modules/sadtalker_test.py CHANGED
@@ -60,7 +60,7 @@ class SadTalker():
  facerender_yaml_path, device)
  self.device = device

- def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
+ def test(self, source_image, driven_audio, still_mode, resize_mode, use_enhancer, result_dir='./'):

  time_tag = str(uuid.uuid4()) # strftime("%Y_%m_%d_%H.%M.%S")
  save_dir = os.path.join(result_dir, time_tag)
@@ -91,7 +91,7 @@ class SadTalker():
  #crop image and extract 3dmm from image
  first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
  os.makedirs(first_frame_dir, exist_ok=True)
- first_coeff_path, crop_pic_path = self.preprocess_model.generate(pic_path, first_frame_dir)
+ first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir, crop_or_resize= 'crop' if resize_mode == 'crop' else 'resize')
  if first_coeff_path is None:
  raise AttributeError("No face is detected")

@@ -101,7 +101,7 @@ class SadTalker():
  #coeff2video
  batch_size = 4
  data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
- self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None)
+ self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
  video_name = data['video_name']
  print(f'The generated video is named {video_name} in {save_dir}')

src/__pycache__/generate_batch.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/generate_batch.cpython-38.pyc and b/src/__pycache__/generate_batch.cpython-38.pyc differ
 
src/__pycache__/generate_facerender_batch.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/generate_facerender_batch.cpython-38.pyc and b/src/__pycache__/generate_facerender_batch.cpython-38.pyc differ
 
src/__pycache__/test_audio2coeff.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/test_audio2coeff.cpython-38.pyc and b/src/__pycache__/test_audio2coeff.cpython-38.pyc differ
 
src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc CHANGED
Binary files a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc and b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc differ
 
src/audio2exp_models/__pycache__/networks.cpython-38.pyc CHANGED
Binary files a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc and b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc differ
 
src/audio2exp_models/audio2exp.py CHANGED
@@ -1,3 +1,4 @@
+ from tqdm import tqdm
  import torch
  from torch import nn

@@ -15,15 +16,24 @@ class Audio2Exp(nn.Module):
  bs = mel_input.shape[0]
  T = mel_input.shape[1]

- ref = batch['ref'][:, :, :64].repeat((1,T,1)) #bs T 64
- ratio = batch['ratio_gt'] #bs T
+ exp_coeff_pred = []

- audiox = mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16
- exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64
+ for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames
+
+ current_mel_input = mel_input[:,i:i+10]
+
+ ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64
+ ratio = batch['ratio_gt'][:, i:i+10] #bs T
+
+ audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16
+
+ curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64
+
+ exp_coeff_pred += [curr_exp_coeff_pred]

  # BS x T x 64
  results_dict = {
- 'exp_coeff_pred': exp_coeff_pred
+ 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1)
  }
  return results_dict

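The rewritten forward pass above runs the audio-to-expression network on windows of 10 mel frames and concatenates the per-window predictions, rather than pushing all T frames through at once. A minimal standalone sketch of that chunked-inference pattern, with an illustrative stand-in for the network:

import torch

def chunked_predict(predict, mel, chunk=10):
    # mel: (bs, T, 1, 80, 16); run `predict` on `chunk` frames at a time to
    # bound peak memory, then stitch the outputs back together along time.
    outputs = []
    for i in range(0, mel.shape[1], chunk):
        window = mel[:, i:i + chunk]       # (bs, <=chunk, 1, 80, 16)
        outputs.append(predict(window))    # (bs, <=chunk, 64)
    return torch.cat(outputs, dim=1)       # (bs, T, 64)

# Dummy "network" mapping each frame to a 64-dim expression coefficient vector.
dummy_net = lambda w: torch.zeros(w.shape[0], w.shape[1], 64)
coeffs = chunked_predict(dummy_net, torch.randn(1, 200, 1, 80, 16))
print(coeffs.shape)  # torch.Size([1, 200, 64])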
src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/cvae.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc and b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc and b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/networks.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc and b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc and b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc differ
 
src/audio2pose_models/audio2pose.py CHANGED
@@ -76,6 +76,7 @@ class Audio2Pose(nn.Module):
  batch['audio_emb'] = audio_emb
  batch = self.netG.test(batch)
  pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6
+
  if re != 0:
  z = torch.randn(bs, self.latent_dim).to(ref.device)
  batch['z'] = z
src/audio2pose_models/audio_encoder.py CHANGED
@@ -19,7 +19,7 @@ class Conv2d(nn.Module):
  return self.act(out)

  class AudioEncoder(nn.Module):
- def __init__(self, wav2lip_checkpoint, device='cpu'):
+ def __init__(self, wav2lip_checkpoint):
  super(AudioEncoder, self).__init__()

  self.audio_encoder = nn.Sequential(
@@ -42,7 +42,7 @@ class AudioEncoder(nn.Module):
  Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)

  #### load the pre-trained audio_encoder\
- wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=device)['state_dict']
+ wav2lip_state_dict = torch.load(wav2lip_checkpoint)['state_dict']
  state_dict = self.audio_encoder.state_dict()

  for k,v in wav2lip_state_dict.items():
src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc CHANGED
Binary files a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc and b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc differ
 
src/face3d/extract_kp_videos.py CHANGED
@@ -71,7 +71,7 @@ def read_video(filename):
  def run(data):
  filename, opt, device = data
  os.environ['CUDA_VISIBLE_DEVICES'] = device
- kp_extractor = KeypointExtractor(device)
+ kp_extractor = KeypointExtractor()
  images = read_video(filename)
  name = filename.split('/')[-2:]
  os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
src/face3d/models/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/__pycache__/__init__.cpython-38.pyc differ
 
src/face3d/models/__pycache__/base_model.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/__pycache__/base_model.cpython-38.pyc and b/src/face3d/models/__pycache__/base_model.cpython-38.pyc differ
 
src/face3d/models/__pycache__/networks.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/__pycache__/networks.cpython-38.pyc and b/src/face3d/models/__pycache__/networks.cpython-38.pyc differ
 
src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc differ
 
src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc differ
 
src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc differ
 
src/face3d/util/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/face3d/util/__pycache__/__init__.cpython-38.pyc and b/src/face3d/util/__pycache__/__init__.cpython-38.pyc differ
 
src/face3d/util/__pycache__/load_mats.cpython-38.pyc CHANGED
Binary files a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc and b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc differ
 
src/face3d/util/__pycache__/preprocess.cpython-38.pyc CHANGED
Binary files a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc and b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc differ
 
src/facerender/__pycache__/animate.cpython-38.pyc CHANGED
Binary files a/src/facerender/__pycache__/animate.cpython-38.pyc and b/src/facerender/__pycache__/animate.cpython-38.pyc differ
 
src/facerender/animate.py CHANGED
@@ -1,4 +1,5 @@
  import os
+ import cv2
  import yaml
  import numpy as np
  import warnings
@@ -106,7 +107,7 @@ class AnimateFromCoeff():

  return checkpoint['epoch']

- def generate(self, x, video_save_dir, enhancer=None):
+ def generate(self, x, video_save_dir, enhancer=None, original_size=None):

  source_image=x['source_image'].type(torch.FloatTensor)
  source_semantics=x['source_semantics'].type(torch.FloatTensor)
@@ -137,6 +138,10 @@ class AnimateFromCoeff():
  video.append(image)
  result = img_as_ubyte(video)

+ ### the generated video is 256x256, so we keep the aspect ratio,
+ if original_size:
+ result = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in result ]
+
  video_name = x['video_name'] + '.mp4'
  path = os.path.join(video_save_dir, 'temp_'+video_name)
  imageio.mimsave(path, result, fps=float(25))
@@ -146,6 +151,10 @@ class AnimateFromCoeff():
  av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
  enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer)
  enhanced_images = face_enhancer(result, method=enhancer)
+
+ if original_size:
+ enhanced_images = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in enhanced_images ]
+
  imageio.mimsave(enhanced_path, enhanced_images, fps=float(25))

  av_path = os.path.join(video_save_dir, video_name)
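
The new original_size handling keeps the generated 256-pixel width and rescales the height so the saved video matches the source image's aspect ratio. A small sketch of that arithmetic with OpenCV, using made-up frame data and a hypothetical (width, height) size:

import cv2
import numpy as np

original_size = (720, 1280)  # hypothetical (width, height) of the cropped source image
frames = [np.zeros((256, 256, 3), dtype=np.uint8) for _ in range(3)]  # stand-in 256x256 frames

# Keep the width at 256 and scale the height by the original height/width ratio,
# mirroring the list comprehension added in animate.py.
new_h = int(256.0 * original_size[1] / original_size[0])
resized = [cv2.resize(frame, (256, new_h)) for frame in frames]
print(resized[0].shape)  # (455, 256, 3)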
src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc and b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/generator.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/generator.cpython-38.pyc and b/src/facerender/modules/__pycache__/generator.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc and b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/make_animation.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc and b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/mapping.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc and b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/util.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/util.cpython-38.pyc and b/src/facerender/modules/__pycache__/util.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc differ
 
src/generate_batch.py CHANGED
@@ -1,18 +1,11 @@
  import os
+
+ from tqdm import tqdm
  import torch
  import numpy as np
  import random
  import scipy.io as scio
  import src.utils.audio as audio
- import subprocess, platform
-
- from pydub import AudioSegment
-
- def mp3_to_wav(mp3_filename,wav_filename,frame_rate):
- mp3_file = AudioSegment.from_mp3(file=mp3_filename)
- mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav")
-
-

  def crop_pad_audio(wav, audio_length):
  if len(wav) > audio_length:
@@ -33,7 +26,6 @@ def generate_blink_seq(num_frames):
  ratio = np.zeros((num_frames,1))
  frame_id = 0
  while frame_id in range(num_frames):
- #start = random.choice(range(60,70))
  start = 80
  if frame_id+start+9<=num_frames - 1:
  ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5]
@@ -48,7 +40,6 @@ def generate_blink_seq_randomly(num_frames):
  return ratio
  frame_id = 0
  while frame_id in range(num_frames):
- #start = random.choice(range(60,70))
  start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70)))
  if frame_id+start+5<=num_frames - 1:
  ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5]
@@ -60,8 +51,6 @@ def get_data(first_coeff_path, audio_path, device):
  def get_data(first_coeff_path, audio_path, device):

  syncnet_mel_step_size = 16
- syncnet_T = 5
- MAX_FRAME = 32
  fps = 25

  pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0]
@@ -71,23 +60,14 @@ def get_data(first_coeff_path, audio_path, device):
  source_semantics_dict = scio.loadmat(source_semantics_path)
  ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70

- print(audio_path)
- if '.mp3' in audio_path:
- print(audio_path)
- mp3_to_wav(audio_path, audio_path.replace('.mp3','.wav'), 16000)
- new_audio = audio_path.replace('.mp3','.wav')
- else:
- new_audio = audio_path
-
- wav = audio.load_wav(new_audio, 16000)
-
+ wav = audio.load_wav(audio_path, 16000)
  wav_length, num_frames = parse_audio_length(len(wav), 16000, 25)
  wav = crop_pad_audio(wav, wav_length)
  orig_mel = audio.melspectrogram(wav).T
  spec = orig_mel.copy() # nframes 80
  indiv_mels = []

- for i in range(num_frames):
+ for i in tqdm(range(num_frames), 'mel:'):
  start_frame_num = i-2
  start_idx = int(80. * (start_frame_num / float(fps)))
  end_idx = start_idx + syncnet_mel_step_size
@@ -97,7 +77,6 @@ def get_data(first_coeff_path, audio_path, device):
  indiv_mels.append(m.T)
  indiv_mels = np.asarray(indiv_mels) # T 80 16
  ratio = generate_blink_seq_randomly(num_frames) # T
-

  indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16
  ratio = torch.FloatTensor(ratio).unsqueeze(0) # bs T
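
In get_data above, each video frame i pulls a 16-step window from the mel spectrogram starting at int(80 * (i - 2) / fps), and the loop is now wrapped in tqdm for progress reporting. A toy example of the same indexing, with a simple boundary clamp added here for illustration:

import numpy as np

fps = 25
syncnet_mel_step_size = 16
orig_mel = np.random.rand(400, 80)  # stand-in mel spectrogram: (mel frames, 80 bins)

def frame_window(spec, i):
    # 80 mel frames per second of audio; grab 16 of them starting ~2 video frames back.
    start_idx = int(80.0 * ((i - 2) / float(fps)))
    start_idx = max(0, min(start_idx, spec.shape[0] - syncnet_mel_step_size))
    return spec[start_idx:start_idx + syncnet_mel_step_size]

print(frame_window(orig_mel, 0).shape)    # (16, 80)
print(frame_window(orig_mel, 100).shape)  # (16, 80)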
src/gradio_demo.py ADDED
@@ -0,0 +1,113 @@
+ import torch, uuid
+ from time import gmtime, strftime
+ import os, sys, shutil
+ from src.utils.preprocess import CropAndExtract
+ from src.test_audio2coeff import Audio2Coeff
+ from src.facerender.animate import AnimateFromCoeff
+ from src.generate_batch import get_data
+ from src.generate_facerender_batch import get_facerender_data
+ from src.utils.text2speech import text2speech
+
+ from pydub import AudioSegment
+
+ def mp3_to_wav(mp3_filename,wav_filename,frame_rate):
+ mp3_file = AudioSegment.from_file(file=mp3_filename)
+ mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav")
+
+
+ class SadTalker():
+
+ def __init__(self, checkpoint_path='checkpoints', config_path='src/config'):
+
+ if torch.cuda.is_available() :
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ os.environ['TORCH_HOME']= checkpoint_path
+
+ path_of_lm_croper = os.path.join( checkpoint_path, 'shape_predictor_68_face_landmarks.dat')
+ path_of_net_recon_model = os.path.join( checkpoint_path, 'epoch_20.pth')
+ dir_of_BFM_fitting = os.path.join( checkpoint_path, 'BFM_Fitting')
+ wav2lip_checkpoint = os.path.join( checkpoint_path, 'wav2lip.pth')
+
+ audio2pose_checkpoint = os.path.join( checkpoint_path, 'auido2pose_00140-model.pth')
+ audio2pose_yaml_path = os.path.join( config_path, 'auido2pose.yaml')
+
+ audio2exp_checkpoint = os.path.join( checkpoint_path, 'auido2exp_00300-model.pth')
+ audio2exp_yaml_path = os.path.join( config_path, 'auido2exp.yaml')
+
+ free_view_checkpoint = os.path.join( checkpoint_path, 'facevid2vid_00189-model.pth.tar')
+ mapping_checkpoint = os.path.join( checkpoint_path, 'mapping_00229-model.pth.tar')
+ facerender_yaml_path = os.path.join( config_path, 'facerender.yaml')
+
+ #init model
+ print(path_of_lm_croper)
+ self.preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
+
+ print(audio2pose_checkpoint)
+ self.audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
+ audio2exp_checkpoint, audio2exp_yaml_path, wav2lip_checkpoint, device)
+ print(free_view_checkpoint)
+ self.animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
+ facerender_yaml_path, device)
+ self.device = device
+
+ def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
+
+ time_tag = str(uuid.uuid4())
+ save_dir = os.path.join(result_dir, time_tag)
+ os.makedirs(save_dir, exist_ok=True)
+
+ input_dir = os.path.join(save_dir, 'input')
+ os.makedirs(input_dir, exist_ok=True)
+
+ print(source_image)
+ pic_path = os.path.join(input_dir, os.path.basename(source_image))
+ shutil.move(source_image, input_dir)
+
+ if os.path.isfile(driven_audio):
+ audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
+
+ #### mp3 to wav
+ if '.mp3' in audio_path:
+ mp3_to_wav(driven_audio, audio_path.replace('.mp3', '.wav'), 16000)
+ audio_path = audio_path.replace('.mp3', '.wav')
+ else:
+ shutil.move(driven_audio, input_dir)
+ else:
+ text2speech
+
+
+ os.makedirs(save_dir, exist_ok=True)
+ pose_style = 0
+ #crop image and extract 3dmm from image
+ first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
+ os.makedirs(first_frame_dir, exist_ok=True)
+ first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir)
+
+ if first_coeff_path is None:
+ raise AttributeError("No face is detected")
+
+ #audio2ceoff
+ batch = get_data(first_coeff_path, audio_path, self.device) # longer audio?
+ coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style)
+ #coeff2video
+ batch_size = 4
+ data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
+ self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
+ video_name = data['video_name']
+ print(f'The generated video is named {video_name} in {save_dir}')
+
+ torch.cuda.empty_cache()
+ torch.cuda.synchronize()
+ import gc; gc.collect()
+
+ if use_enhancer:
+ return os.path.join(save_dir, video_name+'_enhanced.mp4'), os.path.join(save_dir, video_name+'_enhanced.mp4')
+
+ else:
+ return os.path.join(save_dir, video_name+'.mp4'), os.path.join(save_dir, video_name+'.mp4')
+
+
+
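
The mp3_to_wav helper in the new gradio_demo.py decodes an uploaded mp3 with pydub and re-exports it as a 16 kHz wav before the rest of the pipeline runs. A usage sketch under those assumptions; the file paths are placeholders and pydub needs ffmpeg available on the system:

from pydub import AudioSegment

def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
    # Decode the mp3, resample to the requested rate, and write a wav file.
    sound = AudioSegment.from_file(file=mp3_filename)
    sound.set_frame_rate(frame_rate).export(wav_filename, format="wav")

mp3_to_wav("input.mp3", "input.wav", 16000)  # placeholder paths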
src/test_audio2coeff.py CHANGED
@@ -81,7 +81,7 @@ class Audio2Coeff():

  savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])),
  {'coeff_3dmm': coeffs_pred_numpy})
- torch.cuda.empty_cache()
+
  return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name']))


src/utils/__pycache__/audio.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/audio.cpython-38.pyc and b/src/utils/__pycache__/audio.cpython-38.pyc differ
 
src/utils/__pycache__/croper.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/croper.cpython-38.pyc and b/src/utils/__pycache__/croper.cpython-38.pyc differ
 
src/utils/__pycache__/face_enhancer.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/face_enhancer.cpython-38.pyc and b/src/utils/__pycache__/face_enhancer.cpython-38.pyc differ
 
src/utils/__pycache__/hparams.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/hparams.cpython-38.pyc and b/src/utils/__pycache__/hparams.cpython-38.pyc differ
 
src/utils/__pycache__/preprocess.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/preprocess.cpython-38.pyc and b/src/utils/__pycache__/preprocess.cpython-38.pyc differ