vinthony committed on
Commit
a86a2b8
·
1 Parent(s): 74a9811
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitignore +153 -0
  2. app.py +26 -8
  3. modules/__pycache__/sadtalker_test.cpython-38.pyc +0 -0
  4. modules/__pycache__/text2speech.cpython-38.pyc +0 -0
  5. modules/sadtalker_test.py +3 -3
  6. src/__pycache__/generate_batch.cpython-38.pyc +0 -0
  7. src/__pycache__/generate_facerender_batch.cpython-38.pyc +0 -0
  8. src/__pycache__/test_audio2coeff.cpython-38.pyc +0 -0
  9. src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc +0 -0
  10. src/audio2exp_models/__pycache__/networks.cpython-38.pyc +0 -0
  11. src/audio2exp_models/audio2exp.py +15 -5
  12. src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc +0 -0
  13. src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc +0 -0
  14. src/audio2pose_models/__pycache__/cvae.cpython-38.pyc +0 -0
  15. src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc +0 -0
  16. src/audio2pose_models/__pycache__/networks.cpython-38.pyc +0 -0
  17. src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc +0 -0
  18. src/audio2pose_models/audio2pose.py +1 -0
  19. src/audio2pose_models/audio_encoder.py +2 -2
  20. src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc +0 -0
  21. src/face3d/extract_kp_videos.py +1 -1
  22. src/face3d/models/__pycache__/__init__.cpython-38.pyc +0 -0
  23. src/face3d/models/__pycache__/base_model.cpython-38.pyc +0 -0
  24. src/face3d/models/__pycache__/networks.cpython-38.pyc +0 -0
  25. src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc +0 -0
  26. src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc +0 -0
  27. src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc +0 -0
  28. src/face3d/util/__pycache__/__init__.cpython-38.pyc +0 -0
  29. src/face3d/util/__pycache__/load_mats.cpython-38.pyc +0 -0
  30. src/face3d/util/__pycache__/preprocess.cpython-38.pyc +0 -0
  31. src/facerender/__pycache__/animate.cpython-38.pyc +0 -0
  32. src/facerender/animate.py +10 -1
  33. src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc +0 -0
  34. src/facerender/modules/__pycache__/generator.cpython-38.pyc +0 -0
  35. src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc +0 -0
  36. src/facerender/modules/__pycache__/make_animation.cpython-38.pyc +0 -0
  37. src/facerender/modules/__pycache__/mapping.cpython-38.pyc +0 -0
  38. src/facerender/modules/__pycache__/util.cpython-38.pyc +0 -0
  39. src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc +0 -0
  40. src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc +0 -0
  41. src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc +0 -0
  42. src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc +0 -0
  43. src/generate_batch.py +4 -25
  44. src/gradio_demo.py +113 -0
  45. src/test_audio2coeff.py +1 -1
  46. src/utils/__pycache__/audio.cpython-38.pyc +0 -0
  47. src/utils/__pycache__/croper.cpython-38.pyc +0 -0
  48. src/utils/__pycache__/face_enhancer.cpython-38.pyc +0 -0
  49. src/utils/__pycache__/hparams.cpython-38.pyc +0 -0
  50. src/utils/__pycache__/preprocess.cpython-38.pyc +0 -0
.gitignore ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
app.py CHANGED
@@ -27,15 +27,15 @@ def sadtalker_demo(result_dir='./tmp/'):
27
  <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
28
  <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")
29
 
30
- with gr.Row().style(equal_height=False):
31
  with gr.Column(variant='panel'):
32
  with gr.Tabs(elem_id="sadtalker_source_image"):
33
  with gr.TabItem('Upload image'):
34
  with gr.Row():
35
- source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256,width=256)
36
 
37
  with gr.Tabs(elem_id="sadtalker_driven_audio"):
38
- with gr.TabItem('Upload audio(wav only currently)'):
39
  with gr.Column(variant='panel'):
40
  driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
41
 
@@ -43,12 +43,13 @@ def sadtalker_demo(result_dir='./tmp/'):
43
  with gr.Tabs(elem_id="sadtalker_checkbox"):
44
  with gr.TabItem('Settings'):
45
  with gr.Column(variant='panel'):
46
- is_still_mode = gr.Checkbox(label="w/ Still Mode (fewer head motion)")
47
- enhancer = gr.Checkbox(label="w/ GFPGAN as Face enhancer")
 
48
  submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
49
 
50
  with gr.Tabs(elem_id="sadtalker_genearted"):
51
- gen_video = gr.Video(label="Generated video", format="mp4").style(height=256,width=256)
52
  gen_text = gr.Textbox(visible=False)
53
 
54
  with gr.Row():
@@ -57,7 +58,22 @@ def sadtalker_demo(result_dir='./tmp/'):
57
  'examples/source_image/art_10.png',
58
  'examples/driven_audio/deyu.wav',
59
  True,
 
60
  False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  ]
62
  ]
63
  gr.Examples(examples=examples,
@@ -65,7 +81,8 @@ def sadtalker_demo(result_dir='./tmp/'):
65
  source_image,
66
  driven_audio,
67
  is_still_mode,
68
- enhancer,
 
69
  gr.Textbox(value=result_dir, visible=False)],
70
  outputs=[gen_video, gen_text],
71
  fn=sad_talker.test,
@@ -76,7 +93,8 @@ def sadtalker_demo(result_dir='./tmp/'):
76
  inputs=[source_image,
77
  driven_audio,
78
  is_still_mode,
79
- enhancer,
 
80
  gr.Textbox(value=result_dir, visible=False)],
81
  outputs=[gen_video, gen_text]
82
  )
 
27
  <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \
28
  <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'> Github </div>")
29
 
30
+ with gr.Row():
31
  with gr.Column(variant='panel'):
32
  with gr.Tabs(elem_id="sadtalker_source_image"):
33
  with gr.TabItem('Upload image'):
34
  with gr.Row():
35
+ source_image = gr.Image(label="Source image", source="upload", type="filepath").style(height=256)
36
 
37
  with gr.Tabs(elem_id="sadtalker_driven_audio"):
38
+ with gr.TabItem('Upload audio(wav/mp3 only currently)'):
39
  with gr.Column(variant='panel'):
40
  driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
41
 
 
43
  with gr.Tabs(elem_id="sadtalker_checkbox"):
44
  with gr.TabItem('Settings'):
45
  with gr.Column(variant='panel'):
46
+ is_still_mode = gr.Checkbox(label="Still Mode (fewer head motion)").style(container=True)
47
+ is_resize_mode = gr.Checkbox(label="Resize Mode (⚠️ Resize mode need manually crop the image firstly, can handle larger image crop)").style(container=True)
48
+ is_enhance_mode = gr.Checkbox(label="Enhance Mode (better face quality )").style(container=True)
49
  submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
50
 
51
  with gr.Tabs(elem_id="sadtalker_genearted"):
52
+ gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
53
  gen_text = gr.Textbox(visible=False)
54
 
55
  with gr.Row():
 
58
  'examples/source_image/art_10.png',
59
  'examples/driven_audio/deyu.wav',
60
  True,
61
+ False,
62
  False
63
+ ],
64
+ [
65
+ 'examples/source_image/art_1.png',
66
+ 'examples/driven_audio/fayu.wav',
67
+ True,
68
+ True,
69
+ False
70
+ ],
71
+ [
72
+ 'examples/source_image/art_9.png',
73
+ 'examples/driven_audio/itosinger1.wav',
74
+ True,
75
+ False,
76
+ True
77
  ]
78
  ]
79
  gr.Examples(examples=examples,
 
81
  source_image,
82
  driven_audio,
83
  is_still_mode,
84
+ is_resize_mode,
85
+ is_enhance_mode,
86
  gr.Textbox(value=result_dir, visible=False)],
87
  outputs=[gen_video, gen_text],
88
  fn=sad_talker.test,
 
93
  inputs=[source_image,
94
  driven_audio,
95
  is_still_mode,
96
+ is_resize_mode,
97
+ is_enhance_mode,
98
  gr.Textbox(value=result_dir, visible=False)],
99
  outputs=[gen_video, gen_text]
100
  )
modules/__pycache__/sadtalker_test.cpython-38.pyc CHANGED
Binary files a/modules/__pycache__/sadtalker_test.cpython-38.pyc and b/modules/__pycache__/sadtalker_test.cpython-38.pyc differ
 
modules/__pycache__/text2speech.cpython-38.pyc CHANGED
Binary files a/modules/__pycache__/text2speech.cpython-38.pyc and b/modules/__pycache__/text2speech.cpython-38.pyc differ
 
modules/sadtalker_test.py CHANGED
@@ -60,7 +60,7 @@ class SadTalker():
60
  facerender_yaml_path, device)
61
  self.device = device
62
 
63
- def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
64
 
65
  time_tag = str(uuid.uuid4()) # strftime("%Y_%m_%d_%H.%M.%S")
66
  save_dir = os.path.join(result_dir, time_tag)
@@ -91,7 +91,7 @@ class SadTalker():
91
  #crop image and extract 3dmm from image
92
  first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
93
  os.makedirs(first_frame_dir, exist_ok=True)
94
- first_coeff_path, crop_pic_path = self.preprocess_model.generate(pic_path, first_frame_dir)
95
  if first_coeff_path is None:
96
  raise AttributeError("No face is detected")
97
 
@@ -101,7 +101,7 @@ class SadTalker():
101
  #coeff2video
102
  batch_size = 4
103
  data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
104
- self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None)
105
  video_name = data['video_name']
106
  print(f'The generated video is named {video_name} in {save_dir}')
107
 
 
60
  facerender_yaml_path, device)
61
  self.device = device
62
 
63
+ def test(self, source_image, driven_audio, still_mode, resize_mode, use_enhancer, result_dir='./'):
64
 
65
  time_tag = str(uuid.uuid4()) # strftime("%Y_%m_%d_%H.%M.%S")
66
  save_dir = os.path.join(result_dir, time_tag)
 
91
  #crop image and extract 3dmm from image
92
  first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
93
  os.makedirs(first_frame_dir, exist_ok=True)
94
+ first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir, crop_or_resize= 'crop' if resize_mode == 'crop' else 'resize')
95
  if first_coeff_path is None:
96
  raise AttributeError("No face is detected")
97
 
 
101
  #coeff2video
102
  batch_size = 4
103
  data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
104
+ self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
105
  video_name = data['video_name']
106
  print(f'The generated video is named {video_name} in {save_dir}')
107
 
src/__pycache__/generate_batch.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/generate_batch.cpython-38.pyc and b/src/__pycache__/generate_batch.cpython-38.pyc differ
 
src/__pycache__/generate_facerender_batch.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/generate_facerender_batch.cpython-38.pyc and b/src/__pycache__/generate_facerender_batch.cpython-38.pyc differ
 
src/__pycache__/test_audio2coeff.cpython-38.pyc CHANGED
Binary files a/src/__pycache__/test_audio2coeff.cpython-38.pyc and b/src/__pycache__/test_audio2coeff.cpython-38.pyc differ
 
src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc CHANGED
Binary files a/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc and b/src/audio2exp_models/__pycache__/audio2exp.cpython-38.pyc differ
 
src/audio2exp_models/__pycache__/networks.cpython-38.pyc CHANGED
Binary files a/src/audio2exp_models/__pycache__/networks.cpython-38.pyc and b/src/audio2exp_models/__pycache__/networks.cpython-38.pyc differ
 
src/audio2exp_models/audio2exp.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
  from torch import nn
3
 
@@ -15,15 +16,24 @@ class Audio2Exp(nn.Module):
15
  bs = mel_input.shape[0]
16
  T = mel_input.shape[1]
17
 
18
- ref = batch['ref'][:, :, :64].repeat((1,T,1)) #bs T 64
19
- ratio = batch['ratio_gt'] #bs T
20
 
21
- audiox = mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16
22
- exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64
 
 
 
 
 
 
 
 
 
 
23
 
24
  # BS x T x 64
25
  results_dict = {
26
- 'exp_coeff_pred': exp_coeff_pred
27
  }
28
  return results_dict
29
 
 
1
+ from tqdm import tqdm
2
  import torch
3
  from torch import nn
4
 
 
16
  bs = mel_input.shape[0]
17
  T = mel_input.shape[1]
18
 
19
+ exp_coeff_pred = []
 
20
 
21
+ for i in tqdm(range(0, T, 10),'audio2exp:'): # every 10 frames
22
+
23
+ current_mel_input = mel_input[:,i:i+10]
24
+
25
+ ref = batch['ref'][:, :, :64].repeat((1,current_mel_input.shape[1],1)) #bs T 64
26
+ ratio = batch['ratio_gt'][:, i:i+10] #bs T
27
+
28
+ audiox = current_mel_input.view(-1, 1, 80, 16) # bs*T 1 80 16
29
+
30
+ curr_exp_coeff_pred = self.netG(audiox, ref, ratio) # bs T 64
31
+
32
+ exp_coeff_pred += [curr_exp_coeff_pred]
33
 
34
  # BS x T x 64
35
  results_dict = {
36
+ 'exp_coeff_pred': torch.cat(exp_coeff_pred, axis=1)
37
  }
38
  return results_dict
39
 
src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio2pose.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc and b/src/audio2pose_models/__pycache__/audio_encoder.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/cvae.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc and b/src/audio2pose_models/__pycache__/cvae.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc and b/src/audio2pose_models/__pycache__/discriminator.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/networks.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/networks.cpython-38.pyc and b/src/audio2pose_models/__pycache__/networks.cpython-38.pyc differ
 
src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc CHANGED
Binary files a/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc and b/src/audio2pose_models/__pycache__/res_unet.cpython-38.pyc differ
 
src/audio2pose_models/audio2pose.py CHANGED
@@ -76,6 +76,7 @@ class Audio2Pose(nn.Module):
76
  batch['audio_emb'] = audio_emb
77
  batch = self.netG.test(batch)
78
  pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6
 
79
  if re != 0:
80
  z = torch.randn(bs, self.latent_dim).to(ref.device)
81
  batch['z'] = z
 
76
  batch['audio_emb'] = audio_emb
77
  batch = self.netG.test(batch)
78
  pose_motion_pred_list.append(batch['pose_motion_pred']) #list of bs seq_len 6
79
+
80
  if re != 0:
81
  z = torch.randn(bs, self.latent_dim).to(ref.device)
82
  batch['z'] = z
src/audio2pose_models/audio_encoder.py CHANGED
@@ -19,7 +19,7 @@ class Conv2d(nn.Module):
19
  return self.act(out)
20
 
21
  class AudioEncoder(nn.Module):
22
- def __init__(self, wav2lip_checkpoint, device='cpu'):
23
  super(AudioEncoder, self).__init__()
24
 
25
  self.audio_encoder = nn.Sequential(
@@ -42,7 +42,7 @@ class AudioEncoder(nn.Module):
42
  Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
43
 
44
  #### load the pre-trained audio_encoder\
45
- wav2lip_state_dict = torch.load(wav2lip_checkpoint, map_location=device)['state_dict']
46
  state_dict = self.audio_encoder.state_dict()
47
 
48
  for k,v in wav2lip_state_dict.items():
 
19
  return self.act(out)
20
 
21
  class AudioEncoder(nn.Module):
22
+ def __init__(self, wav2lip_checkpoint):
23
  super(AudioEncoder, self).__init__()
24
 
25
  self.audio_encoder = nn.Sequential(
 
42
  Conv2d(512, 512, kernel_size=1, stride=1, padding=0),)
43
 
44
  #### load the pre-trained audio_encoder\
45
+ wav2lip_state_dict = torch.load(wav2lip_checkpoint)['state_dict']
46
  state_dict = self.audio_encoder.state_dict()
47
 
48
  for k,v in wav2lip_state_dict.items():
src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc CHANGED
Binary files a/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc and b/src/face3d/__pycache__/extract_kp_videos.cpython-38.pyc differ
 
src/face3d/extract_kp_videos.py CHANGED
@@ -71,7 +71,7 @@ def read_video(filename):
71
  def run(data):
72
  filename, opt, device = data
73
  os.environ['CUDA_VISIBLE_DEVICES'] = device
74
- kp_extractor = KeypointExtractor(device)
75
  images = read_video(filename)
76
  name = filename.split('/')[-2:]
77
  os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
 
71
  def run(data):
72
  filename, opt, device = data
73
  os.environ['CUDA_VISIBLE_DEVICES'] = device
74
+ kp_extractor = KeypointExtractor()
75
  images = read_video(filename)
76
  name = filename.split('/')[-2:]
77
  os.makedirs(os.path.join(opt.output_dir, name[-2]), exist_ok=True)
src/face3d/models/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/__pycache__/__init__.cpython-38.pyc differ
 
src/face3d/models/__pycache__/base_model.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/__pycache__/base_model.cpython-38.pyc and b/src/face3d/models/__pycache__/base_model.cpython-38.pyc differ
 
src/face3d/models/__pycache__/networks.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/__pycache__/networks.cpython-38.pyc and b/src/face3d/models/__pycache__/networks.cpython-38.pyc differ
 
src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/__init__.cpython-38.pyc differ
 
src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/iresnet.cpython-38.pyc differ
 
src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc CHANGED
Binary files a/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc and b/src/face3d/models/arcface_torch/backbones/__pycache__/mobilefacenet.cpython-38.pyc differ
 
src/face3d/util/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/face3d/util/__pycache__/__init__.cpython-38.pyc and b/src/face3d/util/__pycache__/__init__.cpython-38.pyc differ
 
src/face3d/util/__pycache__/load_mats.cpython-38.pyc CHANGED
Binary files a/src/face3d/util/__pycache__/load_mats.cpython-38.pyc and b/src/face3d/util/__pycache__/load_mats.cpython-38.pyc differ
 
src/face3d/util/__pycache__/preprocess.cpython-38.pyc CHANGED
Binary files a/src/face3d/util/__pycache__/preprocess.cpython-38.pyc and b/src/face3d/util/__pycache__/preprocess.cpython-38.pyc differ
 
src/facerender/__pycache__/animate.cpython-38.pyc CHANGED
Binary files a/src/facerender/__pycache__/animate.cpython-38.pyc and b/src/facerender/__pycache__/animate.cpython-38.pyc differ
 
src/facerender/animate.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import yaml
3
  import numpy as np
4
  import warnings
@@ -106,7 +107,7 @@ class AnimateFromCoeff():
106
 
107
  return checkpoint['epoch']
108
 
109
- def generate(self, x, video_save_dir, enhancer=None):
110
 
111
  source_image=x['source_image'].type(torch.FloatTensor)
112
  source_semantics=x['source_semantics'].type(torch.FloatTensor)
@@ -137,6 +138,10 @@ class AnimateFromCoeff():
137
  video.append(image)
138
  result = img_as_ubyte(video)
139
 
 
 
 
 
140
  video_name = x['video_name'] + '.mp4'
141
  path = os.path.join(video_save_dir, 'temp_'+video_name)
142
  imageio.mimsave(path, result, fps=float(25))
@@ -146,6 +151,10 @@ class AnimateFromCoeff():
146
  av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
147
  enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer)
148
  enhanced_images = face_enhancer(result, method=enhancer)
 
 
 
 
149
  imageio.mimsave(enhanced_path, enhanced_images, fps=float(25))
150
 
151
  av_path = os.path.join(video_save_dir, video_name)
 
1
  import os
2
+ import cv2
3
  import yaml
4
  import numpy as np
5
  import warnings
 
107
 
108
  return checkpoint['epoch']
109
 
110
+ def generate(self, x, video_save_dir, enhancer=None, original_size=None):
111
 
112
  source_image=x['source_image'].type(torch.FloatTensor)
113
  source_semantics=x['source_semantics'].type(torch.FloatTensor)
 
138
  video.append(image)
139
  result = img_as_ubyte(video)
140
 
141
+ ### the generated video is 256x256, so we keep the aspect ratio,
142
+ if original_size:
143
+ result = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in result ]
144
+
145
  video_name = x['video_name'] + '.mp4'
146
  path = os.path.join(video_save_dir, 'temp_'+video_name)
147
  imageio.mimsave(path, result, fps=float(25))
 
151
  av_path_enhancer = os.path.join(video_save_dir, video_name_enhancer)
152
  enhanced_path = os.path.join(video_save_dir, 'temp_'+video_name_enhancer)
153
  enhanced_images = face_enhancer(result, method=enhancer)
154
+
155
+ if original_size:
156
+ enhanced_images = [ cv2.resize(result_i,(256, int(256.0 * original_size[1]/original_size[0]) )) for result_i in enhanced_images ]
157
+
158
  imageio.mimsave(enhanced_path, enhanced_images, fps=float(25))
159
 
160
  av_path = os.path.join(video_save_dir, video_name)
src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc and b/src/facerender/modules/__pycache__/dense_motion.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/generator.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/generator.cpython-38.pyc and b/src/facerender/modules/__pycache__/generator.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc and b/src/facerender/modules/__pycache__/keypoint_detector.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/make_animation.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc and b/src/facerender/modules/__pycache__/make_animation.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/mapping.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/mapping.cpython-38.pyc and b/src/facerender/modules/__pycache__/mapping.cpython-38.pyc differ
 
src/facerender/modules/__pycache__/util.cpython-38.pyc CHANGED
Binary files a/src/facerender/modules/__pycache__/util.cpython-38.pyc and b/src/facerender/modules/__pycache__/util.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/__init__.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/batchnorm.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/comm.cpython-38.pyc differ
 
src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc CHANGED
Binary files a/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc and b/src/facerender/sync_batchnorm/__pycache__/replicate.cpython-38.pyc differ
 
src/generate_batch.py CHANGED
@@ -1,18 +1,11 @@
1
  import os
 
 
2
  import torch
3
  import numpy as np
4
  import random
5
  import scipy.io as scio
6
  import src.utils.audio as audio
7
- import subprocess, platform
8
-
9
- from pydub import AudioSegment
10
-
11
- def mp3_to_wav(mp3_filename,wav_filename,frame_rate):
12
- mp3_file = AudioSegment.from_mp3(file=mp3_filename)
13
- mp3_file.set_frame_rate(frame_rate).export(wav_filename,format="wav")
14
-
15
-
16
 
17
  def crop_pad_audio(wav, audio_length):
18
  if len(wav) > audio_length:
@@ -33,7 +26,6 @@ def generate_blink_seq(num_frames):
33
  ratio = np.zeros((num_frames,1))
34
  frame_id = 0
35
  while frame_id in range(num_frames):
36
- #start = random.choice(range(60,70))
37
  start = 80
38
  if frame_id+start+9<=num_frames - 1:
39
  ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5]
@@ -48,7 +40,6 @@ def generate_blink_seq_randomly(num_frames):
48
  return ratio
49
  frame_id = 0
50
  while frame_id in range(num_frames):
51
- #start = random.choice(range(60,70))
52
  start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70)))
53
  if frame_id+start+5<=num_frames - 1:
54
  ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5]
@@ -60,8 +51,6 @@ def generate_blink_seq_randomly(num_frames):
60
  def get_data(first_coeff_path, audio_path, device):
61
 
62
  syncnet_mel_step_size = 16
63
- syncnet_T = 5
64
- MAX_FRAME = 32
65
  fps = 25
66
 
67
  pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0]
@@ -71,23 +60,14 @@ def get_data(first_coeff_path, audio_path, device):
71
  source_semantics_dict = scio.loadmat(source_semantics_path)
72
  ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70
73
 
74
- print(audio_path)
75
- if '.mp3' in audio_path:
76
- print(audio_path)
77
- mp3_to_wav(audio_path, audio_path.replace('.mp3','.wav'), 16000)
78
- new_audio = audio_path.replace('.mp3','.wav')
79
- else:
80
- new_audio = audio_path
81
-
82
- wav = audio.load_wav(new_audio, 16000)
83
-
84
  wav_length, num_frames = parse_audio_length(len(wav), 16000, 25)
85
  wav = crop_pad_audio(wav, wav_length)
86
  orig_mel = audio.melspectrogram(wav).T
87
  spec = orig_mel.copy() # nframes 80
88
  indiv_mels = []
89
 
90
- for i in range(num_frames):
91
  start_frame_num = i-2
92
  start_idx = int(80. * (start_frame_num / float(fps)))
93
  end_idx = start_idx + syncnet_mel_step_size
@@ -97,7 +77,6 @@ def get_data(first_coeff_path, audio_path, device):
97
  indiv_mels.append(m.T)
98
  indiv_mels = np.asarray(indiv_mels) # T 80 16
99
  ratio = generate_blink_seq_randomly(num_frames) # T
100
-
101
 
102
  indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16
103
  ratio = torch.FloatTensor(ratio).unsqueeze(0) # bs T
 
1
  import os
2
+
3
+ from tqdm import tqdm
4
  import torch
5
  import numpy as np
6
  import random
7
  import scipy.io as scio
8
  import src.utils.audio as audio
 
 
 
 
 
 
 
 
 
9
 
10
  def crop_pad_audio(wav, audio_length):
11
  if len(wav) > audio_length:
 
26
  ratio = np.zeros((num_frames,1))
27
  frame_id = 0
28
  while frame_id in range(num_frames):
 
29
  start = 80
30
  if frame_id+start+9<=num_frames - 1:
31
  ratio[frame_id+start:frame_id+start+9, 0] = [0.5,0.6,0.7,0.9,1, 0.9, 0.7,0.6,0.5]
 
40
  return ratio
41
  frame_id = 0
42
  while frame_id in range(num_frames):
 
43
  start = random.choice(range(min(10,num_frames), min(int(num_frames/2), 70)))
44
  if frame_id+start+5<=num_frames - 1:
45
  ratio[frame_id+start:frame_id+start+5, 0] = [0.5, 0.9, 1.0, 0.9, 0.5]
 
51
  def get_data(first_coeff_path, audio_path, device):
52
 
53
  syncnet_mel_step_size = 16
 
 
54
  fps = 25
55
 
56
  pic_name = os.path.splitext(os.path.split(first_coeff_path)[-1])[0]
 
60
  source_semantics_dict = scio.loadmat(source_semantics_path)
61
  ref_coeff = source_semantics_dict['coeff_3dmm'][:1,:70] #1 70
62
 
63
+ wav = audio.load_wav(audio_path, 16000)
 
 
 
 
 
 
 
 
 
64
  wav_length, num_frames = parse_audio_length(len(wav), 16000, 25)
65
  wav = crop_pad_audio(wav, wav_length)
66
  orig_mel = audio.melspectrogram(wav).T
67
  spec = orig_mel.copy() # nframes 80
68
  indiv_mels = []
69
 
70
+ for i in tqdm(range(num_frames), 'mel:'):
71
  start_frame_num = i-2
72
  start_idx = int(80. * (start_frame_num / float(fps)))
73
  end_idx = start_idx + syncnet_mel_step_size
 
77
  indiv_mels.append(m.T)
78
  indiv_mels = np.asarray(indiv_mels) # T 80 16
79
  ratio = generate_blink_seq_randomly(num_frames) # T
 
80
 
81
  indiv_mels = torch.FloatTensor(indiv_mels).unsqueeze(1).unsqueeze(0) # bs T 1 80 16
82
  ratio = torch.FloatTensor(ratio).unsqueeze(0) # bs T
src/gradio_demo.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, uuid
2
+ from time import gmtime, strftime
3
+ import os, sys, shutil
4
+ from src.utils.preprocess import CropAndExtract
5
+ from src.test_audio2coeff import Audio2Coeff
6
+ from src.facerender.animate import AnimateFromCoeff
7
+ from src.generate_batch import get_data
8
+ from src.generate_facerender_batch import get_facerender_data
9
+ from src.utils.text2speech import text2speech
10
+
11
+ from pydub import AudioSegment
12
+
13
def mp3_to_wav(mp3_filename,wav_filename,frame_rate):
    """Re-export an audio file as wav at the requested frame rate.

    Args:
        mp3_filename: Path of the audio file to read.
        wav_filename: Path the wav output is written to.
        frame_rate: Frame rate applied to the audio before it is exported.
    """
    segment = AudioSegment.from_file(file=mp3_filename)
    resampled = segment.set_frame_rate(frame_rate)
    resampled.export(wav_filename, format="wav")
16
+
17
+
18
class SadTalker():
    """Bundle the three pipeline stages used to animate a portrait from speech.

    Construction builds a ``CropAndExtract`` preprocessor, an ``Audio2Coeff``
    model and an ``AnimateFromCoeff`` renderer from files located under
    ``checkpoint_path`` and ``config_path``; ``test`` runs them in sequence.
    """

    def __init__(self, checkpoint_path='checkpoints', config_path='src/config'):
        """Locate checkpoints/configs and instantiate the three models.

        Args:
            checkpoint_path: Directory holding the model weight files. It is
                also exported as the ``TORCH_HOME`` environment variable.
            config_path: Directory holding the yaml configuration files.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"

        os.environ['TORCH_HOME'] = checkpoint_path

        path_of_lm_croper = os.path.join(checkpoint_path, 'shape_predictor_68_face_landmarks.dat')
        path_of_net_recon_model = os.path.join(checkpoint_path, 'epoch_20.pth')
        dir_of_BFM_fitting = os.path.join(checkpoint_path, 'BFM_Fitting')
        wav2lip_checkpoint = os.path.join(checkpoint_path, 'wav2lip.pth')

        # NOTE(review): 'auido' looks like a misspelling of 'audio', but these
        # strings are on-disk file names; confirm before renaming them.
        audio2pose_checkpoint = os.path.join(checkpoint_path, 'auido2pose_00140-model.pth')
        audio2pose_yaml_path = os.path.join(config_path, 'auido2pose.yaml')

        audio2exp_checkpoint = os.path.join(checkpoint_path, 'auido2exp_00300-model.pth')
        audio2exp_yaml_path = os.path.join(config_path, 'auido2exp.yaml')

        free_view_checkpoint = os.path.join(checkpoint_path, 'facevid2vid_00189-model.pth.tar')
        mapping_checkpoint = os.path.join(checkpoint_path, 'mapping_00229-model.pth.tar')
        facerender_yaml_path = os.path.join(config_path, 'facerender.yaml')

        #init model
        print(path_of_lm_croper)
        self.preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)

        print(audio2pose_checkpoint)
        self.audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
                                          audio2exp_checkpoint, audio2exp_yaml_path, wav2lip_checkpoint, device)
        print(free_view_checkpoint)
        self.animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
                                                   facerender_yaml_path, device)
        self.device = device

    def test(self, source_image, driven_audio, still_mode, use_enhancer, result_dir='./'):
        """Render a video of ``source_image`` driven by ``driven_audio``.

        Args:
            source_image: Path of the portrait image. The file is MOVED into
                the run's ``input`` folder.
            driven_audio: Path of the driving audio file. A ``.mp3`` file is
                re-exported as wav (frame rate 16000) into the ``input``
                folder; any other file is moved there unchanged.
            still_mode: Forwarded to ``get_facerender_data`` as ``still_mode``.
            use_enhancer: When truthy, ``'gfpgan'`` is passed as the enhancer
                and the returned path ends in ``_enhanced.mp4``.
            result_dir: Parent directory in which a new uuid4-named run folder
                is created.

        Returns:
            A tuple holding the path of the generated video twice.

        Raises:
            FileNotFoundError: ``driven_audio`` is not an existing file.
            AttributeError: The preprocessor returned no coefficient path
                (no face detected).
        """
        # Validate the audio before any directory is created or any input is
        # moved. The previous fallback branch was a no-op that left
        # ``audio_path`` unbound and crashed later, after the image had
        # already been relocated.
        if not driven_audio or not os.path.isfile(driven_audio):
            raise FileNotFoundError(f"Driving audio file not found: {driven_audio!r}")

        run_tag = str(uuid.uuid4())
        save_dir = os.path.join(result_dir, run_tag)
        input_dir = os.path.join(save_dir, 'input')
        os.makedirs(input_dir, exist_ok=True)  # also creates save_dir

        print(source_image)
        pic_path = os.path.join(input_dir, os.path.basename(source_image))
        shutil.move(source_image, input_dir)

        # Decide on the file extension only; a substring test on the whole
        # path would also match '.mp3' appearing in a directory name.
        audio_name = os.path.basename(driven_audio)
        audio_stem, audio_ext = os.path.splitext(audio_name)
        if audio_ext.lower() == '.mp3':
            #### mp3 to wav
            audio_path = os.path.join(input_dir, audio_stem + '.wav')
            mp3_to_wav(driven_audio, audio_path, 16000)
        else:
            audio_path = os.path.join(input_dir, audio_name)
            shutil.move(driven_audio, input_dir)

        pose_style = 0
        #crop image and extract 3dmm from image
        first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
        os.makedirs(first_frame_dir, exist_ok=True)
        first_coeff_path, crop_pic_path, original_size = self.preprocess_model.generate(pic_path, first_frame_dir)

        if first_coeff_path is None:
            raise AttributeError("No face is detected")

        #audio2coeff
        batch = get_data(first_coeff_path, audio_path, self.device) # longer audio?
        coeff_path = self.audio_to_coeff.generate(batch, save_dir, pose_style)
        #coeff2video
        batch_size = 4
        data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode)
        self.animate_from_coeff.generate(data, save_dir, enhancer='gfpgan' if use_enhancer else None, original_size=original_size)
        video_name = data['video_name']
        print(f'The generated video is named {video_name} in {save_dir}')

        # Only touch the CUDA runtime when it exists; synchronize() fails on a
        # CPU-only machine, which __init__ explicitly supports.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        import gc
        gc.collect()

        suffix = '_enhanced.mp4' if use_enhancer else '.mp4'
        video_path = os.path.join(save_dir, video_name + suffix)
        return video_path, video_path
111
+
112
+
113
+
src/test_audio2coeff.py CHANGED
@@ -81,7 +81,7 @@ class Audio2Coeff():
81
 
82
  savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])),
83
  {'coeff_3dmm': coeffs_pred_numpy})
84
- torch.cuda.empty_cache()
85
  return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name']))
86
 
87
 
 
81
 
82
  savemat(os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name'])),
83
  {'coeff_3dmm': coeffs_pred_numpy})
84
+
85
  return os.path.join(coeff_save_dir, '%s##%s.mat'%(batch['pic_name'], batch['audio_name']))
86
 
87
 
src/utils/__pycache__/audio.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/audio.cpython-38.pyc and b/src/utils/__pycache__/audio.cpython-38.pyc differ
 
src/utils/__pycache__/croper.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/croper.cpython-38.pyc and b/src/utils/__pycache__/croper.cpython-38.pyc differ
 
src/utils/__pycache__/face_enhancer.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/face_enhancer.cpython-38.pyc and b/src/utils/__pycache__/face_enhancer.cpython-38.pyc differ
 
src/utils/__pycache__/hparams.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/hparams.cpython-38.pyc and b/src/utils/__pycache__/hparams.cpython-38.pyc differ
 
src/utils/__pycache__/preprocess.cpython-38.pyc CHANGED
Binary files a/src/utils/__pycache__/preprocess.cpython-38.pyc and b/src/utils/__pycache__/preprocess.cpython-38.pyc differ