ameerazam08 committed on
Commit
0ca1180
1 Parent(s): 6ac0d17

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes; see the raw diff for the full list.

Files changed (50)
  1. .gitattributes +8 -0
  2. Dockerfile +89 -0
  3. LICENSE +21 -0
  4. README.md +13 -0
  5. app.py +426 -0
  6. assets/BBOX_SHIFT.md +26 -0
  7. assets/demo/man/man.png +3 -0
  8. assets/demo/monalisa/monalisa.png +0 -0
  9. assets/demo/musk/musk.png +0 -0
  10. assets/demo/sit/sit.jpeg +0 -0
  11. assets/demo/sun1/sun.png +0 -0
  12. assets/demo/sun2/sun.png +0 -0
  13. assets/demo/video1/video1.png +0 -0
  14. assets/demo/yongen/yongen.jpeg +0 -0
  15. assets/figs/landmark_ref.png +0 -0
  16. assets/figs/musetalk_arc.jpg +0 -0
  17. configs/inference/test.yaml +14 -0
  18. data/audio/sun.wav +3 -0
  19. data/audio/yongen.wav +3 -0
  20. data/video/fake.mp4 +0 -0
  21. data/video/sun.mp4 +3 -0
  22. data/video/yongen.mp4 +3 -0
  23. entrypoint.sh +11 -0
  24. insta.sh +5 -0
  25. install_ffmpeg.sh +70 -0
  26. models/.gitattributes +35 -0
  27. models/.huggingface/.gitignore +1 -0
  28. models/.huggingface/download/.gitattributes.lock +0 -0
  29. models/.huggingface/download/.gitattributes.metadata +3 -0
  30. models/.huggingface/download/README.md.lock +0 -0
  31. models/.huggingface/download/README.md.metadata +3 -0
  32. models/.huggingface/download/musetalk/musetalk.json.lock +0 -0
  33. models/.huggingface/download/musetalk/musetalk.json.metadata +3 -0
  34. models/.huggingface/download/musetalk/pytorch_model.bin.lock +0 -0
  35. models/.huggingface/download/musetalk/pytorch_model.bin.metadata +3 -0
  36. models/README.md +259 -0
  37. models/dwpose/.gitattributes +35 -0
  38. models/dwpose/.huggingface/.gitignore +1 -0
  39. models/dwpose/.huggingface/download/.gitattributes.lock +0 -0
  40. models/dwpose/.huggingface/download/.gitattributes.metadata +3 -0
  41. models/dwpose/.huggingface/download/README.md.lock +0 -0
  42. models/dwpose/.huggingface/download/README.md.metadata +3 -0
  43. models/dwpose/.huggingface/download/dw-ll_ucoco.pth.lock +0 -0
  44. models/dwpose/.huggingface/download/dw-ll_ucoco.pth.metadata +3 -0
  45. models/dwpose/.huggingface/download/dw-ll_ucoco_384.onnx.lock +0 -0
  46. models/dwpose/.huggingface/download/dw-ll_ucoco_384.onnx.metadata +3 -0
  47. models/dwpose/.huggingface/download/dw-ll_ucoco_384.pth.lock +0 -0
  48. models/dwpose/.huggingface/download/dw-ll_ucoco_384.pth.metadata +3 -0
  49. models/dwpose/.huggingface/download/dw-mm_ucoco.pth.lock +0 -0
  50. models/dwpose/.huggingface/download/dw-mm_ucoco.pth.metadata +3 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/demo/man/man.png filter=lfs diff=lfs merge=lfs -text
+ data/audio/sun.wav filter=lfs diff=lfs merge=lfs -text
+ data/audio/yongen.wav filter=lfs diff=lfs merge=lfs -text
+ data/video/sun.mp4 filter=lfs diff=lfs merge=lfs -text
+ data/video/yongen.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/input/outputxxx_sun_yongen.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/output/outputxxx_sun_yongen_audio.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/sun_sun.mp4 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,89 @@
+ FROM anchorxia/musev:latest
+
+ # MAINTAINER info
+ LABEL MAINTAINER="zkangchen"
+ LABEL Email="zkangchen@tencent.com"
+ LABEL Description="musev gradio image, from docker pull anchorxia/musev:latest"
+
+ SHELL ["/bin/bash", "--login", "-c"]
+
+ # Set up a new user named "user" with user ID 1000
+ RUN useradd -m -u 1000 user
+
+ # Switch to the "user" user
+ USER user
+
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+
+ ################################################# INSTALLING FFMPEG ##################################################
+ # RUN apt-get update ; apt-get install -y git build-essential gcc make yasm autoconf automake cmake libtool checkinstall libmp3lame-dev pkg-config libunwind-dev zlib1g-dev libssl-dev
+
+ # RUN apt-get update \
+ #     && apt-get clean \
+ #     && apt-get install -y --no-install-recommends libc6-dev libgdiplus wget software-properties-common
+
+ # RUN apt-add-repository ppa:git-core/ppa && apt-get update && apt-get install -y git
+
+ # RUN wget https://www.ffmpeg.org/releases/ffmpeg-4.0.2.tar.gz
+ # RUN tar -xzf ffmpeg-4.0.2.tar.gz; rm -r ffmpeg-4.0.2.tar.gz
+ # RUN cd ./ffmpeg-4.0.2; ./configure --enable-gpl --enable-libmp3lame --enable-decoder=mjpeg,png --enable-encoder=png --enable-openssl --enable-nonfree
+ # RUN cd ./ffmpeg-4.0.2; make
+ # RUN cd ./ffmpeg-4.0.2; make install
+ ######################################################################################################################
+
+ RUN echo "docker start" \
+     && whoami \
+     && which python \
+     && pwd
+
+ RUN git clone -b main --recursive https://github.com/TMElyralab/MuseTalk.git
+
+ RUN chmod -R 777 /home/user/app/MuseTalk
+
+ RUN . /opt/conda/etc/profile.d/conda.sh \
+     && echo "source activate musev" >> ~/.bashrc \
+     && conda activate musev \
+     && conda env list
+ # && conda install ffmpeg
+
+ RUN ffmpeg -codecs
+
+ WORKDIR /home/user/app/MuseTalk/
+
+ RUN pip install -r requirements.txt \
+     && pip install --no-cache-dir -U openmim \
+     && mim install mmengine \
+     && mim install "mmcv>=2.0.1" \
+     && mim install "mmdet>=3.1.0" \
+     && mim install "mmpose>=1.1.0"
+
+ # Add entrypoint script
+ # RUN chmod 777 ./entrypoint.sh
+ RUN ls -l ./
+
+ EXPOSE 7860
+
+ # Note: only the last CMD in a Dockerfile takes effect, so the install_ffmpeg.sh
+ # CMD below was never run; invoke it from entrypoint.sh if the static ffmpeg build is needed.
+ # CMD ["/bin/bash", "-c", "python app.py"]
+ # CMD ["./install_ffmpeg.sh"]
+ CMD ["./entrypoint.sh"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 TMElyralab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: MuseTalkDemo
+ emoji: 🌍
+ colorFrom: gray
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ license: creativeml-openrail-m
+ app_file: app.py
+ app_port: 7860
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,426 @@
+ import os
+ import time
+ import pdb
+ import re
+
+ import gradio as gr
+ import spaces
+ import numpy as np
+ import sys
+ import subprocess
+
+ from huggingface_hub import snapshot_download
+ import requests
+
+ import argparse
+ from omegaconf import OmegaConf
+ import cv2
+ import torch
+ import glob
+ import pickle
+ from tqdm import tqdm
+ import copy
+ from argparse import Namespace
+ import shutil
+ import gdown
+ import imageio
+ import ffmpeg
+ from moviepy.editor import *
+
+ ProjectDir = os.path.abspath(os.path.dirname(__file__))
+ CheckpointsDir = os.path.join(ProjectDir, "models")
+
+ def print_directory_contents(path):
+     for child in os.listdir(path):
+         child_path = os.path.join(path, child)
+         if os.path.isdir(child_path):
+             print(child_path)
+
+ def download_model():
+     if not os.path.exists(CheckpointsDir):
+         os.makedirs(CheckpointsDir)
+         print("Checkpoints not found, downloading...")
+         tic = time.time()
+         # MuseTalk weights
+         snapshot_download(
+             repo_id="TMElyralab/MuseTalk",
+             local_dir=CheckpointsDir,
+             max_workers=8,
+             local_dir_use_symlinks=True,
+             force_download=True, resume_download=False
+         )
+         # VAE weights
+         os.makedirs(f"{CheckpointsDir}/sd-vae-ft-mse/")
+         snapshot_download(
+             repo_id="stabilityai/sd-vae-ft-mse",
+             local_dir=CheckpointsDir + '/sd-vae-ft-mse',
+             max_workers=8,
+             local_dir_use_symlinks=True,
+             force_download=True, resume_download=False
+         )
+         # dwpose
+         os.makedirs(f"{CheckpointsDir}/dwpose/")
+         snapshot_download(
+             repo_id="yzd-v/DWPose",
+             local_dir=CheckpointsDir + '/dwpose',
+             max_workers=8,
+             local_dir_use_symlinks=True,
+             force_download=True, resume_download=False
+         )
+         # whisper
+         url = "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt"
+         response = requests.get(url)
+         # make sure the request succeeded
+         if response.status_code == 200:
+             # path where the file will be saved
+             file_path = f"{CheckpointsDir}/whisper/tiny.pt"
+             os.makedirs(f"{CheckpointsDir}/whisper/")
+             # write the downloaded content to that path
+             with open(file_path, "wb") as f:
+                 f.write(response.content)
+         else:
+             print(f"Request failed with status code {response.status_code}")
+         # face parsing weights (Google Drive)
+         url = "https://drive.google.com/uc?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812"
+         os.makedirs(f"{CheckpointsDir}/face-parse-bisent/")
+         file_path = f"{CheckpointsDir}/face-parse-bisent/79999_iter.pth"
+         gdown.download(url, file_path, quiet=False)
+         # resnet backbone
+         url = "https://download.pytorch.org/models/resnet18-5c106cde.pth"
+         response = requests.get(url)
+         if response.status_code == 200:
+             file_path = f"{CheckpointsDir}/face-parse-bisent/resnet18-5c106cde.pth"
+             with open(file_path, "wb") as f:
+                 f.write(response.content)
+         else:
+             print(f"Request failed with status code {response.status_code}")
+
+         toc = time.time()
+         print(f"download cost {toc - tic} seconds")
+         print_directory_contents(CheckpointsDir)
+     else:
+         print("Model already downloaded.")
+
+ download_model()  # for huggingface deployment.
+
+ from musetalk.utils.utils import get_file_type, get_video_fps, datagen
+ from musetalk.utils.preprocessing import get_landmark_and_bbox, read_imgs, coord_placeholder, get_bbox_range
+ from musetalk.utils.blending import get_image
+ from musetalk.utils.utils import load_all_model
+
+ @spaces.GPU(duration=600)
+ @torch.no_grad()
+ def inference(audio_path, video_path, bbox_shift, progress=gr.Progress(track_tqdm=True)):
+     args_dict = {"result_dir": './results/output', "fps": 25, "batch_size": 8, "output_vid_name": '', "use_saved_coord": False}  # same as the inference script
+     args = Namespace(**args_dict)
+
+     input_basename = os.path.basename(video_path).split('.')[0]
+     audio_basename = os.path.basename(audio_path).split('.')[0]
+     output_basename = f"{input_basename}_{audio_basename}"
+     result_img_save_path = os.path.join(args.result_dir, output_basename)  # related to video & audio inputs
+     crop_coord_save_path = os.path.join(result_img_save_path, input_basename + ".pkl")  # only related to video input
+     os.makedirs(result_img_save_path, exist_ok=True)
+
+     if args.output_vid_name == "":
+         output_vid_name = os.path.join(args.result_dir, output_basename + ".mp4")
+     else:
+         output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
+
+     ########## extract frames from the source video ##########
+     if get_file_type(video_path) == "video":
+         save_dir_full = os.path.join(args.result_dir, input_basename)
+         os.makedirs(save_dir_full, exist_ok=True)
+         # read the video and dump every frame as a PNG
+         reader = imageio.get_reader(video_path)
+         for i, im in enumerate(reader):
+             imageio.imwrite(f"{save_dir_full}/{i:08d}.png", im)
+         input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
+         fps = get_video_fps(video_path)
+     else:  # input is a folder of images
+         input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
+         input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
+         fps = args.fps
+
+     ########## extract audio features ##########
+     whisper_feature = audio_processor.audio2feat(audio_path)
+     whisper_chunks = audio_processor.feature2chunks(feature_array=whisper_feature, fps=fps)
+
+     ########## preprocess input images ##########
+     if os.path.exists(crop_coord_save_path) and args.use_saved_coord:
+         print("using extracted coordinates")
+         with open(crop_coord_save_path, 'rb') as f:
+             coord_list = pickle.load(f)
+         frame_list = read_imgs(input_img_list)
+     else:
+         print("extracting landmarks... time-consuming")
+         coord_list, frame_list = get_landmark_and_bbox(input_img_list, bbox_shift)
+         with open(crop_coord_save_path, 'wb') as f:
+             pickle.dump(coord_list, f)
+     bbox_shift_text = get_bbox_range(input_img_list, bbox_shift)
+     input_latent_list = []
+     for bbox, frame in zip(coord_list, frame_list):
+         if bbox == coord_placeholder:
+             continue
+         x1, y1, x2, y2 = bbox
+         crop_frame = frame[y1:y2, x1:x2]
+         crop_frame = cv2.resize(crop_frame, (256, 256), interpolation=cv2.INTER_LANCZOS4)
+         latents = vae.get_latents_for_unet(crop_frame)
+         input_latent_list.append(latents)
+
+     # cycle the lists to smooth the first and the last frames
+     frame_list_cycle = frame_list + frame_list[::-1]
+     coord_list_cycle = coord_list + coord_list[::-1]
+     input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
+
+     ########## inference batch by batch ##########
+     print("start inference")
+     video_num = len(whisper_chunks)
+     batch_size = args.batch_size
+     gen = datagen(whisper_chunks, input_latent_list_cycle, batch_size)
+     res_frame_list = []
+     for i, (whisper_batch, latent_batch) in enumerate(tqdm(gen, total=int(np.ceil(float(video_num) / batch_size)))):
+         tensor_list = [torch.FloatTensor(arr) for arr in whisper_batch]
+         audio_feature_batch = torch.stack(tensor_list).to(unet.device)  # torch, B, 5*N, 384
+         audio_feature_batch = pe(audio_feature_batch)
+
+         pred_latents = unet.model(latent_batch, timesteps, encoder_hidden_states=audio_feature_batch).sample
+         recon = vae.decode_latents(pred_latents)
+         for res_frame in recon:
+             res_frame_list.append(res_frame)
+
+     ########## paste the generated face back into the full frame ##########
+     print("pad talking image to original video")
+     for i, res_frame in enumerate(tqdm(res_frame_list)):
+         bbox = coord_list_cycle[i % (len(coord_list_cycle))]
+         ori_frame = copy.deepcopy(frame_list_cycle[i % (len(frame_list_cycle))])
+         x1, y1, x2, y2 = bbox
+         try:
+             res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
+         except Exception:
+             continue
+
+         combine_frame = get_image(ori_frame, res_frame, bbox)
+         cv2.imwrite(f"{result_img_save_path}/{str(i).zfill(8)}.png", combine_frame)
+
+     # write the frames to a silent video, then mux in the audio
+     # (the ffmpeg CLI equivalents would be:
+     #  ffmpeg -y -v fatal -r {fps} -f image2 -i {result_img_save_path}/%08d.png -vcodec libx264 -vf format=rgb24,scale=out_color_matrix=bt709,format=yuv420p temp.mp4
+     #  ffmpeg -y -v fatal -i {audio_path} -i temp.mp4 {output_vid_name})
+     fps = 25
+     output_video = 'temp.mp4'
+
+     def is_valid_image(file):
+         pattern = re.compile(r'\d{8}\.png')
+         return pattern.match(file)
+
+     images = []
+     files = [file for file in os.listdir(result_img_save_path) if is_valid_image(file)]
+     files.sort(key=lambda x: int(x.split('.')[0]))
+     for file in files:
+         filename = os.path.join(result_img_save_path, file)
+         images.append(imageio.imread(filename))
+
+     imageio.mimwrite(output_video, images, 'FFMPEG', fps=fps, codec='libx264', pixelformat='yuv420p')
+
+     input_video = './temp.mp4'
+     # check that the intermediate video and the audio exist
+     if not os.path.exists(input_video):
+         raise FileNotFoundError(f"Input video file not found: {input_video}")
+     if not os.path.exists(audio_path):
+         raise FileNotFoundError(f"Audio file not found: {audio_path}")
+
+     print(len(images))
+
+     # load the silent video and the driving audio, attach the audio track, and write the result
+     video_clip = VideoFileClip(input_video)
+     audio_clip = AudioFileClip(audio_path)
+     video_clip = video_clip.set_audio(audio_clip)
+     video_clip.write_videofile(output_vid_name, codec='libx264', audio_codec='aac', fps=25)
+
+     os.remove("temp.mp4")
+     # shutil.rmtree(result_img_save_path)
+     print(f"result is saved to {output_vid_name}")
+     return output_vid_name, bbox_shift_text
+
+ # load model weights
+ audio_processor, vae, unet, pe = load_all_model()
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ timesteps = torch.tensor([0], device=device)
+
+ def check_video(video):
+     if not isinstance(video, str):
+         return video  # in case of None
+     # define the output video file name
+     dir_path, file_name = os.path.split(video)
+     if file_name.startswith("outputxxx_"):
+         return video
+     # add the output prefix to the file name
+     output_file_name = "outputxxx_" + file_name
+
+     os.makedirs('./results', exist_ok=True)
+     os.makedirs('./results/output', exist_ok=True)
+     os.makedirs('./results/input', exist_ok=True)
+
+     # combine the directory path and the new file name
+     output_video = os.path.join('./results/input', output_file_name)
+
+     # read the original video and its frame rate
+     reader = imageio.get_reader(video)
+     fps = reader.get_meta_data()['fps']
+
+     # resample to 25 fps by picking, for each target timestamp, the first
+     # original frame whose timestamp is not earlier than it
+     frames = [im for im in reader]
+     target_fps = 25
+
+     L = len(frames)
+     L_target = int(L / fps * target_fps)
+     original_t = [x / fps for x in range(1, L + 1)]
+     t_idx = 0
+     target_frames = []
+     for target_t in range(1, L_target + 1):
+         while target_t / target_fps > original_t[t_idx]:
+             t_idx += 1
+             if t_idx >= L:
+                 break
+         target_frames.append(frames[min(t_idx, L - 1)])  # clamp to avoid an off-by-one on the last frame
+
+     # save the resampled video
+     imageio.mimwrite(output_video, target_frames, 'FFMPEG', fps=25, codec='libx264', quality=9, pixelformat='yuv420p')
+     return output_video
+
+ css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(
+         "<div align='center'> <h1>MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting</h1> \
+         <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
+         </br>\
+         Yue Zhang <sup>\*</sup>,\
+         Minhao Liu<sup>\*</sup>,\
+         Zhaokang Chen,\
+         Bin Wu<sup>†</sup>,\
+         Yingjie He,\
+         Chao Zhan,\
+         Wenjiang Zhou\
+         (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)\
+         Lyra Lab, Tencent Music Entertainment\
+         </h2> \
+         <a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseTalk'>[Github Repo]</a>\
+         <a style='font-size:18px;color: #000000' href='https://huggingface.co/TMElyralab/MuseTalk'>[Huggingface]</a>\
+         <a style='font-size:18px;color: #000000' href=''> [Technical report (coming soon)] </a>\
+         <a style='font-size:18px;color: #000000' href=''> [Project Page (coming soon)] </a> </div>"
+     )
+
+     with gr.Row():
+         with gr.Column():
+             audio = gr.Audio(label="Driven Audio", type="filepath")
+             video = gr.Video(label="Reference Video", sources=['upload'])
+             bbox_shift = gr.Number(label="BBox_shift value, px", value=0)
+             bbox_shift_scale = gr.Textbox(label="Recommended bbox_shift lower bound. The corresponding bbox range is printed after the first run; if the result is not good, adjust bbox_shift within this range.", value="", interactive=False)
+
+             btn = gr.Button("Generate")
+         out1 = gr.Video()
+
+     video.change(
+         fn=check_video, inputs=[video], outputs=[video]
+     )
+     btn.click(
+         fn=inference,
+         inputs=[
+             audio,
+             video,
+             bbox_shift,
+         ],
+         outputs=[out1, bbox_shift_scale]
+     )
+
+ # Set the IP and port
+ ip_address = "0.0.0.0"  # replace with your desired IP address
+ port_number = 7860  # replace with your desired port number
+
+ demo.queue().launch(
+     # share=False, debug=True, server_name=ip_address, server_port=port_number
+ )
assets/BBOX_SHIFT.md ADDED
@@ -0,0 +1,26 @@
+ ## Why is there a "bbox_shift" parameter?
+ When processing training data, we combine the face detection results (bbox) with facial landmarks to determine the region of the head segmentation box. Specifically, we use the upper bound of the bbox as the upper boundary of the segmentation box, the maximum y value of the facial landmark coordinates as the lower boundary, and the minimum and maximum x values of the landmark coordinates as the left and right boundaries. Processing the dataset this way ensures the integrity of the face.
+
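To make the boundary construction above concrete, here is a minimal sketch (not part of the repository; it assumes the detector returns a pixel-coordinate `(x1, y1, x2, y2)` bbox and the landmarks come as an `(N, 2)` array, both hypothetical formats):

```python
import numpy as np

def head_crop_box(bbox, landmarks):
    # bbox: (x1, y1, x2, y2) from the face detector, in pixels (hypothetical format).
    # landmarks: (N, 2) array of facial landmark (x, y) pixel coordinates.
    x1, y1, x2, y2 = bbox
    top = y1                        # upper bound of the detection bbox
    bottom = landmarks[:, 1].max()  # lowest landmark (max y)
    left = landmarks[:, 0].min()    # leftmost landmark (min x)
    right = landmarks[:, 0].max()   # rightmost landmark (max x)
    return int(left), int(top), int(right), int(bottom)

# usage: head_crop_box((120, 80, 320, 330), np.array([[130, 100], [310, 305], [220, 340]]))
```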
+ However, we have observed that the masked ratio on the face varies across images due to subjects' varying face shapes. Furthermore, we found that the upper bound of the mask mainly lies close to landmark points 28, 29, and 30 (as shown in Fig. 1), which correspond to proportions of 15%, 63%, and 22% of the dataset, respectively.
+
+ During inference, we discovered that as the upper bound of the mask gets closer to the mouth (near landmark 30), the audio features contribute more to lip movements. Conversely, as the upper bound moves away from the mouth (near landmark 28), the audio features contribute more to the details of facial appearance. Hence, we expose this characteristic as a parameter that adjusts how much the audio features contribute to generating lip movements, which users can tune to their specific needs in practical scenarios.
+
+ ![landmark](figs/landmark_ref.png)
+
+ Fig.1. Facial landmarks
+ ### Step 0.
+ Run with the default configuration to obtain the adjustable value range.
+ ```
+ python -m scripts.inference --inference_config configs/inference/test.yaml
+ ```
+ ```
+ ********************************************bbox_shift parameter adjustment**********************************************************
+ Total frame:「838」 Manually adjust range : [ -9~9 ] , the current value: 0
+ *************************************************************************************************************************************
+ ```
+ ### Step 1.
+ Re-run the script with a value within the above range.
+ ```
+ python -m scripts.inference --inference_config configs/inference/test.yaml --bbox_shift xx  # where xx is in [-9, 9].
+ ```
+ In our experiments, we found that positive values (moving towards the lower half of the face) generally increase mouth openness, while negative values (moving towards the upper half) generally decrease it. Note that this is not an absolute rule; users may need to adjust the parameter according to their specific needs and the desired effect.
assets/demo/man/man.png ADDED
Git LFS Details
  • SHA256: c3d01bb4244e5f8b98d2359e6b19e9973193a15b49dff3dc0d5a9edb8666b2e7
  • Pointer size: 132 Bytes
  • Size of remote file: 3.09 MB
assets/demo/monalisa/monalisa.png ADDED
assets/demo/musk/musk.png ADDED
assets/demo/sit/sit.jpeg ADDED
assets/demo/sun1/sun.png ADDED
assets/demo/sun2/sun.png ADDED
assets/demo/video1/video1.png ADDED
assets/demo/yongen/yongen.jpeg ADDED
assets/figs/landmark_ref.png ADDED
assets/figs/musetalk_arc.jpg ADDED
configs/inference/test.yaml ADDED
@@ -0,0 +1,14 @@
+ task_0:
+   video_path: "data/video/fake.mp4"
+   audio_path: "data/audio/yongen.wav"
+   bbox_shift: -7
+
+ # task_1:
+ #   video_path: "data/video/yongen.mp4"
+ #   audio_path: "data/audio/yongen.wav"
data/audio/sun.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f163b0fe2f278504c15cab74cd37b879652749e2a8a69f7848ad32c847d8007
+ size 1983572
data/audio/yongen.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b775c363c968428d1d6df4456495e4c11f00e3204d3082e51caff415ec0e2ba
+ size 1536078
data/video/fake.mp4 ADDED
Binary file (351 kB)
data/video/sun.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f240982090f4255a7589e3cd67b4219be7820f9eb9a7461fc915eb5f0c8e075
+ size 2217973
data/video/yongen.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1effa976d410571cd185554779d6d43a6ba636e0e3401385db1d607daa46441f
+ size 1870923
entrypoint.sh ADDED
@@ -0,0 +1,11 @@
+ #!/bin/bash
+
+ echo "entrypoint.sh"
+ whoami
+ which python
+ echo "PYTHONPATH: $PYTHONPATH"
+
+ source /opt/conda/etc/profile.d/conda.sh
+ conda activate musev
+ which python
+ python app.py
insta.sh ADDED
@@ -0,0 +1,5 @@
+ pip install --no-cache-dir -U openmim
+ mim install mmengine
+ mim install "mmcv>=2.0.1"
+ mim install "mmdet>=3.1.0"
+ mim install "mmpose>=1.1.0"
install_ffmpeg.sh ADDED
@@ -0,0 +1,70 @@
+ #!/bin/bash
+
+ FFMPEG_PREFIX="$HOME/local"
+ FFMPEG_SOURCES="$HOME/ffmpeg_sources"
+ FFMPEG_BINDIR="$FFMPEG_PREFIX/bin"
+ PATH=$FFMPEG_BINDIR:$PATH
+
+ mkdir -p $FFMPEG_PREFIX
+ mkdir -p $FFMPEG_SOURCES
+
+ # yasm (assembler required by x264 and ffmpeg)
+ cd $FFMPEG_SOURCES
+ wget http://www.tortall.net/projects/yasm/releases/yasm-1.2.0.tar.gz
+ tar xzvf yasm-1.2.0.tar.gz
+ cd yasm-1.2.0
+ ./configure --prefix="$FFMPEG_PREFIX" --bindir="$FFMPEG_BINDIR"
+ make
+ make install
+ make distclean
+
+ # x264 (H.264 encoder)
+ cd $FFMPEG_SOURCES
+ wget http://download.videolan.org/pub/x264/snapshots/last_x264.tar.bz2
+ tar xjvf last_x264.tar.bz2
+ cd x264-snapshot*
+ ./configure --prefix="$FFMPEG_PREFIX" --bindir="$FFMPEG_BINDIR" --enable-static
+ make
+ make install
+ make distclean
+
+ # fdk-aac (AAC encoder)
+ cd $FFMPEG_SOURCES
+ wget -O fdk-aac.tar.gz https://github.com/mstorsjo/fdk-aac/tarball/master
+ tar xzvf fdk-aac.tar.gz
+ cd mstorsjo-fdk-aac*
+ autoreconf -fiv
+ ./configure --prefix="$FFMPEG_PREFIX" --disable-shared
+ make
+ make install
+ make distclean
+
+ # libvpx (VP8/VP9 encoder)
+ cd $FFMPEG_SOURCES
+ wget http://webm.googlecode.com/files/libvpx-v1.3.0.tar.bz2
+ tar xjvf libvpx-v1.3.0.tar.bz2
+ cd libvpx-v1.3.0
+ ./configure --prefix="$FFMPEG_PREFIX" --disable-examples
+ make
+ make install
+ make clean
+
+ # ffmpeg itself
+ cd $FFMPEG_SOURCES
+ wget https://github.com/FFmpeg/FFmpeg/tarball/master -O ffmpeg.tar.gz
+ rm -rf FFmpeg-FFmpeg*
+ tar -zxvf ffmpeg.tar.gz
+ cd FFmpeg-FFmpeg*
+ PKG_CONFIG_PATH="$FFMPEG_PREFIX/lib/pkgconfig"
+ export PKG_CONFIG_PATH
+ ./configure --prefix="$FFMPEG_PREFIX" --extra-cflags="-I$FFMPEG_PREFIX/include" \
+     --extra-ldflags="-L$FFMPEG_PREFIX/lib" --bindir="$FFMPEG_BINDIR" --extra-libs="-ldl" --enable-gpl \
+     --enable-libass --enable-libfdk-aac --enable-libmp3lame --enable-libtheora \
+     --enable-libvorbis --enable-libvpx --enable-libx264 --enable-nonfree \
+     --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-version3 --enable-libvo-amrwbenc
+ make
+ make install
+ make distclean
+ hash -r
models/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/.huggingface/.gitignore ADDED
@@ -0,0 +1 @@
+ *
models/.huggingface/download/.gitattributes.lock ADDED
File without changes
models/.huggingface/download/.gitattributes.metadata ADDED
@@ -0,0 +1,3 @@
+ 5e6f29eeef8d88c1ae8389316f122bcc36a430fe
+ a6344aac8c09253b3b630fb776ae94478aa0275b
+ 1715152071.9456775
models/.huggingface/download/README.md.lock ADDED
File without changes
models/.huggingface/download/README.md.metadata ADDED
@@ -0,0 +1,3 @@
+ 5e6f29eeef8d88c1ae8389316f122bcc36a430fe
+ 74e11fb4b681253f7fe73d9c4b80ec0021949213
+ 1715152072.478483
models/.huggingface/download/musetalk/musetalk.json.lock ADDED
File without changes
models/.huggingface/download/musetalk/musetalk.json.metadata ADDED
@@ -0,0 +1,3 @@
+ 5e6f29eeef8d88c1ae8389316f122bcc36a430fe
+ b822db87e503a283fbbee73617f89dcd294cb91c
+ 1715152071.9655983
models/.huggingface/download/musetalk/pytorch_model.bin.lock ADDED
File without changes
models/.huggingface/download/musetalk/pytorch_model.bin.metadata ADDED
@@ -0,0 +1,3 @@
+ 5e6f29eeef8d88c1ae8389316f122bcc36a430fe
+ 0ee7d5ea03ea75d8dca50ea7a76df791e90633687a135c4a69393abfc0475ffe
+ 1715152253.217898
models/README.md ADDED
@@ -0,0 +1,259 @@
+ ---
+ license: creativeml-openrail-m
+ language:
+ - en
+ ---
+ # MuseTalk
+
+ MuseTalk: Real-Time High Quality Lip Synchronization with Latent Space Inpainting
+ </br>
+ Yue Zhang <sup>\*</sup>,
+ Minhao Liu<sup>\*</sup>,
+ Zhaokang Chen,
+ Bin Wu<sup>†</sup>,
+ Yingjie He,
+ Chao Zhan,
+ Wenjiang Zhou
+ (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)
+
+ **[github](https://github.com/TMElyralab/MuseTalk)** **[huggingface](https://huggingface.co/TMElyralab/MuseTalk)** **Project (coming soon)** **Technical report (coming soon)**
+
+ We introduce `MuseTalk`, a **real-time high quality** lip-syncing model (30fps+ on an NVIDIA Tesla V100). MuseTalk can be applied to input videos, e.g., generated by [MuseV](https://github.com/TMElyralab/MuseV), as a complete virtual-human solution.
+
+ # Overview
+ `MuseTalk` is a real-time high quality audio-driven lip-syncing model trained in the latent space of `ft-mse-vae`, which
+
+ 1. modifies an unseen face according to the input audio, with a face region size of `256 x 256`.
+ 1. supports audio in various languages, such as Chinese, English, and Japanese.
+ 1. supports real-time inference at 30fps+ on an NVIDIA Tesla V100.
+ 1. supports adjusting the center point of the proposed face region, which **SIGNIFICANTLY** affects generation results.
+ 1. provides a checkpoint trained on the HDTF dataset.
+ 1. training code (coming soon).
+
+ # News
+ - [04/02/2024] Released MuseTalk project and pretrained models.
+
+ ## Model
+ ![Model Structure](assets/figs/musetalk_arc.jpg)
+ MuseTalk was trained in latent space, where the images were encoded by a frozen VAE and the audio by a frozen `whisper-tiny` model. The architecture of the generation network was borrowed from the UNet of `stable-diffusion-v1-4`, with the audio embeddings fused into the image embeddings by cross-attention.
+
40
+ ## Cases
41
+ ### MuseV + MuseTalk make human photos alive!
42
+ <table class="center">
43
+ <tr style="font-weight: bolder;text-align:center;">
44
+ <td width="33%">Image</td>
45
+ <td width="33%">MuseV</td>
46
+ <td width="33%">+MuseTalk</td>
47
+ </tr>
48
+ <tr>
49
+ <td>
50
+ <img src=assets/demo/musk/musk.png width="95%">
51
+ </td>
52
+ <td >
53
+ <video src=assets/demo/yongen/yongen_musev.mp4 controls preload></video>
54
+ </td>
55
+ <td >
56
+ <video src=assets/demo/yongen/yongen_musetalk.mp4 controls preload></video>
57
+ </td>
58
+ </tr>
59
+ <tr>
60
+ <td>
61
+ <img src=assets/demo/yongen/yongen.jpeg width="95%">
62
+ </td>
63
+ <td >
64
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/57ef9dee-a9fd-4dc8-839b-3fbbbf0ff3f4 controls preload></video>
65
+ </td>
66
+ <td >
67
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/94d8dcba-1bcd-4b54-9d1d-8b6fc53228f0 controls preload></video>
68
+ </td>
69
+ </tr>
70
+ <tr>
71
+ <td>
72
+ <img src=assets/demo/monalisa/monalisa.png width="95%">
73
+ </td>
74
+ <td >
75
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/1568f604-a34f-4526-a13a-7d282aa2e773 controls preload></video>
76
+ </td>
77
+ <td >
78
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/a40784fc-a885-4c1f-9b7e-8f87b7caf4e0 controls preload></video>
79
+ </td>
80
+ </tr>
81
+ <tr>
82
+ <td>
83
+ <img src=assets/demo/sun1/sun.png width="95%">
84
+ </td>
85
+ <td >
86
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
87
+ </td>
88
+ <td >
89
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/172f4ff1-d432-45bd-a5a7-a07dec33a26b controls preload></video>
90
+ </td>
91
+ </tr>
92
+ <tr>
93
+ <td>
94
+ <img src=assets/demo/sun2/sun.png width="95%">
95
+ </td>
96
+ <td >
97
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/37a3a666-7b90-4244-8d3a-058cb0e44107 controls preload></video>
98
+ </td>
99
+ <td >
100
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/85a6873d-a028-4cce-af2b-6c59a1f2971d controls preload></video>
101
+ </td>
102
+ </tr>
103
+ </table >
104
+
105
+ * The character of the last two rows, `Xinying Sun`, is a supermodel KOL. You can follow her on [douyin](https://www.douyin.com/user/MS4wLjABAAAAWDThbMPN_6Xmm_JgXexbOii1K-httbu2APdG8DvDyM8).
106
+
107
+ ## Video dubbing
108
+ <table class="center">
109
+ <tr style="font-weight: bolder;text-align:center;">
110
+ <td width="70%">MuseTalk</td>
111
+ <td width="30%">Original videos</td>
112
+ </tr>
113
+ <tr>
114
+ <td>
115
+ <video src=https://github.com/TMElyralab/MuseTalk/assets/163980830/4d7c5fa1-3550-4d52-8ed2-52f158150f24 controls preload></video>
116
+ </td>
117
+ <td>
118
+ <a href="//www.bilibili.com/video/BV1wT411b7HU">Link</a>
119
+ <href src=""></href>
120
+ </td>
121
+ </tr>
122
+ </table>
123
+
124
+ * For video dubbing, we applied a self-developed tool which can detect the talking person.
125
+
126
+
127
+ # TODO:
128
+ - [x] trained models and inference codes.
129
+ - [ ] technical report.
130
+ - [ ] training codes.
131
+ - [ ] online UI.
132
+ - [ ] a better model (may take longer).
133
+
134
+
135
+ # Getting Started
136
+ We provide a detailed tutorial about the installation and the basic usage of MuseTalk for new users:
137
+ ## Installation
138
+ To prepare the Python environment and install additional packages such as opencv, diffusers, mmcv, etc., please follow the steps below:
139
+ ### Build environment
140
+
141
+ We recommend a python version >=3.10 and cuda version =11.7. Then build environment as follows:
142
+
143
+ ```shell
144
+ pip install -r requirements.txt
145
+ ```
146
+ ### whisper
147
+ install whisper to extract audio feature (only encoder)
148
+ ```
149
+ pip install --editable ./musetalk/whisper
150
+ ```
151
+
152
+ ### mmlab packages
153
+ ```bash
154
+ pip install --no-cache-dir -U openmim
155
+ mim install mmengine
156
+ mim install "mmcv>=2.0.1"
157
+ mim install "mmdet>=3.1.0"
158
+ mim install "mmpose>=1.1.0"
159
+ ```
160
+
161
+ ### Download ffmpeg-static
162
+ Download the ffmpeg-static and
163
+ ```
164
+ export FFMPEG_PATH=/path/to/ffmpeg
165
+ ```
166
+ for example:
167
+ ```
168
+ export FFMPEG_PATH=/musetalk/ffmpeg-4.4-amd64-static
169
+ ```
170
+ ### Download weights
171
+ You can download weights manually as follows:
172
+
173
+ 1. Download our trained [weights](https://huggingface.co/TMElyralab/MuseTalk).
174
+
175
+ 2. Download the weights of other components:
176
+ - [sd-vae-ft-mse](https://huggingface.co/stabilityai/sd-vae-ft-mse)
177
+ - [whisper](https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt)
178
+ - [dwpose](https://huggingface.co/yzd-v/DWPose/tree/main)
179
+ - [face-parse-bisent](https://github.com/zllrunning/face-parsing.PyTorch)
180
+ - [resnet18](https://download.pytorch.org/models/resnet18-5c106cde.pth)
181
+
182
+
183
+ Finally, these weights should be organized in `models` as follows:
184
+ ```
185
+ ./models/
186
+ ├── musetalk
187
+ │ └── musetalk.json
188
+ │ └── pytorch_model.bin
189
+ ├── dwpose
190
+ │ └── dw-ll_ucoco_384.pth
191
+ ├── face-parse-bisent
192
+ │ ├── 79999_iter.pth
193
+ │ └── resnet18-5c106cde.pth
194
+ ├── sd-vae-ft-mse
195
+ │ ├── config.json
196
+ │ └── diffusion_pytorch_model.bin
197
+ └── whisper
198
+ └── tiny.pt
199
+ ```
200
+ ## Quickstart
201
+
202
+ ### Inference
203
+ Here, we provide the inference script.
204
+ ```
205
+ python -m scripts.inference --inference_config configs/inference/test.yaml
206
+ ```
207
+ configs/inference/test.yaml is the path to the inference configuration file, including video_path and audio_path.
208
+ The video_path should be either a video file or a directory of images.
209
+
210
+ #### Use of bbox_shift to have adjustable results
211
+ :mag_right: We have found that upper-bound of the mask has an important impact on mouth openness. Thus, to control the mask region, we suggest using the `bbox_shift` parameter. Positive values (moving towards the lower half) increase mouth openness, while negative values (moving towards the upper half) decrease mouth openness.
212
+
213
+ You can start by running with the default configuration to obtain the adjustable value range, and then re-run the script within this range.
214
+
215
+ For example, in the case of `Xinying Sun`, after running the default configuration, it shows that the adjustable value rage is [-9, 9]. Then, to decrease the mouth openness, we set the value to be `-7`.
216
+ ```
217
+ python -m scripts.inference --inference_config configs/inference/test.yaml --bbox_shift -7
218
+ ```
219
+ :pushpin: More technical details can be found in [bbox_shift](assets/BBOX_SHIFT.md).
220
+
221
+ #### Combining MuseV and MuseTalk
222
+
223
+ As a complete solution to virtual human generation, you are suggested to first apply [MuseV](https://github.com/TMElyralab/MuseV) to generate a video (text-to-video, image-to-video or pose-to-video) by referring [this](https://github.com/TMElyralab/MuseV?tab=readme-ov-file#text2video). Then, you can use `MuseTalk` to generate a lip-sync video by referring [this](https://github.com/TMElyralab/MuseTalk?tab=readme-ov-file#inference).
224
+
225
+ # Note
226
+
227
+ If you want to launch online video chats, you are suggested to generate videos using MuseV and apply necessary pre-processing such as face detection in advance. During online chatting, only UNet and the VAE decoder are involved, which makes MuseTalk real-time.
228
+
229
+
230
+ # Acknowledgement
231
+ 1. We thank open-source components like [whisper](https://github.com/isaacOnline/whisper/tree/extract-embeddings), [dwpose](https://github.com/IDEA-Research/DWPose), [face-alignment](https://github.com/1adrianb/face-alignment), [face-parsing](https://github.com/zllrunning/face-parsing.PyTorch), [S3FD](https://github.com/yxlijun/S3FD.pytorch).
232
+ 1. MuseTalk has referred much to [diffusers](https://github.com/huggingface/diffusers).
233
+ 1. MuseTalk has been built on `HDTF` datasets.
234
+
235
+ Thanks for open-sourcing!
236
+
237
+ # Limitations
238
+ - Resolution: Though MuseTalk uses a face region size of 256 x 256, which make it better than other open-source methods, it has not yet reached the theoretical resolution bound. We will continue to deal with this problem.
239
+ If you need higher resolution, you could apply super resolution models such as [GFPGAN](https://github.com/TencentARC/GFPGAN) in combination with MuseTalk.
240
+
241
+ - Identity preservation: Some details of the original face are not well preserved, such as mustache, lip shape and color.
242
+
243
+ - Jitter: There exists some jitter as the current pipeline adopts single-frame generation.
244
+
245
+ # Citation
246
+ ```bib
247
+ @article{musetalk,
248
+ title={MuseTalk: Real-Time High Quality Lip Synchorization with Latent Space Inpainting},
249
+ author={Zhang, Yue and Liu, Minhao and Chen, Zhaokang and Wu, Bin and He, Yingjie and Zhan, Chao and Zhou, Wenjiang},
250
+ journal={arxiv},
251
+ year={2024}
252
+ }
253
+ ```
254
+ # Disclaimer/License
255
+ 1. `code`: The code of MuseTalk is released under the MIT License. There is no limitation for both academic and commercial usage.
256
+ 1. `model`: The trained model are available for any purpose, even commercially.
257
+ 1. `other opensource model`: Other open-source models used must comply with their license, such as `whisper`, `ft-mse-vae`, `dwpose`, `S3FD`, etc..
258
+ 1. The testdata are collected from internet, which are available for non-commercial research purposes only.
259
+ 1. `AIGC`: This project strives to impact the domain of AI-driven video generation positively. Users are granted the freedom to create videos using this tool, but they are expected to comply with local laws and utilize it responsibly. The developers do not assume any responsibility for potential misuse by users.
models/dwpose/.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/dwpose/.huggingface/.gitignore ADDED
@@ -0,0 +1 @@
+ *
models/dwpose/.huggingface/download/.gitattributes.lock ADDED
File without changes
models/dwpose/.huggingface/download/.gitattributes.metadata ADDED
@@ -0,0 +1,3 @@
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
+ a6344aac8c09253b3b630fb776ae94478aa0275b
+ 1715152288.0137117
models/dwpose/.huggingface/download/README.md.lock ADDED
File without changes
models/dwpose/.huggingface/download/README.md.metadata ADDED
@@ -0,0 +1,3 @@
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
+ 154df8298fab5ecf322016157858e08cd1bccbe1
+ 1715152288.399598
models/dwpose/.huggingface/download/dw-ll_ucoco.pth.lock ADDED
File without changes
models/dwpose/.huggingface/download/dw-ll_ucoco.pth.metadata ADDED
@@ -0,0 +1,3 @@
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
+ e9600664e7927229ed594197d552023e3be213f810beb38847a959ec8261e0f7
+ 1715152341.4611125
models/dwpose/.huggingface/download/dw-ll_ucoco_384.onnx.lock ADDED
File without changes
models/dwpose/.huggingface/download/dw-ll_ucoco_384.onnx.metadata ADDED
@@ -0,0 +1,3 @@
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
+ 724f4ff2439ed61afb86fb8a1951ec39c6220682803b4a8bd4f598cd913b1843
+ 1715152314.1359112
models/dwpose/.huggingface/download/dw-ll_ucoco_384.pth.lock ADDED
File without changes
models/dwpose/.huggingface/download/dw-ll_ucoco_384.pth.metadata ADDED
@@ -0,0 +1,3 @@
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
+ 0d9408b13cd863c4e95a149dd31232f88f2a12aa6cf8964ed74d7d97748c7a07
+ 1715152341.9774292
models/dwpose/.huggingface/download/dw-mm_ucoco.pth.lock ADDED
File without changes
models/dwpose/.huggingface/download/dw-mm_ucoco.pth.metadata ADDED
@@ -0,0 +1,3 @@
+ 1a7144101628d69ee7a3768d1ee3a094070dc388
+ b24f27f57d18d8bb7abc3af8e09bcc5f77ee9ecae13439f70a8f7d1b885413cf
+ 1715152324.0017805