Spaces: Running on Zero
zejunyang committed
Commit e24f684
1 Parent(s): d1af78b
debug
app.py CHANGED
@@ -118,6 +118,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     while os.path.exists(save_dir):
         save_dir = Path(f"a2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
     save_dir.mkdir(exist_ok=True, parents=True)
+
+    print('=====1======')
 
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
     ref_image_np = crop_face(ref_image_np, lmk_extractor)
@@ -127,16 +129,22 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
 
+    print('=====2======')
+
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
         return None, ref_image_pil
 
+    print('=====3======')
+
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
 
     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
     sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
+
+    print('=====4======')
 
     # inference
     pred = a2m_model.infer(sample['audio_feature'], sample['seq_len'])
@@ -144,6 +152,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pred = pred.reshape(pred.shape[0], -1, 3)
     pred = pred + face_result['lmks3d']
 
+    print('=====5======')
+
     if headpose_video is not None:
         pose_seq = get_headpose_temp(headpose_video)
     else:
@@ -158,6 +168,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     for i, verts in enumerate(projected_vertices):
         lmk_img = vis.draw_landmarks((width, height), verts, normed=False)
         pose_images.append(lmk_img)
+
+    print('=====6======')
 
     pose_list = []
     # pose_tensor_list = []
@@ -176,6 +188,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pose_list = np.array(pose_list)
 
     video_length = len(pose_list)
+
+    print('=====7======')
 
     video = pipe(
         ref_image_pil,
@@ -383,7 +397,7 @@ with gr.Blocks() as demo:
         a2v_headpose_video = gr.Video(label="Option: upload head pose reference video", sources="upload")
 
         with gr.Row():
-            a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=
+            a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
             a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
         with gr.Row():
@@ -411,7 +425,7 @@ with gr.Blocks() as demo:
        v2v_source_video = gr.Video(label="Upload source video", sources="upload")
 
        with gr.Row():
-            v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=
+            v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
            v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
        with gr.Row():
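Taken together, the function-body hunks bracket each stage of audio2video (output directory setup, reference face crop and resize, landmark extraction, audio feature preparation, audio-to-mesh inference, pose image rendering, and the final pipe call) with numbered print checkpoints, so the last '=====N======' marker that appears in the Space logs points at the stage that stalled or crashed. Below is a minimal sketch of the same checkpoint idea routed through Python's standard logging module instead of bare prints; the helper name and the stage labels are illustrative assumptions, only the numbered markers come from this commit.

import logging

# Sketch of the '=====N======' checkpoints from this commit, routed through
# logging so each marker is timestamped in the Space logs. Stage names are
# illustrative assumptions, not taken from app.py.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
log = logging.getLogger("audio2video")

def checkpoint(n: int, stage: str) -> None:
    # The last marker that shows up in the logs localizes a hang or crash.
    log.info("=====%d====== %s", n, stage)

checkpoint(1, "output directory created")
checkpoint(2, "reference image cropped and resized")
checkpoint(3, "face landmarks extracted")
checkpoint(4, "audio features prepared")
checkpoint(5, "3D landmark motion predicted")
checkpoint(6, "pose images rendered")
checkpoint(7, "starting diffusion pipeline")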
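The last two hunks also pin the Video size sliders in both tabs to a default of 384 (the previous default is cut off in this view). A small, self-contained Gradio sketch of those controls as they look after the commit; the Slider arguments mirror the diff, while the enclosing Blocks/Row layout and the launch call are assumptions made to keep the example runnable.

import gradio as gr

# Sketch of the post-commit size/steps controls; the slider arguments mirror
# the diff, the surrounding layout is assumed for a runnable example.
with gr.Blocks() as demo:
    with gr.Row():
        a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384,
                                    label="Video size (-W & -H)")
        a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15,
                                    label="Steps (--steps)")

if __name__ == "__main__":
    demo.launch()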