zejunyang committed
Commit e24f684
1 Parent(s): d1af78b
Files changed (1):
  1. app.py +16 -2
app.py CHANGED
@@ -118,6 +118,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     while os.path.exists(save_dir):
         save_dir = Path(f"a2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
     save_dir.mkdir(exist_ok=True, parents=True)
+
+    print('=====1======')
 
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
     ref_image_np = crop_face(ref_image_np, lmk_extractor)
@@ -127,16 +129,22 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
 
+    print('=====2======')
+
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
         return None, ref_image_pil
 
+    print('=====3======')
+
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
 
     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
     sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
+
+    print('=====4======')
 
     # inference
     pred = a2m_model.infer(sample['audio_feature'], sample['seq_len'])
@@ -144,6 +152,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pred = pred.reshape(pred.shape[0], -1, 3)
     pred = pred + face_result['lmks3d']
 
+    print('=====5======')
+
     if headpose_video is not None:
         pose_seq = get_headpose_temp(headpose_video)
     else:
@@ -158,6 +168,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     for i, verts in enumerate(projected_vertices):
         lmk_img = vis.draw_landmarks((width, height), verts, normed=False)
         pose_images.append(lmk_img)
+
+    print('=====6======')
 
     pose_list = []
     # pose_tensor_list = []
@@ -176,6 +188,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pose_list = np.array(pose_list)
 
     video_length = len(pose_list)
+
+    print('=====7======')
 
     video = pipe(
         ref_image_pil,
@@ -383,7 +397,7 @@ with gr.Blocks() as demo:
                a2v_headpose_video = gr.Video(label="Option: upload head pose reference video", sources="upload")
 
                with gr.Row():
-                    a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=512, label="Video size (-W & -H)")
+                    a2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
                    a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
                with gr.Row():
@@ -411,7 +425,7 @@ with gr.Blocks() as demo:
                v2v_source_video = gr.Video(label="Upload source video", sources="upload")
 
                with gr.Row():
-                    v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=512, label="Video size (-W & -H)")
+                    v2v_size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384, label="Video size (-W & -H)")
                    v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
                with gr.Row():
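The seven numbered `print('=====N======')` lines are progress markers dropped between successive stages of `audio2video` (output directory setup, reference-image preparation, landmark extraction, audio feature extraction, audio-to-mesh inference, pose rendering, and the diffusion call), presumably to trace how far the Space gets in its logs. Below is a minimal sketch of how such markers could be expressed with Python's standard `logging` module instead; the `checkpoint` helper, its tag strings, and the timing output are illustrative assumptions, not code from this commit.

```python
import logging
import time

# Sketch only (not part of this commit): a tagged checkpoint helper that could
# stand in for the numbered print('=====N======') markers, adding timestamps
# and elapsed time between stages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(message)s")
logger = logging.getLogger("a2v")

_last_time = time.perf_counter()

def checkpoint(tag: str) -> None:
    """Log a named checkpoint and the time elapsed since the previous one."""
    global _last_time
    now = time.perf_counter()
    logger.info("checkpoint %s (+%.2fs)", tag, now - _last_time)
    _last_time = now

# For example, at the points where the prints were added:
#   checkpoint("ref image prepared")    # instead of print('=====2======')
#   checkpoint("audio features ready")  # instead of print('=====4======')
```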
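The other change lowers the default of both size sliders from 512 to 384 while keeping the 256-512 range and step of 8, so the demo starts at a cheaper resolution. Below is a minimal, self-contained Gradio sketch of how such a slider default reaches a function's `size` argument; the `run` callback, button, and textbox are hypothetical stand-ins for the app's real wiring.

```python
import gradio as gr

# Hypothetical sketch: a size slider defaulting to 384, wired to a stub callback.
# In app.py the selected size presumably feeds audio2video()'s `size` argument,
# which resizes the reference image via cv2.resize(ref_image_np, (size, size)).
def run(size: float) -> str:
    side = int(size)
    return f"Would render at {side}x{side}"

with gr.Blocks() as demo:
    size_slider = gr.Slider(minimum=256, maximum=512, step=8, value=384,
                            label="Video size (-W & -H)")
    result = gr.Textbox(label="Result")
    run_btn = gr.Button("Run")
    run_btn.click(fn=run, inputs=size_slider, outputs=result)

if __name__ == "__main__":
    demo.launch()
```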