zejunyang committed
Commit ac336de
1 Parent(s): 4f3b622
Files changed (3)
  1. app.py +6 -25
  2. src/utils/crop_face_single.py +0 -3
  3. src/utils/mp_utils.py +0 -2
app.py CHANGED
@@ -99,9 +99,7 @@ pipe = pipe.to("cuda", dtype=weight_dtype)
 frame_inter_model = init_frame_interpolation_model()
 
 @spaces.GPU
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
-    print('=====Start processing======')
-
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
     fi_step = 3
@@ -121,11 +119,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     while os.path.exists(save_dir):
         save_dir = Path(f"a2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
     save_dir.mkdir(exist_ok=True, parents=True)
-
-    print('=====1======')
 
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
-    print('=====1======', ref_img.shape, ref_image_np.shape)
     ref_image_np = crop_face(ref_image_np, lmk_extractor)
     if ref_image_np is None:
         return None, Image.fromarray(ref_img)
@@ -133,22 +128,16 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
 
-    print('=====2======')
-
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
         return None, ref_image_pil
-
-    print('=====3======')
-
+
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
 
     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
     sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
-
-    print('=====4======')
 
     # inference
     pred = a2m_model.infer(sample['audio_feature'], sample['seq_len'])
@@ -156,8 +145,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pred = pred.reshape(pred.shape[0], -1, 3)
     pred = pred + face_result['lmks3d']
 
-    print('=====5======')
-
     if headpose_video is not None:
         pose_seq = get_headpose_temp(headpose_video)
     else:
@@ -172,8 +159,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     for i, verts in enumerate(projected_vertices):
         lmk_img = vis.draw_landmarks((width, height), verts, normed=False)
         pose_images.append(lmk_img)
-
-    print('=====6======')
 
     pose_list = []
     # pose_tensor_list = []
@@ -182,7 +167,7 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     #     [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L, 60)
+    args_L = min(args_L, 90)
     for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
@@ -192,8 +177,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     pose_list = np.array(pose_list)
 
     video_length = len(pose_list)
-
-    print('=====7======')
 
     video = pipe(
         ref_image_pil,
@@ -231,8 +214,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
 @spaces.GPU
 def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
-    print('=====Start processing======')
-
     cfg = 3.5
     fi_step = 3
 
@@ -282,7 +263,7 @@ def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     verts_list = []
     bs_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L, 60*step)
+    args_L = min(args_L, 90*step)
     for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
@@ -408,7 +389,7 @@ with gr.Blocks() as demo:
     a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
     with gr.Row():
-        a2v_length = gr.Slider(minimum=0, maximum=60, step=1, value=30, label="Length (-L)")
+        a2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
         a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
     a2v_botton = gr.Button("Generate", variant="primary")
@@ -436,7 +417,7 @@ with gr.Blocks() as demo:
     v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
     with gr.Row():
-        v2v_length = gr.Slider(minimum=0, maximum=60, step=1, value=30, label="Length (-L)")
+        v2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
         v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
     v2v_botton = gr.Button("Generate", variant="primary")
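Note on the app.py changes above: besides dropping the debug prints, the commit raises the hard cap on generated frames from 60 to 90 in both audio2video and video2video, and raises the Gradio "Length (-L)" sliders to maximum=90 to match. The sketch below is a hypothetical helper (not part of the repo) that mirrors the args_L / fi_step arithmetic from the diff, to show how many pose frames actually reach the diffusion pipeline.

def num_diffusion_keyframes(num_pose_images, length, cap=90, fi_step=3):
    """Hypothetical helper: mirrors the args_L computation and the [: args_L : fi_step] slice in audio2video."""
    args_L = num_pose_images if length == 0 or length > num_pose_images else length
    args_L = min(args_L, cap)              # hard cap raised from 60 to 90 by this commit
    return len(range(0, args_L, fi_step))  # only every fi_step-th pose image is denoised

# Example: a long pose sequence with length=0 (no user limit) at fi_step=3 yields
# at most 30 keyframes for the pipeline; the skipped frames are presumably
# recovered by the frame interpolation model loaded in app.py.
print(num_diffusion_keyframes(num_pose_images=300, length=0))  # -> 30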
src/utils/crop_face_single.py CHANGED
@@ -3,10 +3,7 @@ import cv2
 
 
 def crop_face(img, lmk_extractor, expand=1.5):
-    print('****=====1======')
     result = lmk_extractor(img) # cv2 BGR
-
-    print('****=====2======')
 
     if result is None:
         return None
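For context on crop_face above: app.py uses it as a guard step on the reference image and falls back when no face is found. A minimal usage sketch under that assumption follows; preprocess_reference is a hypothetical wrapper, and the import path is inferred from the file location.

import cv2
from PIL import Image

from src.utils.crop_face_single import crop_face  # path inferred from this repo layout

def preprocess_reference(ref_img_rgb, lmk_extractor, size=512):
    """Hypothetical wrapper mirroring the reference-image handling in audio2video."""
    ref_image_np = cv2.cvtColor(ref_img_rgb, cv2.COLOR_RGB2BGR)  # crop_face expects a cv2 BGR image
    cropped = crop_face(ref_image_np, lmk_extractor, expand=1.5)
    if cropped is None:  # no face detected
        return None
    cropped = cv2.resize(cropped, (size, size))
    return Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))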
src/utils/mp_utils.py CHANGED
@@ -38,7 +38,6 @@ class LMKExtractor():
 
 
     def __call__(self, img):
-        print('///=====1======')
         frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
         # t0 = time.time()
@@ -61,7 +60,6 @@ class LMKExtractor():
         except:
             return None
 
-        print('///=====2======')
         bs_list = detection_result.face_blendshapes
         if len(bs_list) == 1:
             bs = bs_list[0]
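For context on LMKExtractor.__call__ above: it takes a cv2 BGR frame, converts it to RGB for MediaPipe, and returns None when detection fails (including via the bare except). A hedged usage sketch follows; it assumes LMKExtractor() needs no constructor arguments and reads only the 'lmks' and 'lmks3d' keys that app.py is shown using.

import cv2

from src.utils.mp_utils import LMKExtractor  # path inferred from this repo layout

lmk_extractor = LMKExtractor()        # assumption: no constructor arguments required
img_bgr = cv2.imread("reference.jpg") # __call__ expects a cv2 BGR image

face_result = lmk_extractor(img_bgr)
if face_result is None:               # detection failed
    raise SystemExit("no face detected")

lmks = face_result['lmks']            # 2D landmarks, drawn as the reference pose in app.py
lmks3d = face_result['lmks3d']        # 3D landmarks that the audio2mesh prediction is added to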