Spaces: Running on Zero

zejunyang committed
Commit ac336de · Parent(s): 4f3b622

update

Files changed:
- app.py (+6 -25)
- src/utils/crop_face_single.py (+0 -3)
- src/utils/mp_utils.py (+0 -2)
app.py
CHANGED

@@ -99,9 +99,7 @@ pipe = pipe.to("cuda", dtype=weight_dtype)
 frame_inter_model = init_frame_interpolation_model()
 
 @spaces.GPU
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
-    print('=====Start processing======')
-
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
     fi_step = 3
@@ -121,11 +119,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     while os.path.exists(save_dir):
         save_dir = Path(f"a2v_output/{date_str}/{save_dir_name}_{np.random.randint(10000):04d}")
     save_dir.mkdir(exist_ok=True, parents=True)
-
-    print('=====1======')
 
     ref_image_np = cv2.cvtColor(ref_img, cv2.COLOR_RGB2BGR)
-    print('=====1======', ref_img.shape, ref_image_np.shape)
     ref_image_np = crop_face(ref_image_np, lmk_extractor)
     if ref_image_np is None:
         return None, Image.fromarray(ref_img)
@@ -133,22 +128,16 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     ref_image_np = cv2.resize(ref_image_np, (size, size))
     ref_image_pil = Image.fromarray(cv2.cvtColor(ref_image_np, cv2.COLOR_BGR2RGB))
 
-    print('=====2======')
-
     face_result = lmk_extractor(ref_image_np)
     if face_result is None:
         return None, ref_image_pil
-
-    print('=====3======')
-
+
     lmks = face_result['lmks'].astype(np.float32)
     ref_pose = vis.draw_landmarks((ref_image_np.shape[1], ref_image_np.shape[0]), lmks, normed=True)
 
     sample = prepare_audio_feature(input_audio, wav2vec_model_path=audio_infer_config['a2m_model']['model_path'])
     sample['audio_feature'] = torch.from_numpy(sample['audio_feature']).float().cuda()
     sample['audio_feature'] = sample['audio_feature'].unsqueeze(0)
-
-    print('=====4======')
 
     # inference
     pred = a2m_model.infer(sample['audio_feature'], sample['seq_len'])
@@ -156,8 +145,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     pred = pred.reshape(pred.shape[0], -1, 3)
     pred = pred + face_result['lmks3d']
 
-    print('=====5======')
-
     if headpose_video is not None:
         pose_seq = get_headpose_temp(headpose_video)
     else:
@@ -172,8 +159,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     for i, verts in enumerate(projected_vertices):
         lmk_img = vis.draw_landmarks((width, height), verts, normed=False)
         pose_images.append(lmk_img)
-
-    print('=====6======')
 
     pose_list = []
     # pose_tensor_list = []
@@ -182,7 +167,7 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     # [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L,
+    args_L = min(args_L, 90)
     for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
@@ -192,8 +177,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     pose_list = np.array(pose_list)
 
     video_length = len(pose_list)
-
-    print('=====7======')
 
     video = pipe(
         ref_image_pil,
@@ -231,8 +214,6 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
 
 @spaces.GPU
 def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
-    print('=====Start processing======')
-
     cfg = 3.5
     fi_step = 3
 
@@ -282,7 +263,7 @@ def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     verts_list = []
     bs_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L,
+    args_L = min(args_L, 90*step)
     for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
@@ -408,7 +389,7 @@ with gr.Blocks() as demo:
             a2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
         with gr.Row():
-            a2v_length = gr.Slider(minimum=0, maximum=
+            a2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
             a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
         a2v_botton = gr.Button("Generate", variant="primary")
@@ -436,7 +417,7 @@ with gr.Blocks() as demo:
             v2v_step_slider = gr.Slider(minimum=5, maximum=20, step=1, value=15, label="Steps (--steps)")
 
         with gr.Row():
-            v2v_length = gr.Slider(minimum=0, maximum=
+            v2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
            v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
        v2v_botton = gr.Button("Generate", variant="primary")
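Beyond stripping the debug prints, the functional change in app.py is the hard cap on how many frames are processed: 90 pose frames in audio2video and 90*step source frames in video2video, mirrored by the slider limits below. The following is a minimal, self-contained sketch of that selection logic with dummy data and illustrative values, not the Space's real tensors:

import numpy as np

# Stand-in pose frames; in the Space these come from vis.draw_landmarks, one per frame.
pose_images = [np.zeros((512, 512, 3), dtype=np.uint8) for _ in range(300)]
length, fi_step = 60, 3   # fi_step matches audio2video; length is an illustrative request

# Same pattern as audio2video: honour the requested length, then cap at 90 frames.
args_L = len(pose_images) if length == 0 or length > len(pose_images) else length
args_L = min(args_L, 90)

# Only every fi_step-th frame up to the cap is kept here; presumably the skipped frames
# are filled back in by the frame-interpolation model (frame_inter_model), which this
# diff initialises but whose use lies outside these hunks.
selected = pose_images[:args_L:fi_step]
print(len(selected))  # -> 20

# video2video applies the same idea with min(args_L, 90 * step) and a stride of step * fi_step.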
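The new 90-frame ceiling is mirrored in the UI: both Length sliders are now bounded at 90 with a default of 30. Below is a stripped-down sketch of how such a slider pair might be wired to a handler; the stub function, the Textbox output, and the click wiring are illustrative assumptions, not taken from this diff:

import gradio as gr

def fake_generate(length, seed):
    # Placeholder for the Space's audio2video handler, which takes more inputs
    # (audio, reference image, optional head-pose video) and returns a video.
    return f"would generate {int(length)} frames with seed {int(seed)}"

with gr.Blocks() as demo:
    with gr.Row():
        a2v_length = gr.Slider(minimum=0, maximum=90, step=1, value=30, label="Length (-L)")
        a2v_seed = gr.Number(value=42, label="Seed (--seed)")
    a2v_botton = gr.Button("Generate", variant="primary")
    status = gr.Textbox(label="Status")  # stand-in for the real video output component
    a2v_botton.click(fn=fake_generate, inputs=[a2v_length, a2v_seed], outputs=status)

if __name__ == "__main__":
    demo.launch()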
src/utils/crop_face_single.py
CHANGED

@@ -3,10 +3,7 @@ import cv2
 
 
 def crop_face(img, lmk_extractor, expand=1.5):
-    print('****=====1======')
     result = lmk_extractor(img) # cv2 BGR
-
-    print('****=====2======')
 
     if result is None:
         return None
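For context, crop_face is the helper audio2video calls on the BGR reference image before resizing; it returns None when the extractor finds no face, which the caller treats as a failure. A hedged usage sketch follows; the image path and the bare LMKExtractor() constructor are assumptions, not taken from this diff:

import cv2
from src.utils.mp_utils import LMKExtractor
from src.utils.crop_face_single import crop_face

lmk_extractor = LMKExtractor()           # assumed default constructor
img_bgr = cv2.imread("reference.jpg")    # crop_face expects a cv2 BGR image
cropped = crop_face(img_bgr, lmk_extractor, expand=1.5)  # expand is the default margin factor

if cropped is None:
    print("no face detected; app.py falls back to returning the original image")
else:
    cropped = cv2.resize(cropped, (512, 512))  # app.py resizes the crop to (size, size) next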
src/utils/mp_utils.py
CHANGED

@@ -38,7 +38,6 @@ class LMKExtractor():
 
 
     def __call__(self, img):
-        print('///=====1======')
         frame = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
         image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)
         # t0 = time.time()
@@ -61,7 +60,6 @@ class LMKExtractor():
         except:
             return None
 
-        print('///=====2======')
         bs_list = detection_result.face_blendshapes
         if len(bs_list) == 1:
             bs = bs_list[0]
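LMKExtractor.__call__ is the detector behind both crop_face and the reference-landmark step in app.py: it takes a BGR frame, converts it to RGB for MediaPipe, and returns None if detection throws. A hedged sketch of calling it directly; the constructor arguments and image path are assumptions, while the 'lmks' and 'lmks3d' keys are the ones app.py reads:

import cv2
from src.utils.mp_utils import LMKExtractor

lmk_extractor = LMKExtractor()        # assumed default constructor
frame_bgr = cv2.imread("face.jpg")    # __call__ expects BGR and does the BGR->RGB conversion itself
face_result = lmk_extractor(frame_bgr)

if face_result is None:               # returned when detection raises or finds no usable face
    print("landmark extraction failed")
else:
    lmks = face_result['lmks']        # 2D landmarks (app.py draws them with normed=True)
    lmks3d = face_result['lmks3d']    # 3D landmarks, added to the audio2mesh prediction
    print(lmks.shape, lmks3d.shape)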