zejunyang committed on
Commit 3e99418
1 Parent(s): fa7d98a
app.py CHANGED
@@ -98,10 +98,11 @@ vis = FaceMeshVisualizer()
 
 frame_inter_model = init_frame_interpolation_model()
 
-@spaces.GPU(duration=200)
-def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=150, seed=42):
+@spaces.GPU(duration=300)
+def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, length=60, seed=42):
     fps = 30
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 
@@ -161,8 +162,8 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
     #     [transforms.Resize((height, width)), transforms.ToTensor()]
     # )
     args_L = len(pose_images) if length==0 or length > len(pose_images) else length
-    args_L = min(args_L, 180)
-    for pose_image_np in pose_images[: args_L : 2]:
+    args_L = min(args_L, 150)
+    for pose_image_np in pose_images[: args_L : fi_step]:
         # pose_image_pil = Image.fromarray(cv2.cvtColor(pose_image_np, cv2.COLOR_BGR2RGB))
         # pose_tensor_list.append(pose_transform(pose_image_pil))
         pose_image_np = cv2.resize(pose_image_np, (width, height))
@@ -183,19 +184,21 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
         cfg,
         generator=generator,
     ).videos
+
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
 
-    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
-    # save_videos_grid(
-    #     video,
-    #     save_path,
-    #     n_rows=1,
-    #     fps=fps,
-    # )
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=fps,
+    )
 
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
 
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(fps))
 
     stream = ffmpeg.input(save_path)
     audio = ffmpeg.input(input_audio)
@@ -204,9 +207,10 @@ def audio2video(input_audio, ref_img, headpose_video=None, size=512, steps=25, l
 
     return save_path.replace('_noaudio.mp4', '.mp4'), ref_image_pil
 
-@spaces.GPU(duration=200)
-def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
+@spaces.GPU(duration=300)
+def video2video(ref_img, source_video, size=512, steps=25, length=60, seed=42):
     cfg = 3.5
+    fi_step = 3
 
     generator = torch.manual_seed(seed)
 
@@ -248,11 +252,9 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
     pose_trans_list = []
     verts_list = []
     bs_list = []
-    src_tensor_list = []
     args_L = len(source_images) if length==0 or length*step > len(source_images) else length*step
-    args_L = min(args_L, 180*step)
-    for src_image_pil in source_images[: args_L : step*2]:
-        src_tensor_list.append(pose_transform(src_image_pil))
+    args_L = min(args_L, 150*step)
+    for src_image_pil in source_images[: args_L : step*fi_step]:
         src_img_np = cv2.cvtColor(np.array(src_image_pil), cv2.COLOR_RGB2BGR)
         frame_height, frame_width, _ = src_img_np.shape
         src_img_result = lmk_extractor(src_img_np)
@@ -308,19 +310,21 @@ def video2video(ref_img, source_video, size=512, steps=25, length=150, seed=42):
         cfg,
         generator=generator,
     ).videos
+
+    video = batch_images_interpolation_tool(video, frame_inter_model, inter_frames=fi_step-1)
 
-    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
-    # save_videos_grid(
-    #     video,
-    #     save_path,
-    #     n_rows=1,
-    #     fps=src_fps,
-    # )
+    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio.mp4"
+    save_videos_grid(
+        video,
+        save_path,
+        n_rows=1,
+        fps=src_fps,
+    )
 
-    save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
-    save_pil_imgs(video, save_path)
+    # save_path = f"{save_dir}/{size}x{size}_{time_str}_noaudio"
+    # save_pil_imgs(video, save_path)
 
-    save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
+    # save_path = batch_images_interpolation_tool(save_path, frame_inter_model, int(src_fps))
 
     audio_output = f'{save_dir}/audio_from_video.aac'
     # extract audio
@@ -353,7 +357,7 @@ description = r"""
 """
 
 tips = r"""
-When the video cannot be displayed, you can download the result video.
+Here is an accelerated version of AniPortrait. Due to limitations in computing power, the wait time will be quite long. Please utilize the source code to experience the full performance.
 """
 
 with gr.Blocks() as demo:
@@ -372,10 +376,10 @@ with gr.Blocks() as demo:
 
         with gr.Row():
            a2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-           a2v_step_slider = gr.Slider(minimum=5, maximum=50, step=1, value=20, label="Steps (--steps)")
+           a2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
        with gr.Row():
-           a2v_length = gr.Slider(minimum=0, maximum=180, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
+           a2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
            a2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
        a2v_botton = gr.Button("Generate", variant="primary")
@@ -400,10 +404,10 @@ with gr.Blocks() as demo:
 
        with gr.Row():
            v2v_size_slider = gr.Slider(minimum=256, maximum=1024, step=8, value=512, label="Video size (-W & -H)")
-           v2v_step_slider = gr.Slider(minimum=5, maximum=50, step=1, value=20, label="Steps (--steps)")
+           v2v_step_slider = gr.Slider(minimum=5, maximum=30, step=1, value=20, label="Steps (--steps)")
 
        with gr.Row():
-           v2v_length = gr.Slider(minimum=0, maximum=180, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
+           v2v_length = gr.Slider(minimum=0, maximum=150, step=1, value=60, label="Length (-L) (Set 0 to automatically calculate video length.)")
            v2v_seed = gr.Number(value=42, label="Seed (--seed)")
 
        v2v_botton = gr.Button("Generate", variant="primary")
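
Taken together, the app.py changes trade denoising work for interpolation: only every fi_step-th pose frame is run through the diffusion pipeline, and the FILM model synthesizes the fi_step-1 frames in between before save_videos_grid writes the mp4. A rough sketch of the frame arithmetic; the helper below is hypothetical and not part of the commit, and it assumes the interpolation tool keeps the first frame of each pair plus the trailing source frame:

def effective_frame_count(args_L: int, fi_step: int = 3) -> int:
    # Hypothetical helper: how many frames reach the final mp4 when only every
    # fi_step-th pose frame is denoised and FILM re-inserts fi_step-1 frames
    # between each rendered pair (mirrors the new app.py flow).
    rendered = len(range(0, args_L, fi_step))        # frames actually denoised
    interpolated = (rendered - 1) * (fi_step - 1)    # frames synthesized by FILM
    return rendered + interpolated

print(effective_frame_count(150))   # 50 denoised + 98 interpolated = 148 (~5 s at 30 fps)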
src/utils/crop_face_single.py CHANGED
@@ -20,26 +20,36 @@ def crop_face(img, lmk_extractor, expand=1.5):
 
     width = x_max - x_min
     height = y_max - y_min
-
-    center_x = x_min + width / 2
-    center_y = y_min + height / 2
-
-    width *= expand
-    height *= expand
-
-    size = max(width, height)
-
-    x_min = int(center_x - size / 2)
-    x_max = int(center_x + size / 2)
-    y_min = int(center_y - size / 2)
-    y_max = int(center_y + size / 2)
-
-    top = max(0, -y_min)
-    bottom = max(0, y_max - img.shape[0])
-    left = max(0, -x_min)
-    right = max(0, x_max - img.shape[1])
-    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
-
-    cropped_img = img[y_min + top:y_max + top, x_min + left:x_max + left]
+
+    if width*height >= W*H*0.15:
+        if W == H:
+            return img
+        size = min(H, W)
+        offset = int((max(H, W) - size)/2)
+        if size == H:
+            return img[:, offset:-offset]
+        else:
+            return img[offset:-offset, :]
+    else:
+        center_x = x_min + width / 2
+        center_y = y_min + height / 2
+
+        width *= expand
+        height *= expand
+
+        size = max(width, height)
+
+        x_min = int(center_x - size / 2)
+        x_max = int(center_x + size / 2)
+        y_min = int(center_y - size / 2)
+        y_max = int(center_y + size / 2)
+
+        top = max(0, -y_min)
+        bottom = max(0, y_max - img.shape[0])
+        left = max(0, -x_min)
+        right = max(0, x_max - img.shape[1])
+        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0)
+
+        cropped_img = img[y_min + top:y_max + top, x_min + left:x_max + left]
 
     return cropped_img
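
The crop_face change adds an early exit: when the detected face box already covers at least 15% of the frame, the function returns a center square crop of the whole image instead of the expanded landmark-based crop. A minimal standalone sketch of that branch; the helper name is hypothetical, and it slices offset:offset+size so that odd margins still yield a square output:

import numpy as np

def center_square_crop(img: np.ndarray) -> np.ndarray:
    # Hypothetical standalone version of the early-exit branch above.
    H, W = img.shape[:2]
    if W == H:
        return img
    size = min(H, W)
    offset = (max(H, W) - size) // 2
    if size == H:
        return img[:, offset:offset + size]   # wide frame: trim left/right
    return img[offset:offset + size, :]       # tall frame: trim top/bottom

print(center_square_crop(np.zeros((480, 640, 3), dtype=np.uint8)).shape)   # (480, 480, 3)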
src/utils/frame_interpolation.py CHANGED
@@ -1,37 +1,32 @@
+# Adapted from https://github.com/dajes/frame-interpolation-pytorch
 import os
 import cv2
 import numpy as np
 import torch
 import bisect
 import shutil
+import pdb
+from tqdm import tqdm
 
 def init_frame_interpolation_model():
     print("Initializing frame interpolation model")
     checkpoint_name = os.path.join("./pretrained_model/film_net_fp16.pt")
 
-    model = torch.load(checkpoint_name, map_location='cpu')
+    model = torch.jit.load(checkpoint_name, map_location='cpu')
     model.eval()
     model = model.half()
    model = model.to(device="cuda")
    return model
 
 
-def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
-
-    image_save_dir = input_file + '_tmp'
-    os.makedirs(image_save_dir, exist_ok=True)
-
-    input_img_list = os.listdir(input_file)
-    input_img_list.sort()
-
-    for idx in range(len(input_img_list)-1):
-        img1 = cv2.imread(os.path.join(input_file, input_img_list[idx]))
-        img2 = cv2.imread(os.path.join(input_file, input_img_list[idx+1]))
+def batch_images_interpolation_tool(input_tensor, model, inter_frames=1):
+
+    video_tensor = []
+    frame_num = input_tensor.shape[2]  # bs, channel, frame, height, width
 
-        image1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB).astype(np.float32) / np.float32(255)
-        image2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB).astype(np.float32) / np.float32(255)
-        image1 = torch.from_numpy(image1).unsqueeze(0).permute(0, 3, 1, 2)
-        image2 = torch.from_numpy(image2).unsqueeze(0).permute(0, 3, 1, 2)
+    for idx in tqdm(range(frame_num-1)):
+        image1 = input_tensor[:,:,idx]
+        image2 = input_tensor[:,:,idx+1]
 
        results = [image1, image2]
 
@@ -66,25 +61,9 @@ def batch_images_interpolation_tool(input_file, model, fps, inter_frames=1):
            results.insert(insert_position, prediction.clamp(0, 1).cpu().float())
            del remains[step]
 
-        frames = [(tensor[0] * 255).byte().flip(0).permute(1, 2, 0).numpy().copy() for tensor in results]
-
-        for sub_idx in range(len(frames)):
-            img_path = os.path.join(image_save_dir, f'{sub_idx+idx*(inter_frames+1):06d}.png')
-            cv2.imwrite(img_path, frames[sub_idx])
-
-    final_frames = []
-    final_img_list = os.listdir(image_save_dir)
-    final_img_list.sort()
-    for item in final_img_list:
-        final_frames.append(cv2.imread(os.path.join(image_save_dir, item)))
-    w, h = final_frames[0].shape[1::-1]
-    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
-    video_save_dir = input_file + '.mp4'
-    writer = cv2.VideoWriter(video_save_dir, fourcc, fps, (w, h))
-    for frame in final_frames:
-        writer.write(frame)
-    writer.release()
-
-    shutil.rmtree(image_save_dir)
-
-    return video_save_dir
+        for sub_idx in range(len(results)-1):
+            video_tensor.append(results[sub_idx].unsqueeze(2))
+
+    video_tensor.append(input_tensor[:,:,-1].unsqueeze(2))
+    video_tensor = torch.cat(video_tensor, dim=2)
+    return video_tensor
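
frame_interpolation.py now loads the TorchScript FILM checkpoint with torch.jit.load and interpolates directly on the (bs, channel, frame, height, width) video tensor instead of round-tripping through PNG files and cv2.VideoWriter. A hedged usage sketch, assuming the checkpoint and a CUDA device are available, that the script is run from the repository root, and that the unchanged insertion loop adds inter_frames frames per consecutive pair:

import torch
from src.utils.frame_interpolation import (
    init_frame_interpolation_model,
    batch_images_interpolation_tool,
)

model = init_frame_interpolation_model()    # FILM, fp16, moved to CUDA

video = torch.rand(1, 3, 50, 512, 512)      # (bs, channel, frame, height, width)
smooth = batch_images_interpolation_tool(video, model, inter_frames=2)

# Each of the 49 consecutive pairs contributes 1 original + 2 synthesized frames,
# plus the trailing source frame: 49 * 3 + 1 = 148 output frames.
print(smooth.shape)                         # expected: (1, 3, 148, 512, 512)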