salomonsky committed on
Commit
8190818
·
verified ·
1 Parent(s): 2fee994

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +55 -1
inference.py CHANGED
@@ -8,10 +8,64 @@ import torch, face_detection
8
  from models import Wav2Lip
9
  import platform
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
12
  print('Using {} for inference.'.format(device))
13
 
14
-
15
  parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')
16
 
17
  parser.add_argument('--checkpoint_path', type=str,
 
8
  from models import Wav2Lip
9
  import platform
10
 
11
def face_detect(images):
    """Detect a face in every image and return the padded face crops.

    Images whose height or width exceeds ``max_size`` are downscaled (aspect
    ratio preserved) on a private copy before being handed to the detector.
    The detected boxes are then mapped back to the resolution of the images
    that were passed in, so the returned crops and coordinates always refer
    to the caller's original images. ``images`` itself is never modified.

    Args:
        images: Sequence of image arrays whose ``shape[:2]`` is
            ``(height, width)``. Images are stacked with ``np.array`` per
            batch, so images sharing a batch must have the same shape.

    Returns:
        A list with one ``[crop, (y1, y2, x1, x2)]`` entry per input image,
        where ``crop`` is ``image[y1:y2, x1:x2]`` taken from the input image.

    Raises:
        RuntimeError: If detection still raises ``RuntimeError`` with a batch
            size of 1.
        ValueError: If the detector returns ``None`` for any image. The
            offending image is written to ``temp/faulty_frame.jpg`` first.
    """
    detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                            flip_input=False, device=device)

    batch_size = args.face_det_batch_size
    max_size = 720  # Change to 512 if you prefer that maximum size

    # Check image sizes and build downscaled copies for detection only.
    # The per-image ratio is kept so boxes can be mapped back afterwards.
    detect_images = []
    ratios = []
    announced = False
    for image in images:
        h, w = image.shape[:2]
        ratio = 1.0
        if h > max_size or w > max_size:
            if not announced:
                # Printed once; a long video would otherwise flood the log.
                print("Image too big, resizing...")
                announced = True
            # cv2.resize expects the target size as (width, height).
            if h > w:
                ratio = max_size / float(h)
                new_size = (max(1, int(w * ratio)), max_size)
            else:
                ratio = max_size / float(w)
                new_size = (max_size, max(1, int(h * ratio)))
            image = cv2.resize(image, new_size)
        detect_images.append(image)
        ratios.append(ratio)

    # Retry with a halved batch size whenever detection raises RuntimeError
    # (treated as out-of-memory), giving up once a batch of 1 still fails.
    while True:
        predictions = []
        try:
            for i in tqdm(range(0, len(detect_images), batch_size)):
                predictions.extend(detector.get_detections_for_batch(
                    np.array(detect_images[i:i + batch_size])))
        except RuntimeError:
            if batch_size == 1:
                raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
            batch_size //= 2
            print('Recovering from OOM error; New batch size: {}'.format(batch_size))
            continue
        break

    results = []
    pady1, pady2, padx1, padx2 = args.pads
    for rect, image, ratio in zip(predictions, images, ratios):
        if rect is None:
            cv2.imwrite('temp/faulty_frame.jpg', image)
            raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')

        if ratio != 1.0:
            # Map the box from detection resolution back to the input image.
            # Values stay Python ints so they remain usable as slice indices.
            rect = [int(round(c / ratio)) for c in rect[:4]]

        # Pad and clamp in the coordinate space of the input image.
        y1 = max(0, rect[1] - pady1)
        y2 = min(image.shape[0], rect[3] + pady2)
        x1 = max(0, rect[0] - padx1)
        x2 = min(image.shape[1], rect[2] + padx2)

        results.append([x1, y1, x2, y2])

    boxes = np.array(results)
    if not args.nosmooth:
        boxes = get_smoothened_boxes(boxes, T=5)
    results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2)]
               for image, (x1, y1, x2, y2) in zip(images, boxes)]

    del detector
    return results
64
+
65
+
66
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
67
  print('Using {} for inference.'.format(device))
68
 
 
69
  parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')
70
 
71
  parser.add_argument('--checkpoint_path', type=str,