LipCoordNet / inference.py
ffeew's picture
init
ca51874
import argparse
import os
from model import LipCoordNet
from dataset import MyDataset
import torch
import cv2
import face_alignment
import numpy as np
import dlib
import glob
def get_position(size, padding=0.25):
x = [
0.000213256,
0.0752622,
0.18113,
0.29077,
0.393397,
0.586856,
0.689483,
0.799124,
0.904991,
0.98004,
0.490127,
0.490127,
0.490127,
0.490127,
0.36688,
0.426036,
0.490127,
0.554217,
0.613373,
0.121737,
0.187122,
0.265825,
0.334606,
0.260918,
0.182743,
0.645647,
0.714428,
0.793132,
0.858516,
0.79751,
0.719335,
0.254149,
0.340985,
0.428858,
0.490127,
0.551395,
0.639268,
0.726104,
0.642159,
0.556721,
0.490127,
0.423532,
0.338094,
0.290379,
0.428096,
0.490127,
0.552157,
0.689874,
0.553364,
0.490127,
0.42689,
]
y = [
0.106454,
0.038915,
0.0187482,
0.0344891,
0.0773906,
0.0773906,
0.0344891,
0.0187482,
0.038915,
0.106454,
0.203352,
0.307009,
0.409805,
0.515625,
0.587326,
0.609345,
0.628106,
0.609345,
0.587326,
0.216423,
0.178758,
0.179852,
0.231733,
0.245099,
0.244077,
0.231733,
0.179852,
0.178758,
0.216423,
0.244077,
0.245099,
0.780233,
0.745405,
0.727388,
0.742578,
0.727388,
0.745405,
0.780233,
0.864805,
0.902192,
0.909281,
0.902192,
0.864805,
0.784792,
0.778746,
0.785343,
0.778746,
0.784792,
0.824182,
0.831803,
0.824182,
]
x, y = np.array(x), np.array(y)
x = (x + padding) / (2 * padding + 1)
y = (y + padding) / (2 * padding + 1)
x = x * size
y = y * size
return np.array(list(zip(x, y)))
def transformation_from_points(points1, points2):
points1 = points1.astype(np.float64)
points2 = points2.astype(np.float64)
c1 = np.mean(points1, axis=0)
c2 = np.mean(points2, axis=0)
points1 -= c1
points2 -= c2
s1 = np.std(points1)
s2 = np.std(points2)
points1 /= s1
points2 /= s2
U, S, Vt = np.linalg.svd(points1.T * points2)
R = (U * Vt).T
return np.vstack(
[
np.hstack(((s2 / s1) * R, c2.T - (s2 / s1) * R * c1.T)),
np.matrix([0.0, 0.0, 1.0]),
]
)
def load_video(file, device: str):
# create the samples directory if it doesn't exist
if not os.path.exists("samples"):
os.makedirs("samples")
p = os.path.join("samples")
output = os.path.join("samples", "%04d.jpg")
cmd = "ffmpeg -hide_banner -loglevel error -i {} -qscale:v 2 -r 25 {}".format(
file, output
)
os.system(cmd)
files = os.listdir(p)
files = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))
array = [cv2.imread(os.path.join(p, file)) for file in files]
array = list(filter(lambda im: not im is None, array))
fa = face_alignment.FaceAlignment(
face_alignment.LandmarksType._2D, flip_input=False, device=device
)
points = [fa.get_landmarks(I) for I in array]
front256 = get_position(256)
video = []
for point, scene in zip(points, array):
if point is not None:
shape = np.array(point[0])
shape = shape[17:]
M = transformation_from_points(np.matrix(shape), np.matrix(front256))
img = cv2.warpAffine(scene, M[:2], (256, 256))
(x, y) = front256[-20:].mean(0).astype(np.int32)
w = 160 // 2
img = img[y - w // 2 : y + w // 2, x - w : x + w, ...]
img = cv2.resize(img, (128, 64))
video.append(img)
video = np.stack(video, axis=0).astype(np.float32)
video = torch.FloatTensor(video.transpose(3, 0, 1, 2)) / 255.0
return video
def extract_lip_coordinates(detector, predictor, img_path):
image = cv2.imread(img_path)
image = cv2.resize(image, (600, 500))
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
rects = detector(gray)
retries = 3
while retries > 0:
try:
assert len(rects) == 1
break
except AssertionError as e:
retries -= 1
for rect in rects:
# apply the shape predictor to the face ROI
shape = predictor(gray, rect)
x = []
y = []
for n in range(48, 68):
x.append(shape.part(n).x)
y.append(shape.part(n).y)
return [x, y]
def generate_lip_coordinates(frame_images_directory, detector, predictor):
frames = glob.glob(frame_images_directory + "/*.jpg")
frames.sort()
img = cv2.imread(frames[0])
height, width, layers = img.shape
coords = []
for frame in frames:
x_coords, y_coords = extract_lip_coordinates(detector, predictor, frame)
normalized_coords = []
for x, y in zip(x_coords, y_coords):
normalized_x = x / width
normalized_y = y / height
normalized_coords.append((normalized_x, normalized_y))
coords.append(normalized_coords)
coords_array = np.array(coords, dtype=np.float32)
coords_array = torch.from_numpy(coords_array)
return coords_array
def ctc_decode(y):
y = y.argmax(-1)
t = y.size(0)
result = []
for i in range(t + 1):
result.append(MyDataset.ctc_arr2txt(y[:i], start=1))
return result
def output_video(p, txt, output_path):
files = os.listdir(p)
files = sorted(files, key=lambda x: int(os.path.splitext(x)[0]))
font = cv2.FONT_HERSHEY_SIMPLEX
for file, line in zip(files, txt):
img = cv2.imread(os.path.join(p, file))
h, w, _ = img.shape
img = cv2.putText(
img, line, (w // 8, 11 * h // 12), font, 1.2, (0, 0, 0), 3, cv2.LINE_AA
)
img = cv2.putText(
img,
line,
(w // 8, 11 * h // 12),
font,
1.2,
(255, 255, 255),
0,
cv2.LINE_AA,
)
h = h // 2
w = w // 2
img = cv2.resize(img, (w, h))
cv2.imwrite(os.path.join(p, file), img)
# create the output_videos directory if it doesn't exist
if not os.path.exists(output_path):
os.makedirs(output_path)
output = os.path.join(output_path, "output.mp4")
cmd = "ffmpeg -hide_banner -loglevel error -y -i {}/%04d.jpg -r 25 {}".format(
p, output
)
os.system(cmd)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--weights",
type=str,
default="pretrain/LipCoordNet_coords_loss_0.025581153109669685_wer_0.01746208431890914_cer_0.006488426950253695.pt",
help="path to the weights file",
)
parser.add_argument(
"--input_video",
type=str,
help="path to the input video frames",
)
parser.add_argument(
"--device",
type=str,
default="cuda",
help="device to run the model on",
)
parser.add_argument(
"--output_path",
type=str,
default="output_videos",
help="directory to save the output video",
)
args = parser.parse_args()
# validate if device is valid
if args.device not in ("cuda", "cpu"):
raise ValueError("Invalid device, must be either cuda or cpu")
device = args.device
# load model
model = LipCoordNet()
model.load_state_dict(torch.load(args.weights))
model = model.to(device)
model.eval()
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor(
"lip_coordinate_extraction/shape_predictor_68_face_landmarks_GTX.dat"
)
# load video
video = load_video(args.input_video, device)
# generate lip coordinates
coords = generate_lip_coordinates("samples", detector, predictor)
pred = model(video[None, ...].to(device), coords[None, ...].to(device))
output = ctc_decode(pred[0])
print(output[-1])
output_video("samples", output, args.output_path)
if __name__ == "__main__":
main()