#!/usr/bin/env python

from __future__ import annotations

import sys
from typing import Callable

import cv2
import gradio as gr
import huggingface_hub
import numpy as np
import PIL.Image
import spaces
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from scipy.spatial.transform import Rotation

sys.path.insert(0, "face_detection")
sys.path.insert(0, "deep-head-pose/code")

from hopenet import Hopenet
from ibug.face_detection import RetinaFacePredictor

DESCRIPTION = "# [Hopenet](https://github.com/natanielruiz/deep-head-pose)"


def load_model(model_name: str, device: torch.device) -> nn.Module:
    path = huggingface_hub.hf_hub_download("public-data/Hopenet", f"models/{model_name}.pkl")
    state_dict = torch.load(path, map_location="cpu")
    # Hopenet: a ResNet-50 backbone with three 66-bin classification heads
    # (yaw, pitch, roll).
    model = Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model


def create_transform() -> Callable:
    transform = T.Compose(
        [
            T.Resize(224),
            T.CenterCrop(224),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    return transform


def crop_face(image: np.ndarray, box: tuple[int, int, int, int]) -> np.ndarray:
    # Expand the detector box (more headroom above the face than below),
    # then clamp it to the image bounds before cropping.
    x0, y0, x1, y1 = box
    w = x1 - x0
    h = y1 - y0
    x0 -= 2 * w // 4
    x1 += 2 * w // 4
    y0 -= 3 * h // 4
    y1 += h // 4
    x0 = max(x0, 0)
    y0 = max(y0, 0)
    x1 = min(x1, image.shape[1])
    y1 = min(y1, image.shape[0])
    image = image[y0:y1, x0:x1]
    return image


def draw_axis(image: np.ndarray, pose: np.ndarray, origin: np.ndarray, length: int) -> None:
    # (yaw, pitch, roll) -> (roll, yaw, pitch)
    pose = pose[[2, 0, 1]]
    pose *= np.array([1, -1, 1])
    rot = Rotation.from_euler("zyx", pose, degrees=True)
    vectors = rot.as_matrix().T[:, :2]  # shape: (3, 2)
    pts = np.round(vectors * length + origin).astype(int)
    # Projected x-, y-, and z-axes in red, green, and blue (BGR image).
    cv2.line(image, tuple(origin), tuple(pts[0]), (0, 0, 255), 3)
    cv2.line(image, tuple(origin), tuple(pts[1]), (0, 255, 0), 3)
    cv2.line(image, tuple(origin), tuple(pts[2]), (255, 0, 0), 2)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

face_detector = RetinaFacePredictor(threshold=0.8, device="cpu", model=RetinaFacePredictor.get_model("mobilenet0.25"))
face_detector.device = device
face_detector.net.to(device)

model_names = [
    "hopenet_alpha1",
    "hopenet_alpha2",
    "hopenet_robust_alpha1",
]
models = {name: load_model(name, device) for name in model_names}

transform = create_transform()


@spaces.GPU
@torch.inference_mode()
def run(
    image: np.ndarray,
    model_name: str,
) -> np.ndarray:
    model = models[model_name]

    # RGB -> BGR
    det_faces = face_detector(image[:, :, ::-1], rgb=False)

    indices = torch.arange(66).float().to(device)

    res = image[:, :, ::-1].copy()
    for det_face in det_faces:
        box = np.round(det_face[:4]).astype(int)

        # RGB
        face_image = crop_face(image, box.tolist())
        face_image = PIL.Image.fromarray(face_image)
        data = transform(face_image)
        data = data.to(device)

        # The output of the model is a tuple of 3 tensors (yaw, pitch, roll).
        # The shape of each tensor is (1, 66).
        out = model(data[None, ...])
        out = torch.stack(out, dim=1)  # shape: (1, 3, 66)
        out = F.softmax(out, dim=2)
        # Decode each angle as the expectation over the 66 bins, mapping bin
        # indices to degrees (3-degree bins spanning [-99, 99)).
        out = (out * indices).sum(dim=2) * 3 - 99
        angles = out.cpu().numpy()[0]

        center = (box[:2] + box[2:]) // 2
        length = (box[3] - box[1]) // 2
        draw_axis(res, angles, center, length)

    # BGR -> RGB
    return res[:, :, ::-1]


examples = [["images/pexels-ksenia-chernaya-8535230.jpg", "hopenet_alpha1"]]

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        with gr.Column():
            image = gr.Image(label="Input", type="numpy")
            model_name = gr.Radio(label="Model", choices=model_names, type="value", value=model_names[0])
            run_button = gr.Button("Run")
        with gr.Column():
            result = gr.Image(label="Output")
    gr.Examples(
        examples=examples,
        inputs=[image, model_name],
        outputs=result,
        fn=run,
    )
    run_button.click(
        fn=run,
        inputs=[image, model_name],
        outputs=result,
        api_name="run",
    )

if __name__ == "__main__":
    demo.queue().launch()