Spaces:

Fucius
/

OMG-InstantID

Running on Zero

File size: 13,268 Bytes

2eafbc4

import math
from time import perf_counter
from typing import List, Optional, Tuple, Union

import cv2
import mediapipe as mp
import numpy as np
import onnxruntime
import torch
import torch.nn as nn
import torchvision
from mediapipe.tasks.python.components.containers.bounding_box import BoundingBox
from mediapipe.tasks.python.components.containers.category import Category
from mediapipe.tasks.python.components.containers.detections import Detection
from torchvision import transforms

from inference.core.entities.requests.gaze import GazeDetectionInferenceRequest
from inference.core.entities.responses.gaze import (
    GazeDetectionInferenceResponse,
    GazeDetectionPrediction,
)
from inference.core.entities.responses.inference import FaceDetectionPrediction, Point
from inference.core.env import (
    GAZE_MAX_BATCH_SIZE,
    MODEL_CACHE_DIR,
    REQUIRED_ONNX_PROVIDERS,
    TENSORRT_CACHE_PATH,
)
from inference.core.exceptions import OnnxProviderNotAvailable
from inference.core.models.roboflow import OnnxRoboflowCoreModel
from inference.core.utils.image_utils import load_image_rgb
from inference.models.gaze.l2cs import L2CS


class Gaze(OnnxRoboflowCoreModel):
    """Roboflow ONNX Gaze model.

    This class is responsible for handling the ONNX Gaze model, including
    loading the model, preprocessing the input, and performing inference.
    Faces are located with a MediaPipe face detector and each face crop is
    then passed to the L2CS gaze network through ONNX Runtime.

    Attributes:
        gaze_onnx_session (onnxruntime.InferenceSession): ONNX Runtime session for gaze detection inference.
        face_detector (mp.tasks.vision.FaceDetector): MediaPipe face detector running in IMAGE mode.
        task_type (str): Always ``"gaze-detection"``.
    """

    def __init__(self, *args, **kwargs):
        """Initializes the Gaze with the given arguments and keyword arguments.

        All arguments are forwarded unchanged to the parent constructor.

        Raises:
            OnnxProviderNotAvailable: If a provider listed in
                ``REQUIRED_ONNX_PROVIDERS`` is not reported as available by
                ONNX Runtime.
        """

        t1 = perf_counter()
        super().__init__(*args, **kwargs)
        # Create an ONNX Runtime Session with a list of execution providers in priority order. ORT attempts to load providers until one is successful. This keeps the code across devices identical.
        self.log("Creating inference sessions")

        # TODO: convert face detector (TensorflowLite) to ONNX model

        self.gaze_onnx_session = onnxruntime.InferenceSession(
            self.cache_file("L2CSNet_gaze360_resnet50_90bins.onnx"),
            providers=[
                (
                    "TensorrtExecutionProvider",
                    {
                        "trt_engine_cache_enable": True,
                        "trt_engine_cache_path": TENSORRT_CACHE_PATH,
                    },
                ),
                "CUDAExecutionProvider",
                "CPUExecutionProvider",
            ],
        )

        # Fail loudly when the deployment demands a provider that this
        # runtime does not offer, instead of silently falling back.
        if REQUIRED_ONNX_PROVIDERS:
            available_providers = onnxruntime.get_available_providers()
            for provider in REQUIRED_ONNX_PROVIDERS:
                if provider not in available_providers:
                    raise OnnxProviderNotAvailable(
                        f"Required ONNX Execution Provider {provider} is not availble. Check that you are using the correct docker image on a supported device."
                    )

        # init face detector
        self.face_detector = mp.tasks.vision.FaceDetector.create_from_options(
            mp.tasks.vision.FaceDetectorOptions(
                base_options=mp.tasks.BaseOptions(
                    model_asset_path=self.cache_file("mediapipe_face_detector.tflite")
                ),
                running_mode=mp.tasks.vision.RunningMode.IMAGE,
            )
        )

        # additional settings for gaze detection: convert to tensor, resize
        # to 448 and normalise each channel with the constants below
        self._gaze_transformations = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Resize(448),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )
        self.task_type = "gaze-detection"
        self.log(f"GAZE model loaded in {perf_counter() - t1:.2f} seconds")

    def _crop_face_img(self, np_img: np.ndarray, face: Detection) -> np.ndarray:
        """Extract facial area in an image.

        The bounding box is clamped to the image before slicing. Without the
        clamp, a box whose origin is negative would be interpreted by numpy
        as an index from the end of the axis and produce a wrong or empty
        crop, and ``cv2.resize`` fails on an empty array.

        Args:
            np_img (np.ndarray): The numpy image, indexed as (row, column, channel).
            face (mediapipe.tasks.python.components.containers.detections.Detection): The detected face.

        Returns:
            np.ndarray: Cropped face image resized to 224x224.
        """
        img_h, img_w = np_img.shape[:2]
        bbox = face.bounding_box

        # Clamp the top-left corner inside the image ...
        x_min = min(max(bbox.origin_x, 0), img_w - 1)
        y_min = min(max(bbox.origin_y, 0), img_h - 1)
        # ... and keep the bottom-right corner inside the image while
        # guaranteeing a crop of at least one pixel in each direction.
        x_max = min(max(bbox.origin_x + bbox.width, x_min + 1), img_w)
        y_max = min(max(bbox.origin_y + bbox.height, y_min + 1), img_h)

        face_img = np_img[y_min:y_max, x_min:x_max, :]
        return cv2.resize(face_img, (224, 224))

    def _detect_gaze(self, np_imgs: List[np.ndarray]) -> List[Tuple[float, float]]:
        """Detect gazes for a list of cropped face images.

        Images are pushed through the ONNX session in chunks of at most
        ``GAZE_MAX_BATCH_SIZE``.

        Args:
            np_imgs (List[np.ndarray]): The numpy image list, each image is a cropped facial image.

        Returns:
            List[Tuple[float, float]]: Yaw (radian) and Pitch (radian), one
            pair per input image, in input order. Empty when ``np_imgs`` is
            empty.
        """
        if not np_imgs:
            return []

        # The input name does not change between batches; look it up once.
        input_name = self.gaze_onnx_session.get_inputs()[0].name

        ret = []
        for start in range(0, len(np_imgs), GAZE_MAX_BATCH_SIZE):
            chunk = np_imgs[start : start + GAZE_MAX_BATCH_SIZE]
            # Each transformed image gains a leading batch axis so the chunk
            # can be concatenated into a single float32 batch.
            img_batch = np.concatenate(
                [
                    np.expand_dims(self._gaze_transformations(img), axis=0).astype(
                        np.float32
                    )
                    for img in chunk
                ],
                axis=0,
            )
            yaw, pitch = self.gaze_onnx_session.run(None, {input_name: img_batch})

            for j in range(len(img_batch)):
                ret.append((yaw[j], pitch[j]))

        return ret

    def _make_response(
        self,
        faces: List[Detection],
        gazes: List[Tuple[float, float]],
        imgW: int,
        imgH: int,
        time_total: float,
        time_face_det: Optional[float] = None,
        time_gaze_det: Optional[float] = None,
    ) -> GazeDetectionInferenceResponse:
        """Prepare response object from detected faces and corresponding gazes.

        Args:
            faces (List[Detection]): The detected faces.
            gazes (List[tuple(float, float)]): The detected gazes (yaw, pitch), aligned with ``faces``.
            imgW (int): The width (px) of original image.
            imgH (int): The height (px) of original image.
            time_total (float): The total processing time.
            time_face_det (Optional[float]): The face detection time, if measured.
            time_gaze_det (Optional[float]): The gaze detection time, if measured.

        Returns:
            GazeDetectionInferenceResponse: The response object including the detected faces and gazes info.
        """
        predictions = []
        for face, gaze in zip(faces, gazes):
            # Keypoint coordinates are scaled by the image size and clamped
            # to valid pixel indices. ``keypoints`` may be missing entirely.
            landmarks = [
                Point(
                    x=min(max(int(keypoint.x * imgW), 0), imgW - 1),
                    y=min(max(int(keypoint.y * imgH), 0), imgH - 1),
                )
                for keypoint in (face.keypoints or [])
            ]

            # The response reports the box by its center, not its origin.
            bbox = face.bounding_box
            x_center = bbox.origin_x + bbox.width / 2
            y_center = bbox.origin_y + bbox.height / 2
            score = face.categories[0].score

            predictions.append(
                GazeDetectionPrediction(
                    face=FaceDetectionPrediction(
                        x=x_center,
                        y=y_center,
                        width=bbox.width,
                        height=bbox.height,
                        confidence=score,
                        class_name="face",
                        landmarks=landmarks,
                    ),
                    yaw=gaze[0],
                    pitch=gaze[1],
                )
            )

        return GazeDetectionInferenceResponse(
            predictions=predictions,
            time=time_total,
            time_face_det=time_face_det,
            time_gaze_det=time_gaze_det,
        )

    def get_infer_bucket_file_list(self) -> List[str]:
        """Gets the list of files required for inference.

        Returns:
            List[str]: The list of file names.
        """
        return [
            "mediapipe_face_detector.tflite",
            "L2CSNet_gaze360_resnet50_90bins.onnx",
        ]

    def infer_from_request(
        self, request: GazeDetectionInferenceRequest
    ) -> List[GazeDetectionInferenceResponse]:
        """Detect faces and gazes in image(s).

        Args:
            request (GazeDetectionInferenceRequest): The request object containing the image.
                When ``request.do_run_face_detection`` is false, every image is
                treated as a single face covering the whole image.

        Returns:
            List[GazeDetectionInferenceResponse]: The list of response objects containing the faces and corresponding gazes,
            one per input image. All reported times are averaged per image.

        Raises:
            ValueError: If more than ``GAZE_MAX_BATCH_SIZE`` images are supplied.
        """
        if isinstance(request.image, list):
            if len(request.image) > GAZE_MAX_BATCH_SIZE:
                raise ValueError(
                    f"The maximum number of images that can be inferred with gaze detection at one time is {GAZE_MAX_BATCH_SIZE}"
                )
            imgs = request.image
        else:
            imgs = [request.image]

        # Nothing to do; also avoids dividing by zero in the timings below.
        if not imgs:
            return []

        time_total = perf_counter()

        # load images as numpy arrays
        num_img = len(imgs)
        np_imgs = [load_image_rgb(img) for img in imgs]

        # face detection
        # TODO: face detection for batch
        time_face_det = perf_counter()
        faces = []
        for np_img in np_imgs:
            if request.do_run_face_detection:
                mp_img = mp.Image(
                    image_format=mp.ImageFormat.SRGB, data=np_img.astype(np.uint8)
                )
                faces_per_img = self.face_detector.detect(mp_img).detections
            else:
                # Synthesize one full-image "face" with full confidence.
                faces_per_img = [
                    Detection(
                        bounding_box=BoundingBox(
                            origin_x=0,
                            origin_y=0,
                            width=np_img.shape[1],
                            height=np_img.shape[0],
                        ),
                        categories=[Category(score=1.0, category_name="face")],
                        keypoints=[],
                    )
                ]
            faces.append(faces_per_img)
        time_face_det = (perf_counter() - time_face_det) / num_img

        # gaze detection: all face crops of all images go through the gaze
        # network as one flat list, in image order
        time_gaze_det = perf_counter()
        face_imgs = []
        for np_img, faces_per_img in zip(np_imgs, faces):
            if request.do_run_face_detection:
                face_imgs.extend(
                    [self._crop_face_img(np_img, face) for face in faces_per_img]
                )
            else:
                face_imgs.append(cv2.resize(np_img, (224, 224)))
        gazes = self._detect_gaze(face_imgs)
        time_gaze_det = (perf_counter() - time_gaze_det) / num_img

        time_total = (perf_counter() - time_total) / num_img

        # prepare response
        response = []
        idx_gaze = 0
        for np_img, faces_per_img in zip(np_imgs, faces):
            imgH, imgW, _ = np_img.shape
            num_faces = len(faces_per_img)
            gazes_per_img = gazes[idx_gaze : idx_gaze + num_faces]
            # Advance the cursor into the flat gaze list so the next image
            # receives its own gazes rather than those of the first image.
            idx_gaze += num_faces
            response.append(
                self._make_response(
                    faces_per_img,
                    gazes_per_img,
                    imgW,
                    imgH,
                    time_total,
                    time_face_det,
                    time_gaze_det,
                )
            )

        return response


class L2C2Wrapper(L2CS):
    """Roboflow L2CS Gaze detection model.

    This class is responsible for converting L2CS model to ONNX model.
    It is ONLY intended for internal usage.

    The wrapper's ``forward`` turns the per-bin outputs of the parent model
    into yaw and pitch angles in radians, so the exported ONNX graph returns
    angles directly.

    Workflow:
        After training a L2CS model, create an instance of this wrapper class.
        Load the trained weights file, and save it as ONNX model.
    """

    def __init__(self):
        """Build the parent network on CPU with 90 angle bins."""
        # Kept before super().__init__() to preserve the original
        # initialization order.
        self.device = torch.device("cpu")
        self.num_bins = 90
        super().__init__(
            torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], self.num_bins
        )
        self._gaze_softmax = nn.Softmax(dim=1)
        # Bin indices 0 .. num_bins - 1 as float32. Stored as a plain
        # attribute (not a buffer) so the state dict keys stay unchanged.
        self._gaze_idx_tensor = torch.arange(self.num_bins, dtype=torch.float32).to(
            self.device
        )

    def _bins_to_radians(self, logits):
        """Convert per-bin scores of shape (batch, num_bins) to angles in radians.

        The softmax-weighted mean bin index is mapped to degrees as
        ``index * 4 - 180`` and then converted to radians.
        """
        probabilities = self._gaze_softmax(logits)
        # Broadcasting multiplies every row by the same index vector, so no
        # per-sample copy of the index tensor is needed.
        expected_bin = torch.sum(probabilities * self._gaze_idx_tensor, dim=1)
        return (expected_bin * 4 - 180) * np.pi / 180

    def forward(self, x):
        """Run the parent model and decode its outputs into angles.

        Args:
            x: Input batch accepted by the parent ``forward``.

        Returns:
            Tuple of (yaw, pitch) tensors in radians, one value per sample.
        """
        gaze_yaw, gaze_pitch = super().forward(x)
        yaw_radian = self._bins_to_radians(gaze_yaw)
        pitch_radian = self._bins_to_radians(gaze_pitch)
        return yaw_radian, pitch_radian

    def load_L2CS_model(
        self,
        file_path=f"{MODEL_CACHE_DIR}/gaze/L2CS/L2CSNet_gaze360_resnet50_90bins.pkl",
    ):
        """Load trained weights from ``file_path`` and move the model to ``self.device``.

        Args:
            file_path: Path of the saved state dict.
        """
        # NOTE(security): torch.load unpickles the file; only load weight
        # files from a trusted source.
        state_dict = torch.load(file_path, map_location=self.device)
        self.load_state_dict(state_dict)
        self.to(self.device)

    def saveas_ONNX_model(
        self,
        file_path=f"{MODEL_CACHE_DIR}/gaze/L2CS/L2CSNet_gaze360_resnet50_90bins.onnx",
    ):
        """Export this wrapper (network plus angle decoding) as an ONNX model.

        The first axis of the input and of both outputs is declared dynamic
        under the name ``batch_size``.

        Args:
            file_path: Destination path of the ONNX file.
        """
        # Example input used for the export: one 3-channel 448x448 image.
        dummy_input = torch.randn(1, 3, 448, 448)
        output_names = ["output_yaw", "output_pitch"]
        dynamic_axes = {
            name: {0: "batch_size"} for name in ["input", *output_names]
        }
        torch.onnx.export(
            self,
            dummy_input,
            file_path,
            input_names=["input"],
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            verbose=False,
        )