Spaces:

infocusp
/

PoseSync-Video-Matching-Tool

Runtime error

File size: 7,149 Bytes

import cv2
import numpy as np
import yolov5


class CropVideo:
    """Common interface for frame-by-frame video croppers.

    Concrete subclasses locate the subject with an object detection
    method (such as YOLO or cv2.Tracker) and crop each frame around it.

    Warning: This class is only an interface and should not be used
    directly. Use one of the derived classes instead.

      Parameters:
        method : name of the object detection method

    """

    def __init__(self, method=None):
        # Stored as-is; this base class never reads it.
        self.method = method

    def video_crop(self, video_frames):
        """Crop the given frames around the detected object.

        Must be overridden by subclasses.

        Args:
          video_frames: A list of numpy arrays representing the input images

        Returns:
          A numpy array containing cropped frames

        Raises:
          NotImplementedError: always, when called on this base class.
        """
        raise NotImplementedError


class YOLOCrop(CropVideo):
    """Class for cropping a video frame-by-frame using YOLO object detection.

    Every frame is run through the detector; the largest detected box is
    padded by ``BBOX_MARGIN`` pixels and a square crop of a common side
    length is cut out of each frame in which something was detected.

    Parameters :
        method : accepted for signature compatibility; the stored method
            name is always 'yolo'
        model_path : path to object detection model
            (defaults to 'models/yolo/yolov5x.pt')

    """

    # Margin, in pixels, added on every side of a detected bounding box.
    BBOX_MARGIN = 100

    def __init__(self, method=None, model_path=None):
        super().__init__('yolo')
        self.model_path = model_path or 'models/yolo/yolov5x.pt'
        self.load_model(self.model_path)

    def load_model(self, model_path):
        """Loads object detection model.

        Args:
          model_path: path to the YOLOv5 weights file
        """
        self.model = yolov5.load(model_path)
        # Keep only detections of class index 0 (presumably "person" for a
        # COCO-trained model -- confirm against the weights in use).
        self.model.classes = 0

    @staticmethod
    def _largest_bbox(boxes):
        """Returns the box with the largest area, or [] if there is none.

        Args:
          boxes: A [N, 4] integer array of [x_min, y_min, x_max, y_max] rows

        Returns:
          A list [x_min, y_min, x_max, y_max] of Python ints, or [] when
          `boxes` is empty
        """
        if len(boxes) == 0:
            return []
        boxes = np.asarray(boxes)
        # Plain width * height. (A two-point contour passed to
        # cv2.contourArea is degenerate and always has area 0, which made
        # every box tie.) int64 avoids overflow in the product.
        spans = boxes[:, 2:4].astype(np.int64) - boxes[:, 0:2]
        areas = spans[:, 0] * spans[:, 1]
        return boxes[int(np.argmax(areas))].tolist()

    def get_yolo_bbox(self, frame):
        """Runs YOLO object detection on an input image.

        Args:
          frame: A [height, width, 3] numpy array representing the input image

        Returns:
          A list containing bounding box parameters
          [x_min, y_min, x_max, y_max] of the largest detection, or an
          empty list when nothing was detected
        """
        results = self.model(frame)
        predictions = results.pred[0]

        # Bring the tensor to host memory first: .numpy() raises on a
        # tensor that lives on a GPU or still requires grad.
        boxes = predictions[:, :4].detach().cpu().numpy().astype(np.int32)
        return self._largest_bbox(boxes)

    def video_crop(self, video_frames):
        """Crops given list of frames by detecting object using YOLO

        Frames in which nothing is detected are left out of the result.

        Args:
          video_frames: A list of numpy arrays representing the input images

        Returns:
          A numpy array containing cropped frames; an empty array when no
          frame produced a detection
        """
        margin = self.BBOX_MARGIN

        # Keep each frame together with its crop origin so that skipped
        # frames cannot shift coordinates onto the wrong frame.
        detections = []
        widths = []
        heights = []
        for frame in video_frames:
            bbox = self.get_yolo_bbox(frame)
            if len(bbox) == 0:
                continue

            frame_height, frame_width = frame.shape[:2]
            x_start = int(max(bbox[0] - margin, 0))
            y_start = int(max(bbox[1] - margin, 0))
            x_end = int(min(bbox[2] + margin, frame_width))
            y_end = int(min(bbox[3] + margin, frame_height))

            detections.append((frame, x_start, y_start))
            widths.append(x_end - x_start)
            heights.append(y_end - y_start)

        if not detections:
            # np.percentile is meaningless on an empty sample.
            return np.array([])

        # One square side length for all crops; the 95th percentile keeps
        # a few oversized detections from inflating every crop.
        width = np.percentile(np.array(widths), 95)
        height = np.percentile(np.array(heights), 95)
        box_len = int(max(width, height))

        cropped_frames = []
        for frame, xs, ys in detections:
            frame_height, frame_width = frame.shape[:2]
            xe = xs + box_len
            ye = ys + box_len

            # Slide the window back inside the frame when it overflows.
            if ye > frame_height:
                ye = frame_height
                ys = max(0, ye - box_len)

            if xe > frame_width:
                xe = frame_width
                xs = max(0, xe - box_len)

            cropped_frames.append(np.array(frame[ys:ye, xs:xe, :]))

        return np.array(cropped_frames)


class TrackerCrop(YOLOCrop):
    """Class for cropping a video frame-by-frame using cv2.Tracker.

    YOLO detects the object once, on the first frame; a cv2 MIL tracker
    then follows that box through the frames, and a square crop is taken
    around the tracked box in every frame.

    Parameters :
        model_path : path to object detection model, forwarded to YOLOCrop

    """

    def __init__(self, model_path=None):
        # Forward model_path so a custom model is actually used.
        super().__init__(method='yolo', model_path=model_path)
        self.tracker = cv2.TrackerMIL.create()

    @staticmethod
    def expand_bbox(bbox, frame_shape):
        """Expands given bounding box by 50 pixels, in place.

        Args:
          bbox: A mutable list [x, y, width, height] of bounding box
                parameters of the object; modified in place
          frame_shape: (height, width, ...) of a frame

        """
        # NOTE(review): the origin moves by 50 and the size grows by 50, so
        # away from the frame border only the left/top edges move outwards.
        # Confirm whether symmetric growth (+100 on the size) was intended.
        bbox[0] = max(bbox[0] - 50, 0)
        bbox[1] = max(bbox[1] - 50, 0)
        # Width derives from the width (index 2), not the height.
        bbox[2] = min(bbox[2] + 50, frame_shape[1] - bbox[0] - 1)
        bbox[3] = min(bbox[3] + 50, frame_shape[0] - bbox[1] - 1)

    @staticmethod
    def pad_bbox(crop_frame, box_len):
        """Pads given cropped frame with zeros up to box_len x box_len.

        Padding is appended at the bottom and right only. A dimension that
        already reaches box_len is left unchanged.

        Args:
          crop_frame: A numpy array representing the cropped frame
          box_len: An integer value representing maximum out of width and height

        Returns:
          A numpy array containing cropped frame with padding
        """
        # Clamp at zero: np.pad rejects negative pad widths.
        pad_rows = max(box_len - crop_frame.shape[0], 0)
        pad_cols = max(box_len - crop_frame.shape[1], 0)
        if pad_rows or pad_cols:
            crop_frame = np.pad(
                crop_frame,
                pad_width=((0, pad_rows), (0, pad_cols), (0, 0)),
            )
        return crop_frame

    @staticmethod
    def clip_coordinates(x, y, box_len, frame_shape):
        """Clips the (x,y) top-left corner of a square crop to the frame.

        Args:
          x: x-coordinate of the top-left corner of bounding box
          y: y-coordinate of the top-left corner of bounding box
          box_len: An integer value representing maximum out of width and height
          frame_shape: (height, width, ...) of a frame

        Returns:
          (x,y) clipped coordinates, never negative
        """
        # Shift the corner back by the overflow so the square ends at the
        # frame border.
        if x + box_len > frame_shape[1]:
            x = frame_shape[1] - box_len
        if y + box_len > frame_shape[0]:
            y = frame_shape[0] - box_len

        # A negative start would make the later slice wrap around.
        return (max(0, x), max(0, y))

    def video_crop(self, video_frames):
        """Crops given list of frames by detecting object using cv2.Tracker

        Args:
          video_frames: A list of numpy arrays representing the input images

        Returns:
          A numpy array containing cropped frames; an empty array when
          there are no frames or nothing is detected in the first frame
        """
        if len(video_frames) == 0:
            return np.array([])

        first_frame = video_frames[0]
        detection = self.get_yolo_bbox(first_frame)
        if len(detection) == 0:
            # Nothing to initialise the tracker with.
            return np.array([])

        # YOLO yields [x_min, y_min, x_max, y_max]; the tracker and
        # expand_bbox work on [x, y, width, height]. Plain Python ints keep
        # the cv2 binding from rejecting numpy scalars.
        x_min, y_min, x_max, y_max = (int(v) for v in detection)
        bbox = [x_min, y_min, x_max - x_min, y_max - y_min]
        TrackerCrop.expand_bbox(bbox, first_frame.shape)
        last_bbox = tuple(bbox)
        self.tracker.init(first_frame, last_bbox)

        output_frame_list = []
        for frame in video_frames:
            tracked, bbox = self.tracker.update(frame)
            if tracked:
                last_bbox = bbox
            else:
                # Tracking lost on this frame: reuse the last good box.
                bbox = last_bbox

            x, y, w, h = (int(v) for v in bbox)
            box_len = max(w, h)
            x, y = TrackerCrop.clip_coordinates(x, y, box_len, frame.shape)
            crop_frame = np.array(frame[y:y + box_len, x:x + box_len, :])
            crop_frame = TrackerCrop.pad_bbox(crop_frame, box_len)
            output_frame_list.append(crop_frame)

        output_frame_array = np.array(output_frame_list)

        return output_frame_array