import os
from typing import List, Tuple, Union

import cv2
import numpy as np
import torch
from PIL import Image

from blazeface import BlazeFace


class FaceExtractor:
    """Wrapper for face extraction workflow."""

    def __init__(self, video_read_fn=None, facedet: BlazeFace = None):
        """Creates a new FaceExtractor.

        Arguments:
            video_read_fn: a function that takes in a path to a video file
                and returns a tuple consisting of a NumPy array with shape
                (num_frames, H, W, 3) and a list of frame indices, or None
                in case of an error
            facedet: the face detector object
        """
        self.video_read_fn = video_read_fn
        self.facedet = facedet

    def process_image(self, path: str = None, img: Union[Image.Image, np.ndarray] = None) -> dict:
        """Processes a single image.

        :param path: path to the image file
        :param img: the image itself, as a PIL Image or a NumPy array
        :return: a dictionary with the frame size, the face crops, the keypoint
            crops, the detections (in frame coordinates) and the confidence scores
        """
        if img is not None and path is not None:
            raise ValueError('Only one argument between path and img can be specified')
        if img is None and path is None:
            raise ValueError('At least one argument between path and img must be specified')

        target_size = self.facedet.input_size

        if img is None:
            img = np.asarray(Image.open(str(path)))
        else:
            img = np.asarray(img)

        # Split the frame into several tiles. Resize the tiles to 128x128.
        tiles, resize_info = self._tile_frames(np.expand_dims(img, 0), target_size)
        # tiles has shape (num_tiles, target_size, target_size, 3)
        # resize_info is a list of four elements [scale_w, scale_h, offset_x, offset_y]

        # Run the face detector. The result is a list of PyTorch tensors,
        # one for each tile in the batch.
        detections = self.facedet.predict_on_batch(tiles, apply_nms=False)

        # Convert the detections from 128x128 back to the original frame size.
        detections = self._resize_detections(detections, target_size, resize_info)

        # Because we have several tiles for each frame, combine the predictions
        # from these tiles. The result is a list of PyTorch tensors, but now one
        # for each frame (rather than each tile).
        num_frames = 1
        frame_size = (img.shape[1], img.shape[0])
        detections = self._untile_detections(num_frames, frame_size, detections)

        # The same face may have been detected in multiple tiles, so filter out
        # overlapping detections. This is done separately for each frame.
        detections = self.facedet.nms(detections)

        # Crop the faces out of the original frame.
        frameref_detections = self._add_margin_to_detections(detections[0], frame_size, 0.2)
        faces = self._crop_faces(img, frameref_detections)
        kpts = self._crop_kpts(img, detections[0], 0.3)

        # Add additional information about the frame and detections.
        scores = list(detections[0][:, 16].cpu().numpy())
        frame_dict = {"frame_w": frame_size[0],
                      "frame_h": frame_size[1],
                      "faces": faces,
                      "kpts": kpts,
                      "detections": frameref_detections.cpu().numpy(),
                      "scores": scores,
                      }

        # Sort faces by descending confidence
        frame_dict = self._soft_faces_by_descending_score(frame_dict)

        return frame_dict

    def _soft_faces_by_descending_score(self, frame_dict: dict) -> dict:
        if len(frame_dict['scores']) > 1:
            sort_idxs = np.argsort(frame_dict['scores'])[::-1]
            new_faces = [frame_dict['faces'][i] for i in sort_idxs]
            new_kpts = [frame_dict['kpts'][i] for i in sort_idxs]
            new_detections = frame_dict['detections'][sort_idxs]
            new_scores = [frame_dict['scores'][i] for i in sort_idxs]
            frame_dict['faces'] = new_faces
            frame_dict['kpts'] = new_kpts
            frame_dict['detections'] = new_detections
            frame_dict['scores'] = new_scores
        return frame_dict

    def process_videos(self, input_dir, filenames, video_idxs) -> List[dict]:
        """For the specified selection of videos, grabs one or more frames
        from each video, runs the face detector, and tries to find the faces
        in each frame.

        The frames are split into tiles, and the tiles from the different videos
        are concatenated into a single batch. This means the face detector gets
        a batch of size len(video_idxs) * num_frames * num_tiles (usually 3).

        Arguments:
            input_dir: base folder where the video files are stored
            filenames: list of all video files in the input_dir
            video_idxs: one or more indices from the filenames list; these
                are the videos we'll actually process

        Returns a list of dictionaries, one for each frame read from each video.
        This dictionary contains:
            - video_idx: the video this frame was taken from
            - frame_idx: the index of the frame in the video
            - frame_w, frame_h: original dimensions of the frame
            - frame: the full frame as a NumPy array
            - faces: a list containing zero or more NumPy arrays with a face crop
            - kpts: a list with the keypoint crops for each face
            - detections: the face detections, in frame coordinates
            - scores: a list with the confidence score for each face crop

        If reading a video failed for some reason, it will not appear in the
        output array. Note that there's no guarantee a given video will actually
        have num_frames results (as soon as a reading problem is encountered for
        a video, we continue with the next video).
        """
        target_size = self.facedet.input_size

        videos_read = []
        frames_read = []
        frames = []
        tiles = []
        resize_info = []

        for video_idx in video_idxs:
            # Read the full-size frames from this video.
            filename = filenames[video_idx]
            video_path = os.path.join(input_dir, filename)
            result = self.video_read_fn(video_path)

            # Error? Then skip this video.
            if result is None:
                continue

            videos_read.append(video_idx)

            # Keep track of the original frames (need them later).
            my_frames, my_idxs = result
            frames.append(my_frames)
            frames_read.append(my_idxs)

            # Split the frames into several tiles. Resize the tiles to 128x128.
            my_tiles, my_resize_info = self._tile_frames(my_frames, target_size)
            tiles.append(my_tiles)
            resize_info.append(my_resize_info)

        if len(tiles) == 0:
            return []

        # Put all the tiles for all the frames from all the videos into
        # a single batch.
        batch = np.concatenate(tiles)

        # Run the face detector. The result is a list of PyTorch tensors,
        # one for each image in the batch.
        all_detections = self.facedet.predict_on_batch(batch, apply_nms=False)

        result = []
        offs = 0
        for v in range(len(tiles)):
            # Not all videos may have the same number of tiles, so find which
            # detections go with which video.
            num_tiles = tiles[v].shape[0]
            detections = all_detections[offs:offs + num_tiles]
            offs += num_tiles

            # Convert the detections from 128x128 back to the original frame size.
            detections = self._resize_detections(detections, target_size, resize_info[v])

            # Because we have several tiles for each frame, combine the predictions
            # from these tiles. The result is a list of PyTorch tensors, but now one
            # for each frame (rather than each tile).
            num_frames = frames[v].shape[0]
            frame_size = (frames[v].shape[2], frames[v].shape[1])
            detections = self._untile_detections(num_frames, frame_size, detections)

            # The same face may have been detected in multiple tiles, so filter out
            # overlapping detections. This is done separately for each frame.
            detections = self.facedet.nms(detections)

            for i in range(len(detections)):
                # Crop the faces out of the original frame.
                frameref_detections = self._add_margin_to_detections(detections[i], frame_size, 0.2)
                faces = self._crop_faces(frames[v][i], frameref_detections)
                kpts = self._crop_kpts(frames[v][i], detections[i], 0.3)

                # Add additional information about the frame and detections.
                scores = list(detections[i][:, 16].cpu().numpy())
                frame_dict = {"video_idx": videos_read[v],
                              "frame_idx": frames_read[v][i],
                              "frame_w": frame_size[0],
                              "frame_h": frame_size[1],
                              "frame": frames[v][i],
                              "faces": faces,
                              "kpts": kpts,
                              "detections": frameref_detections.cpu().numpy(),
                              "scores": scores,
                              }

                # Sort faces by descending confidence
                frame_dict = self._soft_faces_by_descending_score(frame_dict)

                result.append(frame_dict)

        return result

    def process_video(self, video_path):
        """Convenience method for doing face extraction on a single video."""
        input_dir = os.path.dirname(video_path)
        filenames = [os.path.basename(video_path)]
        return self.process_videos(input_dir, filenames, [0])

    def _tile_frames(self, frames: np.ndarray, target_size: Tuple[int, int]) -> Tuple[np.ndarray, List[float]]:
        """Splits each frame into several smaller, partially overlapping tiles
        and resizes each tile to target_size.

        After a bunch of experimentation, I found that for a 1920x1080 video,
        BlazeFace works better on three 1080x1080 windows. These overlap by 420
        pixels. (Two windows also work but it's best to have a clean center crop
        in there as well.)

        I also tried 6 windows of size 720x720 (horizontally: 720|360, 360|720;
        vertically: 720|1200, 480|720|480, 1200|720) but that gives many false
        positives when a window has no face in it.

        For a video in portrait orientation (1080x1920), we only take a single
        crop of the top-most 1080 pixels. If we split up the video vertically,
        then we might get false positives again.

        (NOTE: Not all videos are necessarily 1080p but the code can handle this.)

        Arguments:
            frames: NumPy array of shape (num_frames, height, width, 3)
            target_size: (width, height)

        Returns:
            - a new (num_frames * N, target_size[1], target_size[0], 3) array
              where N is the number of tiles used.
            - a list [scale_w, scale_h, offset_x, offset_y] that describes how
              to map the resized and cropped tiles back to the original image
              coordinates. This is needed for scaling up the face detections
              from the smaller image to the original image, so we can take the
              face crops in the original coordinate space.
        """
        num_frames, H, W, _ = frames.shape
        num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)

        splits = np.zeros((num_frames * num_v * num_h, target_size[1], target_size[0], 3), dtype=np.uint8)
        i = 0
        for f in range(num_frames):
            y = 0
            for v in range(num_v):
                x = 0
                for h in range(num_h):
                    crop = frames[f, y:y + split_size, x:x + split_size, :]
                    splits[i] = cv2.resize(crop, target_size, interpolation=cv2.INTER_AREA)
                    x += x_step
                    i += 1
                y += y_step

        resize_info = [split_size / target_size[0], split_size / target_size[1], 0, 0]
        return splits, resize_info

    def get_tiles_params(self, H, W):
        """Computes the tiling grid for a frame of height H and width W: the
        number of horizontal and vertical tiles, the tile size, and the step
        between consecutive tiles along each axis."""
        split_size = min(H, W, 720)
        x_step = (W - split_size) // 2
        y_step = (H - split_size) // 2
        num_v = (H - split_size) // y_step + 1 if y_step > 0 else 1
        num_h = (W - split_size) // x_step + 1 if x_step > 0 else 1
        return num_h, num_v, split_size, x_step, y_step
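
    # Worked example of the tiling parameters above (values derived from the
    # formulas in get_tiles_params, shown here only for illustration): for a
    # 1920x1080 landscape frame, split_size = min(1080, 1920, 720) = 720,
    # x_step = (1920 - 720) // 2 = 600 and y_step = (1080 - 720) // 2 = 180,
    # so num_h = num_v = 3 and _tile_frames produces nine overlapping 720x720
    # tiles per frame, each resized to the detector's 128x128 input size.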

    def _resize_detections(self, detections, target_size, resize_info):
        """Converts a list of face detections back to the original
        coordinate system.

        Arguments:
            detections: a list containing PyTorch tensors of shape (num_faces, 17)
            target_size: (width, height)
            resize_info: [scale_w, scale_h, offset_x, offset_y]
        """
        projected = []
        target_w, target_h = target_size
        scale_w, scale_h, offset_x, offset_y = resize_info

        for i in range(len(detections)):
            detection = detections[i].clone()

            # ymin, xmin, ymax, xmax
            for k in range(2):
                detection[:, k * 2] = (detection[:, k * 2] * target_h - offset_y) * scale_h
                detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_w - offset_x) * scale_w

            # keypoints are x,y
            for k in range(2, 8):
                detection[:, k * 2] = (detection[:, k * 2] * target_w - offset_x) * scale_w
                detection[:, k * 2 + 1] = (detection[:, k * 2 + 1] * target_h - offset_y) * scale_h

            projected.append(detection)

        return projected
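
    # Numerical sketch of the mapping above (assuming, as the multiplication by
    # target_w/target_h implies, that the detector outputs coordinates
    # normalized to [0, 1]): with a 128x128 tile cut from a 720x720 crop,
    # scale_h = 720 / 128 = 5.625, so a normalized ymin of 0.25 becomes
    # 0.25 * 128 * 5.625 = 180 pixels in the original crop's coordinates.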

    def _untile_detections(self, num_frames: int, frame_size: Tuple[int, int],
                           detections: List[torch.Tensor]) -> List[torch.Tensor]:
        """With N tiles per frame, there also are N times as many detections.
        This function groups together the detections for a given frame; it is
        the complement to _tile_frames().
        """
        combined_detections = []

        W, H = frame_size
        num_h, num_v, split_size, x_step, y_step = self.get_tiles_params(H, W)

        i = 0
        for f in range(num_frames):
            detections_for_frame = []
            y = 0
            for v in range(num_v):
                x = 0
                for h in range(num_h):
                    # Adjust the coordinates based on the split positions.
                    detection = detections[i].clone()
                    if detection.shape[0] > 0:
                        for k in range(2):
                            detection[:, k * 2] += y
                            detection[:, k * 2 + 1] += x
                        for k in range(2, 8):
                            detection[:, k * 2] += x
                            detection[:, k * 2 + 1] += y

                    detections_for_frame.append(detection)
                    x += x_step
                    i += 1
                y += y_step

            combined_detections.append(torch.cat(detections_for_frame))

        return combined_detections

    def _add_margin_to_detections(self, detections: torch.Tensor, frame_size: Tuple[int, int],
                                  margin: float = 0.2) -> torch.Tensor:
        """Expands the face bounding box.

        NOTE: The face detections often do not include the forehead, which
        is why we use twice the margin for ymin.

        Arguments:
            detections: a PyTorch tensor of shape (num_detections, 17)
            frame_size: maximum (width, height)
            margin: a percentage of the bounding box's height

        Returns a PyTorch tensor of shape (num_detections, 17).
        """
        offset = torch.round(margin * (detections[:, 2] - detections[:, 0]))
        detections = detections.clone()
        detections[:, 0] = torch.clamp(detections[:, 0] - offset * 2, min=0)  # ymin
        detections[:, 1] = torch.clamp(detections[:, 1] - offset, min=0)  # xmin
        detections[:, 2] = torch.clamp(detections[:, 2] + offset, max=frame_size[1])  # ymax
        detections[:, 3] = torch.clamp(detections[:, 3] + offset, max=frame_size[0])  # xmax
        return detections
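
    # Margin arithmetic sketch (illustrative values only): for a detection with
    # ymin=100, xmin=200, ymax=300, xmax=400 and margin=0.2, the box height is
    # 200, so offset = round(0.2 * 200) = 40; the expanded box becomes
    # ymin=100-80=20, xmin=200-40=160, ymax=300+40=340, xmax=400+40=440,
    # with each coordinate then clamped to the frame boundaries.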

    def _crop_faces(self, frame: np.ndarray, detections: torch.Tensor) -> List[np.ndarray]:
        """Copies the face region(s) from the given frame into a set
        of new NumPy arrays.

        Arguments:
            frame: a NumPy array of shape (H, W, 3)
            detections: a PyTorch tensor of shape (num_detections, 17)

        Returns a list of NumPy arrays, one for each face crop. If there
        are no faces detected for this frame, returns an empty list.
        """
        faces = []
        for i in range(len(detections)):
            ymin, xmin, ymax, xmax = detections[i, :4].cpu().numpy().astype(np.int64)
            face = frame[ymin:ymax, xmin:xmax, :]
            faces.append(face)
        return faces

    def _crop_kpts(self, frame: np.ndarray, detections: torch.Tensor, face_fraction: float):
        """Copies the keypoint region(s) from the given frame into a set
        of new NumPy arrays.

        Arguments:
            frame: a NumPy array of shape (H, W, 3)
            detections: a PyTorch tensor of shape (num_detections, 17)
            face_fraction: float between 0 and 1 indicating the size of the
                extracted keypoint crops relative to the whole face

        Returns a list with one entry per detected face; each entry is a list
        of six NumPy arrays, one per facial keypoint. If there are no faces
        detected for this frame, returns an empty list.
        """
        faces = []
        for i in range(len(detections)):
            kpts = []
            size = int(face_fraction * min(detections[i, 2] - detections[i, 0], detections[i, 3] - detections[i, 1]))
            kpts_coords = detections[i, 4:16].cpu().numpy().astype(np.int64)
            for kpidx in range(6):
                kpx, kpy = kpts_coords[kpidx * 2:kpidx * 2 + 2]
                kpt = frame[kpy - size // 2:kpy - size // 2 + size, kpx - size // 2:kpx - size // 2 + size]
                kpts.append(kpt)
            faces.append(kpts)
        return faces

    def remove_large_crops(self, crops, pct=0.1):
        """Removes faces from the results if they take up more than X%
        of the video. Such a face is likely a false positive.

        This is an optional postprocessing step. Modifies the original
        data structure.

        Arguments:
            crops: a list of dictionaries with face crop data
            pct: maximum portion of the frame a crop may take up
        """
        for i in range(len(crops)):
            frame_data = crops[i]
            video_area = frame_data["frame_w"] * frame_data["frame_h"]
            faces = frame_data["faces"]
            scores = frame_data["scores"]
            new_faces = []
            new_scores = []
            for j in range(len(faces)):
                face = faces[j]
                face_H, face_W, _ = face.shape
                face_area = face_H * face_W
                if face_area / video_area < pct:
                    new_faces.append(face)
                    new_scores.append(scores[j])
            frame_data["faces"] = new_faces
            frame_data["scores"] = new_scores

    def keep_only_best_face(self, crops):
        """For each frame, only keeps the face with the highest confidence.

        This gets rid of false positives, but obviously is problematic for
        videos with two people!

        This is an optional postprocessing step. Modifies the original
        data structure.
        """
        for i in range(len(crops)):
            frame_data = crops[i]
            if len(frame_data["faces"]) > 0:
                frame_data["faces"] = frame_data["faces"][:1]
                frame_data["scores"] = frame_data["scores"][:1]

    # TODO: def filter_likely_false_positives(self, crops):
    #   if only some frames have more than 1 face, it's likely a false positive
    #   if most frames have more than 1 face, it's probably two people
    #   so find the % of frames with > 1 face; if > 0.X, keep the two best faces

    # TODO: def filter_by_score(self, crops, min_score) to remove any
    #   crops with a confidence score lower than min_score

    # TODO: def sort_by_histogram(self, crops) for videos with 2 people.
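

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the class above). It assumes the BlazeFace
# weights and anchors from the BlazeFace-PyTorch reference implementation
# ("blazeface.pth", "anchors.npy") plus a test image "sample.jpg" and a test
# video "sample.mp4" are available locally; adjust these paths for your setup.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the face detector (file paths are placeholders).
    facedet = BlazeFace().to(device)
    facedet.load_weights("blazeface.pth")
    facedet.load_anchors("anchors.npy")

    # A hypothetical frame reader built on OpenCV, matching the signature that
    # __init__ documents: returns (frames array, frame indices), or None on error.
    def read_first_frames(video_path, num_frames=4):
        cap = cv2.VideoCapture(video_path)
        frames, idxs = [], []
        for idx in range(num_frames):
            ok, frame = cap.read()
            if not ok:
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            idxs.append(idx)
        cap.release()
        if len(frames) == 0:
            return None
        return np.stack(frames), idxs

    extractor = FaceExtractor(video_read_fn=read_first_frames, facedet=facedet)

    # Single-image extraction: faces come back sorted by descending confidence.
    image_result = extractor.process_image(path="sample.jpg")
    if len(image_result["faces"]) > 0:
        print("Best face score:", image_result["scores"][0],
              "crop shape:", image_result["faces"][0].shape)

    # Video extraction: one dictionary per frame that was read successfully.
    video_results = extractor.process_video("sample.mp4")
    extractor.keep_only_best_face(video_results)
    print("Frames with detections:",
          sum(1 for r in video_results if len(r["faces"]) > 0))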