# ViTGaze / data / data_utils.py
# Provenance: initial commit f9561b9 by yhsong (verified).
from typing import Tuple
import torch
from torchvision import transforms
import numpy as np
import pandas as pd
def to_numpy(tensor: torch.Tensor):
    """Return *tensor* as a numpy array.

    Torch tensors are detached and moved to CPU first; numpy inputs are
    passed through unchanged.

    Raises:
        ValueError: if the input is neither a torch tensor nor a numpy type.
    """
    if torch.is_tensor(tensor):
        return tensor.cpu().detach().numpy()
    if type(tensor).__module__ == "numpy":
        return tensor
    raise ValueError("Cannot convert {} to numpy array".format(type(tensor)))
def to_torch(ndarray: np.ndarray):
    """Return *ndarray* as a torch tensor.

    Numpy arrays are wrapped with ``torch.from_numpy`` (shares memory);
    torch tensors are passed through unchanged.

    Raises:
        ValueError: if the input is neither a numpy array nor a torch tensor.
    """
    if type(ndarray).__module__ == "numpy":
        return torch.from_numpy(ndarray)
    if torch.is_tensor(ndarray):
        return ndarray
    raise ValueError("Cannot convert {} to torch tensor".format(type(ndarray)))
def get_head_box_channel(
    x_min, y_min, x_max, y_max, width, height, resolution, coordconv=False
):
    """Build a (resolution x resolution) head-location channel.

    The head box (in image coordinates) is rescaled to the output grid and
    clipped; if an interval collapses to zero width it is expanded to cover
    at least one cell (toward whichever side the fractional coordinate
    favours). With ``coordconv=False`` the channel is a binary mask (1
    inside the box); with ``coordconv=True`` it is a normalized (i+j) ramp
    zeroed inside the box.

    Returns:
        torch.Tensor: float32 tensor of shape (resolution, resolution).
    """
    scaled = (
        np.array([x_min / width, y_min / height, x_max / width, y_max / height])
        * resolution
    )
    box = np.clip(scaled.astype(int), 0, resolution - 1)

    # Expand any zero-width interval ((x_lo, x_hi) then (y_lo, y_hi)) so the
    # box always covers at least one grid cell.
    for lo, hi in ((0, 2), (1, 3)):
        if box[lo] != box[hi]:
            continue
        if box[lo] == 0:
            box[hi] = 1
        elif box[hi] == resolution - 1:
            box[lo] = resolution - 2
        elif abs(scaled[hi] - box[hi]) > abs(scaled[lo] - box[lo]):
            # The upper edge lost more to truncation: grow upward.
            box[hi] += 1
        else:
            box[lo] -= 1

    if coordconv:
        # channel[i, j] = (i + j) / max(i + j); zero inside the head box.
        ramp = np.arange(resolution, dtype=np.float32)
        channel = (ramp[:, None] + ramp[None, :]) / float(2 * (resolution - 1))
        channel[box[1] : box[3], box[0] : box[2]] = 0
    else:
        channel = np.zeros((resolution, resolution), dtype=np.float32)
        channel[box[1] : box[3], box[0] : box[2]] = 1
    return torch.from_numpy(channel)
def draw_labelmap(img, pt, sigma, type="Gaussian"):
    """Additively draw a 2D Gaussian (or Cauchy) peak centred at ``pt``.

    Adopted from
    https://github.com/anewell/pose-hg-train/blob/master/src/pypose/draw.py

    Args:
        img: 2D heatmap (torch tensor or numpy array), indexed [y, x].
        pt: (x, y) centre of the peak in pixel coordinates.
        sigma: spread of the kernel; the drawn window spans 3*sigma per side.
        type: "Gaussian" or "Cauchy".

    Returns:
        torch.Tensor: the heatmap with the peak added, or unchanged when the
        kernel window falls entirely outside the image.

    Raises:
        ValueError: if ``type`` is not "Gaussian" or "Cauchy".
    """
    img = to_numpy(img)
    # Kernel window: (size x size), centred on pt.
    size = int(6 * sigma + 1)
    ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)]
    br = [ul[0] + size, ul[1] + size]
    # If no part of the window overlaps the image, return the image as is.
    if ul[0] >= img.shape[1] or ul[1] >= img.shape[0] or br[0] < 0 or br[1] < 0:
        return to_torch(img)
    # Generate the kernel; it is not normalized, the centre value equals 1.
    x = np.arange(0, size, 1, float)
    y = x[:, np.newaxis]
    x0 = y0 = size // 2
    if type == "Gaussian":
        g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma**2))
    elif type == "Cauchy":
        g = sigma / (((x - x0) ** 2 + (y - y0) ** 2 + sigma**2) ** 1.5)
    else:
        # Previously an unrecognized type left `g` unbound and crashed below
        # with NameError; fail fast with a meaningful error instead.
        raise ValueError("Unknown label type: {}".format(type))
    # Usable kernel range (clipped to the image bounds).
    g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
    g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
    # Corresponding image range.
    img_x = max(0, ul[0]), min(br[0], img.shape[1])
    img_y = max(0, ul[1]), min(br[1], img.shape[0])
    img[img_y[0] : img_y[1], img_x[0] : img_x[1]] += g[g_y[0] : g_y[1], g_x[0] : g_x[1]]
    return to_torch(img)
def draw_labelmap_no_quant(img, pt, sigma, type="Gaussian"):
    """Additively draw a peak at ``pt`` without quantizing the centre.

    Unlike :func:`draw_labelmap`, the kernel is evaluated over the whole
    image at the (possibly fractional) centre, then truncated to zero where
    the squared distance exceeds ``10 * sigma**2``.

    Args:
        img: 2D heatmap (torch tensor or numpy array), indexed [y, x];
            note the numpy view may share memory with a CPU torch input,
            so the addition happens in place.
        pt: (x, y) centre of the peak.
        sigma: spread of the kernel.
        type: "Gaussian" or "Cauchy".

    Returns:
        torch.Tensor: the heatmap with the peak added.

    Raises:
        ValueError: if ``type`` is not "Gaussian" or "Cauchy".
    """
    img = to_numpy(img)
    shape = img.shape
    x = np.arange(shape[0])
    y = np.arange(shape[1])
    # indexing="ij": xx varies along rows, yy along columns.
    xx, yy = np.meshgrid(x, y, indexing="ij")
    dist_matrix = (yy - float(pt[0])) ** 2 + (xx - float(pt[1])) ** 2
    if type == "Gaussian":
        g = np.exp(-dist_matrix / (2 * sigma**2))
    elif type == "Cauchy":
        g = sigma / ((dist_matrix + sigma**2) ** 1.5)
    else:
        # Previously an unrecognized type left `g` unbound and crashed below
        # with NameError; fail fast with a meaningful error instead.
        raise ValueError("Unknown label type: {}".format(type))
    # Hard cutoff outside ~3.16 sigma to keep the label local.
    g[dist_matrix > 10 * sigma**2] = 0
    img += g
    return to_torch(img)
def multi_hot_targets(gaze_pts, out_res):
    """Rasterize normalized gaze points into a binary (h, w) target map.

    Each point is given as (x, y) in [0, 1]; points with a negative x are
    treated as invalid and skipped. Coordinates that land exactly on the
    far edge are clamped to the last cell.
    """
    out_w, out_h = out_res
    target_map = np.zeros((out_h, out_w))
    for point in gaze_pts:
        if point[0] < 0:
            continue
        col = min(int(point[0] * float(out_w)), out_w - 1)
        row = min(int(point[1] * float(out_h)), out_h - 1)
        target_map[row, col] = 1
    return target_map
def get_cone(tgt, src, wh, theta=150):
    """Build a soft gaze-cone map from eye position toward a gaze target.

    ``src`` and ``tgt`` are normalized (x, y) coordinates, scaled up by
    ``wh`` to pixel space. Each pixel gets the cosine similarity between
    the eye->pixel and eye->gaze directions; pixels whose angular offset
    is at least ``theta / 2`` degrees (or with negative similarity) are
    zeroed.

    Returns:
        torch.Tensor: float tensor of shape (1, h, w).
    """
    eye_px = src * wh
    gaze_px = tgt * wh
    # Pixel grid with (x, y) in the last axis, shape (h, w, 2).
    grid = np.stack(
        np.meshgrid(np.arange(wh[0]), np.arange(wh[1])),
        -1,
    )
    to_pixel = grid - eye_px
    to_gaze = gaze_px - eye_px
    gaze_len = np.sqrt(np.sum(to_gaze**2))
    pixel_len = np.sqrt(np.sum(to_pixel**2, axis=-1))
    cos_sim = np.sum(to_pixel * to_gaze, axis=-1) / (gaze_len * pixel_len)
    # The eye pixel itself is 0/0 -> nan; treat it as perfectly aligned.
    cos_sim = np.nan_to_num(cos_sim, nan=1)
    half_angle = (theta * (np.pi / 180)) / 2
    angles = np.arccos(cos_sim)
    inside = angles < half_angle
    # Zero out values outside the cone, then drop any negative similarity.
    cos_sim[~inside] = 0
    cos_sim = np.clip(cos_sim, 0, None)
    return torch.from_numpy(cos_sim).unsqueeze(0).float()
def get_transform(
    input_resolution: int, mean: Tuple[int, int, int], std: Tuple[int, int, int]
):
    """Standard image preprocessing: resize -> to tensor -> normalize.

    Args:
        input_resolution: output side length (the image becomes square).
        mean: per-channel normalization mean.
        std: per-channel normalization std.

    Returns:
        A torchvision ``Compose`` pipeline.
    """
    steps = [
        transforms.Resize((input_resolution, input_resolution)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
    return transforms.Compose(steps)
def smooth_by_conv(window_size, df, col):
    """Smooth a DataFrame column with a centred moving (box) average.

    The signal is edge-padded with ``window_size // 2`` copies of its first
    and last values, so for odd window sizes the output has the same length
    as the input column.

    Args:
        window_size: width of the averaging window.
        df: source DataFrame; must be non-empty.
        col: name of the column to smooth.

    Returns:
        np.ndarray: the smoothed 1-D signal.
    """
    signal = df[col].to_numpy()
    pad = window_size // 2
    # NOTE: the previous pd.concat-based padding labelled the pads column 0
    # while the data kept column `col`, producing a 2-column NaN-filled
    # frame whose .squeeze() stayed 2-D and made np.convolve fail.
    # Pad directly in numpy instead.
    padded_track = np.concatenate(
        [np.repeat(signal[:1], pad), signal, np.repeat(signal[-1:], pad)]
    )
    kernel = np.ones(window_size) / window_size
    return np.convolve(padded_track, kernel, mode="valid")