# ViTGaze / data / data_utils.py
# Provenance: initial commit f9561b9 by yhsong (verified).
from typing import Tuple
import torch
from torchvision import transforms
import numpy as np
import pandas as pd
def to_numpy(tensor: torch.Tensor):
    """Return *tensor* as a numpy array.

    Torch tensors are detached and moved to CPU first; numpy inputs are
    passed through unchanged.

    Raises:
        ValueError: if the input is neither a torch tensor nor a numpy type.
    """
    if torch.is_tensor(tensor):
        return tensor.cpu().detach().numpy()
    if type(tensor).__module__ == "numpy":
        return tensor
    raise ValueError("Cannot convert {} to numpy array".format(type(tensor)))
def to_torch(ndarray: np.ndarray):
    """Return *ndarray* as a torch tensor.

    Numpy arrays are wrapped with ``torch.from_numpy`` (shares memory);
    torch tensors are passed through unchanged.

    Raises:
        ValueError: if the input is neither a numpy array nor a torch tensor.
    """
    if type(ndarray).__module__ == "numpy":
        return torch.from_numpy(ndarray)
    if torch.is_tensor(ndarray):
        return ndarray
    raise ValueError("Cannot convert {} to torch tensor".format(type(ndarray)))
def get_head_box_channel(
    x_min, y_min, x_max, y_max, width, height, resolution, coordconv=False
):
    """Build a (resolution x resolution) head-location channel.

    The head box (in image coordinates) is rescaled to the output grid and
    clipped; if an interval collapses to zero width it is expanded to cover
    at least one cell (toward whichever side the fractional coordinate
    favours). With ``coordconv=False`` the channel is a binary mask (1
    inside the box); with ``coordconv=True`` it is a normalized (i+j) ramp
    zeroed inside the box.

    Returns:
        torch.Tensor: float32 tensor of shape (resolution, resolution).
    """
    scaled = (
        np.array([x_min / width, y_min / height, x_max / width, y_max / height])
        * resolution
    )
    box = np.clip(scaled.astype(int), 0, resolution - 1)

    # Expand any zero-width interval ((x_lo, x_hi) then (y_lo, y_hi)) so the
    # box always covers at least one grid cell.
    for lo, hi in ((0, 2), (1, 3)):
        if box[lo] != box[hi]:
            continue
        if box[lo] == 0:
            box[hi] = 1
        elif box[hi] == resolution - 1:
            box[lo] = resolution - 2
        elif abs(scaled[hi] - box[hi]) > abs(scaled[lo] - box[lo]):
            # The upper edge lost more to truncation: grow upward.
            box[hi] += 1
        else:
            box[lo] -= 1

    if coordconv:
        # channel[i, j] = (i + j) / max(i + j); zero inside the head box.
        ramp = np.arange(resolution, dtype=np.float32)
        channel = (ramp[:, None] + ramp[None, :]) / float(2 * (resolution - 1))
        channel[box[1] : box[3], box[0] : box[2]] = 0
    else:
        channel = np.zeros((resolution, resolution), dtype=np.float32)
        channel[box[1] : box[3], box[0] : box[2]] = 1
    return torch.from_numpy(channel)
def draw_labelmap(img, pt, sigma, type="Gaussian"):
    """Additively draw a 2D Gaussian (or Cauchy) peak centred at ``pt``.

    Adopted from
    https://github.com/anewell/pose-hg-train/blob/master/src/pypose/draw.py

    Args:
        img: 2D heatmap (torch tensor or numpy array), indexed [y, x].
        pt: (x, y) centre of the peak in pixel coordinates.
        sigma: spread of the kernel; the drawn window spans 3*sigma per side.
        type: "Gaussian" or "Cauchy".

    Returns:
        torch.Tensor: the heatmap with the peak added, or unchanged when the
        kernel window falls entirely outside the image.

    Raises:
        ValueError: if ``type`` is not "Gaussian" or "Cauchy".
    """
    img = to_numpy(img)
    # Kernel window: (size x size), centred on pt.
    size = int(6 * sigma + 1)
    ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)]
    br = [ul[0] + size, ul[1] + size]
    # If no part of the window overlaps the image, return the image as is.
    if ul[0] >= img.shape[1] or ul[1] >= img.shape[0] or br[0] < 0 or br[1] < 0:
        return to_torch(img)
    # Generate the kernel; it is not normalized, the centre value equals 1.
    x = np.arange(0, size, 1, float)
    y = x[:, np.newaxis]
    x0 = y0 = size // 2
    if type == "Gaussian":
        g = np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma**2))
    elif type == "Cauchy":
        g = sigma / (((x - x0) ** 2 + (y - y0) ** 2 + sigma**2) ** 1.5)
    else:
        # Previously an unrecognized type left `g` unbound and crashed below
        # with NameError; fail fast with a meaningful error instead.
        raise ValueError("Unknown label type: {}".format(type))
    # Usable kernel range (clipped to the image bounds).
    g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0]
    g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1]
    # Corresponding image range.
    img_x = max(0, ul[0]), min(br[0], img.shape[1])
    img_y = max(0, ul[1]), min(br[1], img.shape[0])
    img[img_y[0] : img_y[1], img_x[0] : img_x[1]] += g[g_y[0] : g_y[1], g_x[0] : g_x[1]]
    return to_torch(img)
def draw_labelmap_no_quant(img, pt, sigma, type="Gaussian"):
    """Additively draw a peak at ``pt`` without quantizing the centre.

    Unlike :func:`draw_labelmap`, the kernel is evaluated over the whole
    image at the (possibly fractional) centre, then truncated to zero where
    the squared distance exceeds ``10 * sigma**2``.

    Args:
        img: 2D heatmap (torch tensor or numpy array), indexed [y, x];
            note the numpy view may share memory with a CPU torch input,
            so the addition happens in place.
        pt: (x, y) centre of the peak.
        sigma: spread of the kernel.
        type: "Gaussian" or "Cauchy".

    Returns:
        torch.Tensor: the heatmap with the peak added.

    Raises:
        ValueError: if ``type`` is not "Gaussian" or "Cauchy".
    """
    img = to_numpy(img)
    shape = img.shape
    x = np.arange(shape[0])
    y = np.arange(shape[1])
    # indexing="ij": xx varies along rows, yy along columns.
    xx, yy = np.meshgrid(x, y, indexing="ij")
    dist_matrix = (yy - float(pt[0])) ** 2 + (xx - float(pt[1])) ** 2
    if type == "Gaussian":
        g = np.exp(-dist_matrix / (2 * sigma**2))
    elif type == "Cauchy":
        g = sigma / ((dist_matrix + sigma**2) ** 1.5)
    else:
        # Previously an unrecognized type left `g` unbound and crashed below
        # with NameError; fail fast with a meaningful error instead.
        raise ValueError("Unknown label type: {}".format(type))
    # Hard cutoff outside ~3.16 sigma to keep the label local.
    g[dist_matrix > 10 * sigma**2] = 0
    img += g
    return to_torch(img)
def multi_hot_targets(gaze_pts, out_res):
    """Rasterize normalized gaze points into a binary (h, w) target map.

    Each point is given as (x, y) in [0, 1]; points with a negative x are
    treated as invalid and skipped. Coordinates that land exactly on the
    far edge are clamped to the last cell.
    """
    out_w, out_h = out_res
    target_map = np.zeros((out_h, out_w))
    for point in gaze_pts:
        if point[0] < 0:
            continue
        col = min(int(point[0] * float(out_w)), out_w - 1)
        row = min(int(point[1] * float(out_h)), out_h - 1)
        target_map[row, col] = 1
    return target_map
def get_cone(tgt, src, wh, theta=150):
    """Build a soft gaze-cone map from eye position toward a gaze target.

    ``src`` and ``tgt`` are normalized (x, y) coordinates, scaled up by
    ``wh`` to pixel space. Each pixel gets the cosine similarity between
    the eye->pixel and eye->gaze directions; pixels whose angular offset
    is at least ``theta / 2`` degrees (or with negative similarity) are
    zeroed.

    Returns:
        torch.Tensor: float tensor of shape (1, h, w).
    """
    eye_px = src * wh
    gaze_px = tgt * wh
    # Pixel grid with (x, y) in the last axis, shape (h, w, 2).
    grid = np.stack(
        np.meshgrid(np.arange(wh[0]), np.arange(wh[1])),
        -1,
    )
    to_pixel = grid - eye_px
    to_gaze = gaze_px - eye_px
    gaze_len = np.sqrt(np.sum(to_gaze**2))
    pixel_len = np.sqrt(np.sum(to_pixel**2, axis=-1))
    cos_sim = np.sum(to_pixel * to_gaze, axis=-1) / (gaze_len * pixel_len)
    # The eye pixel itself is 0/0 -> nan; treat it as perfectly aligned.
    cos_sim = np.nan_to_num(cos_sim, nan=1)
    half_angle = (theta * (np.pi / 180)) / 2
    angles = np.arccos(cos_sim)
    inside = angles < half_angle
    # Zero out values outside the cone, then drop any negative similarity.
    cos_sim[~inside] = 0
    cos_sim = np.clip(cos_sim, 0, None)
    return torch.from_numpy(cos_sim).unsqueeze(0).float()
def get_transform(
    input_resolution: int, mean: Tuple[int, int, int], std: Tuple[int, int, int]
):
    """Standard image preprocessing: resize -> to tensor -> normalize.

    Args:
        input_resolution: output side length (the image becomes square).
        mean: per-channel normalization mean.
        std: per-channel normalization std.

    Returns:
        A torchvision ``Compose`` pipeline.
    """
    steps = [
        transforms.Resize((input_resolution, input_resolution)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
    return transforms.Compose(steps)
def smooth_by_conv(window_size, df, col):
    """Smooth a DataFrame column with a centred moving (box) average.

    The signal is edge-padded with ``window_size // 2`` copies of its first
    and last values, so for odd window sizes the output has the same length
    as the input column.

    Args:
        window_size: width of the averaging window.
        df: source DataFrame; must be non-empty.
        col: name of the column to smooth.

    Returns:
        np.ndarray: the smoothed 1-D signal.
    """
    signal = df[col].to_numpy()
    pad = window_size // 2
    # NOTE: the previous pd.concat-based padding labelled the pads column 0
    # while the data kept column `col`, producing a 2-column NaN-filled
    # frame whose .squeeze() stayed 2-D and made np.convolve fail.
    # Pad directly in numpy instead.
    padded_track = np.concatenate(
        [np.repeat(signal[:1], pad), signal, np.repeat(signal[-1:], pad)]
    )
    kernel = np.ones(window_size) / window_size
    return np.convolve(padded_track, kernel, mode="valid")