import os

import cv2
import numpy as np
import torch
import torch.fft as fft
import torch.nn.functional as F
import wget

from midas.model_loader import load_model

# Tracks whether this is the first inference call (used for one-time log messages).
first_execution = True

thisdir = os.path.abspath(os.path.dirname(__file__))

class MiDas:
    def __init__(self, device, model_type) -> None:
        self.device = device
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        # Download the requested MiDaS v3.1 checkpoint on first use.
        model_weights = os.path.join(thisdir, '..', f"weights/{model_type}.pt")
        if not os.path.exists(model_weights):
            os.makedirs(os.path.dirname(model_weights), exist_ok=True)
            if '384' in model_type:
                wget.download('https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt', model_weights)
            elif '512' in model_type:
                wget.download('https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt', model_weights)
            else:
                assert False, 'Please select a supported depth estimation model.'
        print("Device: %s" % device)
        model, transform, net_w, net_h = load_model(
            device, model_weights, model_type, optimize=False, height=None, square=False
        )
        self.model = model
        self.transform = transform
        self.model_type = model_type
        self.net_w = net_w
        self.net_h = net_h

    def process(
        self, device, model, model_type, image, input_size, target_size, optimize, use_camera
    ):
        """
        Run the inference and interpolate.

        Args:
            device (torch.device): the torch device used
            model: the model used for inference
            model_type: the type of the model
            image: the image fed into the neural network
            input_size: the size (width, height) of the neural network input (for OpenVINO)
            target_size: the size (width, height) the neural network output is interpolated to
            optimize: optimize the model to half-floats on CUDA?
            use_camera: is the camera used?

        Returns:
            the prediction
        """
        global first_execution

        if "openvino" in model_type:
            if first_execution or not use_camera:
                # print(
                #     f" Input resized to {input_size[0]}x{input_size[1]} before entering the encoder"
                # )
                first_execution = False

            sample = [np.reshape(image, (1, 3, *input_size))]
            prediction = model(sample)[model.output(0)][0]
            prediction = cv2.resize(
                prediction, dsize=target_size, interpolation=cv2.INTER_CUBIC
            )
        else:
            sample = torch.from_numpy(image).to(device).unsqueeze(0)

            if optimize and device == torch.device("cuda"):
                if first_execution:
                    print(
                        " Optimization to half-floats activated. Use with caution, because models like Swin require\n"
                        " float precision to work properly and may yield non-finite depth values to some extent for\n"
                        " half-floats."
                    )
                sample = sample.to(memory_format=torch.channels_last)
                sample = sample.half()

            if first_execution or not use_camera:
                height, width = sample.shape[2:]
                print(f" Input resized to {width}x{height} before entering the encoder")
                first_execution = False

            prediction = model.forward(sample)
            prediction = (
                torch.nn.functional.interpolate(
                    prediction.unsqueeze(1),
                    size=target_size[::-1],
                    mode="bicubic",
                    align_corners=False,
                )
                .squeeze()
                .cpu()
                .numpy()
            )

        return prediction

    def prediction2depth(self, depth):
        # Normalize the raw prediction to an 8-bit depth map in [0, 255].
        bits = 1
        if not np.isfinite(depth).all():
            depth = np.nan_to_num(depth, nan=0.0, posinf=0.0, neginf=0.0)
            print("WARNING: Non-finite depth values present")
        depth_min = depth.min()
        depth_max = depth.max()
        max_val = (2 ** (8 * bits)) - 1
        if depth_max - depth_min > np.finfo("float").eps:
            out = max_val * (depth - depth_min) / (depth_max - depth_min)
        else:
            out = np.zeros(depth.shape, dtype=depth.dtype)
        # out = cv2.applyColorMap(np.uint8(out), cv2.COLORMAP_INFERNO)
        return out

    def calc_R(self, theta_z, theta_x, theta_y):
        # Build a rotation matrix from Euler angles given in degrees,
        # composed in the order R = Rz @ Rx @ Ry.
        theta_z, theta_x, theta_y = theta_z / 180 * np.pi, theta_x / 180 * np.pi, theta_y / 180 * np.pi
        Rz = np.array([[np.cos(theta_z), np.sin(theta_z), 0],
                       [-np.sin(theta_z), np.cos(theta_z), 0],
                       [0, 0, 1]])
        Rx = np.array([[1, 0, 0],
                       [0, np.cos(theta_x), np.sin(theta_x)],
                       [0, -np.sin(theta_x), np.cos(theta_x)]])
        Ry = np.array([[np.cos(theta_y), 0, np.sin(theta_y)],
                       [0, 1, 0],
                       [-np.sin(theta_y), 0, np.cos(theta_y)]])
        R = Rz @ Rx @ Ry
        return R

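    # Illustrative sanity check (an editorial note, not called anywhere in this
    # file): every matrix returned by calc_R is orthonormal, so
    #   R = self.calc_R(0, 0, -10)
    #   np.allclose(R @ R.T, np.eye(3))
    # should hold up to floating-point error.
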
    def render_new_view(self, img, depth, R, t, K):
        # Naive per-pixel forward warping; a slow reference implementation next to
        # the vectorized wrap_img / wrap_img_tensor variants below.
        h, w, _ = img.shape
        new_img = np.zeros_like(img)
        for y in range(h):
            for x in range(w):
                # Back-project the pixel to 3D using its depth
                Z = depth[y, x]
                X = (x - K[0, 2]) * Z / K[0, 0]
                Y = (y - K[1, 2]) * Z / K[1, 1]
                point3D = np.array([X, Y, Z, 1])
                # Transform into the new camera frame
                point3D_new = R @ point3D[:3] + t
                if point3D_new[2] <= 0:  # point is behind the camera
                    continue
                # Project into the new view
                u = int(K[0, 0] * point3D_new[0] / point3D_new[2] + K[0, 2])
                v = int(K[1, 1] * point3D_new[1] / point3D_new[2] + K[1, 2])
                if 0 <= u < w and 0 <= v < h:
                    new_img[v, u] = img[y, x]
        return new_img

    def wrap_img(self, img, depth_map, K, R, T, target_point=None):
        # Note: "wrap"/"wrapped" in these method names means image *warping*.
        h, w = img.shape[:2]
        # Generate a grid of pixel coordinates (x, y)
        x, y = np.meshgrid(np.arange(w), np.arange(h))
        ones = np.ones_like(x)
        # Flatten and stack to get homogeneous coordinates
        homogeneous_coordinates = np.stack((x.flatten(), y.flatten(), ones.flatten()), axis=1).T
        # Inverse intrinsic matrix
        K_inv = np.linalg.inv(K)
        # Inverse rotation and translation
        R_inv = R.T
        T_inv = -R_inv @ T
        # Back-project to 3D using the depth map
        world_coordinates = K_inv @ homogeneous_coordinates
        world_coordinates *= depth_map.flatten()
        # Apply the inverse transformation
        transformed_world_coordinates = R_inv @ world_coordinates + T_inv.reshape(-1, 1)
        # Project back to 2D, keeping only points in front of the camera
        valid = transformed_world_coordinates[2, :] > 0
        projected_2D = K @ transformed_world_coordinates
        projected_2D /= projected_2D[2, :]
        # Initialize map_x and map_y
        map_x = np.zeros((h, w), dtype=np.float32)
        map_y = np.zeros((h, w), dtype=np.float32)
        # Assign valid projection values to map_x and map_y
        map_x.flat[valid] = projected_2D[0, valid]
        map_y.flat[valid] = projected_2D[1, valid]
        # Perform the warping
        wrapped_img = cv2.remap(img, map_x, map_y, interpolation=cv2.INTER_LINEAR)
        if target_point is None:
            return wrapped_img
        else:
            target_point = (map_x[int(target_point[1]), int(target_point[0])],
                            map_y[int(target_point[1]), int(target_point[0])])
            # Note: the clamp below assumes 512x512 images.
            target_point = tuple(max(0, min(511, x)) for x in target_point)
            return wrapped_img, target_point

    def get_low_high_frequent_tensors(self, x, threshold=4):
        # Split a tensor into high- and low-frequency components with a centered FFT;
        # frequencies within `threshold` bins of the spectrum center count as low.
        dtype = x.dtype
        x = x.type(torch.float32)
        # FFT
        x_freq = fft.fftn(x, dim=(-2, -1))
        x_freq = fft.fftshift(x_freq, dim=(-2, -1))
        B, C, H, W = x_freq.shape
        mask = torch.ones((B, C, H, W)).to(x.device)
        crow, ccol = H // 2, W // 2
        mask[..., crow - threshold:crow + threshold, ccol - threshold:ccol + threshold] = 0  # low 0, high 1
        x_freq_high = x_freq * mask
        x_freq_low = x_freq * (1 - mask)
        # Invert each component back to image space. The returned x_freq_high and
        # x_freq_low stay in the centered (fftshift-ed) layout that
        # combine_low_and_high expects, so ifftshift goes into temporaries here.
        x_high = fft.ifftn(fft.ifftshift(x_freq_high, dim=(-2, -1)), dim=(-2, -1)).real.type(dtype)
        x_low = fft.ifftn(fft.ifftshift(x_freq_low, dim=(-2, -1)), dim=(-2, -1)).real.type(dtype)
        return x_high, x_low, x_freq_high, x_freq_low, mask

    def combine_low_and_high(self, freq_low, freq_high, mask):
        # Recombine centered frequency components; mask is 1 on high frequencies.
        freq = freq_high * mask + freq_low * (1 - mask)
        freq = fft.ifftshift(freq, dim=(-2, -1))
        x = fft.ifftn(freq, dim=(-2, -1)).real
        return x

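    # Illustrative round trip (an editorial note, not called anywhere in this
    # file): for a 4D float tensor x,
    #   _, _, f_high, f_low, m = self.get_low_high_frequent_tensors(x)
    #   torch.allclose(self.combine_low_and_high(f_low, f_high, m), x.float(), atol=1e-4)
    # recombining the two centered-frequency components reproduces x.
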
    def wrap_img_tensor_w_fft(self, img_tensor, depth_tensor,
                              theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2), threshold=4):
        # Warp only the low-frequency component of the image into the new view, then
        # splice the original (unwarped) high-frequency component back in via the FFT.
        _, img_tensor, high_freq, low_freq, fft_mask = self.get_low_high_frequent_tensors(img_tensor, threshold)
        intrinsic_matrix = np.array([[1000, 0, img_tensor.shape[-1] / 2],
                                     [0, 1000, img_tensor.shape[-2] / 2],
                                     [0, 0, 1]])  # Example intrinsic matrix
        ori_size = None
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            img_tensor_ori = img_tensor.clone()
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            # Resample the depth map to the image resolution and rescale the focal lengths.
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
            intrinsic_matrix[0, 0] /= scale
            intrinsic_matrix[1, 1] /= scale
        rotation_matrix = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        translation_vector = np.array(T)  # Translation of the new camera
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=torch.float16, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.inverse(torch.tensor(intrinsic_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(rotation_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ (xy_world - torch.tensor(translation_vector, dtype=torch.float16, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic_matrix, dtype=torch.float16, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Warp the low-frequency image
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(torch.float16)[None], align_corners=True, mode='bilinear', padding_mode='reflection')
        # Re-attach the unwarped high-frequency component
        wrapped_freq = fft.fftn(wrapped_img, dim=(-2, -1))
        wrapped_freq = fft.fftshift(wrapped_freq, dim=(-2, -1))
        wrapped_img = self.combine_low_and_high(wrapped_freq, high_freq, fft_mask)
        return wrapped_img

    def wrap_img_tensor_w_fft_ext(self, img_tensor, depth_tensor, K, R, T, threshold=4):
        # Same low/high-frequency warping as wrap_img_tensor_w_fft, but with
        # externally supplied intrinsics K, rotation R, and translation T.
        _, img_tensor, high_freq, low_freq, fft_mask = self.get_low_high_frequent_tensors(img_tensor, threshold)
        ori_size = None
        scale = 1.0
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
        # Rescale the intrinsics to the image resolution (K / scale returns a new
        # array, so the caller's K is left untouched).
        intrinsic = K / scale
        intrinsic[2, 2] = 1
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=img_tensor.dtype, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.tensor(np.linalg.inv(intrinsic)).to(img_tensor.dtype).to(img_tensor.device) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(R, dtype=torch.float32, device=img_tensor.device)).to(img_tensor.dtype) @ (xy_world - torch.tensor(T, dtype=img_tensor.dtype, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic, dtype=img_tensor.dtype, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=img_tensor.dtype, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=img_tensor.dtype, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Warp the low-frequency image
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(img_tensor.dtype)[None], align_corners=True, mode='bilinear', padding_mode='reflection')
        # Re-attach the unwarped high-frequency component
        wrapped_freq = fft.fftn(wrapped_img, dim=(-2, -1))
        wrapped_freq = fft.fftshift(wrapped_freq, dim=(-2, -1))
        wrapped_img = self.combine_low_and_high(wrapped_freq, high_freq, fft_mask)
        return wrapped_img

    def wrap_img_tensor_w_fft_matrix(self, img_tensor, depth_tensor,
                                     theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2), threshold=4):
        # Mirrors wrap_img_tensor_w_fft (the bodies are identical); kept as a
        # separate entry point.
        _, img_tensor, high_freq, low_freq, fft_mask = self.get_low_high_frequent_tensors(img_tensor, threshold)
        intrinsic_matrix = np.array([[1000, 0, img_tensor.shape[-1] / 2],
                                     [0, 1000, img_tensor.shape[-2] / 2],
                                     [0, 0, 1]])  # Example intrinsic matrix
        ori_size = None
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            img_tensor_ori = img_tensor.clone()
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
            intrinsic_matrix[0, 0] /= scale
            intrinsic_matrix[1, 1] /= scale
        rotation_matrix = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        translation_vector = np.array(T)  # Translation of the new camera
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=torch.float16, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.inverse(torch.tensor(intrinsic_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(rotation_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ (xy_world - torch.tensor(translation_vector, dtype=torch.float16, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic_matrix, dtype=torch.float16, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Warp the low-frequency image
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(torch.float16)[None], align_corners=True, mode='bilinear', padding_mode='reflection')
        # Re-attach the unwarped high-frequency component
        wrapped_freq = fft.fftn(wrapped_img, dim=(-2, -1))
        wrapped_freq = fft.fftshift(wrapped_freq, dim=(-2, -1))
        wrapped_img = self.combine_low_and_high(wrapped_freq, high_freq, fft_mask)
        return wrapped_img

    def wrap_img_tensor(self, img_tensor, depth_tensor,
                        theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2)):
        # Warp the full image tensor into the new view (no frequency split).
        intrinsic_matrix = np.array([[1000, 0, img_tensor.shape[-1] / 2],
                                     [0, 1000, img_tensor.shape[-2] / 2],
                                     [0, 0, 1]])  # Example intrinsic matrix
        ori_size = None
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            img_tensor_ori = img_tensor.clone()
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
            intrinsic_matrix[0, 0] /= scale
            intrinsic_matrix[1, 1] /= scale
        rotation_matrix = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        translation_vector = np.array(T)  # Translation of the new camera
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=torch.float16, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.inverse(torch.tensor(intrinsic_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(rotation_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ (xy_world - torch.tensor(translation_vector, dtype=torch.float16, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic_matrix, dtype=torch.float16, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Perform the warping
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(torch.float16)[None], align_corners=True, mode='bilinear')
        return wrapped_img

    def __call__(self, img_array, theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2)):
        # Estimate depth, then warp the input image into a new camera view.
        img_depth = self.transform({"image": img_array})["image"]
        # Compute the depth prediction
        prediction = self.process(
            self.device,
            self.model,
            self.model_type,
            img_depth,
            (self.net_w, self.net_h),
            img_array.shape[1::-1],
            optimize=False,
            use_camera=False,
        )
        depth = self.prediction2depth(prediction)
        # img = img_array.copy()
        # img = img / 2. + 0.5
        K = np.array([[1000, 0, img_array.shape[1] / 2],
                      [0, 1000, img_array.shape[0] / 2],
                      [0, 0, 1]])  # Example intrinsic matrix
        R = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        T = np.array(T)  # Translation of the new camera
        # new_img = self.render_new_view(img_array, depth, R, T, K)
        new_img = self.wrap_img(img_array, depth, K, R, T)
        # Pixels that received no source pixel stay black; build a validity mask.
        mask = np.all(new_img == [0, 0, 0], axis=2).astype(np.uint8) * 255
        mask = 255 - mask
        return new_img, mask, depth
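

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original pipeline. Assumptions:
    # 'dpt_beit_large_512' matches one of the checkpoints handled in __init__,
    # and 'input.jpg' / 'warped.png' are placeholder paths.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    midas = MiDas(device, "dpt_beit_large_512")

    # MiDaS transforms expect an RGB float image scaled to [0, 1].
    img = cv2.imread("input.jpg")
    img = (cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0).astype(np.float32)

    # Warp to a new view: yaw the camera by -10 degrees and translate it by T.
    new_img, mask, depth = midas(img, theta_z=0, theta_x=0, theta_y=-10, T=[0, 0, -2])
    cv2.imwrite("warped.png", cv2.cvtColor((new_img * 255).astype(np.uint8), cv2.COLOR_RGB2BGR))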