import os

import cv2
import numpy as np
import torch
import torch.fft as fft
import torch.nn.functional as F
import wget

from midas.model_loader import load_model

# Tracks whether this is the first inference call (used for one-time log messages).
first_execution = True

thisdir = os.path.abspath(os.path.dirname(__file__))

class MiDas:
    def __init__(self, device, model_type) -> None:
        self.device = device
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        # Download the requested MiDaS v3.1 checkpoint on first use.
        model_weights = os.path.join(thisdir, '..', f"weights/{model_type}.pt")
        if not os.path.exists(model_weights):
            os.makedirs(os.path.dirname(model_weights), exist_ok=True)
            if '384' in model_type:
                wget.download('https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt', model_weights)
            elif '512' in model_type:
                wget.download('https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt', model_weights)
            else:
                assert False, 'Please select a supported depth estimation model.'
        print("Device: %s" % device)
        model, transform, net_w, net_h = load_model(
            device, model_weights, model_type, optimize=False, height=None, square=False
        )
        self.model = model
        self.transform = transform
        self.model_type = model_type
        self.net_w = net_w
        self.net_h = net_h

    def process(
        self, device, model, model_type, image, input_size, target_size, optimize, use_camera
    ):
        """
        Run the inference and interpolate.

        Args:
            device (torch.device): the torch device used
            model: the model used for inference
            model_type: the type of the model
            image: the image fed into the neural network
            input_size: the size (width, height) of the neural network input (for OpenVINO)
            target_size: the size (width, height) the neural network output is interpolated to
            optimize: optimize the model to half-floats on CUDA?
            use_camera: is the camera used?

        Returns:
            the prediction
        """
        global first_execution

        if "openvino" in model_type:
            if first_execution or not use_camera:
                # print(
                #     f" Input resized to {input_size[0]}x{input_size[1]} before entering the encoder"
                # )
                first_execution = False

            sample = [np.reshape(image, (1, 3, *input_size))]
            prediction = model(sample)[model.output(0)][0]
            prediction = cv2.resize(
                prediction, dsize=target_size, interpolation=cv2.INTER_CUBIC
            )
        else:
            sample = torch.from_numpy(image).to(device).unsqueeze(0)

            if optimize and device == torch.device("cuda"):
                if first_execution:
                    print(
                        " Optimization to half-floats activated. Use with caution, because models like Swin require\n"
                        " float precision to work properly and may yield non-finite depth values to some extent for\n"
                        " half-floats."
                    )
                sample = sample.to(memory_format=torch.channels_last)
                sample = sample.half()

            if first_execution or not use_camera:
                height, width = sample.shape[2:]
                print(f" Input resized to {width}x{height} before entering the encoder")
                first_execution = False

            prediction = model.forward(sample)
            prediction = (
                torch.nn.functional.interpolate(
                    prediction.unsqueeze(1),
                    size=target_size[::-1],
                    mode="bicubic",
                    align_corners=False,
                )
                .squeeze()
                .cpu()
                .numpy()
            )

        return prediction

    def prediction2depth(self, depth):
        # Normalize the raw prediction to an 8-bit depth map in [0, 255].
        bits = 1
        if not np.isfinite(depth).all():
            depth = np.nan_to_num(depth, nan=0.0, posinf=0.0, neginf=0.0)
            print("WARNING: Non-finite depth values present")
        depth_min = depth.min()
        depth_max = depth.max()
        max_val = (2 ** (8 * bits)) - 1
        if depth_max - depth_min > np.finfo("float").eps:
            out = max_val * (depth - depth_min) / (depth_max - depth_min)
        else:
            out = np.zeros(depth.shape, dtype=depth.dtype)
        # out = cv2.applyColorMap(np.uint8(out), cv2.COLORMAP_INFERNO)
        return out

    def calc_R(self, theta_z, theta_x, theta_y):
        # Build a rotation matrix from Euler angles given in degrees,
        # composed in the order R = Rz @ Rx @ Ry.
        theta_z, theta_x, theta_y = theta_z / 180 * np.pi, theta_x / 180 * np.pi, theta_y / 180 * np.pi
        Rz = np.array([[np.cos(theta_z), np.sin(theta_z), 0],
                       [-np.sin(theta_z), np.cos(theta_z), 0],
                       [0, 0, 1]])
        Rx = np.array([[1, 0, 0],
                       [0, np.cos(theta_x), np.sin(theta_x)],
                       [0, -np.sin(theta_x), np.cos(theta_x)]])
        Ry = np.array([[np.cos(theta_y), 0, np.sin(theta_y)],
                       [0, 1, 0],
                       [-np.sin(theta_y), 0, np.cos(theta_y)]])
        R = Rz @ Rx @ Ry
        return R

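    # Illustrative sanity check (an editorial note, not called anywhere in this
    # file): every matrix returned by calc_R is orthonormal, so
    #   R = self.calc_R(0, 0, -10)
    #   np.allclose(R @ R.T, np.eye(3))
    # should hold up to floating-point error.
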
    def render_new_view(self, img, depth, R, t, K):
        # Naive per-pixel forward warping; a slow reference implementation next to
        # the vectorized wrap_img / wrap_img_tensor variants below.
        h, w, _ = img.shape
        new_img = np.zeros_like(img)
        for y in range(h):
            for x in range(w):
                # Back-project the pixel to 3D using its depth
                Z = depth[y, x]
                X = (x - K[0, 2]) * Z / K[0, 0]
                Y = (y - K[1, 2]) * Z / K[1, 1]
                point3D = np.array([X, Y, Z, 1])
                # Transform into the new camera frame
                point3D_new = R @ point3D[:3] + t
                if point3D_new[2] <= 0:  # point is behind the camera
                    continue
                # Project into the new view
                u = int(K[0, 0] * point3D_new[0] / point3D_new[2] + K[0, 2])
                v = int(K[1, 1] * point3D_new[1] / point3D_new[2] + K[1, 2])
                if 0 <= u < w and 0 <= v < h:
                    new_img[v, u] = img[y, x]
        return new_img

    def wrap_img(self, img, depth_map, K, R, T, target_point=None):
        # Note: "wrap"/"wrapped" in these method names means image *warping*.
        h, w = img.shape[:2]
        # Generate a grid of pixel coordinates (x, y)
        x, y = np.meshgrid(np.arange(w), np.arange(h))
        ones = np.ones_like(x)
        # Flatten and stack to get homogeneous coordinates
        homogeneous_coordinates = np.stack((x.flatten(), y.flatten(), ones.flatten()), axis=1).T
        # Inverse intrinsic matrix
        K_inv = np.linalg.inv(K)
        # Inverse rotation and translation
        R_inv = R.T
        T_inv = -R_inv @ T
        # Back-project to 3D using the depth map
        world_coordinates = K_inv @ homogeneous_coordinates
        world_coordinates *= depth_map.flatten()
        # Apply the inverse transformation
        transformed_world_coordinates = R_inv @ world_coordinates + T_inv.reshape(-1, 1)
        # Project back to 2D, keeping only points in front of the camera
        valid = transformed_world_coordinates[2, :] > 0
        projected_2D = K @ transformed_world_coordinates
        projected_2D /= projected_2D[2, :]
        # Initialize map_x and map_y
        map_x = np.zeros((h, w), dtype=np.float32)
        map_y = np.zeros((h, w), dtype=np.float32)
        # Assign valid projection values to map_x and map_y
        map_x.flat[valid] = projected_2D[0, valid]
        map_y.flat[valid] = projected_2D[1, valid]
        # Perform the warping
        wrapped_img = cv2.remap(img, map_x, map_y, interpolation=cv2.INTER_LINEAR)
        if target_point is None:
            return wrapped_img
        else:
            target_point = (map_x[int(target_point[1]), int(target_point[0])],
                            map_y[int(target_point[1]), int(target_point[0])])
            # Note: the clamp below assumes 512x512 images.
            target_point = tuple(max(0, min(511, x)) for x in target_point)
            return wrapped_img, target_point

    def get_low_high_frequent_tensors(self, x, threshold=4):
        # Split a tensor into high- and low-frequency components with a centered FFT;
        # frequencies within `threshold` bins of the spectrum center count as low.
        dtype = x.dtype
        x = x.type(torch.float32)
        # FFT
        x_freq = fft.fftn(x, dim=(-2, -1))
        x_freq = fft.fftshift(x_freq, dim=(-2, -1))
        B, C, H, W = x_freq.shape
        mask = torch.ones((B, C, H, W)).to(x.device)
        crow, ccol = H // 2, W // 2
        mask[..., crow - threshold:crow + threshold, ccol - threshold:ccol + threshold] = 0  # low 0, high 1
        x_freq_high = x_freq * mask
        x_freq_low = x_freq * (1 - mask)
        # Invert each component back to image space. The returned x_freq_high and
        # x_freq_low stay in the centered (fftshift-ed) layout that
        # combine_low_and_high expects, so ifftshift goes into temporaries here.
        x_high = fft.ifftn(fft.ifftshift(x_freq_high, dim=(-2, -1)), dim=(-2, -1)).real.type(dtype)
        x_low = fft.ifftn(fft.ifftshift(x_freq_low, dim=(-2, -1)), dim=(-2, -1)).real.type(dtype)
        return x_high, x_low, x_freq_high, x_freq_low, mask

    def combine_low_and_high(self, freq_low, freq_high, mask):
        # Recombine centered frequency components; mask is 1 on high frequencies.
        freq = freq_high * mask + freq_low * (1 - mask)
        freq = fft.ifftshift(freq, dim=(-2, -1))
        x = fft.ifftn(freq, dim=(-2, -1)).real
        return x

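    # Illustrative round trip (an editorial note, not called anywhere in this
    # file): for a 4D float tensor x,
    #   _, _, f_high, f_low, m = self.get_low_high_frequent_tensors(x)
    #   torch.allclose(self.combine_low_and_high(f_low, f_high, m), x.float(), atol=1e-4)
    # recombining the two centered-frequency components reproduces x.
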
    def wrap_img_tensor_w_fft(self, img_tensor, depth_tensor,
                              theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2), threshold=4):
        # Warp only the low-frequency component of the image into the new view, then
        # splice the original (unwarped) high-frequency component back in via the FFT.
        _, img_tensor, high_freq, low_freq, fft_mask = self.get_low_high_frequent_tensors(img_tensor, threshold)
        intrinsic_matrix = np.array([[1000, 0, img_tensor.shape[-1] / 2],
                                     [0, 1000, img_tensor.shape[-2] / 2],
                                     [0, 0, 1]])  # Example intrinsic matrix
        ori_size = None
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            img_tensor_ori = img_tensor.clone()
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            # Resample the depth map to the image resolution and rescale the focal lengths.
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
            intrinsic_matrix[0, 0] /= scale
            intrinsic_matrix[1, 1] /= scale
        rotation_matrix = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        translation_vector = np.array(T)  # Translation of the new camera
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=torch.float16, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.inverse(torch.tensor(intrinsic_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(rotation_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ (xy_world - torch.tensor(translation_vector, dtype=torch.float16, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic_matrix, dtype=torch.float16, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Warp the low-frequency image
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(torch.float16)[None], align_corners=True, mode='bilinear', padding_mode='reflection')
        # Re-attach the unwarped high-frequency component
        wrapped_freq = fft.fftn(wrapped_img, dim=(-2, -1))
        wrapped_freq = fft.fftshift(wrapped_freq, dim=(-2, -1))
        wrapped_img = self.combine_low_and_high(wrapped_freq, high_freq, fft_mask)
        return wrapped_img

    def wrap_img_tensor_w_fft_ext(self, img_tensor, depth_tensor, K, R, T, threshold=4):
        # Same low/high-frequency warping as wrap_img_tensor_w_fft, but with
        # externally supplied intrinsics K, rotation R, and translation T.
        _, img_tensor, high_freq, low_freq, fft_mask = self.get_low_high_frequent_tensors(img_tensor, threshold)
        ori_size = None
        scale = 1.0
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
        # Rescale the intrinsics to the image resolution (K / scale returns a new
        # array, so the caller's K is left untouched).
        intrinsic = K / scale
        intrinsic[2, 2] = 1
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=img_tensor.dtype, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.tensor(np.linalg.inv(intrinsic)).to(img_tensor.dtype).to(img_tensor.device) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(R, dtype=torch.float32, device=img_tensor.device)).to(img_tensor.dtype) @ (xy_world - torch.tensor(T, dtype=img_tensor.dtype, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic, dtype=img_tensor.dtype, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=img_tensor.dtype, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=img_tensor.dtype, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Warp the low-frequency image
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(img_tensor.dtype)[None], align_corners=True, mode='bilinear', padding_mode='reflection')
        # Re-attach the unwarped high-frequency component
        wrapped_freq = fft.fftn(wrapped_img, dim=(-2, -1))
        wrapped_freq = fft.fftshift(wrapped_freq, dim=(-2, -1))
        wrapped_img = self.combine_low_and_high(wrapped_freq, high_freq, fft_mask)
        return wrapped_img

    def wrap_img_tensor_w_fft_matrix(self, img_tensor, depth_tensor,
                                     theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2), threshold=4):
        # Mirrors wrap_img_tensor_w_fft (the bodies are identical); kept as a
        # separate entry point.
        _, img_tensor, high_freq, low_freq, fft_mask = self.get_low_high_frequent_tensors(img_tensor, threshold)
        intrinsic_matrix = np.array([[1000, 0, img_tensor.shape[-1] / 2],
                                     [0, 1000, img_tensor.shape[-2] / 2],
                                     [0, 0, 1]])  # Example intrinsic matrix
        ori_size = None
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            img_tensor_ori = img_tensor.clone()
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
            intrinsic_matrix[0, 0] /= scale
            intrinsic_matrix[1, 1] /= scale
        rotation_matrix = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        translation_vector = np.array(T)  # Translation of the new camera
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=torch.float16, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.inverse(torch.tensor(intrinsic_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(rotation_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ (xy_world - torch.tensor(translation_vector, dtype=torch.float16, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic_matrix, dtype=torch.float16, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Warp the low-frequency image
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(torch.float16)[None], align_corners=True, mode='bilinear', padding_mode='reflection')
        # Re-attach the unwarped high-frequency component
        wrapped_freq = fft.fftn(wrapped_img, dim=(-2, -1))
        wrapped_freq = fft.fftshift(wrapped_freq, dim=(-2, -1))
        wrapped_img = self.combine_low_and_high(wrapped_freq, high_freq, fft_mask)
        return wrapped_img

    def wrap_img_tensor(self, img_tensor, depth_tensor,
                        theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2)):
        # Warp the full image tensor into the new view (no frequency split).
        intrinsic_matrix = np.array([[1000, 0, img_tensor.shape[-1] / 2],
                                     [0, 1000, img_tensor.shape[-2] / 2],
                                     [0, 0, 1]])  # Example intrinsic matrix
        ori_size = None
        if depth_tensor.shape[-1] != img_tensor.shape[-1]:
            scale = depth_tensor.shape[-1] / img_tensor.shape[-1]
            ori_size = (img_tensor.shape[-2], img_tensor.shape[-1])
            img_tensor_ori = img_tensor.clone()
            # img_tensor = F.interpolate(img_tensor, size=(depth_tensor.shape[-2], depth_tensor.shape[-1]))
            depth_tensor = F.interpolate(depth_tensor.unsqueeze(0).unsqueeze(0), size=ori_size, mode='bilinear').squeeze().to(torch.float16)
            intrinsic_matrix[0, 0] /= scale
            intrinsic_matrix[1, 1] /= scale
        rotation_matrix = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        translation_vector = np.array(T)  # Translation of the new camera
        h, w = img_tensor.shape[2:]
        xy_src = np.mgrid[0:h, 0:w].reshape(2, -1)
        xy_src_homogeneous = np.vstack((xy_src, np.ones((1, xy_src.shape[1]))))
        # Convert to torch tensors
        xy_src_homogeneous_tensor = torch.tensor(xy_src_homogeneous, dtype=torch.float16, device=img_tensor.device)
        # Back-project the pixels into the world frame
        xy_world = torch.inverse(torch.tensor(intrinsic_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ xy_src_homogeneous_tensor
        xy_world = xy_world * depth_tensor.view(1, -1)
        # Transform into the new camera frame
        xy_new_cam = torch.inverse(torch.tensor(rotation_matrix, dtype=torch.float32, device=img_tensor.device)).to(torch.float16) @ (xy_world - torch.tensor(translation_vector, dtype=torch.float16, device=img_tensor.device).view(3, 1))
        # Project into the new image
        xy_dst_homogeneous = torch.tensor(intrinsic_matrix, dtype=torch.float16, device=img_tensor.device) @ xy_new_cam
        xy_dst = xy_dst_homogeneous[:2, :] / xy_dst_homogeneous[2, :]
        # Reshape to a 2D grid and normalize to [-1, 1]
        xy_dst = xy_dst.reshape(2, h, w)
        xy_dst = (xy_dst - torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)) / torch.tensor([[w / 2.0], [h / 2.0]], dtype=torch.float16, device=img_tensor.device).unsqueeze(-1)
        xy_dst = torch.flip(xy_dst, [0])
        xy_dst = xy_dst.permute(1, 2, 0)
        # Perform the warping
        wrapped_img = F.grid_sample(img_tensor, xy_dst.to(torch.float16)[None], align_corners=True, mode='bilinear')
        return wrapped_img

    def __call__(self, img_array, theta_z=0, theta_x=0, theta_y=-10, T=(0, 0, -2)):
        # Estimate depth, then warp the input image into a new camera view.
        img_depth = self.transform({"image": img_array})["image"]
        # Compute the depth prediction
        prediction = self.process(
            self.device,
            self.model,
            self.model_type,
            img_depth,
            (self.net_w, self.net_h),
            img_array.shape[1::-1],
            optimize=False,
            use_camera=False,
        )
        depth = self.prediction2depth(prediction)
        # img = img_array.copy()
        # img = img / 2. + 0.5
        K = np.array([[1000, 0, img_array.shape[1] / 2],
                      [0, 1000, img_array.shape[0] / 2],
                      [0, 0, 1]])  # Example intrinsic matrix
        R = self.calc_R(theta_z=theta_z, theta_x=theta_x, theta_y=theta_y)
        T = np.array(T)  # Translation of the new camera
        # new_img = self.render_new_view(img_array, depth, R, T, K)
        new_img = self.wrap_img(img_array, depth, K, R, T)
        # Pixels that received no source pixel stay black; build a validity mask.
        mask = np.all(new_img == [0, 0, 0], axis=2).astype(np.uint8) * 255
        mask = 255 - mask
        return new_img, mask, depth
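

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original pipeline. Assumptions:
    # 'dpt_beit_large_512' matches one of the checkpoints handled in __init__,
    # and 'input.jpg' / 'warped.png' are placeholder paths.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    midas = MiDas(device, "dpt_beit_large_512")

    # MiDaS transforms expect an RGB float image scaled to [0, 1].
    img = cv2.imread("input.jpg")
    img = (cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0).astype(np.float32)

    # Warp to a new view: yaw the camera by -10 degrees and translate it by T.
    new_img, mask, depth = midas(img, theta_z=0, theta_x=0, theta_y=-10, T=[0, 0, -2])
    cv2.imwrite("warped.png", cv2.cvtColor((new_img * 255).astype(np.uint8), cv2.COLOR_RGB2BGR))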