File size: 6,697 Bytes
8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 661e202 8166792 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import cv2
import torch
import numpy as np
import time
from midas.model_loader import default_models, load_model
import os
import urllib.request
"midas_v21_small_256" : "",
"dpt_hybrid_384" : "",
"dpt_large_384" : "",
"dpt_swin2_large_384" : "",
"dpt_beit_large_512" : "",
class MonocularDepthEstimator:
def __init__(self,
# model type
# MiDaS 3.1:
# For highest quality: dpt_beit_large_512
# For moderately less quality, but better speed-performance trade-off: dpt_swin2_large_384
# For embedded devices: dpt_swin2_tiny_256, dpt_levit_224
# For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small .xml, .bin
# MiDaS 3.0:
# Legacy transformer models dpt_large_384 and dpt_hybrid_384
# MiDaS 2.1:
# Legacy convolutional models midas_v21_384 and midas_v21_small_256
# params
print("Initializing parameters and model...")
self.is_optimize = optimize
self.is_square = square
self.is_grayscale = grayscale
self.height = height
self.side_by_side = side_by_side
# select device
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running inference on : %s" % self.device)
# loading model
if not os.path.exists(model_weights_path+model_type+".pt"):
print("Model file not found. Downloading...")
# Download the model file
urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
print("Model file downloaded successfully.")
self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path+model_type+".pt",
model_type, optimize, height, square)
print("Net width and height: ", (self.net_w, self.net_h))
def predict(self, image, model, target_size):
# convert img to tensor and load to gpu
img_tensor = torch.from_numpy(image).to(self.device).unsqueeze(0)
if self.is_optimize and self.device == torch.device("cuda"):
img_tensor =
img_tensor = img_tensor.half()
prediction = model.forward(img_tensor)
prediction = (
return prediction
def process_prediction(self, depth_map):
    """
    Normalize a raw depth prediction and render it with a colormap.

    Args:
        depth_map: numpy array of raw depth values. It is expanded along
            axis 2 and repeated three times, so a 2-D (height, width) map
            is presumably expected -- confirm against the caller.

    Returns:
        A tuple ``(normalized_depth, depth_colormap)``:
            normalized_depth: ``depth_map`` min-max rescaled to [0, 1]
                (all zeros when the map is constant).
            depth_colormap: the ``cv2.COLORMAP_INFERNO`` rendering of the
                rescaled map, divided by 255.
    """
    # normalizing depth image
    depth_min = depth_map.min()
    depth_max = depth_map.max()
    depth_range = depth_max - depth_min

    if depth_range > np.finfo("float").eps:
        normalized_depth = 255 * (depth_map - depth_min) / depth_range
    else:
        # A constant map would give 0/0 = NaN, which cannot be cast to
        # uint8 meaningfully; fall back to an all-zero map instead.
        normalized_depth = np.zeros(
            depth_map.shape, dtype=np.result_type(depth_map.dtype, np.float32)
        )

    # replicate the single channel three times before applying the colormap
    grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
    depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)

    return normalized_depth/255, depth_colormap/255
def make_prediction(self, image):
    """
    Run depth estimation on a single frame.

    Args:
        image: frame as a numpy array with the colour channels on axis 2.
            The channel order is reversed before inference (the original
            comment states this is needed to obtain RGB).

    Returns:
        The ``(depthmap, depth_colormap)`` pair produced by
        ``process_prediction`` for this frame.
    """
    frame = image.copy()

    with torch.no_grad():
        # reverse the channel axis (flip required to get RGB)
        rgb_frame = np.flip(frame, axis=2)

        # divide by 255 and hand the frame to the model's input transform
        model_input = self.transform({"image": rgb_frame / 255})["image"]

        # monocular depth prediction, targeting the frame's (width, height)
        frame_height, frame_width = rgb_frame.shape[:2]
        raw_depth = self.predict(
            model_input,
            self.model,
            target_size=(frame_width, frame_height),
        )

        # process the model predictions
        depthmap, depth_colormap = self.process_prediction(raw_depth)

    return depthmap, depth_colormap
def run(self, input_path):
    """
    Show the colormapped depth prediction of a video, frame by frame.

    Every frame read from the capture is passed to ``make_prediction``; the
    resulting colormap is annotated with the measured frames per second and
    displayed in an OpenCV window. The loop ends when no more frames can be
    read or when Escape is pressed.

    Args:
        input_path: source passed straight to ``cv2.VideoCapture``.
    """
    # input video
    cap = cv2.VideoCapture(input_path)

    try:
        # Check if camera opened successfully
        if not cap.isOpened():
            print("Error opening video file")
            return

        with torch.no_grad():
            while cap.isOpened():
                # Capture frame-by-frame
                inference_start_time = time.time()
                ret, frame = cap.read()
                if not ret:
                    # no frame delivered (end of stream or read failure)
                    break

                _, depth_colormap = self.make_prediction(frame)
                inference_end_time = time.time()

                # clamp the elapsed time so a coarse clock cannot cause a
                # division by zero
                elapsed = max(inference_end_time - inference_start_time, 1e-6)
                fps = round(1 / elapsed)
                cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)

                # Press ESC on keyboard to exit
                if cv2.waitKey(1) == 27:  # Escape key
                    break
    finally:
        # When everything done, release the video capture object
        cap.release()
        # Closes all the frames
        cv2.destroyAllWindows()
if __name__ == "__main__":
# params
INPUT_PATH = "assets/videos/testvideo2.mp4"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# set torch options
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True
depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")