"""Compute depth maps for images in the input folder. """ import os import glob import torch import cv2 import argparse import util.io from torchvision.transforms import Compose from dpt.models import DPTDepthModel from dpt.midas_net import MidasNet_large from dpt.transforms import Resize, NormalizeImage, PrepareForNet #from util.misc import visualize_attention def run(input_path, output_path, model_path, model_type="dpt_hybrid", optimize=True): """Run MonoDepthNN to compute depth maps. Args: input_path (str): path to input folder output_path (str): path to output folder model_path (str): path to saved model """ print("initialize") # select device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("device: %s" % device) # load network if model_type == "dpt_large": # DPT-Large net_w = net_h = 384 model = DPTDepthModel( path=model_path, backbone="vitl16_384", non_negative=True, enable_attention_hooks=False, ) normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_hybrid": # DPT-Hybrid net_w = net_h = 384 model = DPTDepthModel( path=model_path, backbone="vitb_rn50_384", non_negative=True, enable_attention_hooks=False, ) normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_hybrid_kitti": net_w = 1216 net_h = 352 model = DPTDepthModel( path=model_path, scale=0.00006016, shift=0.00579, invert=True, backbone="vitb_rn50_384", non_negative=True, enable_attention_hooks=False, ) normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "dpt_hybrid_nyu": net_w = 640 net_h = 480 model = DPTDepthModel( path=model_path, scale=0.000305, shift=0.1378, invert=True, backbone="vitb_rn50_384", non_negative=True, enable_attention_hooks=False, ) normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) elif model_type == "midas_v21": # Convolutional model net_w = net_h = 384 model = MidasNet_large(model_path, non_negative=True) normalization = NormalizeImage( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ) else: assert ( False ), f"model_type '{model_type}' not implemented, use: --model_type [dpt_large|dpt_hybrid|dpt_hybrid_kitti|dpt_hybrid_nyu|midas_v21]" transform = Compose( [ Resize( net_w, net_h, resize_target=None, keep_aspect_ratio=True, ensure_multiple_of=32, resize_method="minimal", image_interpolation_method=cv2.INTER_CUBIC, ), normalization, PrepareForNet(), ] ) model.eval() if optimize == True and device == torch.device("cuda"): model = model.to(memory_format=torch.channels_last) model = model.half() model.to(device) # get input img_names = glob.glob(os.path.join(input_path, "*")) num_images = len(img_names) # create output folder os.makedirs(output_path, exist_ok=True) print("start processing") for ind, img_name in enumerate(img_names): if os.path.isdir(img_name): continue print(" processing {} ({}/{})".format(img_name, ind + 1, num_images)) # input img = util.io.read_image(img_name) if args.kitti_crop is True: height, width, _ = img.shape top = height - 352 left = (width - 1216) // 2 img = img[top : top + 352, left : left + 1216, :] img_input = transform({"image": img})["image"] # compute with torch.no_grad(): sample = torch.from_numpy(img_input).to(device).unsqueeze(0) if optimize == True and device == torch.device("cuda"): sample = sample.to(memory_format=torch.channels_last) sample = sample.half() prediction = model.forward(sample) prediction = ( torch.nn.functional.interpolate( prediction.unsqueeze(1), size=img.shape[:2], mode="bicubic", align_corners=False, ) .squeeze() .cpu() .numpy() ) if model_type == "dpt_hybrid_kitti": prediction *= 256 if model_type == "dpt_hybrid_nyu": prediction *= 1000.0 filename = os.path.join( output_path, os.path.splitext(os.path.basename(img_name))[0] ) util.io.write_depth(filename, prediction, bits=2, absolute_depth=args.absolute_depth) print("finished") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-i", "--input_path", default="input", help="folder with input images" ) parser.add_argument( "-o", "--output_path", default="output_monodepth", help="folder for output images", ) parser.add_argument( "-m", "--model_weights", default=None, help="path to model weights" ) parser.add_argument( "-t", "--model_type", default="dpt_hybrid", help="model type [dpt_large|dpt_hybrid|midas_v21]", ) parser.add_argument("--kitti_crop", dest="kitti_crop", action="store_true") parser.add_argument("--absolute_depth", dest="absolute_depth", action="store_true") parser.add_argument("--optimize", dest="optimize", action="store_true") parser.add_argument("--no-optimize", dest="optimize", action="store_false") parser.set_defaults(optimize=True) parser.set_defaults(kitti_crop=False) parser.set_defaults(absolute_depth=False) args = parser.parse_args() default_models = { "midas_v21": "weights/midas_v21-f6b98070.pt", "dpt_large": "weights/dpt_large-midas-2f21e586.pt", "dpt_hybrid": "weights/dpt_hybrid-midas-501f0c75.pt", "dpt_hybrid_kitti": "weights/dpt_hybrid_kitti-cb926ef4.pt", "dpt_hybrid_nyu": "weights/dpt_hybrid_nyu-2ce69ec7.pt", } if args.model_weights is None: args.model_weights = default_models[args.model_type] # set torch options torch.backends.cudnn.enabled = True torch.backends.cudnn.benchmark = True # compute depth maps run( args.input_path, args.output_path, args.model_weights, args.model_type, args.optimize, )