vaishanthr committed · Commit 661e202 · Parent(s): 32d9fed

added distance estimation feature

Browse files:
- app.py +73 -8
- image_segmenter.py +49 -35
- models/dpt_hybrid_384.pt +3 -0
- models/dpt_swin2_large_384.pt +3 -0
- models/midas_v21_small_256.pt +3 -0
- models/midas_v21_small_256.txt +3 -0
- models/yolov8m-seg.pt +3 -0
- models/yolov8n-seg.pt +3 -0
- models/yolov8s-seg.pt +3 -0
- monocular_depth_estimator.py +36 -32
- point_cloud_generator.py +46 -41
- utils.py +49 -0
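At a glance, the distance-estimation feature chains the existing pieces: resize the input, run YOLOv8 instance segmentation, run MiDaS monocular depth, then overlay a per-object distance estimate. A minimal sketch of that flow, assuming this repo's modules and the sample image already referenced in app.py (illustrative only, not part of the commit):

    import cv2
    import utils
    from image_segmenter import ImageSegmenter
    from monocular_depth_estimator import MonocularDepthEstimator

    img_seg = ImageSegmenter(model_type="yolov8s-seg")
    depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")

    image = utils.resize(cv2.imread("assets/images/bus.jpg"))           # BGR frame
    segmented, objects_data = img_seg.predict(image)                    # instance masks + metadata
    depthmap, depth_colormap = depth_estimator.make_prediction(image)   # relative depth in [0, 1]
    dist_image = utils.draw_depth_info(image, depthmap, objects_data)   # per-object distance overlay
    cv2.imwrite("distance_overlay.png", dist_image)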
app.py CHANGED

@@ -4,18 +4,30 @@ import gradio as gr
 import numpy as np
 import os
 import torch
+import utils
 
 from image_segmenter import ImageSegmenter
 from monocular_depth_estimator import MonocularDepthEstimator
+from point_cloud_generator import display_pcd
 
 # params
 CANCEL_PROCESSING = False
 
-img_seg = ImageSegmenter(model_type=
-depth_estimator = MonocularDepthEstimator(
+img_seg = ImageSegmenter(model_type="yolov8s-seg")
+depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")
 
 def process_image(image):
-
+    image = utils.resize(image)
+    image_segmentation, objects_data = img_seg.predict(image)
+    depthmap, depth_colormap = depth_estimator.make_prediction(image)
+    dist_image = utils.draw_depth_info(image, depthmap, objects_data)
+    return image_segmentation, depth_colormap, dist_image
+
+def test_process_img(image):
+    image = utils.resize(image)
+    image_segmentation, objects_data = img_seg.predict(image)
+    depthmap, depth_colormap = depth_estimator.make_prediction(image)
+    return image_segmentation, objects_data, depthmap, depth_colormap
 
 def process_video(vid_path=None):
     vid_cap = cv2.VideoCapture(vid_path)

@@ -23,7 +35,11 @@ def process_video(vid_path=None):
         ret, frame = vid_cap.read()
         if ret:
             print("making predictions ....")
-
+            frame = utils.resize(frame)
+            image_segmentation, objects_data = img_seg.predict(frame)
+            depthmap, depth_colormap = depth_estimator.make_prediction(frame)
+            dist_image = utils.draw_depth_info(frame, depthmap, objects_data)
+            yield cv2.cvtColor(image_segmentation, cv2.COLOR_BGR2RGB), depth_colormap, dist_image
 
     return None
 

@@ -35,13 +51,39 @@ def update_segmentation_options(options):
 def update_confidence_threshold(thres_val):
     img_seg.confidence_threshold = thres_val/100
 
+def model_selector(model_type):
+
+    if "Small - Better performance and less accuracy" == model_type:
+        midas_model, yolo_model = "midas_v21_small_256", "yolov8s-seg"
+    elif "Medium - Balanced performance and accuracy" == model_type:
+        midas_model, yolo_model = "dpt_hybrid_384", "yolov8m-seg"
+    elif "Large - Slow performance and high accuracy" == model_type:
+        midas_model, yolo_model = "dpt_large_384", "yolov8l-seg"
+    else:
+        midas_model, yolo_model = "midas_v21_small_256", "yolov8s-seg"
+
+    img_seg = ImageSegmenter(model_type=yolo_model)
+    depth_estimator = MonocularDepthEstimator(model_type=midas_model)
+
 def cancel():
     CANCEL_PROCESSING = True
 
 if __name__ == "__main__":
+
+    # testing
     # img_1 = cv2.imread("assets/images/bus.jpg")
-    #
-
+    # img_1 = utils.resize(img_1)
+
+    # image_segmentation, objects_data, depthmap, depth_colormap = test_process_img(img_1)
+    # final_image = utils.draw_depth_info(image_segmentation, depthmap, objects_data)
+    # objs_pcd = utils.generate_obj_pcd(depthmap, objects_data[2][3])
+    # # print(objs_pcd[0][0])
+    # # display_pcd(objs_pcd, use_matplotlib=False)
+
+    # cv2.imshow("Segmentation", image_segmentation)
+    # cv2.imshow("Depth", depthmap*objects_data[2][3])
+    # cv2.imshow("Final", final_image)
+
     # cv2.waitKey(0)
     # cv2.destroyAllWindows()
 

@@ -60,6 +102,12 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=1):
                 img_input = gr.Image()
+                model_type_img = gr.Dropdown(
+                    ["Small - Better performance and less accuracy",
+                     "Medium - Balanced performance and accuracy",
+                     "Large - Slow performance and high accuracy"],
+                    label="Model Type", value="Small - Better performance and less accuracy",
+                    info="Select the inference model before running predictions!")
                 options_checkbox_img = gr.CheckboxGroup(["Show Boundary Box", "Show Segmentation Region", "Show Segmentation Boundary"], label="Options")
                 conf_thres_img = gr.Slider(1, 100, value=60, label="Confidence Threshold", info="Choose the threshold above which objects should be detected")
                 submit_btn_img = gr.Button(value="Predict")

@@ -68,6 +116,10 @@ if __name__ == "__main__":
            with gr.Row():
                segmentation_img_output = gr.Image(height=300, label="Segmentation")
                depth_img_output = gr.Image(height=300, label="Depth Estimation")
+
+            with gr.Row():
+                dist_img_output = gr.Image(height=300, label="Distance")
+                pcd_img_output = gr.Image(height=300, label="Point Cloud")
 
            gr.Markdown("## Sample Images")
            gr.Examples(

@@ -82,6 +134,13 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=1):
                 vid_input = gr.Video()
+                model_type_vid = gr.Dropdown(
+                    ["Small - Better performance and less accuracy",
+                     "Medium - Balanced performance and accuracy",
+                     "Large - Slow performance and high accuracy"],
+                    label="Model Type", value="Small - Better performance and less accuracy",
+                    info="Select the inference model before running predictions!"),
+
                 options_checkbox_vid = gr.CheckboxGroup(["Show Boundary Box", "Show Segmentation Region", "Show Segmentation Boundary"], label="Options")
                 conf_thres_vid = gr.Slider(1, 100, value=60, label="Confidence Threshold", info="Choose the threshold above which objects should be detected")
                 with gr.Row():

@@ -92,6 +151,10 @@ if __name__ == "__main__":
            with gr.Row():
                segmentation_vid_output = gr.Image(height=400, label="Segmentation")
                depth_vid_output = gr.Image(height=400, label="Depth Estimation")
+
+            with gr.Row():
+                dist_vid_output = gr.Image(height=300, label="Distance")
+                pcd_vid_output = gr.Image(height=300, label="Point Cloud")
 
            gr.Markdown("## Sample Videos")
            gr.Examples(

@@ -102,15 +165,17 @@ if __name__ == "__main__":
            )
 
        # image tab logic
-        submit_btn_img.click(process_image, inputs=img_input, outputs=[segmentation_img_output, depth_img_output])
+        submit_btn_img.click(process_image, inputs=img_input, outputs=[segmentation_img_output, depth_img_output, dist_img_output])
        options_checkbox_img.change(update_segmentation_options, options_checkbox_img, [])
        conf_thres_img.change(update_confidence_threshold, conf_thres_img, [])
+        model_type_img.change(model_selector, model_type_img, [])
 
        # video tab logic
-        submit_btn_vid.click(process_video, inputs=vid_input, outputs=[segmentation_vid_output, depth_vid_output])
+        submit_btn_vid.click(process_video, inputs=vid_input, outputs=[segmentation_vid_output, depth_vid_output, dist_vid_output])
        cancel_btn.click(cancel, inputs=[], outputs=[])
        options_checkbox_vid.change(update_segmentation_options, options_checkbox_vid, [])
        conf_thres_vid.change(update_confidence_threshold, conf_thres_vid, [])
+
 
 
    my_app.queue(concurrency_count=5, max_size=20).launch()
image_segmenter.py CHANGED

@@ -6,15 +6,15 @@ import random
 import torch
 
 class ImageSegmenter:
-    def __init__(self, model_type="
+    def __init__(self, model_type="yolov8s-seg") -> None:
 
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model = YOLO('models/
+        self.model = YOLO('models/'+ model_type +'.pt')
         self.model.to(self.device)
 
-        self.is_show_bounding_boxes =
+        self.is_show_bounding_boxes = True
         self.is_show_segmentation_boundary = False
-        self.is_show_segmentation =
+        self.is_show_segmentation = False
         self.confidence_threshold = 0.5
         self.cls_clr = {}
 

@@ -22,6 +22,9 @@ class ImageSegmenter:
         self.bb_thickness = 2
         self.bb_clr = (255, 0, 0)
 
+        # variables
+        self.masks = {}
+
 
     def get_cls_clr(self, cls_id):
         if cls_id in self.cls_clr:

@@ -34,9 +37,10 @@
         self.cls_clr[cls_id] = (r, g, b)
         return (r, g, b)
 
-    def predict(self, image):
-        #
-
+    def predict(self, image):
+        # params
+        objects_data = []
+        image = image.copy()
         predictions = self.model.predict(image)
 
         cls_ids = predictions[0].boxes.cls.cpu().numpy()

@@ -52,39 +56,49 @@
         for id, cls in enumerate(cls_ids):
             cls_clr = self.get_cls_clr(cls)
 
-            # draw bounding box with class name and score
-            if self.is_show_bounding_boxes and cls_conf[id] > self.confidence_threshold:
-                (x1, y1, x2, y2) = bounding_boxes[id]
-                cls_name = self.model.names[cls]
-                cls_confidence = cls_conf[id]
-                disp_str = cls_name +' '+ str(round(cls_confidence, 2))
-                cv2.rectangle(image, (x1, y1), (x2, y2), cls_clr, self.bb_thickness)
-                cv2.rectangle(image, (x1, y1), (x1+(len(disp_str)*18), y1+45), cls_clr, -1)
-                cv2.putText(image, disp_str, (x1+10, y1+30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
-
-            # draw segmentation boundary
-            if len(seg_mask_boundary) and self.is_show_segmentation_boundary and cls_conf[id] > self.confidence_threshold:
-                cv2.polylines(image, [np.array(seg_mask_boundary[id], dtype=np.int32)], isClosed=True, color=cls_clr, thickness=2)
-
             # draw filled segmentation region
-            if seg_mask.any() and
-            return image
+            if seg_mask.any() and cls_conf[id] > self.confidence_threshold:
+
+                self.masks[id] = seg_mask[id]
+
+                if self.is_show_segmentation:
+                    alpha = 0.8
+
+                    # converting the mask from 1 channel to 3 channels
+                    colored_mask = np.expand_dims(seg_mask[id], 0).repeat(3, axis=0)
+                    colored_mask = np.moveaxis(colored_mask, 0, -1)
+
+                    # Resize the mask to match the image size, if necessary
+                    if image.shape[:2] != seg_mask[id].shape[:2]:
+                        colored_mask = cv2.resize(colored_mask, (image.shape[1], image.shape[0]))
+
+                    # filling the mased area with class color
+                    masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=cls_clr)
+                    image_overlay = masked.filled()
+                    image = cv2.addWeighted(image, 1 - alpha, image_overlay, alpha, 0)
+
+            # draw bounding box with class name and score
+            if self.is_show_bounding_boxes and cls_conf[id] > self.confidence_threshold:
+                (x1, y1, x2, y2) = bounding_boxes[id]
+                cls_name = self.model.names[cls]
+                cls_confidence = cls_conf[id]
+                disp_str = cls_name +' '+ str(round(cls_confidence, 2))
+                cv2.rectangle(image, (x1, y1), (x2, y2), cls_clr, self.bb_thickness)
+                cv2.rectangle(image, (x1, y1), (x1+(len(disp_str)*9), y1+15), cls_clr, -1)
+                cv2.putText(image, disp_str, (x1+5, y1+10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+            # draw segmentation boundary
+            if len(seg_mask_boundary) and self.is_show_segmentation_boundary and cls_conf[id] > self.confidence_threshold:
+                cv2.polylines(image, [np.array(seg_mask_boundary[id], dtype=np.int32)], isClosed=True, color=cls_clr, thickness=2)
+
+            # object variables
+            (x1, y1, x2, y2) = bounding_boxes[id]
+            center = x1+(x2-x1)//2, y1+(y2-y1)//2
+            objects_data.append([cls, self.model.names[cls], center, self.masks[id], cls_clr])
 
+        return image, objects_data
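predict() now returns the annotated image together with objects_data, one entry per detection laid out as [cls_id, cls_name, center, mask, color] (the layout utils.draw_depth_info expects). A short consumption sketch, assuming a local test image (illustrative, not part of the commit):

    import cv2
    from image_segmenter import ImageSegmenter

    seg = ImageSegmenter(model_type="yolov8s-seg")
    frame = cv2.imread("assets/images/bus.jpg")
    annotated, objects_data = seg.predict(frame)
    for cls_id, cls_name, center, mask, color in objects_data:
        print(cls_name, center, mask.shape)   # mask is the per-instance segmentation mask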
models/dpt_hybrid_384.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:501f0c75b3bca7daec6b3682c5054c09b366765aef6fa3a09d03a5cb4b230853
+size 492757791

models/dpt_swin2_large_384.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8820c77a09e6b1d4bf2bcf71d1f52509239aa07c5f51709d8eee95c8775d5ec8
+size 880390791

models/midas_v21_small_256.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70d6b9c891758c67f974a6097fb0c608c7ee67fb81ac3e5588847d5596d56fca
+size 85761505

models/midas_v21_small_256.txt ADDED
@@ -0,0 +1,3 @@
+Insert midas_v21_small_256.pt here
+
+https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt

models/yolov8m-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9fc740ca0824e14b44681d491dc601efa664ec6ecea9a870acf876053826448
+size 54899779

models/yolov8n-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d39e867b2c3a5dbc1aa764411544b475cb14727bf6af1ec46c238f8bb1351ab9
+size 7054355

models/yolov8s-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c035c6c5f9c48ee962518ef648854c2cc5e60fa404fac443e17d306fdda16543
+size 23897299
monocular_depth_estimator.py CHANGED

@@ -6,15 +6,23 @@ from midas.model_loader import default_models, load_model
 import os
 import urllib.request
 
+MODEL_FILE_URL = {
+    "midas_v21_small_256" : "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
+    "dpt_hybrid_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
+    "dpt_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt",
+    "dpt_swin2_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
+    "dpt_beit_large_512" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
+}
+
 class MonocularDepthEstimator:
     def __init__(self,
-
+                 model_type="midas_v21_small_256",
+                 model_weights_path="models/",
+                 optimize=False,
+                 side_by_side=False,
+                 height=None,
+                 square=False,
+                 grayscale=False):
 
         # model type
         # MiDaS 3.1:

@@ -40,16 +48,15 @@
         # select device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         print("Running inference on : %s" % self.device)
-        model_file_url = "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt"
 
         # loading model
-        if not os.path.exists(model_weights_path):
+        if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
             # Download the model file
-            urllib.request.urlretrieve(
+            urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")
 
-        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path,
+        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path+model_type+".pt",
                                                                         model_type, optimize, height, square)
         print("Net width and height: ", (self.net_w, self.net_h))
 

@@ -79,7 +86,7 @@
 
         return prediction
 
-    def process_prediction(self,
+    def process_prediction(self, depth_map):
         """
         Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map
         for better visibility.

@@ -92,33 +99,30 @@
         """
 
         # normalizing depth image
-        depth_min =
-        depth_max =
-        normalized_depth = 255 * (
-
-        if side_by_side:
-            return np.concatenate((original_img, depth_side), axis=1)/255
-
-        return
+        depth_min = depth_map.min()
+        depth_max = depth_map.max()
+        normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
+
+        # normalized_depth *= 3
+        # grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
+        grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
+        depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)
+
+        return normalized_depth/255, depth_colormap/255
 
     def make_prediction(self, image):
+        image = image.copy()
         with torch.no_grad():
             original_image_rgb = np.flip(image, 2) # in [0, 255] (flip required to get RGB)
             # resizing the image to feed to the model
             image_tranformed = self.transform({"image": original_image_rgb/255})["image"]
 
             # monocular depth prediction
-
-            original_image_bgr = np.flip(original_image_rgb, 2) if self.side_by_side else None
+            pred = self.predict(image_tranformed, self.model, target_size=original_image_rgb.shape[1::-1])
 
             # process the model predictions
-
-            return
+            depthmap, depth_colormap = self.process_prediction(pred)
+            return depthmap, depth_colormap
 
     def run(self, input_path):
 

@@ -137,11 +141,11 @@
             ret, frame = cap.read()
 
             if ret == True:
-
+                _, depth_colormap = self.make_prediction(frame)
                 inference_end_time = time.time()
                 fps = round(1/(inference_end_time - inference_start_time))
-                cv2.putText(
-                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ',
+                cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
+                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)
 
                 # Press ESC on keyboard to exit
                 if cv2.waitKey(1) == 27: # Escape key

@@ -170,6 +174,6 @@ if __name__ == "__main__":
     torch.backends.cudnn.enabled = True
     torch.backends.cudnn.benchmark = True
 
-    depth_estimator = MonocularDepthEstimator(
+    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
     depth_estimator.run(INPUT_PATH)
point_cloud_generator.py CHANGED

@@ -49,7 +49,7 @@ class PointCloudGenerator:
 
         return pcd
 
-    def
+    def conver_to_point_cloud(self, depth_img):
 
         # get depth resolution:
         height, width = depth_img.shape

@@ -68,65 +68,70 @@
 
         return pcd
 
-    def generate_point_cloud(self,
-
-        depth_min = depth_img.min()
-        depth_max = depth_img.max()
-        normalized_depth = 255 * ((depth_img - depth_min) / (depth_max - depth_min))
-
-        print(f"Max value: {np.max(depth_img)}")
-
-        #
-        self.pcd = self.conver_to_point_cloud_v1(depth_img)
-        if vectorize:
-            self.pcd = self.conver_to_point_cloud_v2(depth_img)
-
-        return self.pcd
-
+    def generate_point_cloud(self, depth_img, normalize=False):
+
+        if normalize:
+            # normalizing depth image
+            depth_min = depth_img.min()
+            depth_max = depth_img.max()
+            normalized_depth = 255 * ((depth_img - depth_min) / (depth_max - depth_min))
+            depth_img = normalized_depth
+
+        # convert depth to point cloud
+        # point_cloud = self.conver_to_point_cloud(depth_img)
+
+        depth_image = o3d.geometry.Image(depth_img)
+
+        # Create open3d camera intrinsic object
+        intrinsic_matrix = np.array([[self.fx_depth, 0, self.cx_depth], [0, self.fy_depth, self.cy_depth], [0, 0, 1]])
+        camera_intrinsic = o3d.camera.PinholeCameraIntrinsic()
+        # camera_intrinsic.intrinsic_matrix = intrinsic_matrix
+        camera_intrinsic.set_intrinsics(depth_image.width, depth_image.height, self.fx_depth, self.fy_depth, self.cx_depth, self.cy_depth)
+
+        # Create open3d point cloud from depth image
+        point_cloud = o3d.geometry.PointCloud.create_from_depth_image(depth_img, camera_intrinsic)
+
+        return point_cloud
+
+def display_pcd(pcd_data, use_matplotlib=True):
+
+    if use_matplotlib:
+        fig = plt.figure()
+        ax = fig.add_subplot(111, projection='3d')
+
+    for data, clr in pcd_data:
+        points = np.array(data)
+        skip = 5
         point_range = range(0, points.shape[0], skip) # skip points to prevent crash
 
-        if use_matplotlib:
-            ax = fig.add_subplot(111, projection='3d')
-            ax.scatter(points[point_range, 0], points[point_range, 1], points[point_range, 2], c='r', marker='o')
-            ax.set_xlabel('X Label')
-            ax.set_ylabel('Y Label')
-            ax.set_zlabel('Z Label')
-            plt.show()
+        if use_matplotlib:
+            ax.scatter(points[point_range, 0], points[point_range, 1], points[point_range, 2], c='r', marker='o')
 
         if not use_matplotlib:
-
             pcd_o3d = o3d.geometry.PointCloud() # create point cloud object
-            pcd_o3d.points = o3d.utility.Vector3dVector(
+            pcd_o3d.points = o3d.utility.Vector3dVector(points) # set pcd_np as the point cloud points
             # Visualize:
             o3d.visualization.draw_geometries([pcd_o3d])
 
+    if use_matplotlib:
+        ax.set_xlabel('X Label')
+        ax.set_ylabel('Y Label')
+        ax.set_zlabel('Z Label')
+        ax.view_init(elev=90, azim=0, roll=0)
+        plt.show()
+
+    if not use_matplotlib:
+        o3d.visualization.draw_geometries([pcd_o3d])
+
 if __name__ == "__main__":
-
+    depth_img_path = "assets/images/depth_map_p1.png"
+    depth_img = cv2.imread(depth_img_path, 0)
+    depth_img = depth_img/255
     point_cloud_gen = PointCloudGenerator()
-    pcd = point_cloud_gen.generate_point_cloud(
-
+    pcd = point_cloud_gen.generate_point_cloud(depth_img)
+    display_pcd([pcd], use_matplotlib=True)
utils.py ADDED

@@ -0,0 +1,49 @@
+import cv2
+import numpy as np
+from point_cloud_generator import PointCloudGenerator
+
+# pcd_generator = PointCloudGenerator()
+
+def resize(image):
+    """
+    resize the input nd array
+    """
+    h, w = image.shape[:2]
+    if h > w:
+        return cv2.resize(image, (480, 640))
+    else:
+        return cv2.resize(image, (640, 480))
+
+def get_masked_depth(depth_map, mask):
+    masked_depth_map = depth_map*mask
+    pixel_depth_vals = masked_depth_map[masked_depth_map>0]
+    mean_depth = np.mean(pixel_depth_vals)
+    return masked_depth_map, 1-mean_depth
+
+def draw_depth_info(image, depth_map, objects_data):
+    image = image.copy()
+    # object data -> [cls_id, cls_name, cls_center, cls_mask, cls_clr]
+    for data in objects_data:
+        center = data[2]
+        mask = data[3]
+        _, depth = get_masked_depth(depth_map, mask)
+        cv2.putText(image, str(round(depth*10, 2))+'m', center, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
+    return image
+
+def generate_obj_pcd(depth_map, objects_data):
+    objs_pcd = []
+    pcd_generator = PointCloudGenerator()
+
+    for data in objects_data[:2]:
+        mask = data[3]
+        cls_clr = data[4]
+        masked_depth = depth_map*mask
+        # generating point cloud using masked depth
+        pcd = pcd_generator.generate_point_cloud(masked_depth)
+        objs_pcd.append((pcd, cls_clr))
+    return objs_pcd
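Note on the distance figure drawn by draw_depth_info: MiDaS predicts relative (inverse) depth and process_prediction normalizes it to [0, 1], so get_masked_depth returns a unitless value where larger means farther; the round(depth*10, 2) scaling maps it to a rough 0-10 m readout rather than true metric depth. A tiny illustration of that arithmetic (assumes utils.py is importable; the numbers are made up):

    import numpy as np
    from utils import get_masked_depth

    depth_map = np.array([[0.9, 0.9], [0.2, 0.2]])   # normalized inverse depth, nearer pixels ~0.9
    near_mask = np.array([[1, 1], [0, 0]])           # instance mask covering the near pixels
    _, rel_dist = get_masked_depth(depth_map, near_mask)
    print(round(rel_dist * 10, 2), 'm')              # -> 1.0 m, same scaling draw_depth_info uses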