vaishanthr committed
Commit 661e202
Parent: 32d9fed

added distance estimation feature
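In short, the feature estimates a per-object distance by masking the MiDaS depth map with each YOLOv8 segmentation mask and writing the mean relative depth at the object's centre (see utils.get_masked_depth and utils.draw_depth_info below). A minimal sketch of the idea, using plain NumPy arrays rather than the project's classes:

    import numpy as np

    # depth_map: HxW relative depth in [0, 1] (as returned by make_prediction below)
    # mask:      HxW binary segmentation mask for one detected object
    def object_distance(depth_map: np.ndarray, mask: np.ndarray) -> float:
        masked = depth_map * mask              # keep depth only inside the object
        vals = masked[masked > 0]              # ignore background pixels
        return 1.0 - float(np.mean(vals))      # invert: larger value = farther away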

app.py CHANGED
@@ -4,18 +4,30 @@ import gradio as gr
 import numpy as np
 import os
 import torch
+import utils
 
 from image_segmenter import ImageSegmenter
 from monocular_depth_estimator import MonocularDepthEstimator
+from point_cloud_generator import display_pcd
 
 # params
 CANCEL_PROCESSING = False
 
-img_seg = ImageSegmenter(model_type='n')
-depth_estimator = MonocularDepthEstimator(side_by_side=False)
+img_seg = ImageSegmenter(model_type="yolov8s-seg")
+depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")
 
 def process_image(image):
-    return img_seg.predict(image), depth_estimator.make_prediction(image)
+    image = utils.resize(image)
+    image_segmentation, objects_data = img_seg.predict(image)
+    depthmap, depth_colormap = depth_estimator.make_prediction(image)
+    dist_image = utils.draw_depth_info(image, depthmap, objects_data)
+    return image_segmentation, depth_colormap, dist_image
+
+def test_process_img(image):
+    image = utils.resize(image)
+    image_segmentation, objects_data = img_seg.predict(image)
+    depthmap, depth_colormap = depth_estimator.make_prediction(image)
+    return image_segmentation, objects_data, depthmap, depth_colormap
 
 def process_video(vid_path=None):
     vid_cap = cv2.VideoCapture(vid_path)
@@ -23,7 +35,11 @@ def process_video(vid_path=None):
         ret, frame = vid_cap.read()
         if ret:
             print("making predictions ....")
-            yield cv2.cvtColor(img_seg.predict(frame), cv2.COLOR_BGR2RGB), depth_estimator.make_prediction(frame)
+            frame = utils.resize(frame)
+            image_segmentation, objects_data = img_seg.predict(frame)
+            depthmap, depth_colormap = depth_estimator.make_prediction(frame)
+            dist_image = utils.draw_depth_info(frame, depthmap, objects_data)
+            yield cv2.cvtColor(image_segmentation, cv2.COLOR_BGR2RGB), depth_colormap, dist_image
 
     return None
 
@@ -35,13 +51,39 @@ def update_segmentation_options(options):
 def update_confidence_threshold(thres_val):
     img_seg.confidence_threshold = thres_val/100
 
+def model_selector(model_type):
+
+    if "Small - Better performance and less accuracy" == model_type:
+        midas_model, yolo_model = "midas_v21_small_256", "yolov8s-seg"
+    elif "Medium - Balanced performance and accuracy" == model_type:
+        midas_model, yolo_model = "dpt_hybrid_384", "yolov8m-seg"
+    elif "Large - Slow performance and high accuracy" == model_type:
+        midas_model, yolo_model = "dpt_large_384", "yolov8l-seg"
+    else:
+        midas_model, yolo_model = "midas_v21_small_256", "yolov8s-seg"
+
+    img_seg = ImageSegmenter(model_type=yolo_model)
+    depth_estimator = MonocularDepthEstimator(model_type=midas_model)
+
 def cancel():
     CANCEL_PROCESSING = True
 
 if __name__ == "__main__":
+
+    # testing
     # img_1 = cv2.imread("assets/images/bus.jpg")
-    # pred_img = image_segmentation(img_1)
-    # cv2.imshow("output", pred_img)
+    # img_1 = utils.resize(img_1)
+
+    # image_segmentation, objects_data, depthmap, depth_colormap = test_process_img(img_1)
+    # final_image = utils.draw_depth_info(image_segmentation, depthmap, objects_data)
+    # objs_pcd = utils.generate_obj_pcd(depthmap, objects_data[2][3])
+    # # print(objs_pcd[0][0])
+    # # display_pcd(objs_pcd, use_matplotlib=False)
+
+    # cv2.imshow("Segmentation", image_segmentation)
+    # cv2.imshow("Depth", depthmap*objects_data[2][3])
+    # cv2.imshow("Final", final_image)
+
     # cv2.waitKey(0)
     # cv2.destroyAllWindows()
 
@@ -60,6 +102,12 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=1):
                 img_input = gr.Image()
+                model_type_img = gr.Dropdown(
+                    ["Small - Better performance and less accuracy",
+                     "Medium - Balanced performance and accuracy",
+                     "Large - Slow performance and high accuracy"],
+                    label="Model Type", value="Small - Better performance and less accuracy",
+                    info="Select the inference model before running predictions!")
                 options_checkbox_img = gr.CheckboxGroup(["Show Boundary Box", "Show Segmentation Region", "Show Segmentation Boundary"], label="Options")
                 conf_thres_img = gr.Slider(1, 100, value=60, label="Confidence Threshold", info="Choose the threshold above which objects should be detected")
                 submit_btn_img = gr.Button(value="Predict")
@@ -68,6 +116,10 @@ if __name__ == "__main__":
             with gr.Row():
                 segmentation_img_output = gr.Image(height=300, label="Segmentation")
                 depth_img_output = gr.Image(height=300, label="Depth Estimation")
+
+            with gr.Row():
+                dist_img_output = gr.Image(height=300, label="Distance")
+                pcd_img_output = gr.Image(height=300, label="Point Cloud")
 
             gr.Markdown("## Sample Images")
             gr.Examples(
@@ -82,6 +134,13 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=1):
                 vid_input = gr.Video()
+                model_type_vid = gr.Dropdown(
+                    ["Small - Better performance and less accuracy",
+                     "Medium - Balanced performance and accuracy",
+                     "Large - Slow performance and high accuracy"],
+                    label="Model Type", value="Small - Better performance and less accuracy",
+                    info="Select the inference model before running predictions!"),
+
                 options_checkbox_vid = gr.CheckboxGroup(["Show Boundary Box", "Show Segmentation Region", "Show Segmentation Boundary"], label="Options")
                 conf_thres_vid = gr.Slider(1, 100, value=60, label="Confidence Threshold", info="Choose the threshold above which objects should be detected")
                 with gr.Row():
@@ -92,6 +151,10 @@ if __name__ == "__main__":
             with gr.Row():
                 segmentation_vid_output = gr.Image(height=400, label="Segmentation")
                 depth_vid_output = gr.Image(height=400, label="Depth Estimation")
+
+            with gr.Row():
+                dist_vid_output = gr.Image(height=300, label="Distance")
+                pcd_vid_output = gr.Image(height=300, label="Point Cloud")
 
             gr.Markdown("## Sample Videos")
             gr.Examples(
@@ -102,15 +165,17 @@ if __name__ == "__main__":
         )
 
         # image tab logic
-        submit_btn_img.click(process_image, inputs=img_input, outputs=[segmentation_img_output, depth_img_output])
+        submit_btn_img.click(process_image, inputs=img_input, outputs=[segmentation_img_output, depth_img_output, dist_img_output])
         options_checkbox_img.change(update_segmentation_options, options_checkbox_img, [])
         conf_thres_img.change(update_confidence_threshold, conf_thres_img, [])
+        model_type_img.change(model_selector, model_type_img, [])
 
         # video tab logic
-        submit_btn_vid.click(process_video, inputs=vid_input, outputs=[segmentation_vid_output, depth_vid_output])
+        submit_btn_vid.click(process_video, inputs=vid_input, outputs=[segmentation_vid_output, depth_vid_output, dist_vid_output])
        cancel_btn.click(cancel, inputs=[], outputs=[])
        options_checkbox_vid.change(update_segmentation_options, options_checkbox_vid, [])
        conf_thres_vid.change(update_confidence_threshold, conf_thres_vid, [])
+
 
 
    my_app.queue(concurrency_count=5, max_size=20).launch()
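Outside Gradio, the reworked image path can be exercised roughly as process_image does (a sketch; it assumes the modules above are importable and reuses the assets/images/bus.jpg path from the commented test code):

    import cv2
    import utils
    from image_segmenter import ImageSegmenter
    from monocular_depth_estimator import MonocularDepthEstimator

    img_seg = ImageSegmenter(model_type="yolov8s-seg")
    depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")

    frame = utils.resize(cv2.imread("assets/images/bus.jpg"))
    segmented, objects_data = img_seg.predict(frame)                   # annotated frame + per-object records
    depthmap, depth_colormap = depth_estimator.make_prediction(frame)  # relative depth + inferno colormap
    dist_image = utils.draw_depth_info(frame, depthmap, objects_data)  # distances drawn at object centres
    cv2.imwrite("distance.png", dist_image)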
image_segmenter.py CHANGED
@@ -6,15 +6,15 @@ import random
 import torch
 
 class ImageSegmenter:
-    def __init__(self, model_type="n") -> None:
+    def __init__(self, model_type="yolov8s-seg") -> None:
 
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model = YOLO('models/yolov8'+ model_type +'-seg.pt')
+        self.model = YOLO('models/'+ model_type +'.pt')
         self.model.to(self.device)
 
-        self.is_show_bounding_boxes = False
+        self.is_show_bounding_boxes = True
         self.is_show_segmentation_boundary = False
-        self.is_show_segmentation = True
+        self.is_show_segmentation = False
         self.confidence_threshold = 0.5
         self.cls_clr = {}
 
@@ -22,6 +22,9 @@ class ImageSegmenter:
         self.bb_thickness = 2
         self.bb_clr = (255, 0, 0)
 
+        # variables
+        self.masks = {}
+
 
     def get_cls_clr(self, cls_id):
         if cls_id in self.cls_clr:
@@ -34,9 +37,10 @@ class ImageSegmenter:
            self.cls_clr[cls_id] = (r, g, b)
            return (r, g, b)
 
-    def predict(self, image):
-        # resizing the image for faster prediction
-        image = cv2.resize(image, (480, 640))
+    def predict(self, image):
+        # params
+        objects_data = []
+        image = image.copy()
         predictions = self.model.predict(image)
 
         cls_ids = predictions[0].boxes.cls.cpu().numpy()
@@ -52,39 +56,49 @@ class ImageSegmenter:
         for id, cls in enumerate(cls_ids):
             cls_clr = self.get_cls_clr(cls)
 
-            # draw bounding box with class name and score
-            if self.is_show_bounding_boxes and cls_conf[id] > self.confidence_threshold:
-                (x1, y1, x2, y2) = bounding_boxes[id]
-                cls_name = self.model.names[cls]
-                cls_confidence = cls_conf[id]
-                disp_str = cls_name +' '+ str(round(cls_confidence, 2))
-                cv2.rectangle(image, (x1, y1), (x2, y2), cls_clr, self.bb_thickness)
-                cv2.rectangle(image, (x1, y1), (x1+(len(disp_str)*18), y1+45), cls_clr, -1)
-                cv2.putText(image, disp_str, (x1+10, y1+30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
-
-
-            # draw segmentation boundary
-            if len(seg_mask_boundary) and self.is_show_segmentation_boundary and cls_conf[id] > self.confidence_threshold:
-                cv2.polylines(image, [np.array(seg_mask_boundary[id], dtype=np.int32)], isClosed=True, color=cls_clr, thickness=2)
-
             # draw filled segmentation region
-            if seg_mask.any() and self.is_show_segmentation and cls_conf[id] > self.confidence_threshold:
-                alpha = 0.8
+            if seg_mask.any() and cls_conf[id] > self.confidence_threshold:
+
+                self.masks[id] = seg_mask[id]
+
+                if self.is_show_segmentation:
+                    alpha = 0.8
 
-                # converting the mask from 1 channel to 3 channels
-                colored_mask = np.expand_dims(seg_mask[id], 0).repeat(3, axis=0)
-                colored_mask = np.moveaxis(colored_mask, 0, -1)
+                    # converting the mask from 1 channel to 3 channels
+                    colored_mask = np.expand_dims(seg_mask[id], 0).repeat(3, axis=0)
+                    colored_mask = np.moveaxis(colored_mask, 0, -1)
 
-                # Resize the mask to match the image size, if necessary
-                if image.shape[:2] != seg_mask[id].shape[:2]:
-                    colored_mask = cv2.resize(colored_mask, (image.shape[1], image.shape[0]))
+                    # Resize the mask to match the image size, if necessary
+                    if image.shape[:2] != seg_mask[id].shape[:2]:
+                        colored_mask = cv2.resize(colored_mask, (image.shape[1], image.shape[0]))
 
-                # filling the mased area with class color
-                masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=cls_clr)
-                image_overlay = masked.filled()
-                image = cv2.addWeighted(image, 1 - alpha, image_overlay, alpha, 0)
+                    # filling the mased area with class color
+                    masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=cls_clr)
+                    image_overlay = masked.filled()
+                    image = cv2.addWeighted(image, 1 - alpha, image_overlay, alpha, 0)
+
+            # draw bounding box with class name and score
+            if self.is_show_bounding_boxes and cls_conf[id] > self.confidence_threshold:
+                (x1, y1, x2, y2) = bounding_boxes[id]
+                cls_name = self.model.names[cls]
+                cls_confidence = cls_conf[id]
+                disp_str = cls_name +' '+ str(round(cls_confidence, 2))
+                cv2.rectangle(image, (x1, y1), (x2, y2), cls_clr, self.bb_thickness)
+                cv2.rectangle(image, (x1, y1), (x1+(len(disp_str)*9), y1+15), cls_clr, -1)
+                cv2.putText(image, disp_str, (x1+5, y1+10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+
+            # draw segmentation boundary
+            if len(seg_mask_boundary) and self.is_show_segmentation_boundary and cls_conf[id] > self.confidence_threshold:
+                cv2.polylines(image, [np.array(seg_mask_boundary[id], dtype=np.int32)], isClosed=True, color=cls_clr, thickness=2)
+
+
+            # object variables
+            (x1, y1, x2, y2) = bounding_boxes[id]
+            center = x1+(x2-x1)//2, y1+(y2-y1)//2
+            objects_data.append([cls, self.model.names[cls], center, self.masks[id], cls_clr])
 
-        return image
+        return image, objects_data
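predict() now returns the annotated frame together with a list of per-object records in the order [cls_id, cls_name, center, mask, color], which is what utils.draw_depth_info expects. A short sketch of consuming it (hypothetical input image; any of the bundled *-seg checkpoints works):

    import cv2
    from image_segmenter import ImageSegmenter

    img_seg = ImageSegmenter(model_type="yolov8n-seg")
    frame = cv2.imread("assets/images/bus.jpg")
    annotated, objects_data = img_seg.predict(frame)

    for cls_id, cls_name, center, mask, color in objects_data:
        print(cls_name, "centre:", center, "mask pixels:", int(mask.sum()))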
models/dpt_hybrid_384.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:501f0c75b3bca7daec6b3682c5054c09b366765aef6fa3a09d03a5cb4b230853
+size 492757791

models/dpt_swin2_large_384.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8820c77a09e6b1d4bf2bcf71d1f52509239aa07c5f51709d8eee95c8775d5ec8
+size 880390791

models/midas_v21_small_256.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70d6b9c891758c67f974a6097fb0c608c7ee67fb81ac3e5588847d5596d56fca
+size 85761505

models/midas_v21_small_256.txt ADDED
@@ -0,0 +1,3 @@
+Insert midas_v21_small_256.pt here
+
+https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt

models/yolov8m-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9fc740ca0824e14b44681d491dc601efa664ec6ecea9a870acf876053826448
+size 54899779

models/yolov8n-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d39e867b2c3a5dbc1aa764411544b475cb14727bf6af1ec46c238f8bb1351ab9
+size 7054355

models/yolov8s-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c035c6c5f9c48ee962518ef648854c2cc5e60fa404fac443e17d306fdda16543
+size 23897299
monocular_depth_estimator.py CHANGED
@@ -6,15 +6,23 @@ from midas.model_loader import default_models, load_model
 import os
 import urllib.request
 
+MODEL_FILE_URL = {
+    "midas_v21_small_256" : "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
+    "dpt_hybrid_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
+    "dpt_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt",
+    "dpt_swin2_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
+    "dpt_beit_large_512" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
+}
+
 class MonocularDepthEstimator:
     def __init__(self,
-                 model_type="midas_v21_small_256",
-                 model_weights_path="models/midas_v21_small_256.pt",
-                 optimize=False,
-                 side_by_side=True,
-                 height=None,
-                 square=False,
-                 grayscale=False):
+                 model_type="midas_v21_small_256",
+                 model_weights_path="models/",
+                 optimize=False,
+                 side_by_side=False,
+                 height=None,
+                 square=False,
+                 grayscale=False):
 
         # model type
         # MiDaS 3.1:
@@ -40,16 +48,15 @@ class MonocularDepthEstimator:
         # select device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         print("Running inference on : %s" % self.device)
-        model_file_url = "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt"
 
         # loading model
-        if not os.path.exists(model_weights_path):
+        if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
             # Download the model file
-            urllib.request.urlretrieve(model_file_url, model_weights_path)
+            urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")
 
-        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path,
+        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path+model_type+".pt",
                                                                         model_type, optimize, height, square)
         print("Net width and height: ", (self.net_w, self.net_h))
 
@@ -79,7 +86,7 @@ class MonocularDepthEstimator:
 
         return prediction
 
-    def process_prediction(self, original_img, depth_img, is_grayscale=False, side_by_side=False):
+    def process_prediction(self, depth_map):
         """
         Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map
         for better visibility.
@@ -92,33 +99,30 @@ class MonocularDepthEstimator:
         """
 
         # normalizing depth image
-        depth_min = depth_img.min()
-        depth_max = depth_img.max()
-        normalized_depth = 255 * (depth_img - depth_min) / (depth_max - depth_min)
-        normalized_depth *= 3
-
-        depth_side = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
-        if not is_grayscale:
-            depth_side = cv2.applyColorMap(np.uint8(depth_side), cv2.COLORMAP_INFERNO)
-
-        if side_by_side:
-            return np.concatenate((original_img, depth_side), axis=1)/255
+        depth_min = depth_map.min()
+        depth_max = depth_map.max()
+        normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
+
+        # normalized_depth *= 3
+        # grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
+        grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
+        depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)
 
-        return depth_side/255
+        return normalized_depth/255, depth_colormap/255
 
     def make_prediction(self, image):
+        image = image.copy()
         with torch.no_grad():
             original_image_rgb = np.flip(image, 2)  # in [0, 255] (flip required to get RGB)
             # resizing the image to feed to the model
             image_tranformed = self.transform({"image": original_image_rgb/255})["image"]
 
             # monocular depth prediction
-            prediction = self.predict(image_tranformed, self.model, target_size=original_image_rgb.shape[1::-1])
-            original_image_bgr = np.flip(original_image_rgb, 2) if self.side_by_side else None
+            pred = self.predict(image_tranformed, self.model, target_size=original_image_rgb.shape[1::-1])
 
             # process the model predictions
-            output = self.process_prediction(original_image_bgr, prediction, is_grayscale=self.is_grayscale, side_by_side=self.side_by_side)
-            return output
+            depthmap, depth_colormap = self.process_prediction(pred)
+            return depthmap, depth_colormap
 
     def run(self, input_path):
 
@@ -137,11 +141,11 @@ class MonocularDepthEstimator:
             ret, frame = cap.read()
 
             if ret == True:
-                output = self.make_prediction(frame)
+                _, depth_colormap = self.make_prediction(frame)
                 inference_end_time = time.time()
                 fps = round(1/(inference_end_time - inference_start_time))
-                cv2.putText(output, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
-                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', output)
+                cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
+                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)
 
                 # Press ESC on keyboard to exit
                 if cv2.waitKey(1) == 27: # Escape key
@@ -170,6 +174,6 @@ if __name__ == "__main__":
     torch.backends.cudnn.enabled = True
     torch.backends.cudnn.benchmark = True
 
-    depth_estimator = MonocularDepthEstimator(side_by_side=False)
+    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
     depth_estimator.run(INPUT_PATH)
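make_prediction() now returns the normalised depth map alongside an inferno-coloured visualisation, and missing weights are downloaded into models/ from MODEL_FILE_URL on first use. A short sketch, assuming the models/ directory is writable and a test image exists:

    import cv2
    from monocular_depth_estimator import MonocularDepthEstimator

    depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")
    frame = cv2.imread("assets/images/bus.jpg")
    depthmap, depth_colormap = depth_estimator.make_prediction(frame)
    print(depthmap.shape, float(depthmap.min()), float(depthmap.max()))  # HxW, scaled to [0, 1]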
point_cloud_generator.py CHANGED
@@ -49,7 +49,7 @@ class PointCloudGenerator:
 
         return pcd
 
-    def conver_to_point_cloud_v2(self, depth_img):
+    def conver_to_point_cloud(self, depth_img):
 
         # get depth resolution:
         height, width = depth_img.shape
@@ -68,65 +68,70 @@ class PointCloudGenerator:
 
         return pcd
 
-    def generate_point_cloud(self, image_path, vectorize=False):
-        depth_img = cv2.imread(image_path, 0)
-
-        print(f"Image resolution: {depth_img.shape}")
-        print(f"Data type: {depth_img.dtype}")
-        print(f"Min value: {np.min(depth_img)}")
-        print(f"Max value: {np.max(depth_img)}")
-
-        # normalizing depth image
-        depth_min = depth_img.min()
-        depth_max = depth_img.max()
-        normalized_depth = 255 * ((depth_img - depth_min) / (depth_max - depth_min))
-
-        depth_img = normalized_depth
-        print("After normalization: ")
-        print(f"Image resolution: {depth_img.shape}")
-        print(f"Data type: {depth_img.dtype}")
-        print(f"Min value: {np.min(depth_img)}")
-        print(f"Max value: {np.max(depth_img)}")
-
-        # convert depth to point cloud
-        if not vectorize:
-            self.pcd = self.conver_to_point_cloud_v1(depth_img)
-        if vectorize:
-            self.pcd = self.conver_to_point_cloud_v2(depth_img)
-
-        return self.pcd
-
-    def viz_point_cloud(self, use_matplotlib=False):
-
-        points = np.array(self.pcd)
-        skip = 200
+    def generate_point_cloud(self, depth_img, normalize=False):
+
+        if normalize:
+            # normalizing depth image
+            depth_min = depth_img.min()
+            depth_max = depth_img.max()
+            normalized_depth = 255 * ((depth_img - depth_min) / (depth_max - depth_min))
+            depth_img = normalized_depth
+
+        # convert depth to point cloud
+        # point_cloud = self.conver_to_point_cloud(depth_img)
+
+        depth_image = o3d.geometry.Image(depth_img)
+
+        # Create open3d camera intrinsic object
+        intrinsic_matrix = np.array([[self.fx_depth, 0, self.cx_depth], [0, self.fy_depth, self.cy_depth], [0, 0, 1]])
+        camera_intrinsic = o3d.camera.PinholeCameraIntrinsic()
+        # camera_intrinsic.intrinsic_matrix = intrinsic_matrix
+        camera_intrinsic.set_intrinsics(depth_image.width, depth_image.height, self.fx_depth, self.fy_depth, self.cx_depth, self.cy_depth)
+
+        # Create open3d point cloud from depth image
+        point_cloud = o3d.geometry.PointCloud.create_from_depth_image(depth_img, camera_intrinsic)
+
+        return point_cloud
+
+def display_pcd(pcd_data, use_matplotlib=True):
+
+    if use_matplotlib:
+        fig = plt.figure()
+        ax = fig.add_subplot(111, projection='3d')
+
+    for data, clr in pcd_data:
+        points = np.array(data)
+        skip = 5
         point_range = range(0, points.shape[0], skip) # skip points to prevent crash
 
-        if use_matplotlib:
-            fig = plt.figure()
-            ax = fig.add_subplot(111, projection='3d')
-            ax.scatter(points[point_range, 0], points[point_range, 1], points[point_range, 2], c='r', marker='o')
-            ax.set_xlabel('X Label')
-            ax.set_ylabel('Y Label')
-            ax.set_zlabel('Z Label')
-            plt.show()
+        if use_matplotlib:
+            ax.scatter(points[point_range, 0], points[point_range, 1], points[point_range, 2], c='r', marker='o')
 
         if not use_matplotlib:
-
             pcd_o3d = o3d.geometry.PointCloud() # create point cloud object
-            pcd_o3d.points = o3d.utility.Vector3dVector(pcd) # set pcd_np as the point cloud points
+            pcd_o3d.points = o3d.utility.Vector3dVector(points) # set pcd_np as the point cloud points
             # Visualize:
             o3d.visualization.draw_geometries([pcd_o3d])
 
+    if use_matplotlib:
+        ax.set_xlabel('X Label')
+        ax.set_ylabel('Y Label')
+        ax.set_zlabel('Z Label')
+        ax.view_init(elev=90, azim=0, roll=0)
+        plt.show()
+
+    if not use_matplotlib:
+        o3d.visualization.draw_geometries([pcd_o3d])
+
 if __name__ == "__main__":
-    input_image = "test/inputs/depth.png"
+    depth_img_path = "assets/images/depth_map_p1.png"
+    depth_img = cv2.imread(depth_img_path, 0)
+    depth_img = depth_img/255
     point_cloud_gen = PointCloudGenerator()
-    pcd = point_cloud_gen.generate_point_cloud(input_image)
-    point_cloud_gen.viz_point_cloud()
+    pcd = point_cloud_gen.generate_point_cloud(depth_img)
+    display_pcd([pcd], use_matplotlib=True)
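display_pcd is now a module-level helper (this is what app.py imports); it takes a list of (points, colour) pairs and plots them with matplotlib or Open3D. A quick sketch with a synthetic point set (the roll argument used by view_init assumes matplotlib 3.6 or newer):

    import numpy as np
    from point_cloud_generator import display_pcd

    points = np.random.rand(1000, 3)                           # synthetic Nx3 point set
    display_pcd([(points, (255, 0, 0))], use_matplotlib=True)  # each entry is a (points, colour) pair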
utils.py ADDED
@@ -0,0 +1,49 @@
+import cv2
+import numpy as np
+from point_cloud_generator import PointCloudGenerator
+
+# pcd_generator = PointCloudGenerator()
+
+def resize(image):
+    """
+    resize the input nd array
+    """
+    h, w = image.shape[:2]
+    if h > w:
+        return cv2.resize(image, (480, 640))
+    else:
+        return cv2.resize(image, (640, 480))
+
+def get_masked_depth(depth_map, mask):
+    masked_depth_map = depth_map*mask
+    pixel_depth_vals = masked_depth_map[masked_depth_map>0]
+    mean_depth = np.mean(pixel_depth_vals)
+    return masked_depth_map, 1-mean_depth
+
+def draw_depth_info(image, depth_map, objects_data):
+    image = image.copy()
+    # object data -> [cls_id, cls_name, cls_center, cls_mask, cls_clr]
+    for data in objects_data:
+        center = data[2]
+        mask = data[3]
+        _, depth = get_masked_depth(depth_map, mask)
+        cv2.putText(image, str(round(depth*10, 2))+'m', center, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
+    return image
+
+def generate_obj_pcd(depth_map, objects_data):
+    objs_pcd = []
+    pcd_generator = PointCloudGenerator()
+
+    for data in objects_data[:2]:
+        mask = data[3]
+        cls_clr = data[4]
+        masked_depth = depth_map*mask
+        # generating point cloud using masked depth
+        pcd = pcd_generator.generate_point_cloud(masked_depth)
+        objs_pcd.append((pcd, cls_clr))
+    return objs_pcd
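The distance text drawn by draw_depth_info comes from get_masked_depth, which averages the relative depth inside the mask and flips it so that larger means farther; the label is that value times ten with an 'm' suffix, a display heuristic rather than metric depth. A self-contained check with synthetic arrays:

    import numpy as np
    from utils import get_masked_depth

    depth_map = np.full((4, 4), 0.8)       # pretend relative depth, 1.0 = nearest
    mask = np.zeros((4, 4))
    mask[1:3, 1:3] = 1                     # a 2x2 "object"
    _, rel_dist = get_masked_depth(depth_map, mask)
    print(rel_dist)                        # 1 - 0.8 = 0.2, rendered as "2.0m" by draw_depth_info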