vaishanthr committed · Commit 661e202 · Parent(s): 32d9fed

added distance estimation feature

Browse files:
- app.py +73 -8
- image_segmenter.py +49 -35
- models/dpt_hybrid_384.pt +3 -0
- models/dpt_swin2_large_384.pt +3 -0
- models/midas_v21_small_256.pt +3 -0
- models/midas_v21_small_256.txt +3 -0
- models/yolov8m-seg.pt +3 -0
- models/yolov8n-seg.pt +3 -0
- models/yolov8s-seg.pt +3 -0
- monocular_depth_estimator.py +36 -32
- point_cloud_generator.py +46 -41
- utils.py +49 -0
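At a glance, the distance-estimation feature chains the existing pieces: resize the input, run YOLOv8 instance segmentation, run MiDaS monocular depth, then overlay a per-object distance estimate. A minimal sketch of that flow, assuming this repo's modules and the sample image already referenced in app.py (illustrative only, not part of the commit):

    import cv2
    import utils
    from image_segmenter import ImageSegmenter
    from monocular_depth_estimator import MonocularDepthEstimator

    img_seg = ImageSegmenter(model_type="yolov8s-seg")
    depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")

    image = utils.resize(cv2.imread("assets/images/bus.jpg"))           # BGR frame
    segmented, objects_data = img_seg.predict(image)                    # instance masks + metadata
    depthmap, depth_colormap = depth_estimator.make_prediction(image)   # relative depth in [0, 1]
    dist_image = utils.draw_depth_info(image, depthmap, objects_data)   # per-object distance overlay
    cv2.imwrite("distance_overlay.png", dist_image)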
app.py CHANGED

@@ -4,18 +4,30 @@ import gradio as gr
 import numpy as np
 import os
 import torch
+import utils
 
 from image_segmenter import ImageSegmenter
 from monocular_depth_estimator import MonocularDepthEstimator
+from point_cloud_generator import display_pcd
 
 # params
 CANCEL_PROCESSING = False
 
-img_seg = ImageSegmenter(model_type=
-depth_estimator = MonocularDepthEstimator(
+img_seg = ImageSegmenter(model_type="yolov8s-seg")
+depth_estimator = MonocularDepthEstimator(model_type="midas_v21_small_256")
 
 def process_image(image):
-
+    image = utils.resize(image)
+    image_segmentation, objects_data = img_seg.predict(image)
+    depthmap, depth_colormap = depth_estimator.make_prediction(image)
+    dist_image = utils.draw_depth_info(image, depthmap, objects_data)
+    return image_segmentation, depth_colormap, dist_image
+
+def test_process_img(image):
+    image = utils.resize(image)
+    image_segmentation, objects_data = img_seg.predict(image)
+    depthmap, depth_colormap = depth_estimator.make_prediction(image)
+    return image_segmentation, objects_data, depthmap, depth_colormap
 
 def process_video(vid_path=None):
     vid_cap = cv2.VideoCapture(vid_path)

@@ -23,7 +35,11 @@ def process_video(vid_path=None):
         ret, frame = vid_cap.read()
         if ret:
             print("making predictions ....")
-
+            frame = utils.resize(frame)
+            image_segmentation, objects_data = img_seg.predict(frame)
+            depthmap, depth_colormap = depth_estimator.make_prediction(frame)
+            dist_image = utils.draw_depth_info(frame, depthmap, objects_data)
+            yield cv2.cvtColor(image_segmentation, cv2.COLOR_BGR2RGB), depth_colormap, dist_image
 
     return None
 

@@ -35,13 +51,39 @@ def update_segmentation_options(options):
 def update_confidence_threshold(thres_val):
     img_seg.confidence_threshold = thres_val/100
 
+def model_selector(model_type):
+
+    if "Small - Better performance and less accuracy" == model_type:
+        midas_model, yolo_model = "midas_v21_small_256", "yolov8s-seg"
+    elif "Medium - Balanced performance and accuracy" == model_type:
+        midas_model, yolo_model = "dpt_hybrid_384", "yolov8m-seg"
+    elif "Large - Slow performance and high accuracy" == model_type:
+        midas_model, yolo_model = "dpt_large_384", "yolov8l-seg"
+    else:
+        midas_model, yolo_model = "midas_v21_small_256", "yolov8s-seg"
+
+    img_seg = ImageSegmenter(model_type=yolo_model)
+    depth_estimator = MonocularDepthEstimator(model_type=midas_model)
+
 def cancel():
     CANCEL_PROCESSING = True
 
 if __name__ == "__main__":
+
+    # testing
     # img_1 = cv2.imread("assets/images/bus.jpg")
-    #
-
+    # img_1 = utils.resize(img_1)
+
+    # image_segmentation, objects_data, depthmap, depth_colormap = test_process_img(img_1)
+    # final_image = utils.draw_depth_info(image_segmentation, depthmap, objects_data)
+    # objs_pcd = utils.generate_obj_pcd(depthmap, objects_data[2][3])
+    # # print(objs_pcd[0][0])
+    # # display_pcd(objs_pcd, use_matplotlib=False)
+
+    # cv2.imshow("Segmentation", image_segmentation)
+    # cv2.imshow("Depth", depthmap*objects_data[2][3])
+    # cv2.imshow("Final", final_image)
+
     # cv2.waitKey(0)
     # cv2.destroyAllWindows()
 

@@ -60,6 +102,12 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=1):
                 img_input = gr.Image()
+                model_type_img = gr.Dropdown(
+                    ["Small - Better performance and less accuracy",
+                     "Medium - Balanced performance and accuracy",
+                     "Large - Slow performance and high accuracy"],
+                    label="Model Type", value="Small - Better performance and less accuracy",
+                    info="Select the inference model before running predictions!")
                 options_checkbox_img = gr.CheckboxGroup(["Show Boundary Box", "Show Segmentation Region", "Show Segmentation Boundary"], label="Options")
                 conf_thres_img = gr.Slider(1, 100, value=60, label="Confidence Threshold", info="Choose the threshold above which objects should be detected")
                 submit_btn_img = gr.Button(value="Predict")

@@ -68,6 +116,10 @@ if __name__ == "__main__":
            with gr.Row():
                segmentation_img_output = gr.Image(height=300, label="Segmentation")
                depth_img_output = gr.Image(height=300, label="Depth Estimation")
+
+            with gr.Row():
+                dist_img_output = gr.Image(height=300, label="Distance")
+                pcd_img_output = gr.Image(height=300, label="Point Cloud")
 
            gr.Markdown("## Sample Images")
            gr.Examples(

@@ -82,6 +134,13 @@ if __name__ == "__main__":
         with gr.Row():
             with gr.Column(scale=1):
                 vid_input = gr.Video()
+                model_type_vid = gr.Dropdown(
+                    ["Small - Better performance and less accuracy",
+                     "Medium - Balanced performance and accuracy",
+                     "Large - Slow performance and high accuracy"],
+                    label="Model Type", value="Small - Better performance and less accuracy",
+                    info="Select the inference model before running predictions!"),
+
                 options_checkbox_vid = gr.CheckboxGroup(["Show Boundary Box", "Show Segmentation Region", "Show Segmentation Boundary"], label="Options")
                 conf_thres_vid = gr.Slider(1, 100, value=60, label="Confidence Threshold", info="Choose the threshold above which objects should be detected")
                 with gr.Row():

@@ -92,6 +151,10 @@ if __name__ == "__main__":
            with gr.Row():
                segmentation_vid_output = gr.Image(height=400, label="Segmentation")
                depth_vid_output = gr.Image(height=400, label="Depth Estimation")
+
+            with gr.Row():
+                dist_vid_output = gr.Image(height=300, label="Distance")
+                pcd_vid_output = gr.Image(height=300, label="Point Cloud")
 
            gr.Markdown("## Sample Videos")
            gr.Examples(

@@ -102,15 +165,17 @@ if __name__ == "__main__":
            )
 
        # image tab logic
-        submit_btn_img.click(process_image, inputs=img_input, outputs=[segmentation_img_output, depth_img_output])
+        submit_btn_img.click(process_image, inputs=img_input, outputs=[segmentation_img_output, depth_img_output, dist_img_output])
        options_checkbox_img.change(update_segmentation_options, options_checkbox_img, [])
        conf_thres_img.change(update_confidence_threshold, conf_thres_img, [])
+        model_type_img.change(model_selector, model_type_img, [])
 
        # video tab logic
-        submit_btn_vid.click(process_video, inputs=vid_input, outputs=[segmentation_vid_output, depth_vid_output])
+        submit_btn_vid.click(process_video, inputs=vid_input, outputs=[segmentation_vid_output, depth_vid_output, dist_vid_output])
        cancel_btn.click(cancel, inputs=[], outputs=[])
        options_checkbox_vid.change(update_segmentation_options, options_checkbox_vid, [])
        conf_thres_vid.change(update_confidence_threshold, conf_thres_vid, [])
+
 
 
    my_app.queue(concurrency_count=5, max_size=20).launch()
image_segmenter.py CHANGED

@@ -6,15 +6,15 @@ import random
 import torch
 
 class ImageSegmenter:
-    def __init__(self, model_type="
+    def __init__(self, model_type="yolov8s-seg") -> None:
 
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.model = YOLO('models/
+        self.model = YOLO('models/'+ model_type +'.pt')
         self.model.to(self.device)
 
-        self.is_show_bounding_boxes =
+        self.is_show_bounding_boxes = True
         self.is_show_segmentation_boundary = False
-        self.is_show_segmentation =
+        self.is_show_segmentation = False
         self.confidence_threshold = 0.5
         self.cls_clr = {}
 

@@ -22,6 +22,9 @@ class ImageSegmenter:
         self.bb_thickness = 2
         self.bb_clr = (255, 0, 0)
 
+        # variables
+        self.masks = {}
+
 
     def get_cls_clr(self, cls_id):
         if cls_id in self.cls_clr:

@@ -34,9 +37,10 @@
         self.cls_clr[cls_id] = (r, g, b)
         return (r, g, b)
 
-    def predict(self, image):
-        #
-
+    def predict(self, image):
+        # params
+        objects_data = []
+        image = image.copy()
         predictions = self.model.predict(image)
 
         cls_ids = predictions[0].boxes.cls.cpu().numpy()

@@ -52,39 +56,49 @@
         for id, cls in enumerate(cls_ids):
             cls_clr = self.get_cls_clr(cls)
 
-            # draw bounding box with class name and score
-            if self.is_show_bounding_boxes and cls_conf[id] > self.confidence_threshold:
-                (x1, y1, x2, y2) = bounding_boxes[id]
-                cls_name = self.model.names[cls]
-                cls_confidence = cls_conf[id]
-                disp_str = cls_name +' '+ str(round(cls_confidence, 2))
-                cv2.rectangle(image, (x1, y1), (x2, y2), cls_clr, self.bb_thickness)
-                cv2.rectangle(image, (x1, y1), (x1+(len(disp_str)*18), y1+45), cls_clr, -1)
-                cv2.putText(image, disp_str, (x1+10, y1+30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
-
-            # draw segmentation boundary
-            if len(seg_mask_boundary) and self.is_show_segmentation_boundary and cls_conf[id] > self.confidence_threshold:
-                cv2.polylines(image, [np.array(seg_mask_boundary[id], dtype=np.int32)], isClosed=True, color=cls_clr, thickness=2)
-
             # draw filled segmentation region
-            if seg_mask.any() and
-            return image
+            if seg_mask.any() and cls_conf[id] > self.confidence_threshold:
+
+                self.masks[id] = seg_mask[id]
+
+                if self.is_show_segmentation:
+                    alpha = 0.8
+
+                    # converting the mask from 1 channel to 3 channels
+                    colored_mask = np.expand_dims(seg_mask[id], 0).repeat(3, axis=0)
+                    colored_mask = np.moveaxis(colored_mask, 0, -1)
+
+                    # Resize the mask to match the image size, if necessary
+                    if image.shape[:2] != seg_mask[id].shape[:2]:
+                        colored_mask = cv2.resize(colored_mask, (image.shape[1], image.shape[0]))
+
+                    # filling the mased area with class color
+                    masked = np.ma.MaskedArray(image, mask=colored_mask, fill_value=cls_clr)
+                    image_overlay = masked.filled()
+                    image = cv2.addWeighted(image, 1 - alpha, image_overlay, alpha, 0)
+
+            # draw bounding box with class name and score
+            if self.is_show_bounding_boxes and cls_conf[id] > self.confidence_threshold:
+                (x1, y1, x2, y2) = bounding_boxes[id]
+                cls_name = self.model.names[cls]
+                cls_confidence = cls_conf[id]
+                disp_str = cls_name +' '+ str(round(cls_confidence, 2))
+                cv2.rectangle(image, (x1, y1), (x2, y2), cls_clr, self.bb_thickness)
+                cv2.rectangle(image, (x1, y1), (x1+(len(disp_str)*9), y1+15), cls_clr, -1)
+                cv2.putText(image, disp_str, (x1+5, y1+10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+
+            # draw segmentation boundary
+            if len(seg_mask_boundary) and self.is_show_segmentation_boundary and cls_conf[id] > self.confidence_threshold:
+                cv2.polylines(image, [np.array(seg_mask_boundary[id], dtype=np.int32)], isClosed=True, color=cls_clr, thickness=2)
+
+            # object variables
+            (x1, y1, x2, y2) = bounding_boxes[id]
+            center = x1+(x2-x1)//2, y1+(y2-y1)//2
+            objects_data.append([cls, self.model.names[cls], center, self.masks[id], cls_clr])
 
+        return image, objects_data
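predict() now returns the annotated image together with objects_data, one entry per detection laid out as [cls_id, cls_name, center, mask, color] (the layout utils.draw_depth_info expects). A short consumption sketch, assuming a local test image (illustrative, not part of the commit):

    import cv2
    from image_segmenter import ImageSegmenter

    seg = ImageSegmenter(model_type="yolov8s-seg")
    frame = cv2.imread("assets/images/bus.jpg")
    annotated, objects_data = seg.predict(frame)
    for cls_id, cls_name, center, mask, color in objects_data:
        print(cls_name, center, mask.shape)   # mask is the per-instance segmentation mask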
models/dpt_hybrid_384.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:501f0c75b3bca7daec6b3682c5054c09b366765aef6fa3a09d03a5cb4b230853
+size 492757791

models/dpt_swin2_large_384.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8820c77a09e6b1d4bf2bcf71d1f52509239aa07c5f51709d8eee95c8775d5ec8
+size 880390791

models/midas_v21_small_256.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70d6b9c891758c67f974a6097fb0c608c7ee67fb81ac3e5588847d5596d56fca
+size 85761505

models/midas_v21_small_256.txt ADDED
@@ -0,0 +1,3 @@
+Insert midas_v21_small_256.pt here
+
+https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt

models/yolov8m-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9fc740ca0824e14b44681d491dc601efa664ec6ecea9a870acf876053826448
+size 54899779

models/yolov8n-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d39e867b2c3a5dbc1aa764411544b475cb14727bf6af1ec46c238f8bb1351ab9
+size 7054355

models/yolov8s-seg.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c035c6c5f9c48ee962518ef648854c2cc5e60fa404fac443e17d306fdda16543
+size 23897299
monocular_depth_estimator.py CHANGED

@@ -6,15 +6,23 @@ from midas.model_loader import default_models, load_model
 import os
 import urllib.request
 
+MODEL_FILE_URL = {
+    "midas_v21_small_256" : "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt",
+    "dpt_hybrid_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt",
+    "dpt_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt",
+    "dpt_swin2_large_384" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt",
+    "dpt_beit_large_512" : "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
+}
+
 class MonocularDepthEstimator:
     def __init__(self,
-
+                 model_type="midas_v21_small_256",
+                 model_weights_path="models/",
+                 optimize=False,
+                 side_by_side=False,
+                 height=None,
+                 square=False,
+                 grayscale=False):
 
         # model type
         # MiDaS 3.1:

@@ -40,16 +48,15 @@
         # select device
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         print("Running inference on : %s" % self.device)
-        model_file_url = "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt"
 
         # loading model
-        if not os.path.exists(model_weights_path):
+        if not os.path.exists(model_weights_path+model_type+".pt"):
             print("Model file not found. Downloading...")
             # Download the model file
-            urllib.request.urlretrieve(
+            urllib.request.urlretrieve(MODEL_FILE_URL[model_type], model_weights_path+model_type+".pt")
             print("Model file downloaded successfully.")
 
-        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path,
+        self.model, self.transform, self.net_w, self.net_h = load_model(self.device, model_weights_path+model_type+".pt",
                                                                         model_type, optimize, height, square)
         print("Net width and height: ", (self.net_w, self.net_h))
 

@@ -79,7 +86,7 @@
 
         return prediction
 
-    def process_prediction(self,
+    def process_prediction(self, depth_map):
         """
         Take an RGB image and depth map and place them side by side. This includes a proper normalization of the depth map
         for better visibility.

@@ -92,33 +99,30 @@
         """
 
         # normalizing depth image
-        depth_min =
-        depth_max =
-        normalized_depth = 255 * (
-
-        if side_by_side:
-            return np.concatenate((original_img, depth_side), axis=1)/255
-
-        return
+        depth_min = depth_map.min()
+        depth_max = depth_map.max()
+        normalized_depth = 255 * (depth_map - depth_min) / (depth_max - depth_min)
+
+        # normalized_depth *= 3
+        # grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2) / 3
+        grayscale_depthmap = np.repeat(np.expand_dims(normalized_depth, 2), 3, axis=2)
+        depth_colormap = cv2.applyColorMap(np.uint8(grayscale_depthmap), cv2.COLORMAP_INFERNO)
+
+        return normalized_depth/255, depth_colormap/255
 
     def make_prediction(self, image):
+        image = image.copy()
         with torch.no_grad():
             original_image_rgb = np.flip(image, 2) # in [0, 255] (flip required to get RGB)
             # resizing the image to feed to the model
             image_tranformed = self.transform({"image": original_image_rgb/255})["image"]
 
             # monocular depth prediction
-
-            original_image_bgr = np.flip(original_image_rgb, 2) if self.side_by_side else None
+            pred = self.predict(image_tranformed, self.model, target_size=original_image_rgb.shape[1::-1])
 
             # process the model predictions
-
-            return
+            depthmap, depth_colormap = self.process_prediction(pred)
+            return depthmap, depth_colormap
 
     def run(self, input_path):
 

@@ -137,11 +141,11 @@
             ret, frame = cap.read()
 
             if ret == True:
-
+                _, depth_colormap = self.make_prediction(frame)
                 inference_end_time = time.time()
                 fps = round(1/(inference_end_time - inference_start_time))
-                cv2.putText(
-                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ',
+                cv2.putText(depth_colormap, f'FPS: {fps}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (10, 255, 100), 2)
+                cv2.imshow('MiDaS Depth Estimation - Press Escape to close window ', depth_colormap)
 
                 # Press ESC on keyboard to exit
                 if cv2.waitKey(1) == 27: # Escape key

@@ -170,6 +174,6 @@ if __name__ == "__main__":
     torch.backends.cudnn.enabled = True
     torch.backends.cudnn.benchmark = True
 
-    depth_estimator = MonocularDepthEstimator(
+    depth_estimator = MonocularDepthEstimator(model_type="dpt_hybrid_384")
     depth_estimator.run(INPUT_PATH)
point_cloud_generator.py CHANGED

@@ -49,7 +49,7 @@ class PointCloudGenerator:
 
         return pcd
 
-    def
+    def conver_to_point_cloud(self, depth_img):
 
         # get depth resolution:
         height, width = depth_img.shape

@@ -68,65 +68,70 @@
 
         return pcd
 
-    def generate_point_cloud(self,
-
-        depth_min = depth_img.min()
-        depth_max = depth_img.max()
-        normalized_depth = 255 * ((depth_img - depth_min) / (depth_max - depth_min))
-
-        print(f"Max value: {np.max(depth_img)}")
-
-        #
-        self.pcd = self.conver_to_point_cloud_v1(depth_img)
-        if vectorize:
-            self.pcd = self.conver_to_point_cloud_v2(depth_img)
-
-        return self.pcd
-
+    def generate_point_cloud(self, depth_img, normalize=False):
+
+        if normalize:
+            # normalizing depth image
+            depth_min = depth_img.min()
+            depth_max = depth_img.max()
+            normalized_depth = 255 * ((depth_img - depth_min) / (depth_max - depth_min))
+            depth_img = normalized_depth
+
+        # convert depth to point cloud
+        # point_cloud = self.conver_to_point_cloud(depth_img)
+
+        depth_image = o3d.geometry.Image(depth_img)
+
+        # Create open3d camera intrinsic object
+        intrinsic_matrix = np.array([[self.fx_depth, 0, self.cx_depth], [0, self.fy_depth, self.cy_depth], [0, 0, 1]])
+        camera_intrinsic = o3d.camera.PinholeCameraIntrinsic()
+        # camera_intrinsic.intrinsic_matrix = intrinsic_matrix
+        camera_intrinsic.set_intrinsics(depth_image.width, depth_image.height, self.fx_depth, self.fy_depth, self.cx_depth, self.cy_depth)
+
+        # Create open3d point cloud from depth image
+        point_cloud = o3d.geometry.PointCloud.create_from_depth_image(depth_img, camera_intrinsic)
+
+        return point_cloud
+
+def display_pcd(pcd_data, use_matplotlib=True):
+
+    if use_matplotlib:
+        fig = plt.figure()
+        ax = fig.add_subplot(111, projection='3d')
+
+    for data, clr in pcd_data:
+        points = np.array(data)
+        skip = 5
         point_range = range(0, points.shape[0], skip) # skip points to prevent crash
 
-        if use_matplotlib:
-            ax = fig.add_subplot(111, projection='3d')
-            ax.scatter(points[point_range, 0], points[point_range, 1], points[point_range, 2], c='r', marker='o')
-            ax.set_xlabel('X Label')
-            ax.set_ylabel('Y Label')
-            ax.set_zlabel('Z Label')
-            plt.show()
+        if use_matplotlib:
+            ax.scatter(points[point_range, 0], points[point_range, 1], points[point_range, 2], c='r', marker='o')
 
         if not use_matplotlib:
-
             pcd_o3d = o3d.geometry.PointCloud() # create point cloud object
-            pcd_o3d.points = o3d.utility.Vector3dVector(
+            pcd_o3d.points = o3d.utility.Vector3dVector(points) # set pcd_np as the point cloud points
             # Visualize:
             o3d.visualization.draw_geometries([pcd_o3d])
 
+    if use_matplotlib:
+        ax.set_xlabel('X Label')
+        ax.set_ylabel('Y Label')
+        ax.set_zlabel('Z Label')
+        ax.view_init(elev=90, azim=0, roll=0)
+        plt.show()
+
+    if not use_matplotlib:
+        o3d.visualization.draw_geometries([pcd_o3d])
+
 if __name__ == "__main__":
-
+    depth_img_path = "assets/images/depth_map_p1.png"
+    depth_img = cv2.imread(depth_img_path, 0)
+    depth_img = depth_img/255
     point_cloud_gen = PointCloudGenerator()
-    pcd = point_cloud_gen.generate_point_cloud(
-
+    pcd = point_cloud_gen.generate_point_cloud(depth_img)
+    display_pcd([pcd], use_matplotlib=True)
utils.py ADDED

@@ -0,0 +1,49 @@
+import cv2
+import numpy as np
+from point_cloud_generator import PointCloudGenerator
+
+# pcd_generator = PointCloudGenerator()
+
+def resize(image):
+    """
+    resize the input nd array
+    """
+    h, w = image.shape[:2]
+    if h > w:
+        return cv2.resize(image, (480, 640))
+    else:
+        return cv2.resize(image, (640, 480))
+
+def get_masked_depth(depth_map, mask):
+    masked_depth_map = depth_map*mask
+    pixel_depth_vals = masked_depth_map[masked_depth_map>0]
+    mean_depth = np.mean(pixel_depth_vals)
+    return masked_depth_map, 1-mean_depth
+
+def draw_depth_info(image, depth_map, objects_data):
+    image = image.copy()
+    # object data -> [cls_id, cls_name, cls_center, cls_mask, cls_clr]
+    for data in objects_data:
+        center = data[2]
+        mask = data[3]
+        _, depth = get_masked_depth(depth_map, mask)
+        cv2.putText(image, str(round(depth*10, 2))+'m', center, cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
+    return image
+
+def generate_obj_pcd(depth_map, objects_data):
+    objs_pcd = []
+    pcd_generator = PointCloudGenerator()
+
+    for data in objects_data[:2]:
+        mask = data[3]
+        cls_clr = data[4]
+        masked_depth = depth_map*mask
+        # generating point cloud using masked depth
+        pcd = pcd_generator.generate_point_cloud(masked_depth)
+        objs_pcd.append((pcd, cls_clr))
+    return objs_pcd
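Note on the distance figure drawn by draw_depth_info: MiDaS predicts relative (inverse) depth and process_prediction normalizes it to [0, 1], so get_masked_depth returns a unitless value where larger means farther; the round(depth*10, 2) scaling maps it to a rough 0-10 m readout rather than true metric depth. A tiny illustration of that arithmetic (assumes utils.py is importable; the numbers are made up):

    import numpy as np
    from utils import get_masked_depth

    depth_map = np.array([[0.9, 0.9], [0.2, 0.2]])   # normalized inverse depth, nearer pixels ~0.9
    near_mask = np.array([[1, 1], [0, 0]])           # instance mask covering the near pixels
    _, rel_dist = get_masked_depth(depth_map, near_mask)
    print(round(rel_dist * 10, 2), 'm')              # -> 1.0 m, same scaling draw_depth_info uses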