import numpy as np
import cv2 as cv
import argparse

# Check OpenCV version
opencv_python_version = lambda str_version: tuple(map(int, (str_version.split("."))))
assert opencv_python_version(cv.__version__) >= opencv_python_version("4.10.0"), \
       "Please install latest opencv-python to run this demo: python3 -m pip install --upgrade opencv-python"

from nanodet import NanoDet

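# Valid backend/target pairs; the --backend_target argument selects one by its index below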
backend_target_pairs = [
    [cv.dnn.DNN_BACKEND_OPENCV, cv.dnn.DNN_TARGET_CPU],
    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA],
    [cv.dnn.DNN_BACKEND_CUDA, cv.dnn.DNN_TARGET_CUDA_FP16],
    [cv.dnn.DNN_BACKEND_TIMVX, cv.dnn.DNN_TARGET_NPU],
    [cv.dnn.DNN_BACKEND_CANN, cv.dnn.DNN_TARGET_NPU]
]

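# The 80 COCO class names; the class ids predicted by the model index into this tuple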
classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
           'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
           'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog',
           'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
           'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
           'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
           'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
           'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
           'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
           'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
           'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
           'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
           'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock',
           'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')


def letterbox(srcimg, target_size=(416, 416)):
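    """Resize srcimg so it fits inside target_size while keeping its aspect ratio,
    padding the remaining area with black borders. Returns the padded image and the
    letterbox parameters [top, left, newh, neww] used later to undo the mapping.
    """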
    img = srcimg.copy()

    top, left, newh, neww = 0, 0, target_size[0], target_size[1]
    if img.shape[0] != img.shape[1]:
        hw_scale = img.shape[0] / img.shape[1]
        if hw_scale > 1:
            # taller than wide: shrink the width, pad left/right
            newh, neww = target_size[0], int(target_size[1] / hw_scale)
            img = cv.resize(img, (neww, newh), interpolation=cv.INTER_AREA)
            left = int((target_size[1] - neww) * 0.5)
            img = cv.copyMakeBorder(img, 0, 0, left, target_size[1] - neww - left, cv.BORDER_CONSTANT, value=0)
        else:
            # wider than tall: shrink the height, pad top/bottom
            newh, neww = int(target_size[0] * hw_scale), target_size[1]
            img = cv.resize(img, (neww, newh), interpolation=cv.INTER_AREA)
            top = int((target_size[0] - newh) * 0.5)
            img = cv.copyMakeBorder(img, top, target_size[0] - newh - top, 0, 0, cv.BORDER_CONSTANT, value=0)
    else:
        img = cv.resize(img, target_size, interpolation=cv.INTER_AREA)

    letterbox_scale = [top, left, newh, neww]
    return img, letterbox_scale


def unletterbox(bbox, original_image_shape, letterbox_scale):
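    """Map a bounding box from letterboxed-image coordinates back to the coordinate
    system of the original image and return it as integer pixel coordinates.
    """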
    ret = bbox.copy()

    h, w = original_image_shape
    top, left, newh, neww = letterbox_scale

    if h == w:
        ratio = h / newh
        ret = ret * ratio
        # cast here as well so callers always receive integer coordinates
        return ret.astype(np.int32)

    ratioh, ratiow = h / newh, w / neww
    ret[0] = max((ret[0] - left) * ratiow, 0)
    ret[1] = max((ret[1] - top) * ratioh, 0)
    ret[2] = min((ret[2] - left) * ratiow, w)
    ret[3] = min((ret[3] - top) * ratioh, h)

    return ret.astype(np.int32)


def vis(preds, res_img, letterbox_scale, fps=None):
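    """Draw the detections (boxes, class names and confidences) on a copy of res_img,
    optionally stamping the measured FPS, and return the annotated image.
    """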
    ret = res_img.copy()

    # draw FPS
    if fps is not None:
        fps_label = "FPS: %.2f" % fps
        cv.putText(ret, fps_label, (10, 25), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # draw bboxes and labels
    for pred in preds:
        bbox = pred[:4]
        conf = pred[-2]
        classid = pred[-1].astype(np.int32)

        # map the box back to the original image coordinates before drawing
        xmin, ymin, xmax, ymax = unletterbox(bbox, ret.shape[:2], letterbox_scale)
        cv.rectangle(ret, (xmin, ymin), (xmax, ymax), (0, 255, 0), thickness=2)

        # class name and confidence
        label = "{:s}: {:.2f}".format(classes[classid], conf)
        cv.putText(ret, label, (xmin, ymin - 10), cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), thickness=2)

    return ret


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='NanoDet inference using OpenCV, a contribution by Sri Siddarth Chakaravarthy as part of GSoC 2022')
    parser.add_argument('--input', '-i', type=str,
                        help='Path to the input image. Omit to use the default camera.')
    parser.add_argument('--model', '-m', type=str,
                        default='object_detection_nanodet_2022nov.onnx', help="Path to the model")
    parser.add_argument('--backend_target', '-bt', type=int, default=0,
                        help='''Choose one of the backend-target pairs to run this demo:
                        {:d}: (default) OpenCV implementation + CPU,
                        {:d}: CUDA + GPU (CUDA),
                        {:d}: CUDA + GPU (CUDA FP16),
                        {:d}: TIM-VX + NPU,
                        {:d}: CANN + NPU
                        '''.format(*[x for x in range(len(backend_target_pairs))]))
    parser.add_argument('--confidence', default=0.35, type=float,
                        help='Class confidence threshold')
    parser.add_argument('--nms', default=0.6, type=float,
                        help='NMS IoU threshold')
    parser.add_argument('--save', '-s', action='store_true',
                        help='Specify to save results. This flag is invalid when using the camera.')
    parser.add_argument('--vis', '-v', action='store_true',
                        help='Specify to open a window for result visualization. This flag is invalid when using the camera.')
    args = parser.parse_args()

    # Map the chosen index to a DNN backend and target
    backend_id = backend_target_pairs[args.backend_target][0]
    target_id = backend_target_pairs[args.backend_target][1]

    # Instantiate NanoDet with the chosen model, thresholds, backend and target
    model = NanoDet(modelPath=args.model,
                    prob_threshold=args.confidence,
                    iou_threshold=args.nms,
                    backend_id=backend_id,
                    target_id=target_id)

    tm = cv.TickMeter()
    tm.reset()

    # If an input image is given, run once on it; otherwise run on the default camera
    if args.input is not None:
        image = cv.imread(args.input)
        input_blob = cv.cvtColor(image, cv.COLOR_BGR2RGB)

        # Letterbox transformation
        input_blob, letterbox_scale = letterbox(input_blob)

        # Inference
        tm.start()
        preds = model.infer(input_blob)
        tm.stop()
        print("Inference time: {:.2f} ms".format(tm.getTimeMilli()))

        img = vis(preds, image, letterbox_scale)

        if args.save:
            print('Results saved to result.jpg\n')
            cv.imwrite('result.jpg', img)

        if args.vis:
            cv.namedWindow(args.input, cv.WINDOW_AUTOSIZE)
            cv.imshow(args.input, img)
            cv.waitKey(0)
    else:
        print("Press any key to stop video capture")
        deviceId = 0
        cap = cv.VideoCapture(deviceId)

        while cv.waitKey(1) < 0:
            hasFrame, frame = cap.read()
            if not hasFrame:
                print('No frames grabbed!')
                break

            input_blob = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
            input_blob, letterbox_scale = letterbox(input_blob)

            # Inference
            tm.start()
            preds = model.infer(input_blob)
            tm.stop()

            img = vis(preds, frame, letterbox_scale, fps=tm.getFPS())

            cv.imshow("NanoDet Demo", img)

            tm.reset()
|