#!/usr/bin/env python3
# Copyright 2023-2024 NXP
# SPDX-License-Identifier: MIT
import random
import time

import cv2
import numpy as np
import tensorflow as tf

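# Fix the RNG seed so the per-class box colors drawn below are reproducible.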
random.seed(42)
OBJECT_DETECTOR_TFLITE = 'yolov4-tiny_416_quant.tflite'
LABELS_FILE = 'coco-labels-2014_2017.txt'
IMAGE_FILENAME = 'example_input.jpg'
SCORE_THRESHOLD = 0.20
NMS_IOU_THRESHOLD = 0.5
INFERENCE_IMG_SIZE = 416
MAX_DETS = 100
ANCHORS = [[[81, 82], [135, 169], [344, 319]], [[23, 27], [37, 58], [81, 82]]]
SIGMOID_FACTOR = [1.05, 1.05]
NUM_ANCHORS = 3
STRIDES = [32, 16]
GRID_SIZES = [int(INFERENCE_IMG_SIZE / s) for s in STRIDES]
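# ANCHORS[0] pairs with stride 32 (13x13 grid, larger objects) and ANCHORS[1]
# with stride 16 (26x26 grid, smaller objects); this is assumed to match the
# order of the two output tensors returned by run_inference(). SIGMOID_FACTOR
# applies a variant of the YOLOv4 "scale_x_y" trick when decoding box centers.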
with open(LABELS_FILE, 'r') as f:
    COCO_CLASSES = [line.strip() for line in f]
interpreter = tf.lite.Interpreter(OBJECT_DETECTOR_TFLITE)
interpreter.allocate_tensors()
def gen_box_colors():
    colors = []
    for _ in range(len(COCO_CLASSES)):
        r = random.randint(100, 255)
        g = random.randint(100, 255)
        b = random.randint(100, 255)
        colors.append((r, g, b))
    return colors
BOX_COLORS = gen_box_colors()

def load_image(filename):
    orig_image = cv2.imread(filename, cv2.IMREAD_COLOR)
    if orig_image is None:
        raise FileNotFoundError(f"Could not read image '{filename}'")
    image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, (INFERENCE_IMG_SIZE, INFERENCE_IMG_SIZE))
    image = np.expand_dims(image, axis=0)
    image = image / 255.0
    return orig_image, image
def np_sigmoid(x):
    return 1 / (1 + np.exp(-x))
def reciprocal_sigmoid(x):
    return -np.log(1 / x - 1)
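# reciprocal_sigmoid is the logit function (the inverse of the sigmoid).
# decode_boxes_prediction() below compares raw confidences against
# logit(SCORE_THRESHOLD) ~= -1.386 for SCORE_THRESHOLD = 0.20, which is
# equivalent to sigmoid(confidence) > SCORE_THRESHOLD but avoids applying
# the sigmoid to the whole feature map.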
def decode_boxes_prediction(yolo_output):
    # Each output level represents a grid of predictions.
    # The first output level is a 13x13 grid and the second 26x26
    # (INFERENCE_IMG_SIZE divided by the strides 32 and 16, respectively).
    # Each cell of each grid is assigned 3 anchor bounding boxes, and the
    # bounding box predictions are regressed relative to these anchors.
    # Thus, the model predicts 3 bounding boxes per cell per output level.
    # The output is structured as follows:
    # For each cell [[x, y, w, h, conf, cl_0, cl_1, ..., cl_79],  # anchor 1
    #                [x, y, w, h, conf, cl_0, cl_1, ..., cl_79],  # anchor 2
    #                [x, y, w, h, conf, cl_0, cl_1, ..., cl_79]]  # anchor 3
    # Hence there are 85 values per anchor box (4 box coordinates, 1
    # objectness score and 80 class scores), i.e. 255 values per cell.
    # The decoding of the output bounding boxes is described in Figure 2 of
    # the YOLOv3 paper: https://arxiv.org/pdf/1804.02767.pdf
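    # For example, at the 13x13 level the reshape below flattens the output
    # to (3 * 13 * 13, 85) = (507, 85): one row per anchor box. A row in
    # cell (cx, cy) with anchor (aw, ah) decodes to
    #   center = (sigmoid(t_xy * factor) - 0.5 * (factor - 1) + (cx, cy)) * stride
    #   size   = exp(t_wh) * (aw, ah)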
    boxes_list = []
    scores_list = []
    classes_list = []
    # the score threshold is constant, so convert it to logit space only once
    logit_threshold = reciprocal_sigmoid(SCORE_THRESHOLD)
    for idx, feats in enumerate(yolo_output):
        features = np.reshape(feats, (NUM_ANCHORS * GRID_SIZES[idx] ** 2, 85))
        anchor = np.array(ANCHORS[idx])
        factor = SIGMOID_FACTOR[idx]
        grid_size = GRID_SIZES[idx]
        stride = STRIDES[idx]
        cell_confidence = features[..., 4]
        over_threshold_list = np.where(cell_confidence > logit_threshold)
        if over_threshold_list[0].size > 0:
            indices = np.array(over_threshold_list[0])
            box_positions = np.floor_divide(indices, NUM_ANCHORS)
            list_xy = np.array(np.divmod(box_positions, grid_size)).T
            list_xy = list_xy[..., ::-1]
            boxes_xy = np.reshape(list_xy, (int(list_xy.size / 2), 2))
            outxy = features[indices, :2]
            # boxes center coordinates
            centers = np_sigmoid(outxy * factor) - 0.5 * (factor - 1)
            centers += boxes_xy
            centers *= stride
            # boxes width and height
            width_height = np.exp(features[indices, 2:4])
            width_height *= anchor[indices % NUM_ANCHORS]
            # convert from (center, size) to (x_min, y_min, x_max, y_max)
            boxes_list.append(np.stack([centers[:, 0] - width_height[:, 0] / 2,
                                        centers[:, 1] - width_height[:, 1] / 2,
                                        centers[:, 0] + width_height[:, 0] / 2,
                                        centers[:, 1] + width_height[:, 1] / 2],
                                       axis=1))
            # confidence that the cell contains an object
            scores_list.append(np_sigmoid(features[indices, 4:5]))
            # class with the highest probability in this cell
            classes_list.append(np.argmax(features[indices, 5:], axis=1))
    if len(boxes_list) > 0:
        boxes = np.concatenate(boxes_list, axis=0)
        scores = np.concatenate(scores_list, axis=0)[:, 0]
        classes = np.concatenate(classes_list, axis=0)
        return boxes, scores, classes
    else:
        return np.zeros((0, 4)), np.zeros(0), np.zeros(0)

def decode_output(yolo_outputs,
                  score_threshold=SCORE_THRESHOLD,
                  iou_threshold=NMS_IOU_THRESHOLD):
    '''
    Decode the raw YOLOv4-tiny output into boxes, scores and classes
    expressed in the inference-size coordinate space (416x416).
    '''
    boxes, scores, classes = decode_boxes_prediction(yolo_outputs)
    # apply non-maximum suppression from TensorFlow
    inds = tf.image.non_max_suppression(boxes, scores, MAX_DETS,
                                        score_threshold=score_threshold,
                                        iou_threshold=iou_threshold)
    # keep only the selected boxes
    boxes = tf.gather(boxes, inds)
    scores = tf.gather(scores, inds)
    classes = tf.gather(classes, inds)
    return scores, boxes, classes
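# Note: this NMS is class-agnostic, so strongly overlapping boxes of different
# classes suppress each other; tf.image.combined_non_max_suppression would be
# one option for per-class NMS instead.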

def run_inference(interpreter, image):
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    # quantize the [0, 1] float input to the model's int8 input representation
    input_scale, input_zero_point = input_details[0]["quantization"]
    image = image / input_scale + input_zero_point
    image = image.astype(np.int8)
    interpreter.set_tensor(input_details[0]['index'], image)
    interpreter.invoke()
    boxes = interpreter.get_tensor(output_details[0]['index'])
    boxes2 = interpreter.get_tensor(output_details[1]['index'])
    return [boxes, boxes2]
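# Note: decode_boxes_prediction() assumes both output tensors are already
# dequantized float logits. If the model produced raw int8 outputs, they
# would first need rescaling with the output quantization parameters from
# output_details, e.g. (raw - zero_point) * scale.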

if __name__ == "__main__":
    orig_image, processed_image = load_image(IMAGE_FILENAME)

    start = time.time()
    yolo_output = run_inference(interpreter, processed_image)
    end = time.time()

    scores, boxes, classes = decode_output(yolo_output)

    # rescale boxes from the 416x416 inference size to the original image size
    shp = orig_image.shape
    boxes = boxes.numpy()
    boxes /= INFERENCE_IMG_SIZE
    boxes *= np.array([shp[1], shp[0], shp[1], shp[0]])
    boxes = boxes.astype(np.int32)

    print(f"Inference time: {(end - start) * 1000:.1f} ms")
    print("Detected", boxes.shape[0], "object(s)")
    print("Box coordinates:")
    for i in range(boxes.shape[0]):
        box = boxes[i, :]
        print(box, end=" ")
        class_id = int(classes[i].numpy())
        class_name = COCO_CLASSES[class_id]
        score = float(scores[i].numpy())
        color = BOX_COLORS[class_id]
        print("class", class_name, end=" ")
        print("score", score)
        cv2.rectangle(orig_image, (box[0], box[1]), (box[2], box[3]),
                      color, 3)
        cv2.putText(orig_image, f"{class_name} {score:.2f}",
                    (box[0], box[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

    cv2.imwrite('example_output.jpg', orig_image)
    cv2.imshow('YOLOv4-tiny detections', orig_image)
    cv2.waitKey()
    cv2.destroyAllWindows()
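    # cv2.imshow() requires a display; on a headless target, remove the
    # imshow/waitKey calls above and inspect the saved example_output.jpg.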