|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Post-processing operations on detected boxes.""" |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
|
|
from object_detection.core import box_list |
|
from object_detection.core import box_list_ops |
|
from object_detection.core import standard_fields as fields |
|
from object_detection.utils import shape_utils |
|
|
|
|
|
def multiclass_non_max_suppression(boxes,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   change_coordinate_frame=False,
                                   masks=None,
                                   boundaries=None,
                                   pad_to_max_output_size=False,
                                   additional_fields=None,
                                   scope=None):
  """Multi-class version of non maximum suppression.

  This op greedily selects a subset of detection bounding boxes, pruning
  away boxes that have high IOU (intersection over union) overlap (> thresh)
  with already selected boxes.  It operates independently for each class for
  which scores are provided, pruning boxes with score less than a provided
  threshold prior to applying NMS.

  Please note that this operation is performed on *all* classes, therefore any
  background classes should be removed prior to calling this function.

  Selected boxes are sorted in decreasing order by score (the sort is not
  guaranteed to be stable).

  Args:
    boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be either
      number of classes or 1 depending on whether a separate box is predicted
      per class.
    scores: A [k, num_classes] float32 tensor containing the scores for each of
      the k detections. The second dimension must be statically known. The
      scores have to be non-negative when pad_to_max_output_size is True,
      because -1 is used as the score of padded / invalid entries.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed). Must lie in [0, 1].
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default (0) returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip boxes to. The window is applied to the
      boxes that were selected by NMS (i.e. after suppression); boxes that end
      up with zero area are then marked invalid.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided).
    masks: (optional) a [k, q, mask_height, mask_width] float32 tensor
      containing box masks. `q` can be either number of classes or 1 depending
      on whether a separate mask is predicted per class.
    boundaries: (optional) a [k, q, boundary_height, boundary_width] float32
      tensor containing box boundaries. `q` can be either number of classes or 1
      depending on whether a separate boundary is predicted per class.
    pad_to_max_output_size: If true, each per-class NMS result is padded to
      length `max_size_per_class` and invalid entries are kept in the output
      (with score -1) instead of being removed. Defaults to false.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose first dimensions are all of size `k`. After non-maximum
      suppression, all tensors corresponding to the selected boxes will be
      added to resulting BoxList.
    scope: name scope.

  Returns:
    A tuple of sorted_boxes and num_valid_nms_boxes. sorted_boxes is a BoxList
      holding M boxes with a rank-1 scores field (sorted in decreasing order)
      and a rank-1 classes field holding the class index of each box.
      num_valid_nms_boxes is a 0-D integer tensor representing the number of
      valid elements in the BoxList, with the valid elements appearing first.

  Raises:
    ValueError: if iou_thresh is not in [0, 1], if scores is not rank 2 or has
      no static second dimension, if boxes is not rank 3 with last dimension 4,
      if the second dimension of boxes is neither 1 nor num_classes, or if
      change_coordinate_frame is True while clip_window is None.
  """
  if not 0 <= iou_thresh <= 1.0:
    raise ValueError('iou_thresh must be between 0 and 1')
  if scores.shape.ndims != 2:
    raise ValueError('scores field must be of rank 2')
  if scores.shape[1].value is None:
    raise ValueError('scores must have statically defined second '
                     'dimension')
  if boxes.shape.ndims != 3:
    raise ValueError('boxes must be of rank 3.')
  if not (boxes.shape[1].value == scores.shape[1].value or
          boxes.shape[1].value == 1):
    raise ValueError('second dimension of boxes must be either 1 or equal '
                     'to the second dimension of scores')
  if boxes.shape[2].value != 4:
    raise ValueError('last dimension of boxes must be of size 4.')
  if change_coordinate_frame and clip_window is None:
    raise ValueError('if change_coordinate_frame is True, then a clip_window '
                     'must be specified.')

  with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
    num_scores = tf.shape(scores)[0]
    # Validated above to be statically known, so this is a plain Python int.
    num_classes = scores.shape[1].value

    selected_boxes_list = []
    num_valid_nms_boxes_cumulative = tf.constant(0)
    per_class_boxes_list = tf.unstack(boxes, axis=1)
    if masks is not None:
      per_class_masks_list = tf.unstack(masks, axis=1)
    if boundaries is not None:
      per_class_boundaries_list = tf.unstack(boundaries, axis=1)
    # When q == 1 the single set of boxes is shared by every class.
    boxes_ids = (range(num_classes) if len(per_class_boxes_list) > 1
                 else [0] * num_classes)
    for class_idx, boxes_idx in enumerate(boxes_ids):
      per_class_boxes = per_class_boxes_list[boxes_idx]
      boxlist_and_class_scores = box_list.BoxList(per_class_boxes)
      # Column `class_idx` of scores, flattened to a rank-1 tensor.
      class_scores = tf.reshape(
          tf.slice(scores, [0, class_idx], tf.stack([num_scores, 1])), [-1])

      boxlist_and_class_scores.add_field(fields.BoxListFields.scores,
                                         class_scores)
      if masks is not None:
        per_class_masks = per_class_masks_list[boxes_idx]
        boxlist_and_class_scores.add_field(fields.BoxListFields.masks,
                                           per_class_masks)
      if boundaries is not None:
        per_class_boundaries = per_class_boundaries_list[boxes_idx]
        boxlist_and_class_scores.add_field(fields.BoxListFields.boundaries,
                                           per_class_boundaries)
      if additional_fields is not None:
        for key, tensor in additional_fields.items():
          boxlist_and_class_scores.add_field(key, tensor)

      if pad_to_max_output_size:
        max_selection_size = max_size_per_class
        selected_indices, num_valid_nms_boxes = (
            tf.image.non_max_suppression_padded(
                boxlist_and_class_scores.get(),
                boxlist_and_class_scores.get_field(fields.BoxListFields.scores),
                max_selection_size,
                iou_threshold=iou_thresh,
                score_threshold=score_thresh,
                pad_to_max_output_size=True))
      else:
        max_selection_size = tf.minimum(max_size_per_class,
                                        boxlist_and_class_scores.num_boxes())
        selected_indices = tf.image.non_max_suppression(
            boxlist_and_class_scores.get(),
            boxlist_and_class_scores.get_field(fields.BoxListFields.scores),
            max_selection_size,
            iou_threshold=iou_thresh,
            score_threshold=score_thresh)
        num_valid_nms_boxes = tf.shape(selected_indices)[0]
        # Pad the indices with zeros up to max_selection_size so that both
        # branches produce a result of the same length.
        selected_indices = tf.concat(
            [selected_indices,
             tf.zeros(max_selection_size - num_valid_nms_boxes, tf.int32)], 0)
      nms_result = box_list_ops.gather(boxlist_and_class_scores,
                                       selected_indices)

      # Make the scores -1 for the padded (invalid) entries.
      valid_nms_boxes_indx = tf.less(
          tf.range(max_selection_size), num_valid_nms_boxes)
      nms_scores = nms_result.get_field(fields.BoxListFields.scores)
      nms_result.add_field(fields.BoxListFields.scores,
                           tf.where(valid_nms_boxes_indx,
                                    nms_scores,
                                    -1 * tf.ones(max_selection_size)))
      num_valid_nms_boxes_cumulative += num_valid_nms_boxes

      nms_result.add_field(
          fields.BoxListFields.classes, (tf.zeros_like(
              nms_result.get_field(fields.BoxListFields.scores)) + class_idx))
      selected_boxes_list.append(nms_result)
    selected_boxes = box_list_ops.concatenate(selected_boxes_list)
    sorted_boxes = box_list_ops.sort_by_field(selected_boxes,
                                              fields.BoxListFields.scores)
    if clip_window is not None:
      # Non-overlapping boxes are only filtered out here when the output is
      # not padded; in the padded case they are kept and invalidated below.
      sorted_boxes = box_list_ops.clip_to_window(
          sorted_boxes,
          clip_window,
          filter_nonoverlapping=not pad_to_max_output_size)
      # Set the scores of boxes with zero area to -1 so that they are treated
      # as invalid, then recount the valid boxes and re-sort.
      sorted_boxes_size = tf.shape(sorted_boxes.get())[0]
      non_zero_box_area = tf.cast(box_list_ops.area(sorted_boxes), tf.bool)
      sorted_boxes_scores = tf.where(
          non_zero_box_area,
          sorted_boxes.get_field(fields.BoxListFields.scores),
          -1 * tf.ones(sorted_boxes_size))
      sorted_boxes.add_field(fields.BoxListFields.scores, sorted_boxes_scores)
      num_valid_nms_boxes_cumulative = tf.reduce_sum(
          tf.cast(tf.greater_equal(sorted_boxes_scores, 0), tf.int32))
      sorted_boxes = box_list_ops.sort_by_field(sorted_boxes,
                                                fields.BoxListFields.scores)
      if change_coordinate_frame:
        sorted_boxes = box_list_ops.change_coordinate_frame(
            sorted_boxes, clip_window)

    if max_total_size:
      max_total_size = tf.minimum(max_total_size,
                                  sorted_boxes.num_boxes())
      sorted_boxes = box_list_ops.gather(sorted_boxes,
                                         tf.range(max_total_size))
      # The valid count can never exceed the number of boxes kept.
      num_valid_nms_boxes_cumulative = tf.where(
          max_total_size > num_valid_nms_boxes_cumulative,
          num_valid_nms_boxes_cumulative, max_total_size)

    # Keep only the valid boxes when the output is not padded.
    if not pad_to_max_output_size:
      sorted_boxes = box_list_ops.gather(
          sorted_boxes, tf.range(num_valid_nms_boxes_cumulative))

    return sorted_boxes, num_valid_nms_boxes_cumulative
|
|
|
|
|
def batch_multiclass_non_max_suppression(boxes,
                                         scores,
                                         score_thresh,
                                         iou_thresh,
                                         max_size_per_class,
                                         max_total_size=0,
                                         clip_window=None,
                                         change_coordinate_frame=False,
                                         num_valid_boxes=None,
                                         masks=None,
                                         additional_fields=None,
                                         scope=None,
                                         use_static_shapes=False,
                                         parallel_iterations=32):
  """Multi-class version of non maximum suppression that operates on a batch.

  This op is similar to `multiclass_non_max_suppression` but operates on a batch
  of boxes and scores. See documentation for `multiclass_non_max_suppression`
  for details.

  Args:
    boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing
      detections. If `q` is 1 then same boxes are used for all classes
      otherwise, if `q` is equal to number of classes, class-specific boxes
      are used.
    scores: A [batch_size, num_anchors, num_classes] float32 tensor containing
      the scores for each of the `num_anchors` detections. The scores have to be
      non-negative when use_static_shapes is set True.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of shape [batch_size, 4] where each entry is
      of the form [y_min, x_min, y_max, x_max] representing the window to clip
      boxes to. This argument can also be a tensor of shape [4] in which case,
      the same clip window is applied to all images in the batch. If
      clip_window is None, the tightest window enclosing every box in the
      batch is used.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided).
    num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape
      [batch_size] representing the number of valid boxes to be considered
      for each image in the batch.  This parameter allows for ignoring zero
      paddings. Defaults to `num_anchors` for every image.
    masks: (optional) a [batch_size, num_anchors, q, mask_height, mask_width]
      float32 tensor containing box masks. `q` can be either number of classes
      or 1 depending on whether a separate mask is predicted per class.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose dimensions are [batch_size, num_anchors, ...]. The
      corresponding outputs are declared as float32.
    scope: tf scope name.
    use_static_shapes: If true, invalid boxes are masked out via their scores
      instead of being sliced away, per-class NMS outputs are padded to
      `max_size_per_class`, and the final pad-or-clip of each image's result
      to `max_total_size` is skipped. Defaults to false.
    parallel_iterations: (optional) number of batch items to process in
      parallel.

  Returns:
    A tuple of the following, in this order:
    'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor
      containing the non-max suppressed boxes.
    'nmsed_scores': A [batch_size, max_detections] float32 tensor containing
      the scores for the boxes.
    'nmsed_classes': A [batch_size, max_detections] float32 tensor
      containing the class for boxes.
    'nmsed_masks': (optional) a
      [batch_size, max_detections, mask_height, mask_width] float32 tensor
      containing masks for each selected box. This is set to None if input
      `masks` is None.
    'nmsed_additional_fields': (optional) a dictionary of
      [batch_size, max_detections, ...] float32 tensors corresponding to the
      tensors specified in the input `additional_fields`. This is set to None
      if input `additional_fields` is None.
    'num_detections': A [batch_size] int32 tensor indicating the number of
      valid detections per batch item. Only the top num_detections[i] entries in
      nms_boxes[i], nms_scores[i] and nms_class[i] are valid.

  Raises:
    ValueError: if `q` in boxes.shape is not 1 or not equal to number of
      classes as inferred from scores.shape, or if change_coordinate_frame is
      True while clip_window is None.
  """
  q = boxes.shape[2].value
  num_classes = scores.shape[2].value
  if q != 1 and q != num_classes:
    raise ValueError('third dimension of boxes must be either 1 or equal '
                     'to the third dimension of scores')
  if change_coordinate_frame and clip_window is None:
    raise ValueError('if change_coordinate_frame is True, then a clip_window '
                     'must be specified.')
  # Remember what the caller passed: dummies are substituted below and the
  # corresponding outputs must be reported as None.
  original_masks = masks
  original_additional_fields = additional_fields
  with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
    boxes_shape = boxes.shape
    batch_size = boxes_shape[0].value
    num_anchors = boxes_shape[1].value

    # Fall back to dynamic shapes when the static ones are unknown.
    if batch_size is None:
      batch_size = tf.shape(boxes)[0]
    if num_anchors is None:
      num_anchors = tf.shape(boxes)[1]

    # If num valid boxes aren't provided, create one and mark all boxes as
    # valid.
    if num_valid_boxes is None:
      num_valid_boxes = tf.ones([batch_size], dtype=tf.int32) * num_anchors

    # If masks aren't provided, create dummy masks so that a single
    # _single_image_nms_fn handles both cases; the dummy output is discarded
    # after the map.
    if masks is None:
      masks_shape = tf.stack([batch_size, num_anchors, q, 1, 1])
      masks = tf.zeros(masks_shape)

    if clip_window is None:
      # Tightest window enclosing every box in the batch.
      clip_window = tf.stack([
          tf.reduce_min(boxes[:, :, :, 0]),
          tf.reduce_min(boxes[:, :, :, 1]),
          tf.reduce_max(boxes[:, :, :, 2]),
          tf.reduce_max(boxes[:, :, :, 3])
      ])
    if clip_window.shape.ndims == 1:
      # Broadcast a single window to every image in the batch.
      clip_window = tf.tile(tf.expand_dims(clip_window, 0), [batch_size, 1])

    if additional_fields is None:
      additional_fields = {}

    def _single_image_nms_fn(args):
      """Runs NMS on a single image and returns padded output.

      Args:
        args: A list of tensors consisting of the following:
          per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
            detections. If `q` is 1 then same boxes are used for all classes
            otherwise, if `q` is equal to number of classes, class-specific
            boxes are used.
          per_image_scores - A [num_anchors, num_classes] float32 tensor
            containing the scores for each of the `num_anchors` detections.
          per_image_masks - A [num_anchors, q, mask_height, mask_width] float32
            tensor containing box masks. `q` can be either number of classes
            or 1 depending on whether a separate mask is predicted per class.
          per_image_clip_window - A 1D float32 tensor of the form
            [ymin, xmin, ymax, xmax] representing the window to clip the boxes
            to.
          per_image_additional_fields - (optional) A variable number of float32
            tensors each with size [num_anchors, ...].
          per_image_num_valid_boxes - A 0-D `int32` tensor with the number of
            valid boxes to be considered for this image. This parameter
            allows for ignoring zero paddings.

      Returns:
        A list of the following, in this order:
        'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
          non-max suppressed boxes.
        'nmsed_scores': A [max_detections] float32 tensor containing the scores
          for the boxes.
        'nmsed_classes': A [max_detections] float32 tensor containing the class
          for boxes.
        'nmsed_masks': A [max_detections, mask_height, mask_width] float32
          tensor containing masks for each selected box (dummy masks when the
          caller passed no masks).
        'nmsed_additional_fields': (optional) A variable number of float32
          tensors each with size [max_detections, ...] corresponding to the
          input `per_image_additional_fields`.
        'num_detections': A 0-D int32 tensor with the number of valid
          detections for this image. Only the top num_detections entries of
          the other outputs are valid.
      """
      per_image_boxes = args[0]
      per_image_scores = args[1]
      per_image_masks = args[2]
      per_image_clip_window = args[3]
      # Additional fields are passed positionally in the iteration order of
      # `additional_fields`, between the clip window and num_valid_boxes.
      per_image_additional_fields = {
          key: value
          for key, value in zip(additional_fields, args[4:-1])
      }
      per_image_num_valid_boxes = args[-1]
      if use_static_shapes:
        # Shapes must stay fixed, so instead of slicing off the invalid rows
        # give them the lowest possible score.
        total_proposals = tf.shape(per_image_scores)
        per_image_scores = tf.where(
            tf.less(tf.range(total_proposals[0]), per_image_num_valid_boxes),
            per_image_scores,
            tf.fill(total_proposals, np.finfo('float32').min))
      else:
        # Keep only the first `per_image_num_valid_boxes` rows of each input.
        per_image_boxes = tf.reshape(
            tf.slice(per_image_boxes, 3 * [0],
                     tf.stack([per_image_num_valid_boxes, -1, -1])), [-1, q, 4])
        per_image_scores = tf.reshape(
            tf.slice(per_image_scores, [0, 0],
                     tf.stack([per_image_num_valid_boxes, -1])),
            [-1, num_classes])
        per_image_masks = tf.reshape(
            tf.slice(per_image_masks, 4 * [0],
                     tf.stack([per_image_num_valid_boxes, -1, -1, -1])),
            [-1, q, per_image_masks.shape[2].value,
             per_image_masks.shape[3].value])
        for key, tensor in per_image_additional_fields.items():
          additional_field_shape = tensor.get_shape()
          additional_field_dim = len(additional_field_shape)
          per_image_additional_fields[key] = tf.reshape(
              tf.slice(per_image_additional_fields[key],
                       additional_field_dim * [0],
                       tf.stack([per_image_num_valid_boxes] +
                                (additional_field_dim - 1) * [-1])),
              [-1] + [dim.value for dim in additional_field_shape[1:]])

      nmsed_boxlist, num_valid_nms_boxes = multiclass_non_max_suppression(
          per_image_boxes,
          per_image_scores,
          score_thresh,
          iou_thresh,
          max_size_per_class,
          max_total_size,
          clip_window=per_image_clip_window,
          change_coordinate_frame=change_coordinate_frame,
          masks=per_image_masks,
          pad_to_max_output_size=use_static_shapes,
          additional_fields=per_image_additional_fields)

      if not use_static_shapes:
        nmsed_boxlist = box_list_ops.pad_or_clip_box_list(
            nmsed_boxlist, max_total_size)
      num_detections = num_valid_nms_boxes
      nmsed_boxes = nmsed_boxlist.get()
      nmsed_scores = nmsed_boxlist.get_field(fields.BoxListFields.scores)
      nmsed_classes = nmsed_boxlist.get_field(fields.BoxListFields.classes)
      nmsed_masks = nmsed_boxlist.get_field(fields.BoxListFields.masks)
      nmsed_additional_fields = [
          nmsed_boxlist.get_field(key) for key in per_image_additional_fields
      ]
      return ([nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_masks] +
              nmsed_additional_fields + [num_detections])

    # boxes, scores, classes and masks, plus one output per additional field.
    num_nmsed_outputs = 4 + len(additional_fields)

    batch_outputs = shape_utils.static_or_dynamic_map_fn(
        _single_image_nms_fn,
        elems=([boxes, scores, masks, clip_window] +
               list(additional_fields.values()) + [num_valid_boxes]),
        dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]),
        parallel_iterations=parallel_iterations)

    batch_nmsed_boxes = batch_outputs[0]
    batch_nmsed_scores = batch_outputs[1]
    batch_nmsed_classes = batch_outputs[2]
    batch_nmsed_masks = batch_outputs[3]
    batch_nmsed_additional_fields = {
        key: value
        for key, value in zip(additional_fields, batch_outputs[4:-1])
    }
    batch_num_detections = batch_outputs[-1]

    # Discard outputs that only exist because of the substituted dummies.
    if original_masks is None:
      batch_nmsed_masks = None

    if original_additional_fields is None:
      batch_nmsed_additional_fields = None

    return (batch_nmsed_boxes, batch_nmsed_scores, batch_nmsed_classes,
            batch_nmsed_masks, batch_nmsed_additional_fields,
            batch_num_detections)
|
|