DR-App / object_detection /predictors /convolutional_box_predictor.py
pat229988's picture
Upload 653 files
9a393e2
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convolutional Box Predictors with and without weight sharing."""
import functools
import tensorflow as tf
from object_detection.core import box_predictor
from object_detection.utils import static_shape
slim = tf.contrib.slim
BOX_ENCODINGS = box_predictor.BOX_ENCODINGS
CLASS_PREDICTIONS_WITH_BACKGROUND = (
box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND)
MASK_PREDICTIONS = box_predictor.MASK_PREDICTIONS
class _NoopVariableScope(object):
"""A dummy class that does not push any scope."""
def __enter__(self):
return None
def __exit__(self, exc_type, exc_value, traceback):
return False
class ConvolutionalBoxPredictor(box_predictor.BoxPredictor):
"""Convolutional Box Predictor.
Optionally add an intermediate 1x1 convolutional layer after features and
predict in parallel branches box_encodings and
class_predictions_with_background.
Currently this box predictor assumes that predictions are "shared" across
classes --- that is each anchor makes box predictions which do not depend
on class.
"""
def __init__(self,
is_training,
num_classes,
box_prediction_head,
class_prediction_head,
other_heads,
conv_hyperparams_fn,
num_layers_before_predictor,
min_depth,
max_depth):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
box_prediction_head: The head that predicts the boxes.
class_prediction_head: The head that predicts the classes.
other_heads: A dictionary mapping head names to convolutional
head classes.
conv_hyperparams_fn: A function to generate tf-slim arg_scope with
hyperparameters for convolution ops.
num_layers_before_predictor: Number of the additional conv layers before
the predictor.
min_depth: Minimum feature depth prior to predicting box encodings
and class predictions.
max_depth: Maximum feature depth prior to predicting box encodings
and class predictions. If max_depth is set to 0, no additional
feature map will be inserted before location and class predictions.
Raises:
ValueError: if min_depth > max_depth.
"""
super(ConvolutionalBoxPredictor, self).__init__(is_training, num_classes)
self._box_prediction_head = box_prediction_head
self._class_prediction_head = class_prediction_head
self._other_heads = other_heads
self._conv_hyperparams_fn = conv_hyperparams_fn
self._min_depth = min_depth
self._max_depth = max_depth
self._num_layers_before_predictor = num_layers_before_predictor
@property
def num_classes(self):
return self._num_classes
def _predict(self, image_features, num_predictions_per_location_list):
"""Computes encoded object locations and corresponding confidences.
Args:
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels_i] containing features for a batch of images.
num_predictions_per_location_list: A list of integers representing the
number of box predictions to be made per spatial location for each
feature map.
Returns:
A dictionary containing:
box_encodings: A list of float tensors of shape
[batch_size, num_anchors_i, q, code_size] representing the location of
the objects, where q is 1 or the number of classes. Each entry in the
list corresponds to a feature map in the input `image_features` list.
class_predictions_with_background: A list of float tensors of shape
[batch_size, num_anchors_i, num_classes + 1] representing the class
predictions for the proposals. Each entry in the list corresponds to a
feature map in the input `image_features` list.
(optional) Predictions from other heads.
"""
predictions = {
BOX_ENCODINGS: [],
CLASS_PREDICTIONS_WITH_BACKGROUND: [],
}
for head_name in self._other_heads.keys():
predictions[head_name] = []
# TODO(rathodv): Come up with a better way to generate scope names
# in box predictor once we have time to retrain all models in the zoo.
# The following lines create scope names to be backwards compatible with the
# existing checkpoints.
box_predictor_scopes = [_NoopVariableScope()]
if len(image_features) > 1:
box_predictor_scopes = [
tf.variable_scope('BoxPredictor_{}'.format(i))
for i in range(len(image_features))
]
for (image_feature,
num_predictions_per_location, box_predictor_scope) in zip(
image_features, num_predictions_per_location_list,
box_predictor_scopes):
net = image_feature
with box_predictor_scope:
with slim.arg_scope(self._conv_hyperparams_fn()):
with slim.arg_scope([slim.dropout], is_training=self._is_training):
# Add additional conv layers before the class predictor.
features_depth = static_shape.get_depth(image_feature.get_shape())
depth = max(min(features_depth, self._max_depth), self._min_depth)
tf.logging.info('depth of additional conv before box predictor: {}'.
format(depth))
if depth > 0 and self._num_layers_before_predictor > 0:
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
net,
depth, [1, 1],
reuse=tf.AUTO_REUSE,
scope='Conv2d_%d_1x1_%d' % (i, depth))
sorted_keys = sorted(self._other_heads.keys())
sorted_keys.append(BOX_ENCODINGS)
sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND)
for head_name in sorted_keys:
if head_name == BOX_ENCODINGS:
head_obj = self._box_prediction_head
elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
head_obj = self._class_prediction_head
else:
head_obj = self._other_heads[head_name]
prediction = head_obj.predict(
features=net,
num_predictions_per_location=num_predictions_per_location)
predictions[head_name].append(prediction)
return predictions
# TODO(rathodv): Replace with slim.arg_scope_func_key once its available
# externally.
def _arg_scope_func_key(op):
"""Returns a key that can be used to index arg_scope dictionary."""
return getattr(op, '_key_op', str(op))
# TODO(rathodv): Merge the implementation with ConvolutionalBoxPredictor above
# since they are very similar.
class WeightSharedConvolutionalBoxPredictor(box_predictor.BoxPredictor):
"""Convolutional Box Predictor with weight sharing.
Defines the box predictor as defined in
https://arxiv.org/abs/1708.02002. This class differs from
ConvolutionalBoxPredictor in that it shares weights and biases while
predicting from different feature maps. However, batch_norm parameters are not
shared because the statistics of the activations vary among the different
feature maps.
Also note that separate multi-layer towers are constructed for the box
encoding and class predictors respectively.
"""
def __init__(self,
is_training,
num_classes,
box_prediction_head,
class_prediction_head,
other_heads,
conv_hyperparams_fn,
depth,
num_layers_before_predictor,
kernel_size=3,
apply_batch_norm=False,
share_prediction_tower=False,
use_depthwise=False):
"""Constructor.
Args:
is_training: Indicates whether the BoxPredictor is in training mode.
num_classes: number of classes. Note that num_classes *does not*
include the background category, so if groundtruth labels take values
in {0, 1, .., K-1}, num_classes=K (and not K+1, even though the
assigned classification targets can range from {0,... K}).
box_prediction_head: The head that predicts the boxes.
class_prediction_head: The head that predicts the classes.
other_heads: A dictionary mapping head names to convolutional
head classes.
conv_hyperparams_fn: A function to generate tf-slim arg_scope with
hyperparameters for convolution ops.
depth: depth of conv layers.
num_layers_before_predictor: Number of the additional conv layers before
the predictor.
kernel_size: Size of final convolution kernel.
apply_batch_norm: Whether to apply batch normalization to conv layers in
this predictor.
share_prediction_tower: Whether to share the multi-layer tower among box
prediction head, class prediction head and other heads.
use_depthwise: Whether to use depthwise separable conv2d instead of
regular conv2d.
"""
super(WeightSharedConvolutionalBoxPredictor, self).__init__(is_training,
num_classes)
self._box_prediction_head = box_prediction_head
self._class_prediction_head = class_prediction_head
self._other_heads = other_heads
self._conv_hyperparams_fn = conv_hyperparams_fn
self._depth = depth
self._num_layers_before_predictor = num_layers_before_predictor
self._kernel_size = kernel_size
self._apply_batch_norm = apply_batch_norm
self._share_prediction_tower = share_prediction_tower
self._use_depthwise = use_depthwise
@property
def num_classes(self):
return self._num_classes
def _insert_additional_projection_layer(self, image_feature,
inserted_layer_counter,
target_channel):
if inserted_layer_counter < 0:
return image_feature, inserted_layer_counter
image_feature = slim.conv2d(
image_feature,
target_channel, [1, 1],
stride=1,
padding='SAME',
activation_fn=None,
normalizer_fn=(tf.identity if self._apply_batch_norm else None),
scope='ProjectionLayer/conv2d_{}'.format(
inserted_layer_counter))
if self._apply_batch_norm:
image_feature = slim.batch_norm(
image_feature,
scope='ProjectionLayer/conv2d_{}/BatchNorm'.format(
inserted_layer_counter))
inserted_layer_counter += 1
return image_feature, inserted_layer_counter
def _compute_base_tower(self, tower_name_scope, image_feature, feature_index):
net = image_feature
for i in range(self._num_layers_before_predictor):
if self._use_depthwise:
conv_op = functools.partial(slim.separable_conv2d, depth_multiplier=1)
else:
conv_op = slim.conv2d
net = conv_op(
net,
self._depth, [self._kernel_size, self._kernel_size],
stride=1,
padding='SAME',
activation_fn=None,
normalizer_fn=(tf.identity if self._apply_batch_norm else None),
scope='{}/conv2d_{}'.format(tower_name_scope, i))
if self._apply_batch_norm:
net = slim.batch_norm(
net,
scope='{}/conv2d_{}/BatchNorm/feature_{}'.
format(tower_name_scope, i, feature_index))
net = tf.nn.relu6(net)
return net
def _predict_head(self, head_name, head_obj, image_feature, box_tower_feature,
feature_index, num_predictions_per_location):
if head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
tower_name_scope = 'ClassPredictionTower'
else:
tower_name_scope = head_name + 'PredictionTower'
if self._share_prediction_tower:
head_tower_feature = box_tower_feature
else:
head_tower_feature = self._compute_base_tower(
tower_name_scope=tower_name_scope,
image_feature=image_feature,
feature_index=feature_index)
return head_obj.predict(
features=head_tower_feature,
num_predictions_per_location=num_predictions_per_location)
def _predict(self, image_features, num_predictions_per_location_list):
"""Computes encoded object locations and corresponding confidences.
Args:
image_features: A list of float tensors of shape [batch_size, height_i,
width_i, channels] containing features for a batch of images. Note that
when not all tensors in the list have the same number of channels, an
additional projection layer will be added on top the tensor to generate
feature map with number of channels consitent with the majority.
num_predictions_per_location_list: A list of integers representing the
number of box predictions to be made per spatial location for each
feature map. Note that all values must be the same since the weights are
shared.
Returns:
A dictionary containing:
box_encodings: A list of float tensors of shape
[batch_size, num_anchors_i, code_size] representing the location of
the objects. Each entry in the list corresponds to a feature map in
the input `image_features` list.
class_predictions_with_background: A list of float tensors of shape
[batch_size, num_anchors_i, num_classes + 1] representing the class
predictions for the proposals. Each entry in the list corresponds to a
feature map in the input `image_features` list.
(optional) Predictions from other heads.
E.g., mask_predictions: A list of float tensors of shape
[batch_size, num_anchord_i, num_classes, mask_height, mask_width].
Raises:
ValueError: If the num predictions per locations differs between the
feature maps.
"""
if len(set(num_predictions_per_location_list)) > 1:
raise ValueError('num predictions per location must be same for all'
'feature maps, found: {}'.format(
num_predictions_per_location_list))
feature_channels = [
image_feature.shape[3].value for image_feature in image_features
]
has_different_feature_channels = len(set(feature_channels)) > 1
if has_different_feature_channels:
inserted_layer_counter = 0
target_channel = max(set(feature_channels), key=feature_channels.count)
tf.logging.info('Not all feature maps have the same number of '
'channels, found: {}, addition project layers '
'to bring all feature maps to uniform channels '
'of {}'.format(feature_channels, target_channel))
else:
# Place holder variables if has_different_feature_channels is False.
target_channel = -1
inserted_layer_counter = -1
predictions = {
BOX_ENCODINGS: [],
CLASS_PREDICTIONS_WITH_BACKGROUND: [],
}
for head_name in self._other_heads.keys():
predictions[head_name] = []
for feature_index, (image_feature,
num_predictions_per_location) in enumerate(
zip(image_features,
num_predictions_per_location_list)):
with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
reuse=tf.AUTO_REUSE):
with slim.arg_scope(self._conv_hyperparams_fn()):
(image_feature,
inserted_layer_counter) = self._insert_additional_projection_layer(
image_feature, inserted_layer_counter, target_channel)
if self._share_prediction_tower:
box_tower_scope = 'PredictionTower'
else:
box_tower_scope = 'BoxPredictionTower'
box_tower_feature = self._compute_base_tower(
tower_name_scope=box_tower_scope,
image_feature=image_feature,
feature_index=feature_index)
box_encodings = self._box_prediction_head.predict(
features=box_tower_feature,
num_predictions_per_location=num_predictions_per_location)
predictions[BOX_ENCODINGS].append(box_encodings)
sorted_keys = sorted(self._other_heads.keys())
sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND)
for head_name in sorted_keys:
if head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
head_obj = self._class_prediction_head
else:
head_obj = self._other_heads[head_name]
prediction = self._predict_head(
head_name=head_name,
head_obj=head_obj,
image_feature=image_feature,
box_tower_feature=box_tower_feature,
feature_index=feature_index,
num_predictions_per_location=num_predictions_per_location)
predictions[head_name].append(prediction)
return predictions