File size: 12,749 Bytes
9a393e2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 |
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""NASNet Faster R-CNN implementation.
Learning Transferable Architectures for Scalable Image Recognition
Barret Zoph, Vijay Vasudevan, Jonathon Shlens, Quoc V. Le
https://arxiv.org/abs/1707.07012
"""
import tensorflow as tf
from object_detection.meta_architectures import faster_rcnn_meta_arch
from nets.nasnet import nasnet
from nets.nasnet import nasnet_utils
arg_scope = tf.contrib.framework.arg_scope
slim = tf.contrib.slim
def nasnet_large_arg_scope_for_detection(is_batch_norm_training=False):
"""Defines the default arg scope for the NASNet-A Large for object detection.
This provides a small edit to switch batch norm training on and off.
Args:
is_batch_norm_training: Boolean indicating whether to train with batch norm.
Returns:
An `arg_scope` to use for the NASNet Large Model.
"""
imagenet_scope = nasnet.nasnet_large_arg_scope()
with arg_scope(imagenet_scope):
with arg_scope([slim.batch_norm], is_training=is_batch_norm_training) as sc:
return sc
# Note: This is largely a copy of _build_nasnet_base inside nasnet.py but
# with special edits to remove instantiation of the stem and the special
# ability to receive as input a pair of hidden states.
def _build_nasnet_base(hidden_previous,
hidden,
normal_cell,
reduction_cell,
hparams,
true_cell_num,
start_cell_num):
"""Constructs a NASNet image model."""
# Find where to place the reduction cells or stride normal cells
reduction_indices = nasnet_utils.calc_reduction_layers(
hparams.num_cells, hparams.num_reduction_layers)
# Note: The None is prepended to match the behavior of _imagenet_stem()
cell_outputs = [None, hidden_previous, hidden]
net = hidden
# NOTE: In the nasnet.py code, filter_scaling starts at 1.0. We instead
# start at 2.0 because 1 reduction cell has been created which would
# update the filter_scaling to 2.0.
filter_scaling = 2.0
# Run the cells
for cell_num in range(start_cell_num, hparams.num_cells):
stride = 1
if hparams.skip_reduction_layer_input:
prev_layer = cell_outputs[-2]
if cell_num in reduction_indices:
filter_scaling *= hparams.filter_scaling_rate
net = reduction_cell(
net,
scope='reduction_cell_{}'.format(reduction_indices.index(cell_num)),
filter_scaling=filter_scaling,
stride=2,
prev_layer=cell_outputs[-2],
cell_num=true_cell_num)
true_cell_num += 1
cell_outputs.append(net)
if not hparams.skip_reduction_layer_input:
prev_layer = cell_outputs[-2]
net = normal_cell(
net,
scope='cell_{}'.format(cell_num),
filter_scaling=filter_scaling,
stride=stride,
prev_layer=prev_layer,
cell_num=true_cell_num)
true_cell_num += 1
cell_outputs.append(net)
# Final nonlinearity.
# Note that we have dropped the final pooling, dropout and softmax layers
# from the default nasnet version.
with tf.variable_scope('final_layer'):
net = tf.nn.relu(net)
return net
# TODO(shlens): Only fixed_shape_resizer is currently supported for NASNet
# featurization. The reason for this is that nasnet.py only supports
# inputs with fully known shapes. We need to update nasnet.py to handle
# shapes not known at compile time.
class FasterRCNNNASFeatureExtractor(
faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
"""Faster R-CNN with NASNet-A feature extractor implementation."""
def __init__(self,
is_training,
first_stage_features_stride,
batch_norm_trainable=False,
reuse_weights=None,
weight_decay=0.0):
"""Constructor.
Args:
is_training: See base class.
first_stage_features_stride: See base class.
batch_norm_trainable: See base class.
reuse_weights: See base class.
weight_decay: See base class.
Raises:
ValueError: If `first_stage_features_stride` is not 16.
"""
if first_stage_features_stride != 16:
raise ValueError('`first_stage_features_stride` must be 16.')
super(FasterRCNNNASFeatureExtractor, self).__init__(
is_training, first_stage_features_stride, batch_norm_trainable,
reuse_weights, weight_decay)
def preprocess(self, resized_inputs):
"""Faster R-CNN with NAS preprocessing.
Maps pixel values to the range [-1, 1].
Args:
resized_inputs: A [batch, height_in, width_in, channels] float32 tensor
representing a batch of images with values between 0 and 255.0.
Returns:
preprocessed_inputs: A [batch, height_out, width_out, channels] float32
tensor representing a batch of images.
"""
return (2.0 / 255.0) * resized_inputs - 1.0
def _extract_proposal_features(self, preprocessed_inputs, scope):
"""Extracts first stage RPN features.
Extracts features using the first half of the NASNet network.
We construct the network in `align_feature_maps=True` mode, which means
that all VALID paddings in the network are changed to SAME padding so that
the feature maps are aligned.
Args:
preprocessed_inputs: A [batch, height, width, channels] float32 tensor
representing a batch of images.
scope: A scope name.
Returns:
rpn_feature_map: A tensor with shape [batch, height, width, depth]
end_points: A dictionary mapping feature extractor tensor names to tensors
Raises:
ValueError: If the created network is missing the required activation.
"""
del scope
if len(preprocessed_inputs.get_shape().as_list()) != 4:
raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a '
'tensor of shape %s' % preprocessed_inputs.get_shape())
with slim.arg_scope(nasnet_large_arg_scope_for_detection(
is_batch_norm_training=self._train_batch_norm)):
with arg_scope([slim.conv2d,
slim.batch_norm,
slim.separable_conv2d],
reuse=self._reuse_weights):
_, end_points = nasnet.build_nasnet_large(
preprocessed_inputs, num_classes=None,
is_training=self._is_training,
final_endpoint='Cell_11')
# Note that both 'Cell_10' and 'Cell_11' have equal depth = 2016.
rpn_feature_map = tf.concat([end_points['Cell_10'],
end_points['Cell_11']], 3)
# nasnet.py does not maintain the batch size in the first dimension.
# This work around permits us retaining the batch for below.
batch = preprocessed_inputs.get_shape().as_list()[0]
shape_without_batch = rpn_feature_map.get_shape().as_list()[1:]
rpn_feature_map_shape = [batch] + shape_without_batch
rpn_feature_map.set_shape(rpn_feature_map_shape)
return rpn_feature_map, end_points
def _extract_box_classifier_features(self, proposal_feature_maps, scope):
"""Extracts second stage box classifier features.
This function reconstructs the "second half" of the NASNet-A
network after the part defined in `_extract_proposal_features`.
Args:
proposal_feature_maps: A 4-D float tensor with shape
[batch_size * self.max_num_proposals, crop_height, crop_width, depth]
representing the feature map cropped to each proposal.
scope: A scope name.
Returns:
proposal_classifier_features: A 4-D float tensor with shape
[batch_size * self.max_num_proposals, height, width, depth]
representing box classifier features for each proposal.
"""
del scope
# Note that we always feed into 2 layers of equal depth
# where the first N channels corresponds to previous hidden layer
# and the second N channels correspond to the final hidden layer.
hidden_previous, hidden = tf.split(proposal_feature_maps, 2, axis=3)
# Note that what follows is largely a copy of build_nasnet_large() within
# nasnet.py. We are copying to minimize code pollution in slim.
# TODO(shlens,skornblith): Determine the appropriate drop path schedule.
# For now the schedule is the default (1.0->0.7 over 250,000 train steps).
hparams = nasnet.large_imagenet_config()
if not self._is_training:
hparams.set_hparam('drop_path_keep_prob', 1.0)
# Calculate the total number of cells in the network
# -- Add 2 for the reduction cells.
total_num_cells = hparams.num_cells + 2
# -- And add 2 for the stem cells for ImageNet training.
total_num_cells += 2
normal_cell = nasnet_utils.NasNetANormalCell(
hparams.num_conv_filters, hparams.drop_path_keep_prob,
total_num_cells, hparams.total_training_steps)
reduction_cell = nasnet_utils.NasNetAReductionCell(
hparams.num_conv_filters, hparams.drop_path_keep_prob,
total_num_cells, hparams.total_training_steps)
with arg_scope([slim.dropout, nasnet_utils.drop_path],
is_training=self._is_training):
with arg_scope([slim.batch_norm], is_training=self._train_batch_norm):
with arg_scope([slim.avg_pool2d,
slim.max_pool2d,
slim.conv2d,
slim.batch_norm,
slim.separable_conv2d,
nasnet_utils.factorized_reduction,
nasnet_utils.global_avg_pool,
nasnet_utils.get_channel_index,
nasnet_utils.get_channel_dim],
data_format=hparams.data_format):
# This corresponds to the cell number just past 'Cell_11' used by
# by _extract_proposal_features().
start_cell_num = 12
# Note that this number equals:
# start_cell_num + 2 stem cells + 1 reduction cell
true_cell_num = 15
with slim.arg_scope(nasnet.nasnet_large_arg_scope()):
net = _build_nasnet_base(hidden_previous,
hidden,
normal_cell=normal_cell,
reduction_cell=reduction_cell,
hparams=hparams,
true_cell_num=true_cell_num,
start_cell_num=start_cell_num)
proposal_classifier_features = net
return proposal_classifier_features
def restore_from_classification_checkpoint_fn(
self,
first_stage_feature_extractor_scope,
second_stage_feature_extractor_scope):
"""Returns a map of variables to load from a foreign checkpoint.
Note that this overrides the default implementation in
faster_rcnn_meta_arch.FasterRCNNFeatureExtractor which does not work for
NASNet-A checkpoints.
Args:
first_stage_feature_extractor_scope: A scope name for the first stage
feature extractor.
second_stage_feature_extractor_scope: A scope name for the second stage
feature extractor.
Returns:
A dict mapping variable names (to load from a checkpoint) to variables in
the model graph.
"""
# Note that the NAS checkpoint only contains the moving average version of
# the Variables so we need to generate an appropriate dictionary mapping.
variables_to_restore = {}
for variable in tf.global_variables():
if variable.op.name.startswith(
first_stage_feature_extractor_scope):
var_name = variable.op.name.replace(
first_stage_feature_extractor_scope + '/', '')
var_name += '/ExponentialMovingAverage'
variables_to_restore[var_name] = variable
if variable.op.name.startswith(
second_stage_feature_extractor_scope):
var_name = variable.op.name.replace(
second_stage_feature_extractor_scope + '/', '')
var_name += '/ExponentialMovingAverage'
variables_to_restore[var_name] = variable
return variables_to_restore
|