|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""R-FCN meta-architecture definition. |

R-FCN: Dai, Jifeng, et al. "R-FCN: Object Detection via Region-based
Fully Convolutional Networks." arXiv preprint arXiv:1605.06409 (2016).

The R-FCN meta architecture is similar to Faster R-CNN and only differs in the
second stage. Hence this class inherits FasterRCNNMetaArch and overrides only
the constructor and the `_predict_second_stage` method.

Similar to Faster R-CNN, we allow for two modes: number_of_stages=1 and
number_of_stages=2. In the former setting, all of the user facing methods
(e.g., predict, postprocess, loss) can be used as if the model consisted
only of the RPN, returning class agnostic proposals (these can be thought of as
approximate detections with no associated class information). In the latter
setting, proposals are computed, then passed through a second stage
"box classifier" to yield (multi-class) detections.

Implementations of R-FCN models must define a new FasterRCNNFeatureExtractor and
override three methods: `preprocess`, `_extract_proposal_features` (the first
stage of the model), and `_extract_box_classifier_features` (the second stage of
the model). Optionally, the `restore_fn` method can be overridden. See tests
for an example.

See notes in the documentation of Faster R-CNN meta-architecture as they all
apply here.
""" |
|
import tensorflow as tf |
|
|
|
from object_detection.core import box_predictor |
|
from object_detection.meta_architectures import faster_rcnn_meta_arch |
|
from object_detection.utils import ops |
|
|
|
|
|
class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
  """R-FCN meta-architecture built on top of FasterRCNNMetaArch.

  The constructor forwards the shared configuration to the Faster R-CNN base
  class and keeps a reference to the R-FCN box predictor, which
  `_predict_second_stage` uses to produce the second-stage predictions.
  """

  def __init__(self,
               is_training,
               num_classes,
               image_resizer_fn,
               feature_extractor,
               number_of_stages,
               first_stage_anchor_generator,
               first_stage_target_assigner,
               first_stage_atrous_rate,
               first_stage_box_predictor_arg_scope_fn,
               first_stage_box_predictor_kernel_size,
               first_stage_box_predictor_depth,
               first_stage_minibatch_size,
               first_stage_sampler,
               first_stage_non_max_suppression_fn,
               first_stage_max_proposals,
               first_stage_localization_loss_weight,
               first_stage_objectness_loss_weight,
               crop_and_resize_fn,
               second_stage_target_assigner,
               second_stage_rfcn_box_predictor,
               second_stage_batch_size,
               second_stage_sampler,
               second_stage_non_max_suppression_fn,
               second_stage_score_conversion_fn,
               second_stage_localization_loss_weight,
               second_stage_classification_loss_weight,
               second_stage_classification_loss,
               hard_example_miner,
               parallel_iterations=16,
               add_summaries=True,
               clip_anchors_to_image=False,
               use_static_shapes=False,
               resize_masks=False):
    """Builds an RFCNMetaArch.

    Every argument except `second_stage_rfcn_box_predictor` is handed to the
    FasterRCNNMetaArch constructor; the R-FCN predictor is stored on this
    instance instead.

    Args:
      is_training: Boolean; whether to construct the training version of the
        computation graph.
      num_classes: Number of classes, *excluding* the background category.
        With groundtruth labels in {0, 1, .., K-1} this is K (not K+1), even
        though assigned classification targets can range over {0,... K}.
      image_resizer_fn: Callable used to resize images. It always receives a
        rank-3 tensor for a single image and returns a rank-3 image tensor
        whose spatial dimensions may differ. See
        builders/image_resizer_builder.py.
      feature_extractor: A FasterRCNNFeatureExtractor object.
      number_of_stages: Either 1 or 2. With 1, only the Region Proposal
        Network (RPN) part of the model is constructed.
      first_stage_anchor_generator: An anchor_generator.AnchorGenerator
        object (currently only grid_anchor_generator.GridAnchorGenerator
        objects are supported).
      first_stage_target_assigner: Target assigner for the first stage of
        R-FCN (RPN).
      first_stage_atrous_rate: Single integer atrous rate of the one
        convolution op applied to the `rpn_features_to_crop` tensor to get
        the tensor used for box prediction. Some feature extractors can
        produce feature maps at denser resolutions; the atrous rate
        compensates by giving an effectively larger receptive field.
        Typically 1.
      first_stage_box_predictor_arg_scope_fn: Function generating the tf-slim
        arg_scope for conv2d, separable_conv2d and fully_connected ops of
        the RPN box predictor.
      first_stage_box_predictor_kernel_size: Kernel size of the convolution
        op immediately before the RPN box predictions.
      first_stage_box_predictor_depth: Output depth of the convolution op
        immediately before the RPN box predictions.
      first_stage_minibatch_size: "Batch size" used for the RPN objectness
        and location losses, i.e. the number of anchors per image that
        contribute to the loss. The name follows the terminology of the
        Faster R-CNN paper.
      first_stage_sampler: Sampler for the boxes used to compute the RPN
        loss after the first stage.
      first_stage_non_max_suppression_fn: A
        batch_multiclass_non_max_suppression callable taking `boxes`,
        `scores` and an optional `clip_window` (all other inputs already
        bound) and returning a dictionary of tensors keyed by
        `detection_boxes`, `detection_scores`, `detection_classes` and
        `num_detections`. Applied to the boxes predicted by the RPN. See
        `post_processing.batch_multiclass_non_max_suppression` for tensor
        types and shapes.
      first_stage_max_proposals: Maximum number of boxes kept after
        Non-Max Suppression (NMS) of the RPN's predicted boxes.
      first_stage_localization_loss_weight: A float.
      first_stage_objectness_loss_weight: A float.
      crop_and_resize_fn: Differentiable resampler used to crop RPN proposal
        features.
      second_stage_target_assigner: Target assigner for the second stage of
        R-FCN. With multiple prediction heads, the same assigner produces
        the targets for every head (with the correct
        `unmatched_class_label`).
      second_stage_rfcn_box_predictor: RFCN box predictor used for the
        second stage.
      second_stage_batch_size: "Batch size" used for the box classifier's
        classification and refined location losses, i.e. the number of
        proposals per image that contribute to the loss. The name follows
        the terminology of the Faster R-CNN paper.
      second_stage_sampler: Sampler for the boxes used by the second stage
        box classifier.
      second_stage_non_max_suppression_fn: A
        batch_multiclass_non_max_suppression callable taking `boxes`,
        `scores`, an optional `clip_window` and an optional (kwarg) `mask`
        (all other inputs already bound) and returning a dictionary of
        tensors keyed by `detection_boxes`, `detection_scores`,
        `detection_classes`, `num_detections` and, optionally,
        `detection_masks`. See
        `post_processing.batch_multiclass_non_max_suppression` for tensor
        types and shapes.
      second_stage_score_conversion_fn: Elementwise nonlinearity mapping
        tensors to tensors, usually used to turn logits into probabilities.
      second_stage_localization_loss_weight: A float.
      second_stage_classification_loss_weight: A float.
      second_stage_classification_loss: String naming the loss function;
        'softmax' and 'sigmoid' are supported.
      hard_example_miner: A losses.HardExampleMiner object (can be None).
      parallel_iterations: (Optional) Number of iterations allowed to run in
        parallel for calls to tf.map_fn.
      add_summaries: Boolean (default: True); whether summary ops should be
        added to the tensorflow graph.
      clip_anchors_to_image: The generated anchors are clipped to the window
        size without filtering out the nonoverlapping anchors, which yields
        a static number of anchors. Documented as unused; this constructor
        only forwards it to the base class.
      use_static_shapes: If True, uses op implementations with static shape
        guarantees.
      resize_masks: Whether the masks present in the groundtruth should be
        resized in the model with `image_resizer_fn`.

    Raises:
      ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`.
      ValueError: If first_stage_anchor_generator is not of type
        grid_anchor_generator.GridAnchorGenerator.
    """
    # This constructor does no validation of its own; everything is passed
    # positionally, so the order below has to match the base-class signature.
    super(RFCNMetaArch, self).__init__(
        is_training,
        num_classes,
        image_resizer_fn,
        feature_extractor,
        number_of_stages,
        # First stage (RPN) configuration.
        first_stage_anchor_generator,
        first_stage_target_assigner,
        first_stage_atrous_rate,
        first_stage_box_predictor_arg_scope_fn,
        first_stage_box_predictor_kernel_size,
        first_stage_box_predictor_depth,
        first_stage_minibatch_size,
        first_stage_sampler,
        first_stage_non_max_suppression_fn,
        first_stage_max_proposals,
        first_stage_localization_loss_weight,
        first_stage_objectness_loss_weight,
        crop_and_resize_fn,
        # NOTE(review): three base-class slots R-FCN leaves unset; presumably
        # the crop / max-pool settings -- confirm against FasterRCNNMetaArch.
        None,
        None,
        None,
        second_stage_target_assigner,
        # NOTE(review): presumably the Faster R-CNN box predictor slot; R-FCN
        # keeps its own predictor in `self._rfcn_box_predictor` below.
        None,
        second_stage_batch_size,
        second_stage_sampler,
        second_stage_non_max_suppression_fn,
        second_stage_score_conversion_fn,
        second_stage_localization_loss_weight,
        second_stage_classification_loss_weight,
        second_stage_classification_loss,
        # NOTE(review): fixed weight with no R-FCN counterpart; presumably the
        # mask prediction loss weight -- confirm against the base class.
        1.0,
        hard_example_miner,
        parallel_iterations,
        add_summaries,
        clip_anchors_to_image,
        use_static_shapes,
        resize_masks)

    self._rfcn_box_predictor = second_stage_rfcn_box_predictor

  def _predict_second_stage(self, rpn_box_encodings,
                            rpn_objectness_predictions_with_background,
                            rpn_features,
                            anchors,
                            image_shape,
                            true_image_shapes):
    """Computes the second-stage (R-FCN) prediction tensors.

    Args:
      rpn_box_encodings: 3-D float tensor of shape
        [batch_size, num_valid_anchors, self._box_coder.code_size] holding
        the predicted boxes.
      rpn_objectness_predictions_with_background: 3-D float tensor of shape
        [batch_size, num_valid_anchors, 2] holding class predictions
        (logits) for each anchor. Background class predictions are
        *included* (at class index 0).
      rpn_features: 4-D float32 tensor of shape
        [batch_size, height, width, depth] with the image features from the
        RPN.
      anchors: 2-D float tensor of shape
        [num_anchors, self._box_coder.code_size].
      image_shape: 1-D int32 tensor of size [4] holding the image shape.
      true_image_shapes: int32 tensor of shape [batch, 3]; each row is
        [height, width, channels], the shape of the true image inside the
        resized image (resized images can be padded with zeros).

    Returns:
      prediction_dict: a dictionary holding "raw" prediction tensors:
        1) refined_box_encodings: 3-D tensor of shape
          [total_num_proposals, num_classes, 4] with the predicted (final)
          refined box encodings, where
          total_num_proposals=batch_size*self._max_num_proposals.
        2) class_predictions_with_background: 2-D tensor of shape
          [total_num_proposals, num_classes + 1] with class predictions
          (logits) for each of the anchors, where
          total_num_proposals=batch_size*self._max_num_proposals.
          Background class predictions are *included* (at class index 0).
        3) num_proposals: int32 tensor of shape [batch_size] giving the
          number of proposals generated by the RPN. Because the proposals
          are always padded to `self.max_num_proposals` per image, this
          tells real entries apart from zero paddings.
        4) proposal_boxes: float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] with the decoded proposal
          bounding boxes (in absolute coordinates).
        5) proposal_boxes_normalized: float32 tensor of shape
          [batch_size, self.max_num_proposals, 4] with the decoded proposal
          bounding boxes (in normalized coordinates). Can be used to
          override the boxes proposed by the RPN, which enables extracting
          box classification and prediction for externally selected areas
          of the image.
        6) box_classifier_features: 4-D float32 tensor of shape
          [batch_size, feature_map_height, feature_map_width, depth] with
          the box classifier features.
    """

    def _merge_along_axis_one(tensors):
      """Concatenates `tensors` on axis 1, then squeezes that axis out."""
      return tf.squeeze(tf.concat(tensors, axis=1), axis=1)

    # Repeat everything after the first entry of `image_shape` once per
    # `image_shape[0]`, giving one shape row per batch element.
    shape_row = tf.expand_dims(image_shape[1:], 0)
    image_shape_2d = tf.tile(shape_row, [image_shape[0], 1])

    rpn_outputs = self._postprocess_rpn(
        rpn_box_encodings,
        rpn_objectness_predictions_with_background,
        anchors,
        image_shape_2d,
        true_image_shapes)
    # Of the five values returned, only the first and the third are used.
    proposal_boxes_normalized, _, num_proposals, _, _ = rpn_outputs

    box_classifier_features = (
        self._feature_extractor.extract_box_classifier_features(
            rpn_features, scope=self.second_stage_feature_extractor_scope))

    # Keras predictors are invoked directly; all others go through
    # `predict`, which additionally takes a scope and the number of
    # predictions per location.
    rfcn_predictor = self._rfcn_box_predictor
    if rfcn_predictor.is_keras_model:
      box_predictions = rfcn_predictor(
          [box_classifier_features],
          proposal_boxes=proposal_boxes_normalized)
    else:
      box_predictions = rfcn_predictor.predict(
          [box_classifier_features],
          num_predictions_per_location=[1],
          scope=self.second_stage_box_predictor_scope,
          proposal_boxes=proposal_boxes_normalized)

    refined_box_encodings = _merge_along_axis_one(
        box_predictions[box_predictor.BOX_ENCODINGS])
    class_predictions_with_background = _merge_along_axis_one(
        box_predictions[box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND])

    absolute_proposal_boxes = ops.normalized_to_image_coordinates(
        proposal_boxes_normalized,
        image_shape,
        parallel_iterations=self._parallel_iterations)

    return {
        'refined_box_encodings': refined_box_encodings,
        'class_predictions_with_background':
            class_predictions_with_background,
        'num_proposals': num_proposals,
        'proposal_boxes': absolute_proposal_boxes,
        'box_classifier_features': box_classifier_features,
        'proposal_boxes_normalized': proposal_boxes_normalized,
    }
|
|