Spaces:

pat229988
/

DR-App

Sleeping

App Files Files Community

DR-App / object_detection /meta_architectures /rfcn_meta_arch.py

pat229988

Upload 653 files

9a393e2 over 1 year ago

raw

history blame contribute delete

16.8 kB

	# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================
	"""R-FCN meta-architecture definition.

	R-FCN: Dai, Jifeng, et al. "R-FCN: Object Detection via Region-based
	Fully Convolutional Networks." arXiv preprint arXiv:1605.06409 (2016).

	The R-FCN meta-architecture is similar to Faster R-CNN and differs only in the
	second stage. Hence this class inherits from FasterRCNNMetaArch and, apart from
	its constructor, overrides only the `_predict_second_stage` method.

	Similar to Faster R-CNN we allow for two modes: number_of_stages=1 and
	number_of_stages=2. In the former setting, all of the user facing methods
	(e.g., predict, postprocess, loss) can be used as if the model consisted
	only of the RPN, returning class agnostic proposals (these can be thought of as
	approximate detections with no associated class information). In the latter
	setting, proposals are computed, then passed through a second stage
	"box classifier" to yield (multi-class) detections.

	Implementations of R-FCN models must define a new FasterRCNNFeatureExtractor and
	override three methods: `preprocess`, `_extract_proposal_features` (the first
	stage of the model), and `_extract_box_classifier_features` (the second stage of
	the model). Optionally, the `restore_fn` method can be overridden. See tests
	for an example.

	See notes in the documentation of Faster R-CNN meta-architecture as they all
	apply here.
	"""
	import tensorflow as tf

	from object_detection.core import box_predictor
	from object_detection.meta_architectures import faster_rcnn_meta_arch
	from object_detection.utils import ops


	class RFCNMetaArch(faster_rcnn_meta_arch.FasterRCNNMetaArch):
	  """R-FCN meta-architecture.

	  Subclass of the Faster R-CNN meta-architecture that supplies its own
	  constructor and second-stage prediction; the second stage is delegated to
	  an R-FCN box predictor.
	  """

	  def __init__(self,
	               is_training,
	               num_classes,
	               image_resizer_fn,
	               feature_extractor,
	               number_of_stages,
	               first_stage_anchor_generator,
	               first_stage_target_assigner,
	               first_stage_atrous_rate,
	               first_stage_box_predictor_arg_scope_fn,
	               first_stage_box_predictor_kernel_size,
	               first_stage_box_predictor_depth,
	               first_stage_minibatch_size,
	               first_stage_sampler,
	               first_stage_non_max_suppression_fn,
	               first_stage_max_proposals,
	               first_stage_localization_loss_weight,
	               first_stage_objectness_loss_weight,
	               crop_and_resize_fn,
	               second_stage_target_assigner,
	               second_stage_rfcn_box_predictor,
	               second_stage_batch_size,
	               second_stage_sampler,
	               second_stage_non_max_suppression_fn,
	               second_stage_score_conversion_fn,
	               second_stage_localization_loss_weight,
	               second_stage_classification_loss_weight,
	               second_stage_classification_loss,
	               hard_example_miner,
	               parallel_iterations=16,
	               add_summaries=True,
	               clip_anchors_to_image=False,
	               use_static_shapes=False,
	               resize_masks=False):
	    """Builds an R-FCN meta-architecture.

	    All arguments except `second_stage_rfcn_box_predictor` are forwarded
	    positionally to the FasterRCNNMetaArch constructor; the R-FCN box
	    predictor is stored on this instance for use in the second stage.

	    Args:
	      is_training: A boolean indicating whether the training version of the
	        computation graph should be constructed.
	      num_classes: Number of classes. Note that num_classes does not include
	        the background category, so if groundtruth labels take values in
	        {0, 1, .., K-1}, num_classes=K (and not K+1, even though the assigned
	        classification targets can range from {0,... K}).
	      image_resizer_fn: A callable for image resizing. This callable always
	        takes a rank-3 image tensor (corresponding to a single image) and
	        returns a rank-3 image tensor, possibly with new spatial dimensions.
	        See builders/image_resizer_builder.py.
	      feature_extractor: A FasterRCNNFeatureExtractor object.
	      number_of_stages: Valid values are {1, 2}. If 1, only the Region
	        Proposal Network (RPN) part of the model is constructed.
	      first_stage_anchor_generator: An anchor_generator.AnchorGenerator
	        object (note that currently only
	        grid_anchor_generator.GridAnchorGenerator objects are supported).
	      first_stage_target_assigner: Target assigner to use for the first
	        stage of R-FCN (RPN).
	      first_stage_atrous_rate: A single integer indicating the atrous rate
	        for the single convolution op which is applied to the
	        `rpn_features_to_crop` tensor to obtain a tensor to be used for box
	        prediction. Some feature extractors optionally allow for producing
	        feature maps computed at denser resolutions. The atrous rate is used
	        to compensate for the denser feature maps by using an effectively
	        larger receptive field. (This should typically be set to 1.)
	      first_stage_box_predictor_arg_scope_fn: A function to generate the
	        tf-slim arg_scope for conv2d, separable_conv2d and fully_connected
	        ops for the RPN box predictor.
	      first_stage_box_predictor_kernel_size: Kernel size to use for the
	        convolution op just prior to RPN box predictions.
	      first_stage_box_predictor_depth: Output depth for the convolution op
	        just prior to RPN box predictions.
	      first_stage_minibatch_size: The "batch size" to use for computing the
	        objectness and location loss of the region proposal network. This
	        "batch size" refers to the number of anchors selected as
	        contributing to the loss function for any given image within the
	        image batch and is only called "batch_size" due to terminology from
	        the Faster R-CNN paper.
	      first_stage_sampler: The sampler for the boxes used to calculate the
	        RPN loss after the first stage.
	      first_stage_non_max_suppression_fn:
	        batch_multiclass_non_max_suppression callable that takes `boxes`,
	        `scores` and optional `clip_window` (with all other inputs already
	        set) and returns a dictionary containing tensors with keys:
	        `detection_boxes`, `detection_scores`, `detection_classes`,
	        `num_detections`. This is used to perform non-max suppression on the
	        boxes predicted by the Region Proposal Network (RPN). See
	        `post_processing.batch_multiclass_non_max_suppression` for the type
	        and shape of these tensors.
	      first_stage_max_proposals: Maximum number of boxes to retain after
	        performing Non-Max Suppression (NMS) on the boxes predicted by the
	        Region Proposal Network (RPN).
	      first_stage_localization_loss_weight: A float.
	      first_stage_objectness_loss_weight: A float.
	      crop_and_resize_fn: A differentiable resampler to use for cropping RPN
	        proposal features.
	      second_stage_target_assigner: Target assigner to use for the second
	        stage of R-FCN. If the model is configured with multiple prediction
	        heads, this target assigner is used to generate targets for all
	        heads (with the correct `unmatched_class_label`).
	      second_stage_rfcn_box_predictor: RFCN box predictor to use for the
	        second stage.
	      second_stage_batch_size: The batch size used for computing the
	        classification and refined location loss of the box classifier.
	        This "batch size" refers to the number of proposals selected as
	        contributing to the loss function for any given image within the
	        image batch and is only called "batch_size" due to terminology from
	        the Faster R-CNN paper.
	      second_stage_sampler: The sampler for the boxes used for the second
	        stage box classifier.
	      second_stage_non_max_suppression_fn:
	        batch_multiclass_non_max_suppression callable that takes `boxes`,
	        `scores`, optional `clip_window` and optional (kwarg) `mask` inputs
	        (with all other inputs already set) and returns a dictionary
	        containing tensors with keys: `detection_boxes`,
	        `detection_scores`, `detection_classes`, `num_detections`, and
	        (optionally) `detection_masks`. See
	        `post_processing.batch_multiclass_non_max_suppression` for the type
	        and shape of these tensors.
	      second_stage_score_conversion_fn: Callable elementwise nonlinearity
	        (that takes tensors as inputs and returns tensors). This is usually
	        used to convert logits to probabilities.
	      second_stage_localization_loss_weight: A float.
	      second_stage_classification_loss_weight: A float.
	      second_stage_classification_loss: A string indicating which loss
	        function to use; supports 'softmax' and 'sigmoid'.
	      hard_example_miner: A losses.HardExampleMiner object (can be None).
	      parallel_iterations: (Optional) The number of iterations allowed to
	        run in parallel for calls to tf.map_fn.
	      add_summaries: boolean (default: True) controlling whether summary ops
	        should be added to the tensorflow graph.
	      clip_anchors_to_image: The anchors generated are clipped to the window
	        size without filtering the non-overlapping anchors. This generates a
	        static number of anchors. This argument is documented as unused; it
	        is only passed through to the base class here.
	      use_static_shapes: If True, uses implementation of ops with static
	        shape guarantees.
	      resize_masks: Indicates whether the masks present in the groundtruth
	        should be resized in the model with `image_resizer_fn`.

	    Raises:
	      ValueError: If `second_stage_batch_size` > `first_stage_max_proposals`.
	      ValueError: If first_stage_anchor_generator is not of type
	        grid_anchor_generator.GridAnchorGenerator.
	      (Neither check is performed in this constructor itself; presumably both
	      originate in the base-class constructor.)
	    """
	    # TODO(rathodv): add_summaries and crop_and_resize_fn are currently
	    # unused by R-FCN and merely forwarded. Respect them in the future.
	    #
	    # The base constructor is invoked positionally, so the order below must
	    # stay in sync with the FasterRCNNMetaArch signature. Slots that have no
	    # meaning for R-FCN are filled with placeholders.
	    super(RFCNMetaArch, self).__init__(
	        is_training,
	        num_classes,
	        image_resizer_fn,
	        feature_extractor,
	        number_of_stages,
	        first_stage_anchor_generator,
	        first_stage_target_assigner,
	        first_stage_atrous_rate,
	        first_stage_box_predictor_arg_scope_fn,
	        first_stage_box_predictor_kernel_size,
	        first_stage_box_predictor_depth,
	        first_stage_minibatch_size,
	        first_stage_sampler,
	        first_stage_non_max_suppression_fn,
	        first_stage_max_proposals,
	        first_stage_localization_loss_weight,
	        first_stage_objectness_loss_weight,
	        crop_and_resize_fn,
	        None,  # initial_crop_size is not used in R-FCN.
	        None,  # maxpool_kernel_size is not used in R-FCN.
	        None,  # maxpool_stride is not used in R-FCN.
	        second_stage_target_assigner,
	        None,  # fully_connected_box_predictor is not used in R-FCN.
	        second_stage_batch_size,
	        second_stage_sampler,
	        second_stage_non_max_suppression_fn,
	        second_stage_score_conversion_fn,
	        second_stage_localization_loss_weight,
	        second_stage_classification_loss_weight,
	        second_stage_classification_loss,
	        1.0,  # Second stage mask prediction loss weight isn't used in R-FCN.
	        hard_example_miner,
	        parallel_iterations,
	        add_summaries,
	        clip_anchors_to_image,
	        use_static_shapes,
	        resize_masks)

	    # Kept on the instance because the second stage calls it directly.
	    self._rfcn_box_predictor = second_stage_rfcn_box_predictor

	  def _predict_second_stage(self, rpn_box_encodings,
	                            rpn_objectness_predictions_with_background,
	                            rpn_features,
	                            anchors,
	                            image_shape,
	                            true_image_shapes):
	    """Computes the raw second-stage (R-FCN) prediction tensors.

	    Args:
	      rpn_box_encodings: 3-D float tensor of shape
	        [batch_size, num_valid_anchors, self._box_coder.code_size] containing
	        predicted boxes.
	      rpn_objectness_predictions_with_background: 3-D float tensor of shape
	        [batch_size, num_valid_anchors, 2] containing class predictions
	        (logits) for each of the anchors. Note that this tensor includes
	        background class predictions (at class index 0).
	      rpn_features: A 4-D float32 tensor with shape
	        [batch_size, height, width, depth] representing image features from
	        the RPN.
	      anchors: 2-D float tensor of shape
	        [num_anchors, self._box_coder.code_size].
	      image_shape: A 1-D int32 tensor of size [4] containing the image shape.
	      true_image_shapes: int32 tensor of shape [batch, 3] where each row is
	        of the form [height, width, channels] indicating the shapes of true
	        images in the resized images, as resized images can be padded with
	        zeros.

	    Returns:
	      prediction_dict: a dictionary holding "raw" prediction tensors:
	        1) refined_box_encodings: a 3-D tensor with shape
	          [total_num_proposals, num_classes, 4] representing predicted
	          (final) refined box encodings, where
	          total_num_proposals=batch_size*self._max_num_proposals.
	        2) class_predictions_with_background: a 2-D tensor with shape
	          [total_num_proposals, num_classes + 1] containing class
	          predictions (logits) for each of the anchors, where
	          total_num_proposals=batch_size*self._max_num_proposals.
	          Note that this tensor includes background class predictions
	          (at class index 0).
	        3) num_proposals: An int32 tensor of shape [batch_size] representing
	          the number of proposals generated by the RPN. `num_proposals`
	          allows us to keep track of which entries are to be treated as zero
	          paddings and which are not, since the number of proposals is
	          always padded to `self.max_num_proposals` for each image.
	        4) proposal_boxes: A float32 tensor of shape
	          [batch_size, self.max_num_proposals, 4] representing decoded
	          proposal bounding boxes (in absolute coordinates).
	        5) proposal_boxes_normalized: A float32 tensor of shape
	          [batch_size, self.max_num_proposals, 4] representing decoded
	          proposal bounding boxes (in normalized coordinates). Can be used to
	          override the boxes proposed by the RPN, thus enabling one to
	          extract box classification and prediction for externally selected
	          areas of the image.
	        6) box_classifier_features: a 4-D float32 tensor, of shape
	          [batch_size, feature_map_height, feature_map_width, depth],
	          representing the box classifier features.
	    """
	    # Everything after the leading entry of `image_shape` is repeated
	    # `image_shape[0]` times (presumably the batch size) to obtain one shape
	    # row per image for the RPN postprocessing step.
	    single_image_shape = tf.expand_dims(image_shape[1:], 0)
	    image_shape_2d = tf.tile(single_image_shape, [image_shape[0], 1])

	    # Only the normalized proposals and their per-image counts are needed
	    # from the five values returned by the RPN postprocessor.
	    rpn_results = self._postprocess_rpn(
	        rpn_box_encodings, rpn_objectness_predictions_with_background,
	        anchors, image_shape_2d, true_image_shapes)
	    proposal_boxes_normalized, _, num_proposals, _, _ = rpn_results

	    extract_features = self._feature_extractor.extract_box_classifier_features
	    box_classifier_features = extract_features(
	        rpn_features, scope=self.second_stage_feature_extractor_scope)

	    # Keras predictors are called directly; the other kind goes through
	    # `predict` and additionally needs a scope and the number of predictions
	    # per location.
	    predictor = self._rfcn_box_predictor
	    feature_maps = [box_classifier_features]
	    if predictor.is_keras_model:
	      box_predictions = predictor(
	          feature_maps, proposal_boxes=proposal_boxes_normalized)
	    else:
	      box_predictions = predictor.predict(
	          feature_maps,
	          num_predictions_per_location=[1],
	          scope=self.second_stage_box_predictor_scope,
	          proposal_boxes=proposal_boxes_normalized)

	    def _concat_and_squeeze(key):
	      # Joins the predictor outputs stored under `key` along axis 1 and then
	      # drops that axis.
	      return tf.squeeze(tf.concat(box_predictions[key], axis=1), axis=1)

	    refined_box_encodings = _concat_and_squeeze(box_predictor.BOX_ENCODINGS)
	    class_predictions_with_background = _concat_and_squeeze(
	        box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND)

	    proposal_boxes_absolute = ops.normalized_to_image_coordinates(
	        proposal_boxes_normalized, image_shape,
	        parallel_iterations=self._parallel_iterations)

	    return {
	        'refined_box_encodings': refined_box_encodings,
	        'class_predictions_with_background':
	            class_predictions_with_background,
	        'num_proposals': num_proposals,
	        'proposal_boxes': proposal_boxes_absolute,
	        'box_classifier_features': box_classifier_features,
	        'proposal_boxes_normalized': proposal_boxes_normalized,
	    }