# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Losses for maskrcnn model."""
# Import libraries
import tensorflow as tf, tf_keras


class RpnScoreLoss(object):
"""Region Proposal Network score loss function."""

  def __init__(self, rpn_batch_size_per_im):
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._binary_crossentropy = tf_keras.losses.BinaryCrossentropy(
reduction=tf_keras.losses.Reduction.SUM, from_logits=True)

  def __call__(self, score_outputs, labels):
    """Computes total RPN detection loss.

    Computes total RPN detection loss including box and score from all levels.

    Args:
      score_outputs: an OrderedDict with keys representing levels and values
        representing scores in [batch_size, height, width, num_anchors].
      labels: the dictionary returned from the dataloader that includes
        ground-truth targets.

    Returns:
      rpn_score_loss: a scalar tensor representing total score loss.
    """
with tf.name_scope('rpn_loss'):
levels = sorted(score_outputs.keys())
score_losses = []
for level in levels:
score_losses.append(
self._rpn_score_loss(
score_outputs[level],
labels[level],
normalizer=tf.cast(
tf.shape(score_outputs[level])[0] *
self._rpn_batch_size_per_im,
dtype=score_outputs[level].dtype)))
# Sums per level losses to total loss.
return tf.math.add_n(score_losses)

  def _rpn_score_loss(self, score_outputs, score_targets, normalizer=1.0):
"""Computes score loss."""
# score_targets has three values:
# (1) score_targets[i]=1, the anchor is a positive sample.
# (2) score_targets[i]=0, negative.
# (3) score_targets[i]=-1, the anchor is don't care (ignore).
with tf.name_scope('rpn_score_loss'):
mask = tf.math.logical_or(tf.math.equal(score_targets, 1),
tf.math.equal(score_targets, 0))
score_targets = tf.math.maximum(score_targets,
tf.zeros_like(score_targets))
score_targets = tf.expand_dims(score_targets, axis=-1)
score_outputs = tf.expand_dims(score_outputs, axis=-1)
score_loss = self._binary_crossentropy(
score_targets, score_outputs, sample_weight=mask)
score_loss /= normalizer
return score_loss
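

# --- Illustrative usage sketch (not part of the original library) ---
# A minimal example of driving RpnScoreLoss with dummy tensors. The level
# key '3', the shapes, the batch size, and the helper name are assumptions
# made for demonstration only; real inputs come from the detection
# dataloader and the experiment config.
def _example_rpn_score_loss():  # hypothetical helper, not in the source
  rpn_score_loss_fn = RpnScoreLoss(rpn_batch_size_per_im=256)
  # Logits per level: [batch_size, height, width, num_anchors].
  score_outputs = {'3': tf.random.normal([2, 8, 8, 3])}
  # Score targets use 1 (positive), 0 (negative) and -1 (ignored); the loss
  # masks out the -1 entries and clamps them to 0 before the cross entropy.
  labels = {'3': tf.cast(
      tf.random.uniform([2, 8, 8, 3], minval=-1, maxval=2, dtype=tf.int32),
      tf.float32)}
  return rpn_score_loss_fn(score_outputs, labels)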


class RpnBoxLoss(object):
  """Region Proposal Network box regression loss function."""

  def __init__(self, huber_loss_delta: float):
    # The delta is typically around the mean value of the regression target.
    # For instance, the regression targets of a 512x512 input with 6 anchors
    # on the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
self._huber_loss = tf_keras.losses.Huber(
delta=huber_loss_delta, reduction=tf_keras.losses.Reduction.SUM)

  def __call__(self, box_outputs, labels):
    """Computes total RPN detection loss.

    Computes total RPN detection loss including box and score from all levels.

    Args:
      box_outputs: an OrderedDict with keys representing levels and values
        representing box regression targets in [batch_size, height, width,
        num_anchors * 4].
      labels: the dictionary returned from the dataloader that includes
        ground-truth targets.

    Returns:
      rpn_box_loss: a scalar tensor representing total box regression loss.
    """
with tf.name_scope('rpn_loss'):
levels = sorted(box_outputs.keys())
box_losses = []
for level in levels:
box_losses.append(self._rpn_box_loss(box_outputs[level], labels[level]))
      # Sums per level losses to total loss.
      return tf.add_n(box_losses)

  def _rpn_box_loss(self, box_outputs, box_targets, normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('rpn_box_loss'):
_, height, width, num_anchors_vertices = box_targets.get_shape().as_list()
# (batch_size, height, width, num_anchors, 4)
reshaped_box_targets = tf.reshape(
box_targets, [-1, height, width, num_anchors_vertices // 4, 4])
      # The box is valid if at least one of ymin, xmin, ymax, xmax is not 0.
# (batch_size, height, width, num_anchors)
valid_mask = tf.reduce_any(
tf.math.abs(reshaped_box_targets) > 1e-6, axis=-1)
# (batch_size, height, width, num_anchors * 4)
valid_mask = tf.cast(
tf.repeat(valid_mask, 4, axis=-1), dtype=box_outputs.dtype)
# (batch_size, height, width, num_anchors * 4, 1)
box_targets = tf.expand_dims(box_targets, axis=-1)
# (batch_size, height, width, num_anchors * 4, 1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(
box_targets, box_outputs, sample_weight=valid_mask)
# The loss is normalized by the sum of non-zero weights and additional
# normalizer provided by the function caller. Using + 0.01 here to avoid
# division by zero. For each replica, get the sum of non-zero masks. Then
# get the mean of sums from all replicas. Note there is an extra division
# by `num_replicas` in train_step(). So it is equivalent to normalizing
# the box loss by the global sum of non-zero masks.
replica_context = tf.distribute.get_replica_context()
valid_mask = tf.reduce_sum(valid_mask)
valid_mask_mean = replica_context.all_reduce(
tf.distribute.ReduceOp.MEAN, valid_mask
)
box_loss /= normalizer * (valid_mask_mean + 0.01)
return box_loss
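

# --- Illustrative usage sketch (not part of the original library) ---
# A minimal example of RpnBoxLoss on a single made-up pyramid level. The
# delta value, level key, shapes, and the helper name are assumptions for
# demonstration; in practice they come from the experiment config and the
# dataloader. Anchors whose 4 regression targets are all (near) zero are
# treated as invalid and receive zero weight.
def _example_rpn_box_loss():  # hypothetical helper, not in the source
  rpn_box_loss_fn = RpnBoxLoss(huber_loss_delta=1. / 9.)
  # Per level: [batch_size, height, width, num_anchors * 4].
  box_outputs = {'3': tf.random.normal([2, 8, 8, 12])}
  labels = {'3': tf.random.normal([2, 8, 8, 12])}
  return rpn_box_loss_fn(box_outputs, labels)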


class FastrcnnClassLoss(object):
  """Fast R-CNN classification loss function."""

  def __init__(self,
use_binary_cross_entropy: bool = False,
top_k_percent: float = 1.0):
"""Initializes loss computation.
Args:
use_binary_cross_entropy: If true, uses binary cross entropy loss,
otherwise uses categorical cross entropy loss.
top_k_percent: a float, the value lies in [0.0, 1.0]. When its value < 1.,
only aggregate the top k percent of losses. This is useful for hard
example mining.
"""
self._use_binary_cross_entropy = use_binary_cross_entropy
self._top_k_percent = top_k_percent

  def __call__(self, class_outputs, class_targets, class_weights=None):
    """Computes the class loss (Fast-RCNN branch) of Mask-RCNN.

    This function implements the classification loss of the Fast-RCNN.
    The classification loss is categorical (or binary) cross entropy on all
    RoIs.

    Reference:
    https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py  # pylint: disable=line-too-long

    Args:
      class_outputs: a float tensor representing the class prediction for each
        box with a shape of [batch_size, num_boxes, num_classes].
      class_targets: a float tensor representing the class label for each box
        with a shape of [batch_size, num_boxes].
      class_weights: a float list containing the weight of each class.

    Returns:
      a scalar tensor representing total class loss.
    """
with tf.name_scope('fast_rcnn_loss'):
output_dtype = class_outputs.dtype
num_classes = class_outputs.get_shape().as_list()[-1]
class_weights = (
class_weights if class_weights is not None else [1.0] * num_classes
)
if num_classes != len(class_weights):
raise ValueError(
'Length of class_weights should be {}'.format(num_classes)
)
class_weights = tf.constant(class_weights, dtype=output_dtype)
class_targets_one_hot = tf.one_hot(
tf.cast(class_targets, dtype=tf.int32),
num_classes,
dtype=class_outputs.dtype)
if self._use_binary_cross_entropy:
# (batch_size, num_boxes, num_classes)
cross_entropy_loss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=class_targets_one_hot, logits=class_outputs)
cross_entropy_loss *= class_weights
else:
# (batch_size, num_boxes)
cross_entropy_loss = tf.nn.softmax_cross_entropy_with_logits(
labels=class_targets_one_hot, logits=class_outputs)
class_weight_mask = tf.einsum(
'...y,y->...', class_targets_one_hot, class_weights
)
cross_entropy_loss *= class_weight_mask
if self._top_k_percent < 1.0:
return self.aggregate_loss_top_k(cross_entropy_loss)
else:
return tf.reduce_mean(cross_entropy_loss)

  def aggregate_loss_top_k(self, loss, num_valid_values=None):
    """Aggregates the top-k greatest loss values.

    Args:
      loss: a float tensor in shape (batch_size, num_boxes) or (batch_size,
        num_boxes, num_classes) which stores the loss values.
      num_valid_values: the number of loss values which are not ignored. The
        default value is None, which means all the loss values are valid.

    Returns:
      A 0-D float which stores the overall loss of the batch.
    """
loss = tf.reshape(loss, shape=[-1])
top_k_num = tf.cast(
self._top_k_percent * tf.size(loss, out_type=tf.float32), tf.int32)
top_k_losses, _ = tf.math.top_k(loss, k=top_k_num)
normalizer = tf.cast(top_k_num, loss.dtype)
if num_valid_values is not None:
normalizer = tf.minimum(normalizer, tf.cast(num_valid_values, loss.dtype))
return tf.reduce_sum(top_k_losses) / (normalizer + 1e-5)
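

# --- Illustrative usage sketch (not part of the original library) ---
# A minimal example of FastrcnnClassLoss with dummy logits and labels. The
# number of classes, boxes, the batch size, and the helper name are
# assumptions for demonstration. Setting top_k_percent < 1.0 would switch
# the reduction to aggregate_loss_top_k for hard example mining.
def _example_fast_rcnn_class_loss():  # hypothetical helper, not in the source
  class_loss_fn = FastrcnnClassLoss(use_binary_cross_entropy=False,
                                    top_k_percent=1.0)
  # [batch_size, num_boxes, num_classes] logits.
  class_outputs = tf.random.normal([2, 16, 91])
  # [batch_size, num_boxes] integer class labels (0 is background).
  class_targets = tf.random.uniform([2, 16], maxval=91, dtype=tf.int32)
  return class_loss_fn(class_outputs, class_targets)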


class FastrcnnBoxLoss(object):
  """Fast R-CNN box regression loss function."""

  def __init__(self,
huber_loss_delta: float,
class_agnostic_bbox_pred: bool = False):
"""Initiate Faster RCNN box loss.
Args:
huber_loss_delta: the delta is typically around the mean value of
regression target. For instances, the regression targets of 512x512
input with 6 anchors on P2-P6 pyramid is about [0.1, 0.1, 0.2, 0.2].
class_agnostic_bbox_pred: if True, class agnostic bounding box prediction
is performed.
"""
self._huber_loss = tf_keras.losses.Huber(
delta=huber_loss_delta, reduction=tf_keras.losses.Reduction.SUM)
self._class_agnostic_bbox_pred = class_agnostic_bbox_pred

  def __call__(self, box_outputs, class_targets, box_targets):
    """Computes the box loss (Fast-RCNN branch) of Mask-RCNN.

    This function implements the box regression loss of the Fast-RCNN. As the
    `box_outputs` produces `num_classes` boxes for each RoI, the reference
    model expands `box_targets` to match the shape of `box_outputs` and
    selects only the target with which the RoI has the maximum overlap.
    (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py)  # pylint: disable=line-too-long
    Instead, this function selects the `box_outputs` by the `class_targets`
    so that it doesn't expand `box_targets`.

    The box loss is smooth L1-loss on only positive samples of RoIs.
    Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py  # pylint: disable=line-too-long

    Args:
      box_outputs: a float tensor representing the box prediction for each box
        with a shape of [batch_size, num_boxes, num_classes * 4].
      class_targets: a float tensor representing the class label for each box
        with a shape of [batch_size, num_boxes].
      box_targets: a float tensor representing the box label for each box
        with a shape of [batch_size, num_boxes, 4].

    Returns:
      box_loss: a scalar tensor representing total box regression loss.
    """
with tf.name_scope('fast_rcnn_loss'):
class_targets = tf.cast(class_targets, dtype=tf.int32)
if not self._class_agnostic_bbox_pred:
box_outputs = self._assign_class_targets(box_outputs, class_targets)
return self._fast_rcnn_box_loss(box_outputs, box_targets, class_targets)

  def _assign_class_targets(self, box_outputs, class_targets):
    """Selects the box from `box_outputs` based on `class_targets`, with which the box has the maximum overlap."""
    _, num_rois, num_class_specific_boxes = box_outputs.get_shape().as_list()
    num_classes = num_class_specific_boxes // 4
    box_outputs = tf.reshape(box_outputs, [-1, num_rois, num_classes, 4])
    class_targets_one_hot = tf.one_hot(
        class_targets, num_classes, dtype=box_outputs.dtype
    )
    return tf.einsum('bnij,bni->bnj', box_outputs, class_targets_one_hot)

  def _fast_rcnn_box_loss(self, box_outputs, box_targets, class_targets,
                          normalizer=1.0):
"""Computes box regression loss."""
with tf.name_scope('fast_rcnn_box_loss'):
mask = tf.tile(
tf.expand_dims(tf.greater(class_targets, 0), axis=2), [1, 1, 4])
mask = tf.cast(mask, dtype=box_outputs.dtype)
box_targets = tf.expand_dims(box_targets, axis=-1)
box_outputs = tf.expand_dims(box_outputs, axis=-1)
box_loss = self._huber_loss(box_targets, box_outputs, sample_weight=mask)
      # The loss is normalized by the number of ones in mask and by the
      # additional normalizer provided by the caller; + 0.01 avoids division
      # by zero. For each replica, get the sum of non-zero masks. Then get
      # the mean of sums from all replicas. Note there is an extra division
      # by `num_replicas` in train_step(), so this is equivalent to
      # normalizing the box loss by the global sum of non-zero masks.
replica_context = tf.distribute.get_replica_context()
mask = tf.reduce_sum(mask)
mask_mean = replica_context.all_reduce(
tf.distribute.ReduceOp.MEAN, mask
)
box_loss /= normalizer * (mask_mean + 0.01)
return box_loss
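

# --- Illustrative usage sketch (not part of the original library) ---
# A minimal example of FastrcnnBoxLoss. The shapes, delta value, and helper
# name are assumptions for demonstration. With class-specific box
# predictions, the loss first gathers, per RoI, the 4 box outputs of the
# target class, then applies Huber loss on foreground RoIs only (class
# target > 0).
def _example_fast_rcnn_box_loss():  # hypothetical helper, not in the source
  box_loss_fn = FastrcnnBoxLoss(huber_loss_delta=1.0)
  num_classes = 91
  # [batch_size, num_boxes, num_classes * 4] class-specific box predictions.
  box_outputs = tf.random.normal([2, 16, num_classes * 4])
  class_targets = tf.random.uniform([2, 16], maxval=num_classes,
                                    dtype=tf.int32)
  # [batch_size, num_boxes, 4] regression targets for the assigned class.
  box_targets = tf.random.normal([2, 16, 4])
  return box_loss_fn(box_outputs, class_targets, box_targets)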


class MaskrcnnLoss(object):
  """Mask R-CNN instance segmentation mask loss function."""

  def __init__(self):
self._binary_crossentropy = tf_keras.losses.BinaryCrossentropy(
reduction=tf_keras.losses.Reduction.SUM, from_logits=True)

  def __call__(self, mask_outputs, mask_targets, select_class_targets):
    """Computes the mask loss of Mask-RCNN.

    This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
    produces `num_classes` masks for each RoI, the reference model expands
    `mask_targets` to match the shape of `mask_outputs` and selects only the
    target with which the RoI has the maximum overlap.
    (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py)  # pylint: disable=line-too-long
    Instead, this implementation selects the `mask_outputs` by the
    `class_targets` so that it doesn't expand `mask_targets`. Note that the
    selection logic is done in the post-processing of mask_rcnn_fn in
    mask_rcnn_architecture.py.

    Args:
      mask_outputs: a float tensor representing the prediction for each mask,
        with a shape of [batch_size, num_masks, mask_height, mask_width].
      mask_targets: a float tensor representing the binary mask of ground
        truth labels for each mask with a shape of
        [batch_size, num_masks, mask_height, mask_width].
      select_class_targets: a tensor with a shape of [batch_size, num_masks],
        representing the foreground mask targets.

    Returns:
      mask_loss: a float tensor representing total mask loss.
    """
with tf.name_scope('mask_rcnn_loss'):
_, _, mask_height, mask_width = mask_outputs.get_shape().as_list()
weights = tf.tile(
tf.greater(select_class_targets, 0)[:, :, tf.newaxis, tf.newaxis],
[1, 1, mask_height, mask_width],
)
weights = tf.cast(weights, dtype=mask_outputs.dtype)
mask_targets = tf.expand_dims(mask_targets, axis=-1)
mask_outputs = tf.expand_dims(mask_outputs, axis=-1)
mask_loss = self._binary_crossentropy(mask_targets, mask_outputs,
sample_weight=weights)
# For each replica, get the sum of non-zero weights. Then get the mean of
# sums from all replicas. Note there is an extra division by
# `num_replicas` in train_step(). So it is equivalent to normalizing the
# mask loss by the global sum of non-zero weights.
replica_context = tf.distribute.get_replica_context()
weights = tf.reduce_sum(weights)
weights_mean = replica_context.all_reduce(
tf.distribute.ReduceOp.MEAN, weights
)
# The loss is normalized by the number of 1's in weights and
# + 0.01 is used to avoid division by zero.
return mask_loss / (weights_mean + 0.01)
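

# --- Illustrative usage sketch (not part of the original library) ---
# A minimal example of MaskrcnnLoss with dummy 28x28 mask logits. The
# shapes and helper name are assumptions for demonstration; the per-mask
# class targets decide which masks contribute (background, class 0, gets
# zero weight).
def _example_mask_rcnn_loss():  # hypothetical helper, not in the source
  mask_loss_fn = MaskrcnnLoss()
  # [batch_size, num_masks, mask_height, mask_width] logits and binary masks.
  mask_outputs = tf.random.normal([2, 8, 28, 28])
  mask_targets = tf.cast(
      tf.random.uniform([2, 8, 28, 28], maxval=2, dtype=tf.int32), tf.float32)
  select_class_targets = tf.random.uniform([2, 8], maxval=91, dtype=tf.int32)
  return mask_loss_fn(mask_outputs, mask_targets, select_class_targets)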