# Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """Keypoint box coder. The keypoint box coder follows the coding schema described below (this is similar to the FasterRcnnBoxCoder, except that it encodes keypoints in addition to box coordinates): ty = (y - ya) / ha tx = (x - xa) / wa th = log(h / ha) tw = log(w / wa) tky0 = (ky0 - ya) / ha tkx0 = (kx0 - xa) / wa tky1 = (ky1 - ya) / ha tkx1 = (kx1 - xa) / wa ... where x, y, w, h denote the box's center coordinates, width and height respectively. Similarly, xa, ya, wa, ha denote the anchor's center coordinates, width and height. tx, ty, tw and th denote the anchor-encoded center, width and height respectively. ky0, kx0, ky1, kx1, ... denote the keypoints' coordinates, and tky0, tkx0, tky1, tkx1, ... denote the anchor-encoded keypoint coordinates. """ import tensorflow as tf from object_detection.core import box_coder from object_detection.core import box_list from object_detection.core import standard_fields as fields EPSILON = 1e-8 class KeypointBoxCoder(box_coder.BoxCoder): """Keypoint box coder.""" def __init__(self, num_keypoints, scale_factors=None): """Constructor for KeypointBoxCoder. Args: num_keypoints: Number of keypoints to encode/decode. scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. In addition to scaling ty and tx, the first 2 scalars are used to scale the y and x coordinates of the keypoints as well. If set to None, does not perform scaling. """ self._num_keypoints = num_keypoints if scale_factors: assert len(scale_factors) == 4 for scalar in scale_factors: assert scalar > 0 self._scale_factors = scale_factors self._keypoint_scale_factors = None if scale_factors is not None: self._keypoint_scale_factors = tf.expand_dims(tf.tile( [tf.to_float(scale_factors[0]), tf.to_float(scale_factors[1])], [num_keypoints]), 1) @property def code_size(self): return 4 + self._num_keypoints * 2 def _encode(self, boxes, anchors): """Encode a box and keypoint collection with respect to anchor collection. Args: boxes: BoxList holding N boxes and keypoints to be encoded. Boxes are tensors with the shape [N, 4], and keypoints are tensors with the shape [N, num_keypoints, 2]. anchors: BoxList of anchors. Returns: a tensor representing N anchor-encoded boxes of the format [ty, tx, th, tw, tky0, tkx0, tky1, tkx1, ...] where tky0 and tkx0 represent the y and x coordinates of the first keypoint, tky1 and tkx1 represent the y and x coordinates of the second keypoint, and so on. """ # Convert anchors to the center coordinate representation. ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes() keypoints = boxes.get_field(fields.BoxListFields.keypoints) keypoints = tf.transpose(tf.reshape(keypoints, [-1, self._num_keypoints * 2])) num_boxes = boxes.num_boxes() # Avoid NaN in division and log below. ha += EPSILON wa += EPSILON h += EPSILON w += EPSILON tx = (xcenter - xcenter_a) / wa ty = (ycenter - ycenter_a) / ha tw = tf.log(w / wa) th = tf.log(h / ha) tiled_anchor_centers = tf.tile( tf.stack([ycenter_a, xcenter_a]), [self._num_keypoints, 1]) tiled_anchor_sizes = tf.tile( tf.stack([ha, wa]), [self._num_keypoints, 1]) tkeypoints = (keypoints - tiled_anchor_centers) / tiled_anchor_sizes # Scales location targets as used in paper for joint training. if self._scale_factors: ty *= self._scale_factors[0] tx *= self._scale_factors[1] th *= self._scale_factors[2] tw *= self._scale_factors[3] tkeypoints *= tf.tile(self._keypoint_scale_factors, [1, num_boxes]) tboxes = tf.stack([ty, tx, th, tw]) return tf.transpose(tf.concat([tboxes, tkeypoints], 0)) def _decode(self, rel_codes, anchors): """Decode relative codes to boxes and keypoints. Args: rel_codes: a tensor with shape [N, 4 + 2 * num_keypoints] representing N anchor-encoded boxes and keypoints anchors: BoxList of anchors. Returns: boxes: BoxList holding N bounding boxes and keypoints. """ ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes() num_codes = tf.shape(rel_codes)[0] result = tf.unstack(tf.transpose(rel_codes)) ty, tx, th, tw = result[:4] tkeypoints = result[4:] if self._scale_factors: ty /= self._scale_factors[0] tx /= self._scale_factors[1] th /= self._scale_factors[2] tw /= self._scale_factors[3] tkeypoints /= tf.tile(self._keypoint_scale_factors, [1, num_codes]) w = tf.exp(tw) * wa h = tf.exp(th) * ha ycenter = ty * ha + ycenter_a xcenter = tx * wa + xcenter_a ymin = ycenter - h / 2. xmin = xcenter - w / 2. ymax = ycenter + h / 2. xmax = xcenter + w / 2. decoded_boxes_keypoints = box_list.BoxList( tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) tiled_anchor_centers = tf.tile( tf.stack([ycenter_a, xcenter_a]), [self._num_keypoints, 1]) tiled_anchor_sizes = tf.tile( tf.stack([ha, wa]), [self._num_keypoints, 1]) keypoints = tkeypoints * tiled_anchor_sizes + tiled_anchor_centers keypoints = tf.reshape(tf.transpose(keypoints), [-1, self._num_keypoints, 2]) decoded_boxes_keypoints.add_field(fields.BoxListFields.keypoints, keypoints) return decoded_boxes_keypoints