# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file contains code to get a sample from a dataset."""

import functools

import numpy as np
import tensorflow as tf

from deeplab2 import common
from deeplab2.data import dataset_utils
from deeplab2.data.preprocessing import input_preprocessing as preprocessing


def _compute_gaussian_from_std(sigma):
  """Computes the Gaussian and its size from a given standard deviation."""
  size = int(6 * sigma + 3)
  # NOTE: `np.float` was deprecated in NumPy 1.20 and removed in 1.24; the
  # builtin `float` is the documented drop-in replacement (same 64-bit dtype).
  x = np.arange(size, dtype=float)
  y = x[:, np.newaxis]
  x0, y0 = 3 * sigma + 1, 3 * sigma + 1
  return np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)), size


class PanopticSampleGenerator:
  """This class generates samples from images and labels."""

  def __init__(self,
               dataset_info,
               is_training,
               crop_size,
               min_resize_value=None,
               max_resize_value=None,
               resize_factor=None,
               min_scale_factor=1.,
               max_scale_factor=1.,
               scale_factor_step_size=0,
               autoaugment_policy_name=None,
               only_semantic_annotations=False,
               thing_id_mask_annotations=False,
               max_thing_id=128,
               sigma=8,
               focus_small_instances=None):
    """Initializes the panoptic segmentation generator.

    Args:
      dataset_info: A dictionary with the following keys.
      - `name`: String, dataset name.
      - `ignore_label`: Integer, ignore label.
      - `class_has_instances_list`: A list of integers indicating which
        class has instance annotations.
      - `panoptic_label_divisor`: Integer, panoptic label divisor.
      - `num_classes`: Integer, number of classes.
      - `is_video_dataset`: Boolean, is video dataset or not.
      is_training: Boolean, is training mode or not.
      crop_size: Image crop size [height, width].
      min_resize_value: A 2-tuple of (height, width), desired minimum value
        after resize. If a single element is given, then height and width
        share the same value. None, empty or having 0 indicates no minimum
        value will be used.
      max_resize_value: A 2-tuple of (height, width), maximum allowed value
        after resize. If a single element is given, then height and width
        share the same value. None, empty or having 0 indicates no maximum
        value will be used.
      resize_factor: Resized dimensions are multiple of factor plus one.
      min_scale_factor: Minimum scale factor for random scale augmentation.
      max_scale_factor: Maximum scale factor for random scale augmentation.
      scale_factor_step_size: The step size from min scale factor to max scale
        factor. The input is randomly scaled based on the value of
        (min_scale_factor, max_scale_factor, scale_factor_step_size).
      autoaugment_policy_name: String, autoaugment policy name. See
        autoaugment_policy.py for available policies.
      only_semantic_annotations: An optional flag indicating whether the model
        needs only semantic annotations (default: False).
      thing_id_mask_annotations: An optional flag indicating whether the model
        needs thing_id_mask annotations. When `thing_id_mask_annotations` is
        True, we will additionally return mask annotation for each `thing`
        instance, encoded with a unique thing_id. This ground-truth annotation
        could be used to learn a better segmentation mask for each instance.
        `thing_id` indicates the number of unique thing-ID to each instance in
        an image, starting the counting from 0 (default: False).
      max_thing_id: The maximum number of possible thing instances per image.
        It is used together when thing_id_mask_annotations = True, representing
        the maximum thing ID encoded in the thing_id_mask. (default: 128).
      sigma: The standard deviation of the Gaussian used to encode the center
        keypoint (default: 8).
      focus_small_instances: An optional dict that defines how to deal with
        small instances (default: None):
        -`threshold`: An integer defining the threshold pixel number for an
          instance to be considered small.
        -`weight`: A number that defines the loss weight for small instances.
    """
    self._dataset_info = dataset_info
    self._ignore_label = self._dataset_info['ignore_label']
    self._only_semantic_annotations = only_semantic_annotations
    self._sigma = sigma
    self._instance_area_threshold = 0
    self._small_instance_weight = 1.0
    self._thing_id_mask_annotations = thing_id_mask_annotations
    self._max_thing_id = max_thing_id
    self._is_training = is_training
    # Crowd pixels are mapped to ignore_label * label_divisor so that they are
    # excluded from the panoptic losses after preprocessing.
    self._preprocessing_fn = functools.partial(
        preprocessing.preprocess_image_and_label,
        crop_height=crop_size[0],
        crop_width=crop_size[1],
        min_resize_value=min_resize_value,
        max_resize_value=max_resize_value,
        resize_factor=resize_factor,
        min_scale_factor=min_scale_factor,
        max_scale_factor=max_scale_factor,
        scale_factor_step_size=scale_factor_step_size,
        autoaugment_policy_name=autoaugment_policy_name,
        ignore_label=self._ignore_label *
        self._dataset_info['panoptic_label_divisor'],
        is_training=self._is_training)

    if focus_small_instances is not None:
      self._instance_area_threshold = focus_small_instances['threshold']
      self._small_instance_weight = focus_small_instances['weight']

    self._gaussian, self._gaussian_size = _compute_gaussian_from_std(
        self._sigma)
    # Flattened so it can be scattered into the center heatmap in one op.
    self._gaussian = tf.cast(tf.reshape(self._gaussian, [-1]), tf.float32)

  def __call__(self, sample_dict):
    """Gets a sample.

    Args:
      sample_dict: A dictionary with the following keys and values:
      - `image`: A tensor of shape [image_height, image_width, 3].
      - `image_name`: String, image name.
      - `label`: A tensor of shape [label_height, label_width, 1] or None.
      - `height`: An integer specifying the height of the image.
      - `width`: An integer specifying the width of the image.
      - `sequence`: An optional string specifying the sequence name.
      - `prev_image`: An optional tensor of the same shape as `image`.
      - `prev_label`: An optional tensor of the same shape as `label`.
      - `next_image`: An optional next-frame tensor of the shape of `image`.
      - `next_label`: An optional next-frame tensor of the shape of `label`.

    Returns:
      sample: A dictionary storing required data for panoptic segmentation.
    """
    return self.call(**sample_dict)

  def call(self,
           image,
           image_name,
           label,
           height,
           width,
           sequence='',
           prev_image=None,
           prev_label=None,
           next_image=None,
           next_label=None):
    """Gets a sample.

    Args:
      image: A tensor of shape [image_height, image_width, 3].
      image_name: String, image name.
      label: A tensor of shape [label_height, label_width, 1] or None.
      height: An integer specifying the height of the image.
      width: An integer specifying the width of the image.
      sequence: An optional string specifying the sequence name.
      prev_image: An optional tensor of shape [image_height, image_width, 3].
      prev_label: An optional tensor of shape [label_height, label_width, 1].
      next_image: An optional tensor of shape [image_height, image_width, 3].
      next_label: An optional tensor of shape [label_height, label_width, 1].

    Returns:
      sample: A dictionary storing required data for panoptic segmentation.

    Raises:
      ValueError: An error occurs when the label shape is invalid.
      NotImplementedError: An error occurs when thing_id_mask_annotations comes
        together with prev_image or prev_label, not currently implemented.
    """
    if label is not None:
      label.get_shape().assert_is_compatible_with(
          tf.TensorShape([None, None, 1]))
      # Keep uncropped/unscaled labels around for raw-resolution evaluation.
      original_label = tf.cast(label, dtype=tf.int32, name='original_label')
      if next_label is not None:
        original_next_label = tf.cast(
            next_label, dtype=tf.int32, name='original_next_label')
    # Reusing the preprocessing function for both next and prev samples.
    if next_image is not None:
      resized_image, image, label, next_image, next_label = (
          self._preprocessing_fn(
              image, label, prev_image=next_image, prev_label=next_label))
    else:
      resized_image, image, label, prev_image, prev_label = (
          self._preprocessing_fn(
              image, label, prev_image=prev_image, prev_label=prev_label))
    sample = {
        common.IMAGE: image
    }
    if prev_image is not None:
      # Two-frame input: stack current and previous frame along channels.
      sample[common.IMAGE] = tf.concat([image, prev_image], axis=2)
    if next_image is not None:
      sample[common.NEXT_IMAGE] = next_image
      sample[common.IMAGE] = tf.concat([image, next_image], axis=2)
    if label is not None:
      # Panoptic label for crowd regions will be ignore_label.
      semantic_label, panoptic_label, thing_mask, crowd_region = (
          dataset_utils.get_semantic_and_panoptic_label(
              self._dataset_info, label, self._ignore_label))
      sample[common.GT_SEMANTIC_KEY] = tf.squeeze(semantic_label, axis=2)
      semantic_weights = tf.ones_like(semantic_label, dtype=tf.float32)
      sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze(
          semantic_weights, axis=2)
      sample[common.GT_IS_CROWD] = tf.squeeze(crowd_region, axis=2)

      if not self._only_semantic_annotations:
        # The sample will have the original label including crowd regions.
        sample[common.GT_PANOPTIC_KEY] = tf.squeeze(label, axis=2)
        # Compute center loss for all non-crowd and non-ignore pixels.
        non_crowd_and_non_ignore_regions = tf.logical_and(
            tf.logical_not(crowd_region),
            tf.not_equal(semantic_label, self._ignore_label))
        sample[common.CENTER_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast(
            non_crowd_and_non_ignore_regions, tf.float32), axis=2)
        # Compute regression loss only for thing pixels that are not crowd.
        non_crowd_things = tf.logical_and(
            tf.logical_not(crowd_region), thing_mask)
        sample[common.REGRESSION_LOSS_WEIGHT_KEY] = tf.squeeze(tf.cast(
            non_crowd_things, tf.float32), axis=2)

        prev_panoptic_label = None
        next_panoptic_label = None
        if prev_label is not None:
          _, prev_panoptic_label, _, _ = (
              dataset_utils.get_semantic_and_panoptic_label(
                  self._dataset_info, prev_label, self._ignore_label))
        if next_label is not None:
          _, next_panoptic_label, _, _ = (
              dataset_utils.get_semantic_and_panoptic_label(
                  self._dataset_info, next_label, self._ignore_label))
        (sample[common.GT_INSTANCE_CENTER_KEY],
         sample[common.GT_INSTANCE_REGRESSION_KEY],
         sample[common.SEMANTIC_LOSS_WEIGHT_KEY],
         prev_center_map,
         frame_center_offsets,
         next_offset) = self._generate_gt_center_and_offset(
             panoptic_label, semantic_weights, prev_panoptic_label,
             next_panoptic_label)

        sample[common.GT_INSTANCE_REGRESSION_KEY] = tf.cast(
            sample[common.GT_INSTANCE_REGRESSION_KEY], tf.float32)

        if next_label is not None:
          sample[common.GT_NEXT_INSTANCE_REGRESSION_KEY] = tf.cast(
              next_offset, tf.float32)
          # Only pixels with a non-zero next-frame offset contribute to the
          # next-frame regression loss.
          sample[common.NEXT_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast(
              tf.greater(tf.reduce_sum(tf.abs(next_offset), axis=2), 0),
              tf.float32)

        # Only squeeze center map and semantic loss weights, as regression map
        # has two channels (x and y offsets).
        sample[common.GT_INSTANCE_CENTER_KEY] = tf.squeeze(
            sample[common.GT_INSTANCE_CENTER_KEY], axis=2)
        sample[common.SEMANTIC_LOSS_WEIGHT_KEY] = tf.squeeze(
            sample[common.SEMANTIC_LOSS_WEIGHT_KEY], axis=2)

        if prev_label is not None:
          sample[common.GT_FRAME_OFFSET_KEY] = frame_center_offsets
          sample[common.GT_FRAME_OFFSET_KEY] = tf.cast(
              sample[common.GT_FRAME_OFFSET_KEY], tf.float32)
          frame_offsets_present = tf.logical_or(
              tf.not_equal(frame_center_offsets[..., 0], 0),
              tf.not_equal(frame_center_offsets[..., 1], 0))
          sample[common.FRAME_REGRESSION_LOSS_WEIGHT_KEY] = tf.cast(
              frame_offsets_present, tf.float32)
          if self._is_training:
            # During training, the noisy previous-frame center heatmap is fed
            # to the network as an extra input channel.
            sample[common.IMAGE] = tf.concat(
                [sample[common.IMAGE], prev_center_map], axis=2)

        if self._thing_id_mask_annotations:
          if any([prev_image is not None,
                  prev_label is not None,
                  next_image is not None,
                  next_label is not None]):
            raise NotImplementedError(
                'Current implementation of Max-DeepLab does not support ' +
                'prev_image, prev_label, next_image, or next_label.')
          thing_id_mask, thing_id_class = (
              self._generate_thing_id_mask_and_class(
                  panoptic_label, non_crowd_things))
          sample[common.GT_THING_ID_MASK_KEY] = tf.squeeze(
              thing_id_mask, axis=2)
          sample[common.GT_THING_ID_CLASS_KEY] = thing_id_class

    if not self._is_training:
      # Resized image is only used during visualization.
      sample[common.RESIZED_IMAGE] = resized_image
      sample[common.IMAGE_NAME] = image_name
      sample[common.GT_SIZE_RAW] = tf.stack([height, width], axis=0)
      if self._dataset_info['is_video_dataset']:
        sample[common.SEQUENCE_ID] = sequence
      # Keep original labels for evaluation.
      if label is not None:
        orig_semantic_label, _, _, orig_crowd_region = (
            dataset_utils.get_semantic_and_panoptic_label(
                self._dataset_info, original_label, self._ignore_label))
        sample[common.GT_SEMANTIC_RAW] = tf.squeeze(orig_semantic_label, axis=2)
        if not self._only_semantic_annotations:
          sample[common.GT_PANOPTIC_RAW] = tf.squeeze(original_label, axis=2)
          sample[common.GT_IS_CROWD_RAW] = tf.squeeze(orig_crowd_region)
          if next_label is not None:
            sample[common.GT_NEXT_PANOPTIC_RAW] = tf.squeeze(
                original_next_label, axis=2)
    return sample

  def _generate_thing_id_mask_and_class(self,
                                        panoptic_label,
                                        non_crowd_things):
    """Generates the ground-truth thing-ID masks and their class labels.

    It computes thing-ID mask and class with unique ID for each thing instance.
    `thing_id` indicates the number of unique thing-ID to each instance in an
    image, starting the counting from 0. Each pixel in thing_id_mask is labeled
    with the corresponding thing-ID.

    Args:
      panoptic_label: A tf.Tensor of shape [height, width, 1].
      non_crowd_things: A tf.Tensor of shape [height, width, 1], indicating
        non-crowd and thing-class regions.

    Returns:
      thing_id_mask: A tf.Tensor of shape [height, width, 1]. It assigns each
        non-crowd thing instance a unique mask-ID label, starting from 0.
        Unassigned pixels are set to -1.
      thing_id_class: A tf.Tensor of shape [max_thing_id]. It contains semantic
        ID of each instance assigned to thing_id_mask. The remaining
        (max_thing_id - num_things) elements are set to -1.

    Raises:
      ValueError: An error occurs when the thing-ID mask contains stuff or
        crowd region.
      ValueError: An error occurs when thing_count is greater or equal to
        self._max_thing_id.
    """
    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
    thing_id_mask = -tf.ones_like(panoptic_label)
    thing_id_class = -tf.ones(self._max_thing_id)
    thing_count = 0
    for panoptic_id in unique_ids:
      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
      # Filter out IDs that are not thing instances (i.e., IDs for ignore_label,
      # stuff classes or crowd). Stuff classes and crowd regions both have IDs
      # of the form panoptic_id = semantic_id * label_divisor (i.e., instance id
      # = 0)
      if (semantic_id == self._dataset_info['ignore_label'] or
          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
        continue
      assert_stuff_crowd = tf.debugging.Assert(
          tf.reduce_all(non_crowd_things[panoptic_label == panoptic_id]),
          ['thing-ID mask here must not contain stuff or crowd region.'])
      with tf.control_dependencies([assert_stuff_crowd]):
        panoptic_id = tf.identity(panoptic_id)
      thing_id_mask = tf.where(panoptic_label == panoptic_id,
                               thing_count, thing_id_mask)
      assert_thing_count = tf.debugging.Assert(
          thing_count < self._max_thing_id,
          ['thing_count must be smaller than self._max_thing_id.'])
      with tf.control_dependencies([assert_thing_count]):
        thing_count = tf.identity(thing_count)
      thing_id_class = tf.tensor_scatter_nd_update(
          thing_id_class, [[thing_count]], [semantic_id])
      thing_count += 1
    return thing_id_mask, thing_id_class

  def _generate_prev_centers_with_noise(self,
                                        panoptic_label,
                                        offset_noise_factor=0.05,
                                        false_positive_rate=0.2,
                                        false_positive_noise_factor=0.05):
    """Generates noisy center predictions for the previous frame.

    Args:
      panoptic_label: A tf.Tensor of shape [height, width, 1].
      offset_noise_factor: An optional float defining the maximum fraction of
        the object size that is used to displace the previous center.
      false_positive_rate: An optional float indicating at which probability
        false positives should be added.
      false_positive_noise_factor: An optional float defining the maximum
        fraction of the object size that is used to displace the false
        positive center.

    Returns:
      A tuple of (center, unique_ids, ids_to_center_x, ids_to_center_y):
      center is a tf.Tensor of shape [height, width, 1], unique_ids is a
      tf.Tensor of shape [N], and ids_to_center_x / ids_to_center_y are
      tf.Tensor of shape [N] where N is the number of unique IDs.
    """
    height = tf.shape(panoptic_label)[0]
    width = tf.shape(panoptic_label)[1]

    # Pad center to make boundary handling easier.
    center_pad_begin = int(round(3 * self._sigma + 1))
    center_pad_end = int(round(3 * self._sigma + 2))
    center_pad = center_pad_begin + center_pad_end

    center = tf.zeros((height + center_pad, width + center_pad))
    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))
    ids_to_center_x = tf.zeros_like(unique_ids, dtype=tf.int32)
    ids_to_center_y = tf.zeros_like(unique_ids, dtype=tf.int32)

    for panoptic_id in unique_ids:
      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
      # Filter out IDs that should be ignored, are stuff classes or crowd.
      # Stuff classes and crowd regions both have IDs of the form panoptic_id =
      # semantic_id * label_divisor
      if (semantic_id == self._dataset_info['ignore_label'] or
          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
        continue

      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
      mask_index = tf.cast(
          tf.transpose(tf.where(panoptic_label == panoptic_id)), tf.float32)
      centers = tf.reduce_mean(mask_index, axis=1)
      bbox_size = (
          tf.reduce_max(mask_index, axis=1) -
          tf.reduce_min(mask_index, axis=1))

      # Add noise proportional to the instance bounding-box size.
      center_y = (
          centers[0] + tf.random.normal([], dtype=tf.float32) *
          offset_noise_factor * bbox_size[0])
      center_x = (
          centers[1] + tf.random.normal([], dtype=tf.float32) *
          offset_noise_factor * bbox_size[1])

      center_x = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
      center_y = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)

      id_index = tf.where(tf.equal(panoptic_id, unique_ids))
      ids_to_center_x = tf.tensor_scatter_nd_update(
          ids_to_center_x, id_index, tf.expand_dims(center_x, axis=0))
      ids_to_center_y = tf.tensor_scatter_nd_update(
          ids_to_center_y, id_index, tf.expand_dims(center_y, axis=0))

      def add_center_gaussian(center_x_coord, center_y_coord, center):
        # Due to the padding with center_pad_begin in center, the computed
        # center becomes the upper left corner in the center tensor.
        upper_left = center_x_coord, center_y_coord
        bottom_right = (upper_left[0] + self._gaussian_size,
                        upper_left[1] + self._gaussian_size)

        indices_x, indices_y = tf.meshgrid(
            tf.range(upper_left[0], bottom_right[0]),
            tf.range(upper_left[1], bottom_right[1]))
        indices = tf.transpose(
            tf.stack([tf.reshape(indices_y, [-1]),
                      tf.reshape(indices_x, [-1])]))

        return tf.tensor_scatter_nd_max(
            center, indices, self._gaussian, name='center_scatter')

      center = add_center_gaussian(center_x, center_y, center)
      # Generate false positives.
      center_y = (
          tf.cast(center_y, dtype=tf.float32) +
          tf.random.normal([], dtype=tf.float32) *
          false_positive_noise_factor * bbox_size[0])
      center_x = (
          tf.cast(center_x, dtype=tf.float32) +
          tf.random.normal([], dtype=tf.float32) *
          false_positive_noise_factor * bbox_size[1])

      center_x = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_x), tf.int32), 0), width - 1)
      center_y = tf.minimum(
          tf.maximum(tf.cast(tf.round(center_y), tf.int32), 0), height - 1)

      # Draw a sample to decide whether to add a false positive or not.
      center = center + tf.cast(
          tf.random.uniform([], dtype=tf.float32) < false_positive_rate,
          tf.float32) * (
              add_center_gaussian(center_x, center_y, center) - center)

    # Crop the padding back off so the heatmap matches the label resolution.
    center = center[center_pad_begin:(center_pad_begin + height),
                    center_pad_begin:(center_pad_begin + width)]
    center = tf.expand_dims(center, -1)
    return center, unique_ids, ids_to_center_x, ids_to_center_y

  def _generate_gt_center_and_offset(self,
                                     panoptic_label,
                                     semantic_weights,
                                     prev_panoptic_label=None,
                                     next_panoptic_label=None):
    """Generates the ground-truth center and offset from the panoptic labels.

    Additionally, the per-pixel weights for the semantic branch are increased
    for small instances. In case, prev_panoptic_label is passed, it also
    computes the previous center heatmap with random noise and the offsets
    between center maps.

    Args:
      panoptic_label: A tf.Tensor of shape [height, width, 1].
      semantic_weights: A tf.Tensor of shape [height, width, 1].
      prev_panoptic_label: An optional tf.Tensor of shape [height, width, 1].
      next_panoptic_label: An optional tf.Tensor of shape [height, width, 1].

    Returns:
      A tuple (center, offsets, weights, prev_center, frame_offset*,
      next_offset) with each being a tf.Tensor of shape [height, width, 1
      (2*)]. If prev_panoptic_label is None, prev_center and frame_offset are
      None. If next_panoptic_label is None, next_offset is None.
    """
    height = tf.shape(panoptic_label)[0]
    width = tf.shape(panoptic_label)[1]

    # Pad center to make boundary handling easier.
    center_pad_begin = int(round(3 * self._sigma + 1))
    center_pad_end = int(round(3 * self._sigma + 2))
    center_pad = center_pad_begin + center_pad_end

    center = tf.zeros((height + center_pad, width + center_pad))
    offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
    offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
    unique_ids, _ = tf.unique(tf.reshape(panoptic_label, [-1]))

    prev_center = None
    frame_offsets = None
    # Due to loop handling in tensorflow, these variables had to be defined for
    # all cases.
    frame_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)
    frame_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)

    # Next-frame instance offsets.
    next_offset = None
    next_offset_y = tf.zeros((height, width, 1), dtype=tf.int32)
    next_offset_x = tf.zeros((height, width, 1), dtype=tf.int32)

    if prev_panoptic_label is not None:
      (prev_center, prev_unique_ids, prev_centers_x, prev_centers_y
      ) = self._generate_prev_centers_with_noise(prev_panoptic_label)

    for panoptic_id in unique_ids:
      semantic_id = panoptic_id // self._dataset_info['panoptic_label_divisor']
      # Filter out IDs that should be ignored, are stuff classes or crowd.
      # Stuff classes and crowd regions both have IDs of the form panoptic_id =
      # semantic_id * label_divisor
      if (semantic_id == self._dataset_info['ignore_label'] or
          panoptic_id % self._dataset_info['panoptic_label_divisor'] == 0):
        continue

      # Convert [[y0, x0, 0], ...] to [[y0, ...], [x0, ...], [0, ...]].
      mask_index = tf.transpose(tf.where(panoptic_label == panoptic_id))
      mask_y_index = mask_index[0]
      mask_x_index = mask_index[1]

      next_mask_index = None
      next_mask_y_index = None
      next_mask_x_index = None
      if next_panoptic_label is not None:
        next_mask_index = tf.transpose(
            tf.where(next_panoptic_label == panoptic_id))
        next_mask_y_index = next_mask_index[0]
        next_mask_x_index = next_mask_index[1]

      instance_area = tf.shape(mask_x_index)
      if instance_area < self._instance_area_threshold:
        # Up-weight the semantic loss for small instances.
        semantic_weights = tf.where(panoptic_label == panoptic_id,
                                    self._small_instance_weight,
                                    semantic_weights)

      centers = tf.reduce_mean(tf.cast(mask_index, tf.float32), axis=1)

      center_x = tf.cast(tf.round(centers[1]), tf.int32)
      center_y = tf.cast(tf.round(centers[0]), tf.int32)

      # Due to the padding with center_pad_begin in center, the computed center
      # becomes the upper left corner in the center tensor.
      upper_left = center_x, center_y
      bottom_right = (upper_left[0] + self._gaussian_size,
                      upper_left[1] + self._gaussian_size)

      indices_x, indices_y = tf.meshgrid(
          tf.range(upper_left[0], bottom_right[0]),
          tf.range(upper_left[1], bottom_right[1]))
      indices = tf.transpose(
          tf.stack([tf.reshape(indices_y, [-1]),
                    tf.reshape(indices_x, [-1])]))

      # Max-merge overlapping Gaussians from nearby instances.
      center = tf.tensor_scatter_nd_max(
          center, indices, self._gaussian, name='center_scatter')

      offset_y = tf.tensor_scatter_nd_update(
          offset_y,
          tf.transpose(mask_index),
          center_y - tf.cast(mask_y_index, tf.int32),
          name='offset_y_scatter')
      offset_x = tf.tensor_scatter_nd_update(
          offset_x,
          tf.transpose(mask_index),
          center_x - tf.cast(mask_x_index, tf.int32),
          name='offset_x_scatter')

      if prev_panoptic_label is not None:
        # Only instances also present in the previous frame get frame offsets.
        mask = tf.equal(prev_unique_ids, panoptic_id)
        if tf.math.count_nonzero(mask) > 0:
          prev_center_x = prev_centers_x[mask]
          prev_center_y = prev_centers_y[mask]

          frame_offset_y = tf.tensor_scatter_nd_update(
              frame_offset_y,
              tf.transpose(mask_index),
              prev_center_y - tf.cast(mask_y_index, tf.int32),
              name='frame_offset_y_scatter')
          frame_offset_x = tf.tensor_scatter_nd_update(
              frame_offset_x,
              tf.transpose(mask_index),
              prev_center_x - tf.cast(mask_x_index, tf.int32),
              name='frame_offset_x_scatter')

      if next_panoptic_label is not None:
        # Offsets point from next-frame pixels back to this frame's center.
        next_offset_y = tf.tensor_scatter_nd_update(
            next_offset_y,
            tf.transpose(next_mask_index),
            center_y - tf.cast(next_mask_y_index, tf.int32),
            name='next_offset_y_scatter')
        next_offset_x = tf.tensor_scatter_nd_update(
            next_offset_x,
            tf.transpose(next_mask_index),
            center_x - tf.cast(next_mask_x_index, tf.int32),
            name='next_offset_x_scatter')

    offset = tf.concat([offset_y, offset_x], axis=2)
    center = center[center_pad_begin:(center_pad_begin + height),
                    center_pad_begin:(center_pad_begin + width)]
    center = tf.expand_dims(center, -1)

    if prev_panoptic_label is not None:
      frame_offsets = tf.concat([frame_offset_y, frame_offset_x], axis=2)

    if next_panoptic_label is not None:
      next_offset = tf.concat([next_offset_y, next_offset_x], axis=2)

    return (center, offset, semantic_weights, prev_center, frame_offsets,
            next_offset)