# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Implements convolutional and attentional residual block groups."""

import math

import tensorflow as tf

from deeplab2.model import utils
from deeplab2.model.layers import activations
from deeplab2.model.layers import axial_blocks
from deeplab2.model.layers import drop_path
from deeplab2.model.layers import dual_path_transformer
from deeplab2.model.layers import positional_encodings
from deeplab2.model.layers import recompute_grad as recompute_grad_lib

# We will apply 10x larger learning rates on transformer layers. This global
# variable name will be accessed when we build the optimizers. This keyword is
# reserved and should not be a part of the variable names in a classification
# pretrained backbone.
TRANSFORMER = 'transformer'


def _get_current_names(index):
  current_name = '_block{}'.format(index + 1)
  transformer_current_name = '_block{}_{}'.format(index + 1, TRANSFORMER)
  return current_name, transformer_current_name


class BlockGroup(tf.keras.layers.Layer):
  """Applies a group of residual blocks with dual path transformer layers [1].

  An optional dual-path transformer layer is inserted after each residual
  block. The transformer layer performs memory2pixel attention, pixel2memory
  attention, and memory2memory self-attention, while the standard residual
  block applies the pixel2pixel axial-attention, global-attention, or spatial
  convolution.

  Reference:
    [1] MaX-DeepLab: End-to-End Panoptic Segmentation with Mask Transformers,
        CVPR 2021. https://arxiv.org/abs/2012.00759
          Huiyu Wang, Yukun Zhu, Hartwig Adam, Alan Yuille, Liang-Chieh Chen.
  """

  def __init__(self,
               filters,
               num_blocks,
               name,
               original_resnet_stride,
               original_resnet_input_stride,
               output_stride=16,
               backbone_type='resnet_beta',
               positional_encoding_type=None,
               use_global_beyond_stride=0,
               use_axial_beyond_stride=16,
               use_transformer_beyond_stride=32,
               use_sac_beyond_stride=0,
               use_squeeze_and_excite=False,
               conv_use_recompute_grad=False,
               axial_use_recompute_grad=True,
               recompute_within_stride=0,
               transformer_use_recompute_grad=False,
               transformer_expansion=1,
               drop_path_keep_prob=0.8,
               drop_path_beyond_stride=16,
               drop_path_schedule='constant',
               activation='relu',
               attention_bottleneck_expansion=2,
               axial_layer_config=None,
               dual_path_transformer_layer_config=None,
               bn_layer=tf.keras.layers.BatchNormalization,
               conv_kernel_weight_decay=0.0):
    """Initializes a BlockGroup layer.

    Args:
      filters: An integer, the base number of channels for this block group.
      num_blocks: An integer, the number of blocks for this block group.
      name: A string, the name of the block group.
      original_resnet_stride: An integer, the original resnet stride for this
        block, usually 1 or 2. The stride will be applied if
        original_resnet_input_stride is smaller than the desired output_stride.
        Otherwise, the stride will not be applied, and atrous convolution will
        be used after the first block.
      original_resnet_input_stride: An integer, the total input stride in the
        original resnet.
        For example, the total input stride for the last stage of the original
        resnet is 16, and the total output stride is 32. This stride differs
        from the true stride of the feature in that we might use atrous
        convolution to change both the input and output stride to, e.g. 8, but
        its original resnet input stride remains the same. In this case, we
        also use the original resnet input stride to compute the atrous rate.
      output_stride: An integer, the desired output_stride for the ResNet.
      backbone_type: A string, the type of the backbone. Supports 'resnet',
        'resnet_beta', and 'wider_resnet'. The 'resnet' refers to the original
        resnet with a 7x7 convolutional stem. The 'resnet_beta' means a resnet
        but with an inception stem. The 'wider_resnet' is a wider variant of
        resnet with extensively used 3x3 convolutions.
      positional_encoding_type: A string, type of the positional encoding.
        Support '2D', '1D', and None.
      use_global_beyond_stride: An integer, the stride beyond which we use
        global attention. Set to 0 if no global attention is desired. Defaults
        to 0, i.e. we do not use global attention.
      use_axial_beyond_stride: An integer, the stride beyond which we use
        axial attention. Note that use_global_beyond_stride has a higher
        priority, i.e. we use global attention if the stride is also beyond
        use_global_beyond_stride. Set to 0 if no axial attention is desired.
        Defaults to 16 as in MaX-DeepLab.
      use_transformer_beyond_stride: An integer, the stride beyond which we
        use a transformer layer. Set to 0 if no transformer is desired.
        Defaults to 32 as in MaX-DeepLab-S.
      use_sac_beyond_stride: An integer. Use the Switchable Atrous Convolution
        (SAC) beyond the specified stride. For example, if
        `use_sac_beyond_stride` = 16, SAC will be applied to the network stage
        whose output stride >= 16 (i.e., 16 and 32). Set to 0 or -1 to disable
        it. Defaults to 0 as SAC is not used in MaX-DeepLab.
      use_squeeze_and_excite: A boolean, whether squeeze-and-excite (SE) is
        used. Defaults to False as SE is not used in MaX-DeepLab.
      conv_use_recompute_grad: A boolean, whether to use the gradient
        checkpointing trick for convolutional blocks. This trick reduces
        accelerator memory usage, but takes longer to compute gradients.
        Defaults to False since convolutional layers are memory efficient.
      axial_use_recompute_grad: A boolean, whether to use the gradient
        checkpointing trick for axial blocks. This trick reduces accelerator
        memory usage, but takes longer to compute gradients. Defaults to True
        since it saves memory for axial blocks.
      recompute_within_stride: An integer, the stride within which we use the
        gradient checkpointing trick. This trick reduces accelerator memory
        usage, but takes longer to compute gradients. Defaults to 0 (do not
        recompute any layer).
      transformer_use_recompute_grad: A boolean, whether to use the gradient
        checkpointing trick for dual-path transformer blocks. This trick
        reduces accelerator memory usage, but takes longer to compute
        gradients. Defaults to False.
      transformer_expansion: An integer, the expansion ratio for the
        transformer bottleneck.
      drop_path_keep_prob: A float, the keep probability for dropping path.
        Defaults to 0.8 as in MaX-DeepLab-S.
      drop_path_beyond_stride: An integer, the stride beyond which we apply
        drop path augmentation. Defaults to 16 as in MaX-DeepLab-S.
      drop_path_schedule: A string, the drop path schedule.
        Currently, we support 'constant': use the same drop path keep
        probability for all stages, and 'linear': linearly decrease the drop
        path keep probability from 1.0 at the 0-th stage (or STEM) to
        `drop_path_keep_prob` at the last stage.
      activation: A string, type of activation function to apply. Support
        'relu', 'swish' (or 'silu'), 'gelu', 'approximated_gelu', and 'elu'.
      attention_bottleneck_expansion: An integer, the expansion ratio for
        axial attention blocks.
      axial_layer_config: A dict, an argument dictionary for the axial layer.
      dual_path_transformer_layer_config: A dict, an argument dictionary for
        the transformer.
      bn_layer: An optional tf.keras.layers.Layer that computes the
        normalization (default: tf.keras.layers.BatchNormalization).
      conv_kernel_weight_decay: A float, the weight decay for convolution
        kernels.

    Raises:
      ValueError: If backbone_type is not one of 'resnet', 'resnet_beta', or
        'wider_resnet'.
      ValueError: If original_resnet_input_stride is not a power of 2.
      ValueError: If output_stride is not a power of 2.
    """
    if original_resnet_input_stride & (original_resnet_input_stride - 1):
      raise ValueError('original_resnet_input_stride is not a power of 2.')
    if output_stride & (output_stride - 1):
      raise ValueError('output_stride is not a power of 2.')

    super(BlockGroup, self).__init__(name=name)
    self._add_absolute_positional_encoding = None
    self._activation_fn = activations.get_activation(activation)
    self._num_blocks = num_blocks
    self._drop_path_keep_prob = []
    self._recompute_grad = []
    self._transformer_use_recompute_grad = transformer_use_recompute_grad
    if dual_path_transformer_layer_config is None:
      dual_path_transformer_layer_config = {}
    original_resnet_current_stride = original_resnet_input_stride

    use_sac = (original_resnet_input_stride * original_resnet_stride >=
               use_sac_beyond_stride > 0)
    recompute_grad = (original_resnet_input_stride * original_resnet_stride <=
                      recompute_within_stride)

    for index in range(num_blocks):
      current_name, transformer_current_name = _get_current_names(index)

      # Compute the current strides. If there is a stride for this block
      # group, we do it in the first residual block.
      if index == 0 and original_resnet_input_stride < output_stride:
        current_strides = original_resnet_stride
      else:
        current_strides = 1

      # Compute the current atrous rate.
      if original_resnet_current_stride > output_stride:
        atrous_rate = original_resnet_current_stride // output_stride
      else:
        atrous_rate = 1

      # Compute the atrous rate for the second conv in the first basic block.
      if (index == 0 and
          original_resnet_input_stride * original_resnet_stride >
          output_stride):
        basic_block_second_conv_atrous_rate = (
            original_resnet_input_stride * original_resnet_stride //
            output_stride)
      else:
        basic_block_second_conv_atrous_rate = atrous_rate

      # Compute the current drop_path_keep_prob.
      current_stage = math.log2(original_resnet_current_stride) - 1
      if original_resnet_current_stride >= drop_path_beyond_stride:
        current_drop_path_keep_prob = drop_path.get_drop_path_keep_prob(
            drop_path_keep_prob, drop_path_schedule,
            current_stage=int(round(current_stage)),
            num_stages=4)
      else:
        current_drop_path_keep_prob = 1.0
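
      # Illustrative example (added here; the numbers are assumptions, not
      # taken from any released config): with original_resnet_input_stride=16,
      # original_resnet_stride=2, and output_stride=16, the first block keeps
      # current_strides=1 (the stride is not applied because the input stride
      # already equals the desired output_stride), the first basic block gets
      # basic_block_second_conv_atrous_rate=32/16=2, and every later block
      # gets atrous_rate=32/16=2. The stage therefore keeps stride-16
      # resolution while emulating the original stride-32 stage. With
      # drop_path_beyond_stride=16, drop path is also active in this stage at
      # current_stage=log2(16)-1=3.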
      # Compute which block_fn to use for this residual block.
      if original_resnet_current_stride >= use_global_beyond_stride > 0:
        attention_type = 'global'
        recompute_grad = axial_use_recompute_grad or recompute_grad
        filters_list = [filters * attention_bottleneck_expansion,
                        filters,
                        filters * 4]
      elif original_resnet_current_stride >= use_axial_beyond_stride > 0:
        attention_type = 'axial'
        recompute_grad = axial_use_recompute_grad or recompute_grad
        filters_list = [filters * attention_bottleneck_expansion,
                        filters,
                        filters * 4]
      elif backbone_type == 'resnet' or backbone_type == 'resnet_beta':
        attention_type = None
        recompute_grad = conv_use_recompute_grad or recompute_grad
        filters_list = [filters,
                        filters,
                        filters * 4]
      elif backbone_type == 'wider_resnet':
        if original_resnet_input_stride * original_resnet_stride < 32:
          # Wider-ResNet uses conv basic blocks except in the last stage.
          attention_type = None
          recompute_grad = conv_use_recompute_grad or recompute_grad
          filters_list = [filters * 4,
                          filters * 4]
        else:
          # Wider-ResNet uses an expanded bottleneck block in the last stage.
          attention_type = None
          recompute_grad = conv_use_recompute_grad or recompute_grad
          filters_list = [filters,
                          filters * 2,
                          filters * 4]
      else:
        raise ValueError(backbone_type + ' is not supported.')

      self._drop_path_keep_prob.append(current_drop_path_keep_prob)
      # Apply the residual block.
      # The inputs to block_fn should be activated features.
      block_fn = axial_blocks.AxialBlock(
          filters_list,
          kernel_size=3,
          strides=current_strides,
          atrous_rate=atrous_rate,
          use_squeeze_and_excite=use_squeeze_and_excite,
          use_sac=use_sac,
          bn_layer=bn_layer,
          activation=activation,
          name=current_name[1:],
          conv_kernel_weight_decay=conv_kernel_weight_decay,
          basic_block_second_conv_atrous_rate=(
              basic_block_second_conv_atrous_rate),
          attention_type=attention_type,
          axial_layer_config=axial_layer_config)

      self._recompute_grad.append(recompute_grad)
      utils.safe_setattr(self, current_name, block_fn)

      # Modify the original_resnet_stride according to the strides.
      if index == 0 and original_resnet_stride > 1:
        original_resnet_current_stride *= original_resnet_stride
        # Add absolute positional encoding if we will apply global attention
        # beyond this stride.
        if original_resnet_current_stride == use_global_beyond_stride > 0:
          self._add_absolute_positional_encoding = (
              positional_encodings.AddAbsolutePositionalEncoding(
                  'add_absolute_positional_encoding',
                  positional_encoding_type, bn_layer,
                  conv_kernel_weight_decay))

      if original_resnet_current_stride >= use_transformer_beyond_stride > 0:
        # Apply a dual-path transformer.
        transformer_block_fn = dual_path_transformer.DualPathTransformerLayer(
            name=transformer_current_name[1:],
            filters=int(128 * transformer_expansion),
            activation=activation,
            bn_layer=bn_layer,
            conv_kernel_weight_decay=conv_kernel_weight_decay,
            **dual_path_transformer_layer_config)
        utils.safe_setattr(self, transformer_current_name,
                           transformer_block_fn)
      else:
        utils.safe_setattr(self, transformer_current_name, None)

    # Avoid using recompute_grad for the first call that builds the
    # sub-layers. Otherwise, recompute_grad will not track newly built model
    # parameters.
    self._first_building_call = True

  def call(self, inputs, training=False):
    """Performs a forward pass.

    Args:
      inputs: Two tensors. The first tensor is a pixel_space_input with shape
        [batch, height, width, pixel_channels]. The second tensor is
        memory_space_input with shape [batch, length, memory_channels]. This
        input will be used only if a transformer is used. Otherwise, the input
        is returned unmodified.
      training: A boolean flag indicating whether training behavior should be
        used (default: False).

    Returns:
      output: An output [batch, height, width, filters * 4] tensor.
      activated_output: An activated output [batch, height, width,
        filters * 4] tensor.
      memory_space_output: A memory space output [batch, length,
        memory_channels] tensor.
    """
    # The pixel space inputs are activated features.
    activated_features, memory_space_output = inputs

    # Recompute_grad takes only float tensors as inputs. It does not allow
    # bools or boolean tensors. For this reason, we cast training to a float
    # tensor and cast it back after we go through the recompute_grad wrap.
    float_tensor_training = tf.cast(training, tf.float32)

    for index in range(self._num_blocks):
      current_name, transformer_current_name = _get_current_names(index)
      block_fn_no_recompute = getattr(self, current_name)
      transformer_block_fn_no_recompute = getattr(
          self, transformer_current_name)
      current_drop_path_keep_prob = self._drop_path_keep_prob[index]

      # Wrap the layer if we want to recompute it in the backward pass.
      if self._recompute_grad[index] and training:
        # The seed is not actually used since we do not have any random
        # operation in the recomputed function. The purpose of the provided
        # seed is to prevent recompute_grad from generating a new seed
        # variable which is not compatible with model exporting.
        block_fn = recompute_grad_lib.recompute_grad(
            block_fn_no_recompute, seed=tf.constant(0, tf.int32))
      else:
        block_fn = block_fn_no_recompute

      # The inputs to block_fn should be activated features.
      block_fn_inputs = [activated_features, float_tensor_training]

      # We have to define drop_path_masks outside the layer call and pass it
      # into the layer, because tf.recompute_grad (gradient checkpointing)
      # does not allow any randomness within the function call. In addition,
      # recompute_grad functions can only take Tensors as inputs, so we do
      # not pass the drop_path_random_mask (when it is None) into block_fn.
      if current_drop_path_keep_prob < 1.0 and training:
        drop_path_random_mask = drop_path.generate_drop_path_random_mask(
            activated_features, current_drop_path_keep_prob)
        block_fn_inputs.append(drop_path_random_mask)

      # Build the sub-layers when the block_fn is called for the first time.
      # Otherwise, recompute_grad will not track newly built model parameters.
      if self._first_building_call:
        _ = block_fn_no_recompute(tuple(block_fn_inputs))
      # Apply the residual block.
      features, activated_features = block_fn(tuple(block_fn_inputs))

      if index == 0 and self._add_absolute_positional_encoding is not None:
        features = self._add_absolute_positional_encoding(
            features, training=training)
        activated_features = self._activation_fn(features)

      if transformer_block_fn_no_recompute is not None:
        # Reshape pixel space features from 4D to 3D.
        _, height, width, channels = features.get_shape().as_list()
        features = tf.reshape(features, [-1, height * width, channels])

        # Wrap the layer if we want to recompute it in the backward pass.
        if self._transformer_use_recompute_grad and training:
          # The seed is not actually used since we do not have any random
          # operation in the recomputed function. The purpose of the provided
          # seed is to prevent recompute_grad from generating a new seed
          # variable which is not compatible with model exporting.
          transformer_block_fn = recompute_grad_lib.recompute_grad(
              transformer_block_fn_no_recompute,
              seed=tf.constant(0, tf.int32))
        else:
          transformer_block_fn = transformer_block_fn_no_recompute

        transformer_block_fn_input_list = [
            features, memory_space_output, float_tensor_training]

        # We have to define drop_path_masks outside the layer call and pass
        # them into the layer, because recompute_grad (gradient checkpointing)
        # does not allow any randomness within the function call. In addition,
        # recompute_grad functions can only take Tensors as inputs, so we do
        # not pass the drop_path_masks (when they are None) into
        # transformer_block_fn.
        if current_drop_path_keep_prob < 1.0 and training:
          # Drop path random mask for pixel space attention.
          pixel_space_drop_path_mask = (
              drop_path.generate_drop_path_random_mask(
                  memory_space_output, current_drop_path_keep_prob))
          # Drop path random mask for memory space attention.
          memory_space_attention_drop_path_mask = (
              drop_path.generate_drop_path_random_mask(
                  memory_space_output, current_drop_path_keep_prob))
          # Drop path random mask for memory space feed-forward network.
          memory_space_feed_forward_network_drop_path_mask = (
              drop_path.generate_drop_path_random_mask(
                  memory_space_output, current_drop_path_keep_prob))
          transformer_block_fn_input_list += [
              pixel_space_drop_path_mask,
              memory_space_attention_drop_path_mask,
              memory_space_feed_forward_network_drop_path_mask]

        # Build the sub-layers when the transformer_block_fn is called for
        # the first time. Otherwise, recompute_grad will not track newly
        # built model parameters.
        if self._first_building_call:
          _ = transformer_block_fn_no_recompute(
              tuple(transformer_block_fn_input_list))
        # Apply a dual-path transformer.
        features, activated_features, memory_space_output = (
            transformer_block_fn(tuple(transformer_block_fn_input_list)))

        # Reshape pixel space features back to 4D.
        features = tf.reshape(features, [-1, height, width, channels])
        activated_features = tf.reshape(activated_features,
                                        [-1, height, width, channels])

    # Now the first call has finished and the sub-layers have been built.
    self._first_building_call = False
    # We also return the non-activated output so that the function is
    # compatible with a decoder that takes a non-activated tensor as input.
    return features, activated_features, memory_space_output
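

# A minimal usage sketch (added for illustration; it is not part of the
# original library). It assumes a MaX-DeepLab-S-like last stage: 1024-channel
# activated pixel features at stride 16 and a memory of 128 slots with 256
# channels. All shapes and hyper-parameters below are illustrative
# assumptions, not values taken from a released config.
if __name__ == '__main__':
  block_group = BlockGroup(
      filters=512,
      num_blocks=3,
      name='block_group4',
      original_resnet_stride=2,
      original_resnet_input_stride=16,
      output_stride=16,
      use_axial_beyond_stride=16,
      use_transformer_beyond_stride=32)
  # Assumed inputs: activated pixel features and a memory space input.
  pixel_space_input = tf.zeros([2, 41, 41, 1024])
  memory_space_input = tf.zeros([2, 128, 256])
  output, activated_output, memory_space_output = block_group(
      (pixel_space_input, memory_space_input), training=False)
  # The non-activated `output` is what a decoder that expects a non-activated
  # tensor would consume, while `activated_output` feeds the next block group.
  print(output.shape, activated_output.shape, memory_space_output.shape)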