""" Hiera model configuration""" import math from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging logger = logging.get_logger(__name__) # HIERA_PRETRAINED_CONFIG_ARCHIVE_MAP = { # "hoge/hoge": ("/config.json"), # } class HieraConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the Hiera [/]() architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. Args: image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. patch_size (`list(int)`, *optional*, defaults to [7, 7]): The size (resolution) of each patch. stride_size (`list(int)`, *optional*, defaults to [4, 4]): The size (resolution) of each stride. padding_size (`list(int)`, *optional*, defaults to [3, 3]): The size (resolution) of each padding. num_channels (`int`, *optional*, defaults to 3): The number of input channels. embed_dim (`int`, *optional*, defaults to 96): Dimensionality of patch embedding. depths (`list(int)`, *optional*, defaults to `[2, 3, 16, 3]`): Depth of each layer in the Transformer encoder. num_heads (`list(int)`, *optional*, defaults to `[1, 2, 4, 8]`): Number of attention heads in each layer of the Transformer encoder. q_pool (`int`, *optional*, defaults to 3): Number of q_pool stages. q_stride (`list(int)`, *optional*, defaults to [2, 2]): Size of stride of q_pool, mask_unit_size (`list(int)`, *optional*, defaults to [8, 8]): Size of mask unit in attention. mask_unit_attention (`list(bool)`, *optional*, defaults to [True, True, False, False]): Whether or not to enable mask unit attention in each stage. separate_positional_embeds (`bool`, *optional*, defaults to False): Whether or not to use separeted positional embeddings. mlp_ratio (`float`, *optional*, defaults to 4.0): Ratio of MLP hidden dimensionality to embedding dimensionality. drop_path_rate (`float`, *optional*, defaults to 0.1): Stochastic depth rate. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. hidden_dropout_prob (`float`, *optional*, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings and encoder. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. initializer_bias (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all bias matrices. 
    Example:

    ```python
    >>> from transformers import HieraConfig, HieraModel

    >>> # Initializing a Hiera / style configuration
    >>> configuration = HieraConfig()

    >>> # Initializing a model (with random weights) from the / style configuration
    >>> model = HieraModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "hiera"

    attribute_map = {}

    def __init__(
        self,
        image_size=224,
        patch_size=[7, 7],
        stride_size=[4, 4],
        padding_size=[3, 3],
        num_channels=3,
        embed_dim=96,
        depths=[2, 3, 16, 3],
        num_heads=[1, 2, 4, 8],
        q_pool=3,  # number of query pooling stages
        q_stride=[2, 2],
        mask_unit_size=[8, 8],
        mask_unit_attention=[True, True, False, False],
        separate_positional_embeds=False,
        mlp_ratio=4.0,
        drop_path_rate=0.0,
        hidden_act="gelu",
        layer_norm_eps=1e-6,
        hidden_dropout_prob=0.0,
        initializer_range=0.02,
        initializer_bias=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if q_pool >= len(depths):
            raise ValueError(f"`q_pool` ({q_pool}) must be less than the number of stages ({len(depths)}).")

        self.image_size = image_size
        self.patch_size = patch_size
        self.stride_size = stride_size
        self.padding_size = padding_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        self.depths = depths
        self.num_layers = len(depths)
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio
        self.hidden_dropout_prob = hidden_dropout_prob
        self.drop_path_rate = drop_path_rate
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.mask_unit_size = mask_unit_size
        # Flattened number of tokens in one mask unit, e.g. 8 * 8 = 64
        self.flat_mask_unit_size = int(math.prod(mask_unit_size))
        self.mask_unit_attention = mask_unit_attention
        self.q_pool = q_pool
        self.q_stride = q_stride
        # Flattened pooling factor applied to queries at each pooling stage, e.g. 2 * 2 = 4
        self.flat_q_stride = int(math.prod(q_stride))
        self.separate_positional_embeds = separate_positional_embeds
        self.initializer_range = initializer_range
        self.initializer_bias = initializer_bias
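

if __name__ == "__main__":
    # Minimal usage sketch of the derived attributes computed in `__init__`.
    # The custom `depths` value below is an illustrative assumption, not a released variant.
    config = HieraConfig(depths=[2, 3, 11, 2])

    assert config.num_layers == 4  # one entry per stage in `depths`
    assert config.flat_mask_unit_size == 64  # 8 * 8 tokens per mask unit
    assert config.flat_q_stride == 4  # 2 * 2 query pooling factor per stage

    # `q_pool` must stay below the number of stages, so this should be rejected.
    try:
        HieraConfig(q_pool=4)
    except ValueError as err:
        print(f"Rejected as expected: {err}")

    # Configurations round-trip through the standard `PretrainedConfig` dict helpers.
    restored = HieraConfig.from_dict(config.to_dict())
    assert restored.depths == config.depths
    assert restored.mask_unit_size == config.mask_unit_size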