nthngdy committed on
Commit
3b35c00
1 Parent(s): 56d4bda

Create configuration_manta.py

Files changed (1)
  1. configuration_manta.py +197 -0
configuration_manta.py ADDED
@@ -0,0 +1,197 @@
+ # coding=utf-8
+ # Copyright 2020, The Manta Authors and HuggingFace Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ Manta model configuration"""
+ from typing import Mapping
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.onnx import OnnxSeq2SeqConfigWithPast
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ MANTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+     "nthngdy/manta-base": "https://huggingface.co/nthngdy/manta-base/resolve/main/config.json",
+ }
+
+
+ class MantaConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`MantaModel`] or a [`TFMantaModel`]. It is used to
+     instantiate a Manta model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the Manta-base architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Arguments:
+         vocab_size (`int`, *optional*, defaults to 384):
+             Vocabulary size of the Manta model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`MantaModel`] or [`TFMantaModel`].
+         byte_embedding_dim (`int`, *optional*, defaults to 64):
+             Size of the input byte embeddings fed to the MANTa tokenization module.
+         frontier_predictor_num_layers (`int`, *optional*, defaults to 1):
+             Number of sliding window attention layers in the frontier predictor of the tokenization module.
+         frontier_predictor_num_attention_heads (`int`, *optional*, defaults to 8):
+             Number of attention heads in the frontier predictor of the tokenization module.
+         frontier_predictor_attention_window (`int`, *optional*, defaults to 16):
+             Size of the sliding attention window along the byte sequence.
+         pooling_variance_regularization (`float`, *optional*, defaults to 1.0e-6):
+             Variance regularization term used in the computation of the byte-block assignment map.
+         pooling_kernel_size (`int` or `List[Tuple[int, int]]`, *optional*, defaults to 3):
+             Size(s) of the 1D-convolution kernel(s) used for the byte pooling operation in the tokenization module.
+             Providing an integer implies a single convolution filter of shape `(pooling_kernel_size, byte_embedding_dim)`.
+             Several kernels can be provided in the form `[(kernel_size, num_channels), ...]`; their outputs are
+             concatenated in the style of [Character BERT](https://arxiv.org/pdf/2010.10392.pdf).
+         pooling_depthwise_convolution (`bool`, *optional*, defaults to `True`):
+             Activates depthwise convolution in the pooling operation of the tokenization module. Depthwise convolution
+             is faster, but may be less powerful than regular convolution, and prevents using different numbers of
+             channels across kernels.
+         pooling_mean_pool (`bool`, *optional*, defaults to `False`):
+             Activates mean-pooling instead of the default max-pooling as the reduction operation for each block.
+         max_length_inputs (`int`, *optional*, defaults to 256):
+             Maximum sequence length of the byte input sequences. Can be greater than `max_length_encoder_decoder`.
+         max_length_encoder_decoder (`int`, *optional*, defaults to 256):
+             Maximum output sequence length of the tokenization module. This controls the length of the sequences that
+             the encoder-decoder model receives.
+         d_model (`int`, *optional*, defaults to 512):
+             Size of the encoder layers and the pooler layer.
+         d_kv (`int`, *optional*, defaults to 64):
+             Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
+             num_heads`.
+         d_ff (`int`, *optional*, defaults to 2048):
+             Size of the intermediate feed forward layer in each `MantaBlock`.
+         num_layers (`int`, *optional*, defaults to 6):
+             Number of hidden layers in the Transformer encoder.
+         num_decoder_layers (`int`, *optional*):
+             Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
+         num_heads (`int`, *optional*, defaults to 8):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         relative_attention_num_buckets (`int`, *optional*, defaults to 32):
+             The number of buckets to use for each attention layer.
+         relative_attention_max_distance (`int`, *optional*, defaults to 128):
+             The maximum distance of the longer sequences for the bucket separation.
+         dropout_rate (`float`, *optional*, defaults to 0.1):
+             The ratio for all dropout layers.
+         layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
+             The epsilon used by the layer normalization layers.
+         initializer_factor (`float`, *optional*, defaults to 1):
+             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+             testing).
+         feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
+             Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. Manta v1.1 uses the
+             `"gated-gelu"` feed forward projection. Original Manta uses `"relu"`.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models).
+     """
+     model_type = "manta"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     attribute_map = {"hidden_size": "d_model", "num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
+
+     def __init__(
+         self,
+         vocab_size=384,
+         byte_embedding_dim=64,
+         frontier_predictor_num_layers=1,
+         frontier_predictor_num_attention_heads=8,
+         frontier_predictor_attention_window=16,
+         pooling_variance_regularization=1.0e-6,
+         pooling_kernel_size=3,
+         pooling_depthwise_convolution=True,
+         pooling_mean_pool=False,
+         max_length_inputs=256,
+         max_length_encoder_decoder=256,
+         d_model=512,
+         d_kv=64,
+         d_ff=2048,
+         num_layers=6,
+         num_decoder_layers=None,
+         num_heads=8,
+         relative_attention_num_buckets=32,
+         relative_attention_max_distance=128,
+         dropout_rate=0.1,
+         layer_norm_epsilon=1e-6,
+         initializer_factor=1.0,
+         feed_forward_proj="relu",
+         is_encoder_decoder=True,
+         use_cache=True,
+         pad_token_id=0,
+         eos_token_id=1,
+         **kwargs
+     ):
+         self.vocab_size = vocab_size
+         self.byte_embedding_dim = byte_embedding_dim
+         self.frontier_predictor_num_layers = frontier_predictor_num_layers
+         self.frontier_predictor_num_attention_heads = frontier_predictor_num_attention_heads
+         self.frontier_predictor_attention_window = frontier_predictor_attention_window
+         self.pooling_variance_regularization = pooling_variance_regularization
+         self.pooling_kernel_size = pooling_kernel_size
+         self.pooling_depthwise_convolution = pooling_depthwise_convolution
+         self.pooling_mean_pool = pooling_mean_pool
+         self.max_length_inputs = max_length_inputs
+         self.max_length_encoder_decoder = max_length_encoder_decoder
+         self.d_model = d_model
+         self.d_kv = d_kv
+         self.d_ff = d_ff
+         self.num_layers = num_layers
+         self.num_decoder_layers = (
+             num_decoder_layers if num_decoder_layers is not None else self.num_layers
+         )  # default = symmetry
+         self.num_heads = num_heads
+         self.relative_attention_num_buckets = relative_attention_num_buckets
+         self.relative_attention_max_distance = relative_attention_max_distance
+         self.dropout_rate = dropout_rate
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_factor = initializer_factor
+         self.feed_forward_proj = feed_forward_proj
+         self.use_cache = use_cache
+
+         # Parse the feed-forward activation string, e.g. "relu" or "gated-gelu".
+         act_info = self.feed_forward_proj.split("-")
+         self.dense_act_fn = act_info[-1]
+         self.is_gated_act = act_info[0] == "gated"
+
+         if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
+             raise ValueError(
+                 f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
+                 "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
+                 "'gated-gelu' or 'relu'"
+             )
+
+         # Depthwise pooling convolutions require every kernel to keep `byte_embedding_dim` channels.
+         if (
+             pooling_depthwise_convolution
+             and isinstance(pooling_kernel_size, list)
+             and any(size != byte_embedding_dim for _, size in pooling_kernel_size)
+         ):
+             raise ValueError(
+                 f"`pooling_kernel_size`: {pooling_kernel_size} is not a valid list of kernels when "
+                 f"`pooling_depthwise_convolution` is True. Please set all kernel dimensions to {byte_embedding_dim} "
+                 "(=`byte_embedding_dim`) or `pooling_depthwise_convolution` to False."
+             )
+
+         # Input and output embeddings can only be tied if they have the same dimension.
+         tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
+         if tie_word_embeddings and byte_embedding_dim != d_model:
+             raise ValueError(
+                 f"The input embedding dimension (`byte_embedding_dim={byte_embedding_dim}`) is not the same as the "
+                 f"model hidden dimension (`d_model={d_model}`), making it impossible to tie input and output weights."
+             )
+
+         # for backwards compatibility
+         if feed_forward_proj == "gated-gelu":
+             self.dense_act_fn = "gelu_new"
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             eos_token_id=eos_token_id,
+             is_encoder_decoder=is_encoder_decoder,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
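
For reference, a minimal usage sketch of the configuration class added above (not part of the committed file). It assumes the module is importable as `configuration_manta`; the default values and the `attribute_map` aliases come from the code above.

from configuration_manta import MantaConfig

# Defaults reproduce the Manta-base hyperparameters documented in the class docstring.
config = MantaConfig()

# Any hyperparameter can be overridden at construction time.
small_config = MantaConfig(
    num_layers=4,
    num_decoder_layers=4,
    frontier_predictor_attention_window=8,
)

# `attribute_map` exposes the usual Transformers aliases for these attributes.
assert small_config.hidden_size == small_config.d_model == 512
assert small_config.num_hidden_layers == small_config.num_layers == 4
assert small_config.num_attention_heads == small_config.num_heads == 8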
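The `feed_forward_proj` string is parsed into `dense_act_fn` and `is_gated_act` in the constructor; a short sketch of the resulting attributes, with values following directly from that parsing logic:

config = MantaConfig(feed_forward_proj="relu")
print(config.is_gated_act)  # False
print(config.dense_act_fn)  # "relu"

config = MantaConfig(feed_forward_proj="gated-gelu")
print(config.is_gated_act)  # True
print(config.dense_act_fn)  # "gelu_new" (remapped for backwards compatibility)

# Any other format, e.g. "other-gelu" or "gated-gelu-extra", raises the ValueError defined above.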
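Likewise, an illustrative sketch of the `pooling_kernel_size` and `tie_word_embeddings` constraints enforced in the constructor; the specific kernel sizes and channel counts here are made up for the example:

# Character-BERT-style multi-kernel pooling: each entry is (kernel_size, num_channels).
# With depthwise convolution, every num_channels must equal byte_embedding_dim (64 by default).
config = MantaConfig(
    pooling_kernel_size=[(3, 64), (5, 64), (7, 64)],
    pooling_depthwise_convolution=True,
)

# Different channel counts per kernel require disabling depthwise convolution.
config = MantaConfig(
    pooling_kernel_size=[(3, 32), (5, 64), (7, 128)],
    pooling_depthwise_convolution=False,
)

# Tying input and output embeddings requires byte_embedding_dim == d_model.
config = MantaConfig(byte_embedding_dim=512, d_model=512, tie_word_embeddings=True)
# With the defaults (byte_embedding_dim=64, d_model=512), tie_word_embeddings=True raises a ValueError.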