ammarnasr committed on
Commit
bd546f3
1 Parent(s): f55be2c

Create configuration_codegen.py

Browse files
Files changed (1) hide show
  1. configuration_codegen.py +220 -0
configuration_codegen.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ CodeGen model configuration"""
16
+ from collections import OrderedDict
17
+ from typing import Any, List, Mapping, Optional
18
+
19
+ from transformers import PreTrainedTokenizer, TensorType, is_torch_available
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.onnx import OnnxConfigWithPast, PatchingSpec
22
+ from transformers.utils import logging
23
+
24
+
25
+ logger = logging.get_logger(__name__)
26
+
27
+
28
class CodeGenConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a
    CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the CodeGen
    [Salesforce/codegen-2B-mono](https://huggingface.co/Salesforce/codegen-2B-mono) architecture. Configuration objects
    inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
    [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50400):
            Vocabulary size of the CodeGen model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`CodeGenModel`].
        n_positions (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        n_ctx (`int`, *optional*, defaults to 2048):
            Stored on the configuration as `n_ctx`. Its consumer is not visible in this file; presumably a
            context-length setting that mirrors `n_positions` (to be confirmed against the modeling code).
        n_embd (`int`, *optional*, defaults to 4096):
            Dimensionality of the embeddings and hidden states.
        n_layer (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer encoder.
        n_head (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        rotary_dim (`int`, *optional*, defaults to 64):
            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
        n_inner (`int`, *optional*, defaults to `None`):
            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        embd_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
            The epsilon to use in the layer normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        scale_attn_weights (`bool`, *optional*, defaults to `True`):
            Scale attention weights by dividing by sqrt(hidden_size).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        bos_token_id (`int`, *optional*, defaults to 50256):
            Id of the beginning-of-sequence token. Stored on the configuration and forwarded to
            [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 50256):
            Id of the end-of-sequence token. Stored on the configuration and forwarded to [`PretrainedConfig`].
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to [`PretrainedConfig`]; see its documentation for the exact semantics.
        kwargs:
            Any additional keyword arguments are forwarded unchanged to [`PretrainedConfig`].

    Example:

    ```python
    >>> from transformers import CodeGenModel, CodeGenConfig

    >>> # Initializing a CodeGen configuration with the default values
    >>> configuration = CodeGenConfig()

    >>> # Initializing a model from the configuration
    >>> model = CodeGenModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "codegen"
    # Lets the generic attribute names on the left be used as aliases for the
    # GPT-style names (right) that this class actually stores.
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size=50400,
        n_positions=2048,
        n_ctx=2048,
        n_embd=4096,
        n_layer=28,
        n_head=16,
        rotary_dim=64,
        n_inner=None,
        activation_function="gelu_new",
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attn_pdrop=0.0,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        scale_attn_weights=True,
        use_cache=True,
        bos_token_id=50256,
        eos_token_id=50256,
        tie_word_embeddings=False,
        **kwargs
    ):
        # Model-specific hyper-parameters are stored before delegating to the
        # base class, which receives the token ids, the embedding-tying flag
        # and any remaining keyword arguments.
        self.vocab_size = vocab_size
        self.n_ctx = n_ctx
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner
        self.rotary_dim = rotary_dim
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        super().__init__(
            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
        )
139
+
140
+
141
# Adapted from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig
class CodeGenOnnxConfig(OnnxConfigWithPast):
    """ONNX export configuration for CodeGen, optionally exposing past key/values as inputs."""

    def __init__(
        self,
        config: PretrainedConfig,
        task: str = "default",
        patching_specs: Optional[List[PatchingSpec]] = None,
        use_past: bool = False,
    ):
        """
        Forward all arguments to `OnnxConfigWithPast` and make sure the wrapped config has a pad token id.

        Args:
            config: Model configuration to export.
            task: Export task name, passed through to the base class.
            patching_specs: Optional patching specs, passed through to the base class.
            use_past: Whether past key/values are part of the exported inputs.
        """
        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
        # A missing or falsy (`None` / `0`) pad token id is replaced by 0.
        if not getattr(self._config, "pad_token_id", None):
            # TODO: how to do that better?
            self._config.pad_token_id = 0

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """
        Ordered mapping from input name to its dynamic axes (`axis index -> axis name`).

        When `use_past` is set, the past key/value entries are added through the inherited
        `fill_with_past_key_values_` helper and the attention mask's second axis is named
        `"past_sequence + sequence"` instead of `"sequence"`.
        """
        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")
            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
        else:
            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}

        return common_inputs

    @property
    def num_layers(self) -> int:
        """Number of layers, read from the wrapped config's `n_layer`."""
        return self._config.n_layer

    @property
    def num_attention_heads(self) -> int:
        """Number of attention heads, read from the wrapped config's `n_head`."""
        return self._config.n_head

    def generate_dummy_inputs(
        self,
        tokenizer: PreTrainedTokenizer,
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional[TensorType] = None,
    ) -> Mapping[str, Any]:
        """
        Build dummy inputs for the export, ordered as `input_ids`, `past_key_values` (only with `use_past`),
        `attention_mask`.

        Args:
            tokenizer: Tokenizer handed to the parent implementation to produce the base inputs.
            batch_size: Passed through to the parent implementation.
            seq_length: Passed through to the parent implementation.
            is_pair: Passed through to the parent implementation.
            framework: Passed through to the parent implementation.

        Returns:
            An `OrderedDict` holding the dummy inputs.

        Raises:
            ValueError: If `use_past` is set but PyTorch is not available.
        """
        # `super(OnnxConfigWithPast, self)` starts the method lookup *after* `OnnxConfigWithPast`,
        # so that class's own `generate_dummy_inputs` is deliberately bypassed.
        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
        )

        # We need to order the inputs in the way they appear in the forward()
        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
        attention_mask = common_inputs["attention_mask"]

        # Need to add the past_keys
        if self.use_past:
            if not is_torch_available():
                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
            import torch

            batch, seqlen = common_inputs["input_ids"].shape
            # Not using the same length for past_key_values
            past_key_values_length = seqlen + 2
            past_shape = (
                batch,
                self.num_attention_heads,
                past_key_values_length,
                self._config.hidden_size // self.num_attention_heads,
            )
            # One (key, value) pair of zero tensors per layer.
            ordered_inputs["past_key_values"] = [
                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
            ]
            # Extend the mask with ones so it also spans the dummy past positions.
            attention_mask = torch.cat(
                [attention_mask, torch.ones(batch, past_key_values_length, dtype=attention_mask.dtype)], dim=1
            )

        # Inserted last so the key order stays input_ids -> past_key_values -> attention_mask.
        ordered_inputs["attention_mask"] = attention_mask

        return ordered_inputs

    @property
    def default_onnx_opset(self) -> int:
        """Default ONNX opset version used for the export (13)."""
        return 13