Qubitium committed on
Commit 3784dd7
Parent: 3961c63

Upload configuration_dbrx.py with huggingface_hub

Files changed (1)
  1. configuration_dbrx.py +264 -0
configuration_dbrx.py ADDED
@@ -0,0 +1,264 @@
"""Dbrx configuration."""
from typing import Any, Optional

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    This is the configuration class to store the configuration of a
    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*, defaults to `None`):
            If not `None`, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (Optional[int]): For grouped_query_attention only, allows the user to specify the number of kv heads.
        rope_theta (float): The base frequency for RoPE (rotary position embeddings).
    """

    def __init__(
        self,
        attn_pdrop: float = 0.0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        # Reject any leftover kwargs (other than `model_type`) so that typos
        # in config keys fail loudly instead of being silently ignored.
        for k in ['model_type']:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f'Found unknown {kwargs=}')

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str,
                        **kwargs: Any) -> 'PretrainedConfig':
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
                                                  **kwargs)

        # When loading from a full Dbrx checkpoint, pull out the nested
        # attention sub-config.
        if config_dict.get('model_type') == 'dbrx':
            config_dict = config_dict['attn_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)

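
# Usage sketch (illustrative values, not the released DBRX settings): a
# grouped-query attention sub-config with 8 KV heads and QKV clipping.
#
#     attn_cfg = DbrxAttentionConfig(
#         attn_pdrop=0.0,
#         clip_qkv=8.0,
#         kv_n_heads=8,
#         rope_theta=500000.0,
#     )
#     assert attn_cfg.clip_qkv == 8.0
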

class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    This is the configuration class to store the configuration of a
    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (dict, optional): A dict specifying the activation function for the FFN.
            The dict should have a key 'name' with the value being the name of
            the activation function along with any additional keyword arguments.
        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
        moe_top_k (int, optional): The number of experts each token is routed to in the mixture of experts layer.
        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
            This should only be used for benchmarking purposes.
    """

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1,
        uniform_expert_assignment: bool = False,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {'name': 'silu'}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights
        self.uniform_expert_assignment = uniform_expert_assignment

        # Reject any leftover kwargs (other than `model_type`) so that typos
        # in config keys fail loudly instead of being silently ignored.
        for k in ['model_type']:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f'Found unknown {kwargs=}')

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str,
                        **kwargs: Any) -> 'PretrainedConfig':
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path,
                                                  **kwargs)

        # When loading from a full Dbrx checkpoint, pull out the nested
        # FFN sub-config.
        if config_dict.get('model_type') == 'dbrx':
            config_dict = config_dict['ffn_config']

        if 'model_type' in config_dict and hasattr(
                cls,
                'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)

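
# Usage sketch (illustrative values, not the released DBRX settings): an FFN
# sub-config with a GELU activation and 16 experts, routing each token to the
# top-4 experts.
#
#     ffn_cfg = DbrxFFNConfig(
#         ffn_act_fn={'name': 'gelu'},
#         ffn_hidden_size=10752,
#         moe_num_experts=16,
#         moe_top_k=4,
#     )
#     assert ffn_cfg.moe_top_k == 4
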

class DbrxConfig(PretrainedConfig):
    """Configuration class for Dbrx.

    This is the configuration class to store the configuration of a
    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `input_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary load-balancing loss.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.05):
            The aux loss factor for the total loss.

    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = 'dbrx'
    # Map the standard Hugging Face attribute names onto Dbrx's native names.
    attribute_map = {
        'num_attention_heads': 'n_heads',
        'hidden_size': 'd_model',
        'num_hidden_layers': 'n_layers',
        'max_position_embeddings': 'max_seq_len'
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        max_seq_len: int = 2048,
        vocab_size: int = 32000,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        attn_config: Optional[DbrxAttentionConfig] = None,
        ffn_config: Optional[DbrxFFNConfig] = None,
        use_cache: bool = True,
        initializer_range: float = 0.02,
        output_router_logits: bool = False,
        router_aux_loss_coef: float = 0.05,
        **kwargs: Any,
    ):
        # Sub-configs may arrive as config objects or as plain dicts (e.g. when
        # deserialized from config.json); normalize both to config objects.
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef

        tie_word_embeddings = kwargs.pop('tie_word_embeddings', False)
        if tie_word_embeddings:
            raise ValueError(
                'tie_word_embeddings is not supported for Dbrx models.')

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
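
# Usage sketch (illustrative values, not the released DBRX hyperparameters):
# building a small Dbrx config from nested dicts, then reading it back through
# the standard Hugging Face attribute aliases defined in `attribute_map`.
#
#     config = DbrxConfig(
#         d_model=1024,
#         n_heads=8,
#         n_layers=4,
#         attn_config={'kv_n_heads': 2, 'rope_theta': 500000.0},
#         ffn_config={'moe_num_experts': 8, 'moe_top_k': 2},
#     )
#     assert config.hidden_size == config.d_model
#     assert config.num_attention_heads == config.n_heads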