matthewkenney committed on
Commit
2a0e8c9
1 Parent(s): e93c96f

Create configuration_mixformer_sequential.py

configuration_mixformer_sequential.py ADDED
@@ -0,0 +1,60 @@
+
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import math
+from typing import Any, Dict, List, Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class MixFormerSequentialConfig(PretrainedConfig):
+    """MixFormer (sequential for DeepSpeed) configuration."""
+
+    model_type = "mixformer-sequential"
+
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+        "input_emb_layer": "embd_layer",  # `input_emb_layer` key is for backward compatibility
+        "blocks": "architecture",  # `blocks` key is for backward compatibility
+    }
+
+    def __init__(
+        self,
+        vocab_size: Optional[int] = 50304,
+        n_positions: Optional[int] = 2048,
+        n_embd: Optional[int] = 1024,
+        n_layer: Optional[int] = 20,
+        n_inner: Optional[int] = None,
+        n_head: Optional[int] = 16,
+        rotary_dim: Optional[int] = 32,
+        activation_function: Optional[str] = "gelu_new",
+        embd_layer: Optional[str] = "default",
+        architecture: Union[Dict[str, Any], List[Dict[str, Any]]] = None,
+        embd_pdrop: Optional[float] = 0.0,
+        resid_pdrop: Optional[float] = 0.0,
+        layer_norm_epsilon: Optional[float] = 1e-5,
+        initializer_range: Optional[float] = 0.02,
+        tie_word_embeddings: Optional[bool] = False,
+        pad_vocab_size_multiple: Optional[int] = 64,
+        **kwargs
+    ) -> None:
+        self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_inner = n_inner
+        self.n_head = n_head
+        self.rotary_dim = min(rotary_dim, n_embd // n_head)
+        self.activation_function = activation_function
+        self.embd_layer = embd_layer
+        self.architecture = architecture
+        self.embd_pdrop = embd_pdrop
+        self.resid_pdrop = resid_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
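
A minimal usage sketch (not part of the commit): it assumes the added file is importable from the working directory and simply instantiates the config to show the vocabulary padding, the `attribute_map` aliases, and the `rotary_dim` cap.

    from configuration_mixformer_sequential import MixFormerSequentialConfig

    # Hypothetical values chosen for illustration only.
    config = MixFormerSequentialConfig(vocab_size=50257, n_embd=2048, n_head=32, n_layer=24)

    # vocab_size is rounded up to the next multiple of pad_vocab_size_multiple (64):
    # ceil(50257 / 64) * 64 = 50304
    print(config.vocab_size)         # 50304

    # attribute_map exposes the standard Hugging Face names as aliases of the native ones.
    print(config.hidden_size)        # 2048, alias of n_embd
    print(config.num_hidden_layers)  # 24, alias of n_layer

    # rotary_dim is capped at the per-head dimension: min(32, 2048 // 32) = 32
    print(config.rotary_dim)         # 32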