gugarosa committed on
Commit bd98e4e
1 Parent(s): 34b22f4

Delete configuration_mixformer_sequential.py

configuration_mixformer_sequential.py DELETED
@@ -1,61 +0,0 @@
- # Copyright (c) Microsoft Corporation.
- # Licensed under the MIT license.
-
- import math
- from typing import Optional
-
- from transformers import PretrainedConfig
-
-
- class MixFormerSequentialConfig(PretrainedConfig):
-     """MixFormer (sequential for DeepSpeed) configuration."""
-
-     model_type = "mixformer-sequential"
-
-     attribute_map = {
-         "max_position_embeddings": "n_positions",
-         "hidden_size": "n_embd",
-         "num_attention_heads": "n_head",
-         "num_hidden_layers": "n_layer",
-     }
-
-     def __init__(
-         self,
-         vocab_size: int = 50304,
-         n_positions: int = 2048,
-         n_embd: int = 1024,
-         n_layer: int = 20,
-         n_inner: Optional[int] = None,
-         n_head: int = 16,
-         n_head_kv: Optional[int] = None,
-         rotary_dim: Optional[int] = 32,
-         activation_function: Optional[str] = "gelu_new",
-         flash_rotary: bool = False,
-         fused_dense: bool = False,
-         attn_pdrop: float = 0.0,
-         embd_pdrop: float = 0.0,
-         resid_pdrop: float = 0.0,
-         layer_norm_epsilon: float = 1e-5,
-         initializer_range: float = 0.02,
-         tie_word_embeddings: bool = False,
-         pad_vocab_size_multiple: int = 64,
-         **kwargs
-     ) -> None:
-         self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
-         self.n_positions = n_positions
-         self.n_embd = n_embd
-         self.n_layer = n_layer
-         self.n_inner = n_inner
-         self.n_head = n_head
-         self.n_head_kv = n_head_kv
-         self.rotary_dim = min(rotary_dim, n_embd // n_head)
-         self.activation_function = activation_function
-         self.flash_rotary = flash_rotary
-         self.fused_dense = fused_dense
-         self.attn_pdrop = attn_pdrop
-         self.embd_pdrop = embd_pdrop
-         self.resid_pdrop = resid_pdrop
-         self.layer_norm_epsilon = layer_norm_epsilon
-         self.initializer_range = initializer_range
-
-         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
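For reference, a minimal sketch of how this config behaved before deletion, assuming the module above is still importable locally as configuration_mixformer_sequential (an assumption; the file no longer exists after this commit). It shows the two non-obvious behaviors: vocab_size is rounded up to the nearest multiple of pad_vocab_size_multiple, and attribute_map lets the canonical PretrainedConfig names (hidden_size, num_hidden_layers, ...) resolve to the GPT-style fields stored on the instance.

    # Sketch only: assumes the deleted module above is available on the path.
    from configuration_mixformer_sequential import MixFormerSequentialConfig

    # A GPT-2-sized 50257-token vocabulary gets padded up to a multiple of 64:
    # ceil(50257 / 64) * 64 == 50304.
    config = MixFormerSequentialConfig(vocab_size=50257)
    assert config.vocab_size == 50304

    # attribute_map aliases Hugging Face's canonical names onto the
    # GPT-style attributes set in __init__.
    assert config.hidden_size == config.n_embd == 1024
    assert config.num_hidden_layers == config.n_layer == 20

    # rotary_dim is clamped to the per-head dimension: min(32, 1024 // 16) == 32.
    assert config.rotary_dim == 32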