simlamkr1 commited on
Commit
16cf2c1
1 Parent(s): 377cfb4

Upload 5 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257
40
+ }
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "phi-1.5-half",
3
+ "activation_function": "gelu_new",
4
+ "architecture": {
5
+ "block_cls": "parallel",
6
+ "mixer": {},
7
+ "mlp": {
8
+ "mlp_cls": "mlp"
9
+ }
10
+ },
11
+ "architectures": [
12
+ "MixFormerSequentialForCausalLM"
13
+ ],
14
+ "auto_map": {
15
+ "AutoConfig": "configuration_mixformer_sequential.MixFormerSequentialConfig",
16
+ "AutoModelForCausalLM": "modeling_mixformer_sequential.MixFormerSequentialForCausalLM"
17
+ },
18
+ "embd_layer": "default",
19
+ "embd_pdrop": 0.0,
20
+ "initializer_range": 0.02,
21
+ "layer_norm_epsilon": 1e-05,
22
+ "model_type": "mixformer-sequential",
23
+ "n_embd": 2048,
24
+ "n_head": 32,
25
+ "n_inner": null,
26
+ "n_layer": 24,
27
+ "n_positions": 2048,
28
+ "phyagi_version": "0.0.4.dev",
29
+ "resid_pdrop": 0.0,
30
+ "rotary_dim": 32,
31
+ "tie_word_embeddings": false,
32
+ "torch_dtype": "float16",
33
+ "transformers_version": "4.32.1",
34
+ "vocab_size": 51200
35
+ }
configuration_mixformer_sequential.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import math
5
+ from typing import Any, Dict, List, Optional, Union
6
+
7
+ from transformers import PretrainedConfig
8
+
9
+
10
+ class MixFormerSequentialConfig(PretrainedConfig):
11
+ """MixFormer (sequential for DeepSpeed) configuration."""
12
+
13
+ model_type = "mixformer-sequential"
14
+
15
+ attribute_map = {
16
+ "max_position_embeddings": "n_positions",
17
+ "hidden_size": "n_embd",
18
+ "num_attention_heads": "n_head",
19
+ "num_hidden_layers": "n_layer",
20
+ "input_emb_layer": "embd_layer", # `input_emb_layer` key is for backward compatibility
21
+ "blocks": "architecture", # `blocks` key is for backward compatibility
22
+ }
23
+
24
+ def __init__(
25
+ self,
26
+ vocab_size: Optional[int] = 50304,
27
+ n_positions: Optional[int] = 2048,
28
+ n_embd: Optional[int] = 1024,
29
+ n_layer: Optional[int] = 20,
30
+ n_inner: Optional[int] = None,
31
+ n_head: Optional[int] = 16,
32
+ rotary_dim: Optional[int] = 32,
33
+ activation_function: Optional[str] = "gelu_new",
34
+ embd_layer: Optional[str] = "default",
35
+ architecture: Union[Dict[str, Any], List[Dict[str, Any]]] = None,
36
+ embd_pdrop: Optional[float] = 0.0,
37
+ resid_pdrop: Optional[float] = 0.0,
38
+ layer_norm_epsilon: Optional[float] = 1e-5,
39
+ initializer_range: Optional[float] = 0.02,
40
+ tie_word_embeddings: Optional[bool] = False,
41
+ pad_vocab_size_multiple: Optional[int] = 64,
42
+ **kwargs
43
+ ) -> None:
44
+ self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
45
+ self.n_positions = n_positions
46
+ self.n_embd = n_embd
47
+ self.n_layer = n_layer
48
+ self.n_inner = n_inner
49
+ self.n_head = n_head
50
+ self.rotary_dim = min(rotary_dim, n_embd // n_head)
51
+ self.activation_function = activation_function
52
+ self.embd_layer = embd_layer
53
+ self.architecture = architecture
54
+ self.embd_pdrop = embd_pdrop
55
+ self.resid_pdrop = resid_pdrop
56
+ self.layer_norm_epsilon = layer_norm_epsilon
57
+ self.initializer_range = initializer_range
58
+
59
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.32.1"
4
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff