vince62s committed on
Commit
d70f024
1 Parent(s): 20f6b45

Upload 6 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "\t\t": 50294,
3
+ "\t\t\t": 50293,
4
+ "\t\t\t\t": 50292,
5
+ "\t\t\t\t\t": 50291,
6
+ "\t\t\t\t\t\t": 50290,
7
+ "\t\t\t\t\t\t\t": 50289,
8
+ "\t\t\t\t\t\t\t\t": 50288,
9
+ "\t\t\t\t\t\t\t\t\t": 50287,
10
+ " ": 50286,
11
+ " ": 50285,
12
+ " ": 50284,
13
+ " ": 50283,
14
+ " ": 50282,
15
+ " ": 50281,
16
+ " ": 50280,
17
+ " ": 50279,
18
+ " ": 50278,
19
+ " ": 50277,
20
+ " ": 50276,
21
+ " ": 50275,
22
+ " ": 50274,
23
+ " ": 50273,
24
+ " ": 50272,
25
+ " ": 50271,
26
+ " ": 50270,
27
+ " ": 50269,
28
+ " ": 50268,
29
+ " ": 50267,
30
+ " ": 50266,
31
+ " ": 50265,
32
+ " ": 50264,
33
+ " ": 50263,
34
+ " ": 50262,
35
+ " ": 50261,
36
+ " ": 50260,
37
+ " ": 50259,
38
+ " ": 50258,
39
+ " ": 50257,
40
+ "<|im_end|>": 50295,
41
+ "<|im_start|>": 50296
42
+ }
config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "vince62s/phi-2-psy",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "PhiForCausalLM"
6
+ ],
7
+ "attn_pdrop": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "vince62s/phi-2-psy--configuration_phi.PhiConfig",
10
+ "AutoModelForCausalLM": "vince62s/phi-2-psy--modeling_phi.PhiForCausalLM"
11
+ },
12
+ "embd_pdrop": 0.0,
13
+ "flash_attn": false,
14
+ "flash_rotary": false,
15
+ "fused_dense": false,
16
+ "img_processor": null,
17
+ "initializer_range": 0.02,
18
+ "layer_norm_epsilon": 1e-05,
19
+ "model_type": "phi-msft",
20
+ "n_embd": 2560,
21
+ "n_head": 32,
22
+ "n_head_kv": null,
23
+ "n_inner": null,
24
+ "n_layer": 32,
25
+ "n_positions": 2048,
26
+ "resid_pdrop": 0.1,
27
+ "rotary_dim": 32,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.35.2",
31
+ "use_cache": false,
32
+ "vocab_size": 51200
33
+ }
configuration_phi.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math
from typing import Optional

from transformers import PretrainedConfig


class PhiConfig(PretrainedConfig):
    """Phi configuration.

    Stores the hyper-parameters of the Phi ("phi-msft") architecture.
    The vocabulary size is rounded up to the next multiple of
    ``pad_vocab_size_multiple`` so the embedding matrix keeps a padded,
    hardware-friendly width.
    """

    model_type = "phi-msft"
    # Map the generic HF config attribute names onto the Phi-specific ones.
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size: int = 50304,
        n_positions: int = 2048,
        n_embd: int = 1024,
        n_layer: int = 20,
        n_inner: Optional[int] = None,
        n_head: int = 16,
        n_head_kv: Optional[int] = None,
        rotary_dim: Optional[int] = 32,
        activation_function: Optional[str] = "gelu_new",
        flash_attn: bool = False,
        flash_rotary: bool = False,
        fused_dense: bool = False,
        attn_pdrop: float = 0.0,
        embd_pdrop: float = 0.0,
        resid_pdrop: float = 0.0,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02,
        tie_word_embeddings: bool = False,
        pad_vocab_size_multiple: int = 64,
        **kwargs,
    ) -> None:
        """Initialize a :class:`PhiConfig`.

        Args:
            vocab_size: Raw vocabulary size; padded up to a multiple of
                ``pad_vocab_size_multiple``.
            n_positions: Maximum sequence length (context window).
            n_embd: Hidden (embedding) dimension.
            n_layer: Number of transformer blocks.
            n_inner: MLP inner dimension; ``None`` lets the model pick a default.
            n_head: Number of attention heads.
            n_head_kv: Number of key/value heads (``None`` = same as ``n_head``).
            rotary_dim: Rotary-embedding dimension, clamped to the per-head
                dimension; ``None`` disables the clamp (no rotary dimension set).
            activation_function: Name of the MLP activation.
            flash_attn / flash_rotary / fused_dense: Optional fused-kernel flags.
            attn_pdrop / embd_pdrop / resid_pdrop: Dropout probabilities.
            layer_norm_epsilon: LayerNorm epsilon.
            initializer_range: Stddev for weight initialization.
            tie_word_embeddings: Tie input/output embeddings.
            pad_vocab_size_multiple: Padding granularity for ``vocab_size``.
            **kwargs: Forwarded to :class:`transformers.PretrainedConfig`.
        """
        # Pad the vocabulary up to the next multiple of
        # ``pad_vocab_size_multiple`` (e.g. 50257 -> 50304 with multiple 64).
        self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_inner = n_inner
        self.n_head = n_head
        self.n_head_kv = n_head_kv
        # Bug fix: ``rotary_dim`` is declared Optional, but the original
        # ``min(rotary_dim, n_embd // n_head)`` raises TypeError for None.
        # Only clamp to the per-head dimension when a value is provided.
        self.rotary_dim = None if rotary_dim is None else min(rotary_dim, n_embd // n_head)
        self.activation_function = activation_function
        self.flash_attn = flash_attn
        self.flash_rotary = flash_rotary
        self.fused_dense = fused_dense
        self.attn_pdrop = attn_pdrop
        self.embd_pdrop = embd_pdrop
        self.resid_pdrop = resid_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
mergekit_config.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ slices:
3
+ - sources:
4
+ - model: rhysjones/phi-2-orange
5
+ layer_range: [0, 32]
6
+ - model: cognitivecomputations/dolphin-2_6-phi-2
7
+ layer_range: [0, 32]
8
+ merge_method: slerp
9
+ base_model: rhysjones/phi-2-orange
10
+ parameters:
11
+ t:
12
+ - filter: self_attn
13
+ value: [0, 0.5, 0.3, 0.7, 1]
14
+ - filter: mlp
15
+ value: [1, 0.5, 0.7, 0.3, 0]
16
+ - value: 0.5
17
+ dtype: bfloat16
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors.index.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"metadata": {"mergekit_version": "0.0.3.2"}, "weight_map": {"transformer.embd.wte.weight": "model-00001-of-00003.safetensors", "transformer.h.0.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.0.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.0.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.0.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.0.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.0.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.0.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.0.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.0.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.0.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.1.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.1.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.1.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.1.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.1.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.1.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.1.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.1.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.1.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.1.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.10.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.10.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.10.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.10.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.10.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.10.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.10.mlp.fc1.bias": 
"model-00001-of-00003.safetensors", "transformer.h.10.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.10.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.10.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.11.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.11.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.11.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.11.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.11.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.11.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.11.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.11.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.11.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.11.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.12.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.12.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.12.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.12.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.12.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.12.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.12.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.12.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.12.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.12.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.13.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.13.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.13.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.13.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.13.mixer.out_proj.bias": 
"model-00001-of-00003.safetensors", "transformer.h.13.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.13.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.13.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.13.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.13.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.14.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.14.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.14.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.14.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.14.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.14.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.14.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.14.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.14.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.14.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.15.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.15.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.15.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.15.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.15.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.15.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.15.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.15.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.15.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.15.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.16.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.16.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.16.mixer.Wqkv.bias": 
"model-00001-of-00003.safetensors", "transformer.h.16.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.16.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.16.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.16.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.16.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.16.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.16.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.17.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.17.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.17.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.17.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.17.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.17.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.17.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.17.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.17.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.17.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.18.ln.bias": "model-00001-of-00003.safetensors", "transformer.h.18.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.18.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.18.mixer.Wqkv.weight": "model-00001-of-00003.safetensors", "transformer.h.18.mixer.out_proj.bias": "model-00001-of-00003.safetensors", "transformer.h.18.mixer.out_proj.weight": "model-00001-of-00003.safetensors", "transformer.h.18.mlp.fc1.bias": "model-00001-of-00003.safetensors", "transformer.h.18.mlp.fc1.weight": "model-00001-of-00003.safetensors", "transformer.h.18.mlp.fc2.bias": "model-00001-of-00003.safetensors", "transformer.h.18.mlp.fc2.weight": "model-00001-of-00003.safetensors", "transformer.h.19.ln.bias": 
"model-00001-of-00003.safetensors", "transformer.h.19.ln.weight": "model-00001-of-00003.safetensors", "transformer.h.19.mixer.Wqkv.bias": "model-00001-of-00003.safetensors", "transformer.h.19.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.19.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.19.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.19.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.19.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.19.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.19.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.2.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.2.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.2.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.2.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.2.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.2.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.2.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.2.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.2.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.2.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.20.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.20.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.20.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.20.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.20.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.20.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.20.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.20.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.20.mlp.fc2.bias": 
"model-00002-of-00003.safetensors", "transformer.h.20.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.21.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.21.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.21.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.21.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.21.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.21.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.21.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.21.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.21.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.21.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.22.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.22.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.22.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.22.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.22.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.22.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.22.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.22.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.22.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.22.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.23.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.23.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.23.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.23.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.23.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.23.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.23.mlp.fc1.bias": 
"model-00002-of-00003.safetensors", "transformer.h.23.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.23.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.23.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.24.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.24.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.24.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.24.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.24.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.24.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.24.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.24.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.24.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.24.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.25.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.25.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.25.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.25.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.25.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.25.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.25.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.25.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.25.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.25.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.26.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.26.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.26.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.26.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.26.mixer.out_proj.bias": 
"model-00002-of-00003.safetensors", "transformer.h.26.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.26.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.26.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.26.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.26.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.27.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.27.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.27.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.27.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.27.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.27.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.27.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.27.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.27.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.27.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.28.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.28.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.28.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.28.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.28.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.28.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.28.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.28.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.28.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.28.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.29.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.29.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.29.mixer.Wqkv.bias": 
"model-00002-of-00003.safetensors", "transformer.h.29.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.29.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.29.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.29.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.29.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.29.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.29.mlp.fc2.weight": "model-00002-of-00003.safetensors", "transformer.h.3.ln.bias": "model-00002-of-00003.safetensors", "transformer.h.3.ln.weight": "model-00002-of-00003.safetensors", "transformer.h.3.mixer.Wqkv.bias": "model-00002-of-00003.safetensors", "transformer.h.3.mixer.Wqkv.weight": "model-00002-of-00003.safetensors", "transformer.h.3.mixer.out_proj.bias": "model-00002-of-00003.safetensors", "transformer.h.3.mixer.out_proj.weight": "model-00002-of-00003.safetensors", "transformer.h.3.mlp.fc1.bias": "model-00002-of-00003.safetensors", "transformer.h.3.mlp.fc1.weight": "model-00002-of-00003.safetensors", "transformer.h.3.mlp.fc2.bias": "model-00002-of-00003.safetensors", "transformer.h.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.30.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.30.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.4.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.4.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.4.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.4.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.4.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.4.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.4.mlp.fc2.bias": 
"model-00003-of-00003.safetensors", "transformer.h.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.5.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.5.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.5.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.5.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.5.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.5.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.6.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.6.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.6.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.6.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.6.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.6.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.7.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.7.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.7.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.7.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.7.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.7.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", 
"transformer.h.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.8.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.8.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.8.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.8.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.8.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.8.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.9.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.9.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.9.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.9.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.9.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.9.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", "lm_head.linear.bias": "model-00003-of-00003.safetensors", "lm_head.linear.weight": "model-00003-of-00003.safetensors", "lm_head.ln.bias": "model-00003-of-00003.safetensors", "lm_head.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.30.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.30.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", 
"transformer.h.30.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.30.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.30.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.30.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.30.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.30.mlp.fc2.weight": "model-00003-of-00003.safetensors", "transformer.h.31.ln.bias": "model-00003-of-00003.safetensors", "transformer.h.31.ln.weight": "model-00003-of-00003.safetensors", "transformer.h.31.mixer.Wqkv.bias": "model-00003-of-00003.safetensors", "transformer.h.31.mixer.Wqkv.weight": "model-00003-of-00003.safetensors", "transformer.h.31.mixer.out_proj.bias": "model-00003-of-00003.safetensors", "transformer.h.31.mixer.out_proj.weight": "model-00003-of-00003.safetensors", "transformer.h.31.mlp.fc1.bias": "model-00003-of-00003.safetensors", "transformer.h.31.mlp.fc1.weight": "model-00003-of-00003.safetensors", "transformer.h.31.mlp.fc2.bias": "model-00003-of-00003.safetensors", "transformer.h.31.mlp.fc2.weight": "model-00003-of-00003.safetensors"}}