Tongjilibo committed
Commit 3082b91
1 Parent(s): 4a5ef8a

Update Yi 1.5
.gitignore ADDED
@@ -0,0 +1 @@
+ config.json
Yi-1.5-6B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,316 @@
+ {
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 4096,
+ "model": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 4,
+ "pad_token_id": 0,
+ "layer_norm_eps": 1e-06,
+ "rope_theta": 5000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 64000,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "segment_vocab_size": 0,
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
+ "mapping": {
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
+ "LayerNormFinal.weight": "model.norm.weight",
+ "lm_head.weight": "lm_head.weight",
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
+ }
+ }
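
The "mapping" table above pairs each bert4torch parameter name (left) with its counterpart in the Hugging Face LLaMA-style checkpoint (right). A minimal sketch of what such a table lets a loader do, namely invert it and rename a checkpoint's state_dict; the file paths and variable names are illustrative, and this is not bert4torch's own loading code (which consumes this config itself, via something like build_transformer_model(config_path, checkpoint_path)):

    import json
    import torch

    # Read the config shipped alongside the checkpoint (illustrative path).
    with open("Yi-1.5-6B-Chat/bert4torch_config.json") as f:
        config = json.load(f)

    # Invert the table: HF checkpoint name -> bert4torch name.
    hf_to_b4t = {hf_name: b4t_name for b4t_name, hf_name in config["mapping"].items()}

    # Rename the tensors of an HF-format state_dict accordingly.
    hf_state = torch.load("pytorch_model.bin", map_location="cpu")  # illustrative path
    b4t_state = {hf_to_b4t[k]: v for k, v in hf_state.items() if k in hf_to_b4t}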
Yi-1.5-6B/bert4torch_config.json ADDED
@@ -0,0 +1,316 @@
+ {
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 4096,
+ "model": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 32,
+ "num_key_value_heads": 4,
+ "pad_token_id": 0,
+ "layer_norm_eps": 1e-06,
+ "rope_theta": 5000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 64000,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "segment_vocab_size": 0,
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
+ "mapping": {
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
+ "LayerNormFinal.weight": "model.norm.weight",
+ "lm_head.weight": "lm_head.weight",
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
+ }
+ }
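
Both 6B configs set num_key_value_heads to 4 against 32 attention heads, i.e. grouped-query attention: every 8 query heads share one K/V head, so the k_proj/v_proj weights mapped above have only 4 * 128 = 512 output rows while q_proj keeps 4096. A hedged sketch of the usual expansion step at attention time (function name and test shapes are mine, not bert4torch's):

    import torch

    def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
        # Expand (batch, n_kv_heads, seq, head_dim) so each KV head
        # serves n_rep consecutive query heads.
        b, n_kv, s, d = x.shape
        return x[:, :, None].expand(b, n_kv, n_rep, s, d).reshape(b, n_kv * n_rep, s, d)

    # Yi-1.5-6B: 32 query heads, 4 KV heads, head_dim = 4096 // 32 = 128.
    k = torch.randn(1, 4, 10, 128)
    assert repeat_kv(k, 32 // 4).shape == (1, 32, 10, 128)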
Yi-1.5-9B-32K/bert4torch_config.json ADDED
@@ -0,0 +1,460 @@
+ {
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 32768,
+ "model": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 48,
+ "num_key_value_heads": 4,
+ "pad_token_id": 0,
+ "layer_norm_eps": 1e-06,
+ "rope_theta": 5000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 64000,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "segment_vocab_size": 0,
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 32768, "eos_token_id": 2},
+ "mapping": {
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
+ "LayerNormFinal.weight": "model.norm.weight",
+ "lm_head.weight": "lm_head.weight",
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
+ "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
+ "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
+ "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
+ "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
+ "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
+ "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
+ "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
+ "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
+ "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
+ "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
+ "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
+ "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
+ "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
+ "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
+ "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
+ "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
+ "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
+ "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
+ "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
+ "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
+ "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
+ "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
+ "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
+ "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
+ "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
+ "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
+ "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
+ "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
+ "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
+ "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
+ "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
+ "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
+ "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
+ "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
+ "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
+ "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
+ "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
+ "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
+ "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
+ "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
+ "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
+ "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
+ "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
+ "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
+ "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
+ "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
+ "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
+ "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
+ "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
+ "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
+ "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
+ "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
+ "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
+ "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
+ "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
+ "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
+ "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
+ "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
+ "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
+ "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
+ "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
+ "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
+ "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
+ "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
+ "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
+ "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
+ "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
+ "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
+ "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
+ "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
+ "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
+ "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
+ "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
+ "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
+ "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
+ "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
+ "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
+ "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
+ "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
+ "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
+ "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
+ "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
+ "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
+ "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
+ "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
+ "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
+ "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
+ "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
+ "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
+ "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
+ "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
+ "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
+ "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
+ "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
+ "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
+ "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
+ "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
+ "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
+ "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
+ "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
+ "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
+ "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
+ "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
+ "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
+ "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
+ "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
+ "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
+ "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
+ "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
+ "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
+ "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
+ "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
+ "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
+ "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
+ "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
+ "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
+ "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
+ "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
+ "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
+ "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
+ "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
+ "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
+ "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
+ "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
+ "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
+ "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
+ "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
+ "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
+ "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
+ "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
+ "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
+ "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
+ "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
+ "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
+ "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
+ "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
+ "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
+ "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
+ "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
+ "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
+ "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
+ "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
+ "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
+ "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
+ }
+ }
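
Compared with the 6B-Chat config, the 9B configs keep the same hidden size (4096) and GQA layout (32 query heads, 4 KV heads) but use 48 decoder layers instead of 32, and the 32K variant raises max_position_embeddings and the generation max_length to 32768. The per-layer section of "mapping" is the same nine-entry template repeated once per layer, so it can be generated mechanically; a sketch (illustrative only, not code from this repo):

# The per-layer part of "mapping" is one 9-entry template repeated
# num_hidden_layers times; only the layer index changes.
TEMPLATE = {
    "decoderLayer.{i}.multiHeadAttention.q.weight": "model.layers.{i}.self_attn.q_proj.weight",
    "decoderLayer.{i}.multiHeadAttention.k.weight": "model.layers.{i}.self_attn.k_proj.weight",
    "decoderLayer.{i}.multiHeadAttention.v.weight": "model.layers.{i}.self_attn.v_proj.weight",
    "decoderLayer.{i}.multiHeadAttention.o.weight": "model.layers.{i}.self_attn.o_proj.weight",
    "decoderLayer.{i}.attnLayerNorm.weight": "model.layers.{i}.ln1.weight",
    "decoderLayer.{i}.feedForward.intermediateDense.weight": "model.layers.{i}.mlp.gate_proj.weight",
    "decoderLayer.{i}.feedForward.outputDense.weight": "model.layers.{i}.mlp.down_proj.weight",
    "decoderLayer.{i}.ffnLayerNorm.weight": "model.layers.{i}.ln2.weight",
    "decoderLayer.{i}.feedForward.intermediateDense2.weight": "model.layers.{i}.mlp.up_proj.weight",
}

def build_mapping(num_hidden_layers: int) -> dict:
    # Three global entries, then the template instantiated per layer.
    mapping = {
        "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
        "LayerNormFinal.weight": "model.norm.weight",
        "lm_head.weight": "lm_head.weight",
    }
    for i in range(num_hidden_layers):  # 48 for the 9B configs, 32 for 6B-Chat
        for b4t_name, hf_name in TEMPLATE.items():
            mapping[b4t_name.format(i=i)] = hf_name.format(i=i)
    return mapping

build_mapping(48) reproduces the 435-entry mapping above (3 global entries plus 48 x 9 per-layer entries).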
Yi-1.5-9B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,460 @@
+ {
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 16384,
+ "model": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 48,
+ "num_key_value_heads": 4,
+ "pad_token_id": 0,
+ "layer_norm_eps": 1e-06,
+ "rope_theta": 5000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 64000,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "segment_vocab_size": 0,
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 16384, "eos_token_id": 2},
+ "mapping": {
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
+ "LayerNormFinal.weight": "model.norm.weight",
+ "lm_head.weight": "lm_head.weight",
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
+ "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
+ "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
+ "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
+ "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
+ "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
+ "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
+ "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
+ "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
+ "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
+ "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
+ "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
+ "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
+ "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
+ "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
+ "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
+ "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
+ "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
+ "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
+ "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
+ "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
+ "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
+ "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
+ "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
+ "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
+ "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
+ "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
+ "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
+ "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
+ "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
+ "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
+ "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
+ "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
+ "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
+ "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
+ "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
+ "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
+ "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
+ "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
+ "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
+ "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
+ "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
+ "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
+ "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
+ "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
+ "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
+ "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
+ "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
+ "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
+ "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
+ "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
+ "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
+ "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
+ "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
+ "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
+ "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
+ "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
+ "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
+ "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
+ "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
+ "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
+ "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
+ "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
+ "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
+ "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
+ "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
+ "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
+ "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
+ "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
+ "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
+ "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
+ "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
+ "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
+ "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
+ "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
+ "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
+ "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
+ "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
+ "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
+ "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
+ "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
+ "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
+ "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
+ "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
+ "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
+ "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
+ "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
+ "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
+ "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
+ "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
+ "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
+ "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
+ "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
+ "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
+ "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
+ "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
+ "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
+ "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
+ "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
+ "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
+ "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
+ "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
+ "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
+ "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
+ "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
+ "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
+ "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
+ "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
+ "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
+ "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
+ "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
+ "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
+ "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
+ "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
+ "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
+ "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
+ "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
+ "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
+ "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
+ "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
+ "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
+ "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
+ "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
+ "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
+ "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
+ "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
+ "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
+ "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
+ "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
+ "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
+ "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
+ "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
+ "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
+ "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
+ "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
+ "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
+ "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
+ "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
+ "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
+ "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
+ "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
+ "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
+ "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
+ "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
+ "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
+ }
460
+ }
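Note: the "mapping" section above is the same nine per-layer key pairs repeated for every decoder layer (48 layers in the 9B configs of this commit), so a file like this is easier to generate than to edit by hand. Below is a minimal sketch of such a generator, assuming only the key patterns visible in this diff; build_mapping() is an illustrative helper, not part of the bert4torch API.

import json

# Nine per-layer pairs, copied from the mapping entries in this config.
PER_LAYER = {
    "decoderLayer.{i}.multiHeadAttention.q.weight": "model.layers.{i}.self_attn.q_proj.weight",
    "decoderLayer.{i}.multiHeadAttention.k.weight": "model.layers.{i}.self_attn.k_proj.weight",
    "decoderLayer.{i}.multiHeadAttention.v.weight": "model.layers.{i}.self_attn.v_proj.weight",
    "decoderLayer.{i}.multiHeadAttention.o.weight": "model.layers.{i}.self_attn.o_proj.weight",
    "decoderLayer.{i}.attnLayerNorm.weight": "model.layers.{i}.ln1.weight",
    "decoderLayer.{i}.feedForward.intermediateDense.weight": "model.layers.{i}.mlp.gate_proj.weight",
    "decoderLayer.{i}.feedForward.outputDense.weight": "model.layers.{i}.mlp.down_proj.weight",
    "decoderLayer.{i}.ffnLayerNorm.weight": "model.layers.{i}.ln2.weight",
    "decoderLayer.{i}.feedForward.intermediateDense2.weight": "model.layers.{i}.mlp.up_proj.weight",
}

def build_mapping(num_hidden_layers):
    # Global (non-repeated) entries, then one block of nine pairs per layer.
    mapping = {
        "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
        "LayerNormFinal.weight": "model.norm.weight",
        "lm_head.weight": "lm_head.weight",
    }
    for i in range(num_hidden_layers):
        for src, dst in PER_LAYER.items():
            mapping[src.format(i=i)] = dst.format(i=i)
    return mapping

# Example: 48 hidden layers, matching num_hidden_layers in the 9B configs.
print(json.dumps(build_mapping(48), indent=4))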
Yi-1.5-9B/bert4torch_config.json ADDED
@@ -0,0 +1,460 @@
+ {
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "hidden_act": "silu",
+ "hidden_size": 4096,
+ "initializer_range": 0.02,
+ "intermediate_size": 11008,
+ "max_position_embeddings": 4096,
+ "model": "llama",
+ "num_attention_heads": 32,
+ "num_hidden_layers": 48,
+ "num_key_value_heads": 4,
+ "pad_token_id": 0,
+ "layer_norm_eps": 1e-06,
+ "rope_theta": 5000000.0,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 64000,
+ "skip_init": true,
+ "rope_rank": "updown",
+ "segment_vocab_size": 0,
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
+ "mapping": {
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
+ "LayerNormFinal.weight": "model.norm.weight",
+ "lm_head.weight": "lm_head.weight",
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
+ "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
+ "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
+ "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
+ "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
+ "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
+ "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
+ "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
+ "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
+ "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
+ "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
+ "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
+ "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
+ "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
+ "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
+ "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
+ "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
+ "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
+ "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
+ "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
+ "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
+ "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
+ "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
+ "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
+ "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
+ "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
+ "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
+ "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
+ "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
+ "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
+ "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
+ "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
+ "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
+ "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
+ "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
+ "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
+ "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
+ "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
+ "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
+ "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
+ "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
+ "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
+ "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
+ "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
+ "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
+ "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
+ "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
+ "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
+ "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
+ "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
+ "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
+ "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
+ "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
+ "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
+ "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
+ "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
+ "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
+ "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
+ "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
+ "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
+ "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
+ "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
+ "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
+ "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
+ "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
+ "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
+ "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
+ "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
+ "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
+ "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
+ "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
+ "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
+ "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
+ "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
+ "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
+ "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
+ "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
+ "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
+ "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
+ "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
+ "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
+ "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
+ "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
+ "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
+ "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
+ "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
+ "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
+ "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
+ "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
+ "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
+ "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
+ "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
+ "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
+ "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
+ "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
+ "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
+ "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
+ "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
+ "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
+ "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
+ "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
+ "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
+ "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
+ "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
+ "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
+ "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
+ "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
+ "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
+ "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
+ "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
+ "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
+ "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
+ "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
+ "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
+ "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
+ "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
+ "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
+ "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
+ "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
+ "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
+ "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
+ "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
+ "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
+ "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
+ "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
+ "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
+ "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
+ "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
+ "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
+ "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
+ "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
+ "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
+ "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
+ "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
+ "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
+ "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
+ "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
+ "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
+ "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
+ "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
+ "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
+ "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
+ "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
+ "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
+ "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
+ }
460
+ }
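
For reference, each entry in the `mapping` table above pairs a bert4torch parameter name (key) with the corresponding tensor name in the Hugging Face checkpoint (value). Below is a minimal sketch of how such a table could be applied when converting a checkpoint by hand; the file paths and the `model` object are hypothetical placeholders, and bert4torch's own loader performs this renaming internally, so this is illustration rather than the library's actual API:

```python
import json
import torch

# Hypothetical paths, for illustration only.
with open("bert4torch_config.json") as f:
    mapping = json.load(f)["mapping"]

# Load the Hugging Face checkpoint's state dict onto CPU.
hf_state = torch.load("pytorch_model.bin", map_location="cpu")

# Rename HF tensor names (mapping values) to bert4torch parameter names (mapping keys).
b4t_state = {
    b4t_name: hf_state[hf_name]
    for b4t_name, hf_name in mapping.items()
    if hf_name in hf_state
}

# model.load_state_dict(b4t_state)  # assuming `model` is the matching bert4torch decoder
```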