Tongjilibo committed on
Commit
e131816
1 Parent(s): 85a13bf

修改yi1.5

Browse files
Yi-1.5-6B-Chat/bert4torch_config.json CHANGED
@@ -7,6 +7,7 @@
7
  "intermediate_size": 11008,
8
  "max_position_embeddings": 4096,
9
  "model": "llama",
 
10
  "num_attention_heads": 32,
11
  "num_hidden_layers": 32,
12
  "num_key_value_heads": 4,
@@ -19,298 +20,5 @@
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
315
- }
316
  }
 
7
  "intermediate_size": 11008,
8
  "max_position_embeddings": 4096,
9
  "model": "llama",
10
+ "template": "llama3",
11
  "num_attention_heads": 32,
12
  "num_hidden_layers": 32,
13
  "num_key_value_heads": 4,
 
20
  "skip_init": true,
21
  "rope_rank": "updown",
22
  "segment_vocab_size": 0,
23
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 7}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
Yi-1.5-6B/bert4torch_config.json CHANGED
@@ -19,298 +19,5 @@
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
315
- }
316
  }
 
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
Yi-1.5-9B-32K/bert4torch_config.json CHANGED
@@ -19,442 +19,5 @@
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 32768, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
- "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
- "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
- "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
- "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
- "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
- "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
- "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
- "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
- "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
- "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
- "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
- "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
- "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
- "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
- "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
- "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
- "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
- "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
- "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
- "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
- "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
- "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
- "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
- "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
- "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
- "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
- "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
- "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
- "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
- "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
- "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
- "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
- "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
- "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
- "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
- "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
- "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
- "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
- "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
- "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
- "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
- "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
- "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
- "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
- "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
- "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
- "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
- "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
- "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
- "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
- "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
- "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
- "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
- "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
- "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
- "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
- "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
- "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
- "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
- "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
- "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
- "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
- "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
- "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
- "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
- "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
- "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
- "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
- "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
- "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
- "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
- "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
- "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
- "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
- "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
- "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
- "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
- "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
- "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
- "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
- "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
- "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
- "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
- "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
- "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
- "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
- "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
- "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
- "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
- "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
- "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
- "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
- "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
- "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
- "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
- "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
- "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
- "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
- "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
- "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
- "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
- "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
- "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
- "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
- "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
- "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
- "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
- "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
- "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
- "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
- "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
- "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
- "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
- "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
- "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
- "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
- "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
- "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
- "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
- "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
- "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
- "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
- "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
- "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
- "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
- "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
- "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
- "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
- "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
- "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
- "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
- "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
- "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
- "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
- "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
- "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
- "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
- "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
- "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
- "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
- "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
- "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
- "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
- "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
- }
460
  }
 
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 32768, "eos_token_id": 2}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
Yi-1.5-9B-Chat-16K/bert4torch_config.json CHANGED
@@ -7,6 +7,7 @@
7
  "intermediate_size": 11008,
8
  "max_position_embeddings": 16384,
9
  "model": "llama",
 
10
  "num_attention_heads": 32,
11
  "num_hidden_layers": 48,
12
  "num_key_value_heads": 4,
@@ -19,442 +20,6 @@
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 16384, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
- "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
- "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
- "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
- "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
- "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
- "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
- "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
- "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
- "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
- "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
- "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
- "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
- "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
- "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
- "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
- "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
- "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
- "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
- "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
- "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
- "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
- "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
- "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
- "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
- "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
- "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
- "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
- "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
- "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
- "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
- "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
- "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
- "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
- "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
- "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
- "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
- "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
- "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
- "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
- "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
- "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
- "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
- "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
- "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
- "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
- "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
- "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
- "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
- "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
- "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
- "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
- "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
- "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
- "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
- "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
- "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
- "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
- "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
- "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
- "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
- "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
- "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
- "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
- "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
- "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
- "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
- "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
- "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
- "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
- "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
- "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
- "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
- "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
- "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
- "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
- "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
- "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
- "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
- "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
- "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
- "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
- "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
- "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
- "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
- "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
- "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
- "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
- "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
- "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
- "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
- "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
- "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
- "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
- "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
- "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
- "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
- "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
- "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
- "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
- "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
- "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
- "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
- "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
- "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
- "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
- "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
- "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
- "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
- "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
- "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
- "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
- "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
- "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
- "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
- "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
- "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
- "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
- "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
- "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
- "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
- "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
- "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
- "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
- "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
- "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
- "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
- "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
- "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
- "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
- "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
- "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
- "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
- "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
- "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
- "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
- "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
- "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
- "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
- "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
- "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
- "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
- "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
- "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
- "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
  }
460
- }
 
7
  "intermediate_size": 11008,
8
  "max_position_embeddings": 16384,
9
  "model": "llama",
10
+ "template": "llama3",
11
  "num_attention_heads": 32,
12
  "num_hidden_layers": 48,
13
  "num_key_value_heads": 4,
 
20
  "skip_init": true,
21
  "rope_rank": "updown",
22
  "segment_vocab_size": 0,
23
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 16384, "eos_token_id": 7}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
+
Yi-1.5-9B-Chat/bert4torch_config.json CHANGED
@@ -7,6 +7,7 @@
7
  "intermediate_size": 11008,
8
  "max_position_embeddings": 4096,
9
  "model": "llama",
 
10
  "num_attention_heads": 32,
11
  "num_hidden_layers": 48,
12
  "num_key_value_heads": 4,
@@ -19,442 +20,5 @@
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
- "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
- "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
- "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
- "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
- "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
- "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
- "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
- "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
- "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
- "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
- "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
- "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
- "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
- "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
- "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
- "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
- "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
- "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
- "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
- "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
- "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
- "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
- "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
- "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
- "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
- "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
- "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
- "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
- "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
- "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
- "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
- "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
- "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
- "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
- "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
- "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
- "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
- "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
- "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
- "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
- "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
- "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
- "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
- "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
- "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
- "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
- "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
- "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
- "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
- "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
- "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
- "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
- "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
- "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
- "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
- "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
- "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
- "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
- "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
- "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
- "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
- "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
- "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
- "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
- "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
- "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
- "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
- "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
- "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
- "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
- "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
- "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
- "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
- "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
- "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
- "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
- "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
- "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
- "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
- "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
- "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
- "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
- "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
- "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
- "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
- "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
- "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
- "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
- "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
- "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
- "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
- "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
- "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
- "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
- "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
- "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
- "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
- "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
- "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
- "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
- "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
- "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
- "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
- "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
- "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
- "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
- "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
- "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
- "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
- "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
- "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
- "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
- "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
- "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
- "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
- "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
- "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
- "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
- "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
- "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
- "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
- "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
- "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
- "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
- "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
- "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
- "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
- "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
- "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
- "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
- "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
- "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
- "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
- "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
- "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
- "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
- "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
- "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
- "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
- "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
- "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
- "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
- "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
- "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
- }
460
  }
 
7
  "intermediate_size": 11008,
8
  "max_position_embeddings": 4096,
9
  "model": "llama",
10
+ "template": "llama3",
11
  "num_attention_heads": 32,
12
  "num_hidden_layers": 48,
13
  "num_key_value_heads": 4,
 
20
  "skip_init": true,
21
  "rope_rank": "updown",
22
  "segment_vocab_size": 0,
23
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 7}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
Yi-1.5-9B/bert4torch_config.json CHANGED
@@ -19,442 +19,5 @@
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
- "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2},
23
- "mapping": {
24
- "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
25
- "LayerNormFinal.weight": "model.norm.weight",
26
- "lm_head.weight": "lm_head.weight",
27
- "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
28
- "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
29
- "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
30
- "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
31
- "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
32
- "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
33
- "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
34
- "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
35
- "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
36
- "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
37
- "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
38
- "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
39
- "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
40
- "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
41
- "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
42
- "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
43
- "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
44
- "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
45
- "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
46
- "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
47
- "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
48
- "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
49
- "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
50
- "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
51
- "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
52
- "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
53
- "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
54
- "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
55
- "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
56
- "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
57
- "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
58
- "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
59
- "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
60
- "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
61
- "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
62
- "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
63
- "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
64
- "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
65
- "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
66
- "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
67
- "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
68
- "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
69
- "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
70
- "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
71
- "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
72
- "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
73
- "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
74
- "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
75
- "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
76
- "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
77
- "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
78
- "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
79
- "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
80
- "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
81
- "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
82
- "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
83
- "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
84
- "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
85
- "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
86
- "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
87
- "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
88
- "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
89
- "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
90
- "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
91
- "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
92
- "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
93
- "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
94
- "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
95
- "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
96
- "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
97
- "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
98
- "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
99
- "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
100
- "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
101
- "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
102
- "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
103
- "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
104
- "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
105
- "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
106
- "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
107
- "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
108
- "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
109
- "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
110
- "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
111
- "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
112
- "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
113
- "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
114
- "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
115
- "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
116
- "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
117
- "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
118
- "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
119
- "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
120
- "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
121
- "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
122
- "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
123
- "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
124
- "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
125
- "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
126
- "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
127
- "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
128
- "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
129
- "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
130
- "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
131
- "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
132
- "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
133
- "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
134
- "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
135
- "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
136
- "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
137
- "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
138
- "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
139
- "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
140
- "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
141
- "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
142
- "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
143
- "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
144
- "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
145
- "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
146
- "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
147
- "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
148
- "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
149
- "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
150
- "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
151
- "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
152
- "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
153
- "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
154
- "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
155
- "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
156
- "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
157
- "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
158
- "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
159
- "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
160
- "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
161
- "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
162
- "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
163
- "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
164
- "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
165
- "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
166
- "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
167
- "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
168
- "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
169
- "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
170
- "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
171
- "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
172
- "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
173
- "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
174
- "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
175
- "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
176
- "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
177
- "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
178
- "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
179
- "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
180
- "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
181
- "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
182
- "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
183
- "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
184
- "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
185
- "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
186
- "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
187
- "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
188
- "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
189
- "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
190
- "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
191
- "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
192
- "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
193
- "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
194
- "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
195
- "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
196
- "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
197
- "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
198
- "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
199
- "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
200
- "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
201
- "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
202
- "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
203
- "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
204
- "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
205
- "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
206
- "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
207
- "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
208
- "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
209
- "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
210
- "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
211
- "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
212
- "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
213
- "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
214
- "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
215
- "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
216
- "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
217
- "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
218
- "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
219
- "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
220
- "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
221
- "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
222
- "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
223
- "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
224
- "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
225
- "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
226
- "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
227
- "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
228
- "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
229
- "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
230
- "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
231
- "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
232
- "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
233
- "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
234
- "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
235
- "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
236
- "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
237
- "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
238
- "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
239
- "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
240
- "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
241
- "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
242
- "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
243
- "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
244
- "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
245
- "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
246
- "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
247
- "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
248
- "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
249
- "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
250
- "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
251
- "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
252
- "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
253
- "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
254
- "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
255
- "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
256
- "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
257
- "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
258
- "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
259
- "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
260
- "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
261
- "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
262
- "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
263
- "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
264
- "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
265
- "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
266
- "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
267
- "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
268
- "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
269
- "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
270
- "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
271
- "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
272
- "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
273
- "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
274
- "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
275
- "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
276
- "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
277
- "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
278
- "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
279
- "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
280
- "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
281
- "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
282
- "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
283
- "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
284
- "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
285
- "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
286
- "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
287
- "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
288
- "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
289
- "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
290
- "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
291
- "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
292
- "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
293
- "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
294
- "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
295
- "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
296
- "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
297
- "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
298
- "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
299
- "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
300
- "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
301
- "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
302
- "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
303
- "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
304
- "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
305
- "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
306
- "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
307
- "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
308
- "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
309
- "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
310
- "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
311
- "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
312
- "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
313
- "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
314
- "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight",
315
- "decoderLayer.32.multiHeadAttention.q.weight": "model.layers.32.self_attn.q_proj.weight",
316
- "decoderLayer.32.multiHeadAttention.k.weight": "model.layers.32.self_attn.k_proj.weight",
317
- "decoderLayer.32.multiHeadAttention.v.weight": "model.layers.32.self_attn.v_proj.weight",
318
- "decoderLayer.32.multiHeadAttention.o.weight": "model.layers.32.self_attn.o_proj.weight",
319
- "decoderLayer.32.attnLayerNorm.weight": "model.layers.32.ln1.weight",
320
- "decoderLayer.32.feedForward.intermediateDense.weight": "model.layers.32.mlp.gate_proj.weight",
321
- "decoderLayer.32.feedForward.outputDense.weight": "model.layers.32.mlp.down_proj.weight",
322
- "decoderLayer.32.ffnLayerNorm.weight": "model.layers.32.ln2.weight",
323
- "decoderLayer.32.feedForward.intermediateDense2.weight": "model.layers.32.mlp.up_proj.weight",
324
- "decoderLayer.33.multiHeadAttention.q.weight": "model.layers.33.self_attn.q_proj.weight",
325
- "decoderLayer.33.multiHeadAttention.k.weight": "model.layers.33.self_attn.k_proj.weight",
326
- "decoderLayer.33.multiHeadAttention.v.weight": "model.layers.33.self_attn.v_proj.weight",
327
- "decoderLayer.33.multiHeadAttention.o.weight": "model.layers.33.self_attn.o_proj.weight",
328
- "decoderLayer.33.attnLayerNorm.weight": "model.layers.33.ln1.weight",
329
- "decoderLayer.33.feedForward.intermediateDense.weight": "model.layers.33.mlp.gate_proj.weight",
330
- "decoderLayer.33.feedForward.outputDense.weight": "model.layers.33.mlp.down_proj.weight",
331
- "decoderLayer.33.ffnLayerNorm.weight": "model.layers.33.ln2.weight",
332
- "decoderLayer.33.feedForward.intermediateDense2.weight": "model.layers.33.mlp.up_proj.weight",
333
- "decoderLayer.34.multiHeadAttention.q.weight": "model.layers.34.self_attn.q_proj.weight",
334
- "decoderLayer.34.multiHeadAttention.k.weight": "model.layers.34.self_attn.k_proj.weight",
335
- "decoderLayer.34.multiHeadAttention.v.weight": "model.layers.34.self_attn.v_proj.weight",
336
- "decoderLayer.34.multiHeadAttention.o.weight": "model.layers.34.self_attn.o_proj.weight",
337
- "decoderLayer.34.attnLayerNorm.weight": "model.layers.34.ln1.weight",
338
- "decoderLayer.34.feedForward.intermediateDense.weight": "model.layers.34.mlp.gate_proj.weight",
339
- "decoderLayer.34.feedForward.outputDense.weight": "model.layers.34.mlp.down_proj.weight",
340
- "decoderLayer.34.ffnLayerNorm.weight": "model.layers.34.ln2.weight",
341
- "decoderLayer.34.feedForward.intermediateDense2.weight": "model.layers.34.mlp.up_proj.weight",
342
- "decoderLayer.35.multiHeadAttention.q.weight": "model.layers.35.self_attn.q_proj.weight",
343
- "decoderLayer.35.multiHeadAttention.k.weight": "model.layers.35.self_attn.k_proj.weight",
344
- "decoderLayer.35.multiHeadAttention.v.weight": "model.layers.35.self_attn.v_proj.weight",
345
- "decoderLayer.35.multiHeadAttention.o.weight": "model.layers.35.self_attn.o_proj.weight",
346
- "decoderLayer.35.attnLayerNorm.weight": "model.layers.35.ln1.weight",
347
- "decoderLayer.35.feedForward.intermediateDense.weight": "model.layers.35.mlp.gate_proj.weight",
348
- "decoderLayer.35.feedForward.outputDense.weight": "model.layers.35.mlp.down_proj.weight",
349
- "decoderLayer.35.ffnLayerNorm.weight": "model.layers.35.ln2.weight",
350
- "decoderLayer.35.feedForward.intermediateDense2.weight": "model.layers.35.mlp.up_proj.weight",
351
- "decoderLayer.36.multiHeadAttention.q.weight": "model.layers.36.self_attn.q_proj.weight",
352
- "decoderLayer.36.multiHeadAttention.k.weight": "model.layers.36.self_attn.k_proj.weight",
353
- "decoderLayer.36.multiHeadAttention.v.weight": "model.layers.36.self_attn.v_proj.weight",
354
- "decoderLayer.36.multiHeadAttention.o.weight": "model.layers.36.self_attn.o_proj.weight",
355
- "decoderLayer.36.attnLayerNorm.weight": "model.layers.36.ln1.weight",
356
- "decoderLayer.36.feedForward.intermediateDense.weight": "model.layers.36.mlp.gate_proj.weight",
357
- "decoderLayer.36.feedForward.outputDense.weight": "model.layers.36.mlp.down_proj.weight",
358
- "decoderLayer.36.ffnLayerNorm.weight": "model.layers.36.ln2.weight",
359
- "decoderLayer.36.feedForward.intermediateDense2.weight": "model.layers.36.mlp.up_proj.weight",
360
- "decoderLayer.37.multiHeadAttention.q.weight": "model.layers.37.self_attn.q_proj.weight",
361
- "decoderLayer.37.multiHeadAttention.k.weight": "model.layers.37.self_attn.k_proj.weight",
362
- "decoderLayer.37.multiHeadAttention.v.weight": "model.layers.37.self_attn.v_proj.weight",
363
- "decoderLayer.37.multiHeadAttention.o.weight": "model.layers.37.self_attn.o_proj.weight",
364
- "decoderLayer.37.attnLayerNorm.weight": "model.layers.37.ln1.weight",
365
- "decoderLayer.37.feedForward.intermediateDense.weight": "model.layers.37.mlp.gate_proj.weight",
366
- "decoderLayer.37.feedForward.outputDense.weight": "model.layers.37.mlp.down_proj.weight",
367
- "decoderLayer.37.ffnLayerNorm.weight": "model.layers.37.ln2.weight",
368
- "decoderLayer.37.feedForward.intermediateDense2.weight": "model.layers.37.mlp.up_proj.weight",
369
- "decoderLayer.38.multiHeadAttention.q.weight": "model.layers.38.self_attn.q_proj.weight",
370
- "decoderLayer.38.multiHeadAttention.k.weight": "model.layers.38.self_attn.k_proj.weight",
371
- "decoderLayer.38.multiHeadAttention.v.weight": "model.layers.38.self_attn.v_proj.weight",
372
- "decoderLayer.38.multiHeadAttention.o.weight": "model.layers.38.self_attn.o_proj.weight",
373
- "decoderLayer.38.attnLayerNorm.weight": "model.layers.38.ln1.weight",
374
- "decoderLayer.38.feedForward.intermediateDense.weight": "model.layers.38.mlp.gate_proj.weight",
375
- "decoderLayer.38.feedForward.outputDense.weight": "model.layers.38.mlp.down_proj.weight",
376
- "decoderLayer.38.ffnLayerNorm.weight": "model.layers.38.ln2.weight",
377
- "decoderLayer.38.feedForward.intermediateDense2.weight": "model.layers.38.mlp.up_proj.weight",
378
- "decoderLayer.39.multiHeadAttention.q.weight": "model.layers.39.self_attn.q_proj.weight",
379
- "decoderLayer.39.multiHeadAttention.k.weight": "model.layers.39.self_attn.k_proj.weight",
380
- "decoderLayer.39.multiHeadAttention.v.weight": "model.layers.39.self_attn.v_proj.weight",
381
- "decoderLayer.39.multiHeadAttention.o.weight": "model.layers.39.self_attn.o_proj.weight",
382
- "decoderLayer.39.attnLayerNorm.weight": "model.layers.39.ln1.weight",
383
- "decoderLayer.39.feedForward.intermediateDense.weight": "model.layers.39.mlp.gate_proj.weight",
384
- "decoderLayer.39.feedForward.outputDense.weight": "model.layers.39.mlp.down_proj.weight",
385
- "decoderLayer.39.ffnLayerNorm.weight": "model.layers.39.ln2.weight",
386
- "decoderLayer.39.feedForward.intermediateDense2.weight": "model.layers.39.mlp.up_proj.weight",
387
- "decoderLayer.40.multiHeadAttention.q.weight": "model.layers.40.self_attn.q_proj.weight",
388
- "decoderLayer.40.multiHeadAttention.k.weight": "model.layers.40.self_attn.k_proj.weight",
389
- "decoderLayer.40.multiHeadAttention.v.weight": "model.layers.40.self_attn.v_proj.weight",
390
- "decoderLayer.40.multiHeadAttention.o.weight": "model.layers.40.self_attn.o_proj.weight",
391
- "decoderLayer.40.attnLayerNorm.weight": "model.layers.40.ln1.weight",
392
- "decoderLayer.40.feedForward.intermediateDense.weight": "model.layers.40.mlp.gate_proj.weight",
393
- "decoderLayer.40.feedForward.outputDense.weight": "model.layers.40.mlp.down_proj.weight",
394
- "decoderLayer.40.ffnLayerNorm.weight": "model.layers.40.ln2.weight",
395
- "decoderLayer.40.feedForward.intermediateDense2.weight": "model.layers.40.mlp.up_proj.weight",
396
- "decoderLayer.41.multiHeadAttention.q.weight": "model.layers.41.self_attn.q_proj.weight",
397
- "decoderLayer.41.multiHeadAttention.k.weight": "model.layers.41.self_attn.k_proj.weight",
398
- "decoderLayer.41.multiHeadAttention.v.weight": "model.layers.41.self_attn.v_proj.weight",
399
- "decoderLayer.41.multiHeadAttention.o.weight": "model.layers.41.self_attn.o_proj.weight",
400
- "decoderLayer.41.attnLayerNorm.weight": "model.layers.41.ln1.weight",
401
- "decoderLayer.41.feedForward.intermediateDense.weight": "model.layers.41.mlp.gate_proj.weight",
402
- "decoderLayer.41.feedForward.outputDense.weight": "model.layers.41.mlp.down_proj.weight",
403
- "decoderLayer.41.ffnLayerNorm.weight": "model.layers.41.ln2.weight",
404
- "decoderLayer.41.feedForward.intermediateDense2.weight": "model.layers.41.mlp.up_proj.weight",
405
- "decoderLayer.42.multiHeadAttention.q.weight": "model.layers.42.self_attn.q_proj.weight",
406
- "decoderLayer.42.multiHeadAttention.k.weight": "model.layers.42.self_attn.k_proj.weight",
407
- "decoderLayer.42.multiHeadAttention.v.weight": "model.layers.42.self_attn.v_proj.weight",
408
- "decoderLayer.42.multiHeadAttention.o.weight": "model.layers.42.self_attn.o_proj.weight",
409
- "decoderLayer.42.attnLayerNorm.weight": "model.layers.42.ln1.weight",
410
- "decoderLayer.42.feedForward.intermediateDense.weight": "model.layers.42.mlp.gate_proj.weight",
411
- "decoderLayer.42.feedForward.outputDense.weight": "model.layers.42.mlp.down_proj.weight",
412
- "decoderLayer.42.ffnLayerNorm.weight": "model.layers.42.ln2.weight",
413
- "decoderLayer.42.feedForward.intermediateDense2.weight": "model.layers.42.mlp.up_proj.weight",
414
- "decoderLayer.43.multiHeadAttention.q.weight": "model.layers.43.self_attn.q_proj.weight",
415
- "decoderLayer.43.multiHeadAttention.k.weight": "model.layers.43.self_attn.k_proj.weight",
416
- "decoderLayer.43.multiHeadAttention.v.weight": "model.layers.43.self_attn.v_proj.weight",
417
- "decoderLayer.43.multiHeadAttention.o.weight": "model.layers.43.self_attn.o_proj.weight",
418
- "decoderLayer.43.attnLayerNorm.weight": "model.layers.43.ln1.weight",
419
- "decoderLayer.43.feedForward.intermediateDense.weight": "model.layers.43.mlp.gate_proj.weight",
420
- "decoderLayer.43.feedForward.outputDense.weight": "model.layers.43.mlp.down_proj.weight",
421
- "decoderLayer.43.ffnLayerNorm.weight": "model.layers.43.ln2.weight",
422
- "decoderLayer.43.feedForward.intermediateDense2.weight": "model.layers.43.mlp.up_proj.weight",
423
- "decoderLayer.44.multiHeadAttention.q.weight": "model.layers.44.self_attn.q_proj.weight",
424
- "decoderLayer.44.multiHeadAttention.k.weight": "model.layers.44.self_attn.k_proj.weight",
425
- "decoderLayer.44.multiHeadAttention.v.weight": "model.layers.44.self_attn.v_proj.weight",
426
- "decoderLayer.44.multiHeadAttention.o.weight": "model.layers.44.self_attn.o_proj.weight",
427
- "decoderLayer.44.attnLayerNorm.weight": "model.layers.44.ln1.weight",
428
- "decoderLayer.44.feedForward.intermediateDense.weight": "model.layers.44.mlp.gate_proj.weight",
429
- "decoderLayer.44.feedForward.outputDense.weight": "model.layers.44.mlp.down_proj.weight",
430
- "decoderLayer.44.ffnLayerNorm.weight": "model.layers.44.ln2.weight",
431
- "decoderLayer.44.feedForward.intermediateDense2.weight": "model.layers.44.mlp.up_proj.weight",
432
- "decoderLayer.45.multiHeadAttention.q.weight": "model.layers.45.self_attn.q_proj.weight",
433
- "decoderLayer.45.multiHeadAttention.k.weight": "model.layers.45.self_attn.k_proj.weight",
434
- "decoderLayer.45.multiHeadAttention.v.weight": "model.layers.45.self_attn.v_proj.weight",
435
- "decoderLayer.45.multiHeadAttention.o.weight": "model.layers.45.self_attn.o_proj.weight",
436
- "decoderLayer.45.attnLayerNorm.weight": "model.layers.45.ln1.weight",
437
- "decoderLayer.45.feedForward.intermediateDense.weight": "model.layers.45.mlp.gate_proj.weight",
438
- "decoderLayer.45.feedForward.outputDense.weight": "model.layers.45.mlp.down_proj.weight",
439
- "decoderLayer.45.ffnLayerNorm.weight": "model.layers.45.ln2.weight",
440
- "decoderLayer.45.feedForward.intermediateDense2.weight": "model.layers.45.mlp.up_proj.weight",
441
- "decoderLayer.46.multiHeadAttention.q.weight": "model.layers.46.self_attn.q_proj.weight",
442
- "decoderLayer.46.multiHeadAttention.k.weight": "model.layers.46.self_attn.k_proj.weight",
443
- "decoderLayer.46.multiHeadAttention.v.weight": "model.layers.46.self_attn.v_proj.weight",
444
- "decoderLayer.46.multiHeadAttention.o.weight": "model.layers.46.self_attn.o_proj.weight",
445
- "decoderLayer.46.attnLayerNorm.weight": "model.layers.46.ln1.weight",
446
- "decoderLayer.46.feedForward.intermediateDense.weight": "model.layers.46.mlp.gate_proj.weight",
447
- "decoderLayer.46.feedForward.outputDense.weight": "model.layers.46.mlp.down_proj.weight",
448
- "decoderLayer.46.ffnLayerNorm.weight": "model.layers.46.ln2.weight",
449
- "decoderLayer.46.feedForward.intermediateDense2.weight": "model.layers.46.mlp.up_proj.weight",
450
- "decoderLayer.47.multiHeadAttention.q.weight": "model.layers.47.self_attn.q_proj.weight",
451
- "decoderLayer.47.multiHeadAttention.k.weight": "model.layers.47.self_attn.k_proj.weight",
452
- "decoderLayer.47.multiHeadAttention.v.weight": "model.layers.47.self_attn.v_proj.weight",
453
- "decoderLayer.47.multiHeadAttention.o.weight": "model.layers.47.self_attn.o_proj.weight",
454
- "decoderLayer.47.attnLayerNorm.weight": "model.layers.47.ln1.weight",
455
- "decoderLayer.47.feedForward.intermediateDense.weight": "model.layers.47.mlp.gate_proj.weight",
456
- "decoderLayer.47.feedForward.outputDense.weight": "model.layers.47.mlp.down_proj.weight",
457
- "decoderLayer.47.ffnLayerNorm.weight": "model.layers.47.ln2.weight",
458
- "decoderLayer.47.feedForward.intermediateDense2.weight": "model.layers.47.mlp.up_proj.weight"
459
- }
460
  }
 
19
  "skip_init": true,
20
  "rope_rank": "updown",
21
  "segment_vocab_size": 0,
22
+ "generation_config": {"tokenizer_decode_config": {"skip_special_tokens": true}, "max_length": 4096, "eos_token_id": 2}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }