NovelHack-ja
committed on
Commit
•
c9aed4a
1
Parent(s):
24686a5
Upload folder using huggingface_hub
Browse files
- README.md +757 -0
- best_config.yaml +728 -0
- config.json +26 -0
- mergekit_config.yml +728 -0
- model-00001-of-00001.safetensors +3 -0
- model.safetensors.index.json +1 -0
- special_tokens_map.json +23 -0
- tokenizer.json +0 -0
- tokenizer_config.json +44 -0
README.md
ADDED
@@ -0,0 +1,757 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: []
|
3 |
+
library_name: transformers
|
4 |
+
tags:
|
5 |
+
- mergekit
|
6 |
+
- merge
|
7 |
+
|
8 |
+
---
|
9 |
+
# final_model
|
10 |
+
|
11 |
+
This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
|
12 |
+
|
13 |
+
## Merge Details
|
14 |
+
### Merge Method
|
15 |
+
|
16 |
+
This model was merged using the breadcrumbs_ties merge method, with ./Yosegi-0603 as the base model.
|
17 |
+
|
18 |
+
### Models Merged
|
19 |
+
|
20 |
+
The following models were included in the merge:
|
21 |
+
* ./Ninja-2B_JP
|
22 |
+
* ./Yosegi-0601
|
23 |
+
|
24 |
+
### Configuration
|
25 |
+
|
26 |
+
The following YAML configuration was used to produce this model:
|
27 |
+
|
28 |
+
```yaml
|
29 |
+
base_model: ./Yosegi-0603
|
30 |
+
dtype: bfloat16
|
31 |
+
merge_method: breadcrumbs_ties
|
32 |
+
parameters:
|
33 |
+
int8_mask: 1.0
|
34 |
+
normalize: 0.0
|
35 |
+
slices:
|
36 |
+
- sources:
|
37 |
+
- layer_range: [0, 2]
|
38 |
+
model: ./Yosegi-0601
|
39 |
+
parameters:
|
40 |
+
density:
|
41 |
+
- filter: self_attn
|
42 |
+
value: 1.0
|
43 |
+
- filter: mlp
|
44 |
+
value: 1.0
|
45 |
+
- value: 1.0
|
46 |
+
gamma:
|
47 |
+
- filter: self_attn
|
48 |
+
value: -0.050387850856855765
|
49 |
+
- filter: mlp
|
50 |
+
value: -0.17075015661203768
|
51 |
+
- value: -0.008041653902986862
|
52 |
+
weight:
|
53 |
+
- filter: self_attn
|
54 |
+
value: 0.0999312941470471
|
55 |
+
- filter: mlp
|
56 |
+
value: 0.541727762184749
|
57 |
+
- value: 0.6837012779994258
|
58 |
+
- layer_range: [0, 2]
|
59 |
+
model: ./Ninja-2B_JP
|
60 |
+
parameters:
|
61 |
+
density:
|
62 |
+
- filter: self_attn
|
63 |
+
value: 0.8218846237599902
|
64 |
+
- filter: mlp
|
65 |
+
value: 1.0
|
66 |
+
- value: 0.9254078866667358
|
67 |
+
gamma:
|
68 |
+
- filter: self_attn
|
69 |
+
value: -0.11213758231875963
|
70 |
+
- filter: mlp
|
71 |
+
value: 0.021586098873668948
|
72 |
+
- value: -0.12827998218659437
|
73 |
+
weight:
|
74 |
+
- filter: self_attn
|
75 |
+
value: 0.40391646444657003
|
76 |
+
- filter: mlp
|
77 |
+
value: 0.623121864641881
|
78 |
+
- value: 0.5967833694632534
|
79 |
+
- layer_range: [0, 2]
|
80 |
+
model: ./Yosegi-0603
|
81 |
+
- sources:
|
82 |
+
- layer_range: [2, 4]
|
83 |
+
model: ./Yosegi-0601
|
84 |
+
parameters:
|
85 |
+
density:
|
86 |
+
- filter: self_attn
|
87 |
+
value: 0.8079479346300947
|
88 |
+
- filter: mlp
|
89 |
+
value: 1.0
|
90 |
+
- value: 0.710146185559419
|
91 |
+
gamma:
|
92 |
+
- filter: self_attn
|
93 |
+
value: 0.1383609589681566
|
94 |
+
- filter: mlp
|
95 |
+
value: 0.21188532059635062
|
96 |
+
- value: 0.2994723556443468
|
97 |
+
weight:
|
98 |
+
- filter: self_attn
|
99 |
+
value: 0.48107070906079974
|
100 |
+
- filter: mlp
|
101 |
+
value: 0.5848073552919492
|
102 |
+
- value: 0.4583842493359253
|
103 |
+
- layer_range: [2, 4]
|
104 |
+
model: ./Ninja-2B_JP
|
105 |
+
parameters:
|
106 |
+
density:
|
107 |
+
- filter: self_attn
|
108 |
+
value: 1.0
|
109 |
+
- filter: mlp
|
110 |
+
value: 0.934378153535579
|
111 |
+
- value: 1.0
|
112 |
+
gamma:
|
113 |
+
- filter: self_attn
|
114 |
+
value: 0.073192612278188
|
115 |
+
- filter: mlp
|
116 |
+
value: 0.07939126555063317
|
117 |
+
- value: -0.06891845030175699
|
118 |
+
weight:
|
119 |
+
- filter: self_attn
|
120 |
+
value: 0.32120386994101
|
121 |
+
- filter: mlp
|
122 |
+
value: 0.5001108459121922
|
123 |
+
- value: 0.9138710221666694
|
124 |
+
- layer_range: [2, 4]
|
125 |
+
model: ./Yosegi-0603
|
126 |
+
- sources:
|
127 |
+
- layer_range: [4, 6]
|
128 |
+
model: ./Yosegi-0601
|
129 |
+
parameters:
|
130 |
+
density:
|
131 |
+
- filter: self_attn
|
132 |
+
value: 1.0
|
133 |
+
- filter: mlp
|
134 |
+
value: 0.7237519222177541
|
135 |
+
- value: 0.776951124863642
|
136 |
+
gamma:
|
137 |
+
- filter: self_attn
|
138 |
+
value: -0.2265121048274062
|
139 |
+
- filter: mlp
|
140 |
+
value: -0.1757947421960496
|
141 |
+
- value: -0.11401593728931929
|
142 |
+
weight:
|
143 |
+
- filter: self_attn
|
144 |
+
value: 0.6448742737026658
|
145 |
+
- filter: mlp
|
146 |
+
value: 0.13809748641457986
|
147 |
+
- value: 0.3950550285769662
|
148 |
+
- layer_range: [4, 6]
|
149 |
+
model: ./Ninja-2B_JP
|
150 |
+
parameters:
|
151 |
+
density:
|
152 |
+
- filter: self_attn
|
153 |
+
value: 0.9649359194114893
|
154 |
+
- filter: mlp
|
155 |
+
value: 0.916637032428399
|
156 |
+
- value: 1.0
|
157 |
+
gamma:
|
158 |
+
- filter: self_attn
|
159 |
+
value: -0.16291684846287688
|
160 |
+
- filter: mlp
|
161 |
+
value: -0.19013548712121703
|
162 |
+
- value: 0.038409066391918795
|
163 |
+
weight:
|
164 |
+
- filter: self_attn
|
165 |
+
value: 0.1977358472772336
|
166 |
+
- filter: mlp
|
167 |
+
value: 0.22661167907612348
|
168 |
+
- value: 0.6426575016448257
|
169 |
+
- layer_range: [4, 6]
|
170 |
+
model: ./Yosegi-0603
|
171 |
+
- sources:
|
172 |
+
- layer_range: [6, 8]
|
173 |
+
model: ./Yosegi-0601
|
174 |
+
parameters:
|
175 |
+
density:
|
176 |
+
- filter: self_attn
|
177 |
+
value: 0.8727809666891416
|
178 |
+
- filter: mlp
|
179 |
+
value: 1.0
|
180 |
+
- value: 0.5160677785559116
|
181 |
+
gamma:
|
182 |
+
- filter: self_attn
|
183 |
+
value: 0.14245180617134273
|
184 |
+
- filter: mlp
|
185 |
+
value: 0.08189992601998919
|
186 |
+
- value: -0.1038827997670827
|
187 |
+
weight:
|
188 |
+
- filter: self_attn
|
189 |
+
value: 0.23575676914257698
|
190 |
+
- filter: mlp
|
191 |
+
value: 0.4047231670507743
|
192 |
+
- value: 0.34207794631274374
|
193 |
+
- layer_range: [6, 8]
|
194 |
+
model: ./Ninja-2B_JP
|
195 |
+
parameters:
|
196 |
+
density:
|
197 |
+
- filter: self_attn
|
198 |
+
value: 1.0
|
199 |
+
- filter: mlp
|
200 |
+
value: 1.0
|
201 |
+
- value: 1.0
|
202 |
+
gamma:
|
203 |
+
- filter: self_attn
|
204 |
+
value: 0.576775501046583
|
205 |
+
- filter: mlp
|
206 |
+
value: -0.046028636298718645
|
207 |
+
- value: -0.024161321403060265
|
208 |
+
weight:
|
209 |
+
- filter: self_attn
|
210 |
+
value: 0.833089842843994
|
211 |
+
- filter: mlp
|
212 |
+
value: 0.5434667434613458
|
213 |
+
- value: 0.2946693008513797
|
214 |
+
- layer_range: [6, 8]
|
215 |
+
model: ./Yosegi-0603
|
216 |
+
- sources:
|
217 |
+
- layer_range: [8, 10]
|
218 |
+
model: ./Yosegi-0601
|
219 |
+
parameters:
|
220 |
+
density:
|
221 |
+
- filter: self_attn
|
222 |
+
value: 1.0
|
223 |
+
- filter: mlp
|
224 |
+
value: 1.0
|
225 |
+
- value: 0.9930269337531187
|
226 |
+
gamma:
|
227 |
+
- filter: self_attn
|
228 |
+
value: 0.4549980941970383
|
229 |
+
- filter: mlp
|
230 |
+
value: 0.10362988739411173
|
231 |
+
- value: -0.43800391668559174
|
232 |
+
weight:
|
233 |
+
- filter: self_attn
|
234 |
+
value: 0.19663450954683193
|
235 |
+
- filter: mlp
|
236 |
+
value: 0.16783989984505265
|
237 |
+
- value: 0.7465091417598162
|
238 |
+
- layer_range: [8, 10]
|
239 |
+
model: ./Ninja-2B_JP
|
240 |
+
parameters:
|
241 |
+
density:
|
242 |
+
- filter: self_attn
|
243 |
+
value: 0.797370597380894
|
244 |
+
- filter: mlp
|
245 |
+
value: 1.0
|
246 |
+
- value: 1.0
|
247 |
+
gamma:
|
248 |
+
- filter: self_attn
|
249 |
+
value: -0.0665958634205702
|
250 |
+
- filter: mlp
|
251 |
+
value: -0.058297473060129834
|
252 |
+
- value: -0.38206760673090134
|
253 |
+
weight:
|
254 |
+
- filter: self_attn
|
255 |
+
value: 0.7015967347604024
|
256 |
+
- filter: mlp
|
257 |
+
value: 0.7733694864324641
|
258 |
+
- value: 0.7636921732342238
|
259 |
+
- layer_range: [8, 10]
|
260 |
+
model: ./Yosegi-0603
|
261 |
+
- sources:
|
262 |
+
- layer_range: [10, 12]
|
263 |
+
model: ./Yosegi-0601
|
264 |
+
parameters:
|
265 |
+
density:
|
266 |
+
- filter: self_attn
|
267 |
+
value: 0.8047576867589878
|
268 |
+
- filter: mlp
|
269 |
+
value: 0.8852533319203653
|
270 |
+
- value: 0.7707342647603538
|
271 |
+
gamma:
|
272 |
+
- filter: self_attn
|
273 |
+
value: -0.054343999574509694
|
274 |
+
- filter: mlp
|
275 |
+
value: -0.3465154355167133
|
276 |
+
- value: 0.022315854655582765
|
277 |
+
weight:
|
278 |
+
- filter: self_attn
|
279 |
+
value: 0.4396484757291151
|
280 |
+
- filter: mlp
|
281 |
+
value: 0.34318396468602314
|
282 |
+
- value: 0.8236034746664869
|
283 |
+
- layer_range: [10, 12]
|
284 |
+
model: ./Ninja-2B_JP
|
285 |
+
parameters:
|
286 |
+
density:
|
287 |
+
- filter: self_attn
|
288 |
+
value: 0.9058471193805165
|
289 |
+
- filter: mlp
|
290 |
+
value: 1.0
|
291 |
+
- value: 1.0
|
292 |
+
gamma:
|
293 |
+
- filter: self_attn
|
294 |
+
value: 0.1221058588826469
|
295 |
+
- filter: mlp
|
296 |
+
value: -0.4004985640890659
|
297 |
+
- value: 0.3219195440395816
|
298 |
+
weight:
|
299 |
+
- filter: self_attn
|
300 |
+
value: 0.3565443612269864
|
301 |
+
- filter: mlp
|
302 |
+
value: 0.2817057075232181
|
303 |
+
- value: 0.5934890337808251
|
304 |
+
- layer_range: [10, 12]
|
305 |
+
model: ./Yosegi-0603
|
306 |
+
- sources:
|
307 |
+
- layer_range: [12, 14]
|
308 |
+
model: ./Yosegi-0601
|
309 |
+
parameters:
|
310 |
+
density:
|
311 |
+
- filter: self_attn
|
312 |
+
value: 1.0
|
313 |
+
- filter: mlp
|
314 |
+
value: 1.0
|
315 |
+
- value: 1.0
|
316 |
+
gamma:
|
317 |
+
- filter: self_attn
|
318 |
+
value: -0.027897116191693133
|
319 |
+
- filter: mlp
|
320 |
+
value: -0.1765379388255607
|
321 |
+
- value: 0.09108936063176161
|
322 |
+
weight:
|
323 |
+
- filter: self_attn
|
324 |
+
value: 0.4499753137521779
|
325 |
+
- filter: mlp
|
326 |
+
value: 0.901296236087911
|
327 |
+
- value: 0.3548680126954006
|
328 |
+
- layer_range: [12, 14]
|
329 |
+
model: ./Ninja-2B_JP
|
330 |
+
parameters:
|
331 |
+
density:
|
332 |
+
- filter: self_attn
|
333 |
+
value: 0.8973815150776497
|
334 |
+
- filter: mlp
|
335 |
+
value: 0.6029953465961999
|
336 |
+
- value: 1.0
|
337 |
+
gamma:
|
338 |
+
- filter: self_attn
|
339 |
+
value: 0.10393082898402586
|
340 |
+
- filter: mlp
|
341 |
+
value: 0.15993577688878796
|
342 |
+
- value: 0.011410411917833683
|
343 |
+
weight:
|
344 |
+
- filter: self_attn
|
345 |
+
value: 0.2211644023056492
|
346 |
+
- filter: mlp
|
347 |
+
value: 0.5677387594231849
|
348 |
+
- value: 0.1316535663010981
|
349 |
+
- layer_range: [12, 14]
|
350 |
+
model: ./Yosegi-0603
|
351 |
+
- sources:
|
352 |
+
- layer_range: [14, 16]
|
353 |
+
model: ./Yosegi-0601
|
354 |
+
parameters:
|
355 |
+
density:
|
356 |
+
- filter: self_attn
|
357 |
+
value: 0.9584597245055072
|
358 |
+
- filter: mlp
|
359 |
+
value: 1.0
|
360 |
+
- value: 1.0
|
361 |
+
gamma:
|
362 |
+
- filter: self_attn
|
363 |
+
value: -0.17789727632680347
|
364 |
+
- filter: mlp
|
365 |
+
value: 0.2182263440314275
|
366 |
+
- value: 0.1449547656126498
|
367 |
+
weight:
|
368 |
+
- filter: self_attn
|
369 |
+
value: 0.4551004762874224
|
370 |
+
- filter: mlp
|
371 |
+
value: 0.9182082826762857
|
372 |
+
- value: 0.3736989395186422
|
373 |
+
- layer_range: [14, 16]
|
374 |
+
model: ./Ninja-2B_JP
|
375 |
+
parameters:
|
376 |
+
density:
|
377 |
+
- filter: self_attn
|
378 |
+
value: 0.7414465107848625
|
379 |
+
- filter: mlp
|
380 |
+
value: 1.0
|
381 |
+
- value: 0.7894887419395906
|
382 |
+
gamma:
|
383 |
+
- filter: self_attn
|
384 |
+
value: -0.07343933395880992
|
385 |
+
- filter: mlp
|
386 |
+
value: 0.250800731630588
|
387 |
+
- value: -0.2948778134297542
|
388 |
+
weight:
|
389 |
+
- filter: self_attn
|
390 |
+
value: 0.43125199001016495
|
391 |
+
- filter: mlp
|
392 |
+
value: 0.6182726353394477
|
393 |
+
- value: 0.838902157446268
|
394 |
+
- layer_range: [14, 16]
|
395 |
+
model: ./Yosegi-0603
|
396 |
+
- sources:
|
397 |
+
- layer_range: [16, 18]
|
398 |
+
model: ./Yosegi-0601
|
399 |
+
parameters:
|
400 |
+
density:
|
401 |
+
- filter: self_attn
|
402 |
+
value: 0.9474287877268394
|
403 |
+
- filter: mlp
|
404 |
+
value: 1.0
|
405 |
+
- value: 0.9613380133344519
|
406 |
+
gamma:
|
407 |
+
- filter: self_attn
|
408 |
+
value: -0.08608895546593046
|
409 |
+
- filter: mlp
|
410 |
+
value: -0.07275416053291164
|
411 |
+
- value: -0.5796137860399382
|
412 |
+
weight:
|
413 |
+
- filter: self_attn
|
414 |
+
value: 0.5593420897751296
|
415 |
+
- filter: mlp
|
416 |
+
value: 0.7339447992880666
|
417 |
+
- value: 0.5447558586689005
|
418 |
+
- layer_range: [16, 18]
|
419 |
+
model: ./Ninja-2B_JP
|
420 |
+
parameters:
|
421 |
+
density:
|
422 |
+
- filter: self_attn
|
423 |
+
value: 0.9321536960575384
|
424 |
+
- filter: mlp
|
425 |
+
value: 1.0
|
426 |
+
- value: 0.9613033408813294
|
427 |
+
gamma:
|
428 |
+
- filter: self_attn
|
429 |
+
value: 0.20610728738224296
|
430 |
+
- filter: mlp
|
431 |
+
value: 0.2002206706624053
|
432 |
+
- value: -0.45349278793293785
|
433 |
+
weight:
|
434 |
+
- filter: self_attn
|
435 |
+
value: 0.16162975594196963
|
436 |
+
- filter: mlp
|
437 |
+
value: 0.21262726992327483
|
438 |
+
- value: 0.061213622827234075
|
439 |
+
- layer_range: [16, 18]
|
440 |
+
model: ./Yosegi-0603
|
441 |
+
- sources:
|
442 |
+
- layer_range: [18, 20]
|
443 |
+
model: ./Yosegi-0601
|
444 |
+
parameters:
|
445 |
+
density:
|
446 |
+
- filter: self_attn
|
447 |
+
value: 1.0
|
448 |
+
- filter: mlp
|
449 |
+
value: 1.0
|
450 |
+
- value: 1.0
|
451 |
+
gamma:
|
452 |
+
- filter: self_attn
|
453 |
+
value: 0.03922456593148313
|
454 |
+
- filter: mlp
|
455 |
+
value: 0.3318035822806869
|
456 |
+
- value: -0.10373990685028205
|
457 |
+
weight:
|
458 |
+
- filter: self_attn
|
459 |
+
value: 0.8254441016674987
|
460 |
+
- filter: mlp
|
461 |
+
value: 0.4568039342431161
|
462 |
+
- value: 0.3152648515747969
|
463 |
+
- layer_range: [18, 20]
|
464 |
+
model: ./Ninja-2B_JP
|
465 |
+
parameters:
|
466 |
+
density:
|
467 |
+
- filter: self_attn
|
468 |
+
value: 1.0
|
469 |
+
- filter: mlp
|
470 |
+
value: 1.0
|
471 |
+
- value: 0.9807358937293073
|
472 |
+
gamma:
|
473 |
+
- filter: self_attn
|
474 |
+
value: -0.22734036563128657
|
475 |
+
- filter: mlp
|
476 |
+
value: 0.26113222150270854
|
477 |
+
- value: 0.17739039022957015
|
478 |
+
weight:
|
479 |
+
- filter: self_attn
|
480 |
+
value: 0.33759130475641996
|
481 |
+
- filter: mlp
|
482 |
+
value: 0.616639215544168
|
483 |
+
- value: 0.47560658618977714
|
484 |
+
- layer_range: [18, 20]
|
485 |
+
model: ./Yosegi-0603
|
486 |
+
- sources:
|
487 |
+
- layer_range: [20, 22]
|
488 |
+
model: ./Yosegi-0601
|
489 |
+
parameters:
|
490 |
+
density:
|
491 |
+
- filter: self_attn
|
492 |
+
value: 0.9394514442960196
|
493 |
+
- filter: mlp
|
494 |
+
value: 1.0
|
495 |
+
- value: 0.9885037757465567
|
496 |
+
gamma:
|
497 |
+
- filter: self_attn
|
498 |
+
value: -0.17365709450334324
|
499 |
+
- filter: mlp
|
500 |
+
value: 0.0712279381144505
|
501 |
+
- value: 0.11809665485306464
|
502 |
+
weight:
|
503 |
+
- filter: self_attn
|
504 |
+
value: 0.485610337254665
|
505 |
+
- filter: mlp
|
506 |
+
value: 0.8406593173801935
|
507 |
+
- value: 0.5024102481819739
|
508 |
+
- layer_range: [20, 22]
|
509 |
+
model: ./Ninja-2B_JP
|
510 |
+
parameters:
|
511 |
+
density:
|
512 |
+
- filter: self_attn
|
513 |
+
value: 1.0
|
514 |
+
- filter: mlp
|
515 |
+
value: 1.0
|
516 |
+
- value: 1.0
|
517 |
+
gamma:
|
518 |
+
- filter: self_attn
|
519 |
+
value: -0.09980202641768818
|
520 |
+
- filter: mlp
|
521 |
+
value: 0.051454493742856926
|
522 |
+
- value: 0.14619126408666103
|
523 |
+
weight:
|
524 |
+
- filter: self_attn
|
525 |
+
value: 0.54772456079406
|
526 |
+
- filter: mlp
|
527 |
+
value: 0.3440893571099615
|
528 |
+
- value: 0.3747271233512448
|
529 |
+
- layer_range: [20, 22]
|
530 |
+
model: ./Yosegi-0603
|
531 |
+
- sources:
|
532 |
+
- layer_range: [22, 24]
|
533 |
+
model: ./Yosegi-0601
|
534 |
+
parameters:
|
535 |
+
density:
|
536 |
+
- filter: self_attn
|
537 |
+
value: 1.0
|
538 |
+
- filter: mlp
|
539 |
+
value: 0.9474712362889293
|
540 |
+
- value: 1.0
|
541 |
+
gamma:
|
542 |
+
- filter: self_attn
|
543 |
+
value: -0.16020032978118146
|
544 |
+
- filter: mlp
|
545 |
+
value: -0.025085248873309034
|
546 |
+
- value: 0.06046174910893976
|
547 |
+
weight:
|
548 |
+
- filter: self_attn
|
549 |
+
value: 0.8654189362345427
|
550 |
+
- filter: mlp
|
551 |
+
value: 0.6344956382288498
|
552 |
+
- value: 0.6383979001549549
|
553 |
+
- layer_range: [22, 24]
|
554 |
+
model: ./Ninja-2B_JP
|
555 |
+
parameters:
|
556 |
+
density:
|
557 |
+
- filter: self_attn
|
558 |
+
value: 0.8240762427167851
|
559 |
+
- filter: mlp
|
560 |
+
value: 1.0
|
561 |
+
- value: 0.9004913821398048
|
562 |
+
gamma:
|
563 |
+
- filter: self_attn
|
564 |
+
value: -0.12224186789525764
|
565 |
+
- filter: mlp
|
566 |
+
value: -0.25877585460700525
|
567 |
+
- value: 0.35149388360871714
|
568 |
+
weight:
|
569 |
+
- filter: self_attn
|
570 |
+
value: 0.4294356408713786
|
571 |
+
- filter: mlp
|
572 |
+
value: 0.3920647298630233
|
573 |
+
- value: 0.795891295390721
|
574 |
+
- layer_range: [22, 24]
|
575 |
+
model: ./Yosegi-0603
|
576 |
+
- sources:
|
577 |
+
- layer_range: [24, 26]
|
578 |
+
model: ./Yosegi-0601
|
579 |
+
parameters:
|
580 |
+
density:
|
581 |
+
- filter: self_attn
|
582 |
+
value: 1.0
|
583 |
+
- filter: mlp
|
584 |
+
value: 1.0
|
585 |
+
- value: 1.0
|
586 |
+
gamma:
|
587 |
+
- filter: self_attn
|
588 |
+
value: 0.16915580088030202
|
589 |
+
- filter: mlp
|
590 |
+
value: 0.2602652727555053
|
591 |
+
- value: 0.16985672723305376
|
592 |
+
weight:
|
593 |
+
- filter: self_attn
|
594 |
+
value: 0.420377024485687
|
595 |
+
- filter: mlp
|
596 |
+
value: 0.3401141209432324
|
597 |
+
- value: 0.4953511256159331
|
598 |
+
- layer_range: [24, 26]
|
599 |
+
model: ./Ninja-2B_JP
|
600 |
+
parameters:
|
601 |
+
density:
|
602 |
+
- filter: self_attn
|
603 |
+
value: 0.7290652609253236
|
604 |
+
- filter: mlp
|
605 |
+
value: 1.0
|
606 |
+
- value: 1.0
|
607 |
+
gamma:
|
608 |
+
- filter: self_attn
|
609 |
+
value: -0.1039167464696765
|
610 |
+
- filter: mlp
|
611 |
+
value: -0.18476572570059685
|
612 |
+
- value: 0.1221387313921081
|
613 |
+
weight:
|
614 |
+
- filter: self_attn
|
615 |
+
value: 0.2925002157134928
|
616 |
+
- filter: mlp
|
617 |
+
value: 0.3854740639588027
|
618 |
+
- value: 0.555448110317977
|
619 |
+
- layer_range: [24, 26]
|
620 |
+
model: ./Yosegi-0603
|
621 |
+
- sources:
|
622 |
+
- layer_range: [26, 28]
|
623 |
+
model: ./Yosegi-0601
|
624 |
+
parameters:
|
625 |
+
density:
|
626 |
+
- filter: self_attn
|
627 |
+
value: 1.0
|
628 |
+
- filter: mlp
|
629 |
+
value: 0.9104496350690235
|
630 |
+
- value: 1.0
|
631 |
+
gamma:
|
632 |
+
- filter: self_attn
|
633 |
+
value: 0.24831264214235005
|
634 |
+
- filter: mlp
|
635 |
+
value: -0.03903149241855605
|
636 |
+
- value: 0.14189425093398259
|
637 |
+
weight:
|
638 |
+
- filter: self_attn
|
639 |
+
value: 0.7685811138035815
|
640 |
+
- filter: mlp
|
641 |
+
value: 0.06535011571274918
|
642 |
+
- value: 0.696502559577317
|
643 |
+
- layer_range: [26, 28]
|
644 |
+
model: ./Ninja-2B_JP
|
645 |
+
parameters:
|
646 |
+
density:
|
647 |
+
- filter: self_attn
|
648 |
+
value: 0.9236218028490522
|
649 |
+
- filter: mlp
|
650 |
+
value: 1.0
|
651 |
+
- value: 1.0
|
652 |
+
gamma:
|
653 |
+
- filter: self_attn
|
654 |
+
value: -0.2451400735890047
|
655 |
+
- filter: mlp
|
656 |
+
value: -0.21555851418482214
|
657 |
+
- value: 0.020418471695148876
|
658 |
+
weight:
|
659 |
+
- filter: self_attn
|
660 |
+
value: 0.451368534421561
|
661 |
+
- filter: mlp
|
662 |
+
value: 0.27412879847687055
|
663 |
+
- value: 0.18339776770537336
|
664 |
+
- layer_range: [26, 28]
|
665 |
+
model: ./Yosegi-0603
|
666 |
+
- sources:
|
667 |
+
- layer_range: [28, 30]
|
668 |
+
model: ./Yosegi-0601
|
669 |
+
parameters:
|
670 |
+
density:
|
671 |
+
- filter: self_attn
|
672 |
+
value: 0.8590812961904566
|
673 |
+
- filter: mlp
|
674 |
+
value: 1.0
|
675 |
+
- value: 1.0
|
676 |
+
gamma:
|
677 |
+
- filter: self_attn
|
678 |
+
value: -0.06934549536310654
|
679 |
+
- filter: mlp
|
680 |
+
value: -0.28464693250998063
|
681 |
+
- value: -0.0588491947891552
|
682 |
+
weight:
|
683 |
+
- filter: self_attn
|
684 |
+
value: 0.26716389671655294
|
685 |
+
- filter: mlp
|
686 |
+
value: 0.8228280162386532
|
687 |
+
- value: 0.24197568479527135
|
688 |
+
- layer_range: [28, 30]
|
689 |
+
model: ./Ninja-2B_JP
|
690 |
+
parameters:
|
691 |
+
density:
|
692 |
+
- filter: self_attn
|
693 |
+
value: 0.7277181780542642
|
694 |
+
- filter: mlp
|
695 |
+
value: 0.74166025738732
|
696 |
+
- value: 1.0
|
697 |
+
gamma:
|
698 |
+
- filter: self_attn
|
699 |
+
value: 0.1772650150670655
|
700 |
+
- filter: mlp
|
701 |
+
value: 0.06545031487123437
|
702 |
+
- value: -0.28681451125993446
|
703 |
+
weight:
|
704 |
+
- filter: self_attn
|
705 |
+
value: 0.5781944040541174
|
706 |
+
- filter: mlp
|
707 |
+
value: 0.2288692970435767
|
708 |
+
- value: 0.689751088930503
|
709 |
+
- layer_range: [28, 30]
|
710 |
+
model: ./Yosegi-0603
|
711 |
+
- sources:
|
712 |
+
- layer_range: [30, 32]
|
713 |
+
model: ./Yosegi-0601
|
714 |
+
parameters:
|
715 |
+
density:
|
716 |
+
- filter: self_attn
|
717 |
+
value: 0.8177341862620365
|
718 |
+
- filter: mlp
|
719 |
+
value: 0.8875629677599377
|
720 |
+
- value: 1.0
|
721 |
+
gamma:
|
722 |
+
- filter: self_attn
|
723 |
+
value: -0.06572527259889459
|
724 |
+
- filter: mlp
|
725 |
+
value: -0.18979543285938766
|
726 |
+
- value: -0.24122036571646263
|
727 |
+
weight:
|
728 |
+
- filter: self_attn
|
729 |
+
value: 0.5818433594657613
|
730 |
+
- filter: mlp
|
731 |
+
value: 0.36676821100234736
|
732 |
+
- value: 0.3580688869263428
|
733 |
+
- layer_range: [30, 32]
|
734 |
+
model: ./Ninja-2B_JP
|
735 |
+
parameters:
|
736 |
+
density:
|
737 |
+
- filter: self_attn
|
738 |
+
value: 0.8306036003344672
|
739 |
+
- filter: mlp
|
740 |
+
value: 0.6993970248745297
|
741 |
+
- value: 1.0
|
742 |
+
gamma:
|
743 |
+
- filter: self_attn
|
744 |
+
value: -0.20599853236581384
|
745 |
+
- filter: mlp
|
746 |
+
value: -0.2001187634455465
|
747 |
+
- value: -0.07654635090020837
|
748 |
+
weight:
|
749 |
+
- filter: self_attn
|
750 |
+
value: 0.37120677279712305
|
751 |
+
- filter: mlp
|
752 |
+
value: 0.13105486609905853
|
753 |
+
- value: 0.7204857820148367
|
754 |
+
- layer_range: [30, 32]
|
755 |
+
model: ./Yosegi-0603
|
756 |
+
tokenizer_source: union
|
757 |
+
```
|
best_config.yaml
ADDED
@@ -0,0 +1,728 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
2 |
+
dtype: bfloat16
|
3 |
+
merge_method: breadcrumbs_ties
|
4 |
+
parameters:
|
5 |
+
int8_mask: 1.0
|
6 |
+
normalize: 0.0
|
7 |
+
slices:
|
8 |
+
- sources:
|
9 |
+
- layer_range: [0, 2]
|
10 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
11 |
+
parameters:
|
12 |
+
density:
|
13 |
+
- filter: self_attn
|
14 |
+
value: 1.0
|
15 |
+
- filter: mlp
|
16 |
+
value: 1.0
|
17 |
+
- value: 1.0
|
18 |
+
gamma:
|
19 |
+
- filter: self_attn
|
20 |
+
value: -0.050387850856855765
|
21 |
+
- filter: mlp
|
22 |
+
value: -0.17075015661203768
|
23 |
+
- value: -0.008041653902986862
|
24 |
+
weight:
|
25 |
+
- filter: self_attn
|
26 |
+
value: 0.0999312941470471
|
27 |
+
- filter: mlp
|
28 |
+
value: 0.541727762184749
|
29 |
+
- value: 0.6837012779994258
|
30 |
+
- layer_range: [0, 2]
|
31 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
32 |
+
parameters:
|
33 |
+
density:
|
34 |
+
- filter: self_attn
|
35 |
+
value: 0.8218846237599902
|
36 |
+
- filter: mlp
|
37 |
+
value: 1.0
|
38 |
+
- value: 0.9254078866667358
|
39 |
+
gamma:
|
40 |
+
- filter: self_attn
|
41 |
+
value: -0.11213758231875963
|
42 |
+
- filter: mlp
|
43 |
+
value: 0.021586098873668948
|
44 |
+
- value: -0.12827998218659437
|
45 |
+
weight:
|
46 |
+
- filter: self_attn
|
47 |
+
value: 0.40391646444657003
|
48 |
+
- filter: mlp
|
49 |
+
value: 0.623121864641881
|
50 |
+
- value: 0.5967833694632534
|
51 |
+
- layer_range: [0, 2]
|
52 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
53 |
+
- sources:
|
54 |
+
- layer_range: [2, 4]
|
55 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
56 |
+
parameters:
|
57 |
+
density:
|
58 |
+
- filter: self_attn
|
59 |
+
value: 0.8079479346300947
|
60 |
+
- filter: mlp
|
61 |
+
value: 1.0
|
62 |
+
- value: 0.710146185559419
|
63 |
+
gamma:
|
64 |
+
- filter: self_attn
|
65 |
+
value: 0.1383609589681566
|
66 |
+
- filter: mlp
|
67 |
+
value: 0.21188532059635062
|
68 |
+
- value: 0.2994723556443468
|
69 |
+
weight:
|
70 |
+
- filter: self_attn
|
71 |
+
value: 0.48107070906079974
|
72 |
+
- filter: mlp
|
73 |
+
value: 0.5848073552919492
|
74 |
+
- value: 0.4583842493359253
|
75 |
+
- layer_range: [2, 4]
|
76 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
77 |
+
parameters:
|
78 |
+
density:
|
79 |
+
- filter: self_attn
|
80 |
+
value: 1.0
|
81 |
+
- filter: mlp
|
82 |
+
value: 0.934378153535579
|
83 |
+
- value: 1.0
|
84 |
+
gamma:
|
85 |
+
- filter: self_attn
|
86 |
+
value: 0.073192612278188
|
87 |
+
- filter: mlp
|
88 |
+
value: 0.07939126555063317
|
89 |
+
- value: -0.06891845030175699
|
90 |
+
weight:
|
91 |
+
- filter: self_attn
|
92 |
+
value: 0.32120386994101
|
93 |
+
- filter: mlp
|
94 |
+
value: 0.5001108459121922
|
95 |
+
- value: 0.9138710221666694
|
96 |
+
- layer_range: [2, 4]
|
97 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
98 |
+
- sources:
|
99 |
+
- layer_range: [4, 6]
|
100 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
101 |
+
parameters:
|
102 |
+
density:
|
103 |
+
- filter: self_attn
|
104 |
+
value: 1.0
|
105 |
+
- filter: mlp
|
106 |
+
value: 0.7237519222177541
|
107 |
+
- value: 0.776951124863642
|
108 |
+
gamma:
|
109 |
+
- filter: self_attn
|
110 |
+
value: -0.2265121048274062
|
111 |
+
- filter: mlp
|
112 |
+
value: -0.1757947421960496
|
113 |
+
- value: -0.11401593728931929
|
114 |
+
weight:
|
115 |
+
- filter: self_attn
|
116 |
+
value: 0.6448742737026658
|
117 |
+
- filter: mlp
|
118 |
+
value: 0.13809748641457986
|
119 |
+
- value: 0.3950550285769662
|
120 |
+
- layer_range: [4, 6]
|
121 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
122 |
+
parameters:
|
123 |
+
density:
|
124 |
+
- filter: self_attn
|
125 |
+
value: 0.9649359194114893
|
126 |
+
- filter: mlp
|
127 |
+
value: 0.916637032428399
|
128 |
+
- value: 1.0
|
129 |
+
gamma:
|
130 |
+
- filter: self_attn
|
131 |
+
value: -0.16291684846287688
|
132 |
+
- filter: mlp
|
133 |
+
value: -0.19013548712121703
|
134 |
+
- value: 0.038409066391918795
|
135 |
+
weight:
|
136 |
+
- filter: self_attn
|
137 |
+
value: 0.1977358472772336
|
138 |
+
- filter: mlp
|
139 |
+
value: 0.22661167907612348
|
140 |
+
- value: 0.6426575016448257
|
141 |
+
- layer_range: [4, 6]
|
142 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
143 |
+
- sources:
|
144 |
+
- layer_range: [6, 8]
|
145 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
146 |
+
parameters:
|
147 |
+
density:
|
148 |
+
- filter: self_attn
|
149 |
+
value: 0.8727809666891416
|
150 |
+
- filter: mlp
|
151 |
+
value: 1.0
|
152 |
+
- value: 0.5160677785559116
|
153 |
+
gamma:
|
154 |
+
- filter: self_attn
|
155 |
+
value: 0.14245180617134273
|
156 |
+
- filter: mlp
|
157 |
+
value: 0.08189992601998919
|
158 |
+
- value: -0.1038827997670827
|
159 |
+
weight:
|
160 |
+
- filter: self_attn
|
161 |
+
value: 0.23575676914257698
|
162 |
+
- filter: mlp
|
163 |
+
value: 0.4047231670507743
|
164 |
+
- value: 0.34207794631274374
|
165 |
+
- layer_range: [6, 8]
|
166 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
167 |
+
parameters:
|
168 |
+
density:
|
169 |
+
- filter: self_attn
|
170 |
+
value: 1.0
|
171 |
+
- filter: mlp
|
172 |
+
value: 1.0
|
173 |
+
- value: 1.0
|
174 |
+
gamma:
|
175 |
+
- filter: self_attn
|
176 |
+
value: 0.576775501046583
|
177 |
+
- filter: mlp
|
178 |
+
value: -0.046028636298718645
|
179 |
+
- value: -0.024161321403060265
|
180 |
+
weight:
|
181 |
+
- filter: self_attn
|
182 |
+
value: 0.833089842843994
|
183 |
+
- filter: mlp
|
184 |
+
value: 0.5434667434613458
|
185 |
+
- value: 0.2946693008513797
|
186 |
+
- layer_range: [6, 8]
|
187 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
188 |
+
- sources:
|
189 |
+
- layer_range: [8, 10]
|
190 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
191 |
+
parameters:
|
192 |
+
density:
|
193 |
+
- filter: self_attn
|
194 |
+
value: 1.0
|
195 |
+
- filter: mlp
|
196 |
+
value: 1.0
|
197 |
+
- value: 0.9930269337531187
|
198 |
+
gamma:
|
199 |
+
- filter: self_attn
|
200 |
+
value: 0.4549980941970383
|
201 |
+
- filter: mlp
|
202 |
+
value: 0.10362988739411173
|
203 |
+
- value: -0.43800391668559174
|
204 |
+
weight:
|
205 |
+
- filter: self_attn
|
206 |
+
value: 0.19663450954683193
|
207 |
+
- filter: mlp
|
208 |
+
value: 0.16783989984505265
|
209 |
+
- value: 0.7465091417598162
|
210 |
+
- layer_range: [8, 10]
|
211 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
212 |
+
parameters:
|
213 |
+
density:
|
214 |
+
- filter: self_attn
|
215 |
+
value: 0.797370597380894
|
216 |
+
- filter: mlp
|
217 |
+
value: 1.0
|
218 |
+
- value: 1.0
|
219 |
+
gamma:
|
220 |
+
- filter: self_attn
|
221 |
+
value: -0.0665958634205702
|
222 |
+
- filter: mlp
|
223 |
+
value: -0.058297473060129834
|
224 |
+
- value: -0.38206760673090134
|
225 |
+
weight:
|
226 |
+
- filter: self_attn
|
227 |
+
value: 0.7015967347604024
|
228 |
+
- filter: mlp
|
229 |
+
value: 0.7733694864324641
|
230 |
+
- value: 0.7636921732342238
|
231 |
+
- layer_range: [8, 10]
|
232 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
233 |
+
- sources:
|
234 |
+
- layer_range: [10, 12]
|
235 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
236 |
+
parameters:
|
237 |
+
density:
|
238 |
+
- filter: self_attn
|
239 |
+
value: 0.8047576867589878
|
240 |
+
- filter: mlp
|
241 |
+
value: 0.8852533319203653
|
242 |
+
- value: 0.7707342647603538
|
243 |
+
gamma:
|
244 |
+
- filter: self_attn
|
245 |
+
value: -0.054343999574509694
|
246 |
+
- filter: mlp
|
247 |
+
value: -0.3465154355167133
|
248 |
+
- value: 0.022315854655582765
|
249 |
+
weight:
|
250 |
+
- filter: self_attn
|
251 |
+
value: 0.4396484757291151
|
252 |
+
- filter: mlp
|
253 |
+
value: 0.34318396468602314
|
254 |
+
- value: 0.8236034746664869
|
255 |
+
- layer_range: [10, 12]
|
256 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
257 |
+
parameters:
|
258 |
+
density:
|
259 |
+
- filter: self_attn
|
260 |
+
value: 0.9058471193805165
|
261 |
+
- filter: mlp
|
262 |
+
value: 1.0
|
263 |
+
- value: 1.0
|
264 |
+
gamma:
|
265 |
+
- filter: self_attn
|
266 |
+
value: 0.1221058588826469
|
267 |
+
- filter: mlp
|
268 |
+
value: -0.4004985640890659
|
269 |
+
- value: 0.3219195440395816
|
270 |
+
weight:
|
271 |
+
- filter: self_attn
|
272 |
+
value: 0.3565443612269864
|
273 |
+
- filter: mlp
|
274 |
+
value: 0.2817057075232181
|
275 |
+
- value: 0.5934890337808251
|
276 |
+
- layer_range: [10, 12]
|
277 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
278 |
+
- sources:
|
279 |
+
- layer_range: [12, 14]
|
280 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
281 |
+
parameters:
|
282 |
+
density:
|
283 |
+
- filter: self_attn
|
284 |
+
value: 1.0
|
285 |
+
- filter: mlp
|
286 |
+
value: 1.0
|
287 |
+
- value: 1.0
|
288 |
+
gamma:
|
289 |
+
- filter: self_attn
|
290 |
+
value: -0.027897116191693133
|
291 |
+
- filter: mlp
|
292 |
+
value: -0.1765379388255607
|
293 |
+
- value: 0.09108936063176161
|
294 |
+
weight:
|
295 |
+
- filter: self_attn
|
296 |
+
value: 0.4499753137521779
|
297 |
+
- filter: mlp
|
298 |
+
value: 0.901296236087911
|
299 |
+
- value: 0.3548680126954006
|
300 |
+
- layer_range: [12, 14]
|
301 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
302 |
+
parameters:
|
303 |
+
density:
|
304 |
+
- filter: self_attn
|
305 |
+
value: 0.8973815150776497
|
306 |
+
- filter: mlp
|
307 |
+
value: 0.6029953465961999
|
308 |
+
- value: 1.0
|
309 |
+
gamma:
|
310 |
+
- filter: self_attn
|
311 |
+
value: 0.10393082898402586
|
312 |
+
- filter: mlp
|
313 |
+
value: 0.15993577688878796
|
314 |
+
- value: 0.011410411917833683
|
315 |
+
weight:
|
316 |
+
- filter: self_attn
|
317 |
+
value: 0.2211644023056492
|
318 |
+
- filter: mlp
|
319 |
+
value: 0.5677387594231849
|
320 |
+
- value: 0.1316535663010981
|
321 |
+
- layer_range: [12, 14]
|
322 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
323 |
+
- sources:
|
324 |
+
- layer_range: [14, 16]
|
325 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
326 |
+
parameters:
|
327 |
+
density:
|
328 |
+
- filter: self_attn
|
329 |
+
value: 0.9584597245055072
|
330 |
+
- filter: mlp
|
331 |
+
value: 1.0
|
332 |
+
- value: 1.0
|
333 |
+
gamma:
|
334 |
+
- filter: self_attn
|
335 |
+
value: -0.17789727632680347
|
336 |
+
- filter: mlp
|
337 |
+
value: 0.2182263440314275
|
338 |
+
- value: 0.1449547656126498
|
339 |
+
weight:
|
340 |
+
- filter: self_attn
|
341 |
+
value: 0.4551004762874224
|
342 |
+
- filter: mlp
|
343 |
+
value: 0.9182082826762857
|
344 |
+
- value: 0.3736989395186422
|
345 |
+
- layer_range: [14, 16]
|
346 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
347 |
+
parameters:
|
348 |
+
density:
|
349 |
+
- filter: self_attn
|
350 |
+
value: 0.7414465107848625
|
351 |
+
- filter: mlp
|
352 |
+
value: 1.0
|
353 |
+
- value: 0.7894887419395906
|
354 |
+
gamma:
|
355 |
+
- filter: self_attn
|
356 |
+
value: -0.07343933395880992
|
357 |
+
- filter: mlp
|
358 |
+
value: 0.250800731630588
|
359 |
+
- value: -0.2948778134297542
|
360 |
+
weight:
|
361 |
+
- filter: self_attn
|
362 |
+
value: 0.43125199001016495
|
363 |
+
- filter: mlp
|
364 |
+
value: 0.6182726353394477
|
365 |
+
- value: 0.838902157446268
|
366 |
+
- layer_range: [14, 16]
|
367 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
368 |
+
- sources:
|
369 |
+
- layer_range: [16, 18]
|
370 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
371 |
+
parameters:
|
372 |
+
density:
|
373 |
+
- filter: self_attn
|
374 |
+
value: 0.9474287877268394
|
375 |
+
- filter: mlp
|
376 |
+
value: 1.0
|
377 |
+
- value: 0.9613380133344519
|
378 |
+
gamma:
|
379 |
+
- filter: self_attn
|
380 |
+
value: -0.08608895546593046
|
381 |
+
- filter: mlp
|
382 |
+
value: -0.07275416053291164
|
383 |
+
- value: -0.5796137860399382
|
384 |
+
weight:
|
385 |
+
- filter: self_attn
|
386 |
+
value: 0.5593420897751296
|
387 |
+
- filter: mlp
|
388 |
+
value: 0.7339447992880666
|
389 |
+
- value: 0.5447558586689005
|
390 |
+
- layer_range: [16, 18]
|
391 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
392 |
+
parameters:
|
393 |
+
density:
|
394 |
+
- filter: self_attn
|
395 |
+
value: 0.9321536960575384
|
396 |
+
- filter: mlp
|
397 |
+
value: 1.0
|
398 |
+
- value: 0.9613033408813294
|
399 |
+
gamma:
|
400 |
+
- filter: self_attn
|
401 |
+
value: 0.20610728738224296
|
402 |
+
- filter: mlp
|
403 |
+
value: 0.2002206706624053
|
404 |
+
- value: -0.45349278793293785
|
405 |
+
weight:
|
406 |
+
- filter: self_attn
|
407 |
+
value: 0.16162975594196963
|
408 |
+
- filter: mlp
|
409 |
+
value: 0.21262726992327483
|
410 |
+
- value: 0.061213622827234075
|
411 |
+
- layer_range: [16, 18]
|
412 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
413 |
+
- sources:
|
414 |
+
- layer_range: [18, 20]
|
415 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
416 |
+
parameters:
|
417 |
+
density:
|
418 |
+
- filter: self_attn
|
419 |
+
value: 1.0
|
420 |
+
- filter: mlp
|
421 |
+
value: 1.0
|
422 |
+
- value: 1.0
|
423 |
+
gamma:
|
424 |
+
- filter: self_attn
|
425 |
+
value: 0.03922456593148313
|
426 |
+
- filter: mlp
|
427 |
+
value: 0.3318035822806869
|
428 |
+
- value: -0.10373990685028205
|
429 |
+
weight:
|
430 |
+
- filter: self_attn
|
431 |
+
value: 0.8254441016674987
|
432 |
+
- filter: mlp
|
433 |
+
value: 0.4568039342431161
|
434 |
+
- value: 0.3152648515747969
|
435 |
+
- layer_range: [18, 20]
|
436 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
437 |
+
parameters:
|
438 |
+
density:
|
439 |
+
- filter: self_attn
|
440 |
+
value: 1.0
|
441 |
+
- filter: mlp
|
442 |
+
value: 1.0
|
443 |
+
- value: 0.9807358937293073
|
444 |
+
gamma:
|
445 |
+
- filter: self_attn
|
446 |
+
value: -0.22734036563128657
|
447 |
+
- filter: mlp
|
448 |
+
value: 0.26113222150270854
|
449 |
+
- value: 0.17739039022957015
|
450 |
+
weight:
|
451 |
+
- filter: self_attn
|
452 |
+
value: 0.33759130475641996
|
453 |
+
- filter: mlp
|
454 |
+
value: 0.616639215544168
|
455 |
+
- value: 0.47560658618977714
|
456 |
+
- layer_range: [18, 20]
|
457 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
458 |
+
- sources:
|
459 |
+
- layer_range: [20, 22]
|
460 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
461 |
+
parameters:
|
462 |
+
density:
|
463 |
+
- filter: self_attn
|
464 |
+
value: 0.9394514442960196
|
465 |
+
- filter: mlp
|
466 |
+
value: 1.0
|
467 |
+
- value: 0.9885037757465567
|
468 |
+
gamma:
|
469 |
+
- filter: self_attn
|
470 |
+
value: -0.17365709450334324
|
471 |
+
- filter: mlp
|
472 |
+
value: 0.0712279381144505
|
473 |
+
- value: 0.11809665485306464
|
474 |
+
weight:
|
475 |
+
- filter: self_attn
|
476 |
+
value: 0.485610337254665
|
477 |
+
- filter: mlp
|
478 |
+
value: 0.8406593173801935
|
479 |
+
- value: 0.5024102481819739
|
480 |
+
- layer_range: [20, 22]
|
481 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
482 |
+
parameters:
|
483 |
+
density:
|
484 |
+
- filter: self_attn
|
485 |
+
value: 1.0
|
486 |
+
- filter: mlp
|
487 |
+
value: 1.0
|
488 |
+
- value: 1.0
|
489 |
+
gamma:
|
490 |
+
- filter: self_attn
|
491 |
+
value: -0.09980202641768818
|
492 |
+
- filter: mlp
|
493 |
+
value: 0.051454493742856926
|
494 |
+
- value: 0.14619126408666103
|
495 |
+
weight:
|
496 |
+
- filter: self_attn
|
497 |
+
value: 0.54772456079406
|
498 |
+
- filter: mlp
|
499 |
+
value: 0.3440893571099615
|
500 |
+
- value: 0.3747271233512448
|
501 |
+
- layer_range: [20, 22]
|
502 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
503 |
+
- sources:
|
504 |
+
- layer_range: [22, 24]
|
505 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
506 |
+
parameters:
|
507 |
+
density:
|
508 |
+
- filter: self_attn
|
509 |
+
value: 1.0
|
510 |
+
- filter: mlp
|
511 |
+
value: 0.9474712362889293
|
512 |
+
- value: 1.0
|
513 |
+
gamma:
|
514 |
+
- filter: self_attn
|
515 |
+
value: -0.16020032978118146
|
516 |
+
- filter: mlp
|
517 |
+
value: -0.025085248873309034
|
518 |
+
- value: 0.06046174910893976
|
519 |
+
weight:
|
520 |
+
- filter: self_attn
|
521 |
+
value: 0.8654189362345427
|
522 |
+
- filter: mlp
|
523 |
+
value: 0.6344956382288498
|
524 |
+
- value: 0.6383979001549549
|
525 |
+
- layer_range: [22, 24]
|
526 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
527 |
+
parameters:
|
528 |
+
density:
|
529 |
+
- filter: self_attn
|
530 |
+
value: 0.8240762427167851
|
531 |
+
- filter: mlp
|
532 |
+
value: 1.0
|
533 |
+
- value: 0.9004913821398048
|
534 |
+
gamma:
|
535 |
+
- filter: self_attn
|
536 |
+
value: -0.12224186789525764
|
537 |
+
- filter: mlp
|
538 |
+
value: -0.25877585460700525
|
539 |
+
- value: 0.35149388360871714
|
540 |
+
weight:
|
541 |
+
- filter: self_attn
|
542 |
+
value: 0.4294356408713786
|
543 |
+
- filter: mlp
|
544 |
+
value: 0.3920647298630233
|
545 |
+
- value: 0.795891295390721
|
546 |
+
- layer_range: [22, 24]
|
547 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
548 |
+
- sources:
|
549 |
+
- layer_range: [24, 26]
|
550 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
551 |
+
parameters:
|
552 |
+
density:
|
553 |
+
- filter: self_attn
|
554 |
+
value: 1.0
|
555 |
+
- filter: mlp
|
556 |
+
value: 1.0
|
557 |
+
- value: 1.0
|
558 |
+
gamma:
|
559 |
+
- filter: self_attn
|
560 |
+
value: 0.16915580088030202
|
561 |
+
- filter: mlp
|
562 |
+
value: 0.2602652727555053
|
563 |
+
- value: 0.16985672723305376
|
564 |
+
weight:
|
565 |
+
- filter: self_attn
|
566 |
+
value: 0.420377024485687
|
567 |
+
- filter: mlp
|
568 |
+
value: 0.3401141209432324
|
569 |
+
- value: 0.4953511256159331
|
570 |
+
- layer_range: [24, 26]
|
571 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
572 |
+
parameters:
|
573 |
+
density:
|
574 |
+
- filter: self_attn
|
575 |
+
value: 0.7290652609253236
|
576 |
+
- filter: mlp
|
577 |
+
value: 1.0
|
578 |
+
- value: 1.0
|
579 |
+
gamma:
|
580 |
+
- filter: self_attn
|
581 |
+
value: -0.1039167464696765
|
582 |
+
- filter: mlp
|
583 |
+
value: -0.18476572570059685
|
584 |
+
- value: 0.1221387313921081
|
585 |
+
weight:
|
586 |
+
- filter: self_attn
|
587 |
+
value: 0.2925002157134928
|
588 |
+
- filter: mlp
|
589 |
+
value: 0.3854740639588027
|
590 |
+
- value: 0.555448110317977
|
591 |
+
- layer_range: [24, 26]
|
592 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
593 |
+
- sources:
|
594 |
+
- layer_range: [26, 28]
|
595 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
596 |
+
parameters:
|
597 |
+
density:
|
598 |
+
- filter: self_attn
|
599 |
+
value: 1.0
|
600 |
+
- filter: mlp
|
601 |
+
value: 0.9104496350690235
|
602 |
+
- value: 1.0
|
603 |
+
gamma:
|
604 |
+
- filter: self_attn
|
605 |
+
value: 0.24831264214235005
|
606 |
+
- filter: mlp
|
607 |
+
value: -0.03903149241855605
|
608 |
+
- value: 0.14189425093398259
|
609 |
+
weight:
|
610 |
+
- filter: self_attn
|
611 |
+
value: 0.7685811138035815
|
612 |
+
- filter: mlp
|
613 |
+
value: 0.06535011571274918
|
614 |
+
- value: 0.696502559577317
|
615 |
+
- layer_range: [26, 28]
|
616 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
617 |
+
parameters:
|
618 |
+
density:
|
619 |
+
- filter: self_attn
|
620 |
+
value: 0.9236218028490522
|
621 |
+
- filter: mlp
|
622 |
+
value: 1.0
|
623 |
+
- value: 1.0
|
624 |
+
gamma:
|
625 |
+
- filter: self_attn
|
626 |
+
value: -0.2451400735890047
|
627 |
+
- filter: mlp
|
628 |
+
value: -0.21555851418482214
|
629 |
+
- value: 0.020418471695148876
|
630 |
+
weight:
|
631 |
+
- filter: self_attn
|
632 |
+
value: 0.451368534421561
|
633 |
+
- filter: mlp
|
634 |
+
value: 0.27412879847687055
|
635 |
+
- value: 0.18339776770537336
|
636 |
+
- layer_range: [26, 28]
|
637 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
638 |
+
- sources:
|
639 |
+
- layer_range: [28, 30]
|
640 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
641 |
+
parameters:
|
642 |
+
density:
|
643 |
+
- filter: self_attn
|
644 |
+
value: 0.8590812961904566
|
645 |
+
- filter: mlp
|
646 |
+
value: 1.0
|
647 |
+
- value: 1.0
|
648 |
+
gamma:
|
649 |
+
- filter: self_attn
|
650 |
+
value: -0.06934549536310654
|
651 |
+
- filter: mlp
|
652 |
+
value: -0.28464693250998063
|
653 |
+
- value: -0.0588491947891552
|
654 |
+
weight:
|
655 |
+
- filter: self_attn
|
656 |
+
value: 0.26716389671655294
|
657 |
+
- filter: mlp
|
658 |
+
value: 0.8228280162386532
|
659 |
+
- value: 0.24197568479527135
|
660 |
+
- layer_range: [28, 30]
|
661 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
662 |
+
parameters:
|
663 |
+
density:
|
664 |
+
- filter: self_attn
|
665 |
+
value: 0.7277181780542642
|
666 |
+
- filter: mlp
|
667 |
+
value: 0.74166025738732
|
668 |
+
- value: 1.0
|
669 |
+
gamma:
|
670 |
+
- filter: self_attn
|
671 |
+
value: 0.1772650150670655
|
672 |
+
- filter: mlp
|
673 |
+
value: 0.06545031487123437
|
674 |
+
- value: -0.28681451125993446
|
675 |
+
weight:
|
676 |
+
- filter: self_attn
|
677 |
+
value: 0.5781944040541174
|
678 |
+
- filter: mlp
|
679 |
+
value: 0.2288692970435767
|
680 |
+
- value: 0.689751088930503
|
681 |
+
- layer_range: [28, 30]
|
682 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
683 |
+
- sources:
|
684 |
+
- layer_range: [30, 32]
|
685 |
+
model: ./evo-storage/input_models/Yosegi-0601_1486698715
|
686 |
+
parameters:
|
687 |
+
density:
|
688 |
+
- filter: self_attn
|
689 |
+
value: 0.8177341862620365
|
690 |
+
- filter: mlp
|
691 |
+
value: 0.8875629677599377
|
692 |
+
- value: 1.0
|
693 |
+
gamma:
|
694 |
+
- filter: self_attn
|
695 |
+
value: -0.06572527259889459
|
696 |
+
- filter: mlp
|
697 |
+
value: -0.18979543285938766
|
698 |
+
- value: -0.24122036571646263
|
699 |
+
weight:
|
700 |
+
- filter: self_attn
|
701 |
+
value: 0.5818433594657613
|
702 |
+
- filter: mlp
|
703 |
+
value: 0.36676821100234736
|
704 |
+
- value: 0.3580688869263428
|
705 |
+
- layer_range: [30, 32]
|
706 |
+
model: ./evo-storage/input_models/Ninja-2B_JP_706546503
|
707 |
+
parameters:
|
708 |
+
density:
|
709 |
+
- filter: self_attn
|
710 |
+
value: 0.8306036003344672
|
711 |
+
- filter: mlp
|
712 |
+
value: 0.6993970248745297
|
713 |
+
- value: 1.0
|
714 |
+
gamma:
|
715 |
+
- filter: self_attn
|
716 |
+
value: -0.20599853236581384
|
717 |
+
- filter: mlp
|
718 |
+
value: -0.2001187634455465
|
719 |
+
- value: -0.07654635090020837
|
720 |
+
weight:
|
721 |
+
- filter: self_attn
|
722 |
+
value: 0.37120677279712305
|
723 |
+
- filter: mlp
|
724 |
+
value: 0.13105486609905853
|
725 |
+
- value: 0.7204857820148367
|
726 |
+
- layer_range: [30, 32]
|
727 |
+
model: ./evo-storage/input_models/Yosegi-0603_3063110135
|
728 |
+
tokenizer_source: union
|
config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "./Yosegi-0603",
|
3 |
+
"architectures": [
|
4 |
+
"MistralForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 1,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"hidden_act": "silu",
|
10 |
+
"hidden_size": 4096,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 14336,
|
13 |
+
"max_position_embeddings": 32768,
|
14 |
+
"model_type": "mistral",
|
15 |
+
"num_attention_heads": 32,
|
16 |
+
"num_hidden_layers": 32,
|
17 |
+
"num_key_value_heads": 8,
|
18 |
+
"rms_norm_eps": 1e-05,
|
19 |
+
"rope_theta": 10000.0,
|
20 |
+
"sliding_window": 4096,
|
21 |
+
"tie_word_embeddings": false,
|
22 |
+
"torch_dtype": "bfloat16",
|
23 |
+
"transformers_version": "4.41.1",
|
24 |
+
"use_cache": true,
|
25 |
+
"vocab_size": 32000
|
26 |
+
}
|
mergekit_config.yml
ADDED
@@ -0,0 +1,728 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
base_model: ./Yosegi-0603
|
2 |
+
dtype: bfloat16
|
3 |
+
merge_method: breadcrumbs_ties
|
4 |
+
parameters:
|
5 |
+
int8_mask: 1.0
|
6 |
+
normalize: 0.0
|
7 |
+
slices:
|
8 |
+
- sources:
|
9 |
+
- layer_range: [0, 2]
|
10 |
+
model: ./Yosegi-0601
|
11 |
+
parameters:
|
12 |
+
density:
|
13 |
+
- filter: self_attn
|
14 |
+
value: 1.0
|
15 |
+
- filter: mlp
|
16 |
+
value: 1.0
|
17 |
+
- value: 1.0
|
18 |
+
gamma:
|
19 |
+
- filter: self_attn
|
20 |
+
value: -0.050387850856855765
|
21 |
+
- filter: mlp
|
22 |
+
value: -0.17075015661203768
|
23 |
+
- value: -0.008041653902986862
|
24 |
+
weight:
|
25 |
+
- filter: self_attn
|
26 |
+
value: 0.0999312941470471
|
27 |
+
- filter: mlp
|
28 |
+
value: 0.541727762184749
|
29 |
+
- value: 0.6837012779994258
|
30 |
+
- layer_range: [0, 2]
|
31 |
+
model: ./Ninja-2B_JP
|
32 |
+
parameters:
|
33 |
+
density:
|
34 |
+
- filter: self_attn
|
35 |
+
value: 0.8218846237599902
|
36 |
+
- filter: mlp
|
37 |
+
value: 1.0
|
38 |
+
- value: 0.9254078866667358
|
39 |
+
gamma:
|
40 |
+
- filter: self_attn
|
41 |
+
value: -0.11213758231875963
|
42 |
+
- filter: mlp
|
43 |
+
value: 0.021586098873668948
|
44 |
+
- value: -0.12827998218659437
|
45 |
+
weight:
|
46 |
+
- filter: self_attn
|
47 |
+
value: 0.40391646444657003
|
48 |
+
- filter: mlp
|
49 |
+
value: 0.623121864641881
|
50 |
+
- value: 0.5967833694632534
|
51 |
+
- layer_range: [0, 2]
|
52 |
+
model: ./Yosegi-0603
|
53 |
+
- sources:
|
54 |
+
- layer_range: [2, 4]
|
55 |
+
model: ./Yosegi-0601
|
56 |
+
parameters:
|
57 |
+
density:
|
58 |
+
- filter: self_attn
|
59 |
+
value: 0.8079479346300947
|
60 |
+
- filter: mlp
|
61 |
+
value: 1.0
|
62 |
+
- value: 0.710146185559419
|
63 |
+
gamma:
|
64 |
+
- filter: self_attn
|
65 |
+
value: 0.1383609589681566
|
66 |
+
- filter: mlp
|
67 |
+
value: 0.21188532059635062
|
68 |
+
- value: 0.2994723556443468
|
69 |
+
weight:
|
70 |
+
- filter: self_attn
|
71 |
+
value: 0.48107070906079974
|
72 |
+
- filter: mlp
|
73 |
+
value: 0.5848073552919492
|
74 |
+
- value: 0.4583842493359253
|
75 |
+
- layer_range: [2, 4]
|
76 |
+
model: ./Ninja-2B_JP
|
77 |
+
parameters:
|
78 |
+
density:
|
79 |
+
- filter: self_attn
|
80 |
+
value: 1.0
|
81 |
+
- filter: mlp
|
82 |
+
value: 0.934378153535579
|
83 |
+
- value: 1.0
|
84 |
+
gamma:
|
85 |
+
- filter: self_attn
|
86 |
+
value: 0.073192612278188
|
87 |
+
- filter: mlp
|
88 |
+
value: 0.07939126555063317
|
89 |
+
- value: -0.06891845030175699
|
90 |
+
weight:
|
91 |
+
- filter: self_attn
|
92 |
+
value: 0.32120386994101
|
93 |
+
- filter: mlp
|
94 |
+
value: 0.5001108459121922
|
95 |
+
- value: 0.9138710221666694
|
96 |
+
- layer_range: [2, 4]
|
97 |
+
model: ./Yosegi-0603
|
98 |
+
- sources:
|
99 |
+
- layer_range: [4, 6]
|
100 |
+
model: ./Yosegi-0601
|
101 |
+
parameters:
|
102 |
+
density:
|
103 |
+
- filter: self_attn
|
104 |
+
value: 1.0
|
105 |
+
- filter: mlp
|
106 |
+
value: 0.7237519222177541
|
107 |
+
- value: 0.776951124863642
|
108 |
+
gamma:
|
109 |
+
- filter: self_attn
|
110 |
+
value: -0.2265121048274062
|
111 |
+
- filter: mlp
|
112 |
+
value: -0.1757947421960496
|
113 |
+
- value: -0.11401593728931929
|
114 |
+
weight:
|
115 |
+
- filter: self_attn
|
116 |
+
value: 0.6448742737026658
|
117 |
+
- filter: mlp
|
118 |
+
value: 0.13809748641457986
|
119 |
+
- value: 0.3950550285769662
|
120 |
+
- layer_range: [4, 6]
|
121 |
+
model: ./Ninja-2B_JP
|
122 |
+
parameters:
|
123 |
+
density:
|
124 |
+
- filter: self_attn
|
125 |
+
value: 0.9649359194114893
|
126 |
+
- filter: mlp
|
127 |
+
value: 0.916637032428399
|
128 |
+
- value: 1.0
|
129 |
+
gamma:
|
130 |
+
- filter: self_attn
|
131 |
+
value: -0.16291684846287688
|
132 |
+
- filter: mlp
|
133 |
+
value: -0.19013548712121703
|
134 |
+
- value: 0.038409066391918795
|
135 |
+
weight:
|
136 |
+
- filter: self_attn
|
137 |
+
value: 0.1977358472772336
|
138 |
+
- filter: mlp
|
139 |
+
value: 0.22661167907612348
|
140 |
+
- value: 0.6426575016448257
|
141 |
+
- layer_range: [4, 6]
|
142 |
+
model: ./Yosegi-0603
|
143 |
+
- sources:
|
144 |
+
- layer_range: [6, 8]
|
145 |
+
model: ./Yosegi-0601
|
146 |
+
parameters:
|
147 |
+
density:
|
148 |
+
- filter: self_attn
|
149 |
+
value: 0.8727809666891416
|
150 |
+
- filter: mlp
|
151 |
+
value: 1.0
|
152 |
+
- value: 0.5160677785559116
|
153 |
+
gamma:
|
154 |
+
- filter: self_attn
|
155 |
+
value: 0.14245180617134273
|
156 |
+
- filter: mlp
|
157 |
+
value: 0.08189992601998919
|
158 |
+
- value: -0.1038827997670827
|
159 |
+
weight:
|
160 |
+
- filter: self_attn
|
161 |
+
value: 0.23575676914257698
|
162 |
+
- filter: mlp
|
163 |
+
value: 0.4047231670507743
|
164 |
+
- value: 0.34207794631274374
|
165 |
+
- layer_range: [6, 8]
|
166 |
+
model: ./Ninja-2B_JP
|
167 |
+
parameters:
|
168 |
+
density:
|
169 |
+
- filter: self_attn
|
170 |
+
value: 1.0
|
171 |
+
- filter: mlp
|
172 |
+
value: 1.0
|
173 |
+
- value: 1.0
|
174 |
+
gamma:
|
175 |
+
- filter: self_attn
|
176 |
+
value: 0.576775501046583
|
177 |
+
- filter: mlp
|
178 |
+
value: -0.046028636298718645
|
179 |
+
- value: -0.024161321403060265
|
180 |
+
weight:
|
181 |
+
- filter: self_attn
|
182 |
+
value: 0.833089842843994
|
183 |
+
- filter: mlp
|
184 |
+
value: 0.5434667434613458
|
185 |
+
- value: 0.2946693008513797
|
186 |
+
- layer_range: [6, 8]
|
187 |
+
model: ./Yosegi-0603
|
188 |
+
- sources:
|
189 |
+
- layer_range: [8, 10]
|
190 |
+
model: ./Yosegi-0601
|
191 |
+
parameters:
|
192 |
+
density:
|
193 |
+
- filter: self_attn
|
194 |
+
value: 1.0
|
195 |
+
- filter: mlp
|
196 |
+
value: 1.0
|
197 |
+
- value: 0.9930269337531187
|
198 |
+
gamma:
|
199 |
+
- filter: self_attn
|
200 |
+
value: 0.4549980941970383
|
201 |
+
- filter: mlp
|
202 |
+
value: 0.10362988739411173
|
203 |
+
- value: -0.43800391668559174
|
204 |
+
weight:
|
205 |
+
- filter: self_attn
|
206 |
+
value: 0.19663450954683193
|
207 |
+
- filter: mlp
|
208 |
+
value: 0.16783989984505265
|
209 |
+
- value: 0.7465091417598162
|
210 |
+
- layer_range: [8, 10]
|
211 |
+
model: ./Ninja-2B_JP
|
212 |
+
parameters:
|
213 |
+
density:
|
214 |
+
- filter: self_attn
|
215 |
+
value: 0.797370597380894
|
216 |
+
- filter: mlp
|
217 |
+
value: 1.0
|
218 |
+
- value: 1.0
|
219 |
+
gamma:
|
220 |
+
- filter: self_attn
|
221 |
+
value: -0.0665958634205702
|
222 |
+
- filter: mlp
|
223 |
+
value: -0.058297473060129834
|
224 |
+
- value: -0.38206760673090134
|
225 |
+
weight:
|
226 |
+
- filter: self_attn
|
227 |
+
value: 0.7015967347604024
|
228 |
+
- filter: mlp
|
229 |
+
value: 0.7733694864324641
|
230 |
+
- value: 0.7636921732342238
|
231 |
+
- layer_range: [8, 10]
|
232 |
+
model: ./Yosegi-0603
|
233 |
+
- sources:
|
234 |
+
- layer_range: [10, 12]
|
235 |
+
model: ./Yosegi-0601
|
236 |
+
parameters:
|
237 |
+
density:
|
238 |
+
- filter: self_attn
|
239 |
+
value: 0.8047576867589878
|
240 |
+
- filter: mlp
|
241 |
+
value: 0.8852533319203653
|
242 |
+
- value: 0.7707342647603538
|
243 |
+
gamma:
|
244 |
+
- filter: self_attn
|
245 |
+
value: -0.054343999574509694
|
246 |
+
- filter: mlp
|
247 |
+
value: -0.3465154355167133
|
248 |
+
- value: 0.022315854655582765
|
249 |
+
weight:
|
250 |
+
- filter: self_attn
|
251 |
+
value: 0.4396484757291151
|
252 |
+
- filter: mlp
|
253 |
+
value: 0.34318396468602314
|
254 |
+
- value: 0.8236034746664869
|
255 |
+
- layer_range: [10, 12]
|
256 |
+
model: ./Ninja-2B_JP
|
257 |
+
parameters:
|
258 |
+
density:
|
259 |
+
- filter: self_attn
|
260 |
+
value: 0.9058471193805165
|
261 |
+
- filter: mlp
|
262 |
+
value: 1.0
|
263 |
+
- value: 1.0
|
264 |
+
gamma:
|
265 |
+
- filter: self_attn
|
266 |
+
value: 0.1221058588826469
|
267 |
+
- filter: mlp
|
268 |
+
value: -0.4004985640890659
|
269 |
+
- value: 0.3219195440395816
|
270 |
+
weight:
|
271 |
+
- filter: self_attn
|
272 |
+
value: 0.3565443612269864
|
273 |
+
- filter: mlp
|
274 |
+
value: 0.2817057075232181
|
275 |
+
- value: 0.5934890337808251
|
276 |
+
- layer_range: [10, 12]
|
277 |
+
model: ./Yosegi-0603
|
278 |
+
- sources:
|
279 |
+
- layer_range: [12, 14]
|
280 |
+
model: ./Yosegi-0601
|
281 |
+
parameters:
|
282 |
+
density:
|
283 |
+
- filter: self_attn
|
284 |
+
value: 1.0
|
285 |
+
- filter: mlp
|
286 |
+
value: 1.0
|
287 |
+
- value: 1.0
|
288 |
+
gamma:
|
289 |
+
- filter: self_attn
|
290 |
+
value: -0.027897116191693133
|
291 |
+
- filter: mlp
|
292 |
+
value: -0.1765379388255607
|
293 |
+
- value: 0.09108936063176161
|
294 |
+
weight:
|
295 |
+
- filter: self_attn
|
296 |
+
value: 0.4499753137521779
|
297 |
+
- filter: mlp
|
298 |
+
value: 0.901296236087911
|
299 |
+
- value: 0.3548680126954006
|
300 |
+
- layer_range: [12, 14]
|
301 |
+
model: ./Ninja-2B_JP
|
302 |
+
parameters:
|
303 |
+
density:
|
304 |
+
- filter: self_attn
|
305 |
+
value: 0.8973815150776497
|
306 |
+
- filter: mlp
|
307 |
+
value: 0.6029953465961999
|
308 |
+
- value: 1.0
|
309 |
+
gamma:
|
310 |
+
- filter: self_attn
|
311 |
+
value: 0.10393082898402586
|
312 |
+
- filter: mlp
|
313 |
+
value: 0.15993577688878796
|
314 |
+
- value: 0.011410411917833683
|
315 |
+
weight:
|
316 |
+
- filter: self_attn
|
317 |
+
value: 0.2211644023056492
|
318 |
+
- filter: mlp
|
319 |
+
value: 0.5677387594231849
|
320 |
+
- value: 0.1316535663010981
|
321 |
+
- layer_range: [12, 14]
|
322 |
+
model: ./Yosegi-0603
|
323 |
+
- sources:
|
324 |
+
- layer_range: [14, 16]
|
325 |
+
model: ./Yosegi-0601
|
326 |
+
parameters:
|
327 |
+
density:
|
328 |
+
- filter: self_attn
|
329 |
+
value: 0.9584597245055072
|
330 |
+
- filter: mlp
|
331 |
+
value: 1.0
|
332 |
+
- value: 1.0
|
333 |
+
gamma:
|
334 |
+
- filter: self_attn
|
335 |
+
value: -0.17789727632680347
|
336 |
+
- filter: mlp
|
337 |
+
value: 0.2182263440314275
|
338 |
+
- value: 0.1449547656126498
|
339 |
+
weight:
|
340 |
+
- filter: self_attn
|
341 |
+
value: 0.4551004762874224
|
342 |
+
- filter: mlp
|
343 |
+
value: 0.9182082826762857
|
344 |
+
- value: 0.3736989395186422
|
345 |
+
- layer_range: [14, 16]
|
346 |
+
model: ./Ninja-2B_JP
|
347 |
+
parameters:
|
348 |
+
density:
|
349 |
+
- filter: self_attn
|
350 |
+
value: 0.7414465107848625
|
351 |
+
- filter: mlp
|
352 |
+
value: 1.0
|
353 |
+
- value: 0.7894887419395906
|
354 |
+
gamma:
|
355 |
+
- filter: self_attn
|
356 |
+
value: -0.07343933395880992
|
357 |
+
- filter: mlp
|
358 |
+
value: 0.250800731630588
|
359 |
+
- value: -0.2948778134297542
|
360 |
+
weight:
|
361 |
+
- filter: self_attn
|
362 |
+
value: 0.43125199001016495
|
363 |
+
- filter: mlp
|
364 |
+
value: 0.6182726353394477
|
365 |
+
- value: 0.838902157446268
|
366 |
+
- layer_range: [14, 16]
|
367 |
+
model: ./Yosegi-0603
|
368 |
+
- sources:
|
369 |
+
- layer_range: [16, 18]
|
370 |
+
model: ./Yosegi-0601
|
371 |
+
parameters:
|
372 |
+
density:
|
373 |
+
- filter: self_attn
|
374 |
+
value: 0.9474287877268394
|
375 |
+
- filter: mlp
|
376 |
+
value: 1.0
|
377 |
+
- value: 0.9613380133344519
|
378 |
+
gamma:
|
379 |
+
- filter: self_attn
|
380 |
+
value: -0.08608895546593046
|
381 |
+
- filter: mlp
|
382 |
+
value: -0.07275416053291164
|
383 |
+
- value: -0.5796137860399382
|
384 |
+
weight:
|
385 |
+
- filter: self_attn
|
386 |
+
value: 0.5593420897751296
|
387 |
+
- filter: mlp
|
388 |
+
value: 0.7339447992880666
|
389 |
+
- value: 0.5447558586689005
|
390 |
+
- layer_range: [16, 18]
|
391 |
+
model: ./Ninja-2B_JP
|
392 |
+
parameters:
|
393 |
+
density:
|
394 |
+
- filter: self_attn
|
395 |
+
value: 0.9321536960575384
|
396 |
+
- filter: mlp
|
397 |
+
value: 1.0
|
398 |
+
- value: 0.9613033408813294
|
399 |
+
gamma:
|
400 |
+
- filter: self_attn
|
401 |
+
value: 0.20610728738224296
|
402 |
+
- filter: mlp
|
403 |
+
value: 0.2002206706624053
|
404 |
+
- value: -0.45349278793293785
|
405 |
+
weight:
|
406 |
+
- filter: self_attn
|
407 |
+
value: 0.16162975594196963
|
408 |
+
- filter: mlp
|
409 |
+
value: 0.21262726992327483
|
410 |
+
- value: 0.061213622827234075
|
411 |
+
- layer_range: [16, 18]
|
412 |
+
model: ./Yosegi-0603
|
413 |
+
- sources:
|
414 |
+
- layer_range: [18, 20]
|
415 |
+
model: ./Yosegi-0601
|
416 |
+
parameters:
|
417 |
+
density:
|
418 |
+
- filter: self_attn
|
419 |
+
value: 1.0
|
420 |
+
- filter: mlp
|
421 |
+
value: 1.0
|
422 |
+
- value: 1.0
|
423 |
+
gamma:
|
424 |
+
- filter: self_attn
|
425 |
+
value: 0.03922456593148313
|
426 |
+
- filter: mlp
|
427 |
+
value: 0.3318035822806869
|
428 |
+
- value: -0.10373990685028205
|
429 |
+
weight:
|
430 |
+
- filter: self_attn
|
431 |
+
value: 0.8254441016674987
|
432 |
+
- filter: mlp
|
433 |
+
value: 0.4568039342431161
|
434 |
+
- value: 0.3152648515747969
|
435 |
+
- layer_range: [18, 20]
|
436 |
+
model: ./Ninja-2B_JP
|
437 |
+
parameters:
|
438 |
+
density:
|
439 |
+
- filter: self_attn
|
440 |
+
value: 1.0
|
441 |
+
- filter: mlp
|
442 |
+
value: 1.0
|
443 |
+
- value: 0.9807358937293073
|
444 |
+
gamma:
|
445 |
+
- filter: self_attn
|
446 |
+
value: -0.22734036563128657
|
447 |
+
- filter: mlp
|
448 |
+
value: 0.26113222150270854
|
449 |
+
- value: 0.17739039022957015
|
450 |
+
weight:
|
451 |
+
- filter: self_attn
|
452 |
+
value: 0.33759130475641996
|
453 |
+
- filter: mlp
|
454 |
+
value: 0.616639215544168
|
455 |
+
- value: 0.47560658618977714
|
456 |
+
- layer_range: [18, 20]
|
457 |
+
model: ./Yosegi-0603
|
458 |
+
- sources:
|
459 |
+
- layer_range: [20, 22]
|
460 |
+
model: ./Yosegi-0601
|
461 |
+
parameters:
|
462 |
+
density:
|
463 |
+
- filter: self_attn
|
464 |
+
value: 0.9394514442960196
|
465 |
+
- filter: mlp
|
466 |
+
value: 1.0
|
467 |
+
- value: 0.9885037757465567
|
468 |
+
gamma:
|
469 |
+
- filter: self_attn
|
470 |
+
value: -0.17365709450334324
|
471 |
+
- filter: mlp
|
472 |
+
value: 0.0712279381144505
|
473 |
+
- value: 0.11809665485306464
|
474 |
+
weight:
|
475 |
+
- filter: self_attn
|
476 |
+
value: 0.485610337254665
|
477 |
+
- filter: mlp
|
478 |
+
value: 0.8406593173801935
|
479 |
+
- value: 0.5024102481819739
|
480 |
+
- layer_range: [20, 22]
|
481 |
+
model: ./Ninja-2B_JP
|
482 |
+
parameters:
|
483 |
+
density:
|
484 |
+
- filter: self_attn
|
485 |
+
value: 1.0
|
486 |
+
- filter: mlp
|
487 |
+
value: 1.0
|
488 |
+
- value: 1.0
|
489 |
+
gamma:
|
490 |
+
- filter: self_attn
|
491 |
+
value: -0.09980202641768818
|
492 |
+
- filter: mlp
|
493 |
+
value: 0.051454493742856926
|
494 |
+
- value: 0.14619126408666103
|
495 |
+
weight:
|
496 |
+
- filter: self_attn
|
497 |
+
value: 0.54772456079406
|
498 |
+
- filter: mlp
|
499 |
+
value: 0.3440893571099615
|
500 |
+
- value: 0.3747271233512448
|
501 |
+
- layer_range: [20, 22]
|
502 |
+
model: ./Yosegi-0603
|
503 |
+
- sources:
|
504 |
+
- layer_range: [22, 24]
|
505 |
+
model: ./Yosegi-0601
|
506 |
+
parameters:
|
507 |
+
density:
|
508 |
+
- filter: self_attn
|
509 |
+
value: 1.0
|
510 |
+
- filter: mlp
|
511 |
+
value: 0.9474712362889293
|
512 |
+
- value: 1.0
|
513 |
+
gamma:
|
514 |
+
- filter: self_attn
|
515 |
+
value: -0.16020032978118146
|
516 |
+
- filter: mlp
|
517 |
+
value: -0.025085248873309034
|
518 |
+
- value: 0.06046174910893976
|
519 |
+
weight:
|
520 |
+
- filter: self_attn
|
521 |
+
value: 0.8654189362345427
|
522 |
+
- filter: mlp
|
523 |
+
value: 0.6344956382288498
|
524 |
+
- value: 0.6383979001549549
|
525 |
+
- layer_range: [22, 24]
|
526 |
+
model: ./Ninja-2B_JP
|
527 |
+
parameters:
|
528 |
+
density:
|
529 |
+
- filter: self_attn
|
530 |
+
value: 0.8240762427167851
|
531 |
+
- filter: mlp
|
532 |
+
value: 1.0
|
533 |
+
- value: 0.9004913821398048
|
534 |
+
gamma:
|
535 |
+
- filter: self_attn
|
536 |
+
value: -0.12224186789525764
|
537 |
+
- filter: mlp
|
538 |
+
value: -0.25877585460700525
|
539 |
+
- value: 0.35149388360871714
|
540 |
+
weight:
|
541 |
+
- filter: self_attn
|
542 |
+
value: 0.4294356408713786
|
543 |
+
- filter: mlp
|
544 |
+
value: 0.3920647298630233
|
545 |
+
- value: 0.795891295390721
|
546 |
+
- layer_range: [22, 24]
|
547 |
+
model: ./Yosegi-0603
|
548 |
+
- sources:
|
549 |
+
- layer_range: [24, 26]
|
550 |
+
model: ./Yosegi-0601
|
551 |
+
parameters:
|
552 |
+
density:
|
553 |
+
- filter: self_attn
|
554 |
+
value: 1.0
|
555 |
+
- filter: mlp
|
556 |
+
value: 1.0
|
557 |
+
- value: 1.0
|
558 |
+
gamma:
|
559 |
+
- filter: self_attn
|
560 |
+
value: 0.16915580088030202
|
561 |
+
- filter: mlp
|
562 |
+
value: 0.2602652727555053
|
563 |
+
- value: 0.16985672723305376
|
564 |
+
weight:
|
565 |
+
- filter: self_attn
|
566 |
+
value: 0.420377024485687
|
567 |
+
- filter: mlp
|
568 |
+
value: 0.3401141209432324
|
569 |
+
- value: 0.4953511256159331
|
570 |
+
- layer_range: [24, 26]
|
571 |
+
model: ./Ninja-2B_JP
|
572 |
+
parameters:
|
573 |
+
density:
|
574 |
+
- filter: self_attn
|
575 |
+
value: 0.7290652609253236
|
576 |
+
- filter: mlp
|
577 |
+
value: 1.0
|
578 |
+
- value: 1.0
|
579 |
+
gamma:
|
580 |
+
- filter: self_attn
|
581 |
+
value: -0.1039167464696765
|
582 |
+
- filter: mlp
|
583 |
+
value: -0.18476572570059685
|
584 |
+
- value: 0.1221387313921081
|
585 |
+
weight:
|
586 |
+
- filter: self_attn
|
587 |
+
value: 0.2925002157134928
|
588 |
+
- filter: mlp
|
589 |
+
value: 0.3854740639588027
|
590 |
+
- value: 0.555448110317977
|
591 |
+
- layer_range: [24, 26]
|
592 |
+
model: ./Yosegi-0603
|
593 |
+
- sources:
|
594 |
+
- layer_range: [26, 28]
|
595 |
+
model: ./Yosegi-0601
|
596 |
+
parameters:
|
597 |
+
density:
|
598 |
+
- filter: self_attn
|
599 |
+
value: 1.0
|
600 |
+
- filter: mlp
|
601 |
+
value: 0.9104496350690235
|
602 |
+
- value: 1.0
|
603 |
+
gamma:
|
604 |
+
- filter: self_attn
|
605 |
+
value: 0.24831264214235005
|
606 |
+
- filter: mlp
|
607 |
+
value: -0.03903149241855605
|
608 |
+
- value: 0.14189425093398259
|
609 |
+
weight:
|
610 |
+
- filter: self_attn
|
611 |
+
value: 0.7685811138035815
|
612 |
+
- filter: mlp
|
613 |
+
value: 0.06535011571274918
|
614 |
+
- value: 0.696502559577317
|
615 |
+
- layer_range: [26, 28]
|
616 |
+
model: ./Ninja-2B_JP
|
617 |
+
parameters:
|
618 |
+
density:
|
619 |
+
- filter: self_attn
|
620 |
+
value: 0.9236218028490522
|
621 |
+
- filter: mlp
|
622 |
+
value: 1.0
|
623 |
+
- value: 1.0
|
624 |
+
gamma:
|
625 |
+
- filter: self_attn
|
626 |
+
value: -0.2451400735890047
|
627 |
+
- filter: mlp
|
628 |
+
value: -0.21555851418482214
|
629 |
+
- value: 0.020418471695148876
|
630 |
+
weight:
|
631 |
+
- filter: self_attn
|
632 |
+
value: 0.451368534421561
|
633 |
+
- filter: mlp
|
634 |
+
value: 0.27412879847687055
|
635 |
+
- value: 0.18339776770537336
|
636 |
+
- layer_range: [26, 28]
|
637 |
+
model: ./Yosegi-0603
|
638 |
+
- sources:
|
639 |
+
- layer_range: [28, 30]
|
640 |
+
model: ./Yosegi-0601
|
641 |
+
parameters:
|
642 |
+
density:
|
643 |
+
- filter: self_attn
|
644 |
+
value: 0.8590812961904566
|
645 |
+
- filter: mlp
|
646 |
+
value: 1.0
|
647 |
+
- value: 1.0
|
648 |
+
gamma:
|
649 |
+
- filter: self_attn
|
650 |
+
value: -0.06934549536310654
|
651 |
+
- filter: mlp
|
652 |
+
value: -0.28464693250998063
|
653 |
+
- value: -0.0588491947891552
|
654 |
+
weight:
|
655 |
+
- filter: self_attn
|
656 |
+
value: 0.26716389671655294
|
657 |
+
- filter: mlp
|
658 |
+
value: 0.8228280162386532
|
659 |
+
- value: 0.24197568479527135
|
660 |
+
- layer_range: [28, 30]
|
661 |
+
model: ./Ninja-2B_JP
|
662 |
+
parameters:
|
663 |
+
density:
|
664 |
+
- filter: self_attn
|
665 |
+
value: 0.7277181780542642
|
666 |
+
- filter: mlp
|
667 |
+
value: 0.74166025738732
|
668 |
+
- value: 1.0
|
669 |
+
gamma:
|
670 |
+
- filter: self_attn
|
671 |
+
value: 0.1772650150670655
|
672 |
+
- filter: mlp
|
673 |
+
value: 0.06545031487123437
|
674 |
+
- value: -0.28681451125993446
|
675 |
+
weight:
|
676 |
+
- filter: self_attn
|
677 |
+
value: 0.5781944040541174
|
678 |
+
- filter: mlp
|
679 |
+
value: 0.2288692970435767
|
680 |
+
- value: 0.689751088930503
|
681 |
+
- layer_range: [28, 30]
|
682 |
+
model: ./Yosegi-0603
|
683 |
+
- sources:
|
684 |
+
- layer_range: [30, 32]
|
685 |
+
model: ./Yosegi-0601
|
686 |
+
parameters:
|
687 |
+
density:
|
688 |
+
- filter: self_attn
|
689 |
+
value: 0.8177341862620365
|
690 |
+
- filter: mlp
|
691 |
+
value: 0.8875629677599377
|
692 |
+
- value: 1.0
|
693 |
+
gamma:
|
694 |
+
- filter: self_attn
|
695 |
+
value: -0.06572527259889459
|
696 |
+
- filter: mlp
|
697 |
+
value: -0.18979543285938766
|
698 |
+
- value: -0.24122036571646263
|
699 |
+
weight:
|
700 |
+
- filter: self_attn
|
701 |
+
value: 0.5818433594657613
|
702 |
+
- filter: mlp
|
703 |
+
value: 0.36676821100234736
|
704 |
+
- value: 0.3580688869263428
|
705 |
+
- layer_range: [30, 32]
|
706 |
+
model: ./Ninja-2B_JP
|
707 |
+
parameters:
|
708 |
+
density:
|
709 |
+
- filter: self_attn
|
710 |
+
value: 0.8306036003344672
|
711 |
+
- filter: mlp
|
712 |
+
value: 0.6993970248745297
|
713 |
+
- value: 1.0
|
714 |
+
gamma:
|
715 |
+
- filter: self_attn
|
716 |
+
value: -0.20599853236581384
|
717 |
+
- filter: mlp
|
718 |
+
value: -0.2001187634455465
|
719 |
+
- value: -0.07654635090020837
|
720 |
+
weight:
|
721 |
+
- filter: self_attn
|
722 |
+
value: 0.37120677279712305
|
723 |
+
- filter: mlp
|
724 |
+
value: 0.13105486609905853
|
725 |
+
- value: 0.7204857820148367
|
726 |
+
- layer_range: [30, 32]
|
727 |
+
model: ./Yosegi-0603
|
728 |
+
tokenizer_source: union
|
model-00001-of-00001.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d194d368edc66d7fd2d496a4bd92092485a61b780e0cb6d7b4bfbcd50cefd682
|
3 |
+
size 14483498224
|
model.safetensors.index.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"metadata": {"mergekit_version": "0.0.4.2", "total_size": 14483464192}, "weight_map": {"lm_head.weight": "model-00001-of-00001.safetensors", "model.embed_tokens.weight": "model-00001-of-00001.safetensors", "model.layers.0.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.0.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.1.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.10.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", 
"model.layers.10.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.11.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.12.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.13.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.mlp.up_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.14.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.15.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.16.mlp.down_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.17.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.18.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.18.self_attn.v_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.19.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.19.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.2.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.20.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.20.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.21.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.21.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.22.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.22.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.23.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", 
"model.layers.23.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.23.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.24.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.24.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.24.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.25.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.25.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.25.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.26.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.26.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.26.mlp.up_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.26.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.26.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.26.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.27.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.27.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.27.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.28.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.28.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.28.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.29.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.29.mlp.down_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.29.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.29.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.29.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.29.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.3.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.30.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.30.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.30.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.31.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.31.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.31.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.31.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.31.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.31.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.31.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.31.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.31.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.4.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.5.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.5.self_attn.q_proj.weight": 
"model-00001-of-00001.safetensors", "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.6.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.7.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.8.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", 
"model.layers.8.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.input_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.9.mlp.down_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.mlp.up_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00001.safetensors", "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00001.safetensors", "model.norm.weight": "model-00001-of-00001.safetensors"}}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"unk_token": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
}
|
23 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_bos_token": true,
|
3 |
+
"add_eos_token": false,
|
4 |
+
"added_tokens_decoder": {
|
5 |
+
"0": {
|
6 |
+
"content": "<unk>",
|
7 |
+
"lstrip": false,
|
8 |
+
"normalized": false,
|
9 |
+
"rstrip": false,
|
10 |
+
"single_word": false,
|
11 |
+
"special": false
|
12 |
+
},
|
13 |
+
"1": {
|
14 |
+
"content": "<s>",
|
15 |
+
"lstrip": false,
|
16 |
+
"normalized": false,
|
17 |
+
"rstrip": false,
|
18 |
+
"single_word": false,
|
19 |
+
"special": false
|
20 |
+
},
|
21 |
+
"2": {
|
22 |
+
"content": "</s>",
|
23 |
+
"lstrip": false,
|
24 |
+
"normalized": false,
|
25 |
+
"rstrip": false,
|
26 |
+
"single_word": false,
|
27 |
+
"special": false
|
28 |
+
}
|
29 |
+
},
|
30 |
+
"additional_special_tokens": [],
|
31 |
+
"bos_token": "<s>",
|
32 |
+
"clean_up_tokenization_spaces": false,
|
33 |
+
"eos_token": "</s>",
|
34 |
+
"legacy": true,
|
35 |
+
"model_max_length": 1000000000000000019884624838656,
|
36 |
+
"pad_token": null,
|
37 |
+
"padding_side": "left",
|
38 |
+
"sp_model_kwargs": {},
|
39 |
+
"spaces_between_special_tokens": false,
|
40 |
+
"split_special_tokens": false,
|
41 |
+
"tokenizer_class": "LlamaTokenizer",
|
42 |
+
"unk_token": "<unk>",
|
43 |
+
"use_default_system_prompt": true
|
44 |
+
}
|