apepkuss79 commited on
Commit
88f02e3
1 Parent(s): 2ffb78e

Update models

Browse files
.gitattributes CHANGED
@@ -33,3 +33,20 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Llama-3_1-Nemotron-51B-Instruct-Q2_K.gguf filter=lfs diff=lfs merge=lfs -text
37
+ Llama-3_1-Nemotron-51B-Instruct-Q3_K_L.gguf filter=lfs diff=lfs merge=lfs -text
38
+ Llama-3_1-Nemotron-51B-Instruct-Q3_K_M.gguf filter=lfs diff=lfs merge=lfs -text
39
+ Llama-3_1-Nemotron-51B-Instruct-Q3_K_S.gguf filter=lfs diff=lfs merge=lfs -text
40
+ Llama-3_1-Nemotron-51B-Instruct-Q4_0.gguf filter=lfs diff=lfs merge=lfs -text
41
+ Llama-3_1-Nemotron-51B-Instruct-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text
42
+ Llama-3_1-Nemotron-51B-Instruct-Q4_K_S.gguf filter=lfs diff=lfs merge=lfs -text
43
+ Llama-3_1-Nemotron-51B-Instruct-Q5_0.gguf filter=lfs diff=lfs merge=lfs -text
44
+ Llama-3_1-Nemotron-51B-Instruct-Q5_K_M.gguf filter=lfs diff=lfs merge=lfs -text
45
+ Llama-3_1-Nemotron-51B-Instruct-Q5_K_S.gguf filter=lfs diff=lfs merge=lfs -text
46
+ Llama-3_1-Nemotron-51B-Instruct-Q6_K.gguf filter=lfs diff=lfs merge=lfs -text
47
+ Llama-3_1-Nemotron-51B-Instruct-Q8_0-00001-of-00002.gguf filter=lfs diff=lfs merge=lfs -text
48
+ Llama-3_1-Nemotron-51B-Instruct-Q8_0-00002-of-00002.gguf filter=lfs diff=lfs merge=lfs -text
49
+ Llama-3_1-Nemotron-51B-Instruct-f16-00001-of-00004.gguf filter=lfs diff=lfs merge=lfs -text
50
+ Llama-3_1-Nemotron-51B-Instruct-f16-00002-of-00004.gguf filter=lfs diff=lfs merge=lfs -text
51
+ Llama-3_1-Nemotron-51B-Instruct-f16-00003-of-00004.gguf filter=lfs diff=lfs merge=lfs -text
52
+ Llama-3_1-Nemotron-51B-Instruct-f16-00004-of-00004.gguf filter=lfs diff=lfs merge=lfs -text
Llama-3_1-Nemotron-51B-Instruct-Q2_K.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f1d6ad4df6fa589194203235d19bb31315cd73beb3ed2dffb02a6eb1d404ac9
3
+ size 19418642464
Llama-3_1-Nemotron-51B-Instruct-Q3_K_L.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbe29ae0b25627ac8631a34ca6f11d572ae4a0bd6c1f8650f960a0faf7fe69b0
3
+ size 27349751840
Llama-3_1-Nemotron-51B-Instruct-Q3_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8595b234baa5fc3dc14db611793cedcdb3ab73bcc421bacc46a15a4ba6bb57b
3
+ size 25182345248
Llama-3_1-Nemotron-51B-Instruct-Q3_K_S.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2b0a9330eafc8fb6cde9285e7cd1c9b9a081e4ecd68f02fcb8e3b1459eaa68
3
+ size 22652393504
Llama-3_1-Nemotron-51B-Instruct-Q4_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:624711249056e4dc79bda0090015f3ed0a44fe7ca35a2a91579efe90b92908aa
3
+ size 29252368416
Llama-3_1-Nemotron-51B-Instruct-Q4_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db50b770b4cc7918f1a25f204609c4c478e41c2f10499ec900259cb052948c11
3
+ size 31037306912
Llama-3_1-Nemotron-51B-Instruct-Q4_K_S.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:041d5660e8f40b1fe28c65c522e92814cb855426187aae81ca9359ca10e46df5
3
+ size 29484496928
Llama-3_1-Nemotron-51B-Instruct-Q5_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f6baa9e8ba28870f76093655dcf04b9a4439a8edf4b51efc4725c44684d1c24
3
+ size 35558504480
Llama-3_1-Nemotron-51B-Instruct-Q5_K_M.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e19d5f2172cb51bcb3dd22391cebaf4c7f0210c562200ae671d8640e115de36
3
+ size 36465391648
Llama-3_1-Nemotron-51B-Instruct-Q5_K_S.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:906aefa4926bec89d3aff281db866bb02e4ed79792566b28cc3d6a8bebcb5c79
3
+ size 35558504480
Llama-3_1-Nemotron-51B-Instruct-Q6_K.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d66d3daee89d9e2596841f3589b3268ef060aef1513070a5ce390ce00adf12f
3
+ size 42258774048
Llama-3_1-Nemotron-51B-Instruct-Q8_0-00001-of-00002.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ed2a1ad93790fcfe630e5755f5ce18d9fd7a04808e36343f3b000bde61ffa55
3
+ size 29826134528
Llama-3_1-Nemotron-51B-Instruct-Q8_0-00002-of-00002.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f20fc51b96935e0e17daee28f9694ef85cc7f66faabf3ea0d2a5f126e958fbf9
3
+ size 24905238240
Llama-3_1-Nemotron-51B-Instruct-f16-00001-of-00004.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c29966a7c3b2172525e4b9b29de9391ac1507476e26ae09486fb0953501c5c47
3
+ size 29772092192
Llama-3_1-Nemotron-51B-Instruct-f16-00002-of-00004.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67d59882c28b27005f0551e3ece2e49c37a9096022f08b1cb228f8cefc09e75c
3
+ size 29562578048
Llama-3_1-Nemotron-51B-Instruct-f16-00003-of-00004.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:534c2dd1ee26a2fbf3979582b0fdafa9bd01e9d3f0eb7d78a506ab10f5bb0897
3
+ size 29596988128
Llama-3_1-Nemotron-51B-Instruct-f16-00004-of-00004.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82558d958d66febc358278b3c96b4479e2c002dd4b7babe49cbf111cc05813f3
3
+ size 14080741216
config.json ADDED
@@ -0,0 +1,1004 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DeciLMForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_decilm.DeciLMConfig",
9
+ "AutoModelForCausalLM": "modeling_decilm.DeciLMForCausalLM"
10
+ },
11
+ "block_configs": [
12
+ {
13
+ "attention": {
14
+ "n_heads_in_group": 8,
15
+ "no_op": false,
16
+ "replace_with_linear": false
17
+ },
18
+ "ffn": {
19
+ "ffn_mult": 1.3125,
20
+ "no_op": false,
21
+ "replace_with_linear": false
22
+ }
23
+ },
24
+ {
25
+ "attention": {
26
+ "n_heads_in_group": 16,
27
+ "no_op": false,
28
+ "replace_with_linear": false
29
+ },
30
+ "ffn": {
31
+ "ffn_mult": 2.625,
32
+ "no_op": false,
33
+ "replace_with_linear": false
34
+ }
35
+ },
36
+ {
37
+ "attention": {
38
+ "n_heads_in_group": 8,
39
+ "no_op": false,
40
+ "replace_with_linear": false
41
+ },
42
+ "ffn": {
43
+ "ffn_mult": 5.25,
44
+ "no_op": false,
45
+ "replace_with_linear": false
46
+ }
47
+ },
48
+ {
49
+ "attention": {
50
+ "n_heads_in_group": 8,
51
+ "no_op": false,
52
+ "replace_with_linear": false
53
+ },
54
+ "ffn": {
55
+ "ffn_mult": 5.25,
56
+ "no_op": false,
57
+ "replace_with_linear": false
58
+ }
59
+ },
60
+ {
61
+ "attention": {
62
+ "n_heads_in_group": 8,
63
+ "no_op": false,
64
+ "replace_with_linear": false
65
+ },
66
+ "ffn": {
67
+ "ffn_mult": 5.25,
68
+ "no_op": false,
69
+ "replace_with_linear": false
70
+ }
71
+ },
72
+ {
73
+ "attention": {
74
+ "n_heads_in_group": 32,
75
+ "no_op": false,
76
+ "replace_with_linear": false
77
+ },
78
+ "ffn": {
79
+ "ffn_mult": 2.625,
80
+ "no_op": false,
81
+ "replace_with_linear": false
82
+ }
83
+ },
84
+ {
85
+ "attention": {
86
+ "n_heads_in_group": 32,
87
+ "no_op": false,
88
+ "replace_with_linear": false
89
+ },
90
+ "ffn": {
91
+ "ffn_mult": 2.625,
92
+ "no_op": false,
93
+ "replace_with_linear": false
94
+ }
95
+ },
96
+ {
97
+ "attention": {
98
+ "n_heads_in_group": 64,
99
+ "no_op": false,
100
+ "replace_with_linear": false
101
+ },
102
+ "ffn": {
103
+ "ffn_mult": 2.625,
104
+ "no_op": false,
105
+ "replace_with_linear": false
106
+ }
107
+ },
108
+ {
109
+ "attention": {
110
+ "n_heads_in_group": 64,
111
+ "no_op": false,
112
+ "replace_with_linear": false
113
+ },
114
+ "ffn": {
115
+ "ffn_mult": 2.625,
116
+ "no_op": false,
117
+ "replace_with_linear": false
118
+ }
119
+ },
120
+ {
121
+ "attention": {
122
+ "n_heads_in_group": 32,
123
+ "no_op": false,
124
+ "replace_with_linear": false
125
+ },
126
+ "ffn": {
127
+ "ffn_mult": 2.625,
128
+ "no_op": false,
129
+ "replace_with_linear": false
130
+ }
131
+ },
132
+ {
133
+ "attention": {
134
+ "n_heads_in_group": 32,
135
+ "no_op": false,
136
+ "replace_with_linear": false
137
+ },
138
+ "ffn": {
139
+ "ffn_mult": 2.625,
140
+ "no_op": false,
141
+ "replace_with_linear": false
142
+ }
143
+ },
144
+ {
145
+ "attention": {
146
+ "n_heads_in_group": null,
147
+ "no_op": false,
148
+ "replace_with_linear": true
149
+ },
150
+ "ffn": {
151
+ "ffn_mult": 2.625,
152
+ "no_op": false,
153
+ "replace_with_linear": false
154
+ }
155
+ },
156
+ {
157
+ "attention": {
158
+ "n_heads_in_group": 64,
159
+ "no_op": false,
160
+ "replace_with_linear": false
161
+ },
162
+ "ffn": {
163
+ "ffn_mult": 2.625,
164
+ "no_op": false,
165
+ "replace_with_linear": false
166
+ }
167
+ },
168
+ {
169
+ "attention": {
170
+ "n_heads_in_group": 32,
171
+ "no_op": false,
172
+ "replace_with_linear": false
173
+ },
174
+ "ffn": {
175
+ "ffn_mult": 2.625,
176
+ "no_op": false,
177
+ "replace_with_linear": false
178
+ }
179
+ },
180
+ {
181
+ "attention": {
182
+ "n_heads_in_group": 32,
183
+ "no_op": false,
184
+ "replace_with_linear": false
185
+ },
186
+ "ffn": {
187
+ "ffn_mult": 2.625,
188
+ "no_op": false,
189
+ "replace_with_linear": false
190
+ }
191
+ },
192
+ {
193
+ "attention": {
194
+ "n_heads_in_group": null,
195
+ "no_op": false,
196
+ "replace_with_linear": true
197
+ },
198
+ "ffn": {
199
+ "ffn_mult": 1.3125,
200
+ "no_op": false,
201
+ "replace_with_linear": false
202
+ }
203
+ },
204
+ {
205
+ "attention": {
206
+ "n_heads_in_group": 8,
207
+ "no_op": false,
208
+ "replace_with_linear": false
209
+ },
210
+ "ffn": {
211
+ "ffn_mult": 5.25,
212
+ "no_op": false,
213
+ "replace_with_linear": false
214
+ }
215
+ },
216
+ {
217
+ "attention": {
218
+ "n_heads_in_group": 8,
219
+ "no_op": false,
220
+ "replace_with_linear": false
221
+ },
222
+ "ffn": {
223
+ "ffn_mult": 5.25,
224
+ "no_op": false,
225
+ "replace_with_linear": false
226
+ }
227
+ },
228
+ {
229
+ "attention": {
230
+ "n_heads_in_group": 8,
231
+ "no_op": false,
232
+ "replace_with_linear": false
233
+ },
234
+ "ffn": {
235
+ "ffn_mult": 5.25,
236
+ "no_op": false,
237
+ "replace_with_linear": false
238
+ }
239
+ },
240
+ {
241
+ "attention": {
242
+ "n_heads_in_group": 8,
243
+ "no_op": false,
244
+ "replace_with_linear": false
245
+ },
246
+ "ffn": {
247
+ "ffn_mult": 5.25,
248
+ "no_op": false,
249
+ "replace_with_linear": false
250
+ }
251
+ },
252
+ {
253
+ "attention": {
254
+ "n_heads_in_group": 8,
255
+ "no_op": false,
256
+ "replace_with_linear": false
257
+ },
258
+ "ffn": {
259
+ "ffn_mult": 5.25,
260
+ "no_op": false,
261
+ "replace_with_linear": false
262
+ }
263
+ },
264
+ {
265
+ "attention": {
266
+ "n_heads_in_group": 8,
267
+ "no_op": false,
268
+ "replace_with_linear": false
269
+ },
270
+ "ffn": {
271
+ "ffn_mult": 5.25,
272
+ "no_op": false,
273
+ "replace_with_linear": false
274
+ }
275
+ },
276
+ {
277
+ "attention": {
278
+ "n_heads_in_group": 8,
279
+ "no_op": false,
280
+ "replace_with_linear": false
281
+ },
282
+ "ffn": {
283
+ "ffn_mult": 5.25,
284
+ "no_op": false,
285
+ "replace_with_linear": false
286
+ }
287
+ },
288
+ {
289
+ "attention": {
290
+ "n_heads_in_group": 8,
291
+ "no_op": false,
292
+ "replace_with_linear": false
293
+ },
294
+ "ffn": {
295
+ "ffn_mult": 5.25,
296
+ "no_op": false,
297
+ "replace_with_linear": false
298
+ }
299
+ },
300
+ {
301
+ "attention": {
302
+ "n_heads_in_group": 8,
303
+ "no_op": false,
304
+ "replace_with_linear": false
305
+ },
306
+ "ffn": {
307
+ "ffn_mult": 5.25,
308
+ "no_op": false,
309
+ "replace_with_linear": false
310
+ }
311
+ },
312
+ {
313
+ "attention": {
314
+ "n_heads_in_group": 8,
315
+ "no_op": false,
316
+ "replace_with_linear": false
317
+ },
318
+ "ffn": {
319
+ "ffn_mult": 5.25,
320
+ "no_op": false,
321
+ "replace_with_linear": false
322
+ }
323
+ },
324
+ {
325
+ "attention": {
326
+ "n_heads_in_group": 8,
327
+ "no_op": false,
328
+ "replace_with_linear": false
329
+ },
330
+ "ffn": {
331
+ "ffn_mult": 5.25,
332
+ "no_op": false,
333
+ "replace_with_linear": false
334
+ }
335
+ },
336
+ {
337
+ "attention": {
338
+ "n_heads_in_group": 8,
339
+ "no_op": false,
340
+ "replace_with_linear": false
341
+ },
342
+ "ffn": {
343
+ "ffn_mult": 5.25,
344
+ "no_op": false,
345
+ "replace_with_linear": false
346
+ }
347
+ },
348
+ {
349
+ "attention": {
350
+ "n_heads_in_group": 8,
351
+ "no_op": false,
352
+ "replace_with_linear": false
353
+ },
354
+ "ffn": {
355
+ "ffn_mult": 5.25,
356
+ "no_op": false,
357
+ "replace_with_linear": false
358
+ }
359
+ },
360
+ {
361
+ "attention": {
362
+ "n_heads_in_group": 8,
363
+ "no_op": false,
364
+ "replace_with_linear": false
365
+ },
366
+ "ffn": {
367
+ "ffn_mult": 5.25,
368
+ "no_op": false,
369
+ "replace_with_linear": false
370
+ }
371
+ },
372
+ {
373
+ "attention": {
374
+ "n_heads_in_group": 8,
375
+ "no_op": false,
376
+ "replace_with_linear": false
377
+ },
378
+ "ffn": {
379
+ "ffn_mult": 5.25,
380
+ "no_op": false,
381
+ "replace_with_linear": false
382
+ }
383
+ },
384
+ {
385
+ "attention": {
386
+ "n_heads_in_group": 8,
387
+ "no_op": false,
388
+ "replace_with_linear": false
389
+ },
390
+ "ffn": {
391
+ "ffn_mult": 5.25,
392
+ "no_op": false,
393
+ "replace_with_linear": false
394
+ }
395
+ },
396
+ {
397
+ "attention": {
398
+ "n_heads_in_group": 8,
399
+ "no_op": false,
400
+ "replace_with_linear": false
401
+ },
402
+ "ffn": {
403
+ "ffn_mult": 5.25,
404
+ "no_op": false,
405
+ "replace_with_linear": false
406
+ }
407
+ },
408
+ {
409
+ "attention": {
410
+ "n_heads_in_group": 8,
411
+ "no_op": false,
412
+ "replace_with_linear": false
413
+ },
414
+ "ffn": {
415
+ "ffn_mult": 5.25,
416
+ "no_op": false,
417
+ "replace_with_linear": false
418
+ }
419
+ },
420
+ {
421
+ "attention": {
422
+ "n_heads_in_group": 8,
423
+ "no_op": false,
424
+ "replace_with_linear": false
425
+ },
426
+ "ffn": {
427
+ "ffn_mult": 5.25,
428
+ "no_op": false,
429
+ "replace_with_linear": false
430
+ }
431
+ },
432
+ {
433
+ "attention": {
434
+ "n_heads_in_group": 8,
435
+ "no_op": false,
436
+ "replace_with_linear": false
437
+ },
438
+ "ffn": {
439
+ "ffn_mult": 5.25,
440
+ "no_op": false,
441
+ "replace_with_linear": false
442
+ }
443
+ },
444
+ {
445
+ "attention": {
446
+ "n_heads_in_group": 8,
447
+ "no_op": false,
448
+ "replace_with_linear": false
449
+ },
450
+ "ffn": {
451
+ "ffn_mult": 5.25,
452
+ "no_op": false,
453
+ "replace_with_linear": false
454
+ }
455
+ },
456
+ {
457
+ "attention": {
458
+ "n_heads_in_group": 8,
459
+ "no_op": false,
460
+ "replace_with_linear": false
461
+ },
462
+ "ffn": {
463
+ "ffn_mult": 5.25,
464
+ "no_op": false,
465
+ "replace_with_linear": false
466
+ }
467
+ },
468
+ {
469
+ "attention": {
470
+ "n_heads_in_group": 8,
471
+ "no_op": false,
472
+ "replace_with_linear": false
473
+ },
474
+ "ffn": {
475
+ "ffn_mult": 5.25,
476
+ "no_op": false,
477
+ "replace_with_linear": false
478
+ }
479
+ },
480
+ {
481
+ "attention": {
482
+ "n_heads_in_group": 8,
483
+ "no_op": false,
484
+ "replace_with_linear": false
485
+ },
486
+ "ffn": {
487
+ "ffn_mult": 5.25,
488
+ "no_op": false,
489
+ "replace_with_linear": false
490
+ }
491
+ },
492
+ {
493
+ "attention": {
494
+ "n_heads_in_group": 8,
495
+ "no_op": false,
496
+ "replace_with_linear": false
497
+ },
498
+ "ffn": {
499
+ "ffn_mult": 5.25,
500
+ "no_op": false,
501
+ "replace_with_linear": false
502
+ }
503
+ },
504
+ {
505
+ "attention": {
506
+ "n_heads_in_group": 8,
507
+ "no_op": false,
508
+ "replace_with_linear": false
509
+ },
510
+ "ffn": {
511
+ "ffn_mult": 5.25,
512
+ "no_op": false,
513
+ "replace_with_linear": false
514
+ }
515
+ },
516
+ {
517
+ "attention": {
518
+ "n_heads_in_group": null,
519
+ "no_op": false,
520
+ "replace_with_linear": true
521
+ },
522
+ "ffn": {
523
+ "ffn_mult": 2.625,
524
+ "no_op": false,
525
+ "replace_with_linear": false
526
+ }
527
+ },
528
+ {
529
+ "attention": {
530
+ "n_heads_in_group": 8,
531
+ "no_op": false,
532
+ "replace_with_linear": false
533
+ },
534
+ "ffn": {
535
+ "ffn_mult": 5.25,
536
+ "no_op": false,
537
+ "replace_with_linear": false
538
+ }
539
+ },
540
+ {
541
+ "attention": {
542
+ "n_heads_in_group": 8,
543
+ "no_op": false,
544
+ "replace_with_linear": false
545
+ },
546
+ "ffn": {
547
+ "ffn_mult": 5.25,
548
+ "no_op": false,
549
+ "replace_with_linear": false
550
+ }
551
+ },
552
+ {
553
+ "attention": {
554
+ "n_heads_in_group": null,
555
+ "no_op": false,
556
+ "replace_with_linear": true
557
+ },
558
+ "ffn": {
559
+ "ffn_mult": 2.625,
560
+ "no_op": false,
561
+ "replace_with_linear": false
562
+ }
563
+ },
564
+ {
565
+ "attention": {
566
+ "n_heads_in_group": null,
567
+ "no_op": false,
568
+ "replace_with_linear": true
569
+ },
570
+ "ffn": {
571
+ "ffn_mult": 5.25,
572
+ "no_op": false,
573
+ "replace_with_linear": false
574
+ }
575
+ },
576
+ {
577
+ "attention": {
578
+ "n_heads_in_group": null,
579
+ "no_op": false,
580
+ "replace_with_linear": true
581
+ },
582
+ "ffn": {
583
+ "ffn_mult": 2.625,
584
+ "no_op": false,
585
+ "replace_with_linear": false
586
+ }
587
+ },
588
+ {
589
+ "attention": {
590
+ "n_heads_in_group": null,
591
+ "no_op": false,
592
+ "replace_with_linear": true
593
+ },
594
+ "ffn": {
595
+ "ffn_mult": 2.625,
596
+ "no_op": false,
597
+ "replace_with_linear": false
598
+ }
599
+ },
600
+ {
601
+ "attention": {
602
+ "n_heads_in_group": null,
603
+ "no_op": false,
604
+ "replace_with_linear": true
605
+ },
606
+ "ffn": {
607
+ "ffn_mult": 2.625,
608
+ "no_op": false,
609
+ "replace_with_linear": false
610
+ }
611
+ },
612
+ {
613
+ "attention": {
614
+ "n_heads_in_group": null,
615
+ "no_op": true,
616
+ "replace_with_linear": false
617
+ },
618
+ "ffn": {
619
+ "ffn_mult": 1.3125,
620
+ "no_op": false,
621
+ "replace_with_linear": false
622
+ }
623
+ },
624
+ {
625
+ "attention": {
626
+ "n_heads_in_group": null,
627
+ "no_op": false,
628
+ "replace_with_linear": true
629
+ },
630
+ "ffn": {
631
+ "ffn_mult": 1.3125,
632
+ "no_op": false,
633
+ "replace_with_linear": false
634
+ }
635
+ },
636
+ {
637
+ "attention": {
638
+ "n_heads_in_group": 8,
639
+ "no_op": false,
640
+ "replace_with_linear": false
641
+ },
642
+ "ffn": {
643
+ "ffn_mult": 5.25,
644
+ "no_op": false,
645
+ "replace_with_linear": false
646
+ }
647
+ },
648
+ {
649
+ "attention": {
650
+ "n_heads_in_group": null,
651
+ "no_op": true,
652
+ "replace_with_linear": false
653
+ },
654
+ "ffn": {
655
+ "ffn_mult": 1.3125,
656
+ "no_op": false,
657
+ "replace_with_linear": false
658
+ }
659
+ },
660
+ {
661
+ "attention": {
662
+ "n_heads_in_group": null,
663
+ "no_op": false,
664
+ "replace_with_linear": true
665
+ },
666
+ "ffn": {
667
+ "ffn_mult": 1.3125,
668
+ "no_op": false,
669
+ "replace_with_linear": false
670
+ }
671
+ },
672
+ {
673
+ "attention": {
674
+ "n_heads_in_group": null,
675
+ "no_op": true,
676
+ "replace_with_linear": false
677
+ },
678
+ "ffn": {
679
+ "ffn_mult": 1.3125,
680
+ "no_op": false,
681
+ "replace_with_linear": false
682
+ }
683
+ },
684
+ {
685
+ "attention": {
686
+ "n_heads_in_group": 8,
687
+ "no_op": false,
688
+ "replace_with_linear": false
689
+ },
690
+ "ffn": {
691
+ "ffn_mult": 5.25,
692
+ "no_op": false,
693
+ "replace_with_linear": false
694
+ }
695
+ },
696
+ {
697
+ "attention": {
698
+ "n_heads_in_group": null,
699
+ "no_op": false,
700
+ "replace_with_linear": true
701
+ },
702
+ "ffn": {
703
+ "ffn_mult": 1.3125,
704
+ "no_op": false,
705
+ "replace_with_linear": false
706
+ }
707
+ },
708
+ {
709
+ "attention": {
710
+ "n_heads_in_group": null,
711
+ "no_op": true,
712
+ "replace_with_linear": false
713
+ },
714
+ "ffn": {
715
+ "ffn_mult": 1.3125,
716
+ "no_op": false,
717
+ "replace_with_linear": false
718
+ }
719
+ },
720
+ {
721
+ "attention": {
722
+ "n_heads_in_group": null,
723
+ "no_op": false,
724
+ "replace_with_linear": true
725
+ },
726
+ "ffn": {
727
+ "ffn_mult": 1.3125,
728
+ "no_op": false,
729
+ "replace_with_linear": false
730
+ }
731
+ },
732
+ {
733
+ "attention": {
734
+ "n_heads_in_group": null,
735
+ "no_op": false,
736
+ "replace_with_linear": true
737
+ },
738
+ "ffn": {
739
+ "ffn_mult": 1.3125,
740
+ "no_op": false,
741
+ "replace_with_linear": false
742
+ }
743
+ },
744
+ {
745
+ "attention": {
746
+ "n_heads_in_group": null,
747
+ "no_op": true,
748
+ "replace_with_linear": false
749
+ },
750
+ "ffn": {
751
+ "ffn_mult": 1.3125,
752
+ "no_op": false,
753
+ "replace_with_linear": false
754
+ }
755
+ },
756
+ {
757
+ "attention": {
758
+ "n_heads_in_group": null,
759
+ "no_op": true,
760
+ "replace_with_linear": false
761
+ },
762
+ "ffn": {
763
+ "ffn_mult": 1.3125,
764
+ "no_op": false,
765
+ "replace_with_linear": false
766
+ }
767
+ },
768
+ {
769
+ "attention": {
770
+ "n_heads_in_group": null,
771
+ "no_op": false,
772
+ "replace_with_linear": true
773
+ },
774
+ "ffn": {
775
+ "ffn_mult": 1.3125,
776
+ "no_op": false,
777
+ "replace_with_linear": false
778
+ }
779
+ },
780
+ {
781
+ "attention": {
782
+ "n_heads_in_group": null,
783
+ "no_op": true,
784
+ "replace_with_linear": false
785
+ },
786
+ "ffn": {
787
+ "ffn_mult": 1.3125,
788
+ "no_op": false,
789
+ "replace_with_linear": false
790
+ }
791
+ },
792
+ {
793
+ "attention": {
794
+ "n_heads_in_group": null,
795
+ "no_op": true,
796
+ "replace_with_linear": false
797
+ },
798
+ "ffn": {
799
+ "ffn_mult": 1.3125,
800
+ "no_op": false,
801
+ "replace_with_linear": false
802
+ }
803
+ },
804
+ {
805
+ "attention": {
806
+ "n_heads_in_group": null,
807
+ "no_op": false,
808
+ "replace_with_linear": true
809
+ },
810
+ "ffn": {
811
+ "ffn_mult": 1.3125,
812
+ "no_op": false,
813
+ "replace_with_linear": false
814
+ }
815
+ },
816
+ {
817
+ "attention": {
818
+ "n_heads_in_group": null,
819
+ "no_op": false,
820
+ "replace_with_linear": true
821
+ },
822
+ "ffn": {
823
+ "ffn_mult": 1.3125,
824
+ "no_op": false,
825
+ "replace_with_linear": false
826
+ }
827
+ },
828
+ {
829
+ "attention": {
830
+ "n_heads_in_group": null,
831
+ "no_op": false,
832
+ "replace_with_linear": true
833
+ },
834
+ "ffn": {
835
+ "ffn_mult": 1.3125,
836
+ "no_op": false,
837
+ "replace_with_linear": false
838
+ }
839
+ },
840
+ {
841
+ "attention": {
842
+ "n_heads_in_group": null,
843
+ "no_op": false,
844
+ "replace_with_linear": true
845
+ },
846
+ "ffn": {
847
+ "ffn_mult": 1.3125,
848
+ "no_op": false,
849
+ "replace_with_linear": false
850
+ }
851
+ },
852
+ {
853
+ "attention": {
854
+ "n_heads_in_group": 8,
855
+ "no_op": false,
856
+ "replace_with_linear": false
857
+ },
858
+ "ffn": {
859
+ "ffn_mult": 5.25,
860
+ "no_op": false,
861
+ "replace_with_linear": false
862
+ }
863
+ },
864
+ {
865
+ "attention": {
866
+ "n_heads_in_group": 8,
867
+ "no_op": false,
868
+ "replace_with_linear": false
869
+ },
870
+ "ffn": {
871
+ "ffn_mult": 5.25,
872
+ "no_op": false,
873
+ "replace_with_linear": false
874
+ }
875
+ },
876
+ {
877
+ "attention": {
878
+ "n_heads_in_group": 8,
879
+ "no_op": false,
880
+ "replace_with_linear": false
881
+ },
882
+ "ffn": {
883
+ "ffn_mult": 5.25,
884
+ "no_op": false,
885
+ "replace_with_linear": false
886
+ }
887
+ },
888
+ {
889
+ "attention": {
890
+ "n_heads_in_group": 8,
891
+ "no_op": false,
892
+ "replace_with_linear": false
893
+ },
894
+ "ffn": {
895
+ "ffn_mult": 5.25,
896
+ "no_op": false,
897
+ "replace_with_linear": false
898
+ }
899
+ },
900
+ {
901
+ "attention": {
902
+ "n_heads_in_group": 8,
903
+ "no_op": false,
904
+ "replace_with_linear": false
905
+ },
906
+ "ffn": {
907
+ "ffn_mult": 5.25,
908
+ "no_op": false,
909
+ "replace_with_linear": false
910
+ }
911
+ },
912
+ {
913
+ "attention": {
914
+ "n_heads_in_group": 8,
915
+ "no_op": false,
916
+ "replace_with_linear": false
917
+ },
918
+ "ffn": {
919
+ "ffn_mult": 5.25,
920
+ "no_op": false,
921
+ "replace_with_linear": false
922
+ }
923
+ },
924
+ {
925
+ "attention": {
926
+ "n_heads_in_group": 8,
927
+ "no_op": false,
928
+ "replace_with_linear": false
929
+ },
930
+ "ffn": {
931
+ "ffn_mult": 5.25,
932
+ "no_op": false,
933
+ "replace_with_linear": false
934
+ }
935
+ },
936
+ {
937
+ "attention": {
938
+ "n_heads_in_group": 8,
939
+ "no_op": false,
940
+ "replace_with_linear": false
941
+ },
942
+ "ffn": {
943
+ "ffn_mult": 5.25,
944
+ "no_op": false,
945
+ "replace_with_linear": false
946
+ }
947
+ },
948
+ {
949
+ "attention": {
950
+ "n_heads_in_group": 8,
951
+ "no_op": false,
952
+ "replace_with_linear": false
953
+ },
954
+ "ffn": {
955
+ "ffn_mult": 5.25,
956
+ "no_op": false,
957
+ "replace_with_linear": false
958
+ }
959
+ },
960
+ {
961
+ "attention": {
962
+ "n_heads_in_group": 8,
963
+ "no_op": false,
964
+ "replace_with_linear": false
965
+ },
966
+ "ffn": {
967
+ "ffn_mult": 5.25,
968
+ "no_op": false,
969
+ "replace_with_linear": false
970
+ }
971
+ }
972
+ ],
973
+ "bos_token_id": 128000,
974
+ "eos_token_id": [
975
+ 128001,
976
+ 128008,
977
+ 128009
978
+ ],
979
+ "hidden_act": "silu",
980
+ "hidden_size": 8192,
981
+ "initializer_range": 0.02,
982
+ "intermediate_size": null,
983
+ "max_position_embeddings": 131072,
984
+ "mlp_bias": false,
985
+ "model_type": "nemotron-nas",
986
+ "num_attention_heads": 64,
987
+ "num_hidden_layers": 80,
988
+ "num_key_value_heads": null,
989
+ "pretraining_tp": 1,
990
+ "rms_norm_eps": 1e-05,
991
+ "rope_scaling": {
992
+ "factor": 8.0,
993
+ "high_freq_factor": 4.0,
994
+ "low_freq_factor": 1.0,
995
+ "original_max_position_embeddings": 8192,
996
+ "rope_type": "llama3"
997
+ },
998
+ "rope_theta": 500000.0,
999
+ "tie_word_embeddings": false,
1000
+ "torch_dtype": "bfloat16",
1001
+ "transformers_version": "4.44.2",
1002
+ "use_cache": true,
1003
+ "vocab_size": 128256
1004
+ }