Tanrei committed
Commit 5ce809b
1 Parent(s): aec98c8

Upload GPTSANJapaneseForConditionalGeneration

config.json CHANGED
```diff
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "Tanrei/GPTSAN-japanese",
   "architectures": [
     "GPTSANJapaneseForConditionalGeneration"
   ],
@@ -8,15 +7,14 @@
   "d_ff": 8192,
   "d_model": 1024,
   "d_spout": 128,
-  "do_sample": true,
   "dropout_rate": 0.0,
   "eos_token_id": 35999,
   "expert_capacity": 128,
   "initializer_factor": 0.002,
   "layer_norm_epsilon": 1e-05,
   "mask_token_id": 35994,
+  "max_position_embeddings": 1280,
   "model_type": "gptsan-japanese",
-  "num_contexts": 1280,
   "num_experts": 16,
   "num_ext_layers": 0,
   "num_heads": 16,
@@ -29,10 +27,8 @@
   "router_ignore_padding_tokens": false,
   "router_jitter_noise": 0.0,
   "separator_token_id": 35998,
-  "top_k": 120,
   "torch_dtype": "float32",
-  "transformers_version": "4.26.0.dev0",
+  "transformers_version": "4.27.0.dev0",
   "unk_token_id": 35996,
-  "use_cache": true,
   "vocab_size": 36000
 }
```
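The substantive config change is the rename of `num_contexts` to `max_position_embeddings` (same value, 1280), presumably to match the field name the transformers port reads, while the sampling defaults (`do_sample`, `top_k`) move out of `config.json`. A minimal sketch of inspecting the updated config; it assumes a transformers build new enough to register `model_type: gptsan-japanese` (>= 4.27.0.dev0 per the version stamp above):

```python
# Minimal sketch, not part of this repo: read the updated config and check
# the renamed field. Assumes transformers >= 4.27.0.dev0, which registers
# the "gptsan-japanese" model type.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("Tanrei/GPTSAN-japanese")
print(config.model_type)               # gptsan-japanese
print(config.max_position_embeddings)  # 1280 (was "num_contexts" before this commit)
print(config.vocab_size)               # 36000
```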
generation_config.json CHANGED
```diff
@@ -1,5 +1,7 @@
 {
-  "do_sample": true,
-  "top_k": 120,
+  "_from_model_config": true,
+  "bos_token_id": 35993,
+  "eos_token_id": 35999,
+  "pad_token_id": 35995,
   "transformers_version": "4.27.0.dev0"
 }
```
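With `do_sample` and `top_k` removed from both configs, sampling is no longer on by default; callers who want the previous behavior now pass those arguments per `generate()` call. A hedged sketch under two assumptions: the import spelling `GPTSanJapaneseForConditionalGeneration` is my guess at the transformers 4.27 class name (it differs in casing from the `architectures` string above), and the prompt is an arbitrary example.

```python
# Hedged sketch: reproduce the removed sampling defaults explicitly at call
# time. Assumes transformers >= 4.27.0.dev0; prompt text is illustrative.
from transformers import AutoTokenizer, GPTSanJapaneseForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
model = GPTSanJapaneseForConditionalGeneration.from_pretrained("Tanrei/GPTSAN-japanese")

inputs = tokenizer("織田信長は、", return_tensors="pt")
outputs = model.generate(
    inputs.input_ids,
    do_sample=True,    # was a config.json default before this commit
    top_k=120,         # likewise removed from the configs
    max_new_tokens=20,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```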
pytorch_model-00001-of-00002.bin ADDED
```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f34e100f478d000708270536fd52b3011f09343c6d4b007b94e662abf8531621
+size 9972570554
```
pytorch_model-00002-of-00002.bin ADDED
```diff
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1f9dd95d7ccb1c5972ba3a267ca07113bf41ac5c6adc3973a758f0b28e4d4dc
+size 1143580233
```
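Both shards are stored as Git LFS pointers, so the repo itself records only a sha256 oid and a byte size per file. A minimal sketch for verifying downloaded shards against the pointers above (hashes and sizes copied verbatim from this commit):

```python
# Minimal sketch: check downloaded shards against the LFS pointers' oid/size.
import hashlib
import os

EXPECTED = {  # values copied from the LFS pointer files in this commit
    "pytorch_model-00001-of-00002.bin": (
        "f34e100f478d000708270536fd52b3011f09343c6d4b007b94e662abf8531621",
        9972570554,
    ),
    "pytorch_model-00002-of-00002.bin": (
        "a1f9dd95d7ccb1c5972ba3a267ca07113bf41ac5c6adc3973a758f0b28e4d4dc",
        1143580233,
    ),
}

for name, (oid, size) in EXPECTED.items():
    assert os.path.getsize(name) == size, f"{name}: size mismatch"
    digest = hashlib.sha256()
    with open(name, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    assert digest.hexdigest() == oid, f"{name}: sha256 mismatch"
    print(name, "OK")
```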
pytorch_model.bin.index.json CHANGED
```diff
@@ -38,7 +38,7 @@
   "blocks.0.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.0.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.0.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.0.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.0.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.0.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.0.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.0.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -78,7 +78,7 @@
   "blocks.1.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.1.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.1.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.1.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.1.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.1.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.1.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.1.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -118,7 +118,7 @@
   "blocks.2.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.2.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.2.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.2.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.2.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.2.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.2.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.2.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -158,7 +158,7 @@
   "blocks.3.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.3.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.3.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.3.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.3.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.3.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.3.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.3.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -198,7 +198,7 @@
   "blocks.4.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.4.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.4.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.4.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.4.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.4.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.4.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.4.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -238,7 +238,7 @@
   "blocks.5.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.5.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.5.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.5.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.5.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.5.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.5.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.5.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -278,7 +278,7 @@
   "blocks.6.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.6.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.6.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.6.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.6.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.6.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.6.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.6.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -318,7 +318,7 @@
   "blocks.7.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.7.FeedForward.norm.bias": "pytorch_model-00001-of-00002.bin",
   "blocks.7.FeedForward.norm.weight": "pytorch_model-00001-of-00002.bin",
-  "blocks.7.FeedForward.smlp.weight": "pytorch_model-00001-of-00002.bin",
+  "blocks.7.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.7.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.7.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.7.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -358,7 +358,7 @@
   "blocks.8.FeedForward.mlp.router.classifier.weight": "pytorch_model-00001-of-00002.bin",
   "blocks.8.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
   "blocks.8.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
-  "blocks.8.FeedForward.smlp.weight": "pytorch_model-00002-of-00002.bin",
+  "blocks.8.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00002-of-00002.bin",
   "blocks.8.SelfAttention.SelfAttention.o": "pytorch_model-00001-of-00002.bin",
   "blocks.8.SelfAttention.SelfAttention.qkv": "pytorch_model-00001-of-00002.bin",
   "blocks.8.SelfAttention.norm.bias": "pytorch_model-00001-of-00002.bin",
@@ -398,7 +398,7 @@
   "blocks.9.FeedForward.mlp.router.classifier.weight": "pytorch_model-00002-of-00002.bin",
   "blocks.9.FeedForward.norm.bias": "pytorch_model-00002-of-00002.bin",
   "blocks.9.FeedForward.norm.weight": "pytorch_model-00002-of-00002.bin",
-  "blocks.9.FeedForward.smlp.weight": "pytorch_model-00002-of-00002.bin",
+  "blocks.9.FeedForward.soft_bypass_mlp.weight": "pytorch_model-00002-of-00002.bin",
   "blocks.9.SelfAttention.SelfAttention.o": "pytorch_model-00002-of-00002.bin",
   "blocks.9.SelfAttention.SelfAttention.qkv": "pytorch_model-00002-of-00002.bin",
   "blocks.9.SelfAttention.norm.bias": "pytorch_model-00002-of-00002.bin",
```