Update deepseek-moe-16b-base/bert4torch_config.json
deepseek-moe-16b-base/bert4torch_config.json
CHANGED
@@ -1,4 +1,6 @@
 {
+  "model": "deepseek",
+  "template": "pretrained_text_continuation",
   "attention_bias": false,
   "attention_dropout": 0.0,
   "aux_loss_alpha": 0.001,
@@ -10,7 +12,6 @@
   "initializer_range": 0.02,
   "intermediate_size": 10944,
   "max_position_embeddings": 4096,
-  "model": "deepseek",
   "moe_intermediate_size": 1408,
   "moe_layer_freq": 1,
   "n_routed_experts": 64,
@@ -32,5 +33,5 @@
   "skip_init": true,
   "segment_vocab_size": 0,
   "rope_rank": "updown",
-  "generation_config": {"tokenizer_config":
+  "generation_config": {"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false}, "eos_token_id": [100001]}
 }
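For context, a minimal sketch of how a bert4torch_config.json like this one is typically consumed, assuming local paths to the config and a converted checkpoint (both paths below are placeholders and are not part of this commit):

# Minimal sketch (assumed usage): load deepseek-moe-16b-base with bert4torch
# using the updated bert4torch_config.json. Paths are placeholders.
from bert4torch.models import build_transformer_model

config_path = "deepseek-moe-16b-base/bert4torch_config.json"  # the file changed in this commit
checkpoint_path = "deepseek-moe-16b-base/pytorch_model.bin"    # placeholder checkpoint path

# build_transformer_model reads the "model": "deepseek" key from the config to
# select the architecture; the "generation_config" entry (tokenizer_config,
# eos_token_id) is presumably picked up by the library's generation utilities.
model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path)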