Tongjilibo committed on
Commit
36f7606
1 Parent(s): e0ea324

添加配置文件

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. BELLE-LLaMA-7B-2M-enc/bert4torch_config.json +13 -0
  2. Baichuan-13B-Base/bert4torch_config.json +21 -0
  3. Baichuan-13B-Chat/bert4torch_config.json +21 -0
  4. Baichuan-7B/bert4torch_config.json +20 -0
  5. Baichuan2-13B-Base/bert4torch_config.json +21 -0
  6. Baichuan2-13B-Chat/bert4torch_config.json +21 -0
  7. Baichuan2-7B-Base/bert4torch_config.json +21 -0
  8. Baichuan2-7B-Chat/bert4torch_config.json +21 -0
  9. ChatYuan-large-v1/bert4torch_config.json +17 -0
  10. Llama-2-13b-chat-hf/bert4torch_config.json +19 -0
  11. Llama-2-13b-hf/bert4torch_config.json +19 -0
  12. Llama-2-7b-chat-hf/bert4torch_config.json +13 -0
  13. Llama-2-7b-hf/bert4torch_config.json +13 -0
  14. PromptCLUE/bert4torch_config.json +17 -0
  15. Qwen-1_8B-Chat/bert4torch_config.json +31 -0
  16. Qwen-1_8B/bert4torch_config.json +31 -0
  17. Qwen-7B-Chat/bert4torch_config.json +31 -0
  18. Qwen-7B/bert4torch_config.json +31 -0
  19. Yi-6B-200K/bert4torch_config.json +315 -0
  20. Yi-6B/bert4torch_config.json +315 -0
  21. Ziya-LLaMA-13B-v1.1/bert4torch_config.json +20 -0
  22. Ziya-LLaMA-13B-v1/bert4torch_config.json +20 -0
  23. bart-base-chinese/bert4torch_config.json +14 -0
  24. bert-base-multilingual-cased/bert4torch_config.json +25 -0
  25. bge-large-en-v1.5/bert4torch_config.json +412 -0
  26. bge-large-zh-v1.5/bert4torch_config.json +415 -0
  27. bloom-560m/bert4torch_config.json +27 -0
  28. bloomz-560m/bert4torch_config.json +27 -0
  29. chatglm-6b-int4/bert4torch_config.json +25 -0
  30. chatglm-6b-int8/bert4torch_config.json +25 -0
  31. chatglm-6b/bert4torch_config.json +22 -0
  32. chatglm2-6b-32k/bert4torch_config.json +22 -0
  33. chatglm2-6b-int4/bert4torch_config.json +24 -0
  34. chatglm2-6b/bert4torch_config.json +21 -0
  35. chatglm3-6b-32k/bert4torch_config.json +22 -0
  36. chatglm3-6b/bert4torch_config.json +21 -0
  37. chinese-bert-wwm-ext/bert4torch_config.json +26 -0
  38. chinese-macbert-base/bert4torch_config.json +26 -0
  39. chinese-macbert-large/bert4torch_config.json +26 -0
  40. chinese-roberta-wwm-ext-base/bert4torch_config.json +28 -0
  41. chinese-roberta-wwm-ext-large/bert4torch_config.json +28 -0
  42. chinese-xlnet-base/bert4torch_config.json +29 -0
  43. chinese_alpaca_plus_7b/bert4torch_config.json +13 -0
  44. chinese_llama_plus_7b/bert4torch_config.json +13 -0
  45. deepseek-moe-16b-base/bert4torch_config.json +35 -0
  46. deepseek-moe-16b-chat/bert4torch_config.json +35 -0
  47. falcon-7b-instruct/bert4torch_config.json +27 -0
  48. falcon-7b/bert4torch_config.json +27 -0
  49. falcon-rw-1b/bert4torch_config.json +24 -0
  50. gpt2-chinese-cluecorpussmall/bert4torch_config.json +14 -0
BELLE-LLaMA-7B-2M-enc/bert4torch_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "hidden_size": 4096,
4
+ "intermediate_size": 11008,
5
+ "num_attention_heads": 32,
6
+ "num_hidden_layers": 32,
7
+ "layer_norm_eps": 1e-06,
8
+ "hidden_act": "silu",
9
+ "vocab_size": 32000,
10
+ "segment_vocab_size": 0,
11
+ "skip_init": true,
12
+ "rope_rank": "updown"
13
+ }
Baichuan-13B-Base/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 5120,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 13696,
9
+ "model_max_length": 4096,
10
+ "num_attention_heads": 40,
11
+ "num_hidden_layers": 40,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "bfloat16",
16
+ "vocab_size": 64000,
17
+ "segment_vocab_size": 0,
18
+ "rope_rank": "updown",
19
+ "p_bias": "alibi",
20
+ "skip_init": true
21
+ }
Baichuan-13B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 5120,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 13696,
9
+ "model_max_length": 4096,
10
+ "num_attention_heads": 40,
11
+ "num_hidden_layers": 40,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "bfloat16",
16
+ "vocab_size": 64000,
17
+ "segment_vocab_size": 0,
18
+ "rope_rank": "updown",
19
+ "p_bias": "alibi",
20
+ "skip_init": true
21
+ }
Baichuan-7B/bert4torch_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 4096,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 11008,
9
+ "max_position_embeddings": 4096,
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "float32",
16
+ "vocab_size": 64000,
17
+ "segment_vocab_size": 0,
18
+ "rope_rank": "updown",
19
+ "skip_init": true
20
+ }
Baichuan2-13B-Base/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 5120,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 13696,
9
+ "model_max_length": 4096,
10
+ "num_attention_heads": 40,
11
+ "num_hidden_layers": 40,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "bfloat16",
16
+ "vocab_size": 125696,
17
+ "segment_vocab_size": 0,
18
+ "p_bias": "alibi",
19
+ "skip_init": true,
20
+ "norm_head": true
21
+ }
Baichuan2-13B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 5120,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 13696,
9
+ "model_max_length": 4096,
10
+ "num_attention_heads": 40,
11
+ "num_hidden_layers": 40,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "bfloat16",
16
+ "vocab_size": 125696,
17
+ "segment_vocab_size": 0,
18
+ "p_bias": "alibi",
19
+ "skip_init": true,
20
+ "norm_head": true
21
+ }
Baichuan2-7B-Base/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 4096,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 11008,
9
+ "max_position_embeddings": 4096,
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "bfloat16",
16
+ "vocab_size": 125696,
17
+ "segment_vocab_size": 0,
18
+ "rope_rank": "updown",
19
+ "skip_init": true,
20
+ "norm_head": true
21
+ }
Baichuan2-7B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 4096,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 11008,
9
+ "max_position_embeddings": 4096,
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "torch_dtype": "bfloat16",
16
+ "vocab_size": 125696,
17
+ "segment_vocab_size": 0,
18
+ "rope_rank": "updown",
19
+ "skip_init": true,
20
+ "norm_head": true
21
+ }
ChatYuan-large-v1/bert4torch_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "mt5.1.1",
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "intermediate_size": 2816,
7
+ "num_attention_heads": 16,
8
+ "attention_head_size": 64,
9
+ "num_hidden_layers": 24,
10
+ "vocab_size": 32128,
11
+ "relative_attention_num_buckets": 32,
12
+ "attention_scale": false,
13
+ "is_dropout": true,
14
+ "max_position_embeddings": 768,
15
+ "segment_vocab_size": 0,
16
+ "logit_scale": false
17
+ }
Llama-2-13b-chat-hf/bert4torch_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "hidden_size": 5120,
7
+ "intermediate_size": 13824,
8
+ "num_attention_heads": 40,
9
+ "num_hidden_layers": 40,
10
+ "initializer_range": 0.02,
11
+ "hidden_act": "silu",
12
+ "vocab_size": 32000,
13
+ "segment_vocab_size": 0,
14
+ "skip_init": true,
15
+ "layer_norm_eps": 1e-5,
16
+ "rope_rank": "updown",
17
+ "tie_word_embeddings": false,
18
+ "torch_dtype": "float16"
19
+ }
Llama-2-13b-hf/bert4torch_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 0,
6
+ "hidden_size": 5120,
7
+ "intermediate_size": 13824,
8
+ "num_attention_heads": 40,
9
+ "num_hidden_layers": 40,
10
+ "initializer_range": 0.02,
11
+ "hidden_act": "silu",
12
+ "vocab_size": 32000,
13
+ "segment_vocab_size": 0,
14
+ "skip_init": true,
15
+ "layer_norm_eps": 1e-5,
16
+ "rope_rank": "updown",
17
+ "tie_word_embeddings": false,
18
+ "torch_dtype": "float16"
19
+ }
Llama-2-7b-chat-hf/bert4torch_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "hidden_size": 4096,
4
+ "intermediate_size": 11008,
5
+ "num_attention_heads": 32,
6
+ "num_hidden_layers": 32,
7
+ "hidden_act": "silu",
8
+ "vocab_size": 32000,
9
+ "segment_vocab_size": 0,
10
+ "skip_init": true,
11
+ "layer_norm_eps": 1e-6,
12
+ "rope_rank": "updown"
13
+ }
Llama-2-7b-hf/bert4torch_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "hidden_size": 4096,
4
+ "intermediate_size": 11008,
5
+ "num_attention_heads": 32,
6
+ "num_hidden_layers": 32,
7
+ "hidden_act": "silu",
8
+ "vocab_size": 32000,
9
+ "segment_vocab_size": 0,
10
+ "skip_init": true,
11
+ "layer_norm_eps": 1e-5,
12
+ "rope_rank": "updown"
13
+ }
PromptCLUE/bert4torch_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "mt5.1.1",
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 768,
6
+ "intermediate_size": 2048,
7
+ "num_attention_heads": 12,
8
+ "attention_head_size": 64,
9
+ "num_hidden_layers": 12,
10
+ "vocab_size": 32128,
11
+ "relative_attention_num_buckets": 32,
12
+ "attention_scale": false,
13
+ "is_dropout": true,
14
+ "max_position_embeddings": 768,
15
+ "segment_vocab_size": 0,
16
+ "logit_scale": false
17
+ }
Qwen-1_8B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "bias_dropout_fusion": true,
5
+ "bos_token_id": 151643,
6
+ "embd_pdrop": 0.1,
7
+ "eos_token_id": 151643,
8
+ "intermediate_size": 11008,
9
+ "initializer_range": 0.02,
10
+ "kv_channels": 128,
11
+ "layer_norm_eps": 1e-06,
12
+ "hidden_size": 2048,
13
+ "num_attention_heads": 16,
14
+ "num_hidden_layers": 24,
15
+ "n_positions": 6144,
16
+ "resid_pdrop": 0.1,
17
+ "rotary_emb_base": 10000,
18
+ "rotary_pct": 1.0,
19
+ "scale_attn_weights": true,
20
+ "seq_length": 8192,
21
+ "tie_word_embeddings": false,
22
+ "use_cache": true,
23
+ "use_flash_attn": true,
24
+ "vocab_size": 151936,
25
+ "use_dynamic_ntk": true,
26
+ "use_logn_attn": true,
27
+ "segment_vocab_size": 0,
28
+ "skip_init": true,
29
+ "rope_rank": "updown",
30
+ "max_position": 8192
31
+ }
Qwen-1_8B/bert4torch_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "bias_dropout_fusion": true,
5
+ "bos_token_id": 151643,
6
+ "embd_pdrop": 0.1,
7
+ "eos_token_id": 151643,
8
+ "intermediate_size": 11008,
9
+ "initializer_range": 0.02,
10
+ "kv_channels": 128,
11
+ "layer_norm_eps": 1e-06,
12
+ "hidden_size": 2048,
13
+ "num_attention_heads": 16,
14
+ "num_hidden_layers": 24,
15
+ "n_positions": 6144,
16
+ "resid_pdrop": 0.1,
17
+ "rotary_emb_base": 10000,
18
+ "rotary_pct": 1.0,
19
+ "scale_attn_weights": true,
20
+ "seq_length": 8192,
21
+ "tie_word_embeddings": false,
22
+ "use_cache": true,
23
+ "use_flash_attn": true,
24
+ "vocab_size": 151936,
25
+ "use_dynamic_ntk": true,
26
+ "use_logn_attn": true,
27
+ "segment_vocab_size": 0,
28
+ "skip_init": true,
29
+ "rope_rank": "updown",
30
+ "max_position": 8192
31
+ }
Qwen-7B-Chat/bert4torch_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "bias_dropout_fusion": true,
5
+ "bos_token_id": 151643,
6
+ "embd_pdrop": 0.1,
7
+ "eos_token_id": 151643,
8
+ "intermediate_size": 22016,
9
+ "initializer_range": 0.02,
10
+ "kv_channels": 128,
11
+ "layer_norm_eps": 1e-05,
12
+ "hidden_size": 4096,
13
+ "num_attention_heads": 32,
14
+ "num_hidden_layers": 32,
15
+ "n_positions": 6144,
16
+ "resid_pdrop": 0.1,
17
+ "rotary_emb_base": 10000,
18
+ "rotary_pct": 1.0,
19
+ "scale_attn_weights": true,
20
+ "seq_length": 2048,
21
+ "tie_word_embeddings": false,
22
+ "use_cache": true,
23
+ "flash_attention": "flash_attn_2",
24
+ "vocab_size": 151936,
25
+ "use_dynamic_ntk": true,
26
+ "use_logn_attn": true,
27
+ "segment_vocab_size": 0,
28
+ "skip_init": true,
29
+ "rope_rank": "updown",
30
+ "max_position": 2048
31
+ }
Qwen-7B/bert4torch_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen",
3
+ "hidden_act": "silu",
4
+ "bias_dropout_fusion": true,
5
+ "bos_token_id": 151643,
6
+ "embd_pdrop": 0.1,
7
+ "eos_token_id": 151643,
8
+ "intermediate_size": 22016,
9
+ "initializer_range": 0.02,
10
+ "kv_channels": 128,
11
+ "layer_norm_eps": 1e-05,
12
+ "hidden_size": 4096,
13
+ "num_attention_heads": 32,
14
+ "num_hidden_layers": 32,
15
+ "n_positions": 6144,
16
+ "resid_pdrop": 0.1,
17
+ "rotary_emb_base": 10000,
18
+ "rotary_pct": 1.0,
19
+ "scale_attn_weights": true,
20
+ "seq_length": 2048,
21
+ "tie_word_embeddings": false,
22
+ "use_cache": true,
23
+ "flash_attention": "flash_attn_2",
24
+ "vocab_size": 151936,
25
+ "use_dynamic_ntk": true,
26
+ "use_logn_attn": true,
27
+ "segment_vocab_size": 0,
28
+ "skip_init": true,
29
+ "rope_rank": "updown",
30
+ "max_position": 2048
31
+ }
Yi-6B-200K/bert4torch_config.json ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "multi_query_group_num": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-05,
15
+ "rope_theta": 5000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "mapping": {
23
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
24
+ "LayerNormFinal.weight": "model.norm.weight",
25
+ "lm_head.weight": "lm_head.weight",
26
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
27
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
28
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
29
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
30
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
31
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
32
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
33
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
34
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
35
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
36
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
37
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
38
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
39
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
40
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
41
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
42
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
43
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
44
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
45
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
46
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
47
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
48
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
49
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
50
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
51
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
52
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
53
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
54
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
55
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
56
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
57
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
58
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
59
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
60
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
61
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
62
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
63
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
64
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
65
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
66
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
67
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
68
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
69
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
70
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
71
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
72
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
73
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
74
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
75
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
76
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
77
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
78
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
79
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
80
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
81
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
82
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
83
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
84
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
85
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
86
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
87
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
88
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
89
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
90
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
91
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
92
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
93
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
94
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
95
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
96
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
97
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
98
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
99
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
100
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
101
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
102
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
103
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
104
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
105
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
106
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
107
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
108
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
109
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
110
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
111
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
112
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
113
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
114
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
115
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
116
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
117
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
118
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
119
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
120
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
121
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
122
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
123
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
124
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
125
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
126
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
127
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
128
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
129
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
130
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
131
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
132
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
133
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
134
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
135
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
136
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
137
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
138
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
139
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
140
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
141
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
142
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
143
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
144
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
145
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
146
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
147
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
148
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
149
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
150
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
151
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
152
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
153
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
154
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
155
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
156
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
157
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
158
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
159
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
160
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
161
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
162
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
163
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
164
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
165
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
166
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
167
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
168
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
169
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
170
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
171
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
172
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
173
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
174
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
175
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
176
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
177
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
178
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
179
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
180
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
181
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
182
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
183
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
184
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
185
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
186
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
187
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
188
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
189
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
190
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
191
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
192
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
193
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
194
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
195
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
196
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
197
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
198
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
199
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
200
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
201
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
202
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
203
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
204
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
205
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
206
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
207
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
208
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
209
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
210
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
211
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
212
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
213
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
214
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
215
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
216
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
217
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
218
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
219
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
220
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
221
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
222
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
223
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
224
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
225
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
226
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
227
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
228
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
229
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
230
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
231
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
232
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
233
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
234
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
235
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
236
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
237
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
238
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
239
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
240
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
241
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
242
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
243
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
244
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
245
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
246
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
247
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
248
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
249
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
250
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
251
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
252
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
253
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
254
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
255
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
256
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
257
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
258
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
259
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
260
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
261
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
262
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
263
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
264
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
265
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
266
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
267
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
268
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
269
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
270
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
271
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
272
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
273
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
274
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
275
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
276
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
277
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
278
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
279
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
280
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
281
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
282
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
283
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
284
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
285
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
286
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
287
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
288
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
289
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
290
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
291
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
292
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
293
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
294
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
295
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
296
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
297
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
298
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
299
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
300
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
301
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
302
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
303
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
304
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
305
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
306
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
307
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
308
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
309
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
310
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
311
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
312
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
313
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
314
+ }
315
+ }
Yi-6B/bert4torch_config.json ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 4096,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 11008,
8
+ "max_position_embeddings": 4096,
9
+ "model": "llama",
10
+ "num_attention_heads": 32,
11
+ "num_hidden_layers": 32,
12
+ "multi_query_group_num": 4,
13
+ "pad_token_id": 0,
14
+ "layer_norm_eps": 1e-05,
15
+ "rope_theta": 5000000.0,
16
+ "tie_word_embeddings": false,
17
+ "torch_dtype": "bfloat16",
18
+ "vocab_size": 64000,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "segment_vocab_size": 0,
22
+ "mapping": {
23
+ "embeddings.word_embeddings.weight": "model.embed_tokens.weight",
24
+ "LayerNormFinal.weight": "model.norm.weight",
25
+ "lm_head.weight": "lm_head.weight",
26
+ "decoderLayer.0.multiHeadAttention.q.weight": "model.layers.0.self_attn.q_proj.weight",
27
+ "decoderLayer.0.multiHeadAttention.k.weight": "model.layers.0.self_attn.k_proj.weight",
28
+ "decoderLayer.0.multiHeadAttention.v.weight": "model.layers.0.self_attn.v_proj.weight",
29
+ "decoderLayer.0.multiHeadAttention.o.weight": "model.layers.0.self_attn.o_proj.weight",
30
+ "decoderLayer.0.attnLayerNorm.weight": "model.layers.0.ln1.weight",
31
+ "decoderLayer.0.feedForward.intermediateDense.weight": "model.layers.0.mlp.gate_proj.weight",
32
+ "decoderLayer.0.feedForward.outputDense.weight": "model.layers.0.mlp.down_proj.weight",
33
+ "decoderLayer.0.ffnLayerNorm.weight": "model.layers.0.ln2.weight",
34
+ "decoderLayer.0.feedForward.intermediateDense2.weight": "model.layers.0.mlp.up_proj.weight",
35
+ "decoderLayer.1.multiHeadAttention.q.weight": "model.layers.1.self_attn.q_proj.weight",
36
+ "decoderLayer.1.multiHeadAttention.k.weight": "model.layers.1.self_attn.k_proj.weight",
37
+ "decoderLayer.1.multiHeadAttention.v.weight": "model.layers.1.self_attn.v_proj.weight",
38
+ "decoderLayer.1.multiHeadAttention.o.weight": "model.layers.1.self_attn.o_proj.weight",
39
+ "decoderLayer.1.attnLayerNorm.weight": "model.layers.1.ln1.weight",
40
+ "decoderLayer.1.feedForward.intermediateDense.weight": "model.layers.1.mlp.gate_proj.weight",
41
+ "decoderLayer.1.feedForward.outputDense.weight": "model.layers.1.mlp.down_proj.weight",
42
+ "decoderLayer.1.ffnLayerNorm.weight": "model.layers.1.ln2.weight",
43
+ "decoderLayer.1.feedForward.intermediateDense2.weight": "model.layers.1.mlp.up_proj.weight",
44
+ "decoderLayer.2.multiHeadAttention.q.weight": "model.layers.2.self_attn.q_proj.weight",
45
+ "decoderLayer.2.multiHeadAttention.k.weight": "model.layers.2.self_attn.k_proj.weight",
46
+ "decoderLayer.2.multiHeadAttention.v.weight": "model.layers.2.self_attn.v_proj.weight",
47
+ "decoderLayer.2.multiHeadAttention.o.weight": "model.layers.2.self_attn.o_proj.weight",
48
+ "decoderLayer.2.attnLayerNorm.weight": "model.layers.2.ln1.weight",
49
+ "decoderLayer.2.feedForward.intermediateDense.weight": "model.layers.2.mlp.gate_proj.weight",
50
+ "decoderLayer.2.feedForward.outputDense.weight": "model.layers.2.mlp.down_proj.weight",
51
+ "decoderLayer.2.ffnLayerNorm.weight": "model.layers.2.ln2.weight",
52
+ "decoderLayer.2.feedForward.intermediateDense2.weight": "model.layers.2.mlp.up_proj.weight",
53
+ "decoderLayer.3.multiHeadAttention.q.weight": "model.layers.3.self_attn.q_proj.weight",
54
+ "decoderLayer.3.multiHeadAttention.k.weight": "model.layers.3.self_attn.k_proj.weight",
55
+ "decoderLayer.3.multiHeadAttention.v.weight": "model.layers.3.self_attn.v_proj.weight",
56
+ "decoderLayer.3.multiHeadAttention.o.weight": "model.layers.3.self_attn.o_proj.weight",
57
+ "decoderLayer.3.attnLayerNorm.weight": "model.layers.3.ln1.weight",
58
+ "decoderLayer.3.feedForward.intermediateDense.weight": "model.layers.3.mlp.gate_proj.weight",
59
+ "decoderLayer.3.feedForward.outputDense.weight": "model.layers.3.mlp.down_proj.weight",
60
+ "decoderLayer.3.ffnLayerNorm.weight": "model.layers.3.ln2.weight",
61
+ "decoderLayer.3.feedForward.intermediateDense2.weight": "model.layers.3.mlp.up_proj.weight",
62
+ "decoderLayer.4.multiHeadAttention.q.weight": "model.layers.4.self_attn.q_proj.weight",
63
+ "decoderLayer.4.multiHeadAttention.k.weight": "model.layers.4.self_attn.k_proj.weight",
64
+ "decoderLayer.4.multiHeadAttention.v.weight": "model.layers.4.self_attn.v_proj.weight",
65
+ "decoderLayer.4.multiHeadAttention.o.weight": "model.layers.4.self_attn.o_proj.weight",
66
+ "decoderLayer.4.attnLayerNorm.weight": "model.layers.4.ln1.weight",
67
+ "decoderLayer.4.feedForward.intermediateDense.weight": "model.layers.4.mlp.gate_proj.weight",
68
+ "decoderLayer.4.feedForward.outputDense.weight": "model.layers.4.mlp.down_proj.weight",
69
+ "decoderLayer.4.ffnLayerNorm.weight": "model.layers.4.ln2.weight",
70
+ "decoderLayer.4.feedForward.intermediateDense2.weight": "model.layers.4.mlp.up_proj.weight",
71
+ "decoderLayer.5.multiHeadAttention.q.weight": "model.layers.5.self_attn.q_proj.weight",
72
+ "decoderLayer.5.multiHeadAttention.k.weight": "model.layers.5.self_attn.k_proj.weight",
73
+ "decoderLayer.5.multiHeadAttention.v.weight": "model.layers.5.self_attn.v_proj.weight",
74
+ "decoderLayer.5.multiHeadAttention.o.weight": "model.layers.5.self_attn.o_proj.weight",
75
+ "decoderLayer.5.attnLayerNorm.weight": "model.layers.5.ln1.weight",
76
+ "decoderLayer.5.feedForward.intermediateDense.weight": "model.layers.5.mlp.gate_proj.weight",
77
+ "decoderLayer.5.feedForward.outputDense.weight": "model.layers.5.mlp.down_proj.weight",
78
+ "decoderLayer.5.ffnLayerNorm.weight": "model.layers.5.ln2.weight",
79
+ "decoderLayer.5.feedForward.intermediateDense2.weight": "model.layers.5.mlp.up_proj.weight",
80
+ "decoderLayer.6.multiHeadAttention.q.weight": "model.layers.6.self_attn.q_proj.weight",
81
+ "decoderLayer.6.multiHeadAttention.k.weight": "model.layers.6.self_attn.k_proj.weight",
82
+ "decoderLayer.6.multiHeadAttention.v.weight": "model.layers.6.self_attn.v_proj.weight",
83
+ "decoderLayer.6.multiHeadAttention.o.weight": "model.layers.6.self_attn.o_proj.weight",
84
+ "decoderLayer.6.attnLayerNorm.weight": "model.layers.6.ln1.weight",
85
+ "decoderLayer.6.feedForward.intermediateDense.weight": "model.layers.6.mlp.gate_proj.weight",
86
+ "decoderLayer.6.feedForward.outputDense.weight": "model.layers.6.mlp.down_proj.weight",
87
+ "decoderLayer.6.ffnLayerNorm.weight": "model.layers.6.ln2.weight",
88
+ "decoderLayer.6.feedForward.intermediateDense2.weight": "model.layers.6.mlp.up_proj.weight",
89
+ "decoderLayer.7.multiHeadAttention.q.weight": "model.layers.7.self_attn.q_proj.weight",
90
+ "decoderLayer.7.multiHeadAttention.k.weight": "model.layers.7.self_attn.k_proj.weight",
91
+ "decoderLayer.7.multiHeadAttention.v.weight": "model.layers.7.self_attn.v_proj.weight",
92
+ "decoderLayer.7.multiHeadAttention.o.weight": "model.layers.7.self_attn.o_proj.weight",
93
+ "decoderLayer.7.attnLayerNorm.weight": "model.layers.7.ln1.weight",
94
+ "decoderLayer.7.feedForward.intermediateDense.weight": "model.layers.7.mlp.gate_proj.weight",
95
+ "decoderLayer.7.feedForward.outputDense.weight": "model.layers.7.mlp.down_proj.weight",
96
+ "decoderLayer.7.ffnLayerNorm.weight": "model.layers.7.ln2.weight",
97
+ "decoderLayer.7.feedForward.intermediateDense2.weight": "model.layers.7.mlp.up_proj.weight",
98
+ "decoderLayer.8.multiHeadAttention.q.weight": "model.layers.8.self_attn.q_proj.weight",
99
+ "decoderLayer.8.multiHeadAttention.k.weight": "model.layers.8.self_attn.k_proj.weight",
100
+ "decoderLayer.8.multiHeadAttention.v.weight": "model.layers.8.self_attn.v_proj.weight",
101
+ "decoderLayer.8.multiHeadAttention.o.weight": "model.layers.8.self_attn.o_proj.weight",
102
+ "decoderLayer.8.attnLayerNorm.weight": "model.layers.8.ln1.weight",
103
+ "decoderLayer.8.feedForward.intermediateDense.weight": "model.layers.8.mlp.gate_proj.weight",
104
+ "decoderLayer.8.feedForward.outputDense.weight": "model.layers.8.mlp.down_proj.weight",
105
+ "decoderLayer.8.ffnLayerNorm.weight": "model.layers.8.ln2.weight",
106
+ "decoderLayer.8.feedForward.intermediateDense2.weight": "model.layers.8.mlp.up_proj.weight",
107
+ "decoderLayer.9.multiHeadAttention.q.weight": "model.layers.9.self_attn.q_proj.weight",
108
+ "decoderLayer.9.multiHeadAttention.k.weight": "model.layers.9.self_attn.k_proj.weight",
109
+ "decoderLayer.9.multiHeadAttention.v.weight": "model.layers.9.self_attn.v_proj.weight",
110
+ "decoderLayer.9.multiHeadAttention.o.weight": "model.layers.9.self_attn.o_proj.weight",
111
+ "decoderLayer.9.attnLayerNorm.weight": "model.layers.9.ln1.weight",
112
+ "decoderLayer.9.feedForward.intermediateDense.weight": "model.layers.9.mlp.gate_proj.weight",
113
+ "decoderLayer.9.feedForward.outputDense.weight": "model.layers.9.mlp.down_proj.weight",
114
+ "decoderLayer.9.ffnLayerNorm.weight": "model.layers.9.ln2.weight",
115
+ "decoderLayer.9.feedForward.intermediateDense2.weight": "model.layers.9.mlp.up_proj.weight",
116
+ "decoderLayer.10.multiHeadAttention.q.weight": "model.layers.10.self_attn.q_proj.weight",
117
+ "decoderLayer.10.multiHeadAttention.k.weight": "model.layers.10.self_attn.k_proj.weight",
118
+ "decoderLayer.10.multiHeadAttention.v.weight": "model.layers.10.self_attn.v_proj.weight",
119
+ "decoderLayer.10.multiHeadAttention.o.weight": "model.layers.10.self_attn.o_proj.weight",
120
+ "decoderLayer.10.attnLayerNorm.weight": "model.layers.10.ln1.weight",
121
+ "decoderLayer.10.feedForward.intermediateDense.weight": "model.layers.10.mlp.gate_proj.weight",
122
+ "decoderLayer.10.feedForward.outputDense.weight": "model.layers.10.mlp.down_proj.weight",
123
+ "decoderLayer.10.ffnLayerNorm.weight": "model.layers.10.ln2.weight",
124
+ "decoderLayer.10.feedForward.intermediateDense2.weight": "model.layers.10.mlp.up_proj.weight",
125
+ "decoderLayer.11.multiHeadAttention.q.weight": "model.layers.11.self_attn.q_proj.weight",
126
+ "decoderLayer.11.multiHeadAttention.k.weight": "model.layers.11.self_attn.k_proj.weight",
127
+ "decoderLayer.11.multiHeadAttention.v.weight": "model.layers.11.self_attn.v_proj.weight",
128
+ "decoderLayer.11.multiHeadAttention.o.weight": "model.layers.11.self_attn.o_proj.weight",
129
+ "decoderLayer.11.attnLayerNorm.weight": "model.layers.11.ln1.weight",
130
+ "decoderLayer.11.feedForward.intermediateDense.weight": "model.layers.11.mlp.gate_proj.weight",
131
+ "decoderLayer.11.feedForward.outputDense.weight": "model.layers.11.mlp.down_proj.weight",
132
+ "decoderLayer.11.ffnLayerNorm.weight": "model.layers.11.ln2.weight",
133
+ "decoderLayer.11.feedForward.intermediateDense2.weight": "model.layers.11.mlp.up_proj.weight",
134
+ "decoderLayer.12.multiHeadAttention.q.weight": "model.layers.12.self_attn.q_proj.weight",
135
+ "decoderLayer.12.multiHeadAttention.k.weight": "model.layers.12.self_attn.k_proj.weight",
136
+ "decoderLayer.12.multiHeadAttention.v.weight": "model.layers.12.self_attn.v_proj.weight",
137
+ "decoderLayer.12.multiHeadAttention.o.weight": "model.layers.12.self_attn.o_proj.weight",
138
+ "decoderLayer.12.attnLayerNorm.weight": "model.layers.12.ln1.weight",
139
+ "decoderLayer.12.feedForward.intermediateDense.weight": "model.layers.12.mlp.gate_proj.weight",
140
+ "decoderLayer.12.feedForward.outputDense.weight": "model.layers.12.mlp.down_proj.weight",
141
+ "decoderLayer.12.ffnLayerNorm.weight": "model.layers.12.ln2.weight",
142
+ "decoderLayer.12.feedForward.intermediateDense2.weight": "model.layers.12.mlp.up_proj.weight",
143
+ "decoderLayer.13.multiHeadAttention.q.weight": "model.layers.13.self_attn.q_proj.weight",
144
+ "decoderLayer.13.multiHeadAttention.k.weight": "model.layers.13.self_attn.k_proj.weight",
145
+ "decoderLayer.13.multiHeadAttention.v.weight": "model.layers.13.self_attn.v_proj.weight",
146
+ "decoderLayer.13.multiHeadAttention.o.weight": "model.layers.13.self_attn.o_proj.weight",
147
+ "decoderLayer.13.attnLayerNorm.weight": "model.layers.13.ln1.weight",
148
+ "decoderLayer.13.feedForward.intermediateDense.weight": "model.layers.13.mlp.gate_proj.weight",
149
+ "decoderLayer.13.feedForward.outputDense.weight": "model.layers.13.mlp.down_proj.weight",
150
+ "decoderLayer.13.ffnLayerNorm.weight": "model.layers.13.ln2.weight",
151
+ "decoderLayer.13.feedForward.intermediateDense2.weight": "model.layers.13.mlp.up_proj.weight",
152
+ "decoderLayer.14.multiHeadAttention.q.weight": "model.layers.14.self_attn.q_proj.weight",
153
+ "decoderLayer.14.multiHeadAttention.k.weight": "model.layers.14.self_attn.k_proj.weight",
154
+ "decoderLayer.14.multiHeadAttention.v.weight": "model.layers.14.self_attn.v_proj.weight",
155
+ "decoderLayer.14.multiHeadAttention.o.weight": "model.layers.14.self_attn.o_proj.weight",
156
+ "decoderLayer.14.attnLayerNorm.weight": "model.layers.14.ln1.weight",
157
+ "decoderLayer.14.feedForward.intermediateDense.weight": "model.layers.14.mlp.gate_proj.weight",
158
+ "decoderLayer.14.feedForward.outputDense.weight": "model.layers.14.mlp.down_proj.weight",
159
+ "decoderLayer.14.ffnLayerNorm.weight": "model.layers.14.ln2.weight",
160
+ "decoderLayer.14.feedForward.intermediateDense2.weight": "model.layers.14.mlp.up_proj.weight",
161
+ "decoderLayer.15.multiHeadAttention.q.weight": "model.layers.15.self_attn.q_proj.weight",
162
+ "decoderLayer.15.multiHeadAttention.k.weight": "model.layers.15.self_attn.k_proj.weight",
163
+ "decoderLayer.15.multiHeadAttention.v.weight": "model.layers.15.self_attn.v_proj.weight",
164
+ "decoderLayer.15.multiHeadAttention.o.weight": "model.layers.15.self_attn.o_proj.weight",
165
+ "decoderLayer.15.attnLayerNorm.weight": "model.layers.15.ln1.weight",
166
+ "decoderLayer.15.feedForward.intermediateDense.weight": "model.layers.15.mlp.gate_proj.weight",
167
+ "decoderLayer.15.feedForward.outputDense.weight": "model.layers.15.mlp.down_proj.weight",
168
+ "decoderLayer.15.ffnLayerNorm.weight": "model.layers.15.ln2.weight",
169
+ "decoderLayer.15.feedForward.intermediateDense2.weight": "model.layers.15.mlp.up_proj.weight",
170
+ "decoderLayer.16.multiHeadAttention.q.weight": "model.layers.16.self_attn.q_proj.weight",
171
+ "decoderLayer.16.multiHeadAttention.k.weight": "model.layers.16.self_attn.k_proj.weight",
172
+ "decoderLayer.16.multiHeadAttention.v.weight": "model.layers.16.self_attn.v_proj.weight",
173
+ "decoderLayer.16.multiHeadAttention.o.weight": "model.layers.16.self_attn.o_proj.weight",
174
+ "decoderLayer.16.attnLayerNorm.weight": "model.layers.16.ln1.weight",
175
+ "decoderLayer.16.feedForward.intermediateDense.weight": "model.layers.16.mlp.gate_proj.weight",
176
+ "decoderLayer.16.feedForward.outputDense.weight": "model.layers.16.mlp.down_proj.weight",
177
+ "decoderLayer.16.ffnLayerNorm.weight": "model.layers.16.ln2.weight",
178
+ "decoderLayer.16.feedForward.intermediateDense2.weight": "model.layers.16.mlp.up_proj.weight",
179
+ "decoderLayer.17.multiHeadAttention.q.weight": "model.layers.17.self_attn.q_proj.weight",
180
+ "decoderLayer.17.multiHeadAttention.k.weight": "model.layers.17.self_attn.k_proj.weight",
181
+ "decoderLayer.17.multiHeadAttention.v.weight": "model.layers.17.self_attn.v_proj.weight",
182
+ "decoderLayer.17.multiHeadAttention.o.weight": "model.layers.17.self_attn.o_proj.weight",
183
+ "decoderLayer.17.attnLayerNorm.weight": "model.layers.17.ln1.weight",
184
+ "decoderLayer.17.feedForward.intermediateDense.weight": "model.layers.17.mlp.gate_proj.weight",
185
+ "decoderLayer.17.feedForward.outputDense.weight": "model.layers.17.mlp.down_proj.weight",
186
+ "decoderLayer.17.ffnLayerNorm.weight": "model.layers.17.ln2.weight",
187
+ "decoderLayer.17.feedForward.intermediateDense2.weight": "model.layers.17.mlp.up_proj.weight",
188
+ "decoderLayer.18.multiHeadAttention.q.weight": "model.layers.18.self_attn.q_proj.weight",
189
+ "decoderLayer.18.multiHeadAttention.k.weight": "model.layers.18.self_attn.k_proj.weight",
190
+ "decoderLayer.18.multiHeadAttention.v.weight": "model.layers.18.self_attn.v_proj.weight",
191
+ "decoderLayer.18.multiHeadAttention.o.weight": "model.layers.18.self_attn.o_proj.weight",
192
+ "decoderLayer.18.attnLayerNorm.weight": "model.layers.18.ln1.weight",
193
+ "decoderLayer.18.feedForward.intermediateDense.weight": "model.layers.18.mlp.gate_proj.weight",
194
+ "decoderLayer.18.feedForward.outputDense.weight": "model.layers.18.mlp.down_proj.weight",
195
+ "decoderLayer.18.ffnLayerNorm.weight": "model.layers.18.ln2.weight",
196
+ "decoderLayer.18.feedForward.intermediateDense2.weight": "model.layers.18.mlp.up_proj.weight",
197
+ "decoderLayer.19.multiHeadAttention.q.weight": "model.layers.19.self_attn.q_proj.weight",
198
+ "decoderLayer.19.multiHeadAttention.k.weight": "model.layers.19.self_attn.k_proj.weight",
199
+ "decoderLayer.19.multiHeadAttention.v.weight": "model.layers.19.self_attn.v_proj.weight",
200
+ "decoderLayer.19.multiHeadAttention.o.weight": "model.layers.19.self_attn.o_proj.weight",
201
+ "decoderLayer.19.attnLayerNorm.weight": "model.layers.19.ln1.weight",
202
+ "decoderLayer.19.feedForward.intermediateDense.weight": "model.layers.19.mlp.gate_proj.weight",
203
+ "decoderLayer.19.feedForward.outputDense.weight": "model.layers.19.mlp.down_proj.weight",
204
+ "decoderLayer.19.ffnLayerNorm.weight": "model.layers.19.ln2.weight",
205
+ "decoderLayer.19.feedForward.intermediateDense2.weight": "model.layers.19.mlp.up_proj.weight",
206
+ "decoderLayer.20.multiHeadAttention.q.weight": "model.layers.20.self_attn.q_proj.weight",
207
+ "decoderLayer.20.multiHeadAttention.k.weight": "model.layers.20.self_attn.k_proj.weight",
208
+ "decoderLayer.20.multiHeadAttention.v.weight": "model.layers.20.self_attn.v_proj.weight",
209
+ "decoderLayer.20.multiHeadAttention.o.weight": "model.layers.20.self_attn.o_proj.weight",
210
+ "decoderLayer.20.attnLayerNorm.weight": "model.layers.20.ln1.weight",
211
+ "decoderLayer.20.feedForward.intermediateDense.weight": "model.layers.20.mlp.gate_proj.weight",
212
+ "decoderLayer.20.feedForward.outputDense.weight": "model.layers.20.mlp.down_proj.weight",
213
+ "decoderLayer.20.ffnLayerNorm.weight": "model.layers.20.ln2.weight",
214
+ "decoderLayer.20.feedForward.intermediateDense2.weight": "model.layers.20.mlp.up_proj.weight",
215
+ "decoderLayer.21.multiHeadAttention.q.weight": "model.layers.21.self_attn.q_proj.weight",
216
+ "decoderLayer.21.multiHeadAttention.k.weight": "model.layers.21.self_attn.k_proj.weight",
217
+ "decoderLayer.21.multiHeadAttention.v.weight": "model.layers.21.self_attn.v_proj.weight",
218
+ "decoderLayer.21.multiHeadAttention.o.weight": "model.layers.21.self_attn.o_proj.weight",
219
+ "decoderLayer.21.attnLayerNorm.weight": "model.layers.21.ln1.weight",
220
+ "decoderLayer.21.feedForward.intermediateDense.weight": "model.layers.21.mlp.gate_proj.weight",
221
+ "decoderLayer.21.feedForward.outputDense.weight": "model.layers.21.mlp.down_proj.weight",
222
+ "decoderLayer.21.ffnLayerNorm.weight": "model.layers.21.ln2.weight",
223
+ "decoderLayer.21.feedForward.intermediateDense2.weight": "model.layers.21.mlp.up_proj.weight",
224
+ "decoderLayer.22.multiHeadAttention.q.weight": "model.layers.22.self_attn.q_proj.weight",
225
+ "decoderLayer.22.multiHeadAttention.k.weight": "model.layers.22.self_attn.k_proj.weight",
226
+ "decoderLayer.22.multiHeadAttention.v.weight": "model.layers.22.self_attn.v_proj.weight",
227
+ "decoderLayer.22.multiHeadAttention.o.weight": "model.layers.22.self_attn.o_proj.weight",
228
+ "decoderLayer.22.attnLayerNorm.weight": "model.layers.22.ln1.weight",
229
+ "decoderLayer.22.feedForward.intermediateDense.weight": "model.layers.22.mlp.gate_proj.weight",
230
+ "decoderLayer.22.feedForward.outputDense.weight": "model.layers.22.mlp.down_proj.weight",
231
+ "decoderLayer.22.ffnLayerNorm.weight": "model.layers.22.ln2.weight",
232
+ "decoderLayer.22.feedForward.intermediateDense2.weight": "model.layers.22.mlp.up_proj.weight",
233
+ "decoderLayer.23.multiHeadAttention.q.weight": "model.layers.23.self_attn.q_proj.weight",
234
+ "decoderLayer.23.multiHeadAttention.k.weight": "model.layers.23.self_attn.k_proj.weight",
235
+ "decoderLayer.23.multiHeadAttention.v.weight": "model.layers.23.self_attn.v_proj.weight",
236
+ "decoderLayer.23.multiHeadAttention.o.weight": "model.layers.23.self_attn.o_proj.weight",
237
+ "decoderLayer.23.attnLayerNorm.weight": "model.layers.23.ln1.weight",
238
+ "decoderLayer.23.feedForward.intermediateDense.weight": "model.layers.23.mlp.gate_proj.weight",
239
+ "decoderLayer.23.feedForward.outputDense.weight": "model.layers.23.mlp.down_proj.weight",
240
+ "decoderLayer.23.ffnLayerNorm.weight": "model.layers.23.ln2.weight",
241
+ "decoderLayer.23.feedForward.intermediateDense2.weight": "model.layers.23.mlp.up_proj.weight",
242
+ "decoderLayer.24.multiHeadAttention.q.weight": "model.layers.24.self_attn.q_proj.weight",
243
+ "decoderLayer.24.multiHeadAttention.k.weight": "model.layers.24.self_attn.k_proj.weight",
244
+ "decoderLayer.24.multiHeadAttention.v.weight": "model.layers.24.self_attn.v_proj.weight",
245
+ "decoderLayer.24.multiHeadAttention.o.weight": "model.layers.24.self_attn.o_proj.weight",
246
+ "decoderLayer.24.attnLayerNorm.weight": "model.layers.24.ln1.weight",
247
+ "decoderLayer.24.feedForward.intermediateDense.weight": "model.layers.24.mlp.gate_proj.weight",
248
+ "decoderLayer.24.feedForward.outputDense.weight": "model.layers.24.mlp.down_proj.weight",
249
+ "decoderLayer.24.ffnLayerNorm.weight": "model.layers.24.ln2.weight",
250
+ "decoderLayer.24.feedForward.intermediateDense2.weight": "model.layers.24.mlp.up_proj.weight",
251
+ "decoderLayer.25.multiHeadAttention.q.weight": "model.layers.25.self_attn.q_proj.weight",
252
+ "decoderLayer.25.multiHeadAttention.k.weight": "model.layers.25.self_attn.k_proj.weight",
253
+ "decoderLayer.25.multiHeadAttention.v.weight": "model.layers.25.self_attn.v_proj.weight",
254
+ "decoderLayer.25.multiHeadAttention.o.weight": "model.layers.25.self_attn.o_proj.weight",
255
+ "decoderLayer.25.attnLayerNorm.weight": "model.layers.25.ln1.weight",
256
+ "decoderLayer.25.feedForward.intermediateDense.weight": "model.layers.25.mlp.gate_proj.weight",
257
+ "decoderLayer.25.feedForward.outputDense.weight": "model.layers.25.mlp.down_proj.weight",
258
+ "decoderLayer.25.ffnLayerNorm.weight": "model.layers.25.ln2.weight",
259
+ "decoderLayer.25.feedForward.intermediateDense2.weight": "model.layers.25.mlp.up_proj.weight",
260
+ "decoderLayer.26.multiHeadAttention.q.weight": "model.layers.26.self_attn.q_proj.weight",
261
+ "decoderLayer.26.multiHeadAttention.k.weight": "model.layers.26.self_attn.k_proj.weight",
262
+ "decoderLayer.26.multiHeadAttention.v.weight": "model.layers.26.self_attn.v_proj.weight",
263
+ "decoderLayer.26.multiHeadAttention.o.weight": "model.layers.26.self_attn.o_proj.weight",
264
+ "decoderLayer.26.attnLayerNorm.weight": "model.layers.26.ln1.weight",
265
+ "decoderLayer.26.feedForward.intermediateDense.weight": "model.layers.26.mlp.gate_proj.weight",
266
+ "decoderLayer.26.feedForward.outputDense.weight": "model.layers.26.mlp.down_proj.weight",
267
+ "decoderLayer.26.ffnLayerNorm.weight": "model.layers.26.ln2.weight",
268
+ "decoderLayer.26.feedForward.intermediateDense2.weight": "model.layers.26.mlp.up_proj.weight",
269
+ "decoderLayer.27.multiHeadAttention.q.weight": "model.layers.27.self_attn.q_proj.weight",
270
+ "decoderLayer.27.multiHeadAttention.k.weight": "model.layers.27.self_attn.k_proj.weight",
271
+ "decoderLayer.27.multiHeadAttention.v.weight": "model.layers.27.self_attn.v_proj.weight",
272
+ "decoderLayer.27.multiHeadAttention.o.weight": "model.layers.27.self_attn.o_proj.weight",
273
+ "decoderLayer.27.attnLayerNorm.weight": "model.layers.27.ln1.weight",
274
+ "decoderLayer.27.feedForward.intermediateDense.weight": "model.layers.27.mlp.gate_proj.weight",
275
+ "decoderLayer.27.feedForward.outputDense.weight": "model.layers.27.mlp.down_proj.weight",
276
+ "decoderLayer.27.ffnLayerNorm.weight": "model.layers.27.ln2.weight",
277
+ "decoderLayer.27.feedForward.intermediateDense2.weight": "model.layers.27.mlp.up_proj.weight",
278
+ "decoderLayer.28.multiHeadAttention.q.weight": "model.layers.28.self_attn.q_proj.weight",
279
+ "decoderLayer.28.multiHeadAttention.k.weight": "model.layers.28.self_attn.k_proj.weight",
280
+ "decoderLayer.28.multiHeadAttention.v.weight": "model.layers.28.self_attn.v_proj.weight",
281
+ "decoderLayer.28.multiHeadAttention.o.weight": "model.layers.28.self_attn.o_proj.weight",
282
+ "decoderLayer.28.attnLayerNorm.weight": "model.layers.28.ln1.weight",
283
+ "decoderLayer.28.feedForward.intermediateDense.weight": "model.layers.28.mlp.gate_proj.weight",
284
+ "decoderLayer.28.feedForward.outputDense.weight": "model.layers.28.mlp.down_proj.weight",
285
+ "decoderLayer.28.ffnLayerNorm.weight": "model.layers.28.ln2.weight",
286
+ "decoderLayer.28.feedForward.intermediateDense2.weight": "model.layers.28.mlp.up_proj.weight",
287
+ "decoderLayer.29.multiHeadAttention.q.weight": "model.layers.29.self_attn.q_proj.weight",
288
+ "decoderLayer.29.multiHeadAttention.k.weight": "model.layers.29.self_attn.k_proj.weight",
289
+ "decoderLayer.29.multiHeadAttention.v.weight": "model.layers.29.self_attn.v_proj.weight",
290
+ "decoderLayer.29.multiHeadAttention.o.weight": "model.layers.29.self_attn.o_proj.weight",
291
+ "decoderLayer.29.attnLayerNorm.weight": "model.layers.29.ln1.weight",
292
+ "decoderLayer.29.feedForward.intermediateDense.weight": "model.layers.29.mlp.gate_proj.weight",
293
+ "decoderLayer.29.feedForward.outputDense.weight": "model.layers.29.mlp.down_proj.weight",
294
+ "decoderLayer.29.ffnLayerNorm.weight": "model.layers.29.ln2.weight",
295
+ "decoderLayer.29.feedForward.intermediateDense2.weight": "model.layers.29.mlp.up_proj.weight",
296
+ "decoderLayer.30.multiHeadAttention.q.weight": "model.layers.30.self_attn.q_proj.weight",
297
+ "decoderLayer.30.multiHeadAttention.k.weight": "model.layers.30.self_attn.k_proj.weight",
298
+ "decoderLayer.30.multiHeadAttention.v.weight": "model.layers.30.self_attn.v_proj.weight",
299
+ "decoderLayer.30.multiHeadAttention.o.weight": "model.layers.30.self_attn.o_proj.weight",
300
+ "decoderLayer.30.attnLayerNorm.weight": "model.layers.30.ln1.weight",
301
+ "decoderLayer.30.feedForward.intermediateDense.weight": "model.layers.30.mlp.gate_proj.weight",
302
+ "decoderLayer.30.feedForward.outputDense.weight": "model.layers.30.mlp.down_proj.weight",
303
+ "decoderLayer.30.ffnLayerNorm.weight": "model.layers.30.ln2.weight",
304
+ "decoderLayer.30.feedForward.intermediateDense2.weight": "model.layers.30.mlp.up_proj.weight",
305
+ "decoderLayer.31.multiHeadAttention.q.weight": "model.layers.31.self_attn.q_proj.weight",
306
+ "decoderLayer.31.multiHeadAttention.k.weight": "model.layers.31.self_attn.k_proj.weight",
307
+ "decoderLayer.31.multiHeadAttention.v.weight": "model.layers.31.self_attn.v_proj.weight",
308
+ "decoderLayer.31.multiHeadAttention.o.weight": "model.layers.31.self_attn.o_proj.weight",
309
+ "decoderLayer.31.attnLayerNorm.weight": "model.layers.31.ln1.weight",
310
+ "decoderLayer.31.feedForward.intermediateDense.weight": "model.layers.31.mlp.gate_proj.weight",
311
+ "decoderLayer.31.feedForward.outputDense.weight": "model.layers.31.mlp.down_proj.weight",
312
+ "decoderLayer.31.ffnLayerNorm.weight": "model.layers.31.ln2.weight",
313
+ "decoderLayer.31.feedForward.intermediateDense2.weight": "model.layers.31.mlp.up_proj.weight"
314
+ }
315
+ }
Ziya-LLaMA-13B-v1.1/bert4torch_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 5120,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 13824,
9
+ "max_position_embeddings": 2048,
10
+ "num_attention_heads": 40,
11
+ "num_hidden_layers": 40,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "use_cache": true,
16
+ "vocab_size": 39424,
17
+ "segment_vocab_size": 0,
18
+ "skip_init": true,
19
+ "rope_rank": "updown"
20
+ }
Ziya-LLaMA-13B-v1/bert4torch_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "hidden_act": "silu",
6
+ "hidden_size": 5120,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 13824,
9
+ "max_position_embeddings": 2048,
10
+ "num_attention_heads": 40,
11
+ "num_hidden_layers": 40,
12
+ "pad_token_id": 0,
13
+ "layer_norm_eps": 1e-06,
14
+ "tie_word_embeddings": false,
15
+ "use_cache": true,
16
+ "vocab_size": 39424,
17
+ "segment_vocab_size": 0,
18
+ "skip_init": true,
19
+ "rope_rank": "updown"
20
+ }
bart-base-chinese/bert4torch_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "bart",
3
+ "attention_probs_dropout_prob": 0.1,
4
+ "hidden_act": "gelu",
5
+ "hidden_dropout_prob": 0.1,
6
+ "hidden_size": 768,
7
+ "initializer_range": 0.02,
8
+ "intermediate_size": 3072,
9
+ "max_position_embeddings": 1024,
10
+ "num_attention_heads": 12,
11
+ "num_hidden_layers": 6,
12
+ "type_vocab_size": 0,
13
+ "vocab_size": 51271
14
+ }
bert-base-multilingual-cased/bert4torch_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "directionality": "bidi",
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "pad_token_id": 0,
18
+ "pooler_fc_size": 768,
19
+ "pooler_num_attention_heads": 12,
20
+ "pooler_num_fc_layers": 3,
21
+ "pooler_size_per_head": 128,
22
+ "pooler_type": "first_token_transform",
23
+ "type_vocab_size": 2,
24
+ "vocab_size": 119547
25
+ }
bge-large-en-v1.5/bert4torch_config.json ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 4096,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "pad_token_id": 0,
14
+ "torch_dtype": "float32",
15
+ "type_vocab_size": 2,
16
+ "vocab_size": 30522,
17
+ "with_pool": true,
18
+ "pool_strategy": "cls",
19
+ "mapping": {
20
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
21
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
22
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
23
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
24
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
25
+ "pooler.weight": "pooler.dense.weight",
26
+ "pooler.bias": "pooler.dense.bias",
27
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
28
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
29
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
30
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
31
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
32
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
33
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
34
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
35
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
36
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
37
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
38
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
39
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
40
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
41
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
42
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
43
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
44
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
45
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
46
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
47
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
48
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
49
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
50
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
51
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
52
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
53
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
54
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
55
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
56
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
57
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
58
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
59
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
60
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
61
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
62
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
63
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
64
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
65
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
66
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
67
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
68
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
69
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
70
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
71
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
72
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
73
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
74
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
75
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
76
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
77
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
78
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
79
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
80
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
81
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
82
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
83
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
84
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
85
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
86
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
87
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
88
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
89
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
90
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
91
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
92
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
93
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
94
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
95
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
96
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
97
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
98
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
99
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
100
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
101
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
102
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
103
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
104
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
105
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
106
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
107
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
108
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
109
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
110
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
111
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
112
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
113
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
114
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
115
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
116
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
117
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
118
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
119
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
120
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
121
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
122
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
123
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
124
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
125
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
126
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
127
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
128
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
129
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
130
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
131
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
132
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
133
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
134
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
135
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
136
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
137
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
138
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
139
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
140
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
141
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
142
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
143
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
144
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
145
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
146
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
147
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
148
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
149
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
150
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
151
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
152
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
153
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
154
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
155
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
156
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
157
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
158
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
159
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
160
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
161
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
162
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
163
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
164
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
165
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
166
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
167
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
168
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
169
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
170
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
171
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
172
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
173
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
174
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
175
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
176
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
177
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
178
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
179
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
180
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
181
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
182
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
183
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
184
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
185
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
186
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
187
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
188
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
189
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
190
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
191
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
192
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
193
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
194
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
195
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
196
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
197
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
198
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
199
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
200
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
201
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
202
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
203
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
204
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
205
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
206
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
207
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
208
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
209
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
210
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
211
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
212
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
213
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
214
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
215
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
216
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
217
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
218
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias",
219
+ "encoderLayer.12.multiHeadAttention.q.weight": "encoder.layer.12.attention.self.query.weight",
220
+ "encoderLayer.12.multiHeadAttention.q.bias": "encoder.layer.12.attention.self.query.bias",
221
+ "encoderLayer.12.multiHeadAttention.k.weight": "encoder.layer.12.attention.self.key.weight",
222
+ "encoderLayer.12.multiHeadAttention.k.bias": "encoder.layer.12.attention.self.key.bias",
223
+ "encoderLayer.12.multiHeadAttention.v.weight": "encoder.layer.12.attention.self.value.weight",
224
+ "encoderLayer.12.multiHeadAttention.v.bias": "encoder.layer.12.attention.self.value.bias",
225
+ "encoderLayer.12.multiHeadAttention.o.weight": "encoder.layer.12.attention.output.dense.weight",
226
+ "encoderLayer.12.multiHeadAttention.o.bias": "encoder.layer.12.attention.output.dense.bias",
227
+ "encoderLayer.12.attnLayerNorm.weight": "encoder.layer.12.attention.output.LayerNorm.weight",
228
+ "encoderLayer.12.attnLayerNorm.bias": "encoder.layer.12.attention.output.LayerNorm.bias",
229
+ "encoderLayer.12.feedForward.intermediateDense.weight": "encoder.layer.12.intermediate.dense.weight",
230
+ "encoderLayer.12.feedForward.intermediateDense.bias": "encoder.layer.12.intermediate.dense.bias",
231
+ "encoderLayer.12.feedForward.outputDense.weight": "encoder.layer.12.output.dense.weight",
232
+ "encoderLayer.12.feedForward.outputDense.bias": "encoder.layer.12.output.dense.bias",
233
+ "encoderLayer.12.ffnLayerNorm.weight": "encoder.layer.12.output.LayerNorm.weight",
234
+ "encoderLayer.12.ffnLayerNorm.bias": "encoder.layer.12.output.LayerNorm.bias",
235
+ "encoderLayer.13.multiHeadAttention.q.weight": "encoder.layer.13.attention.self.query.weight",
236
+ "encoderLayer.13.multiHeadAttention.q.bias": "encoder.layer.13.attention.self.query.bias",
237
+ "encoderLayer.13.multiHeadAttention.k.weight": "encoder.layer.13.attention.self.key.weight",
238
+ "encoderLayer.13.multiHeadAttention.k.bias": "encoder.layer.13.attention.self.key.bias",
239
+ "encoderLayer.13.multiHeadAttention.v.weight": "encoder.layer.13.attention.self.value.weight",
240
+ "encoderLayer.13.multiHeadAttention.v.bias": "encoder.layer.13.attention.self.value.bias",
241
+ "encoderLayer.13.multiHeadAttention.o.weight": "encoder.layer.13.attention.output.dense.weight",
242
+ "encoderLayer.13.multiHeadAttention.o.bias": "encoder.layer.13.attention.output.dense.bias",
243
+ "encoderLayer.13.attnLayerNorm.weight": "encoder.layer.13.attention.output.LayerNorm.weight",
244
+ "encoderLayer.13.attnLayerNorm.bias": "encoder.layer.13.attention.output.LayerNorm.bias",
245
+ "encoderLayer.13.feedForward.intermediateDense.weight": "encoder.layer.13.intermediate.dense.weight",
246
+ "encoderLayer.13.feedForward.intermediateDense.bias": "encoder.layer.13.intermediate.dense.bias",
247
+ "encoderLayer.13.feedForward.outputDense.weight": "encoder.layer.13.output.dense.weight",
248
+ "encoderLayer.13.feedForward.outputDense.bias": "encoder.layer.13.output.dense.bias",
249
+ "encoderLayer.13.ffnLayerNorm.weight": "encoder.layer.13.output.LayerNorm.weight",
250
+ "encoderLayer.13.ffnLayerNorm.bias": "encoder.layer.13.output.LayerNorm.bias",
251
+ "encoderLayer.14.multiHeadAttention.q.weight": "encoder.layer.14.attention.self.query.weight",
252
+ "encoderLayer.14.multiHeadAttention.q.bias": "encoder.layer.14.attention.self.query.bias",
253
+ "encoderLayer.14.multiHeadAttention.k.weight": "encoder.layer.14.attention.self.key.weight",
254
+ "encoderLayer.14.multiHeadAttention.k.bias": "encoder.layer.14.attention.self.key.bias",
255
+ "encoderLayer.14.multiHeadAttention.v.weight": "encoder.layer.14.attention.self.value.weight",
256
+ "encoderLayer.14.multiHeadAttention.v.bias": "encoder.layer.14.attention.self.value.bias",
257
+ "encoderLayer.14.multiHeadAttention.o.weight": "encoder.layer.14.attention.output.dense.weight",
258
+ "encoderLayer.14.multiHeadAttention.o.bias": "encoder.layer.14.attention.output.dense.bias",
259
+ "encoderLayer.14.attnLayerNorm.weight": "encoder.layer.14.attention.output.LayerNorm.weight",
260
+ "encoderLayer.14.attnLayerNorm.bias": "encoder.layer.14.attention.output.LayerNorm.bias",
261
+ "encoderLayer.14.feedForward.intermediateDense.weight": "encoder.layer.14.intermediate.dense.weight",
262
+ "encoderLayer.14.feedForward.intermediateDense.bias": "encoder.layer.14.intermediate.dense.bias",
263
+ "encoderLayer.14.feedForward.outputDense.weight": "encoder.layer.14.output.dense.weight",
264
+ "encoderLayer.14.feedForward.outputDense.bias": "encoder.layer.14.output.dense.bias",
265
+ "encoderLayer.14.ffnLayerNorm.weight": "encoder.layer.14.output.LayerNorm.weight",
266
+ "encoderLayer.14.ffnLayerNorm.bias": "encoder.layer.14.output.LayerNorm.bias",
267
+ "encoderLayer.15.multiHeadAttention.q.weight": "encoder.layer.15.attention.self.query.weight",
268
+ "encoderLayer.15.multiHeadAttention.q.bias": "encoder.layer.15.attention.self.query.bias",
269
+ "encoderLayer.15.multiHeadAttention.k.weight": "encoder.layer.15.attention.self.key.weight",
270
+ "encoderLayer.15.multiHeadAttention.k.bias": "encoder.layer.15.attention.self.key.bias",
271
+ "encoderLayer.15.multiHeadAttention.v.weight": "encoder.layer.15.attention.self.value.weight",
272
+ "encoderLayer.15.multiHeadAttention.v.bias": "encoder.layer.15.attention.self.value.bias",
273
+ "encoderLayer.15.multiHeadAttention.o.weight": "encoder.layer.15.attention.output.dense.weight",
274
+ "encoderLayer.15.multiHeadAttention.o.bias": "encoder.layer.15.attention.output.dense.bias",
275
+ "encoderLayer.15.attnLayerNorm.weight": "encoder.layer.15.attention.output.LayerNorm.weight",
276
+ "encoderLayer.15.attnLayerNorm.bias": "encoder.layer.15.attention.output.LayerNorm.bias",
277
+ "encoderLayer.15.feedForward.intermediateDense.weight": "encoder.layer.15.intermediate.dense.weight",
278
+ "encoderLayer.15.feedForward.intermediateDense.bias": "encoder.layer.15.intermediate.dense.bias",
279
+ "encoderLayer.15.feedForward.outputDense.weight": "encoder.layer.15.output.dense.weight",
280
+ "encoderLayer.15.feedForward.outputDense.bias": "encoder.layer.15.output.dense.bias",
281
+ "encoderLayer.15.ffnLayerNorm.weight": "encoder.layer.15.output.LayerNorm.weight",
282
+ "encoderLayer.15.ffnLayerNorm.bias": "encoder.layer.15.output.LayerNorm.bias",
283
+ "encoderLayer.16.multiHeadAttention.q.weight": "encoder.layer.16.attention.self.query.weight",
284
+ "encoderLayer.16.multiHeadAttention.q.bias": "encoder.layer.16.attention.self.query.bias",
285
+ "encoderLayer.16.multiHeadAttention.k.weight": "encoder.layer.16.attention.self.key.weight",
286
+ "encoderLayer.16.multiHeadAttention.k.bias": "encoder.layer.16.attention.self.key.bias",
287
+ "encoderLayer.16.multiHeadAttention.v.weight": "encoder.layer.16.attention.self.value.weight",
288
+ "encoderLayer.16.multiHeadAttention.v.bias": "encoder.layer.16.attention.self.value.bias",
289
+ "encoderLayer.16.multiHeadAttention.o.weight": "encoder.layer.16.attention.output.dense.weight",
290
+ "encoderLayer.16.multiHeadAttention.o.bias": "encoder.layer.16.attention.output.dense.bias",
291
+ "encoderLayer.16.attnLayerNorm.weight": "encoder.layer.16.attention.output.LayerNorm.weight",
292
+ "encoderLayer.16.attnLayerNorm.bias": "encoder.layer.16.attention.output.LayerNorm.bias",
293
+ "encoderLayer.16.feedForward.intermediateDense.weight": "encoder.layer.16.intermediate.dense.weight",
294
+ "encoderLayer.16.feedForward.intermediateDense.bias": "encoder.layer.16.intermediate.dense.bias",
295
+ "encoderLayer.16.feedForward.outputDense.weight": "encoder.layer.16.output.dense.weight",
296
+ "encoderLayer.16.feedForward.outputDense.bias": "encoder.layer.16.output.dense.bias",
297
+ "encoderLayer.16.ffnLayerNorm.weight": "encoder.layer.16.output.LayerNorm.weight",
298
+ "encoderLayer.16.ffnLayerNorm.bias": "encoder.layer.16.output.LayerNorm.bias",
299
+ "encoderLayer.17.multiHeadAttention.q.weight": "encoder.layer.17.attention.self.query.weight",
300
+ "encoderLayer.17.multiHeadAttention.q.bias": "encoder.layer.17.attention.self.query.bias",
301
+ "encoderLayer.17.multiHeadAttention.k.weight": "encoder.layer.17.attention.self.key.weight",
302
+ "encoderLayer.17.multiHeadAttention.k.bias": "encoder.layer.17.attention.self.key.bias",
303
+ "encoderLayer.17.multiHeadAttention.v.weight": "encoder.layer.17.attention.self.value.weight",
304
+ "encoderLayer.17.multiHeadAttention.v.bias": "encoder.layer.17.attention.self.value.bias",
305
+ "encoderLayer.17.multiHeadAttention.o.weight": "encoder.layer.17.attention.output.dense.weight",
306
+ "encoderLayer.17.multiHeadAttention.o.bias": "encoder.layer.17.attention.output.dense.bias",
307
+ "encoderLayer.17.attnLayerNorm.weight": "encoder.layer.17.attention.output.LayerNorm.weight",
308
+ "encoderLayer.17.attnLayerNorm.bias": "encoder.layer.17.attention.output.LayerNorm.bias",
309
+ "encoderLayer.17.feedForward.intermediateDense.weight": "encoder.layer.17.intermediate.dense.weight",
310
+ "encoderLayer.17.feedForward.intermediateDense.bias": "encoder.layer.17.intermediate.dense.bias",
311
+ "encoderLayer.17.feedForward.outputDense.weight": "encoder.layer.17.output.dense.weight",
312
+ "encoderLayer.17.feedForward.outputDense.bias": "encoder.layer.17.output.dense.bias",
313
+ "encoderLayer.17.ffnLayerNorm.weight": "encoder.layer.17.output.LayerNorm.weight",
314
+ "encoderLayer.17.ffnLayerNorm.bias": "encoder.layer.17.output.LayerNorm.bias",
315
+ "encoderLayer.18.multiHeadAttention.q.weight": "encoder.layer.18.attention.self.query.weight",
316
+ "encoderLayer.18.multiHeadAttention.q.bias": "encoder.layer.18.attention.self.query.bias",
317
+ "encoderLayer.18.multiHeadAttention.k.weight": "encoder.layer.18.attention.self.key.weight",
318
+ "encoderLayer.18.multiHeadAttention.k.bias": "encoder.layer.18.attention.self.key.bias",
319
+ "encoderLayer.18.multiHeadAttention.v.weight": "encoder.layer.18.attention.self.value.weight",
320
+ "encoderLayer.18.multiHeadAttention.v.bias": "encoder.layer.18.attention.self.value.bias",
321
+ "encoderLayer.18.multiHeadAttention.o.weight": "encoder.layer.18.attention.output.dense.weight",
322
+ "encoderLayer.18.multiHeadAttention.o.bias": "encoder.layer.18.attention.output.dense.bias",
323
+ "encoderLayer.18.attnLayerNorm.weight": "encoder.layer.18.attention.output.LayerNorm.weight",
324
+ "encoderLayer.18.attnLayerNorm.bias": "encoder.layer.18.attention.output.LayerNorm.bias",
325
+ "encoderLayer.18.feedForward.intermediateDense.weight": "encoder.layer.18.intermediate.dense.weight",
326
+ "encoderLayer.18.feedForward.intermediateDense.bias": "encoder.layer.18.intermediate.dense.bias",
327
+ "encoderLayer.18.feedForward.outputDense.weight": "encoder.layer.18.output.dense.weight",
328
+ "encoderLayer.18.feedForward.outputDense.bias": "encoder.layer.18.output.dense.bias",
329
+ "encoderLayer.18.ffnLayerNorm.weight": "encoder.layer.18.output.LayerNorm.weight",
330
+ "encoderLayer.18.ffnLayerNorm.bias": "encoder.layer.18.output.LayerNorm.bias",
331
+ "encoderLayer.19.multiHeadAttention.q.weight": "encoder.layer.19.attention.self.query.weight",
332
+ "encoderLayer.19.multiHeadAttention.q.bias": "encoder.layer.19.attention.self.query.bias",
333
+ "encoderLayer.19.multiHeadAttention.k.weight": "encoder.layer.19.attention.self.key.weight",
334
+ "encoderLayer.19.multiHeadAttention.k.bias": "encoder.layer.19.attention.self.key.bias",
335
+ "encoderLayer.19.multiHeadAttention.v.weight": "encoder.layer.19.attention.self.value.weight",
336
+ "encoderLayer.19.multiHeadAttention.v.bias": "encoder.layer.19.attention.self.value.bias",
337
+ "encoderLayer.19.multiHeadAttention.o.weight": "encoder.layer.19.attention.output.dense.weight",
338
+ "encoderLayer.19.multiHeadAttention.o.bias": "encoder.layer.19.attention.output.dense.bias",
339
+ "encoderLayer.19.attnLayerNorm.weight": "encoder.layer.19.attention.output.LayerNorm.weight",
340
+ "encoderLayer.19.attnLayerNorm.bias": "encoder.layer.19.attention.output.LayerNorm.bias",
341
+ "encoderLayer.19.feedForward.intermediateDense.weight": "encoder.layer.19.intermediate.dense.weight",
342
+ "encoderLayer.19.feedForward.intermediateDense.bias": "encoder.layer.19.intermediate.dense.bias",
343
+ "encoderLayer.19.feedForward.outputDense.weight": "encoder.layer.19.output.dense.weight",
344
+ "encoderLayer.19.feedForward.outputDense.bias": "encoder.layer.19.output.dense.bias",
345
+ "encoderLayer.19.ffnLayerNorm.weight": "encoder.layer.19.output.LayerNorm.weight",
346
+ "encoderLayer.19.ffnLayerNorm.bias": "encoder.layer.19.output.LayerNorm.bias",
347
+ "encoderLayer.20.multiHeadAttention.q.weight": "encoder.layer.20.attention.self.query.weight",
348
+ "encoderLayer.20.multiHeadAttention.q.bias": "encoder.layer.20.attention.self.query.bias",
349
+ "encoderLayer.20.multiHeadAttention.k.weight": "encoder.layer.20.attention.self.key.weight",
350
+ "encoderLayer.20.multiHeadAttention.k.bias": "encoder.layer.20.attention.self.key.bias",
351
+ "encoderLayer.20.multiHeadAttention.v.weight": "encoder.layer.20.attention.self.value.weight",
352
+ "encoderLayer.20.multiHeadAttention.v.bias": "encoder.layer.20.attention.self.value.bias",
353
+ "encoderLayer.20.multiHeadAttention.o.weight": "encoder.layer.20.attention.output.dense.weight",
354
+ "encoderLayer.20.multiHeadAttention.o.bias": "encoder.layer.20.attention.output.dense.bias",
355
+ "encoderLayer.20.attnLayerNorm.weight": "encoder.layer.20.attention.output.LayerNorm.weight",
356
+ "encoderLayer.20.attnLayerNorm.bias": "encoder.layer.20.attention.output.LayerNorm.bias",
357
+ "encoderLayer.20.feedForward.intermediateDense.weight": "encoder.layer.20.intermediate.dense.weight",
358
+ "encoderLayer.20.feedForward.intermediateDense.bias": "encoder.layer.20.intermediate.dense.bias",
359
+ "encoderLayer.20.feedForward.outputDense.weight": "encoder.layer.20.output.dense.weight",
360
+ "encoderLayer.20.feedForward.outputDense.bias": "encoder.layer.20.output.dense.bias",
361
+ "encoderLayer.20.ffnLayerNorm.weight": "encoder.layer.20.output.LayerNorm.weight",
362
+ "encoderLayer.20.ffnLayerNorm.bias": "encoder.layer.20.output.LayerNorm.bias",
363
+ "encoderLayer.21.multiHeadAttention.q.weight": "encoder.layer.21.attention.self.query.weight",
364
+ "encoderLayer.21.multiHeadAttention.q.bias": "encoder.layer.21.attention.self.query.bias",
365
+ "encoderLayer.21.multiHeadAttention.k.weight": "encoder.layer.21.attention.self.key.weight",
366
+ "encoderLayer.21.multiHeadAttention.k.bias": "encoder.layer.21.attention.self.key.bias",
367
+ "encoderLayer.21.multiHeadAttention.v.weight": "encoder.layer.21.attention.self.value.weight",
368
+ "encoderLayer.21.multiHeadAttention.v.bias": "encoder.layer.21.attention.self.value.bias",
369
+ "encoderLayer.21.multiHeadAttention.o.weight": "encoder.layer.21.attention.output.dense.weight",
370
+ "encoderLayer.21.multiHeadAttention.o.bias": "encoder.layer.21.attention.output.dense.bias",
371
+ "encoderLayer.21.attnLayerNorm.weight": "encoder.layer.21.attention.output.LayerNorm.weight",
372
+ "encoderLayer.21.attnLayerNorm.bias": "encoder.layer.21.attention.output.LayerNorm.bias",
373
+ "encoderLayer.21.feedForward.intermediateDense.weight": "encoder.layer.21.intermediate.dense.weight",
374
+ "encoderLayer.21.feedForward.intermediateDense.bias": "encoder.layer.21.intermediate.dense.bias",
375
+ "encoderLayer.21.feedForward.outputDense.weight": "encoder.layer.21.output.dense.weight",
376
+ "encoderLayer.21.feedForward.outputDense.bias": "encoder.layer.21.output.dense.bias",
377
+ "encoderLayer.21.ffnLayerNorm.weight": "encoder.layer.21.output.LayerNorm.weight",
378
+ "encoderLayer.21.ffnLayerNorm.bias": "encoder.layer.21.output.LayerNorm.bias",
379
+ "encoderLayer.22.multiHeadAttention.q.weight": "encoder.layer.22.attention.self.query.weight",
380
+ "encoderLayer.22.multiHeadAttention.q.bias": "encoder.layer.22.attention.self.query.bias",
381
+ "encoderLayer.22.multiHeadAttention.k.weight": "encoder.layer.22.attention.self.key.weight",
382
+ "encoderLayer.22.multiHeadAttention.k.bias": "encoder.layer.22.attention.self.key.bias",
383
+ "encoderLayer.22.multiHeadAttention.v.weight": "encoder.layer.22.attention.self.value.weight",
384
+ "encoderLayer.22.multiHeadAttention.v.bias": "encoder.layer.22.attention.self.value.bias",
385
+ "encoderLayer.22.multiHeadAttention.o.weight": "encoder.layer.22.attention.output.dense.weight",
386
+ "encoderLayer.22.multiHeadAttention.o.bias": "encoder.layer.22.attention.output.dense.bias",
387
+ "encoderLayer.22.attnLayerNorm.weight": "encoder.layer.22.attention.output.LayerNorm.weight",
388
+ "encoderLayer.22.attnLayerNorm.bias": "encoder.layer.22.attention.output.LayerNorm.bias",
389
+ "encoderLayer.22.feedForward.intermediateDense.weight": "encoder.layer.22.intermediate.dense.weight",
390
+ "encoderLayer.22.feedForward.intermediateDense.bias": "encoder.layer.22.intermediate.dense.bias",
391
+ "encoderLayer.22.feedForward.outputDense.weight": "encoder.layer.22.output.dense.weight",
392
+ "encoderLayer.22.feedForward.outputDense.bias": "encoder.layer.22.output.dense.bias",
393
+ "encoderLayer.22.ffnLayerNorm.weight": "encoder.layer.22.output.LayerNorm.weight",
394
+ "encoderLayer.22.ffnLayerNorm.bias": "encoder.layer.22.output.LayerNorm.bias",
395
+ "encoderLayer.23.multiHeadAttention.q.weight": "encoder.layer.23.attention.self.query.weight",
396
+ "encoderLayer.23.multiHeadAttention.q.bias": "encoder.layer.23.attention.self.query.bias",
397
+ "encoderLayer.23.multiHeadAttention.k.weight": "encoder.layer.23.attention.self.key.weight",
398
+ "encoderLayer.23.multiHeadAttention.k.bias": "encoder.layer.23.attention.self.key.bias",
399
+ "encoderLayer.23.multiHeadAttention.v.weight": "encoder.layer.23.attention.self.value.weight",
400
+ "encoderLayer.23.multiHeadAttention.v.bias": "encoder.layer.23.attention.self.value.bias",
401
+ "encoderLayer.23.multiHeadAttention.o.weight": "encoder.layer.23.attention.output.dense.weight",
402
+ "encoderLayer.23.multiHeadAttention.o.bias": "encoder.layer.23.attention.output.dense.bias",
403
+ "encoderLayer.23.attnLayerNorm.weight": "encoder.layer.23.attention.output.LayerNorm.weight",
404
+ "encoderLayer.23.attnLayerNorm.bias": "encoder.layer.23.attention.output.LayerNorm.bias",
405
+ "encoderLayer.23.feedForward.intermediateDense.weight": "encoder.layer.23.intermediate.dense.weight",
406
+ "encoderLayer.23.feedForward.intermediateDense.bias": "encoder.layer.23.intermediate.dense.bias",
407
+ "encoderLayer.23.feedForward.outputDense.weight": "encoder.layer.23.output.dense.weight",
408
+ "encoderLayer.23.feedForward.outputDense.bias": "encoder.layer.23.output.dense.bias",
409
+ "encoderLayer.23.ffnLayerNorm.weight": "encoder.layer.23.output.LayerNorm.weight",
410
+ "encoderLayer.23.ffnLayerNorm.bias": "encoder.layer.23.output.LayerNorm.bias"
411
+ }
412
+ }
bge-large-zh-v1.5/bert4torch_config.json ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_probs_dropout_prob": 0.1,
3
+ "hidden_act": "gelu",
4
+ "hidden_dropout_prob": 0.1,
5
+ "hidden_size": 1024,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 4096,
8
+ "layer_norm_eps": 1e-12,
9
+ "max_position_embeddings": 512,
10
+ "model": "bert",
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 24,
13
+ "bos_token_id": 0,
14
+ "eos_token_id": 2,
15
+ "pad_token_id": 0,
16
+ "torch_dtype": "float32",
17
+ "type_vocab_size": 2,
18
+ "vocab_size": 21128,
19
+ "with_pool": true,
20
+ "pool_strategy": "cls",
21
+ "norm_mode": "torch_buildin",
22
+ "mapping": {
23
+ "embeddings.word_embeddings.weight": "embeddings.word_embeddings.weight",
24
+ "embeddings.position_embeddings.weight": "embeddings.position_embeddings.weight",
25
+ "embeddings.segment_embeddings.weight": "embeddings.token_type_embeddings.weight",
26
+ "embeddings.layerNorm.weight": "embeddings.LayerNorm.weight",
27
+ "embeddings.layerNorm.bias": "embeddings.LayerNorm.bias",
28
+ "pooler.weight": "pooler.dense.weight",
29
+ "pooler.bias": "pooler.dense.bias",
30
+ "encoderLayer.0.multiHeadAttention.q.weight": "encoder.layer.0.attention.self.query.weight",
31
+ "encoderLayer.0.multiHeadAttention.q.bias": "encoder.layer.0.attention.self.query.bias",
32
+ "encoderLayer.0.multiHeadAttention.k.weight": "encoder.layer.0.attention.self.key.weight",
33
+ "encoderLayer.0.multiHeadAttention.k.bias": "encoder.layer.0.attention.self.key.bias",
34
+ "encoderLayer.0.multiHeadAttention.v.weight": "encoder.layer.0.attention.self.value.weight",
35
+ "encoderLayer.0.multiHeadAttention.v.bias": "encoder.layer.0.attention.self.value.bias",
36
+ "encoderLayer.0.multiHeadAttention.o.weight": "encoder.layer.0.attention.output.dense.weight",
37
+ "encoderLayer.0.multiHeadAttention.o.bias": "encoder.layer.0.attention.output.dense.bias",
38
+ "encoderLayer.0.attnLayerNorm.weight": "encoder.layer.0.attention.output.LayerNorm.weight",
39
+ "encoderLayer.0.attnLayerNorm.bias": "encoder.layer.0.attention.output.LayerNorm.bias",
40
+ "encoderLayer.0.feedForward.intermediateDense.weight": "encoder.layer.0.intermediate.dense.weight",
41
+ "encoderLayer.0.feedForward.intermediateDense.bias": "encoder.layer.0.intermediate.dense.bias",
42
+ "encoderLayer.0.feedForward.outputDense.weight": "encoder.layer.0.output.dense.weight",
43
+ "encoderLayer.0.feedForward.outputDense.bias": "encoder.layer.0.output.dense.bias",
44
+ "encoderLayer.0.ffnLayerNorm.weight": "encoder.layer.0.output.LayerNorm.weight",
45
+ "encoderLayer.0.ffnLayerNorm.bias": "encoder.layer.0.output.LayerNorm.bias",
46
+ "encoderLayer.1.multiHeadAttention.q.weight": "encoder.layer.1.attention.self.query.weight",
47
+ "encoderLayer.1.multiHeadAttention.q.bias": "encoder.layer.1.attention.self.query.bias",
48
+ "encoderLayer.1.multiHeadAttention.k.weight": "encoder.layer.1.attention.self.key.weight",
49
+ "encoderLayer.1.multiHeadAttention.k.bias": "encoder.layer.1.attention.self.key.bias",
50
+ "encoderLayer.1.multiHeadAttention.v.weight": "encoder.layer.1.attention.self.value.weight",
51
+ "encoderLayer.1.multiHeadAttention.v.bias": "encoder.layer.1.attention.self.value.bias",
52
+ "encoderLayer.1.multiHeadAttention.o.weight": "encoder.layer.1.attention.output.dense.weight",
53
+ "encoderLayer.1.multiHeadAttention.o.bias": "encoder.layer.1.attention.output.dense.bias",
54
+ "encoderLayer.1.attnLayerNorm.weight": "encoder.layer.1.attention.output.LayerNorm.weight",
55
+ "encoderLayer.1.attnLayerNorm.bias": "encoder.layer.1.attention.output.LayerNorm.bias",
56
+ "encoderLayer.1.feedForward.intermediateDense.weight": "encoder.layer.1.intermediate.dense.weight",
57
+ "encoderLayer.1.feedForward.intermediateDense.bias": "encoder.layer.1.intermediate.dense.bias",
58
+ "encoderLayer.1.feedForward.outputDense.weight": "encoder.layer.1.output.dense.weight",
59
+ "encoderLayer.1.feedForward.outputDense.bias": "encoder.layer.1.output.dense.bias",
60
+ "encoderLayer.1.ffnLayerNorm.weight": "encoder.layer.1.output.LayerNorm.weight",
61
+ "encoderLayer.1.ffnLayerNorm.bias": "encoder.layer.1.output.LayerNorm.bias",
62
+ "encoderLayer.2.multiHeadAttention.q.weight": "encoder.layer.2.attention.self.query.weight",
63
+ "encoderLayer.2.multiHeadAttention.q.bias": "encoder.layer.2.attention.self.query.bias",
64
+ "encoderLayer.2.multiHeadAttention.k.weight": "encoder.layer.2.attention.self.key.weight",
65
+ "encoderLayer.2.multiHeadAttention.k.bias": "encoder.layer.2.attention.self.key.bias",
66
+ "encoderLayer.2.multiHeadAttention.v.weight": "encoder.layer.2.attention.self.value.weight",
67
+ "encoderLayer.2.multiHeadAttention.v.bias": "encoder.layer.2.attention.self.value.bias",
68
+ "encoderLayer.2.multiHeadAttention.o.weight": "encoder.layer.2.attention.output.dense.weight",
69
+ "encoderLayer.2.multiHeadAttention.o.bias": "encoder.layer.2.attention.output.dense.bias",
70
+ "encoderLayer.2.attnLayerNorm.weight": "encoder.layer.2.attention.output.LayerNorm.weight",
71
+ "encoderLayer.2.attnLayerNorm.bias": "encoder.layer.2.attention.output.LayerNorm.bias",
72
+ "encoderLayer.2.feedForward.intermediateDense.weight": "encoder.layer.2.intermediate.dense.weight",
73
+ "encoderLayer.2.feedForward.intermediateDense.bias": "encoder.layer.2.intermediate.dense.bias",
74
+ "encoderLayer.2.feedForward.outputDense.weight": "encoder.layer.2.output.dense.weight",
75
+ "encoderLayer.2.feedForward.outputDense.bias": "encoder.layer.2.output.dense.bias",
76
+ "encoderLayer.2.ffnLayerNorm.weight": "encoder.layer.2.output.LayerNorm.weight",
77
+ "encoderLayer.2.ffnLayerNorm.bias": "encoder.layer.2.output.LayerNorm.bias",
78
+ "encoderLayer.3.multiHeadAttention.q.weight": "encoder.layer.3.attention.self.query.weight",
79
+ "encoderLayer.3.multiHeadAttention.q.bias": "encoder.layer.3.attention.self.query.bias",
80
+ "encoderLayer.3.multiHeadAttention.k.weight": "encoder.layer.3.attention.self.key.weight",
81
+ "encoderLayer.3.multiHeadAttention.k.bias": "encoder.layer.3.attention.self.key.bias",
82
+ "encoderLayer.3.multiHeadAttention.v.weight": "encoder.layer.3.attention.self.value.weight",
83
+ "encoderLayer.3.multiHeadAttention.v.bias": "encoder.layer.3.attention.self.value.bias",
84
+ "encoderLayer.3.multiHeadAttention.o.weight": "encoder.layer.3.attention.output.dense.weight",
85
+ "encoderLayer.3.multiHeadAttention.o.bias": "encoder.layer.3.attention.output.dense.bias",
86
+ "encoderLayer.3.attnLayerNorm.weight": "encoder.layer.3.attention.output.LayerNorm.weight",
87
+ "encoderLayer.3.attnLayerNorm.bias": "encoder.layer.3.attention.output.LayerNorm.bias",
88
+ "encoderLayer.3.feedForward.intermediateDense.weight": "encoder.layer.3.intermediate.dense.weight",
89
+ "encoderLayer.3.feedForward.intermediateDense.bias": "encoder.layer.3.intermediate.dense.bias",
90
+ "encoderLayer.3.feedForward.outputDense.weight": "encoder.layer.3.output.dense.weight",
91
+ "encoderLayer.3.feedForward.outputDense.bias": "encoder.layer.3.output.dense.bias",
92
+ "encoderLayer.3.ffnLayerNorm.weight": "encoder.layer.3.output.LayerNorm.weight",
93
+ "encoderLayer.3.ffnLayerNorm.bias": "encoder.layer.3.output.LayerNorm.bias",
94
+ "encoderLayer.4.multiHeadAttention.q.weight": "encoder.layer.4.attention.self.query.weight",
95
+ "encoderLayer.4.multiHeadAttention.q.bias": "encoder.layer.4.attention.self.query.bias",
96
+ "encoderLayer.4.multiHeadAttention.k.weight": "encoder.layer.4.attention.self.key.weight",
97
+ "encoderLayer.4.multiHeadAttention.k.bias": "encoder.layer.4.attention.self.key.bias",
98
+ "encoderLayer.4.multiHeadAttention.v.weight": "encoder.layer.4.attention.self.value.weight",
99
+ "encoderLayer.4.multiHeadAttention.v.bias": "encoder.layer.4.attention.self.value.bias",
100
+ "encoderLayer.4.multiHeadAttention.o.weight": "encoder.layer.4.attention.output.dense.weight",
101
+ "encoderLayer.4.multiHeadAttention.o.bias": "encoder.layer.4.attention.output.dense.bias",
102
+ "encoderLayer.4.attnLayerNorm.weight": "encoder.layer.4.attention.output.LayerNorm.weight",
103
+ "encoderLayer.4.attnLayerNorm.bias": "encoder.layer.4.attention.output.LayerNorm.bias",
104
+ "encoderLayer.4.feedForward.intermediateDense.weight": "encoder.layer.4.intermediate.dense.weight",
105
+ "encoderLayer.4.feedForward.intermediateDense.bias": "encoder.layer.4.intermediate.dense.bias",
106
+ "encoderLayer.4.feedForward.outputDense.weight": "encoder.layer.4.output.dense.weight",
107
+ "encoderLayer.4.feedForward.outputDense.bias": "encoder.layer.4.output.dense.bias",
108
+ "encoderLayer.4.ffnLayerNorm.weight": "encoder.layer.4.output.LayerNorm.weight",
109
+ "encoderLayer.4.ffnLayerNorm.bias": "encoder.layer.4.output.LayerNorm.bias",
110
+ "encoderLayer.5.multiHeadAttention.q.weight": "encoder.layer.5.attention.self.query.weight",
111
+ "encoderLayer.5.multiHeadAttention.q.bias": "encoder.layer.5.attention.self.query.bias",
112
+ "encoderLayer.5.multiHeadAttention.k.weight": "encoder.layer.5.attention.self.key.weight",
113
+ "encoderLayer.5.multiHeadAttention.k.bias": "encoder.layer.5.attention.self.key.bias",
114
+ "encoderLayer.5.multiHeadAttention.v.weight": "encoder.layer.5.attention.self.value.weight",
115
+ "encoderLayer.5.multiHeadAttention.v.bias": "encoder.layer.5.attention.self.value.bias",
116
+ "encoderLayer.5.multiHeadAttention.o.weight": "encoder.layer.5.attention.output.dense.weight",
117
+ "encoderLayer.5.multiHeadAttention.o.bias": "encoder.layer.5.attention.output.dense.bias",
118
+ "encoderLayer.5.attnLayerNorm.weight": "encoder.layer.5.attention.output.LayerNorm.weight",
119
+ "encoderLayer.5.attnLayerNorm.bias": "encoder.layer.5.attention.output.LayerNorm.bias",
120
+ "encoderLayer.5.feedForward.intermediateDense.weight": "encoder.layer.5.intermediate.dense.weight",
121
+ "encoderLayer.5.feedForward.intermediateDense.bias": "encoder.layer.5.intermediate.dense.bias",
122
+ "encoderLayer.5.feedForward.outputDense.weight": "encoder.layer.5.output.dense.weight",
123
+ "encoderLayer.5.feedForward.outputDense.bias": "encoder.layer.5.output.dense.bias",
124
+ "encoderLayer.5.ffnLayerNorm.weight": "encoder.layer.5.output.LayerNorm.weight",
125
+ "encoderLayer.5.ffnLayerNorm.bias": "encoder.layer.5.output.LayerNorm.bias",
126
+ "encoderLayer.6.multiHeadAttention.q.weight": "encoder.layer.6.attention.self.query.weight",
127
+ "encoderLayer.6.multiHeadAttention.q.bias": "encoder.layer.6.attention.self.query.bias",
128
+ "encoderLayer.6.multiHeadAttention.k.weight": "encoder.layer.6.attention.self.key.weight",
129
+ "encoderLayer.6.multiHeadAttention.k.bias": "encoder.layer.6.attention.self.key.bias",
130
+ "encoderLayer.6.multiHeadAttention.v.weight": "encoder.layer.6.attention.self.value.weight",
131
+ "encoderLayer.6.multiHeadAttention.v.bias": "encoder.layer.6.attention.self.value.bias",
132
+ "encoderLayer.6.multiHeadAttention.o.weight": "encoder.layer.6.attention.output.dense.weight",
133
+ "encoderLayer.6.multiHeadAttention.o.bias": "encoder.layer.6.attention.output.dense.bias",
134
+ "encoderLayer.6.attnLayerNorm.weight": "encoder.layer.6.attention.output.LayerNorm.weight",
135
+ "encoderLayer.6.attnLayerNorm.bias": "encoder.layer.6.attention.output.LayerNorm.bias",
136
+ "encoderLayer.6.feedForward.intermediateDense.weight": "encoder.layer.6.intermediate.dense.weight",
137
+ "encoderLayer.6.feedForward.intermediateDense.bias": "encoder.layer.6.intermediate.dense.bias",
138
+ "encoderLayer.6.feedForward.outputDense.weight": "encoder.layer.6.output.dense.weight",
139
+ "encoderLayer.6.feedForward.outputDense.bias": "encoder.layer.6.output.dense.bias",
140
+ "encoderLayer.6.ffnLayerNorm.weight": "encoder.layer.6.output.LayerNorm.weight",
141
+ "encoderLayer.6.ffnLayerNorm.bias": "encoder.layer.6.output.LayerNorm.bias",
142
+ "encoderLayer.7.multiHeadAttention.q.weight": "encoder.layer.7.attention.self.query.weight",
143
+ "encoderLayer.7.multiHeadAttention.q.bias": "encoder.layer.7.attention.self.query.bias",
144
+ "encoderLayer.7.multiHeadAttention.k.weight": "encoder.layer.7.attention.self.key.weight",
145
+ "encoderLayer.7.multiHeadAttention.k.bias": "encoder.layer.7.attention.self.key.bias",
146
+ "encoderLayer.7.multiHeadAttention.v.weight": "encoder.layer.7.attention.self.value.weight",
147
+ "encoderLayer.7.multiHeadAttention.v.bias": "encoder.layer.7.attention.self.value.bias",
148
+ "encoderLayer.7.multiHeadAttention.o.weight": "encoder.layer.7.attention.output.dense.weight",
149
+ "encoderLayer.7.multiHeadAttention.o.bias": "encoder.layer.7.attention.output.dense.bias",
150
+ "encoderLayer.7.attnLayerNorm.weight": "encoder.layer.7.attention.output.LayerNorm.weight",
151
+ "encoderLayer.7.attnLayerNorm.bias": "encoder.layer.7.attention.output.LayerNorm.bias",
152
+ "encoderLayer.7.feedForward.intermediateDense.weight": "encoder.layer.7.intermediate.dense.weight",
153
+ "encoderLayer.7.feedForward.intermediateDense.bias": "encoder.layer.7.intermediate.dense.bias",
154
+ "encoderLayer.7.feedForward.outputDense.weight": "encoder.layer.7.output.dense.weight",
155
+ "encoderLayer.7.feedForward.outputDense.bias": "encoder.layer.7.output.dense.bias",
156
+ "encoderLayer.7.ffnLayerNorm.weight": "encoder.layer.7.output.LayerNorm.weight",
157
+ "encoderLayer.7.ffnLayerNorm.bias": "encoder.layer.7.output.LayerNorm.bias",
158
+ "encoderLayer.8.multiHeadAttention.q.weight": "encoder.layer.8.attention.self.query.weight",
159
+ "encoderLayer.8.multiHeadAttention.q.bias": "encoder.layer.8.attention.self.query.bias",
160
+ "encoderLayer.8.multiHeadAttention.k.weight": "encoder.layer.8.attention.self.key.weight",
161
+ "encoderLayer.8.multiHeadAttention.k.bias": "encoder.layer.8.attention.self.key.bias",
162
+ "encoderLayer.8.multiHeadAttention.v.weight": "encoder.layer.8.attention.self.value.weight",
163
+ "encoderLayer.8.multiHeadAttention.v.bias": "encoder.layer.8.attention.self.value.bias",
164
+ "encoderLayer.8.multiHeadAttention.o.weight": "encoder.layer.8.attention.output.dense.weight",
165
+ "encoderLayer.8.multiHeadAttention.o.bias": "encoder.layer.8.attention.output.dense.bias",
166
+ "encoderLayer.8.attnLayerNorm.weight": "encoder.layer.8.attention.output.LayerNorm.weight",
167
+ "encoderLayer.8.attnLayerNorm.bias": "encoder.layer.8.attention.output.LayerNorm.bias",
168
+ "encoderLayer.8.feedForward.intermediateDense.weight": "encoder.layer.8.intermediate.dense.weight",
169
+ "encoderLayer.8.feedForward.intermediateDense.bias": "encoder.layer.8.intermediate.dense.bias",
170
+ "encoderLayer.8.feedForward.outputDense.weight": "encoder.layer.8.output.dense.weight",
171
+ "encoderLayer.8.feedForward.outputDense.bias": "encoder.layer.8.output.dense.bias",
172
+ "encoderLayer.8.ffnLayerNorm.weight": "encoder.layer.8.output.LayerNorm.weight",
173
+ "encoderLayer.8.ffnLayerNorm.bias": "encoder.layer.8.output.LayerNorm.bias",
174
+ "encoderLayer.9.multiHeadAttention.q.weight": "encoder.layer.9.attention.self.query.weight",
175
+ "encoderLayer.9.multiHeadAttention.q.bias": "encoder.layer.9.attention.self.query.bias",
176
+ "encoderLayer.9.multiHeadAttention.k.weight": "encoder.layer.9.attention.self.key.weight",
177
+ "encoderLayer.9.multiHeadAttention.k.bias": "encoder.layer.9.attention.self.key.bias",
178
+ "encoderLayer.9.multiHeadAttention.v.weight": "encoder.layer.9.attention.self.value.weight",
179
+ "encoderLayer.9.multiHeadAttention.v.bias": "encoder.layer.9.attention.self.value.bias",
180
+ "encoderLayer.9.multiHeadAttention.o.weight": "encoder.layer.9.attention.output.dense.weight",
181
+ "encoderLayer.9.multiHeadAttention.o.bias": "encoder.layer.9.attention.output.dense.bias",
182
+ "encoderLayer.9.attnLayerNorm.weight": "encoder.layer.9.attention.output.LayerNorm.weight",
183
+ "encoderLayer.9.attnLayerNorm.bias": "encoder.layer.9.attention.output.LayerNorm.bias",
184
+ "encoderLayer.9.feedForward.intermediateDense.weight": "encoder.layer.9.intermediate.dense.weight",
185
+ "encoderLayer.9.feedForward.intermediateDense.bias": "encoder.layer.9.intermediate.dense.bias",
186
+ "encoderLayer.9.feedForward.outputDense.weight": "encoder.layer.9.output.dense.weight",
187
+ "encoderLayer.9.feedForward.outputDense.bias": "encoder.layer.9.output.dense.bias",
188
+ "encoderLayer.9.ffnLayerNorm.weight": "encoder.layer.9.output.LayerNorm.weight",
189
+ "encoderLayer.9.ffnLayerNorm.bias": "encoder.layer.9.output.LayerNorm.bias",
190
+ "encoderLayer.10.multiHeadAttention.q.weight": "encoder.layer.10.attention.self.query.weight",
191
+ "encoderLayer.10.multiHeadAttention.q.bias": "encoder.layer.10.attention.self.query.bias",
192
+ "encoderLayer.10.multiHeadAttention.k.weight": "encoder.layer.10.attention.self.key.weight",
193
+ "encoderLayer.10.multiHeadAttention.k.bias": "encoder.layer.10.attention.self.key.bias",
194
+ "encoderLayer.10.multiHeadAttention.v.weight": "encoder.layer.10.attention.self.value.weight",
195
+ "encoderLayer.10.multiHeadAttention.v.bias": "encoder.layer.10.attention.self.value.bias",
196
+ "encoderLayer.10.multiHeadAttention.o.weight": "encoder.layer.10.attention.output.dense.weight",
197
+ "encoderLayer.10.multiHeadAttention.o.bias": "encoder.layer.10.attention.output.dense.bias",
198
+ "encoderLayer.10.attnLayerNorm.weight": "encoder.layer.10.attention.output.LayerNorm.weight",
199
+ "encoderLayer.10.attnLayerNorm.bias": "encoder.layer.10.attention.output.LayerNorm.bias",
200
+ "encoderLayer.10.feedForward.intermediateDense.weight": "encoder.layer.10.intermediate.dense.weight",
201
+ "encoderLayer.10.feedForward.intermediateDense.bias": "encoder.layer.10.intermediate.dense.bias",
202
+ "encoderLayer.10.feedForward.outputDense.weight": "encoder.layer.10.output.dense.weight",
203
+ "encoderLayer.10.feedForward.outputDense.bias": "encoder.layer.10.output.dense.bias",
204
+ "encoderLayer.10.ffnLayerNorm.weight": "encoder.layer.10.output.LayerNorm.weight",
205
+ "encoderLayer.10.ffnLayerNorm.bias": "encoder.layer.10.output.LayerNorm.bias",
206
+ "encoderLayer.11.multiHeadAttention.q.weight": "encoder.layer.11.attention.self.query.weight",
207
+ "encoderLayer.11.multiHeadAttention.q.bias": "encoder.layer.11.attention.self.query.bias",
208
+ "encoderLayer.11.multiHeadAttention.k.weight": "encoder.layer.11.attention.self.key.weight",
209
+ "encoderLayer.11.multiHeadAttention.k.bias": "encoder.layer.11.attention.self.key.bias",
210
+ "encoderLayer.11.multiHeadAttention.v.weight": "encoder.layer.11.attention.self.value.weight",
211
+ "encoderLayer.11.multiHeadAttention.v.bias": "encoder.layer.11.attention.self.value.bias",
212
+ "encoderLayer.11.multiHeadAttention.o.weight": "encoder.layer.11.attention.output.dense.weight",
213
+ "encoderLayer.11.multiHeadAttention.o.bias": "encoder.layer.11.attention.output.dense.bias",
214
+ "encoderLayer.11.attnLayerNorm.weight": "encoder.layer.11.attention.output.LayerNorm.weight",
215
+ "encoderLayer.11.attnLayerNorm.bias": "encoder.layer.11.attention.output.LayerNorm.bias",
216
+ "encoderLayer.11.feedForward.intermediateDense.weight": "encoder.layer.11.intermediate.dense.weight",
217
+ "encoderLayer.11.feedForward.intermediateDense.bias": "encoder.layer.11.intermediate.dense.bias",
218
+ "encoderLayer.11.feedForward.outputDense.weight": "encoder.layer.11.output.dense.weight",
219
+ "encoderLayer.11.feedForward.outputDense.bias": "encoder.layer.11.output.dense.bias",
220
+ "encoderLayer.11.ffnLayerNorm.weight": "encoder.layer.11.output.LayerNorm.weight",
221
+ "encoderLayer.11.ffnLayerNorm.bias": "encoder.layer.11.output.LayerNorm.bias",
222
+ "encoderLayer.12.multiHeadAttention.q.weight": "encoder.layer.12.attention.self.query.weight",
223
+ "encoderLayer.12.multiHeadAttention.q.bias": "encoder.layer.12.attention.self.query.bias",
224
+ "encoderLayer.12.multiHeadAttention.k.weight": "encoder.layer.12.attention.self.key.weight",
225
+ "encoderLayer.12.multiHeadAttention.k.bias": "encoder.layer.12.attention.self.key.bias",
226
+ "encoderLayer.12.multiHeadAttention.v.weight": "encoder.layer.12.attention.self.value.weight",
227
+ "encoderLayer.12.multiHeadAttention.v.bias": "encoder.layer.12.attention.self.value.bias",
228
+ "encoderLayer.12.multiHeadAttention.o.weight": "encoder.layer.12.attention.output.dense.weight",
229
+ "encoderLayer.12.multiHeadAttention.o.bias": "encoder.layer.12.attention.output.dense.bias",
230
+ "encoderLayer.12.attnLayerNorm.weight": "encoder.layer.12.attention.output.LayerNorm.weight",
231
+ "encoderLayer.12.attnLayerNorm.bias": "encoder.layer.12.attention.output.LayerNorm.bias",
232
+ "encoderLayer.12.feedForward.intermediateDense.weight": "encoder.layer.12.intermediate.dense.weight",
233
+ "encoderLayer.12.feedForward.intermediateDense.bias": "encoder.layer.12.intermediate.dense.bias",
234
+ "encoderLayer.12.feedForward.outputDense.weight": "encoder.layer.12.output.dense.weight",
235
+ "encoderLayer.12.feedForward.outputDense.bias": "encoder.layer.12.output.dense.bias",
236
+ "encoderLayer.12.ffnLayerNorm.weight": "encoder.layer.12.output.LayerNorm.weight",
237
+ "encoderLayer.12.ffnLayerNorm.bias": "encoder.layer.12.output.LayerNorm.bias",
238
+ "encoderLayer.13.multiHeadAttention.q.weight": "encoder.layer.13.attention.self.query.weight",
239
+ "encoderLayer.13.multiHeadAttention.q.bias": "encoder.layer.13.attention.self.query.bias",
240
+ "encoderLayer.13.multiHeadAttention.k.weight": "encoder.layer.13.attention.self.key.weight",
241
+ "encoderLayer.13.multiHeadAttention.k.bias": "encoder.layer.13.attention.self.key.bias",
242
+ "encoderLayer.13.multiHeadAttention.v.weight": "encoder.layer.13.attention.self.value.weight",
243
+ "encoderLayer.13.multiHeadAttention.v.bias": "encoder.layer.13.attention.self.value.bias",
244
+ "encoderLayer.13.multiHeadAttention.o.weight": "encoder.layer.13.attention.output.dense.weight",
245
+ "encoderLayer.13.multiHeadAttention.o.bias": "encoder.layer.13.attention.output.dense.bias",
246
+ "encoderLayer.13.attnLayerNorm.weight": "encoder.layer.13.attention.output.LayerNorm.weight",
247
+ "encoderLayer.13.attnLayerNorm.bias": "encoder.layer.13.attention.output.LayerNorm.bias",
248
+ "encoderLayer.13.feedForward.intermediateDense.weight": "encoder.layer.13.intermediate.dense.weight",
249
+ "encoderLayer.13.feedForward.intermediateDense.bias": "encoder.layer.13.intermediate.dense.bias",
250
+ "encoderLayer.13.feedForward.outputDense.weight": "encoder.layer.13.output.dense.weight",
251
+ "encoderLayer.13.feedForward.outputDense.bias": "encoder.layer.13.output.dense.bias",
252
+ "encoderLayer.13.ffnLayerNorm.weight": "encoder.layer.13.output.LayerNorm.weight",
253
+ "encoderLayer.13.ffnLayerNorm.bias": "encoder.layer.13.output.LayerNorm.bias",
254
+ "encoderLayer.14.multiHeadAttention.q.weight": "encoder.layer.14.attention.self.query.weight",
255
+ "encoderLayer.14.multiHeadAttention.q.bias": "encoder.layer.14.attention.self.query.bias",
256
+ "encoderLayer.14.multiHeadAttention.k.weight": "encoder.layer.14.attention.self.key.weight",
257
+ "encoderLayer.14.multiHeadAttention.k.bias": "encoder.layer.14.attention.self.key.bias",
258
+ "encoderLayer.14.multiHeadAttention.v.weight": "encoder.layer.14.attention.self.value.weight",
259
+ "encoderLayer.14.multiHeadAttention.v.bias": "encoder.layer.14.attention.self.value.bias",
260
+ "encoderLayer.14.multiHeadAttention.o.weight": "encoder.layer.14.attention.output.dense.weight",
261
+ "encoderLayer.14.multiHeadAttention.o.bias": "encoder.layer.14.attention.output.dense.bias",
262
+ "encoderLayer.14.attnLayerNorm.weight": "encoder.layer.14.attention.output.LayerNorm.weight",
263
+ "encoderLayer.14.attnLayerNorm.bias": "encoder.layer.14.attention.output.LayerNorm.bias",
264
+ "encoderLayer.14.feedForward.intermediateDense.weight": "encoder.layer.14.intermediate.dense.weight",
265
+ "encoderLayer.14.feedForward.intermediateDense.bias": "encoder.layer.14.intermediate.dense.bias",
266
+ "encoderLayer.14.feedForward.outputDense.weight": "encoder.layer.14.output.dense.weight",
267
+ "encoderLayer.14.feedForward.outputDense.bias": "encoder.layer.14.output.dense.bias",
268
+ "encoderLayer.14.ffnLayerNorm.weight": "encoder.layer.14.output.LayerNorm.weight",
269
+ "encoderLayer.14.ffnLayerNorm.bias": "encoder.layer.14.output.LayerNorm.bias",
270
+ "encoderLayer.15.multiHeadAttention.q.weight": "encoder.layer.15.attention.self.query.weight",
271
+ "encoderLayer.15.multiHeadAttention.q.bias": "encoder.layer.15.attention.self.query.bias",
272
+ "encoderLayer.15.multiHeadAttention.k.weight": "encoder.layer.15.attention.self.key.weight",
273
+ "encoderLayer.15.multiHeadAttention.k.bias": "encoder.layer.15.attention.self.key.bias",
274
+ "encoderLayer.15.multiHeadAttention.v.weight": "encoder.layer.15.attention.self.value.weight",
275
+ "encoderLayer.15.multiHeadAttention.v.bias": "encoder.layer.15.attention.self.value.bias",
276
+ "encoderLayer.15.multiHeadAttention.o.weight": "encoder.layer.15.attention.output.dense.weight",
277
+ "encoderLayer.15.multiHeadAttention.o.bias": "encoder.layer.15.attention.output.dense.bias",
278
+ "encoderLayer.15.attnLayerNorm.weight": "encoder.layer.15.attention.output.LayerNorm.weight",
279
+ "encoderLayer.15.attnLayerNorm.bias": "encoder.layer.15.attention.output.LayerNorm.bias",
280
+ "encoderLayer.15.feedForward.intermediateDense.weight": "encoder.layer.15.intermediate.dense.weight",
281
+ "encoderLayer.15.feedForward.intermediateDense.bias": "encoder.layer.15.intermediate.dense.bias",
282
+ "encoderLayer.15.feedForward.outputDense.weight": "encoder.layer.15.output.dense.weight",
283
+ "encoderLayer.15.feedForward.outputDense.bias": "encoder.layer.15.output.dense.bias",
284
+ "encoderLayer.15.ffnLayerNorm.weight": "encoder.layer.15.output.LayerNorm.weight",
285
+ "encoderLayer.15.ffnLayerNorm.bias": "encoder.layer.15.output.LayerNorm.bias",
286
+ "encoderLayer.16.multiHeadAttention.q.weight": "encoder.layer.16.attention.self.query.weight",
287
+ "encoderLayer.16.multiHeadAttention.q.bias": "encoder.layer.16.attention.self.query.bias",
288
+ "encoderLayer.16.multiHeadAttention.k.weight": "encoder.layer.16.attention.self.key.weight",
289
+ "encoderLayer.16.multiHeadAttention.k.bias": "encoder.layer.16.attention.self.key.bias",
290
+ "encoderLayer.16.multiHeadAttention.v.weight": "encoder.layer.16.attention.self.value.weight",
291
+ "encoderLayer.16.multiHeadAttention.v.bias": "encoder.layer.16.attention.self.value.bias",
292
+ "encoderLayer.16.multiHeadAttention.o.weight": "encoder.layer.16.attention.output.dense.weight",
293
+ "encoderLayer.16.multiHeadAttention.o.bias": "encoder.layer.16.attention.output.dense.bias",
294
+ "encoderLayer.16.attnLayerNorm.weight": "encoder.layer.16.attention.output.LayerNorm.weight",
295
+ "encoderLayer.16.attnLayerNorm.bias": "encoder.layer.16.attention.output.LayerNorm.bias",
296
+ "encoderLayer.16.feedForward.intermediateDense.weight": "encoder.layer.16.intermediate.dense.weight",
297
+ "encoderLayer.16.feedForward.intermediateDense.bias": "encoder.layer.16.intermediate.dense.bias",
298
+ "encoderLayer.16.feedForward.outputDense.weight": "encoder.layer.16.output.dense.weight",
299
+ "encoderLayer.16.feedForward.outputDense.bias": "encoder.layer.16.output.dense.bias",
300
+ "encoderLayer.16.ffnLayerNorm.weight": "encoder.layer.16.output.LayerNorm.weight",
301
+ "encoderLayer.16.ffnLayerNorm.bias": "encoder.layer.16.output.LayerNorm.bias",
302
+ "encoderLayer.17.multiHeadAttention.q.weight": "encoder.layer.17.attention.self.query.weight",
303
+ "encoderLayer.17.multiHeadAttention.q.bias": "encoder.layer.17.attention.self.query.bias",
304
+ "encoderLayer.17.multiHeadAttention.k.weight": "encoder.layer.17.attention.self.key.weight",
305
+ "encoderLayer.17.multiHeadAttention.k.bias": "encoder.layer.17.attention.self.key.bias",
306
+ "encoderLayer.17.multiHeadAttention.v.weight": "encoder.layer.17.attention.self.value.weight",
307
+ "encoderLayer.17.multiHeadAttention.v.bias": "encoder.layer.17.attention.self.value.bias",
308
+ "encoderLayer.17.multiHeadAttention.o.weight": "encoder.layer.17.attention.output.dense.weight",
309
+ "encoderLayer.17.multiHeadAttention.o.bias": "encoder.layer.17.attention.output.dense.bias",
310
+ "encoderLayer.17.attnLayerNorm.weight": "encoder.layer.17.attention.output.LayerNorm.weight",
311
+ "encoderLayer.17.attnLayerNorm.bias": "encoder.layer.17.attention.output.LayerNorm.bias",
312
+ "encoderLayer.17.feedForward.intermediateDense.weight": "encoder.layer.17.intermediate.dense.weight",
313
+ "encoderLayer.17.feedForward.intermediateDense.bias": "encoder.layer.17.intermediate.dense.bias",
314
+ "encoderLayer.17.feedForward.outputDense.weight": "encoder.layer.17.output.dense.weight",
315
+ "encoderLayer.17.feedForward.outputDense.bias": "encoder.layer.17.output.dense.bias",
316
+ "encoderLayer.17.ffnLayerNorm.weight": "encoder.layer.17.output.LayerNorm.weight",
317
+ "encoderLayer.17.ffnLayerNorm.bias": "encoder.layer.17.output.LayerNorm.bias",
318
+ "encoderLayer.18.multiHeadAttention.q.weight": "encoder.layer.18.attention.self.query.weight",
319
+ "encoderLayer.18.multiHeadAttention.q.bias": "encoder.layer.18.attention.self.query.bias",
320
+ "encoderLayer.18.multiHeadAttention.k.weight": "encoder.layer.18.attention.self.key.weight",
321
+ "encoderLayer.18.multiHeadAttention.k.bias": "encoder.layer.18.attention.self.key.bias",
322
+ "encoderLayer.18.multiHeadAttention.v.weight": "encoder.layer.18.attention.self.value.weight",
323
+ "encoderLayer.18.multiHeadAttention.v.bias": "encoder.layer.18.attention.self.value.bias",
324
+ "encoderLayer.18.multiHeadAttention.o.weight": "encoder.layer.18.attention.output.dense.weight",
325
+ "encoderLayer.18.multiHeadAttention.o.bias": "encoder.layer.18.attention.output.dense.bias",
326
+ "encoderLayer.18.attnLayerNorm.weight": "encoder.layer.18.attention.output.LayerNorm.weight",
327
+ "encoderLayer.18.attnLayerNorm.bias": "encoder.layer.18.attention.output.LayerNorm.bias",
328
+ "encoderLayer.18.feedForward.intermediateDense.weight": "encoder.layer.18.intermediate.dense.weight",
329
+ "encoderLayer.18.feedForward.intermediateDense.bias": "encoder.layer.18.intermediate.dense.bias",
330
+ "encoderLayer.18.feedForward.outputDense.weight": "encoder.layer.18.output.dense.weight",
331
+ "encoderLayer.18.feedForward.outputDense.bias": "encoder.layer.18.output.dense.bias",
332
+ "encoderLayer.18.ffnLayerNorm.weight": "encoder.layer.18.output.LayerNorm.weight",
333
+ "encoderLayer.18.ffnLayerNorm.bias": "encoder.layer.18.output.LayerNorm.bias",
334
+ "encoderLayer.19.multiHeadAttention.q.weight": "encoder.layer.19.attention.self.query.weight",
335
+ "encoderLayer.19.multiHeadAttention.q.bias": "encoder.layer.19.attention.self.query.bias",
336
+ "encoderLayer.19.multiHeadAttention.k.weight": "encoder.layer.19.attention.self.key.weight",
337
+ "encoderLayer.19.multiHeadAttention.k.bias": "encoder.layer.19.attention.self.key.bias",
338
+ "encoderLayer.19.multiHeadAttention.v.weight": "encoder.layer.19.attention.self.value.weight",
339
+ "encoderLayer.19.multiHeadAttention.v.bias": "encoder.layer.19.attention.self.value.bias",
340
+ "encoderLayer.19.multiHeadAttention.o.weight": "encoder.layer.19.attention.output.dense.weight",
341
+ "encoderLayer.19.multiHeadAttention.o.bias": "encoder.layer.19.attention.output.dense.bias",
342
+ "encoderLayer.19.attnLayerNorm.weight": "encoder.layer.19.attention.output.LayerNorm.weight",
343
+ "encoderLayer.19.attnLayerNorm.bias": "encoder.layer.19.attention.output.LayerNorm.bias",
344
+ "encoderLayer.19.feedForward.intermediateDense.weight": "encoder.layer.19.intermediate.dense.weight",
345
+ "encoderLayer.19.feedForward.intermediateDense.bias": "encoder.layer.19.intermediate.dense.bias",
346
+ "encoderLayer.19.feedForward.outputDense.weight": "encoder.layer.19.output.dense.weight",
347
+ "encoderLayer.19.feedForward.outputDense.bias": "encoder.layer.19.output.dense.bias",
348
+ "encoderLayer.19.ffnLayerNorm.weight": "encoder.layer.19.output.LayerNorm.weight",
349
+ "encoderLayer.19.ffnLayerNorm.bias": "encoder.layer.19.output.LayerNorm.bias",
350
+ "encoderLayer.20.multiHeadAttention.q.weight": "encoder.layer.20.attention.self.query.weight",
351
+ "encoderLayer.20.multiHeadAttention.q.bias": "encoder.layer.20.attention.self.query.bias",
352
+ "encoderLayer.20.multiHeadAttention.k.weight": "encoder.layer.20.attention.self.key.weight",
353
+ "encoderLayer.20.multiHeadAttention.k.bias": "encoder.layer.20.attention.self.key.bias",
354
+ "encoderLayer.20.multiHeadAttention.v.weight": "encoder.layer.20.attention.self.value.weight",
355
+ "encoderLayer.20.multiHeadAttention.v.bias": "encoder.layer.20.attention.self.value.bias",
356
+ "encoderLayer.20.multiHeadAttention.o.weight": "encoder.layer.20.attention.output.dense.weight",
357
+ "encoderLayer.20.multiHeadAttention.o.bias": "encoder.layer.20.attention.output.dense.bias",
358
+ "encoderLayer.20.attnLayerNorm.weight": "encoder.layer.20.attention.output.LayerNorm.weight",
359
+ "encoderLayer.20.attnLayerNorm.bias": "encoder.layer.20.attention.output.LayerNorm.bias",
360
+ "encoderLayer.20.feedForward.intermediateDense.weight": "encoder.layer.20.intermediate.dense.weight",
361
+ "encoderLayer.20.feedForward.intermediateDense.bias": "encoder.layer.20.intermediate.dense.bias",
362
+ "encoderLayer.20.feedForward.outputDense.weight": "encoder.layer.20.output.dense.weight",
363
+ "encoderLayer.20.feedForward.outputDense.bias": "encoder.layer.20.output.dense.bias",
364
+ "encoderLayer.20.ffnLayerNorm.weight": "encoder.layer.20.output.LayerNorm.weight",
365
+ "encoderLayer.20.ffnLayerNorm.bias": "encoder.layer.20.output.LayerNorm.bias",
366
+ "encoderLayer.21.multiHeadAttention.q.weight": "encoder.layer.21.attention.self.query.weight",
367
+ "encoderLayer.21.multiHeadAttention.q.bias": "encoder.layer.21.attention.self.query.bias",
368
+ "encoderLayer.21.multiHeadAttention.k.weight": "encoder.layer.21.attention.self.key.weight",
369
+ "encoderLayer.21.multiHeadAttention.k.bias": "encoder.layer.21.attention.self.key.bias",
370
+ "encoderLayer.21.multiHeadAttention.v.weight": "encoder.layer.21.attention.self.value.weight",
371
+ "encoderLayer.21.multiHeadAttention.v.bias": "encoder.layer.21.attention.self.value.bias",
372
+ "encoderLayer.21.multiHeadAttention.o.weight": "encoder.layer.21.attention.output.dense.weight",
373
+ "encoderLayer.21.multiHeadAttention.o.bias": "encoder.layer.21.attention.output.dense.bias",
374
+ "encoderLayer.21.attnLayerNorm.weight": "encoder.layer.21.attention.output.LayerNorm.weight",
375
+ "encoderLayer.21.attnLayerNorm.bias": "encoder.layer.21.attention.output.LayerNorm.bias",
376
+ "encoderLayer.21.feedForward.intermediateDense.weight": "encoder.layer.21.intermediate.dense.weight",
377
+ "encoderLayer.21.feedForward.intermediateDense.bias": "encoder.layer.21.intermediate.dense.bias",
378
+ "encoderLayer.21.feedForward.outputDense.weight": "encoder.layer.21.output.dense.weight",
379
+ "encoderLayer.21.feedForward.outputDense.bias": "encoder.layer.21.output.dense.bias",
380
+ "encoderLayer.21.ffnLayerNorm.weight": "encoder.layer.21.output.LayerNorm.weight",
381
+ "encoderLayer.21.ffnLayerNorm.bias": "encoder.layer.21.output.LayerNorm.bias",
382
+ "encoderLayer.22.multiHeadAttention.q.weight": "encoder.layer.22.attention.self.query.weight",
383
+ "encoderLayer.22.multiHeadAttention.q.bias": "encoder.layer.22.attention.self.query.bias",
384
+ "encoderLayer.22.multiHeadAttention.k.weight": "encoder.layer.22.attention.self.key.weight",
385
+ "encoderLayer.22.multiHeadAttention.k.bias": "encoder.layer.22.attention.self.key.bias",
386
+ "encoderLayer.22.multiHeadAttention.v.weight": "encoder.layer.22.attention.self.value.weight",
387
+ "encoderLayer.22.multiHeadAttention.v.bias": "encoder.layer.22.attention.self.value.bias",
388
+ "encoderLayer.22.multiHeadAttention.o.weight": "encoder.layer.22.attention.output.dense.weight",
389
+ "encoderLayer.22.multiHeadAttention.o.bias": "encoder.layer.22.attention.output.dense.bias",
390
+ "encoderLayer.22.attnLayerNorm.weight": "encoder.layer.22.attention.output.LayerNorm.weight",
391
+ "encoderLayer.22.attnLayerNorm.bias": "encoder.layer.22.attention.output.LayerNorm.bias",
392
+ "encoderLayer.22.feedForward.intermediateDense.weight": "encoder.layer.22.intermediate.dense.weight",
393
+ "encoderLayer.22.feedForward.intermediateDense.bias": "encoder.layer.22.intermediate.dense.bias",
394
+ "encoderLayer.22.feedForward.outputDense.weight": "encoder.layer.22.output.dense.weight",
395
+ "encoderLayer.22.feedForward.outputDense.bias": "encoder.layer.22.output.dense.bias",
396
+ "encoderLayer.22.ffnLayerNorm.weight": "encoder.layer.22.output.LayerNorm.weight",
397
+ "encoderLayer.22.ffnLayerNorm.bias": "encoder.layer.22.output.LayerNorm.bias",
398
+ "encoderLayer.23.multiHeadAttention.q.weight": "encoder.layer.23.attention.self.query.weight",
399
+ "encoderLayer.23.multiHeadAttention.q.bias": "encoder.layer.23.attention.self.query.bias",
400
+ "encoderLayer.23.multiHeadAttention.k.weight": "encoder.layer.23.attention.self.key.weight",
401
+ "encoderLayer.23.multiHeadAttention.k.bias": "encoder.layer.23.attention.self.key.bias",
402
+ "encoderLayer.23.multiHeadAttention.v.weight": "encoder.layer.23.attention.self.value.weight",
403
+ "encoderLayer.23.multiHeadAttention.v.bias": "encoder.layer.23.attention.self.value.bias",
404
+ "encoderLayer.23.multiHeadAttention.o.weight": "encoder.layer.23.attention.output.dense.weight",
405
+ "encoderLayer.23.multiHeadAttention.o.bias": "encoder.layer.23.attention.output.dense.bias",
406
+ "encoderLayer.23.attnLayerNorm.weight": "encoder.layer.23.attention.output.LayerNorm.weight",
407
+ "encoderLayer.23.attnLayerNorm.bias": "encoder.layer.23.attention.output.LayerNorm.bias",
408
+ "encoderLayer.23.feedForward.intermediateDense.weight": "encoder.layer.23.intermediate.dense.weight",
409
+ "encoderLayer.23.feedForward.intermediateDense.bias": "encoder.layer.23.intermediate.dense.bias",
410
+ "encoderLayer.23.feedForward.outputDense.weight": "encoder.layer.23.output.dense.weight",
411
+ "encoderLayer.23.feedForward.outputDense.bias": "encoder.layer.23.output.dense.bias",
412
+ "encoderLayer.23.ffnLayerNorm.weight": "encoder.layer.23.output.LayerNorm.weight",
413
+ "encoderLayer.23.ffnLayerNorm.bias": "encoder.layer.23.output.LayerNorm.bias"
414
+ }
415
+ }
bloom-560m/bert4torch_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_residual_connection_post_layernorm": false,
3
+ "attention_dropout": 0.0,
4
+ "attention_softmax_in_fp32": true,
5
+ "bias_dropout_fusion": true,
6
+ "bos_token_id": 1,
7
+ "eos_token_id": 2,
8
+ "pad_token_id": 3,
9
+ "unk_token_id": 0,
10
+ "hidden_dropout": 0.0,
11
+ "hidden_act": "gelu_fast",
12
+ "initializer_range": 0.02,
13
+ "layer_norm_eps": 1e-05,
14
+ "hidden_size": 1024,
15
+ "intermediate_size": 4096,
16
+ "num_hidden_layers": 24,
17
+ "num_attention_heads": 16,
18
+ "offset_alibi": 100,
19
+ "pretraining_tp": 1,
20
+ "skip_bias_add": true,
21
+ "skip_bias_add_qkv": false,
22
+ "vocab_size": 250880,
23
+ "segment_vocab_size": 0,
24
+ "pre_layernorm": true,
25
+ "tie_emb_prj_weight": true,
26
+ "model": "bloom"
27
+ }
bloomz-560m/bert4torch_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "apply_residual_connection_post_layernorm": false,
3
+ "attention_dropout": 0.0,
4
+ "attention_softmax_in_fp32": true,
5
+ "bias_dropout_fusion": true,
6
+ "bos_token_id": 1,
7
+ "eos_token_id": 2,
8
+ "pad_token_id": 3,
9
+ "unk_token_id": 0,
10
+ "hidden_dropout": 0.0,
11
+ "hidden_act": "gelu_fast",
12
+ "initializer_range": 0.02,
13
+ "layer_norm_eps": 1e-05,
14
+ "hidden_size": 1024,
15
+ "intermediate_size": 4096,
16
+ "num_hidden_layers": 24,
17
+ "num_attention_heads": 16,
18
+ "offset_alibi": 100,
19
+ "pretraining_tp": 1,
20
+ "skip_bias_add": true,
21
+ "skip_bias_add_qkv": false,
22
+ "vocab_size": 250880,
23
+ "segment_vocab_size": 0,
24
+ "pre_layernorm": true,
25
+ "tie_emb_prj_weight": true,
26
+ "model": "bloom"
27
+ }
chatglm-6b-int4/bert4torch_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm",
3
+ "hidden_act": "gelu_new",
4
+ "bos_token_id": 130004,
5
+ "eos_token_id": 130005,
6
+ "mask_token_id": 130000,
7
+ "gmask_token_id": 130001,
8
+ "pad_token_id": 3,
9
+ "hidden_size": 4096,
10
+ "intermediate_size": 16384,
11
+ "layer_norm_eps": 1e-05,
12
+ "max_sequence_length": 2048,
13
+ "num_attention_heads": 32,
14
+ "num_hidden_layers": 28,
15
+ "position_encoding_2d": true,
16
+ "torch_dtype": "float16",
17
+ "vocab_size": 130528,
18
+ "segment_vocab_size": 0,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "tie_emb_prj_weight": false,
22
+ "quantization_bit": 4,
23
+ "quantization_method": "cpm_kernels",
24
+ "target_modules": ["q", "k", "v", "o", "intermediateDense", "outputDense"]
25
+ }
chatglm-6b-int8/bert4torch_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm",
3
+ "hidden_act": "gelu_new",
4
+ "bos_token_id": 130004,
5
+ "eos_token_id": 130005,
6
+ "mask_token_id": 130000,
7
+ "gmask_token_id": 130001,
8
+ "pad_token_id": 3,
9
+ "hidden_size": 4096,
10
+ "intermediate_size": 16384,
11
+ "layer_norm_eps": 1e-05,
12
+ "max_sequence_length": 2048,
13
+ "num_attention_heads": 32,
14
+ "num_hidden_layers": 28,
15
+ "position_encoding_2d": true,
16
+ "torch_dtype": "float16",
17
+ "vocab_size": 130528,
18
+ "segment_vocab_size": 0,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "quantization_bit": 8,
22
+ "quantization_method": "cpm_kernels",
23
+ "target_modules": ["q", "k", "v", "o", "intermediateDense", "outputDense"],
24
+ "tie_emb_prj_weight": false
25
+ }
chatglm-6b/bert4torch_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm",
3
+ "hidden_act": "gelu_new",
4
+ "bos_token_id": 130004,
5
+ "eos_token_id": 130005,
6
+ "mask_token_id": 130000,
7
+ "gmask_token_id": 130001,
8
+ "pad_token_id": 3,
9
+ "hidden_size": 4096,
10
+ "intermediate_size": 16384,
11
+ "layer_norm_eps": 1e-05,
12
+ "max_sequence_length": 2048,
13
+ "num_attention_heads": 32,
14
+ "num_hidden_layers": 28,
15
+ "position_encoding_2d": true,
16
+ "torch_dtype": "float16",
17
+ "vocab_size": 130528,
18
+ "segment_vocab_size": 0,
19
+ "skip_init": true,
20
+ "rope_rank": "updown",
21
+ "tie_emb_prj_weight": false
22
+ }
chatglm2-6b-32k/bert4torch_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm2",
3
+ "hidden_act": "swiglu",
4
+ "hidden_size": 4096,
5
+ "intermediate_size": 13696,
6
+ "layer_norm_eps": 1e-05,
7
+ "max_sequence_length": 32768,
8
+ "num_attention_heads": 32,
9
+ "num_hidden_layers": 28,
10
+ "vocab_size": 65024,
11
+ "segment_vocab_size": 0,
12
+ "multi_query_group_num": 2,
13
+ "skip_init": true,
14
+ "tie_emb_prj_weight": false,
15
+ "eos_token_id": 2,
16
+ "pad_token_id": 2,
17
+ "rmsnorm": true,
18
+ "rope_rank": "adjacent",
19
+ "rope_ratio": 16,
20
+ "position_encoding_2d": true,
21
+ "flash_attention": true
22
+ }
chatglm2-6b-int4/bert4torch_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm2",
3
+ "hidden_act": "swiglu",
4
+ "hidden_size": 4096,
5
+ "intermediate_size": 13696,
6
+ "layer_norm_eps": 1e-05,
7
+ "max_sequence_length": 32768,
8
+ "num_attention_heads": 32,
9
+ "num_hidden_layers": 28,
10
+ "vocab_size": 65024,
11
+ "segment_vocab_size": 0,
12
+ "multi_query_group_num": 2,
13
+ "skip_init": true,
14
+ "tie_emb_prj_weight": false,
15
+ "eos_token_id": 2,
16
+ "pad_token_id": 2,
17
+ "rmsnorm": true,
18
+ "rope_rank": "adjacent",
19
+ "position_encoding_2d": true,
20
+ "flash_attention": true,
21
+ "quantization_bit": 4,
22
+ "quantization_method": "cpm_kernels",
23
+ "target_modules": ["q", "k", "v", "o", "intermediateDense", "outputDense"]
24
+ }
chatglm2-6b/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm2",
3
+ "hidden_act": "swiglu",
4
+ "hidden_size": 4096,
5
+ "intermediate_size": 13696,
6
+ "layer_norm_eps": 1e-05,
7
+ "max_sequence_length": 32768,
8
+ "num_attention_heads": 32,
9
+ "num_hidden_layers": 28,
10
+ "vocab_size": 65024,
11
+ "segment_vocab_size": 0,
12
+ "multi_query_group_num": 2,
13
+ "skip_init": true,
14
+ "tie_emb_prj_weight": false,
15
+ "eos_token_id": 2,
16
+ "pad_token_id": 2,
17
+ "rmsnorm": true,
18
+ "rope_rank": "adjacent",
19
+ "position_encoding_2d": true,
20
+ "flash_attention": true
21
+ }
chatglm3-6b-32k/bert4torch_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm2",
3
+ "hidden_act": "swiglu",
4
+ "hidden_size": 4096,
5
+ "intermediate_size": 13696,
6
+ "layer_norm_eps": 1e-05,
7
+ "max_sequence_length": 32768,
8
+ "num_attention_heads": 32,
9
+ "num_hidden_layers": 28,
10
+ "vocab_size": 65024,
11
+ "segment_vocab_size": 0,
12
+ "multi_query_group_num": 2,
13
+ "skip_init": true,
14
+ "tie_emb_prj_weight": false,
15
+ "eos_token_id": 2,
16
+ "pad_token_id": 2,
17
+ "rmsnorm": true,
18
+ "rope_rank": "adjacent",
19
+ "ntk_alpha": 44.24632364389211,
20
+ "position_encoding_2d": true,
21
+ "flash_attention": true
22
+ }
chatglm3-6b/bert4torch_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "glm2",
3
+ "hidden_act": "swiglu",
4
+ "hidden_size": 4096,
5
+ "intermediate_size": 13696,
6
+ "layer_norm_eps": 1e-05,
7
+ "max_sequence_length": 32768,
8
+ "num_attention_heads": 32,
9
+ "num_hidden_layers": 28,
10
+ "vocab_size": 65024,
11
+ "segment_vocab_size": 0,
12
+ "multi_query_group_num": 2,
13
+ "skip_init": true,
14
+ "tie_emb_prj_weight": false,
15
+ "eos_token_id": 2,
16
+ "pad_token_id": 0,
17
+ "rmsnorm": true,
18
+ "rope_rank": "adjacent",
19
+ "position_encoding_2d": true,
20
+ "flash_attention": true
21
+ }
chinese-bert-wwm-ext/bert4torch_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "directionality": "bidi",
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 3072,
12
+ "layer_norm_eps": 1e-12,
13
+ "max_position_embeddings": 512,
14
+ "model_type": "bert",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 12,
17
+ "output_past": true,
18
+ "pad_token_id": 0,
19
+ "pooler_fc_size": 768,
20
+ "pooler_num_attention_heads": 12,
21
+ "pooler_num_fc_layers": 3,
22
+ "pooler_size_per_head": 128,
23
+ "pooler_type": "first_token_transform",
24
+ "type_vocab_size": 2,
25
+ "vocab_size": 21128
26
+ }
chinese-macbert-base/bert4torch_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "directionality": "bidi",
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 768,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 3072,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "pad_token_id": 0,
19
+ "pooler_fc_size": 768,
20
+ "pooler_num_attention_heads": 12,
21
+ "pooler_num_fc_layers": 3,
22
+ "pooler_size_per_head": 128,
23
+ "pooler_type": "first_token_transform",
24
+ "type_vocab_size": 2,
25
+ "vocab_size": 21128
26
+ }
chinese-macbert-large/bert4torch_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "directionality": "bidi",
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 1024,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4096,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 16,
17
+ "num_hidden_layers": 24,
18
+ "pad_token_id": 0,
19
+ "pooler_fc_size": 768,
20
+ "pooler_num_attention_heads": 12,
21
+ "pooler_num_fc_layers": 3,
22
+ "pooler_size_per_head": 128,
23
+ "pooler_type": "first_token_transform",
24
+ "type_vocab_size": 2,
25
+ "vocab_size": 21128
26
+ }
chinese-roberta-wwm-ext-base/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "directionality": "bidi",
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "output_past": true,
20
+ "pad_token_id": 1,
21
+ "pooler_fc_size": 768,
22
+ "pooler_num_attention_heads": 12,
23
+ "pooler_num_fc_layers": 3,
24
+ "pooler_size_per_head": 128,
25
+ "pooler_type": "first_token_transform",
26
+ "type_vocab_size": 2,
27
+ "vocab_size": 21128
28
+ }
chinese-roberta-wwm-ext-large/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "directionality": "bidi",
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "output_past": true,
20
+ "pad_token_id": 1,
21
+ "pooler_fc_size": 768,
22
+ "pooler_num_attention_heads": 12,
23
+ "pooler_num_fc_layers": 3,
24
+ "pooler_size_per_head": 128,
25
+ "pooler_type": "first_token_transform",
26
+ "type_vocab_size": 2,
27
+ "vocab_size": 21128
28
+ }
chinese-xlnet-base/bert4torch_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "xlnet",
3
+ "attn_type": "bi",
4
+ "bi_data": false,
5
+ "bos_token_id": 1,
6
+ "clamp_len": -1,
7
+ "intermediate_size": 3072,
8
+ "hidden_size": 768,
9
+ "hidden_dropout_prob": 0.1,
10
+ "end_n_top": 5,
11
+ "eos_token_id": 2,
12
+ "hidden_act": "relu",
13
+ "initializer_range": 0.02,
14
+ "layer_norm_eps": 1e-12,
15
+ "mem_len": null,
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 12,
18
+ "output_past": true,
19
+ "pad_token_id": 5,
20
+ "reuse_len": null,
21
+ "same_length": false,
22
+ "start_n_top": 5,
23
+ "summary_activation": "tanh",
24
+ "summary_last_hidden_dropout_prob": 0.1,
25
+ "summary_type": "last",
26
+ "summary_use_proj": true,
27
+ "untie_r": true,
28
+ "vocab_size": 32000
29
+ }
chinese_alpaca_plus_7b/bert4torch_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "hidden_size": 4096,
4
+ "intermediate_size": 11008,
5
+ "num_attention_heads": 32,
6
+ "num_hidden_layers": 32,
7
+ "layer_norm_eps": 1e-06,
8
+ "hidden_act": "silu",
9
+ "vocab_size": 49954,
10
+ "segment_vocab_size": 0,
11
+ "skip_init": true,
12
+ "rope_rank": "updown"
13
+ }
chinese_llama_plus_7b/bert4torch_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "llama",
3
+ "hidden_size": 4096,
4
+ "intermediate_size": 11008,
5
+ "num_attention_heads": 32,
6
+ "num_hidden_layers": 32,
7
+ "layer_norm_eps": 1e-06,
8
+ "hidden_act": "silu",
9
+ "vocab_size": 49953,
10
+ "segment_vocab_size": 0,
11
+ "skip_init": true,
12
+ "rope_rank": "updown"
13
+ }
deepseek-moe-16b-base/bert4torch_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "attention_dropout": 0.0,
4
+ "aux_loss_alpha": 0.001,
5
+ "bos_token_id": 100000,
6
+ "eos_token_id": 100001,
7
+ "first_k_dense_replace": 1,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 10944,
12
+ "max_position_embeddings": 4096,
13
+ "model": "deepseek",
14
+ "moe_intermediate_size": 1408,
15
+ "moe_layer_freq": 1,
16
+ "n_routed_experts": 64,
17
+ "n_shared_experts": 2,
18
+ "norm_topk_prob": false,
19
+ "num_attention_heads": 16,
20
+ "num_experts_per_tok": 6,
21
+ "num_hidden_layers": 28,
22
+ "num_key_value_heads": 16,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 10000,
27
+ "scoring_func": "softmax",
28
+ "seq_aux": true,
29
+ "tie_word_embeddings": false,
30
+ "torch_dtype": "bfloat16",
31
+ "vocab_size": 102400,
32
+ "skip_init": true,
33
+ "segment_vocab_size": 0,
34
+ "rope_rank": "updown"
35
+ }
deepseek-moe-16b-chat/bert4torch_config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "attention_bias": false,
3
+ "attention_dropout": 0.0,
4
+ "aux_loss_alpha": 0.001,
5
+ "bos_token_id": 100000,
6
+ "eos_token_id": 100001,
7
+ "first_k_dense_replace": 1,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 2048,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 10944,
12
+ "max_position_embeddings": 4096,
13
+ "model": "deepseek",
14
+ "moe_intermediate_size": 1408,
15
+ "moe_layer_freq": 1,
16
+ "n_routed_experts": 64,
17
+ "n_shared_experts": 2,
18
+ "norm_topk_prob": false,
19
+ "num_attention_heads": 16,
20
+ "num_experts_per_tok": 6,
21
+ "num_hidden_layers": 28,
22
+ "num_key_value_heads": 16,
23
+ "pretraining_tp": 1,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 10000,
27
+ "scoring_func": "softmax",
28
+ "seq_aux": true,
29
+ "tie_word_embeddings": false,
30
+ "torch_dtype": "bfloat16",
31
+ "vocab_size": 102400,
32
+ "skip_init": true,
33
+ "segment_vocab_size": 0,
34
+ "rope_rank": "updown"
35
+ }
falcon-7b-instruct/bert4torch_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "falcon",
3
+ "type_vocab_size": 0,
4
+ "p_bias": "rotary",
5
+ "rope_rank": "updown",
6
+ "apply_residual_post_layernorm": false,
7
+ "attention_dropout": 0.0,
8
+ "bias": false,
9
+ "bos_token_id": 11,
10
+ "eos_token_id": 11,
11
+ "hidden_dropout": 0.0,
12
+ "hidden_size": 4544,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 8192,
15
+ "hidden_act": "gelu",
16
+ "layer_norm_eps": 1e-05,
17
+ "multi_query": true,
18
+ "num_attention_heads": 71,
19
+ "num_hidden_layers": 32,
20
+ "parallel_attn": true,
21
+ "torch_dtype": "bfloat16",
22
+ "multi_query_group_num": 1,
23
+ "vocab_size": 65024,
24
+ "skip_init": true,
25
+ "norm_mode": "torch_buildin",
26
+ "flash_attention": "sdpa"
27
+ }
falcon-7b/bert4torch_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "falcon",
3
+ "type_vocab_size": 0,
4
+ "p_bias": "rotary",
5
+ "rope_rank": "updown",
6
+ "apply_residual_post_layernorm": false,
7
+ "attention_dropout": 0.0,
8
+ "bias": false,
9
+ "bos_token_id": 11,
10
+ "eos_token_id": 11,
11
+ "hidden_dropout": 0.0,
12
+ "hidden_size": 4544,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 8192,
15
+ "hidden_act": "gelu",
16
+ "layer_norm_eps": 1e-05,
17
+ "multi_query": true,
18
+ "num_attention_heads": 71,
19
+ "num_hidden_layers": 32,
20
+ "parallel_attn": true,
21
+ "torch_dtype": "bfloat16",
22
+ "multi_query_group_num": 1,
23
+ "vocab_size": 65024,
24
+ "skip_init": true,
25
+ "norm_mode": "torch_buildin",
26
+ "flash_attention": "sdpa"
27
+ }
falcon-rw-1b/bert4torch_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "falcon",
3
+ "type_vocab_size": 0,
4
+ "p_bias": "alibi",
5
+ "apply_residual_post_layernorm": false,
6
+ "attention_dropout": 0.0,
7
+ "bias": true,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_dropout": 0.0,
11
+ "hidden_size": 2048,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 8192,
14
+ "hidden_act": "gelu",
15
+ "layer_norm_eps": 1e-05,
16
+ "multi_query": false,
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 24,
19
+ "parallel_attn": false,
20
+ "torch_dtype": "bfloat16",
21
+ "vocab_size": 50304,
22
+ "skip_init": true,
23
+ "norm_mode": "torch_buildin"
24
+ }
gpt2-chinese-cluecorpussmall/bert4torch_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "gpt2",
3
+ "segment_vocab_size": 0,
4
+ "vocab_size": 21128,
5
+ "hidden_size": 768,
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "hidden_dropout_prob": 0.1,
8
+ "hidden_act": "gelu",
9
+ "initializer_range": 0.014142135623731,
10
+ "intermediate_size": 3072,
11
+ "max_position_embeddings": 1024,
12
+ "num_attention_heads": 12,
13
+ "num_hidden_layers": 12
14
+ }