Tongjilibo committed
Commit 6864349 · Parent: e5d6f0a

Remove position_encoding_2d from glm
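All of the THUDM hunks below drop the same `"position_encoding_2d": true` line; presumably the `"rope_scaling": {"type": "glm"}` entry that remains is enough for the library to apply GLM's 2D rotary scheme, so the explicit flag is redundant (an assumption, not stated in the commit). For context, a minimal sketch of how one of these config files is consumed, via bert4torch's `build_transformer_model` entry point; the paths are placeholders for local files:

```python
# Minimal sketch (not part of this commit): loading a ChatGLM model from
# one of the configs edited below. build_transformer_model is bert4torch's
# standard model-building entry point.
from bert4torch.models import build_transformer_model

model = build_transformer_model(
    config_path="THUDM/chatglm-6b/bert4torch_config.json",  # placeholder path
    checkpoint_path=None,  # weights omitted in this sketch
)
```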
THUDM/chatglm-6b-int4/bert4torch_config.json CHANGED
@@ -12,7 +12,6 @@
  "max_sequence_length": 2048,
  "num_attention_heads": 32,
  "num_hidden_layers": 28,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "vocab_size": 130528,

THUDM/chatglm-6b-int8/bert4torch_config.json CHANGED
@@ -12,7 +12,6 @@
  "max_sequence_length": 2048,
  "num_attention_heads": 32,
  "num_hidden_layers": 28,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "vocab_size": 130528,

THUDM/chatglm-6b-v0.1.0/bert4torch_config.json CHANGED
@@ -12,7 +12,6 @@
  "max_sequence_length": 2048,
  "num_attention_heads": 32,
  "num_hidden_layers": 28,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "vocab_size": 130528,

THUDM/chatglm-6b/bert4torch_config.json CHANGED
@@ -12,7 +12,6 @@
  "max_sequence_length": 2048,
  "num_attention_heads": 32,
  "num_hidden_layers": 28,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "vocab_size": 130528,

THUDM/chatglm2-6b-32k/bert4torch_config.json CHANGED
@@ -21,7 +21,6 @@
  "factor": 16
  },
  "torch_dtype": "float16",
- "position_encoding_2d": true,
  "_attn_implementation": "sdpa",
  "generation_config": {"max_length": 32768}
  }

THUDM/chatglm2-6b-int4/bert4torch_config.json CHANGED
@@ -16,7 +16,6 @@
  "pad_token_id": 2,
  "rmsnorm": true,
  "rope_rank": "adjacent",
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "_attn_implementation": "sdpa",

THUDM/chatglm2-6b/bert4torch_config.json CHANGED
@@ -16,7 +16,6 @@
  "pad_token_id": 2,
  "rmsnorm": true,
  "rope_rank": "adjacent",
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "_attn_implementation": "sdpa",

THUDM/chatglm3-6b-32k/bert4torch_config.json CHANGED
@@ -18,7 +18,6 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "rope_theta": 500000,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "_attn_implementation": "sdpa",
  "torch_dtype": "float16",

THUDM/chatglm3-6b/bert4torch_config.json CHANGED
@@ -17,7 +17,6 @@
  "pad_token_id": 0,
  "rmsnorm": true,
  "rope_rank": "adjacent",
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "float16",
  "_attn_implementation": "sdpa",

THUDM/glm-4-9b-chat-1m/bert4torch_config.json CHANGED
@@ -16,7 +16,6 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "rope_theta": 100000000,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "bfloat16",
  "_attn_implementation": "sdpa",

THUDM/glm-4-9b-chat/bert4torch_config.json CHANGED
@@ -16,7 +16,6 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "rope_theta": 5000000,
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "bfloat16",
  "_attn_implementation": "sdpa",

THUDM/glm-4-9b/bert4torch_config.json CHANGED
@@ -15,7 +15,6 @@
  "tie_emb_prj_weight": false,
  "rmsnorm": true,
  "rope_rank": "adjacent",
- "position_encoding_2d": true,
  "rope_scaling": {"type": "glm"},
  "torch_dtype": "bfloat16",
  "_attn_implementation": "sdpa",

THUDM/glm-4v-9b/bert4torch_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "model": "glm4v",
+ "template": "glm4v",
+ "hidden_act": "swiglu",
+ "hidden_size": 4096,
+ "intermediate_size": 13696,
+ "layer_norm_eps": 1.5625e-07,
+ "max_sequence_length": 131072,
+ "num_attention_heads": 32,
+ "num_hidden_layers": 40,
+ "vocab_size": 151552,
+ "segment_vocab_size": 0,
+ "num_key_value_heads": 2,
+ "skip_init": true,
+ "tie_emb_prj_weight": false,
+ "rmsnorm": true,
+ "rope_rank": "adjacent",
+ "rope_theta": 10000,
+ "position_encoding_2d": true,
+ "rope_scaling": {"type": "glm"},
+ "torch_dtype": "bfloat16",
+ "_attn_implementation": "sdpa",
+ "eos_token_id": [151329, 151336, 151338],
+ "pad_token_id": 151329,
+ "boi_token_id": 151339,
+ "eoi_token_id": 151340,
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false},
+ "eos_token_id": [151329, 151336, 151338], "max_length": 131072},
+ "vision_config": {
+ "dropout_prob": 0.0,
+ "hidden_act": "gelu",
+ "in_channels": 3,
+ "num_hidden_layers": 63,
+ "hidden_size": 1792,
+ "patch_size": 14,
+ "num_heads": 16,
+ "intermediate_size": 15360,
+ "layer_norm_eps": 1e-06,
+ "num_positions": 6401,
+ "image_size": 1120,
+ "scaling_factor": 8
+ }
+ }

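A quick consistency check of the new `vision_config`: a 1120×1120 image cut into 14×14 patches gives (1120/14)² = 80² = 6400 patch tokens, and `num_positions` is 6401, one more, presumably for a class-token slot (an assumption; the extra slot's role is not stated in the config):

```python
# Sanity check of the vision_config values in the file added above.
image_size, patch_size, num_positions = 1120, 14, 6401
patch_tokens = (image_size // patch_size) ** 2  # 80 * 80 = 6400
assert patch_tokens + 1 == num_positions  # +1 extra position, e.g. a CLS slot
```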
deepseek-ai/DeepSeek-V2-Lite-Chat/bert4torch_config.json CHANGED
@@ -52,5 +52,11 @@
  "convert_lm_logits_dtype": "float32",
  "segment_vocab_size": 0,
  "rope_rank": "updown",
- "generation_config": {"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false, "additional_special_tokens": ["User"]}, "eos_token_id": [100001, 5726]}
+ "generation_config": {
+ "tokenizer_config": {
+ "skip_special_tokens": true,
+ "add_special_tokens": false,
+ "additional_special_tokens": ["User"]
+ },
+ "eos_token_id": [100001, 5726]}
  }

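The DeepSeek-V2-Lite-Chat hunk above only re-wraps the `generation_config` JSON across lines; since JSON is whitespace-insensitive, both spellings parse to the same object, which a short check confirms:

```python
import json

# Old one-line and new multi-line spellings of the same generation_config.
old = ('{"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false, '
       '"additional_special_tokens": ["User"]}, "eos_token_id": [100001, 5726]}')
new = """{
    "tokenizer_config": {
        "skip_special_tokens": true,
        "add_special_tokens": false,
        "additional_special_tokens": ["User"]
    },
    "eos_token_id": [100001, 5726]
}"""
assert json.loads(old) == json.loads(new)  # identical after parsing
```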
deepseek-ai/deepseek-moe-16b-base/bert4torch_config.json CHANGED
@@ -22,7 +22,7 @@
  "num_hidden_layers": 28,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
- "rms_norm_eps": 1e-06,
+ "layer_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "scoring_func": "softmax",

deepseek-ai/deepseek-moe-16b-chat/bert4torch_config.json CHANGED
@@ -22,7 +22,7 @@
  "num_hidden_layers": 28,
  "num_key_value_heads": 16,
  "pretraining_tp": 1,
- "rms_norm_eps": 1e-06,
+ "layer_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000,
  "scoring_func": "softmax",