Tongjilibo
/

bert4torch_config

Model card Files Files and versions Community

Tongjilibo committed on Jan 10

Commit

6864349

1 Parent(s): e5d6f0a

去掉glm的position_encoding_2d

Browse files

Files changed (16) hide show

THUDM/chatglm-6b-int4/bert4torch_config.json +0 -1
THUDM/chatglm-6b-int8/bert4torch_config.json +0 -1
THUDM/chatglm-6b-v0.1.0/bert4torch_config.json +0 -1
THUDM/chatglm-6b/bert4torch_config.json +0 -1
THUDM/chatglm2-6b-32k/bert4torch_config.json +0 -1
THUDM/chatglm2-6b-int4/bert4torch_config.json +0 -1
THUDM/chatglm2-6b/bert4torch_config.json +0 -1
THUDM/chatglm3-6b-32k/bert4torch_config.json +0 -1
THUDM/chatglm3-6b/bert4torch_config.json +0 -1
THUDM/glm-4-9b-chat-1m/bert4torch_config.json +0 -1
THUDM/glm-4-9b-chat/bert4torch_config.json +0 -1
THUDM/glm-4-9b/bert4torch_config.json +0 -1
THUDM/glm-4v-9b/bert4torch_config.json +43 -0
deepseek-ai/DeepSeek-V2-Lite-Chat/bert4torch_config.json +7 -1
deepseek-ai/deepseek-moe-16b-base/bert4torch_config.json +1 -1
deepseek-ai/deepseek-moe-16b-chat/bert4torch_config.json +1 -1

THUDM/chatglm-6b-int4/bert4torch_config.json CHANGED Viewed

@@ -12,7 +12,6 @@
     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

THUDM/chatglm-6b-int8/bert4torch_config.json CHANGED Viewed

@@ -12,7 +12,6 @@
     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

THUDM/chatglm-6b-v0.1.0/bert4torch_config.json CHANGED Viewed

@@ -12,7 +12,6 @@
     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

THUDM/chatglm-6b/bert4torch_config.json CHANGED Viewed

@@ -12,7 +12,6 @@
     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

     "max_sequence_length": 2048,
     "num_attention_heads": 32,
     "num_hidden_layers": 28,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "vocab_size": 130528,

THUDM/chatglm2-6b-32k/bert4torch_config.json CHANGED Viewed

@@ -21,7 +21,6 @@
     "factor": 16
   },
   "torch_dtype": "float16",
-  "position_encoding_2d": true,
   "_attn_implementation": "sdpa",
   "generation_config": {"max_length": 32768}
 }

     "factor": 16
   },
   "torch_dtype": "float16",
   "_attn_implementation": "sdpa",
   "generation_config": {"max_length": 32768}
 }

THUDM/chatglm2-6b-int4/bert4torch_config.json CHANGED Viewed

@@ -16,7 +16,6 @@
     "pad_token_id": 2,
     "rmsnorm": true,
     "rope_rank": "adjacent",
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "_attn_implementation": "sdpa",

     "pad_token_id": 2,
     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "_attn_implementation": "sdpa",

THUDM/chatglm2-6b/bert4torch_config.json CHANGED Viewed

@@ -16,7 +16,6 @@
     "pad_token_id": 2,
     "rmsnorm": true,
     "rope_rank": "adjacent",
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "_attn_implementation": "sdpa",

     "pad_token_id": 2,
     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "_attn_implementation": "sdpa",

THUDM/chatglm3-6b-32k/bert4torch_config.json CHANGED Viewed

@@ -18,7 +18,6 @@
   "rmsnorm": true,
   "rope_rank": "adjacent",
   "rope_theta": 500000,
-  "position_encoding_2d": true,
   "rope_scaling": {"type": "glm"},
   "_attn_implementation": "sdpa",
   "torch_dtype": "float16",

   "rmsnorm": true,
   "rope_rank": "adjacent",
   "rope_theta": 500000,
   "rope_scaling": {"type": "glm"},
   "_attn_implementation": "sdpa",
   "torch_dtype": "float16",

THUDM/chatglm3-6b/bert4torch_config.json CHANGED Viewed

@@ -17,7 +17,6 @@
     "pad_token_id": 0,
     "rmsnorm": true,
     "rope_rank": "adjacent",
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "_attn_implementation": "sdpa",

     "pad_token_id": 0,
     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "float16",
     "_attn_implementation": "sdpa",

THUDM/glm-4-9b-chat-1m/bert4torch_config.json CHANGED Viewed

@@ -16,7 +16,6 @@
     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_theta": 100000000,
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "bfloat16",
     "_attn_implementation": "sdpa",

     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_theta": 100000000,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "bfloat16",
     "_attn_implementation": "sdpa",

THUDM/glm-4-9b-chat/bert4torch_config.json CHANGED Viewed

@@ -16,7 +16,6 @@
     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_theta": 5000000,
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "bfloat16",
     "_attn_implementation": "sdpa",

     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_theta": 5000000,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "bfloat16",
     "_attn_implementation": "sdpa",

THUDM/glm-4-9b/bert4torch_config.json CHANGED Viewed

@@ -15,7 +15,6 @@
     "tie_emb_prj_weight": false,
     "rmsnorm": true,
     "rope_rank": "adjacent",
-    "position_encoding_2d": true,
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "bfloat16",
     "_attn_implementation": "sdpa",

     "tie_emb_prj_weight": false,
     "rmsnorm": true,
     "rope_rank": "adjacent",
     "rope_scaling": {"type": "glm"},
     "torch_dtype": "bfloat16",
     "_attn_implementation": "sdpa",

THUDM/glm-4v-9b/bert4torch_config.json ADDED Viewed

@@ -0,0 +1,43 @@

+{
+	"model": "glm4v",
+    "template": "glm4v",
+    "hidden_act": "swiglu",
+    "hidden_size": 4096,
+    "intermediate_size": 13696,
+    "layer_norm_eps": 1.5625e-07,
+    "max_sequence_length": 131072,
+    "num_attention_heads": 32,
+    "num_hidden_layers": 40,
+    "vocab_size": 151552,
+    "segment_vocab_size": 0,
+    "num_key_value_heads": 2,
+    "skip_init": true,
+    "tie_emb_prj_weight": false,
+    "rmsnorm": true,
+    "rope_rank": "adjacent",
+    "rope_theta": 10000,
+    "position_encoding_2d": true,
+    "rope_scaling": {"type": "glm"},
+    "torch_dtype": "bfloat16",
+    "_attn_implementation": "sdpa",
+    "eos_token_id": [151329, 151336, 151338],
+    "pad_token_id": 151329,
+    "boi_token_id": 151339,
+    "eoi_token_id": 151340,
+    "generation_config": {"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false},
+                          "eos_token_id": [151329, 151336, 151338], "max_length": 131072},
+    "vision_config": {
+        "dropout_prob": 0.0,
+        "hidden_act": "gelu",
+        "in_channels": 3,
+        "num_hidden_layers": 63,
+        "hidden_size": 1792,
+        "patch_size": 14,
+        "num_heads": 16,
+        "intermediate_size": 15360,
+        "layer_norm_eps": 1e-06,
+        "num_positions": 6401,
+        "image_size": 1120,
+        "scaling_factor": 8
+    }
+}

deepseek-ai/DeepSeek-V2-Lite-Chat/bert4torch_config.json CHANGED Viewed

@@ -52,5 +52,11 @@
     "convert_lm_logits_dtype": "float32",
     "segment_vocab_size": 0,
     "rope_rank": "updown",
-    "generation_config": {"tokenizer_config":  {"skip_special_tokens": true, "add_special_tokens": false, "additional_special_tokens": ["User"]}, "eos_token_id": [100001, 5726]}
   }

     "convert_lm_logits_dtype": "float32",
     "segment_vocab_size": 0,
     "rope_rank": "updown",
+    "generation_config": {
+      "tokenizer_config": {
+         "skip_special_tokens": true,
+         "add_special_tokens": false,
+         "additional_special_tokens": ["User"]
+        },
+        "eos_token_id": [100001, 5726]}
   }

deepseek-ai/deepseek-moe-16b-base/bert4torch_config.json CHANGED Viewed

@@ -22,7 +22,7 @@
   "num_hidden_layers": 28,
   "num_key_value_heads": 16,
   "pretraining_tp": 1,
-  "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000,
   "scoring_func": "softmax",

   "num_hidden_layers": 28,
   "num_key_value_heads": 16,
   "pretraining_tp": 1,
+  "layer_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000,
   "scoring_func": "softmax",

deepseek-ai/deepseek-moe-16b-chat/bert4torch_config.json CHANGED Viewed

@@ -22,7 +22,7 @@
   "num_hidden_layers": 28,
   "num_key_value_heads": 16,
   "pretraining_tp": 1,
-  "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000,
   "scoring_func": "softmax",

   "num_hidden_layers": 28,
   "num_key_value_heads": 16,
   "pretraining_tp": 1,
+  "layer_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000,
   "scoring_func": "softmax",