Tongjilibo committed
Commit 35d10b1
1 Parent(s): c394e13

Rename flash_attention to _attn_implementation; add deepseek
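The rename maps the old boolean value true to "sdpa" and keeps string values such as "flash_attn_2" unchanged. Below is a minimal migration sketch (plain Python, stdlib json only); the file path is just an example and not part of this commit:

    import json

    # Sketch: rename the old "flash_attention" key to "_attn_implementation"
    # in a bert4torch_config.json. Boolean true maps to "sdpa"; string values
    # ("flash_attn_2", "sdpa") are kept as-is.
    path = "Qwen-7B-Chat/bert4torch_config.json"  # example path
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)

    old = config.pop("flash_attention", None)
    if old is not None:
        config["_attn_implementation"] = "sdpa" if old is True else old

    with open(path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=2)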
Qwen-14B-Chat/bert4torch_config.json CHANGED
@@ -11,7 +11,7 @@
  "scale_attn_weights": true,
  "seq_length": 2048,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 152064,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-14B/bert4torch_config.json CHANGED
@@ -11,7 +11,7 @@
  "scale_attn_weights": true,
  "seq_length": 2048,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 152064,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-1_8B-Chat/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-1_8B/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-7B-Chat/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-7B/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen1.5-0.5B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-0.5B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-1.8B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-1.8B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-14B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
Qwen1.5-14B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
Qwen1.5-7B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-7B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-0.5B-Instruct/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-0.5B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-1.5B-Instruct/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-1.5B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-7B-Instruct/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
Qwen2-7B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
chatglm2-6b-32k/bert4torch_config.json CHANGED
@@ -21,6 +21,6 @@
  "factor": 16
  },
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"max_length": 32768}
  }
chatglm2-6b-int4/bert4torch_config.json CHANGED
@@ -17,7 +17,7 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "quantization_bit": 4,
  "quantization_method": "cpm_kernels",
  "target_modules": ["q", "k", "v", "o", "intermediateDense", "outputDense"],
chatglm2-6b/bert4torch_config.json CHANGED
@@ -17,6 +17,6 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"max_length": 32768}
  }
chatglm3-6b-32k/bert4torch_config.json CHANGED
@@ -19,7 +19,7 @@
  "rope_rank": "adjacent",
  "rope_theta": 500000,
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"tokenizer_config": {"additional_special_tokens": ["<|user|>", "<|observation|>"],
  "skip_special_tokens": true}, "eos_token_id": [2, 64795, 64797], "max_length": 32768}
  }
chatglm3-6b/bert4torch_config.json CHANGED
@@ -18,7 +18,7 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"tokenizer_config": {"additional_special_tokens": ["<|user|>", "<|observation|>"],
  "skip_special_tokens": true}, "eos_token_id": [2, 64795, 64797], "max_length": 8192}
  }
deepseek-coder-1.3b-instruct/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "llama",
+ "template": "apply_chat_template",
+ "bos_token_id": 32013,
+ "eos_token_id": 32021,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 5504,
+ "max_position_embeddings": 16384,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 16,
+ "layer_norm_eps": 1e-06,
+ "rope_scaling": {
+ "factor": 4.0,
+ "type": "linear"
+ },
+ "rope_theta": 100000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 32256,
+ "rope_rank": "updown",
+ "_attn_implementation": "sdpa",
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false}, "eos_token_id": [32021]}
+ }
+
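A minimal loading sketch for the new deepseek-coder config, assuming bert4torch's build_transformer_model API; both paths below are placeholders, not part of this commit:

    from bert4torch.models import build_transformer_model

    # Sketch only: load the newly added deepseek-coder-1.3b-instruct config.
    # config_path points at the bert4torch_config.json from this commit;
    # checkpoint_path is a placeholder for the locally downloaded weights.
    model = build_transformer_model(
        config_path="deepseek-coder-1.3b-instruct/bert4torch_config.json",
        checkpoint_path="/path/to/deepseek-coder-1.3b-instruct/pytorch_model.bin",
    )
    model.eval()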
falcon-7b-instruct/bert4torch_config.json CHANGED
@@ -23,5 +23,5 @@
  "vocab_size": 65024,
  "skip_init": true,
  "norm_mode": "torch_buildin",
- "flash_attention": "sdpa"
+ "_attn_implementation": "sdpa"
  }
falcon-7b/bert4torch_config.json CHANGED
@@ -23,5 +23,5 @@
  "vocab_size": 65024,
  "skip_init": true,
  "norm_mode": "torch_buildin",
- "flash_attention": "sdpa"
+ "_attn_implementation": "sdpa"
  }
glm-4-9b-chat-1m/bert4torch_config.json CHANGED
@@ -17,7 +17,7 @@
  "rope_rank": "adjacent",
  "rope_theta": 100000000,
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "eos_token_id": [151329, 151336, 151338],
  "pad_token_id": 151329,
  "generation_config": {"tokenizer_config": {"skip_special_tokens": true},
glm-4-9b-chat/bert4torch_config.json CHANGED
@@ -17,7 +17,7 @@
  "rope_rank": "adjacent",
  "rope_theta": 5000000,
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "eos_token_id": [151329, 151336, 151338],
  "pad_token_id": 151329,
  "generation_config": {"tokenizer_config": {"skip_special_tokens": true},
glm-4-9b/bert4torch_config.json CHANGED
@@ -16,7 +16,7 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "eos_token_id": [151329, 151336, 151338],
  "pad_token_id": 151329,
  "generation_config": {"tokenizer_config": {"skip_special_tokens": true},