Tongjilibo committed
Commit 35d10b1
1 Parent(s): c394e13

Rename flash_attention to _attn_implementation; add deepseek
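The rename maps the old boolean value true to "sdpa" and keeps string values such as "flash_attn_2" unchanged. Below is a minimal migration sketch (plain Python, stdlib json only); the file path is just an example and not part of this commit:

    import json

    # Sketch: rename the old "flash_attention" key to "_attn_implementation"
    # in a bert4torch_config.json. Boolean true maps to "sdpa"; string values
    # ("flash_attn_2", "sdpa") are kept as-is.
    path = "Qwen-7B-Chat/bert4torch_config.json"  # example path
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)

    old = config.pop("flash_attention", None)
    if old is not None:
        config["_attn_implementation"] = "sdpa" if old is True else old

    with open(path, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=2)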
Qwen-14B-Chat/bert4torch_config.json CHANGED
@@ -11,7 +11,7 @@
  "scale_attn_weights": true,
  "seq_length": 2048,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 152064,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-14B/bert4torch_config.json CHANGED
@@ -11,7 +11,7 @@
  "scale_attn_weights": true,
  "seq_length": 2048,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 152064,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-1_8B-Chat/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-1_8B/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-7B-Chat/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen-7B/bert4torch_config.json CHANGED
@@ -12,7 +12,7 @@
  "rope_theta": 10000,
  "scale_attn_weights": true,
  "tie_word_embeddings": false,
- "flash_attention": "flash_attn_2",
+ "_attn_implementation": "flash_attn_2",
  "vocab_size": 151936,
  "rope_scaling": {"type": "dynamic_qwen"},
  "use_logn_attn": true,
Qwen1.5-0.5B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-0.5B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-1.8B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-1.8B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-14B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
Qwen1.5-14B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
Qwen1.5-7B-Chat/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen1.5-7B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-0.5B-Instruct/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-0.5B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-1.5B-Instruct/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-1.5B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 151936,
  "segment_vocab_size": 0,
Qwen2-7B-Instruct/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
Qwen2-7B/bert4torch_config.json CHANGED
@@ -14,7 +14,7 @@
  "rope_theta": 1000000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "is_causal": true,
  "vocab_size": 152064,
  "segment_vocab_size": 0,
chatglm2-6b-32k/bert4torch_config.json CHANGED
@@ -21,6 +21,6 @@
  "factor": 16
  },
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"max_length": 32768}
  }
chatglm2-6b-int4/bert4torch_config.json CHANGED
@@ -17,7 +17,7 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "quantization_bit": 4,
  "quantization_method": "cpm_kernels",
  "target_modules": ["q", "k", "v", "o", "intermediateDense", "outputDense"],
chatglm2-6b/bert4torch_config.json CHANGED
@@ -17,6 +17,6 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"max_length": 32768}
  }
chatglm3-6b-32k/bert4torch_config.json CHANGED
@@ -19,7 +19,7 @@
  "rope_rank": "adjacent",
  "rope_theta": 500000,
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"tokenizer_config": {"additional_special_tokens": ["<|user|>", "<|observation|>"],
  "skip_special_tokens": true}, "eos_token_id": [2, 64795, 64797], "max_length": 32768}
  }
chatglm3-6b/bert4torch_config.json CHANGED
@@ -18,7 +18,7 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "generation_config": {"tokenizer_config": {"additional_special_tokens": ["<|user|>", "<|observation|>"],
  "skip_special_tokens": true}, "eos_token_id": [2, 64795, 64797], "max_length": 8192}
  }
deepseek-coder-1.3b-instruct/bert4torch_config.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "model": "llama",
+ "template": "apply_chat_template",
+ "bos_token_id": 32013,
+ "eos_token_id": 32021,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "intermediate_size": 5504,
+ "max_position_embeddings": 16384,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 16,
+ "layer_norm_eps": 1e-06,
+ "rope_scaling": {
+ "factor": 4.0,
+ "type": "linear"
+ },
+ "rope_theta": 100000,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "vocab_size": 32256,
+ "rope_rank": "updown",
+ "_attn_implementation": "sdpa",
+ "segment_vocab_size": 0,
+ "skip_init": true,
+ "convert_lm_logits_dtype": "float32",
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true, "add_special_tokens": false}, "eos_token_id": [32021]}
+ }
+
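A minimal loading sketch for the new deepseek-coder config, assuming bert4torch's build_transformer_model API; both paths below are placeholders, not part of this commit:

    from bert4torch.models import build_transformer_model

    # Sketch only: load the newly added deepseek-coder-1.3b-instruct config.
    # config_path points at the bert4torch_config.json from this commit;
    # checkpoint_path is a placeholder for the locally downloaded weights.
    model = build_transformer_model(
        config_path="deepseek-coder-1.3b-instruct/bert4torch_config.json",
        checkpoint_path="/path/to/deepseek-coder-1.3b-instruct/pytorch_model.bin",
    )
    model.eval()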
falcon-7b-instruct/bert4torch_config.json CHANGED
@@ -23,5 +23,5 @@
  "vocab_size": 65024,
  "skip_init": true,
  "norm_mode": "torch_buildin",
- "flash_attention": "sdpa"
+ "_attn_implementation": "sdpa"
  }
falcon-7b/bert4torch_config.json CHANGED
@@ -23,5 +23,5 @@
  "vocab_size": 65024,
  "skip_init": true,
  "norm_mode": "torch_buildin",
- "flash_attention": "sdpa"
+ "_attn_implementation": "sdpa"
  }
glm-4-9b-chat-1m/bert4torch_config.json CHANGED
@@ -17,7 +17,7 @@
  "rope_rank": "adjacent",
  "rope_theta": 100000000,
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "eos_token_id": [151329, 151336, 151338],
  "pad_token_id": 151329,
  "generation_config": {"tokenizer_config": {"skip_special_tokens": true},
glm-4-9b-chat/bert4torch_config.json CHANGED
@@ -17,7 +17,7 @@
  "rope_rank": "adjacent",
  "rope_theta": 5000000,
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "eos_token_id": [151329, 151336, 151338],
  "pad_token_id": 151329,
  "generation_config": {"tokenizer_config": {"skip_special_tokens": true},
glm-4-9b/bert4torch_config.json CHANGED
@@ -16,7 +16,7 @@
  "rmsnorm": true,
  "rope_rank": "adjacent",
  "position_encoding_2d": true,
- "flash_attention": true,
+ "_attn_implementation": "sdpa",
  "eos_token_id": [151329, 151336, 151338],
  "pad_token_id": 151329,
  "generation_config": {"tokenizer_config": {"skip_special_tokens": true},