Tongjilibo commited on
Commit
96ab741
1 Parent(s): dee8424

增加minicpmv和qwen2.5

Browse files
MiniCPM-V-2_6/bert4torch_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "minicpmv",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 18944,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 3584,
11
+ "num_attention_heads": 28,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 4,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "is_causal": true,
19
+ "vocab_size": 151666,
20
+ "segment_vocab_size": 0,
21
+ "skip_init": true,
22
+ "rope_rank": "updown",
23
+ "max_position_embeddings": 32768,
24
+ "sliding_window": 131072,
25
+ "max_window_layers": 28,
26
+ "convert_lm_logits_dtype": "float32",
27
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
28
+ "max_length": 32768},
29
+
30
+ "use_image_id": true,
31
+ "batch_vision_input": true,
32
+ "drop_vision_last_layer": false,
33
+ "vision_batch_size": 16,
34
+ "image_size": 448,
35
+ "patch_size": 14,
36
+ "query_num": 64,
37
+ "slice_config": {
38
+ "max_slice_nums": 9,
39
+ "patch_size": 14,
40
+ "model_type": "minicpmv"
41
+ },
42
+ "slice_mode": true,
43
+ "vision_config": {
44
+ "hidden_size": 1152,
45
+ "image_size": 980,
46
+ "intermediate_size": 4304,
47
+ "model_type": "siglip",
48
+ "num_attention_heads": 16,
49
+ "num_hidden_layers": 27,
50
+ "patch_size": 14
51
+ }
52
+ }
Qwen2.5-0.5B-Instruct/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 4864,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 896,
11
+ "num_attention_heads": 14,
12
+ "num_hidden_layers": 24,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 151936,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 32768,
24
+ "max_window_layers": 21,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-0.5B/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 4864,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 896,
11
+ "num_attention_heads": 14,
12
+ "num_hidden_layers": 24,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 151936,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 32768,
24
+ "max_window_layers": 21,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-1.5B-Instruct/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 8960,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 1536,
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 151936,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 32768,
24
+ "max_window_layers": 21,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-1.5B/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 8960,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 1536,
11
+ "num_attention_heads": 12,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 151936,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 32768,
24
+ "max_window_layers": 21,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-14B-Instruct/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 13824,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 5120,
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 48,
13
+ "num_key_value_heads": 8,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 152064,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 131072,
24
+ "max_window_layers": 70,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-14B/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 13824,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 5120,
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 48,
13
+ "num_key_value_heads": 8,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 152064,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 131072,
24
+ "max_window_layers": 70,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-3B-Instruct/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 11008,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 2048,
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 36,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 151936,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 32768,
24
+ "max_window_layers": 70,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-3B/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 11008,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 2048,
11
+ "num_attention_heads": 16,
12
+ "num_hidden_layers": 36,
13
+ "num_key_value_heads": 2,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": true,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 151936,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 32768,
24
+ "max_window_layers": 70,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-7B-Instruct/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 18944,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 3584,
11
+ "num_attention_heads": 28,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 4,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 152064,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 131072,
24
+ "max_window_layers": 28,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }
Qwen2.5-7B/bert4torch_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "qwen2",
3
+ "hidden_act": "silu",
4
+ "attention_probs_dropout_prob": 0.0,
5
+ "bos_token_id": 151643,
6
+ "eos_token_id": 151645,
7
+ "intermediate_size": 18944,
8
+ "initializer_range": 0.02,
9
+ "layer_norm_eps": 1e-06,
10
+ "hidden_size": 3584,
11
+ "num_attention_heads": 28,
12
+ "num_hidden_layers": 28,
13
+ "num_key_value_heads": 4,
14
+ "rope_theta": 1000000.0,
15
+ "tie_word_embeddings": false,
16
+ "torch_dtype": "bfloat16",
17
+ "_attn_implementation": "sdpa",
18
+ "vocab_size": 152064,
19
+ "segment_vocab_size": 0,
20
+ "skip_init": true,
21
+ "rope_rank": "updown",
22
+ "max_position_embeddings": 32768,
23
+ "sliding_window": 131072,
24
+ "max_window_layers": 28,
25
+ "convert_lm_logits_dtype": "float32",
26
+ "generation_config": {"tokenizer_config": {"skip_special_tokens": true}, "eos_token_id": [151643, 151645],
27
+ "max_length": 32768}
28
+ }