liuxz0801 committed on
Commit
0b4ef71
·
1 Parent(s): a9469e8

更新配置文件

Browse files
Files changed (2) hide show
  1. config.json +4 -6
  2. modeling_telechat.py +1 -2
config.json CHANGED
@@ -4,7 +4,7 @@
4
  "architectures": [
5
  "TelechatForCausalLM"
6
  ],
7
- "attention_dropout": 0.0,
8
  "attention_softmax_in_fp32": true,
9
  "auto_map": {
10
  "AutoConfig": "configuration_telechat.TelechatConfig",
@@ -16,25 +16,23 @@
16
  "eos_token_id": 2,
17
  "ffn_hidden_size": 12288,
18
  "flash_attn": true,
19
- "hidden_dropout": 0.0,
20
  "hidden_size": 4096,
21
  "initializer_range": 0.02,
22
  "layer_norm_epsilon": 1e-05,
23
  "logn": false,
 
24
  "masked_softmax_fusion": true,
25
  "model_type": "telechat",
26
  "n_head": 32,
27
  "n_inner": null,
28
  "n_layer": 30,
29
- "offset_alibi": 100,
30
  "pad_token_id": 3,
31
- "pretraining_tp": 2,
32
- "seq_length": 8192,
33
  "skip_bias_add": true,
34
  "skip_bias_add_qkv": false,
35
  "slow_but_exact": false,
36
  "torch_dtype": "float16",
37
- "training_seqlen": 4096,
38
  "transformers_version": "4.30.0",
39
  "unk_token_id": 0,
40
  "use_cache": true,
 
4
  "architectures": [
5
  "TelechatForCausalLM"
6
  ],
7
+ "attention_dropout": 0.1,
8
  "attention_softmax_in_fp32": true,
9
  "auto_map": {
10
  "AutoConfig": "configuration_telechat.TelechatConfig",
 
16
  "eos_token_id": 2,
17
  "ffn_hidden_size": 12288,
18
  "flash_attn": true,
19
+ "hidden_dropout": 0.1,
20
  "hidden_size": 4096,
21
  "initializer_range": 0.02,
22
  "layer_norm_epsilon": 1e-05,
23
  "logn": false,
24
+ "seq_length": 8192,
25
  "masked_softmax_fusion": true,
26
  "model_type": "telechat",
27
  "n_head": 32,
28
  "n_inner": null,
29
  "n_layer": 30,
 
30
  "pad_token_id": 3,
 
 
31
  "skip_bias_add": true,
32
  "skip_bias_add_qkv": false,
33
  "slow_but_exact": false,
34
  "torch_dtype": "float16",
35
+ "training_seqlen": 8192,
36
  "transformers_version": "4.30.0",
37
  "unk_token_id": 0,
38
  "use_cache": true,
modeling_telechat.py CHANGED
@@ -105,8 +105,7 @@ class RotaryEmbedding(torch.nn.Module):
105
  return ntk_alpha
106
 
107
  def forward(self, x, seq_dim=0, seq_len=None):
108
- if seq_len is None:
109
- seq_len = x.shape[seq_dim]
110
  seq_len = max(seq_len, self.config.training_seqlen)
111
  ntk_alpha = self.get_ntk_alpha(seq_len)
112
  self.mscale = float(self.get_mscale(seq_len / self.config.training_seqlen))
 
105
  return ntk_alpha
106
 
107
  def forward(self, x, seq_dim=0, seq_len=None):
108
+ seq_len = x.shape[seq_dim]
 
109
  seq_len = max(seq_len, self.config.training_seqlen)
110
  ntk_alpha = self.get_ntk_alpha(seq_len)
111
  self.mscale = float(self.get_mscale(seq_len / self.config.training_seqlen))