Raincleared committed on
Commit
efca936
1 Parent(s): 21b27c5

Update configuration_minicpm.py

Browse files
Files changed (1) hide show
  1. configuration_minicpm.py +6 -2
configuration_minicpm.py CHANGED
@@ -58,8 +58,10 @@ class MiniCPMConfig(PretrainedConfig):
58
  by meanpooling all the original heads within that group. For more details checkout [this
59
  paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
60
  `num_attention_heads`.
61
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
62
  The non-linear activation function (function or string) in the decoder.
 
 
63
  max_position_embeddings (`int`, *optional*, defaults to 2048):
64
  The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
65
  MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
@@ -122,7 +124,8 @@ class MiniCPMConfig(PretrainedConfig):
122
  num_hidden_layers=32,
123
  num_attention_heads=32,
124
  num_key_value_heads=None,
125
- hidden_act="silu",
 
126
  max_position_embeddings=2048,
127
  initializer_range=0.02,
128
  rms_norm_eps=1e-6,
@@ -154,6 +157,7 @@ class MiniCPMConfig(PretrainedConfig):
154
 
155
  self.num_key_value_heads = num_key_value_heads
156
  self.hidden_act = hidden_act
 
157
  self.initializer_range = initializer_range
158
  self.rms_norm_eps = rms_norm_eps
159
  self.pretraining_tp = pretraining_tp
 
58
  by meanpooling all the original heads within that group. For more details checkout [this
59
  paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
60
  `num_attention_heads`.
61
+ hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
62
  The non-linear activation function (function or string) in the decoder.
63
+ hidden_act_param (`float`, *optional*, defaults to 0.):
64
+ The bias for shiftrelu or threshold for fatrelu.
65
  max_position_embeddings (`int`, *optional*, defaults to 2048):
66
  The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
67
  MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
 
124
  num_hidden_layers=32,
125
  num_attention_heads=32,
126
  num_key_value_heads=None,
127
+ hidden_act="relu",
128
+ hidden_act_param=0.,
129
  max_position_embeddings=2048,
130
  initializer_range=0.02,
131
  rms_norm_eps=1e-6,
 
157
 
158
  self.num_key_value_heads = num_key_value_heads
159
  self.hidden_act = hidden_act
160
+ self.hidden_act_param = hidden_act_param
161
  self.initializer_range = initializer_range
162
  self.rms_norm_eps = rms_norm_eps
163
  self.pretraining_tp = pretraining_tp