Raincleared
committed on
Commit
•
efca936
1
Parent(s):
21b27c5
Update configuration_minicpm.py
Browse files- configuration_minicpm.py +6 -2
configuration_minicpm.py
CHANGED
@@ -58,8 +58,10 @@ class MiniCPMConfig(PretrainedConfig):
|
|
58 |
by mean-pooling all the original heads within that group. For more details, check out [this
|
59 |
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
60 |
`num_attention_heads`.
|
61 |
-
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
62 |
The non-linear activation function (function or string) in the decoder.
|
|
|
|
|
63 |
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
64 |
The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
|
65 |
MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
|
@@ -122,7 +124,8 @@ class MiniCPMConfig(PretrainedConfig):
|
|
122 |
num_hidden_layers=32,
|
123 |
num_attention_heads=32,
|
124 |
num_key_value_heads=None,
|
125 |
-
hidden_act="
|
|
|
126 |
max_position_embeddings=2048,
|
127 |
initializer_range=0.02,
|
128 |
rms_norm_eps=1e-6,
|
@@ -154,6 +157,7 @@ class MiniCPMConfig(PretrainedConfig):
|
|
154 |
|
155 |
self.num_key_value_heads = num_key_value_heads
|
156 |
self.hidden_act = hidden_act
|
|
|
157 |
self.initializer_range = initializer_range
|
158 |
self.rms_norm_eps = rms_norm_eps
|
159 |
self.pretraining_tp = pretraining_tp
|
|
|
58 |
by mean-pooling all the original heads within that group. For more details, check out [this
|
59 |
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
|
60 |
`num_attention_heads`.
|
61 |
+
hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
|
62 |
The non-linear activation function (function or string) in the decoder.
|
63 |
+
hidden_act_param (`float`, *optional*, defaults to 0.):
|
64 |
+
The bias for `shiftrelu` or the threshold for `fatrelu`.
|
65 |
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
66 |
The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens,
|
67 |
MiniCPM 2 up to 4096, CodeMiniCPM up to 16384.
|
|
|
124 |
num_hidden_layers=32,
|
125 |
num_attention_heads=32,
|
126 |
num_key_value_heads=None,
|
127 |
+
hidden_act="relu",
|
128 |
+
hidden_act_param=0.,
|
129 |
max_position_embeddings=2048,
|
130 |
initializer_range=0.02,
|
131 |
rms_norm_eps=1e-6,
|
|
|
157 |
|
158 |
self.num_key_value_heads = num_key_value_heads
|
159 |
self.hidden_act = hidden_act
|
160 |
+
self.hidden_act_param = hidden_act_param
|
161 |
self.initializer_range = initializer_range
|
162 |
self.rms_norm_eps = rms_norm_eps
|
163 |
self.pretraining_tp = pretraining_tp
|