shunxing1234 committed on
Commit d334358
1 Parent(s): 3d19935

Update configuration_aquila.py

Files changed (1)
  1. configuration_aquila.py +15 -0
configuration_aquila.py CHANGED
@@ -83,6 +83,7 @@ class AquilaConfig(PretrainedConfig):
         intermediate_size=11008,
         num_hidden_layers=32,
         num_attention_heads=32,
+        num_key_value_heads=None,
         hidden_act="silu",
         max_position_embeddings=2048,
         initializer_range=0.02,
@@ -91,7 +92,10 @@ class AquilaConfig(PretrainedConfig):
         pad_token_id=0,
         bos_token_id=1,
         eos_token_id=2,
+        pretraining_tp=1,
         tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -99,11 +103,22 @@ class AquilaConfig(PretrainedConfig):
         self.hidden_size = hidden_size
         self.intermediate_size = intermediate_size
         self.num_hidden_layers = num_hidden_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+
         self.num_attention_heads = num_attention_heads
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
         self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
         super().__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,