gugarosa committed on
Commit 48553e4
1 Parent(s): e0e061e

Update configuration_phi3.py

Files changed (1)
  1. configuration_phi3.py +14 -7
configuration_phi3.py CHANGED
@@ -16,8 +16,9 @@
 """ Phi-3 model configuration"""
 
 
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
 
 logger = logging.get_logger(__name__)
 
@@ -72,8 +73,8 @@ class Phi3Config(PretrainedConfig):
             original RoPE embeddings when using long scaling.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
-            The epsilon used by the rms normalization layers.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon value used for the RMSNorm.
         use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not.
@@ -81,11 +82,17 @@ class Phi3Config(PretrainedConfig):
             Whether to tie weight embeddings
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
-        rope_scaling (`dict`, *optional*, defaults to `None`):
+        rope_scaling (`dict`, *optional*):
             The scaling factor for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
             contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
             the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
             divided by the number of attention heads divided by 2.
+        eos_token_id (`int`, *optional*, defaults to 32000):
+            The id of the "end-of-sequence" token.
+        pad_token_id (`int`, *optional*, defaults to 32000):
+            The id of the padding token.
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If `None`, no sliding window is applied.
 
     Example:
 
@@ -178,7 +185,7 @@ class Phi3Config(PretrainedConfig):
 
         short_factor = self.rope_scaling["short_factor"]
         assert isinstance(short_factor, list) and all(
-            [isinstance(x, (int, float)) for x in short_factor]
+            isinstance(x, (int, float)) for x in short_factor
         ), f"RoPE scaling factor must be a list of numbers, got {short_factor}."
         assert (
             len(short_factor) == self.hidden_size // self.num_attention_heads // 2
@@ -186,7 +193,7 @@ class Phi3Config(PretrainedConfig):
 
         long_factor = self.rope_scaling["long_factor"]
         assert isinstance(long_factor, list) and all(
-            [isinstance(x, (int, float)) for x in long_factor]
+            isinstance(x, (int, float)) for x in long_factor
         ), f"RoPE scaling factor must be a list of numbers, got {long_factor}."
         assert (
             len(long_factor) == self.hidden_size // self.num_attention_heads // 2
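
For context on the renamed `rms_norm_eps` parameter: it is the epsilon added inside the RMS normalization denominator. A minimal, generic RMSNorm sketch (not this file's or the model's exact implementation) showing where the configured value enters:

import torch

def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Generic RMSNorm: scale each feature vector by the reciprocal of its
    # root-mean-square; eps (the configured rms_norm_eps) keeps the
    # denominator away from zero.
    variance = x.pow(2).mean(-1, keepdim=True)
    return weight * x * torch.rsqrt(variance + eps)

# Illustrative usage with made-up sizes:
hidden_states = torch.randn(2, 4, 16)
normed = rms_norm(hidden_states, weight=torch.ones(16), eps=1e-5)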
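The `rope_scaling` docstring and the assertions in the diff encode the same constraint: `short_factor` and `long_factor` must be lists of numbers whose length is `hidden_size // num_attention_heads // 2`. A hedged sketch of a dict that satisfies it, using illustrative sizes rather than the model's real dimensions:

# Illustrative sizes only (hypothetical, not the model's real defaults):
hidden_size = 64
num_attention_heads = 4
factor_len = hidden_size // num_attention_heads // 2  # 64 // 4 // 2 == 8

rope_scaling = {
    "type": "longrope",
    "short_factor": [1.0] * factor_len,
    "long_factor": [1.25] * factor_len,
}

# Mirrors the checks the configuration's validation performs on each list:
for key in ("short_factor", "long_factor"):
    factor = rope_scaling[key]
    assert isinstance(factor, list) and all(isinstance(x, (int, float)) for x in factor)
    assert len(factor) == hidden_size // num_attention_heads // 2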
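On the assertion change itself: passing a generator expression to `all()` gives the same result as the old list comprehension, but it is consumed lazily and can short-circuit without first building a temporary list. A quick standalone check:

values = [1.0, 2, 3.5]

# Old form: builds a temporary list before all() sees it.
old_style = all([isinstance(x, (int, float)) for x in values])
# New form: evaluated lazily, so all() can stop at the first non-numeric element.
new_style = all(isinstance(x, (int, float)) for x in values)

assert old_style == new_style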