chore(root): Updates source files with RC versions.
- config.json +1 -1
- configuration_phi3.py +38 -29
- modeling_phi3.py +59 -80
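These three files ship with the checkpoint as remote code, so the simplest way to exercise the updated sources end to end is to load the repository through `transformers` with `trust_remote_code=True`. The sketch below is illustrative only: the repo id is taken from `_CHECKPOINT_FOR_DOC` in `modeling_phi3.py` and may differ from the repository this commit actually lives in, and the printed values depend on which Phi-3 variant is hosted there.

```python
# Minimal sketch, assuming the repo id below matches the repository carrying these files.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-3-mini-4k-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# After this commit, rope_scaling is either None or a dict whose "type" is "su" or "yarn".
print(model.config.rope_scaling)
print(model.config.sliding_window)
```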
config.json
CHANGED
@@ -125,7 +125,7 @@
       2.849999999999998,
       2.9499999999999975
     ],
-    "type": "
+    "type": "su"
   },
   "rope_theta": 10000.0,
   "sliding_window": 262144,
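The only change to `config.json` is the `type` field of the `rope_scaling` block, which the updated `configuration_phi3.py` below now validates at construction time. A quick way to inspect the block is sketched here; it assumes a locally downloaded copy of this `config.json` in the working directory.

```python
# Sketch: inspect the rope_scaling block of a locally downloaded config.json.
import json

with open("config.json") as f:
    cfg = json.load(f)

rope_scaling = cfg["rope_scaling"]
print(rope_scaling["type"])               # "su" after this commit
print(len(rope_scaling["short_factor"]))  # hidden_size // num_attention_heads // 2 entries
print(len(rope_scaling["long_factor"]))   # same length as short_factor
```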
configuration_phi3.py
CHANGED
@@ -83,8 +83,8 @@ class Phi3Config(PretrainedConfig):
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
         rope_scaling (`dict`, *optional*):
-            The scaling
-            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
             the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
             divided by the number of attention heads divided by 2.
         eos_token_id (`int`, *optional*, defaults to 32000):
@@ -158,6 +158,7 @@ class Phi3Config(PretrainedConfig):
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
         self.sliding_window = sliding_window

         super().__init__(
@@ -168,33 +169,41 @@ class Phi3Config(PretrainedConfig):
         )

     def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
         if self.rope_scaling is None:
             return

-        # (previous validation body, truncated in the source)
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )
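Because `_rope_scaling_validation()` is now called from `__init__`, a malformed `rope_scaling` dict fails as soon as the config is built rather than later inside the model. The sketch below shows the intended behaviour; it assumes `configuration_phi3.py` is importable from a local copy of the repo, and it uses the Phi-3-mini shape values (hidden_size 3072, 32 attention heads, so each factor list needs 3072 // 32 // 2 = 48 entries), which are assumptions rather than values shown in this diff.

```python
# Sketch of the new construction-time validation, under the assumptions stated above.
from configuration_phi3 import Phi3Config

n = 3072 // 32 // 2  # 48 entries per factor list for the assumed Phi-3-mini shape

config = Phi3Config(
    hidden_size=3072,
    num_attention_heads=32,
    rope_scaling={"type": "su", "short_factor": [1.0] * n, "long_factor": [1.0] * n},
)  # passes: type is allowed and both lists have the expected length

try:
    Phi3Config(
        hidden_size=3072,
        num_attention_heads=32,
        rope_scaling={"type": "linear", "short_factor": [1.0] * n, "long_factor": [1.0] * n},
    )
except ValueError as err:
    print(err)  # type field must be one of ['su', 'yarn']
```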
modeling_phi3.py
CHANGED
@@ -54,26 +54,17 @@ logger = logging.get_logger(__name__)
 _flash_supports_window_size = False
 try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
-
-    if not _flash_supports_window_size:
-        raise ValueError("Please update flash-attention to support window size.")
-
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-    from flash_attn.ops.activations import swiglu
-    from flash_attn.ops.rms_norm import RMSNorm as Phi3FlashRMSNorm
-    # else:
 except ImportError as error:
     logger.warning(
-        f"
+        f"`flash-attention` package not found, consider installing for better performance: {error}."
     )
 if not _flash_supports_window_size:
     logger.warning(
-        "
+        "Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`."
     )
-    swiglu = None
-    Phi3FlashRMSNorm = None

 _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
 _CONFIG_FOR_DOC = "Phi3Config"
@@ -103,9 +94,6 @@ class Phi3RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)


-PHI3_NORM_CLASS = Phi3RMSNorm if Phi3FlashRMSNorm is None else Phi3FlashRMSNorm
-
-
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -156,43 +144,27 @@ class Phi3RotaryEmbedding(nn.Module):
         )


-class Phi3LongScaledRotaryEmbedding(nn.Module):
+class _Phi3ScaledRotaryEmbedding(nn.Module):
     def __init__(
         self,
         dim,
         short_factor,
         long_factor,
-        max_position_embeddings=
-        original_max_position_embeddings=
+        max_position_embeddings=2048,
+        original_max_position_embeddings=2048,
         base=10000,
-        magnitude_scaling_policy="su",
     ):
         super().__init__()

         self.dim = dim
+        self.short_factor = short_factor
+        self.long_factor = long_factor
         self.max_position_embeddings = max_position_embeddings
         self.original_max_position_embeddings = original_max_position_embeddings
         self.base = base

-        if magnitude_scaling_policy == "su":
-            self._calc_mscale = self._calc_mscale_su
-        elif magnitude_scaling_policy == "yarn":
-            self._calc_mscale = self._calc_mscale_yarn
-        else:
-            self._calc_mscale = lambda scale: float(scale)
-
-        self.short_factor = short_factor
-        self.long_factor = long_factor
-
-    def _calc_mscale_su(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
-
-    def _calc_mscale_yarn(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return 0.1 * math.log(scale) + 1.0
+    def _calc_mscale(self, scale):
+        raise NotImplementedError("`_calc_mscale` should be implemented in subclasses")

     @torch.no_grad()
     def forward(self, x, seq_len=None):
@@ -206,9 +178,6 @@ class Phi3LongScaledRotaryEmbedding(nn.Module):
         else:
             t = torch.arange(self.original_max_position_embeddings, device=x.device, dtype=torch.float32)
             rescale_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-            assert rescale_factors.shape == (
-                self.dim // 2,
-            ), f"misaligned shape for LongRoPE rescale factors: {rescale_factors.shape}"

         inv_freq = 1.0 / (
             rescale_factors * (self.base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
@@ -221,6 +190,20 @@ class Phi3LongScaledRotaryEmbedding(nn.Module):
         return (emb.cos() * mscale).to(x.dtype), (emb.sin() * mscale).to(x.dtype)


+class Phi3SuScaledRotaryEmbedding(_Phi3ScaledRotaryEmbedding):
+    def _calc_mscale(self, scale):
+        if scale <= 1.0:
+            return 1.0
+        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+
+
+class Phi3YarnScaledRotaryEmbedding(_Phi3ScaledRotaryEmbedding):
+    def _calc_mscale(self, scale):
+        if scale <= 1.0:
+            return 1.0
+        return 0.1 * math.log(scale) + 1.0
+
+
 # Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
@@ -253,24 +236,12 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
     # Need fp32 here to match logits
-    q_embed = (q.to(dtype=torch.float32) * cos.to(dtype=torch.float32)) + (
-        rotate_half(q).to(dtype=torch.float32) * sin.to(dtype=torch.float32)
-    )
-    k_embed = (k.to(dtype=torch.float32) * cos.to(dtype=torch.float32)) + (
-        rotate_half(k).to(dtype=torch.float32) * sin.to(dtype=torch.float32)
-    )
+    q_embed = (q.float() * cos.float()) + (rotate_half(q).float() * sin.float())
+    k_embed = (k.float() * cos.float()) + (rotate_half(k).float() * sin.float())
     return q_embed.to(q.dtype), k_embed.to(k.dtype)


 class Phi3MLP(nn.Module):
-    """Gated Linear Unit.
-
-    Reference:
-        Language Modeling with Gated Convolutional Networks.
-        https://arxiv.org/pdf/1612.08083v3.pdf.
-
-    """
-
     def __init__(self, config):
         super().__init__()

@@ -283,13 +254,8 @@ class Phi3MLP(nn.Module):
     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
         y = self.gate_up_proj(hidden_states)

-        # (removed condition, truncated in the source)
-            gate, y = y.chunk(2, dim=-1)
-            y = swiglu(gate, y)
-        else:
-            gate, y = y.chunk(2, dim=-1)
-            y = y * self.activation_fn(gate)
+        gate, y = y.chunk(2, dim=-1)
+        y = y * self.activation_fn(gate)

         return self.down_proj(y)

@@ -330,7 +296,6 @@ class Phi3Attention(nn.Module):
         self.max_position_embeddings = config.max_position_embeddings
         self.original_max_position_embeddings = config.original_max_position_embeddings
         self.rope_theta = config.rope_theta
-        self.rope_scaling = config.rope_scaling
         self.is_causal = True

         if (self.head_dim * self.num_heads) != self.hidden_size:
@@ -341,24 +306,38 @@ class Phi3Attention(nn.Module):

         op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-
         self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
+        self._init_rope()

-        if self.rope_scaling is None:
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
             self.rotary_emb = Phi3RotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
             )
         else:
-            # (previous long-scaled rotary construction, truncated in the source)
+            scaling_type = self.config.rope_scaling["type"]
+            if scaling_type == "su":
+                self.rotary_emb = Phi3SuScaledRotaryEmbedding(
+                    self.head_dim,
+                    self.config.rope_scaling["short_factor"],
+                    self.config.rope_scaling["long_factor"],
+                    max_position_embeddings=self.config.max_position_embeddings,
+                    original_max_position_embeddings=self.config.original_max_position_embeddings,
+                    base=self.config.rope_theta,
+                )
+            elif scaling_type == "yarn":
+                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(
+                    self.head_dim,
+                    self.config.rope_scaling["short_factor"],
+                    self.config.rope_scaling["long_factor"],
+                    max_position_embeddings=self.config.max_position_embeddings,
+                    original_max_position_embeddings=self.config.original_max_position_embeddings,
+                    base=self.config.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -859,11 +838,11 @@ class Phi3DecoderLayer(nn.Module):
         self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)

         self.mlp = Phi3MLP(config)
-        self.input_layernorm =
+        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm =
+        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def forward(
         self,
@@ -1066,9 +1045,8 @@ class Phi3Model(Phi3PreTrainedModel):
         self.layers = nn.ModuleList(
             [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
-        self.norm = PHI3_NORM_CLASS(config.hidden_size, eps=config.rms_norm_eps)
-
         self._attn_implementation = config._attn_implementation
+        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1255,6 +1233,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
     def get_decoder(self):
         return self.model

+    # Ignore copy
     @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
@@ -1284,8 +1263,8 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         ```python
         >>> from transformers import AutoTokenizer, Phi3ForCausalLM

-        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3")
-        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3")
+        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")

         >>> prompt = "This is an example script ."
         >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -1293,7 +1272,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        'This is an example script .\n
+        'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
         ```"""
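The modeling change replaces the single long-scaled rotary embedding with a shared `_Phi3ScaledRotaryEmbedding` base class plus `Phi3SuScaledRotaryEmbedding` and `Phi3YarnScaledRotaryEmbedding` subclasses, which differ only in how they compute the attention magnitude scale, and `Phi3Attention._init_rope` now selects one from `config.rope_scaling["type"]`. A standalone sketch of the two `_calc_mscale` formulas is shown below; the 4096 and 131072 context lengths are illustrative, not read from any shipped config.

```python
# Standalone sketch of the "su" and "yarn" magnitude-scale formulas added in this commit.
import math

def mscale_su(scale, original_max_position_embeddings):
    if scale <= 1.0:
        return 1.0
    return math.sqrt(1 + math.log(scale) / math.log(original_max_position_embeddings))

def mscale_yarn(scale):
    if scale <= 1.0:
        return 1.0
    return 0.1 * math.log(scale) + 1.0

original, extended = 4096, 131072          # illustrative original vs. extended context
scale = extended / original                # 32x extension
print(round(mscale_su(scale, original), 4))  # ~1.1902
print(round(mscale_yarn(scale), 4))          # ~1.3466
```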