gugarosa committed
Commit: b657a9a
1 Parent(s): 49a5cb3

chore(root): Updates source files with RC versions.

Files changed (3):
  1. config.json +1 -1
  2. configuration_phi3.py +38 -29
  3. modeling_phi3.py +59 -80
config.json CHANGED
@@ -125,7 +125,7 @@
       2.849999999999998,
       2.9499999999999975
     ],
-    "type": "longrope"
+    "type": "su"
   },
   "rope_theta": 10000.0,
   "sliding_window": 262144,
configuration_phi3.py CHANGED
@@ -83,8 +83,8 @@ class Phi3Config(PretrainedConfig):
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
         rope_scaling (`dict`, *optional*):
-            The scaling factor for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
-            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
             the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
             divided by the number of attention heads divided by 2.
         eos_token_id (`int`, *optional*, defaults to 32000):
@@ -158,6 +158,7 @@ class Phi3Config(PretrainedConfig):
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
         self.sliding_window = sliding_window
 
         super().__init__(
@@ -168,33 +169,41 @@ class Phi3Config(PretrainedConfig):
         )
 
     def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
         if self.rope_scaling is None:
             return
 
-        assert (
-            (isinstance(self.rope_scaling, dict))
-            and ("type" in self.rope_scaling)
-            and ("short_factor" in self.rope_scaling)
-            and ("long_factor" in self.rope_scaling)
-        ), (
-            "`rope_scaling` must be a dictionary with three keys: `type`, `short_factor` and `long_factor`, "
-            f"got {self.rope_scaling}."
-        )
-
-        assert self.rope_scaling["type"].lower() == "longrope", "RoPE scaling type must be `longrope`."
-
-        short_factor = self.rope_scaling["short_factor"]
-        assert isinstance(short_factor, list) and all(
-            isinstance(x, (int, float)) for x in short_factor
-        ), f"RoPE scaling factor must be a list of numbers, got {short_factor}."
-        assert (
-            len(short_factor) == self.hidden_size // self.num_attention_heads // 2
-        ), f"Length of RoPE scaling factor must be half of the attention head, got {short_factor}."
-
-        long_factor = self.rope_scaling["long_factor"]
-        assert isinstance(long_factor, list) and all(
-            isinstance(x, (int, float)) for x in long_factor
-        ), f"RoPE scaling factor must be a list of numbers, got {long_factor}."
-        assert (
-            len(long_factor) == self.hidden_size // self.num_attention_heads // 2
-        ), f"Length of RoPE scaling factor must be half of the attention head, got {long_factor}."
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )
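
Besides accepting the RC type names, the new `_rope_scaling_validation` raises `ValueError` instead of asserting and is now called from `__init__`, so a malformed `rope_scaling` fails at construction time. A minimal sketch of the behavior (assumptions: `configuration_phi3.py` is importable from the working directory, and the class defaults of `hidden_size=3072` and `num_attention_heads=32` apply, so each factor list needs 3072 // 32 // 2 = 48 entries):

```python
from configuration_phi3 import Phi3Config

half_rotary_dim = 3072 // 32 // 2  # hidden_size // num_attention_heads // 2 = 48
factors = [1.0] * half_rotary_dim

# Accepted: RC-style type names ("su" or "yarn") with correctly sized factor lists.
Phi3Config(rope_scaling={"type": "su", "short_factor": factors, "long_factor": factors})

# Rejected: the pre-RC "longrope" name now raises at construction time.
try:
    Phi3Config(rope_scaling={"type": "longrope", "short_factor": factors, "long_factor": factors})
except ValueError as e:
    print(e)  # `rope_scaling`'s type field must be one of ['su', 'yarn'], got longrope
```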
modeling_phi3.py CHANGED
@@ -54,26 +54,17 @@ logger = logging.get_logger(__name__)
 _flash_supports_window_size = False
 try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 
     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
-
-    if not _flash_supports_window_size:
-        raise ValueError("Please update flash-attention to support window size.")
-
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-    from flash_attn.ops.activations import swiglu
-    from flash_attn.ops.rms_norm import RMSNorm as Phi3FlashRMSNorm
-    # else:
 except ImportError as error:
     logger.warning(
-        f"Flash Attention or Flash Attention Submodules not found, consider installing for better performance: {error}."
+        f"`flash-attention` package not found, consider installing for better performance: {error}."
     )
 if not _flash_supports_window_size:
     logger.warning(
-        "This version of flash does not support window size. Please use `attn_implementation='eager'` or upgrade flash-attn library."
+        "Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`."
     )
-    swiglu = None
-    Phi3FlashRMSNorm = None
 
 _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
 _CONFIG_FOR_DOC = "Phi3Config"
@@ -103,9 +94,6 @@ class Phi3RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)
 
 
-PHI3_NORM_CLASS = Phi3RMSNorm if Phi3FlashRMSNorm is None else Phi3FlashRMSNorm
-
-
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -156,43 +144,27 @@ class Phi3RotaryEmbedding(nn.Module):
         )
 
 
-class Phi3LongScaledRotaryEmbedding(nn.Module):
+class _Phi3ScaledRotaryEmbedding(nn.Module):
     def __init__(
         self,
         dim,
         short_factor,
         long_factor,
-        max_position_embeddings=4096,
-        original_max_position_embeddings=4096,
+        max_position_embeddings=2048,
+        original_max_position_embeddings=2048,
         base=10000,
-        magnitude_scaling_policy="su",
     ):
         super().__init__()
 
         self.dim = dim
+        self.short_factor = short_factor
+        self.long_factor = long_factor
         self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
         self.base = base
 
-        if magnitude_scaling_policy == "su":
-            self._calc_mscale = self._calc_mscale_su
-        elif magnitude_scaling_policy == "yarn":
-            self._calc_mscale = self._calc_mscale_yarn
-        else:
-            self._calc_mscale = lambda scale: float(scale)
-
-        self.short_factor = short_factor
-        self.long_factor = long_factor
-
-    def _calc_mscale_su(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
-
-    def _calc_mscale_yarn(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return 0.1 * math.log(scale) + 1.0
+    def _calc_mscale(self, scale):
+        raise NotImplementedError("`_calc_mscale` should be implemented in subclasses")
 
     @torch.no_grad()
     def forward(self, x, seq_len=None):
@@ -206,9 +178,6 @@ class Phi3LongScaledRotaryEmbedding(nn.Module):
         else:
             t = torch.arange(self.original_max_position_embeddings, device=x.device, dtype=torch.float32)
             rescale_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-            assert rescale_factors.shape == (
-                self.dim // 2,
-            ), f"misaligned shape for LongRoPE rescale factors: {rescale_factors.shape}"
 
         inv_freq = 1.0 / (
             rescale_factors * (self.base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
@@ -221,6 +190,20 @@ class Phi3LongScaledRotaryEmbedding(nn.Module):
         return (emb.cos() * mscale).to(x.dtype), (emb.sin() * mscale).to(x.dtype)
 
 
+class Phi3SuScaledRotaryEmbedding(_Phi3ScaledRotaryEmbedding):
+    def _calc_mscale(self, scale):
+        if scale <= 1.0:
+            return 1.0
+        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
+
+
+class Phi3YarnScaledRotaryEmbedding(_Phi3ScaledRotaryEmbedding):
+    def _calc_mscale(self, scale):
+        if scale <= 1.0:
+            return 1.0
+        return 0.1 * math.log(scale) + 1.0
+
+
 # Copied from transformers.models.llama.modeling_llama.rotate_half
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
@@ -253,24 +236,12 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
     # Need fp32 here to match logits
-    q_embed = (q.to(dtype=torch.float32) * cos.to(dtype=torch.float32)) + (
-        rotate_half(q).to(dtype=torch.float32) * sin.to(dtype=torch.float32)
-    )
-    k_embed = (k.to(dtype=torch.float32) * cos.to(dtype=torch.float32)) + (
-        rotate_half(k).to(dtype=torch.float32) * sin.to(dtype=torch.float32)
-    )
+    q_embed = (q.float() * cos.float()) + (rotate_half(q).float() * sin.float())
+    k_embed = (k.float() * cos.float()) + (rotate_half(k).float() * sin.float())
     return q_embed.to(q.dtype), k_embed.to(k.dtype)
 
 
 class Phi3MLP(nn.Module):
-    """Gated Linear Unit.
-
-    Reference:
-        Language Modeling with Gated Convolutional Networks.
-        https://arxiv.org/pdf/1612.08083v3.pdf.
-
-    """
-
     def __init__(self, config):
         super().__init__()
 
@@ -283,13 +254,8 @@ class Phi3MLP(nn.Module):
     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
         y = self.gate_up_proj(hidden_states)
 
-        # Special case for SwiGLU
-        if self.config.hidden_act == "silu" and swiglu is not None:
-            gate, y = y.chunk(2, dim=-1)
-            y = swiglu(gate, y)
-        else:
-            gate, y = y.chunk(2, dim=-1)
-            y = y * self.activation_fn(gate)
+        gate, y = y.chunk(2, dim=-1)
+        y = y * self.activation_fn(gate)
 
         return self.down_proj(y)
 
@@ -330,7 +296,6 @@ class Phi3Attention(nn.Module):
         self.max_position_embeddings = config.max_position_embeddings
         self.original_max_position_embeddings = config.original_max_position_embeddings
         self.rope_theta = config.rope_theta
-        self.rope_scaling = config.rope_scaling
         self.is_causal = True
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
@@ -341,24 +306,38 @@ class Phi3Attention(nn.Module):
 
         op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
-
         self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
+        self._init_rope()
 
-        if self.rope_scaling is None:
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
             self.rotary_emb = Phi3RotaryEmbedding(
                 self.head_dim,
                 max_position_embeddings=self.max_position_embeddings,
                 base=self.rope_theta,
             )
         else:
-            self.rotary_emb = Phi3LongScaledRotaryEmbedding(
-                self.head_dim,
-                self.config.rope_scaling["short_factor"],
-                self.config.rope_scaling["long_factor"],
-                max_position_embeddings=self.config.max_position_embeddings,
-                original_max_position_embeddings=self.config.original_max_position_embeddings,
-                base=self.config.rope_theta,
-            )
+            scaling_type = self.config.rope_scaling["type"]
+            if scaling_type == "su":
+                self.rotary_emb = Phi3SuScaledRotaryEmbedding(
+                    self.head_dim,
+                    self.config.rope_scaling["short_factor"],
+                    self.config.rope_scaling["long_factor"],
+                    max_position_embeddings=self.config.max_position_embeddings,
+                    original_max_position_embeddings=self.config.original_max_position_embeddings,
+                    base=self.config.rope_theta,
+                )
+            elif scaling_type == "yarn":
+                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(
+                    self.head_dim,
+                    self.config.rope_scaling["short_factor"],
+                    self.config.rope_scaling["long_factor"],
+                    max_position_embeddings=self.config.max_position_embeddings,
+                    original_max_position_embeddings=self.config.original_max_position_embeddings,
+                    base=self.config.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -859,11 +838,11 @@ class Phi3DecoderLayer(nn.Module):
         self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
 
         self.mlp = Phi3MLP(config)
-        self.input_layernorm = PHI3_NORM_CLASS(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm = PHI3_NORM_CLASS(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
         self,
@@ -1066,9 +1045,8 @@ class Phi3Model(Phi3PreTrainedModel):
         self.layers = nn.ModuleList(
             [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
-        self.norm = PHI3_NORM_CLASS(config.hidden_size, eps=config.rms_norm_eps)
-
         self._attn_implementation = config._attn_implementation
+        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1255,6 +1233,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
     def get_decoder(self):
        return self.model
 
+    # Ignore copy
     @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
@@ -1284,8 +1263,8 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         ```python
         >>> from transformers import AutoTokenizer, Phi3ForCausalLM
 
-        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3")
-        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3")
+        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
 
         >>> prompt = "This is an example script ."
         >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -1293,7 +1272,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        'This is an example script .\n\n\n\nfrom typing import List\n\ndef find_most_common_letter(words: List[str'
+        'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
         ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
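
The single `Phi3LongScaledRotaryEmbedding` (which picked its magnitude-scaling policy through a constructor argument) becomes a `_Phi3ScaledRotaryEmbedding` base class with `Phi3SuScaledRotaryEmbedding` and `Phi3YarnScaledRotaryEmbedding` subclasses, chosen in `Phi3Attention._init_rope()` from `config.rope_scaling["type"]`. The two policies differ only in the magnitude factor applied to the cos/sin tables; a standalone sketch of that math (the 4096 and 131072 context lengths are illustrative, not read from this checkpoint):

```python
import math

# Mirrors Phi3SuScaledRotaryEmbedding._calc_mscale and
# Phi3YarnScaledRotaryEmbedding._calc_mscale from the diff above.
def mscale_su(scale: float, original_max_position_embeddings: int) -> float:
    if scale <= 1.0:
        return 1.0
    return math.sqrt(1 + math.log(scale) / math.log(original_max_position_embeddings))


def mscale_yarn(scale: float) -> float:
    if scale <= 1.0:
        return 1.0
    return 0.1 * math.log(scale) + 1.0


# Illustrative numbers: extending a 4096-token pretraining window to 131072 tokens.
scale = 131072 / 4096  # 32.0
print(round(mscale_su(scale, original_max_position_embeddings=4096), 4))  # 1.1902
print(round(mscale_yarn(scale), 4))                                       # 1.3466
```

Both factors are applied to `emb.cos()` and `emb.sin()` in the shared `forward`, so the subclasses only need to override `_calc_mscale`.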