Files changed (1) hide show
  1. modeling_phi.py +991 -0
modeling_phi.py ADDED
@@ -0,0 +1,991 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ #
4
+ # Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
5
+ # Licensed under the BSD 3-Clause License.
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, Optional, Tuple, Union
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from einops import rearrange, repeat
16
+ from transformers import PretrainedConfig, PreTrainedModel
17
+ from transformers.activations import ACT2FN
18
+ from transformers.modeling_outputs import CausalLMOutputWithPast
19
+
20
+ from .configuration_phi import PhiConfig
21
+
22
# `flash-attn` is optional. When it cannot be imported every symbol below is set to
# `None`, which the rest of the file checks before falling back to the PyTorch classes.
try:
    from flash_attn.bert_padding import pad_input, unpad_input
    from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
    from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention
    from flash_attn.ops.fused_dense import FusedDense
except Exception:
    # `Exception` (not a bare `except:`) keeps the best-effort fallback for any import
    # failure while letting `KeyboardInterrupt` and `SystemExit` propagate.
    pad_input, unpad_input = None, None
    FlashRotaryEmbedding = None
    FlashSelfAttention, FlashCrossAttention = None, None
    FusedDense = None
32
+
33
+
34
@dataclass
class InferenceParams:
    """Inference parameters passed to model to efficiently calculate
    and store context during inference.

    Reference:
        https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py.

    Args:
        max_seqlen: Maximum sequence length.
        max_batch_size: Maximum batch size.
        seqlen_offset: Sequence length offset.
        batch_size_offset: Batch size offset.
        key_value_memory_dict: Key value memory dictionary (indexed by layer index).
        lengths_per_sample: Lengths per sample.

    """

    max_seqlen: int = field(metadata={"help": "Maximum sequence length."})

    max_batch_size: int = field(metadata={"help": "Maximum batch size."})

    seqlen_offset: int = field(default=0, metadata={"help": "Sequence length offset."})

    batch_size_offset: int = field(default=0, metadata={"help": "Batch size offset."})

    # A fresh dict per instance (`default_factory`) so caches are never shared
    key_value_memory_dict: Dict[int, Any] = field(
        default_factory=dict, metadata={"help": "Key value memory dictionary."}
    )

    lengths_per_sample: Optional[torch.Tensor] = field(default=None, metadata={"help": "Lengths per sample."})
65
+
66
+
67
class Embedding(nn.Module):
    """Token embedding with dropout.

    Args:
        config: Model configuration; `vocab_size`, `n_embd` and `embd_pdrop` are read from it.

    """

    def __init__(self, config: PretrainedConfig) -> None:
        super().__init__()

        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)

    def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        """Embed `input_ids` and apply dropout.

        All leading dimensions of `input_ids` are flattened into a single one before
        the lookup, so the result is always three-dimensional.
        """
        # `reshape` (unlike `view`) also accepts non-contiguous index tensors
        input_ids = input_ids.reshape(-1, input_ids.size(-1))

        return self.drop(self.wte(input_ids))
84
+
85
+
86
def _apply_rotary_emb(
    x: torch.FloatTensor,
    cos: torch.FloatTensor,
    sin: torch.FloatTensor,
) -> torch.FloatTensor:
    """Apply a rotary embedding to the leading features of a 4-D tensor.

    Args:
        x: 4-D tensor; dimension 1 is the sequence dimension and the last dimension
            holds the features.
        cos: 2-D cosine table with one row per position.
        sin: 2-D sine table with the same layout as `cos`.

    Returns:
        Tensor shaped and typed like `x`. The first `2 * cos.shape[1]` features are
        rotated, the remaining ones are passed through unchanged.

    """
    _, seqlen, _, _ = x.shape
    _, half_dim = cos.shape
    rotary_dim = 2 * half_dim

    x_rot = x[..., :rotary_dim]
    x_pass = x[..., rotary_dim:]

    x1, x2 = x_rot.chunk(2, dim=-1)
    # Insert a singleton dimension after the sequence one so the tables broadcast over dim 2
    c = cos[:seqlen].unsqueeze(1)
    s = sin[:seqlen].unsqueeze(1)
    # The rotation is carried out in fp32 and cast back to the input dtype afterwards
    x1, x2, c, s = [t.to(dtype=torch.float32) for t in (x1, x2, c, s)]

    x_rot = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], dim=-1).to(x.dtype)

    return torch.cat([x_rot, x_pass], dim=-1)
105
+
106
+
107
def _apply_rotary_emb_kv(
    kv: torch.FloatTensor,
    cos: torch.FloatTensor,
    sin: torch.FloatTensor,
    cos_k: Optional[torch.FloatTensor] = None,
    sin_k: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
    """Apply a rotary embedding to the key slice of a packed 5-D `kv` tensor.

    Args:
        kv: 5-D tensor; dimension 1 is the sequence dimension, and along dimension 2
            index 0 holds the keys and index 1 the values.
        cos: 2-D cosine table with one row per position.
        sin: 2-D sine table with the same layout as `cos`.
        cos_k: Accepted for signature compatibility; not used.
        sin_k: Accepted for signature compatibility; not used.

    Returns:
        Tensor shaped like `kv`: the first `2 * cos.shape[1]` key features are rotated,
        the remaining key features and the values are copied unchanged.

    """
    _, seqlen, _, _, _ = kv.shape
    _, half_dim = cos.shape
    rotary_dim = 2 * half_dim

    keys = kv[:, :, 0]
    k_rot = keys[..., :rotary_dim]
    k_pass = keys[..., rotary_dim:]

    k1, k2 = k_rot.chunk(2, dim=-1)
    # Insert a singleton dimension after the sequence one so the tables broadcast over heads
    c = cos[:seqlen].unsqueeze(1)
    s = sin[:seqlen].unsqueeze(1)
    # The rotation is carried out in fp32 and cast back to the input dtype afterwards
    k1, k2, c, s = [t.to(dtype=torch.float32) for t in (k1, k2, c, s)]

    k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], dim=-1).to(kv.dtype)

    rotated_keys = torch.cat([k_rot, k_pass], dim=-1).unsqueeze(2)
    # `1:2` keeps dimension 2 so the values can be concatenated back next to the keys
    values = kv[:, :, 1:2, :, :]

    return torch.cat([rotated_keys, values], dim=2)
134
+
135
+
136
def _apply_rotary_emb_qkv(
    qkv: torch.FloatTensor,
    cos: torch.FloatTensor,
    sin: torch.FloatTensor,
    cos_k: Optional[torch.FloatTensor] = None,
    sin_k: Optional[torch.FloatTensor] = None,
) -> torch.FloatTensor:
    """Apply a rotary embedding to the query and key slices of a packed 5-D `qkv` tensor.

    Args:
        qkv: 5-D tensor; dimension 1 is the sequence dimension, and along dimension 2
            index 0 holds the queries, index 1 the keys and index 2 the values.
        cos: 2-D cosine table with one row per position.
        sin: 2-D sine table with the same layout as `cos`.
        cos_k: Accepted for signature compatibility; not used.
        sin_k: Accepted for signature compatibility; not used.

    Returns:
        Tensor shaped like `qkv`: the first `2 * cos.shape[1]` features of the queries
        and keys are rotated, everything else is copied unchanged.

    """
    _, seqlen, _, _, _ = qkv.shape
    _, half_dim = cos.shape
    rotary_dim = 2 * half_dim

    queries, keys = qkv[:, :, 0], qkv[:, :, 1]
    q_rot, q_pass = queries[..., :rotary_dim], queries[..., rotary_dim:]
    k_rot, k_pass = keys[..., :rotary_dim], keys[..., rotary_dim:]

    q1, q2 = q_rot.chunk(2, dim=-1)
    k1, k2 = k_rot.chunk(2, dim=-1)
    # Insert a singleton dimension after the sequence one so the tables broadcast over heads
    c = cos[:seqlen].unsqueeze(1)
    s = sin[:seqlen].unsqueeze(1)
    # The rotation is carried out in fp32 and cast back to the input dtype afterwards
    q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in (q1, q2, k1, k2, c, s)]

    q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], dim=-1).to(qkv.dtype)
    k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], dim=-1).to(qkv.dtype)

    return torch.cat(
        [
            torch.cat([q_rot, q_pass], dim=-1).unsqueeze(2),
            torch.cat([k_rot, k_pass], dim=-1).unsqueeze(2),
            # `2:3` keeps dimension 2 so the values can be concatenated back unchanged
            qkv[:, :, 2:3, :, :],
        ],
        dim=2,
    )
169
+
170
+
171
class RotaryEmbedding(nn.Module):
    """Rotary positional embedding (RoPE).

    Reference:
        RoFormer: Enhanced Transformer with Rotary Position Embedding.
        https://arxiv.org/pdf/2104.09864.pdf.

    Args:
        dim: Number of features that get rotated.
        base: Base of the inverse-frequency progression.
        scale_base: Must be `None`; any other value raises `NotImplementedError`.
        pos_idx_in_fp32: Whether position indices are generated in fp32.
        max_position_embeddings: Number of positions cached at construction time.
        device: Device on which the buffers and the initial cache are created.

    """

    def __init__(
        self,
        dim: int,
        base: int = 10000,
        scale_base: Optional[float] = None,
        pos_idx_in_fp32: bool = True,
        max_position_embeddings: int = 2048,
        device: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__()

        if scale_base is not None:
            raise NotImplementedError("`scale_base` is not supported.")

        self.dim = dim
        self.base = float(base)
        self.scale_base = scale_base
        self.pos_idx_in_fp32 = pos_idx_in_fp32
        self.max_position_embeddings = max_position_embeddings
        self.device = device

        # Generate and save the inverse frequency buffer (non-trainable)
        inv_freq = self._compute_inv_freq(device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Generate and save the scale buffer (non-trainable)
        scale = (
            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
            if scale_base is not None
            else None
        )
        self.register_buffer("scale", scale, persistent=False)

        # Initialize cached attributes since ONNX can't rely on dynamic initialization
        self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32)

    def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor:
        """Return `1 / base ** (i / dim)` for `i = 0, 2, 4, ...` below `dim`, in fp32."""
        exponents = torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim

        return 1.0 / (self.base**exponents)

    def _update_cos_sin_cache(
        self,
        seqlen: int,
        device: Optional[str] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Rebuild the cached cos/sin tables for positions `0 .. seqlen - 1`."""
        self._seq_len_cached = seqlen

        # fp32 is preferred since the output of `torch.arange` can be quite large
        # and bf16 would lose a lot of precision
        if self.pos_idx_in_fp32:
            t = torch.arange(seqlen, device=device, dtype=torch.float32)
            # If the buffer was cast away from fp32, recompute it instead of reusing it
            if self.inv_freq.dtype != torch.float32:
                inv_freq = self._compute_inv_freq(device=device)
            else:
                inv_freq = self.inv_freq
        else:
            t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
            inv_freq = self.inv_freq

        # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP
        freqs = torch.outer(t, inv_freq)
        if self.scale is None:
            self._cos_cached = torch.cos(freqs).to(dtype)
            self._sin_cached = torch.sin(freqs).to(dtype)
        else:
            power = (
                torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
            ) / self.scale_base
            scale = self.scale.to(device=power.device) ** power.unsqueeze(-1)

            # Force the scale multiplication to happen in fp32
            self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
            self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
            self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
            self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)

    def forward(
        self,
        qkv: torch.Tensor,
        kv: Optional[torch.Tensor] = None,
        seqlen_offset: int = 0,
        **kwargs,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Rotate queries/keys starting at position `seqlen_offset`.

        Args:
            qkv: Packed query/key/value tensor when `kv` is `None`, otherwise the query tensor.
            kv: Optional packed key/value tensor.
            seqlen_offset: Position of the first token of the inputs.

        Returns:
            The rotated `qkv` tensor when `kv` is `None`, otherwise a `(q, kv)` tuple.

        """
        required_len = qkv.shape[1] + seqlen_offset
        cache_is_stale = (
            self._seq_len_cached < required_len
            or self._cos_cached.device != qkv.device
            or self._cos_cached.dtype != qkv.dtype
            or (self.training and self._cos_cached.is_inference())
        )
        if cache_is_stale:
            # Never refresh below `max_position_embeddings`: a cache that is only as long
            # as the current sequence would have to be rebuilt on every decoding step
            self._update_cos_sin_cache(
                max(required_len, self.max_position_embeddings),
                device=qkv.device,
                dtype=qkv.dtype,
            )

        cos = self._cos_cached[seqlen_offset:]
        sin = self._sin_cached[seqlen_offset:]

        if kv is None:
            return _apply_rotary_emb_qkv(qkv, cos, sin)

        q = _apply_rotary_emb(qkv, cos, sin)
        kv = _apply_rotary_emb_kv(kv, cos, sin)

        return q, kv
291
+
292
+
293
class MoE(nn.Module):
    """Mixture of `MLP` experts with per-token top-k routing.

    Args:
        config: Model configuration; `n_embd` is read here and `config` is forwarded to every expert.
        num_experts: Number of expert MLPs.
        num_experts_per_tok: Number of experts evaluated for each token.
        num_shards: Accepted for signature compatibility; not used.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        num_experts=4,
        num_experts_per_tok=2,
        num_shards=1,
        **kwargs,
    ):
        super().__init__()
        self.mlp = nn.ModuleList([MLP(config) for _ in range(num_experts)])
        self.gate = nn.Linear(config.n_embd, num_experts, bias=False)
        self.num_experts_per_tok = num_experts_per_tok

    def forward(self, x):
        """Route every token to its top-k experts and mix their outputs.

        The mixing weights are a softmax over the gate scores of the selected experts
        only. The result has the same shape as `x`.
        """
        orig_shape = x.shape
        # `reshape` (unlike `view`) also accepts non-contiguous activations
        x = x.reshape(-1, orig_shape[-1])

        scores = self.gate(x)
        expert_weights, expert_indices = torch.topk(scores, self.num_experts_per_tok, dim=-1)
        expert_weights = expert_weights.softmax(dim=-1)
        flat_expert_indices = expert_indices.view(-1)

        # One row per (token, selected expert) pair, in the same order as `flat_expert_indices`
        x = x.repeat_interleave(self.num_experts_per_tok, dim=0)
        y = torch.empty_like(x)
        for i, expert in enumerate(self.mlp):
            routed = flat_expert_indices == i
            y[routed] = expert(x[routed])

        # Regroup the rows per token and take the weighted sum over its selected experts
        y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
        return y.view(*orig_shape)
322
+
323
+
324
class MLP(nn.Module):
    """Multi-Layer Perceptron.

    Reference:
        Attention Is All You Need.
        https://arxiv.org/pdf/1706.03762.pdf.

    Args:
        config: Model configuration; `n_embd` is always read, `activation_function` and
            `n_inner` are read when the matching argument is `None`.
        n_inner: Hidden width. Falls back to `config.n_inner`, then to `4 * config.n_embd`.
        act_fn: Key into `ACT2FN`. Falls back to `config.activation_function`.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        n_inner: Optional[int] = None,
        act_fn: Optional[str] = None,
    ) -> None:
        super().__init__()

        if act_fn is None:
            act_fn = config.activation_function

        if n_inner is None:
            n_inner = getattr(config, "n_inner", None)
        if n_inner is None:
            n_inner = 4 * config.n_embd

        self.fc1 = nn.Linear(config.n_embd, n_inner)
        self.fc2 = nn.Linear(n_inner, config.n_embd)
        self.act = ACT2FN[act_fn]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Return `fc2(act(fc1(hidden_states)))`."""
        return self.fc2(self.act(self.fc1(hidden_states)))
356
+
357
+
358
class SelfAttention(nn.Module):
    """Self-attention layer (compatible with PyTorch).

    Reference:
        https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.

    Args:
        causal: Default for the `causal` argument of `forward`.
        softmax_scale: Scale applied to the keys; `1 / sqrt(head_dim)` when not set.
        attention_dropout: Dropout probability applied to the attention weights.

    """

    def __init__(
        self,
        causal: bool = True,
        softmax_scale: Optional[float] = None,
        attention_dropout: float = 0.0,
    ) -> None:
        super().__init__()

        self.causal = causal
        self.softmax_scale = softmax_scale
        self.drop = nn.Dropout(attention_dropout)

    @torch.autocast("cpu", enabled=False)
    @torch.autocast("cuda", enabled=False)
    def forward(
        self,
        qkv: torch.FloatTensor,
        causal: Optional[bool] = None,
        key_padding_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        """Compute scaled dot-product self-attention.

        Args:
            qkv: Packed tensor of shape `(batch, seqlen, 3, heads, head_dim)`.
            causal: Overrides the constructor's `causal` when not `None`.
            key_padding_mask: `(batch, seqlen)` boolean mask; `True` marks keys that may be attended to.

        Returns:
            Tensor of shape `(batch, seqlen, heads, head_dim)`.

        """
        batch_size, seqlen = qkv.shape[0], qkv.shape[1]
        q, k, v = qkv.unbind(dim=2)

        q = q.to(torch.float32)
        k = k.to(torch.float32)

        causal = self.causal if causal is None else causal
        softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])

        # Autocast is manually disabled to avoid `torch.einsum` performing the operation
        # using float16, which might lead to overflow
        scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)

        if key_padding_mask is not None:
            # Additive mask: 0 for keys flagged `True`, -10000 for every other key
            padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device)
            padding_mask.masked_fill_(key_padding_mask, 0.0)

            scores = scores + padding_mask[:, None, None, :]

        if causal:
            # Strictly upper-triangular additive mask: key `s` is penalized for query `t` when `s > t`
            causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
            scores = scores + causal_mask.to(dtype=scores.dtype)

        attention = torch.softmax(scores, dim=-1).to(v.dtype)
        attention = self.drop(attention)

        return torch.einsum("bhts,bshd->bthd", attention, v)
416
+
417
+
418
class CrossAttention(nn.Module):
    """Cross-attention layer (compatible with PyTorch).

    Reference:
        https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.

    Args:
        causal: Default for the `causal` argument of `forward`.
        softmax_scale: Scale applied to the keys; `1 / sqrt(head_dim)` when not set.
        attention_dropout: Dropout probability applied to the attention weights.

    """

    def __init__(
        self,
        causal: bool = True,
        softmax_scale: Optional[float] = None,
        attention_dropout: float = 0.0,
    ) -> None:
        super().__init__()

        self.causal = causal
        self.softmax_scale = softmax_scale
        self.drop = nn.Dropout(attention_dropout)

    @torch.autocast("cpu", enabled=False)
    @torch.autocast("cuda", enabled=False)
    def forward(
        self,
        q: torch.FloatTensor,
        kv: torch.FloatTensor,
        causal: Optional[bool] = None,
        key_padding_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        """Compute scaled dot-product attention of `q` over the keys/values packed in `kv`.

        Args:
            q: Queries of shape `(batch, seqlen_q, heads, head_dim)`.
            kv: Packed keys/values of shape `(batch, seqlen_k, 2, heads_kv, head_dim)`.
            causal: Overrides the constructor's `causal` when not `None`.
            key_padding_mask: `(batch, seqlen_k)` boolean mask; `True` marks keys that may be attended to.

        Returns:
            Tensor of shape `(batch, seqlen_q, heads, head_dim)`.

        """
        batch_size, seqlen_q = q.shape[0], q.shape[1]
        seqlen_k = kv.shape[1]

        if kv.shape[3] != q.shape[2]:
            # Fewer key/value heads than query heads: repeat each key/value head
            # consecutively so that every query head has a matching one
            kv = kv.repeat_interleave(q.shape[2] // kv.shape[3], dim=-2)
        k, v = kv.unbind(dim=2)

        q = q.to(torch.float32)
        k = k.to(torch.float32)

        causal = self.causal if causal is None else causal
        softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])

        # Autocast is manually disabled to avoid `torch.einsum` performing the operation
        # using float16, which might lead to overflow
        scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)

        if key_padding_mask is not None:
            # Additive mask: 0 for keys flagged `True`, -10000 for every other key
            padding_mask = torch.full(
                (batch_size, seqlen_k),
                -10000.0,
                dtype=scores.dtype,
                device=scores.device,
            )
            padding_mask.masked_fill_(key_padding_mask, 0.0)

            scores = scores + padding_mask[:, None, None, :]

        if causal:
            rows = torch.arange(seqlen_q, device=q.device, dtype=torch.long).unsqueeze(-1)
            cols = torch.arange(seqlen_k, device=k.device, dtype=torch.long)
            # The queries are aligned with the last `seqlen_q` keys, hence the offset
            causal_mask = cols > rows + seqlen_k - seqlen_q

            scores = scores.masked_fill(causal_mask, -10000.0)

        attention = torch.softmax(scores, dim=-1).to(v.dtype)
        attention = self.drop(attention)

        return torch.einsum("bhts,bshd->bthd", attention, v)
489
+
490
+
491
def _find_mha_dims(
    config: PretrainedConfig,
    n_head: Optional[int] = None,
    n_head_kv: Optional[int] = None,
    head_dim: Optional[int] = None,
) -> Tuple[int, int, int]:
    """Resolve the attention head layout.

    Args:
        config: Model configuration; `n_embd` and `n_head` are read when neither `n_head`
            nor `head_dim` is given, and `n_head_kv` (if present) when `n_head_kv` is not given.
        n_head: Number of query heads.
        n_head_kv: Number of key/value heads.
        head_dim: Size of each head.

    Returns:
        The `(n_head, n_head_kv, head_dim)` triple.

    Raises:
        ValueError: If exactly one of `n_head` and `head_dim` is specified.

    """
    if n_head is None and head_dim is None:
        head_dim = config.n_embd // config.n_head
        n_head = config.n_head
    elif n_head is None or head_dim is None:
        raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")

    if n_head_kv is None:
        # A missing (or falsy) `config.n_head_kv` means one key/value head per query head
        n_head_kv = getattr(config, "n_head_kv", None) or n_head

    return n_head, n_head_kv, head_dim
507
+
508
+
509
def _update_kv_cache(kv: torch.FloatTensor, inference_params: InferenceParams, layer_idx: int) -> torch.FloatTensor:
    """Write `kv` into the per-layer cache and return everything cached so far.

    Args:
        kv: New keys/values of shape `(batch, seqlen, 2, heads, head_dim)`.
        inference_params: Holds the cache (`key_value_memory_dict`), its initial size
            (`max_batch_size`, `max_seqlen`) and the write position
            (`batch_size_offset`, `seqlen_offset`).
        layer_idx: Key of this layer's entry in the cache.

    Returns:
        A slice of the cache covering this batch and positions `0 .. seqlen_offset + seqlen - 1`.

    """
    num_heads, head_dim = kv.shape[-2:]
    memory = inference_params.key_value_memory_dict

    if layer_idx not in memory:
        memory[layer_idx] = torch.empty(
            inference_params.max_batch_size,
            inference_params.max_seqlen,
            2,
            num_heads,
            head_dim,
            dtype=kv.dtype,
            device=kv.device,
        )

    batch_start = inference_params.batch_size_offset
    batch_end = batch_start + kv.shape[0]

    sequence_start = inference_params.seqlen_offset
    sequence_end = sequence_start + kv.shape[1]

    cache = memory[layer_idx]

    # When the cache is too short for the new tokens, extend it along the sequence
    # dimension. The extension uses the cache's own batch size so it also works when
    # `kv` holds fewer samples than the cache.
    missing = sequence_end - cache.shape[1]
    if missing > 0:
        extension = cache.new_empty((cache.shape[0], missing, *cache.shape[2:]))
        cache = torch.cat((cache, extension), dim=1)
        memory[layer_idx] = cache

    cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv

    return cache[batch_start:batch_end, :sequence_end, ...]
538
+
539
+
540
class MHA(nn.Module):
    """Multi-head attention layer.

    Args:
        config: Model configuration; reads `n_embd`, `attn_pdrop`, `flash_attn`,
            `flash_rotary`, `fused_dense` and, depending on the arguments, `rotary_dim`,
            `n_positions`, `n_head` and `n_head_kv`.
        dtype: Data type of the projection layers.
        device: Device of the projection layers and of the rotary embedding.
        rotary_dim: Number of rotated features per head; falls back to `config.rotary_dim`, then 0.
        rotary_base: Base forwarded to the rotary embedding.
        rotary_scale_base: Scale base forwarded to the rotary embedding.
        n_head: Number of query heads (see `_find_mha_dims`).
        n_head_kv: Number of key/value heads (see `_find_mha_dims`).
        head_dim: Size of each head (see `_find_mha_dims`).
        bias: Whether the projection layers have a bias.
        causal: Whether the inner attention layers are causal by default.
        softmax_scale: Softmax scale forwarded to the inner attention layers.
        layer_idx: Key of this layer's entry in the key/value cache.
        return_residual: If `True`, `forward` returns `(output, x)` instead of `output`.
        checkpointing: If `True`, the inner attention runs through `torch.utils.checkpoint`.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        dtype: Optional[torch.dtype] = None,
        device: Optional[str] = None,
        rotary_dim: Optional[int] = None,
        rotary_base: float = 10000.0,
        rotary_scale_base: Optional[float] = None,
        n_head: Optional[int] = None,
        n_head_kv: Optional[int] = None,
        head_dim: Optional[int] = None,
        bias: bool = True,
        causal: bool = True,
        softmax_scale: Optional[float] = None,
        layer_idx: Optional[int] = None,
        return_residual: bool = False,
        checkpointing: bool = False,
    ) -> None:
        super().__init__()

        # Rotary embedding
        self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
        if self.rotary_dim > 0:
            # Fall back to the PyTorch implementation when `flash-attn` is unavailable
            rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding
            if rotary_cls is None:
                rotary_cls = RotaryEmbedding

            rotary_kwargs = {}
            if rotary_cls is RotaryEmbedding:
                rotary_kwargs["max_position_embeddings"] = config.n_positions

            self.rotary_emb = rotary_cls(
                self.rotary_dim,
                base=rotary_base,
                scale_base=rotary_scale_base,
                device=device,
                **rotary_kwargs,
            )

        # Projections
        self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
            config, n_head=n_head, n_head_kv=n_head_kv, head_dim=head_dim
        )
        op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
        hidden_size = config.n_embd

        linear_cls = FusedDense if config.fused_dense else nn.Linear
        if linear_cls is None:
            linear_cls = nn.Linear

        self.Wqkv = linear_cls(hidden_size, op_size, bias=bias, device=device, dtype=dtype)
        # NOTE(review): `out_proj` consumes `n_head * head_dim` features, which equals
        # `hidden_size` only for the default head layout — confirm custom layouts are unused.
        self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)

        # Attention
        attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention
        if attn_cls is None:
            attn_cls = SelfAttention

        cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention
        if cross_attn_cls is None:
            cross_attn_cls = CrossAttention

        self.inner_attn = attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=config.attn_pdrop,
        )
        self.inner_cross_attn = cross_attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=config.attn_pdrop,
        )

        # Only true when flash attention was requested and is actually in use
        self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
        self.layer_idx = layer_idx
        self.return_residual = return_residual
        self.checkpointing = checkpointing

    def _forward_self_attn(
        self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]
    ) -> torch.FloatTensor:
        """Run attention on a packed `qkv` projection of `x` (no key/value cache)."""
        qkv = self.Wqkv(x)
        qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)

        if self.rotary_dim > 0:
            qkv = self.rotary_emb(qkv)

        if self.flash_attn:
            batch_size, seqlen = qkv.shape[0], qkv.shape[1]

            cu_seqlens, max_seqlen = None, None
            if key_padding_mask is not None:
                # If `key_padding_mask` is supplied, we need to unpad the input and retrieve
                # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
                qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)

            if self.checkpointing:
                # NOTE(review): keyword arguments are passed through `checkpoint` here —
                # confirm the targeted torch version accepts them.
                attn_output = torch.utils.checkpoint.checkpoint(
                    self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
                )
            else:
                attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device)

            # If `key_padding_mask` is supplied, we need to pad the output back to the original shape
            if key_padding_mask is not None:
                return pad_input(attn_output, indices, batch_size, seqlen)

            return attn_output

        if self.checkpointing:
            # NOTE(review): see the note on keyword arguments and `checkpoint` above
            return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)

        return self.inner_attn(qkv, key_padding_mask=key_padding_mask)

    def _forward_cross_attn(
        self,
        x: torch.FloatTensor,
        past_key_values: Optional[InferenceParams],
        key_padding_mask: Optional[torch.BoolTensor],
    ) -> torch.FloatTensor:
        """Run attention with separate `q` and `kv`, optionally through the key/value cache."""
        batch_size = x.shape[0]

        qkv = self.Wqkv(x)

        # The first `n_head * head_dim` features are the queries, the rest the packed keys/values
        q_size = self.n_head * self.head_dim
        q = rearrange(qkv[..., :q_size], "... (h d) -> ... h d", d=self.head_dim)
        kv = rearrange(qkv[..., q_size:], "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)

        seqlen_offset = past_key_values.seqlen_offset if past_key_values is not None else 0
        # With a non-zero offset, causal masking is explicitly disabled; otherwise `None`
        # lets the inner attention layer use its own default
        causal = None if seqlen_offset == 0 else False
        if self.rotary_dim > 0:
            q, kv = self.rotary_emb(q, kv=kv, seqlen_offset=seqlen_offset)

        if past_key_values is not None:
            kv = _update_kv_cache(kv, past_key_values, self.layer_idx)

        if self.flash_attn:
            batch_size, seqlen_q = q.shape[0], q.shape[1]
            seqlen_k = kv.shape[1]

            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = None, None, None, None
            if key_padding_mask is not None:
                kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)

                if seqlen_q == 1:
                    key_padding_mask = torch.ones(batch_size, 1, device=q.device)
                elif seqlen_q != seqlen_k:
                    key_padding_mask = key_padding_mask[:, -seqlen_q:]

                q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)

            flash_kwargs = {
                "causal": causal,
                "cu_seqlens": cu_seqlens_q,
                "max_seqlen": max_seqlen_q,
                "cu_seqlens_k": cu_seqlens_k,
                "max_seqlen_k": max_seqlen_k,
            }
            if self.checkpointing:
                # NOTE(review): keyword arguments are passed through `checkpoint` here —
                # confirm the targeted torch version accepts them.
                attn_output = torch.utils.checkpoint.checkpoint(self.inner_cross_attn, q, kv, **flash_kwargs)
            else:
                attn_output = self.inner_cross_attn(q, kv, **flash_kwargs)

            if key_padding_mask is not None:
                return pad_input(attn_output, indices_q, batch_size, max_seqlen_q)

            return attn_output

        if self.checkpointing:
            # NOTE(review): see the note on keyword arguments and `checkpoint` above
            return torch.utils.checkpoint.checkpoint(
                self.inner_cross_attn,
                q,
                kv,
                key_padding_mask=key_padding_mask,
                causal=causal,
            )

        return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)

    def forward(
        self,
        x: torch.FloatTensor,
        past_key_values: Optional[InferenceParams] = None,
        attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
        **kwargs,
    ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]:
        """Attend over `x` and project the result.

        Args:
            x: Input hidden states.
            past_key_values: Key/value cache; when given, attention runs through the cache.
            attention_mask: Optional mask, converted to boolean before use.

        Returns:
            The projected attention output, or `(output, x)` when `return_residual` is set.

        """
        if attention_mask is not None:
            attention_mask = attention_mask.bool()

        if self.n_head == self.n_head_kv and past_key_values is None:
            # MHA without a cache: plain self-attention
            attn_output = self._forward_self_attn(x, attention_mask)
        else:
            # Either cached values may be present (MHA with `past_key_values`) or the head
            # counts differ (MQA / GQA); both cases go through cross-attention because
            # `q` and `kv` lengths might be different
            attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)

        output = rearrange(attn_output, "... h d -> ... (h d)")
        output = self.out_proj(output)

        return output if not self.return_residual else (output, x)
768
+
769
+
770
class ParallelBlock(nn.Module):
    """Parallel block.

    This block applies the mixer (attention) and the mixture-of-experts feed-forward
    layers in parallel to the same normalized input (layout used in GPT-J and CodeGen).

    Args:
        config: Model configuration; `n_embd`, `layer_norm_epsilon` and `resid_pdrop`
            are read here and `config` is forwarded to the mixer and the experts.
        block_idx: Index of this block, forwarded to the mixer as its `layer_idx`.

    """

    def __init__(
        self,
        config: PretrainedConfig,
        block_idx: Optional[int] = None,
    ) -> None:
        super().__init__()

        self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.block_idx = block_idx

        self.mixer = MHA(config, layer_idx=block_idx)
        self.moe = MoE(config)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        """Return `dropout(mixer(ln(h))) + dropout(moe(ln(h))) + h`."""
        residual = hidden_states
        hidden_states = self.ln(hidden_states)

        attn_outputs = self.mixer(
            hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
        )
        # The mixer may return a tuple; only its first element is the attention output
        if isinstance(attn_outputs, tuple):
            attn_outputs = attn_outputs[0]

        attn_outputs = self.resid_dropout(attn_outputs)
        feed_forward_hidden_states = self.resid_dropout(self.moe(hidden_states))

        return attn_outputs + feed_forward_hidden_states + residual
815
+
816
+
817
class CausalLMHead(nn.Module):
    """Causal Language Modeling head.

    Reference:
        Improving Language Understanding by Generative Pre-Training.
        https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.

    Args:
        config: Model configuration; `n_embd`, `layer_norm_epsilon` and `vocab_size` are read from it.

    """

    def __init__(self, config: PretrainedConfig) -> None:
        super().__init__()

        self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.linear = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Layer-normalize `hidden_states` and project them to vocabulary logits."""
        normalized = self.ln(hidden_states)

        # The logits are always returned in fp32, whatever the dtype of the hidden states
        return self.linear(normalized).to(torch.float32)
837
+
838
+
839
class CausalLMLoss(nn.Module):
    """Causal Language Modeling loss.

    Reference:
        Improving Language Understanding by Generative Pre-Training.
        https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.

    Args:
        shift_labels: If `True`, the logits at position `t` are scored against the label at `t + 1`.

    """

    def __init__(self, shift_labels: bool = True) -> None:
        super().__init__()

        self.shift_labels = shift_labels
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor:
        """Return the cross-entropy between `logits` and `labels` over all positions."""
        if self.shift_labels:
            # Drop the last prediction and the first label so each position predicts the next token
            logits = logits[..., :-1, :].contiguous()
            labels = labels[..., 1:].contiguous()

        # `reshape` (unlike `view`) also accepts non-contiguous tensors when no shift happened
        return self.loss_fct(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
862
+
863
+
864
class PhiPreTrainedModel(PreTrainedModel):
    """Phi pre-trained model.

    Base class holding the weight initialization and the generation-input preparation
    shared by the Phi models.

    """

    config_class = PhiConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = False
    _no_split_modules = ["ParallelBlock"]

    def __init__(self, *inputs, **kwargs) -> None:
        super().__init__(*inputs, **kwargs)

    def _init_weights(self, module: nn.Module) -> None:
        """Initialize `module` in place according to its type."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            if module.bias is not None:
                module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Build the model inputs for one generation step.

        When `past_key_values` is not an `InferenceParams` instance (which includes
        `None`), a fresh cache is created and the full `input_ids` are passed on.
        Otherwise the cache offset is advanced and only the last token is passed on.

        Returns:
            Dictionary with the keys `input_ids`, `past_key_values` and `attention_mask`.

        """
        if not isinstance(past_key_values, InferenceParams):
            past_key_values = InferenceParams(
                max_seqlen=self.config.n_positions,
                max_batch_size=input_ids.shape[0],
                seqlen_offset=0,
                batch_size_offset=0,
                key_value_memory_dict={},
                lengths_per_sample=None,
            )
        else:
            # Assume that `past_key_values` has cached all tokens up to the last token in `input_ids`
            past_key_values.seqlen_offset = input_ids.shape[1] - 1
            input_ids = input_ids[:, -1].unsqueeze(-1)

        return {
            "input_ids": input_ids,
            "past_key_values": past_key_values,
            "attention_mask": attention_mask,
        }
915
+
916
+
917
class PhiModel(PhiPreTrainedModel):
    """Phi model.

    Token embedding followed by `config.n_layer` `ParallelBlock` layers. The output is
    the hidden states of the last block; no final normalization is applied here.

    """

    _keys_to_ignore_on_load_missing = [""]
    _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]

    def __init__(self, config: PhiConfig) -> None:
        super().__init__(config)

        self.embd = Embedding(config)
        self.h = nn.ModuleList([ParallelBlock(config, block_idx=i) for i in range(config.n_layer)])
        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.embd.wte

    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
        self.embd.wte = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
    ) -> torch.FloatTensor:
        """Embed `input_ids` and run them through every block in order."""
        hidden_states = self.embd(input_ids)

        # The same `past_key_values` and `attention_mask` are handed to every block
        for layer in self.h:
            hidden_states = layer(
                hidden_states,
                past_key_values=past_key_values,
                attention_mask=attention_mask,
            )

        return hidden_states
953
+
954
+
955
class PhiForCausalLM(PhiPreTrainedModel):
    """Phi for Causal Language Modeling.

    Wraps a `PhiModel` with a `CausalLMHead` and, when labels are given, a `CausalLMLoss`.

    """

    _keys_to_ignore_on_load_missing = [""]
    _keys_to_ignore_on_load_unexpected = [r"transformer\.h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]

    def __init__(self, config: PhiConfig) -> None:
        super().__init__(config)

        self.transformer = PhiModel(config)
        self.lm_head = CausalLMHead(config)
        self.loss = CausalLMLoss()

        self.post_init()

    def get_output_embeddings(self) -> nn.Linear:
        return self.lm_head.linear

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.lm_head.linear = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
        attention_mask: Optional[torch.BoolTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        """Compute the logits and, when `labels` is given, the language-modeling loss.

        Returns:
            `CausalLMOutputWithPast` carrying `loss` (`None` without labels), `logits`
            and the `past_key_values` object that was passed in.

        """
        hidden_states = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask)
        lm_logits = self.lm_head(hidden_states)

        loss = self.loss(lm_logits, labels) if labels is not None else None

        return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)