vince62s committed on
Commit
457b9cc
1 Parent(s): d70f024

Upload 5 files

Files changed (5)
  1. modeling_phi.py +967 -0
  2. special_tokens_map.json +30 -0
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +340 -0
  5. vocab.json +0 -0
modeling_phi.py ADDED
@@ -0,0 +1,967 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ #
4
+ # Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
5
+ # Licensed under the BSD 3-Clause License.
6
+
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ from dataclasses import dataclass, field
11
+ from typing import Any, Dict, Optional, Tuple, Union
12
+
13
+ import torch
14
+ import torch.nn as nn
15
+ from einops import rearrange, repeat
16
+ from transformers import PretrainedConfig, PreTrainedModel
17
+ from transformers.activations import ACT2FN
18
+ from transformers.modeling_outputs import CausalLMOutputWithPast
19
+
20
+ from .configuration_phi import PhiConfig
21
+
22
+ try:
23
+ from flash_attn.bert_padding import pad_input, unpad_input
24
+ from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
25
+ from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention
26
+ from flash_attn.ops.fused_dense import FusedDense
27
+ except ImportError:
28
+ pad_input, unpad_input = None, None
29
+ FlashRotaryEmbedding = None
30
+ FlashSelfAttention, FlashCrossAttention = None, None
31
+ FusedDense = None
32
+
33
+
34
+ @dataclass
35
+ class InferenceParams:
36
+ """Inference parameters passed to model to efficiently calculate
37
+ and store context during inference.
38
+
39
+ Reference:
40
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py.
41
+
42
+ Args:
43
+ max_seqlen: Maximum sequence length.
44
+ max_batch_size: Maximum batch size.
45
+ seqlen_offset: Sequence length offset.
46
+ batch_size_offset: Batch size offset.
47
+ key_value_memory_dict: Key value memory dictionary.
48
+ lengths_per_sample: Lengths per sample.
49
+
50
+ """
51
+
52
+ max_seqlen: int = field(metadata={"help": "Maximum sequence length."})
53
+
54
+ max_batch_size: int = field(metadata={"help": "Maximum batch size."})
55
+
56
+ seqlen_offset: int = field(default=0, metadata={"help": "Sequence length offset."})
57
+
58
+ batch_size_offset: int = field(default=0, metadata={"help": "Batch size offset."})
59
+
60
+ key_value_memory_dict: Dict[str, Any] = field(
61
+ default_factory=dict, metadata={"help": "Key value memory dictionary."}
62
+ )
63
+
64
+ lengths_per_sample: torch.Tensor = field(default=None, metadata={"help": "Lengths per sample."})
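+ # Note: `prepare_inputs_for_generation` (defined further below) builds this object on the
+ # first decoding step with `max_seqlen=config.n_positions` and `max_batch_size=input_ids.shape[0]`,
+ # then advances `seqlen_offset` so that subsequent steps only feed the most recent token.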
65
+
66
+
67
+ class Embedding(nn.Module):
68
+ """Token embedding with dropout."""
69
+
70
+ def __init__(self, config: PretrainedConfig) -> None:
71
+ super().__init__()
72
+
73
+ self.wte = nn.Embedding(config.vocab_size, config.n_embd)
74
+ self.drop = nn.Dropout(config.embd_pdrop)
75
+
76
+ def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
77
+ input_shape = input_ids.size()
78
+ input_ids = input_ids.view(-1, input_shape[-1])
79
+
80
+ hidden_states = self.wte(input_ids)
81
+ hidden_states = self.drop(hidden_states)
82
+
83
+ return hidden_states
84
+
85
+
86
+ def _apply_rotary_emb(
87
+ x: torch.FloatTensor,
88
+ cos: torch.FloatTensor,
89
+ sin: torch.FloatTensor,
90
+ ) -> torch.FloatTensor:
91
+ _, seqlen, _, _ = x.shape
92
+ _, rotary_dim = cos.shape
93
+ rotary_dim *= 2
94
+
95
+ x_rot = x[:, :, :, :rotary_dim]
96
+ x_pass = x[:, :, :, rotary_dim:]
97
+
98
+ x1, x2 = x_rot.chunk(2, dim=-1)
99
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
100
+ x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]]
101
+
102
+ x_rot = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], axis=-1).to(x.dtype)
103
+
104
+ return torch.cat([x_rot, x_pass], axis=-1)
105
+
106
+
107
+ def _apply_rotary_emb_kv(
108
+ kv: torch.FloatTensor,
109
+ cos: torch.FloatTensor,
110
+ sin: torch.FloatTensor,
111
+ cos_k: Optional[torch.FloatTensor] = None,
112
+ sin_k: Optional[torch.FloatTensor] = None,
113
+ ) -> torch.FloatTensor:
114
+ _, seqlen, _, _, _ = kv.shape
115
+ _, rotary_dim = cos.shape
116
+ rotary_dim *= 2
117
+
118
+ k_rot = kv[:, :, 0, :, :rotary_dim]
119
+ k_pass = kv[:, :, 0, :, rotary_dim:]
120
+
121
+ k1, k2 = k_rot.chunk(2, dim=-1)
122
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
123
+ k1, k2, c, s = [t.to(dtype=torch.float32) for t in [k1, k2, c, s]]
124
+
125
+ k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(kv.dtype)
126
+
127
+ return torch.cat(
128
+ [
129
+ torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
130
+ kv[:, :, 1:2, :, :],
131
+ ],
132
+ axis=2,
133
+ )
134
+
135
+
136
+ def _apply_rotary_emb_qkv(
137
+ qkv: torch.FloatTensor,
138
+ cos: torch.FloatTensor,
139
+ sin: torch.FloatTensor,
140
+ cos_k: Optional[torch.FloatTensor] = None,
141
+ sin_k: Optional[torch.FloatTensor] = None,
142
+ ) -> torch.FloatTensor:
143
+ _, seqlen, _, _, _ = qkv.shape
144
+ _, rotary_dim = cos.shape
145
+ rotary_dim *= 2
146
+
147
+ q_rot = qkv[:, :, 0, :, :rotary_dim]
148
+ q_pass = qkv[:, :, 0, :, rotary_dim:]
149
+
150
+ k_rot = qkv[:, :, 1, :, :rotary_dim]
151
+ k_pass = qkv[:, :, 1, :, rotary_dim:]
152
+
153
+ q1, q2 = q_rot.chunk(2, dim=-1)
154
+ k1, k2 = k_rot.chunk(2, dim=-1)
155
+ c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
156
+ q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]]
157
+
158
+ q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
159
+ k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
160
+
161
+ return torch.cat(
162
+ [
163
+ torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
164
+ torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
165
+ qkv[:, :, 2:3, :, :],
166
+ ],
167
+ axis=2,
168
+ )
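+ # Shape reference for the three rotary helpers above: `x`/`q` is (batch, seqlen, n_head, head_dim),
+ # `kv` is (batch, seqlen, 2, n_head, head_dim) and packed `qkv` is (batch, seqlen, 3, n_head, head_dim);
+ # `cos`/`sin` have shape (seqlen, rotary_dim // 2), and only the first `rotary_dim` channels of
+ # each head are rotated while the remaining channels pass through unchanged.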
169
+
170
+
171
+ class RotaryEmbedding(nn.Module):
172
+ """Rotary positional embedding (RoPE).
173
+
174
+ Reference:
175
+ RoFormer: Enhanced Transformer with Rotary Position Embedding.
176
+ https://arxiv.org/pdf/2104.09864.pdf.
177
+
178
+ """
179
+
180
+ def __init__(
181
+ self,
182
+ dim: int,
183
+ base: int = 10000,
184
+ scale_base: Optional[float] = None,
185
+ pos_idx_in_fp32: bool = True,
186
+ max_position_embeddings: int = 2048,
187
+ device: Optional[str] = None,
188
+ **kwargs,
189
+ ) -> None:
190
+ super().__init__()
191
+
192
+ if scale_base is not None:
193
+ raise NotImplementedError
194
+
195
+ self.dim = dim
196
+ self.base = float(base)
197
+ self.scale_base = scale_base
198
+ self.pos_idx_in_fp32 = pos_idx_in_fp32
199
+ self.max_position_embeddings = max_position_embeddings
200
+ self.device = device
201
+
202
+ # Generate and save the inverse frequency buffer (non-trainable)
203
+ inv_freq = self._compute_inv_freq(device)
204
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
205
+
206
+ # Generate and save the scale buffer (non-trainable)
207
+ scale = (
208
+ (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
209
+ if scale_base is not None
210
+ else None
211
+ )
212
+ self.register_buffer("scale", scale, persistent=False)
213
+
214
+ # Initialize cached attributes since ONNX can't rely on dynamic initialization
215
+ self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32)
216
+
217
+ def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor:
218
+ return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
219
+
220
+ def _update_cos_sin_cache(
221
+ self,
222
+ seqlen: int,
223
+ device: Optional[str] = None,
224
+ dtype: Optional[torch.dtype] = None,
225
+ ) -> None:
226
+ self._seq_len_cached = seqlen
227
+
228
+ # fp32 is preferred since the output of `torch.arange` can be quite large
229
+ # and bf16 would lose a lot of precision
230
+ if self.pos_idx_in_fp32:
231
+ t = torch.arange(seqlen, device=device, dtype=torch.float32)
232
+ if self.inv_freq.dtype != torch.float32:
233
+ inv_freq = self._compute_inv_freq(device=device)
234
+ else:
235
+ inv_freq = self.inv_freq
236
+ else:
237
+ t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
238
+ inv_freq = self.inv_freq
239
+
240
+ # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP
241
+ freqs = torch.outer(t, inv_freq)
242
+ if self.scale is None:
243
+ self._cos_cached = torch.cos(freqs).to(dtype)
244
+ self._sin_cached = torch.sin(freqs).to(dtype)
245
+ else:
246
+ power = (
247
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
248
+ ) / self.scale_base
249
+ scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
250
+
251
+ # Force the scale multiplication to happen in fp32
252
+ self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
253
+ self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
254
+ self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
255
+ self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
256
+
257
+ def forward(
258
+ self,
259
+ qkv: torch.Tensor,
260
+ kv: Optional[torch.Tensor] = None,
261
+ seqlen_offset: int = 0,
262
+ **kwargs,
263
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
264
+ if (
265
+ self._seq_len_cached < qkv.shape[1] + seqlen_offset
266
+ or self._cos_cached.device != qkv.device
267
+ or self._cos_cached.dtype != qkv.dtype
268
+ or (self.training and self._cos_cached.is_inference())
269
+ ):
270
+ self._update_cos_sin_cache(qkv.shape[1] + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
271
+
272
+ if kv is None:
273
+ return _apply_rotary_emb_qkv(
274
+ qkv,
275
+ self._cos_cached[seqlen_offset:],
276
+ self._sin_cached[seqlen_offset:],
277
+ )
278
+ else:
280
+ q = _apply_rotary_emb(
281
+ qkv,
282
+ self._cos_cached[seqlen_offset:],
283
+ self._sin_cached[seqlen_offset:],
284
+ )
287
+ kv = _apply_rotary_emb_kv(
288
+ kv,
289
+ self._cos_cached[seqlen_offset:],
290
+ self._sin_cached[seqlen_offset:],
291
+ )
292
+
293
+ return q, kv
294
+
295
+
296
+ class MLP(nn.Module):
297
+ """Multi-Layer Perceptron.
298
+
299
+ Reference:
300
+ Attention Is All You Need.
301
+ https://arxiv.org/pdf/1706.03762.pdf.
302
+
303
+ """
304
+
305
+ def __init__(
306
+ self,
307
+ config: PretrainedConfig,
308
+ n_inner: Optional[int] = None,
309
+ act_fn: Optional[str] = None,
310
+ ) -> None:
311
+ super().__init__()
312
+
313
+ act_fn = config.activation_function if act_fn is None else act_fn
314
+
315
+ n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
316
+ n_inner = n_inner if n_inner is not None else 4 * config.n_embd
317
+
318
+ self.fc1 = nn.Linear(config.n_embd, n_inner)
319
+ self.fc2 = nn.Linear(n_inner, config.n_embd)
320
+ self.act = ACT2FN[act_fn]
321
+
322
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
323
+ hidden_states = self.fc1(hidden_states)
324
+ hidden_states = self.act(hidden_states)
325
+ hidden_states = self.fc2(hidden_states)
326
+
327
+ return hidden_states
328
+
329
+
330
+ class SelfAttention(nn.Module):
331
+ """Self-attention layer (compatible with PyTorch).
332
+
333
+ Reference:
334
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.
335
+
336
+ """
337
+
338
+ def __init__(
339
+ self,
340
+ causal: bool = True,
341
+ softmax_scale: Optional[float] = None,
342
+ attention_dropout: float = 0.0,
343
+ ) -> None:
344
+ super().__init__()
345
+
346
+ self.causal = causal
347
+ self.softmax_scale = softmax_scale
348
+ self.drop = nn.Dropout(attention_dropout)
349
+
350
+ @torch.autocast("cpu", enabled=False)
351
+ @torch.autocast("cuda", enabled=False)
352
+ def forward(
353
+ self,
354
+ qkv: torch.FloatTensor,
355
+ causal: Optional[bool] = None,
356
+ key_padding_mask: Optional[torch.BoolTensor] = None,
357
+ **kwargs,
358
+ ) -> torch.FloatTensor:
359
+ batch_size, seqlen = qkv.shape[0], qkv.shape[1]
360
+ q, k, v = qkv.unbind(dim=2)
361
+
362
+ q = q.to(torch.float32)
363
+ k = k.to(torch.float32)
364
+
365
+ causal = self.causal if causal is None else causal
366
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
367
+
368
+ # Autocast is manually disabled to avoid `torch.einsum` performing the operation
369
+ # using float16, which might lead to overflow
370
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
371
+
372
+ if key_padding_mask is not None:
373
+ padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device)
374
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
375
+
376
+ scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
377
+
378
+ if causal:
379
+ causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
380
+ scores = scores + causal_mask.to(dtype=scores.dtype)
381
+
382
+ attention = torch.softmax(scores, dim=-1).to(v.dtype)
383
+ attention = self.drop(attention)
384
+
385
+ output = torch.einsum("bhts,bshd->bthd", attention, v)
386
+
387
+ return output
388
+
389
+
390
+ class CrossAttention(nn.Module):
391
+ """Cross-attention layer (compatible with PyTorch).
392
+
393
+ Reference:
394
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.
395
+
396
+ """
397
+
398
+ def __init__(
399
+ self,
400
+ causal: bool = True,
401
+ softmax_scale: Optional[float] = None,
402
+ attention_dropout: float = 0.0,
403
+ ) -> None:
404
+ super().__init__()
405
+
406
+ self.causal = causal
407
+ self.softmax_scale = softmax_scale
408
+ self.drop = nn.Dropout(attention_dropout)
409
+
410
+ @torch.autocast("cpu", enabled=False)
411
+ @torch.autocast("cuda", enabled=False)
412
+ def forward(
413
+ self,
414
+ q: torch.FloatTensor,
415
+ kv: torch.FloatTensor,
416
+ causal: Optional[bool] = None,
417
+ key_padding_mask: Optional[torch.BoolTensor] = None,
418
+ **kwargs,
419
+ ) -> torch.FloatTensor:
420
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
421
+ seqlen_k = kv.shape[1]
422
+
423
+ if kv.shape[3] != q.shape[2]:
424
+ kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
425
+ k, v = kv.unbind(dim=2)
426
+
427
+ q = q.to(torch.float32)
428
+ k = k.to(torch.float32)
429
+
430
+ causal = self.causal if causal is None else causal
431
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
432
+
433
+ # Autocast is manually disabled to avoid `torch.einsum` performing the operation
434
+ # using float16, which might lead to overflow
435
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
436
+
437
+ if key_padding_mask is not None:
438
+ padding_mask = torch.full(
439
+ (batch_size, seqlen_k),
440
+ -10000.0,
441
+ dtype=scores.dtype,
442
+ device=scores.device,
443
+ )
444
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
445
+
446
+ scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
447
+
448
+ if causal:
449
+ rows = rearrange(torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1")
450
+ cols = torch.arange(seqlen_k, device=k.device, dtype=torch.long)
451
+ causal_mask = cols > rows + seqlen_k - seqlen_q
452
+
453
+ scores = scores.masked_fill(causal_mask, -10000.0)
454
+
455
+ attention = torch.softmax(scores, dim=-1).to(v.dtype)
456
+ attention = self.drop(attention)
457
+
458
+ output = torch.einsum("bhts,bshd->bthd", attention, v)
459
+
460
+ return output
461
+
462
+
463
+ def _find_mha_dims(
464
+ config: PretrainedConfig,
465
+ n_head: Optional[int] = None,
466
+ n_head_kv: Optional[int] = None,
467
+ head_dim: Optional[int] = None,
468
+ ) -> Tuple[int, int, int]:
469
+ if n_head is None and head_dim is None:
470
+ head_dim = config.n_embd // config.n_head
471
+ n_head = config.n_head
472
+ elif n_head is None or head_dim is None:
473
+ raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")
474
+
475
+ if n_head_kv is None:
476
+ n_head_kv = getattr(config, "n_head_kv", None) or n_head
477
+
478
+ return n_head, n_head_kv, head_dim
479
+
480
+
481
+ def _update_kv_cache(kv: torch.FloatTensor, inference_params: InferenceParams, layer_idx: int) -> torch.FloatTensor:
482
+ num_heads, head_dim = kv.shape[-2:]
483
+
484
+ if layer_idx not in inference_params.key_value_memory_dict:
485
+ inference_params.key_value_memory_dict[layer_idx] = torch.empty(
486
+ inference_params.max_batch_size,
487
+ inference_params.max_seqlen,
488
+ 2,
489
+ num_heads,
490
+ head_dim,
491
+ dtype=kv.dtype,
492
+ device=kv.device,
493
+ )
494
+
495
+ batch_start = inference_params.batch_size_offset
496
+ batch_end = batch_start + kv.shape[0]
497
+
498
+ sequence_start = inference_params.seqlen_offset
499
+ sequence_end = sequence_start + kv.shape[1]
500
+
501
+ # When the current sequence length is equal to or larger than the maximum sequence length,
502
+ # we need to concatenate the current `kv` with the cached `kv` to expand its length
503
+ if sequence_end >= inference_params.max_seqlen:
504
+ inference_params.key_value_memory_dict[layer_idx] = torch.concatenate((inference_params.key_value_memory_dict[layer_idx], kv), dim=1)
505
+
506
+ inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, sequence_start:sequence_end, ...] = kv
507
+ kv = inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, :sequence_end, ...]
508
+
509
+ return kv
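+ # Cache layout: `key_value_memory_dict[layer_idx]` is allocated as
+ # (max_batch_size, max_seqlen, 2, num_heads, head_dim); the slice returned above spans all
+ # cached positions [0, sequence_end) so cross-attention can attend to the full prefix.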
510
+
511
+
512
+ class MHA(nn.Module):
513
+ """Multi-head attention layer."""
514
+
515
+ def __init__(
516
+ self,
517
+ config: PretrainedConfig,
518
+ dtype: Optional[torch.dtype] = None,
519
+ device: Optional[str] = None,
520
+ rotary_dim: Optional[int] = None,
521
+ rotary_base: float = 10000.0,
522
+ rotary_scale_base: Optional[float] = None,
523
+ n_head: Optional[int] = None,
524
+ n_head_kv: Optional[int] = None,
525
+ head_dim: Optional[int] = None,
526
+ bias: bool = True,
527
+ causal: bool = True,
528
+ softmax_scale: Optional[float] = None,
529
+ layer_idx: Optional[int] = None,
530
+ return_residual: bool = False,
531
+ checkpointing: bool = False,
532
+ ) -> None:
533
+ super().__init__()
534
+
535
+ # Rotary embedding
536
+ self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
537
+ if self.rotary_dim > 0:
538
+ rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding
539
+ if rotary_cls is None:
540
+ rotary_cls = RotaryEmbedding
541
+
542
+ rotary_kwargs = {}
543
+ if rotary_cls is RotaryEmbedding:
544
+ rotary_kwargs["max_position_embeddings"] = config.n_positions
545
+
546
+ self.rotary_emb = rotary_cls(
547
+ self.rotary_dim,
548
+ base=rotary_base,
549
+ scale_base=rotary_scale_base,
550
+ device=device,
551
+ **rotary_kwargs,
552
+ )
553
+
554
+ # Attention head dimensions and projections
555
+ self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
556
+ config, n_head=n_head, n_head_kv=n_head_kv, head_dim=head_dim
557
+ )
558
+ op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
559
+ hidden_size = config.n_embd
560
+
561
+ linear_cls = FusedDense if config.fused_dense else nn.Linear
562
+ if linear_cls is None:
563
+ linear_cls = nn.Linear
564
+
565
+ self.Wqkv = linear_cls(hidden_size, op_size, bias=bias, device=device, dtype=dtype)
566
+ self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)
567
+
568
+ # Attention
569
+ attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention
570
+ if attn_cls is None:
571
+ attn_cls = SelfAttention
572
+
573
+ cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention
574
+ if cross_attn_cls is None:
575
+ cross_attn_cls = CrossAttention
576
+
577
+ self.inner_attn = attn_cls(
578
+ causal=causal,
579
+ softmax_scale=softmax_scale,
580
+ attention_dropout=config.attn_pdrop,
581
+ )
582
+ self.inner_cross_attn = cross_attn_cls(
583
+ causal=causal,
584
+ softmax_scale=softmax_scale,
585
+ attention_dropout=config.attn_pdrop,
586
+ )
587
+
588
+ self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
589
+ self.layer_idx = layer_idx
590
+ self.return_residual = return_residual
591
+ self.checkpointing = checkpointing
592
+
593
+ def _forward_self_attn(
594
+ self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]
595
+ ) -> torch.FloatTensor:
596
+ qkv = self.Wqkv(x)
597
+
598
+ qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
599
+
600
+ if self.rotary_dim > 0:
601
+ qkv = self.rotary_emb(qkv)
602
+
603
+ if self.flash_attn:
604
+ batch_size, seqlen = qkv.shape[0], qkv.shape[1]
605
+
606
+ cu_seqlens, max_seqlen = None, None
607
+ if key_padding_mask is not None:
608
+ # If `key_padding_mask` is supplied, we need to unpad the input and retrieve
609
+ # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
610
+ qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)
611
+
612
+ if self.checkpointing:
613
+ attn_output = torch.utils.checkpoint.checkpoint(
614
+ self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
615
+ )
616
+ else:
617
+ attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device)
618
+
619
+ # If `key_padding_mask` is supplied, we need to pad the output back to the original shape
620
+ return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output
621
+
622
+ if self.checkpointing:
623
+ return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)
624
+
625
+ return self.inner_attn(qkv, key_padding_mask=key_padding_mask)
626
+
627
+ def _forward_cross_attn(
628
+ self,
629
+ x: torch.FloatTensor,
630
+ past_key_values: Optional[InferenceParams],
631
+ key_padding_mask: Optional[torch.BoolTensor],
632
+ ) -> torch.FloatTensor:
633
+ batch_size = x.shape[0]
634
+
635
+ qkv = self.Wqkv(x)
636
+
637
+ q = qkv[..., : self.n_head * self.head_dim]
638
+
639
+ q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
640
+
641
+ kv = qkv[..., self.n_head * self.head_dim :]
642
+
643
+ kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
644
+
645
+ seqlen_offset = past_key_values.seqlen_offset if past_key_values is not None else 0
646
+ causal = None if seqlen_offset == 0 else False
647
+ if self.rotary_dim > 0:
648
+ q, kv = self.rotary_emb(q, kv=kv, seqlen_offset=seqlen_offset)
649
+
650
+ if past_key_values is not None:
651
+ kv = _update_kv_cache(kv, past_key_values, self.layer_idx)
652
+
653
+ if self.flash_attn:
654
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
655
+ seqlen_k = kv.shape[1]
656
+
657
+ cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = (
658
+ None,
659
+ None,
660
+ None,
661
+ None,
662
+ )
663
+ if key_padding_mask is not None:
664
+ kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)
665
+
666
+ if seqlen_q == 1:
667
+ key_padding_mask = torch.ones(batch_size, 1, device=q.device)
668
+ elif seqlen_q != seqlen_k:
669
+ key_padding_mask = key_padding_mask[:, -seqlen_q:]
670
+
671
+ q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)
672
+
673
+ if self.checkpointing:
674
+ attn_output = torch.utils.checkpoint.checkpoint(
675
+ self.inner_cross_attn,
676
+ q,
677
+ kv,
678
+ causal=causal,
679
+ cu_seqlens=cu_seqlens_q,
680
+ max_seqlen=max_seqlen_q,
681
+ cu_seqlens_k=cu_seqlens_k,
682
+ max_seqlen_k=max_seqlen_k,
683
+ )
684
+ else:
685
+ attn_output = self.inner_cross_attn(
686
+ q,
687
+ kv,
688
+ causal=causal,
689
+ cu_seqlens=cu_seqlens_q,
690
+ max_seqlen=max_seqlen_q,
691
+ cu_seqlens_k=cu_seqlens_k,
692
+ max_seqlen_k=max_seqlen_k,
693
+ )
694
+
695
+ return (
696
+ pad_input(attn_output, indices_q, batch_size, max_seqlen_q)
697
+ if key_padding_mask is not None
698
+ else attn_output
699
+ )
700
+
701
+ if self.checkpointing:
702
+ return torch.utils.checkpoint.checkpoint(
703
+ self.inner_cross_attn,
704
+ q,
705
+ kv,
706
+ key_padding_mask=key_padding_mask,
707
+ causal=causal,
708
+ )
709
+
710
+ return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)
711
+
712
+ def forward(
713
+ self,
714
+ x: torch.FloatTensor,
715
+ past_key_values: Optional[InferenceParams] = None,
716
+ attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
717
+ **kwargs,
718
+ ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
719
+ if attention_mask is not None:
720
+ attention_mask = attention_mask.bool()
721
+ else:
722
+ attention_mask = None
723
+
724
+ # MHA
725
+ if self.n_head == self.n_head_kv:
726
+ if past_key_values is None:
727
+ # If `past_key_values` are not supplied, we run self-attention
728
+ attn_output = self._forward_self_attn(x, attention_mask)
729
+ else:
730
+ # If `past_key_values` are supplied, it means that we might have cached values and
731
+ # could take advantage of cross-attention
732
+ attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
733
+ # MQA / GQA
734
+ else:
735
+ # Regardless of whether `past_key_values` is supplied, we always use cross-attention
736
+ # because `q` and `kv` lengths might be different
737
+ attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
738
+
739
+ output = rearrange(attn_output, "... h d -> ... (h d)")
740
+ output = self.out_proj(output)
743
+ return output if not self.return_residual else (output, x)
744
+
745
+
746
+ class ParallelBlock(nn.Module):
747
+ """Parallel block.
748
+
749
+ This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen).
750
+
751
+ """
752
+
753
+ def __init__(
754
+ self,
755
+ config: PretrainedConfig,
756
+ block_idx: Optional[int] = None,
757
+ ) -> None:
758
+ super().__init__()
759
+
760
+ self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
761
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
762
+ self.block_idx = block_idx
763
+
764
+ self.mixer = MHA(config, layer_idx=block_idx)
765
+ self.mlp = MLP(config)
766
+
767
+ def forward(
768
+ self,
769
+ hidden_states: torch.FloatTensor,
770
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
771
+ attention_mask: Optional[torch.BoolTensor] = None,
772
+ **kwargs,
773
+ ) -> torch.FloatTensor:
774
+ residual = hidden_states
775
+ hidden_states = self.ln(hidden_states)
776
+
777
+ attn_outputs = self.mixer(
778
+ hidden_states,
779
+ past_key_values=past_key_values,
780
+ attention_mask=attention_mask,
781
+ )
782
+ if isinstance(attn_outputs, tuple):
783
+ attn_outputs = attn_outputs[0]
784
+
785
+ attn_outputs = self.resid_dropout(attn_outputs)
786
+ feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
787
+
788
+ hidden_states = attn_outputs + feed_forward_hidden_states + residual
789
+
790
+ return hidden_states
791
+
792
+
793
+ class CausalLMHead(nn.Module):
794
+ """Causal Language Modeling head.
795
+
796
+ Reference:
797
+ Improving Language Understanding by Generative Pre-Training.
798
+ https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
799
+
800
+ """
801
+
802
+ def __init__(self, config: PretrainedConfig) -> None:
803
+ super().__init__()
804
+
805
+ self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
806
+ self.linear = nn.Linear(config.n_embd, config.vocab_size)
807
+
808
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
809
+ hidden_states = self.ln(hidden_states)
810
+ logits = self.linear(hidden_states).to(torch.float32)
811
+
812
+ return logits
813
+
814
+
815
+ class CausalLMLoss(nn.Module):
816
+ """Causal Language Modeling loss.
817
+
818
+ Reference:
819
+ Improving Language Understanding by Generative Pre-Training.
820
+ https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
821
+
822
+ """
823
+
824
+ def __init__(self, shift_labels: bool = True) -> None:
825
+ super().__init__()
826
+
827
+ self.shift_labels = shift_labels
828
+ self.loss_fct = nn.CrossEntropyLoss()
829
+
830
+ def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor:
831
+ if self.shift_labels:
832
+ logits = logits[..., :-1, :].contiguous()
833
+ labels = labels[..., 1:].contiguous()
834
+
835
+ loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
836
+
837
+ return loss
838
+
839
+
840
+ class PhiPreTrainedModel(PreTrainedModel):
841
+ """Phi pre-trained model."""
842
+
843
+ config_class = PhiConfig
844
+ base_model_prefix = "transformer"
845
+ supports_gradient_checkpointing = False
846
+ _no_split_modules = ["ParallelBlock"]
847
+
848
+ def __init__(self, *inputs, **kwargs) -> None:
849
+ super().__init__(*inputs, **kwargs)
850
+
851
+ def _init_weights(self, module: nn.Module) -> None:
852
+ if isinstance(module, (nn.Linear,)):
853
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
854
+ if module.bias is not None:
855
+ module.bias.data.zero_()
856
+ elif isinstance(module, nn.Embedding):
857
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
858
+ if module.padding_idx is not None:
859
+ module.weight.data[module.padding_idx].zero_()
860
+ elif isinstance(module, nn.LayerNorm):
861
+ if module.bias is not None:
862
+ module.bias.data.zero_()
863
+ module.weight.data.fill_(1.0)
864
+
865
+ def prepare_inputs_for_generation(
866
+ self,
867
+ input_ids: torch.LongTensor,
868
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
869
+ attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
870
+ **kwargs,
871
+ ) -> Dict[str, Any]:
872
+ if past_key_values is None or not (isinstance(past_key_values, InferenceParams)):
873
+ past_key_values = InferenceParams(
874
+ max_seqlen=self.config.n_positions,
875
+ max_batch_size=input_ids.shape[0],
876
+ seqlen_offset=0,
877
+ batch_size_offset=0,
878
+ key_value_memory_dict={},
879
+ lengths_per_sample=None,
880
+ )
881
+ else:
882
+ # Assume that `past_key_values` has cached all tokens up to the last token in `input_ids`
883
+ past_key_values.seqlen_offset = input_ids.shape[1] - 1
884
+ input_ids = input_ids[:, -1].unsqueeze(-1)
885
+
886
+ return {
887
+ "input_ids": input_ids,
888
+ "past_key_values": past_key_values,
889
+ "attention_mask": attention_mask,
890
+ }
891
+
892
+
893
+ class PhiModel(PhiPreTrainedModel):
894
+ """Phi model."""
895
+
896
+ _keys_to_ignore_on_load_missing = [""]
897
+ _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]
898
+
899
+ def __init__(self, config: PhiConfig) -> None:
900
+ super().__init__(config)
901
+
902
+ self.embd = Embedding(config)
903
+ self.h = nn.ModuleList([ParallelBlock(config, block_idx=i) for i in range(config.n_layer)])
904
+ self.gradient_checkpointing = False
905
+ self.post_init()
906
+
907
+ def get_input_embeddings(self) -> nn.Embedding:
908
+ return self.embd.wte
909
+
910
+ def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
911
+ self.embd.wte = new_embeddings
912
+
913
+ def forward(
914
+ self,
915
+ input_ids: torch.LongTensor,
916
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
917
+ attention_mask: Optional[torch.BoolTensor] = None,
918
+ ) -> torch.FloatTensor:
919
+ hidden_states = self.embd(input_ids)
920
+
921
+ for layer in self.h:
922
+ hidden_states = layer(
923
+ hidden_states,
924
+ past_key_values=past_key_values,
925
+ attention_mask=attention_mask,
926
+ )
927
+
928
+ return hidden_states
929
+
930
+
931
+ class PhiForCausalLM(PhiPreTrainedModel):
932
+ """Phi for Causal Language Modeling."""
933
+
934
+ _keys_to_ignore_on_load_missing = [""]
935
+ _keys_to_ignore_on_load_unexpected = [r"transformer\.h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]
936
+
937
+ def __init__(self, config: PhiConfig) -> None:
938
+ super().__init__(config)
939
+
940
+ self.transformer = PhiModel(config)
941
+ self.lm_head = CausalLMHead(config)
942
+ self.loss = CausalLMLoss()
943
+
944
+ self.post_init()
945
+
946
+ def get_output_embeddings(self) -> nn.Linear:
947
+ return self.lm_head.linear
948
+
949
+ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
950
+ self.lm_head.linear = new_embeddings
951
+
952
+ def forward(
953
+ self,
954
+ input_ids: torch.LongTensor,
955
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
956
+ attention_mask: Optional[torch.BoolTensor] = None,
957
+ labels: Optional[torch.LongTensor] = None,
958
+ **kwargs,
959
+ ) -> CausalLMOutputWithPast:
960
+ hidden_states = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask)
961
+ lm_logits = self.lm_head(hidden_states)
962
+
963
+ loss = None
964
+ if labels is not None:
965
+ loss = self.loss(lm_logits, labels)
966
+
967
+ return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)
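A minimal usage sketch for the file above (the repository id and the prompt are placeholders, not part of this commit). With `trust_remote_code=True`, transformers imports `modeling_phi.py` / `configuration_phi.py` from the repo and instantiates the `PhiForCausalLM` class defined above:

from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-namespace/phi-custom"  # hypothetical repo id, substitute the actual one
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)

inputs = tokenizer("def fibonacci(n):", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))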
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,340 @@
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "50257": {
13
+ "content": " ",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": false
19
+ },
20
+ "50258": {
21
+ "content": " ",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": false
27
+ },
28
+ "50259": {
29
+ "content": " ",
30
+ "lstrip": false,
31
+ "normalized": true,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": false
35
+ },
36
+ "50260": {
37
+ "content": " ",
38
+ "lstrip": false,
39
+ "normalized": true,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": false
43
+ },
44
+ "50261": {
45
+ "content": " ",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "50262": {
53
+ "content": " ",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ },
60
+ "50263": {
61
+ "content": " ",
62
+ "lstrip": false,
63
+ "normalized": true,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": false
67
+ },
68
+ "50264": {
69
+ "content": " ",
70
+ "lstrip": false,
71
+ "normalized": true,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": false
75
+ },
76
+ "50265": {
77
+ "content": " ",
78
+ "lstrip": false,
79
+ "normalized": true,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": false
83
+ },
84
+ "50266": {
85
+ "content": " ",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "50267": {
93
+ "content": " ",
94
+ "lstrip": false,
95
+ "normalized": true,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": false
99
+ },
100
+ "50268": {
101
+ "content": " ",
102
+ "lstrip": false,
103
+ "normalized": true,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": false
107
+ },
108
+ "50269": {
109
+ "content": " ",
110
+ "lstrip": false,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": false
115
+ },
116
+ "50270": {
117
+ "content": " ",
118
+ "lstrip": false,
119
+ "normalized": true,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "50271": {
125
+ "content": " ",
126
+ "lstrip": false,
127
+ "normalized": true,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "50272": {
133
+ "content": " ",
134
+ "lstrip": false,
135
+ "normalized": true,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "50273": {
141
+ "content": " ",
142
+ "lstrip": false,
143
+ "normalized": true,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "50274": {
149
+ "content": " ",
150
+ "lstrip": false,
151
+ "normalized": true,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "50275": {
157
+ "content": " ",
158
+ "lstrip": false,
159
+ "normalized": true,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "50276": {
165
+ "content": " ",
166
+ "lstrip": false,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "50277": {
173
+ "content": " ",
174
+ "lstrip": false,
175
+ "normalized": true,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ },
180
+ "50278": {
181
+ "content": " ",
182
+ "lstrip": false,
183
+ "normalized": true,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": false
187
+ },
188
+ "50279": {
189
+ "content": " ",
190
+ "lstrip": false,
191
+ "normalized": true,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": false
195
+ },
196
+ "50280": {
197
+ "content": " ",
198
+ "lstrip": false,
199
+ "normalized": true,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": false
203
+ },
204
+ "50281": {
205
+ "content": " ",
206
+ "lstrip": false,
207
+ "normalized": true,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": false
211
+ },
212
+ "50282": {
213
+ "content": " ",
214
+ "lstrip": false,
215
+ "normalized": true,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": false
219
+ },
220
+ "50283": {
221
+ "content": " ",
222
+ "lstrip": false,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": false
227
+ },
228
+ "50284": {
229
+ "content": " ",
230
+ "lstrip": false,
231
+ "normalized": true,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": false
235
+ },
236
+ "50285": {
237
+ "content": " ",
238
+ "lstrip": false,
239
+ "normalized": true,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": false
243
+ },
244
+ "50286": {
245
+ "content": " ",
246
+ "lstrip": false,
247
+ "normalized": true,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": false
251
+ },
252
+ "50287": {
253
+ "content": "\t\t\t\t\t\t\t\t\t",
254
+ "lstrip": false,
255
+ "normalized": true,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": false
259
+ },
260
+ "50288": {
261
+ "content": "\t\t\t\t\t\t\t\t",
262
+ "lstrip": false,
263
+ "normalized": true,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": false
267
+ },
268
+ "50289": {
269
+ "content": "\t\t\t\t\t\t\t",
270
+ "lstrip": false,
271
+ "normalized": true,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": false
275
+ },
276
+ "50290": {
277
+ "content": "\t\t\t\t\t\t",
278
+ "lstrip": false,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": false
283
+ },
284
+ "50291": {
285
+ "content": "\t\t\t\t\t",
286
+ "lstrip": false,
287
+ "normalized": true,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": false
291
+ },
292
+ "50292": {
293
+ "content": "\t\t\t\t",
294
+ "lstrip": false,
295
+ "normalized": true,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": false
299
+ },
300
+ "50293": {
301
+ "content": "\t\t\t",
302
+ "lstrip": false,
303
+ "normalized": true,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": false
307
+ },
308
+ "50294": {
309
+ "content": "\t\t",
310
+ "lstrip": false,
311
+ "normalized": true,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": false
315
+ },
316
+ "50295": {
317
+ "content": "<|im_end|>",
318
+ "lstrip": false,
319
+ "normalized": false,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": true
323
+ },
324
+ "50296": {
325
+ "content": "<|im_start|>",
326
+ "lstrip": false,
327
+ "normalized": false,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": false
331
+ }
332
+ },
333
+ "bos_token": "<|endoftext|>",
334
+ "clean_up_tokenization_spaces": true,
335
+ "eos_token": "<|im_end|>",
336
+ "model_max_length": 2048,
337
+ "pad_token": "<|endoftext|>",
338
+ "tokenizer_class": "CodeGenTokenizer",
339
+ "unk_token": "<|endoftext|>"
340
+ }
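The added `<|im_start|>` token (id 50296) and the eos token `<|im_end|>` (id 50295) suggest a ChatML-style prompt layout; a minimal sketch of building such a prompt (the exact chat template is an assumption, not specified anywhere in this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your-namespace/phi-custom")  # hypothetical repo id

# ChatML-style layout inferred from the special tokens above.
prompt = (
    "<|im_start|>user\n"
    "Explain rotary position embeddings in one sentence.<|im_end|>\n"
    "<|im_start|>assistant\n"
)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
# Generation should stop once the model emits the eos token "<|im_end|>".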
vocab.json ADDED
The diff for this file is too large to render. See raw diff