pirroh madhavatreplit commited on
Commit
e023a84
1 Parent(s): 9eceafb

Convert ReplitLM to MPT (#16)

Browse files

- Convert ReplitLM to MPT (2635241d5b35bf63b8652513c212f645b78b1572)


Co-authored-by: Madhav <madhavatreplit@users.noreply.huggingface.co>

README.md CHANGED
@@ -173,9 +173,4 @@ Note that as with all code generation models, post-processing of the generated c
173
  - stop generation when the EOS token is encountered
174
  - remove trailing whitespaces
175
  - set `max_tokens` to a reasonable value based on your completion use case
176
- - truncate generation to stop words such as `return`, `def`, "```", "`\n\n\n`" to avoid generating incomplete code when `max_tokens` is larger than the length of the expected generated code.
177
-
178
-
179
-
180
- ## Model Hash
181
- 5bc28ce32c6f9aec935ead7b60ea1c46
173
  - stop generation when the EOS token is encountered
174
  - remove trailing whitespaces
175
  - set `max_tokens` to a reasonable value based on your completion use case
176
+ - truncate generation to stop words such as `return`, `def`, "```", "`\n\n\n`" to avoid generating incomplete code when `max_tokens` is larger than the length of the expected generated code.
 
 
 
 
 
adapt_tokenizer.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
3
+ Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
4
+ NUM_SENTINEL_TOKENS: int = 100
5
+
6
def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
    """Adds sentinel tokens and padding token (if missing).

    Expands the tokenizer vocabulary to include sentinel tokens
    used in mixture-of-denoiser tasks as well as a padding token.

    All added tokens are added as special tokens. No tokens are
    added if sentinel tokens and padding token already exist.

    The tokenizer is modified in place and nothing is returned. After the
    call it carries a ``sentinel_token_ids`` attribute holding the ids
    produced by tokenizing the concatenated sentinel strings.

    Args:
        tokenizer: The tokenizer to adapt.
    """
    sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
    if tokenizer.pad_token is None:
        tokenizer.add_tokens('<pad>', special_tokens=True)
        tokenizer.pad_token = '<pad>'
        # Sanity check that registering '<pad>' actually produced an id.
        assert tokenizer.pad_token_id is not None
    # Tokenize all sentinels in one string (without extra special tokens) to
    # recover their ids; reuse the list built above instead of rebuilding it.
    sentinels = ''.join(sentinels_to_add)
    tokenizer.sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
24
+
25
class AutoTokenizerForMOD(AutoTokenizer):
    """AutoTokenizer + Adaptation for MOD.

    A simple wrapper around AutoTokenizer to make instantiating
    an MOD-adapted tokenizer a bit easier.

    MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
    a padding token, and a ``sentinel_token_ids`` attribute holding the
    token ids of the sentinel tokens.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """See `AutoTokenizer.from_pretrained` docstring.

        All arguments are forwarded unchanged; the loaded tokenizer is then
        adapted in place by `adapt_tokenizer_for_denoising` and returned.
        """
        tokenizer = super().from_pretrained(*args, **kwargs)
        adapt_tokenizer_for_denoising(tokenizer)
        return tokenizer
attention.py CHANGED
@@ -1,80 +1,39 @@
1
- # Copyright 2022 MosaicML Examples authors
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
  """Attention layers."""
5
-
6
  import math
7
  import warnings
8
  from typing import Optional
9
-
10
  import torch
 
11
  from einops import rearrange
12
  from torch import nn
 
13
 
14
- from .low_precision_layernorm import LPLayerNorm
15
-
16
-
17
- def _reset_is_causal(num_query_tokens: int, num_key_tokens: int,
18
- original_is_causal: bool):
19
  if original_is_causal and num_query_tokens != num_key_tokens:
20
  if num_query_tokens != 1:
21
- raise NotImplementedError(
22
- 'ReplitLM does not support query and key with different number of tokens, unless number of query tokens is 1.'
23
- )
24
  else:
25
  return False
26
  return original_is_causal
27
 
28
-
29
- def scaled_multihead_dot_product_attention(
30
- query,
31
- key,
32
- value,
33
- n_heads,
34
- softmax_scale=None,
35
- attn_bias=None,
36
- key_padding_mask=None,
37
- is_causal=False,
38
- dropout_p=0.0,
39
- training=False,
40
- needs_weights=False,
41
- ):
42
-
43
  q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
44
- k = rearrange(key, 'b s (h d) -> b h d s', h=n_heads) # includes key.t()
45
- v = rearrange(value, 'b s (h d) -> b h s d', h=n_heads)
46
-
47
  min_val = torch.finfo(q.dtype).min
48
-
49
- b, _, s_q, d = q.shape
50
  s_k = k.size(-1)
51
-
52
  if softmax_scale is None:
53
  softmax_scale = 1 / math.sqrt(d)
54
-
55
  attn_weight = q.matmul(k) * softmax_scale
56
-
57
  if attn_bias is not None:
58
- if (attn_bias.size(-1) != 1 and
59
- attn_bias.size(-1) != s_k) or (attn_bias.size(-2) != 1 and
60
- attn_bias.size(-2) != s_q):
61
- raise RuntimeError(
62
- f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.'
63
- )
64
  attn_weight = attn_weight + attn_bias
65
-
66
  if key_padding_mask is not None:
67
  if attn_bias is not None:
68
- warnings.warn(
69
- 'Propogating key_padding_mask to the attention module ' +
70
- 'and applying it within the attention module can cause ' +
71
- 'unneccessary computation/memory usage. Consider integrating ' +
72
- 'into attn_bias once and passing that to each attention ' +
73
- 'module instead.'
74
- )
75
- attn_weight = attn_weight.masked_fill(
76
- ~key_padding_mask.view((b, 1, 1, s_k)), min_val)
77
-
78
  if is_causal:
79
  s = max(s_q, s_k)
80
  causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
@@ -82,156 +41,76 @@ def scaled_multihead_dot_product_attention(
82
  causal_mask = causal_mask.to(torch.bool)
83
  causal_mask = ~causal_mask
84
  causal_mask = causal_mask[-s_q:, -s_k:]
85
- attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k),
86
- min_val)
87
-
88
  attn_weight = torch.softmax(attn_weight, dim=-1)
89
-
90
  if dropout_p:
91
- attn_weight = torch.nn.functional.dropout(attn_weight,
92
- p=dropout_p,
93
- training=training,
94
- inplace=True)
95
-
96
  out = attn_weight.matmul(v)
97
  out = rearrange(out, 'b h s d -> b s (h d)')
98
-
99
  if needs_weights:
100
- return out, attn_weight
101
- return out, None
102
-
103
 
104
  def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
105
  for tensor in tensors:
106
  if tensor.dtype not in valid_dtypes:
107
- raise TypeError(f'{tensor.dtype=} must be in {valid_dtypes=}.')
108
  if not tensor.is_cuda:
109
- raise TypeError(
110
- f'Inputs must be cuda tensors ({tensor.is_cuda=}).')
111
-
112
 
113
- def flash_attn_fn(
114
- query,
115
- key,
116
- value,
117
- n_heads,
118
- softmax_scale=None,
119
- attn_bias=None,
120
- key_padding_mask=None,
121
- is_causal=False,
122
- dropout_p=0.0,
123
- training=False,
124
- needs_weights=False,
125
- ):
126
  try:
127
  from flash_attn import bert_padding, flash_attn_interface
128
  except:
129
- raise RuntimeError('Please install flash_attn==0.2.8')
130
-
131
  check_valid_inputs(query, key, value)
132
-
133
  if attn_bias is not None:
134
  raise NotImplementedError(f'attn_bias not implemented for flash attn.')
135
-
136
- batch_size, seqlen = query.shape[:2]
137
-
138
  if key_padding_mask is None:
139
  key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
140
  query_padding_mask = key_padding_mask[:, -query.size(1):]
141
-
142
- query_unpad, indices_q, cu_seqlens_q, max_seqlen_q = bert_padding.unpad_input(
143
- query, query_padding_mask)
144
  query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
145
-
146
- key_unpad, _, cu_seqlens_k, max_seqlen_k = bert_padding.unpad_input(
147
- key, key_padding_mask)
148
- key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
149
-
150
- value_unpad, _, _, _ = bert_padding.unpad_input(value, key_padding_mask)
151
- value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
152
-
153
  dropout_p = dropout_p if training else 0.0
154
-
155
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
 
 
 
156
 
157
- output_unpad = flash_attn_interface.flash_attn_unpadded_func(
158
- query_unpad,
159
- key_unpad,
160
- value_unpad,
161
- cu_seqlens_q,
162
- cu_seqlens_k,
163
- max_seqlen_q,
164
- max_seqlen_k,
165
- dropout_p,
166
- softmax_scale=softmax_scale,
167
- causal=reset_is_causal,
168
- return_attn_probs=needs_weights)
169
-
170
- output = bert_padding.pad_input(
171
- rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size,
172
- seqlen)
173
- return output, None
174
-
175
-
176
- def triton_flash_attn_fn(
177
- query,
178
- key,
179
- value,
180
- n_heads,
181
- softmax_scale=None,
182
- attn_bias=None,
183
- key_padding_mask=None,
184
- is_causal=False,
185
- dropout_p=0.0,
186
- training=False,
187
- needs_weights=False,
188
- ):
189
  try:
190
- from flash_attn import flash_attn_triton # type: ignore
191
  except:
192
- raise RuntimeError(
193
- 'Please install flash_attn==0.2.8 and triton==2.0.0.dev20221202.')
194
-
195
  check_valid_inputs(query, key, value)
196
-
197
  if dropout_p:
198
- raise NotImplementedError(
199
- f'Dropout not implemented for attn_impl: triton.')
200
-
201
  if needs_weights:
202
- raise NotImplementedError(
203
- f'attn_impl: triton cannot return attn weights.')
204
-
205
  if key_padding_mask is not None:
206
- warnings.warn(
207
- 'Propagating key_padding_mask to the attention module ' +
208
- 'and applying it within the attention module can cause ' +
209
- 'unnecessary computation/memory usage. Consider integrating ' +
210
- 'into attn_bias once and passing that to each attention ' +
211
- 'module instead.'
212
- )
213
- b_size, s_k = key_padding_mask.shape[:2]
214
-
215
  if attn_bias is None:
216
  attn_bias = query.new_zeros(b_size, 1, 1, s_k)
217
-
218
- attn_bias = attn_bias.masked_fill(
219
- ~key_padding_mask.view((b_size, 1, 1, s_k)),
220
- torch.finfo(query.dtype).min)
221
-
222
  query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
223
- key = rearrange(key, 'b s (h d) -> b s h d', h=n_heads)
224
- value = rearrange(value, 'b s (h d) -> b s h d', h=n_heads)
225
-
 
 
226
  reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
227
- attn_output = flash_attn_triton.flash_attn_func(query, key, value,
228
- attn_bias, reset_is_causal,
229
- softmax_scale)
230
-
231
  output = attn_output.view(*attn_output.shape[:2], -1)
232
-
233
- return output, None
234
-
235
 
236
  class MultiheadAttention(nn.Module):
237
  """Multi-head self attention.
@@ -240,115 +119,121 @@ class MultiheadAttention(nn.Module):
240
  additive bias.
241
  """
242
 
243
- def __init__(
244
- self,
245
- d_model: int,
246
- n_heads: int,
247
- attn_impl: str = 'triton',
248
- attn_clip_qkv: Optional[float] = None,
249
- attn_qk_ln: bool = False,
250
- softmax_scale: Optional[float] = None,
251
- attn_pdrop: float = 0.0,
252
- low_precision_layernorm: bool = False,
253
- device: Optional[str] = None,
254
- ):
255
  super().__init__()
256
-
257
  self.attn_impl = attn_impl
258
- self.clip_qkv = attn_clip_qkv
259
- self.attn_qk_ln = attn_qk_ln
260
-
261
  self.d_model = d_model
262
  self.n_heads = n_heads
263
  self.softmax_scale = softmax_scale
264
  if self.softmax_scale is None:
265
  self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
266
  self.attn_dropout_p = attn_pdrop
267
-
268
  self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
269
- # for param init fn; enables shape based init of fused layers
270
  fuse_splits = (d_model, 2 * d_model)
271
- self.Wqkv._fused = (0, fuse_splits) # type: ignore
272
-
273
- if self.attn_qk_ln:
274
  layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
275
  self.q_ln = layernorm_class(self.d_model, device=device)
276
  self.k_ln = layernorm_class(self.d_model, device=device)
277
-
278
  if self.attn_impl == 'flash':
279
  self.attn_fn = flash_attn_fn
280
  elif self.attn_impl == 'triton':
281
  self.attn_fn = triton_flash_attn_fn
282
- warnings.warn(
283
- 'While `attn_impl: triton` can be faster than `attn_impl: flash` ' +
284
- 'it uses more memory. When training larger models this can trigger ' +
285
- 'alloc retries which hurts performance. If encountered, we recommend ' +
286
- 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
287
  elif self.attn_impl == 'torch':
288
  self.attn_fn = scaled_multihead_dot_product_attention
289
- if torch.cuda.is_available():
290
- warnings.warn(
291
- 'Using `attn_impl: torch`. If your model does not use `alibi` or ' +
292
- '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' +
293
- 'we recommend using `attn_impl: triton`.'
294
- )
295
  else:
296
- raise ValueError(f'{attn_impl=} is an invalid setting.')
297
-
298
  self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
299
- self.out_proj._is_residual = True # type: ignore
300
 
301
- def forward(self,
302
- x,
303
- past_key_value=None,
304
- attn_bias=None,
305
- attention_mask=None,
306
- is_causal=True,
307
- needs_weights=False):
308
  qkv = self.Wqkv(x)
309
-
310
  if self.clip_qkv:
311
  qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
312
-
313
- query, key, value = qkv.chunk(3, dim=2)
314
-
315
  key_padding_mask = attention_mask
316
-
317
- if self.attn_qk_ln:
318
- # Applying layernorm to qk
319
  dtype = query.dtype
320
  query = self.q_ln(query).to(dtype)
321
  key = self.k_ln(key).to(dtype)
322
-
323
  if past_key_value is not None:
324
  if len(past_key_value) != 0:
325
  key = torch.cat([past_key_value[0], key], dim=1)
326
  value = torch.cat([past_key_value[1], value], dim=1)
327
-
328
  past_key_value = (key, value)
329
-
330
  if attn_bias is not None:
331
  attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
 
 
332
 
333
- context, attn_weights = self.attn_fn(
334
- query,
335
- key,
336
- value,
337
- self.n_heads,
338
- softmax_scale=self.softmax_scale,
339
- attn_bias=attn_bias,
340
- key_padding_mask=key_padding_mask,
341
- is_causal=is_causal,
342
- dropout_p=self.attn_dropout_p,
343
- training=self.training,
344
- needs_weights=needs_weights,
345
- )
346
 
347
- return self.out_proj(context), attn_weights, past_key_value
 
 
348
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal,
351
- use_sequence_id):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  if attn_impl == 'flash':
353
  return None
354
  elif attn_impl in ['torch', 'triton']:
@@ -360,50 +245,34 @@ def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal,
360
  return (1, 1, seq_len, seq_len)
361
  return None
362
  else:
363
- raise ValueError(f'{attn_impl=} is an invalid setting.')
364
-
365
 
366
- def attn_bias(attn_impl,
367
- attn_bias,
368
- n_heads,
369
- seq_len,
370
- causal=False,
371
- alibi=False,
372
- alibi_bias_max=8):
373
  if attn_impl == 'flash':
374
  return None
375
  elif attn_impl in ['torch', 'triton']:
376
  if alibi:
377
- # in place add alibi to attn bias
378
- device, dtype = attn_bias.device, attn_bias.dtype
379
- attn_bias = attn_bias.add(
380
- alibi_bias(n_heads,
381
- seq_len,
382
- full=not causal,
383
- alibi_bias_max=alibi_bias_max,
384
- device=device,
385
- dtype=dtype))
386
  return attn_bias
387
  else:
388
- raise ValueError(f'{attn_impl=} is an invalid setting.')
389
-
390
-
391
- def alibi_bias(n_heads,
392
- seq_len,
393
- full=False,
394
- alibi_bias_max=8,
395
- device=None,
396
- dtype=None):
397
- alibi_bias = torch.arange(1 - seq_len, 1, dtype=dtype,
398
- device=device).view(1, 1, 1, seq_len)
 
 
399
  if full:
400
- # generate 1 x Heads x SeqLen x SeqLen alibi bias mask
401
- # otherwise the mask is 1 x Heads x 1 x SeqLen (which is broadcast to the appropriate size)
402
- alibi_bias = alibi_bias - torch.arange(
403
- 1 - seq_len, 1, dtype=dtype, device=device).view(1, 1, seq_len, 1)
404
  alibi_bias = alibi_bias.abs().mul(-1)
405
-
406
- m = torch.arange(1, n_heads + 1, dtype=dtype, device=device)
407
- m = m.mul(alibi_bias_max / n_heads)
408
- alibi_bias = alibi_bias * (1. / (2**m.view(1, n_heads, 1, 1)))
409
- return alibi_bias
 
 
 
1
  """Attention layers."""
 
2
  import math
3
  import warnings
4
  from typing import Optional
 
5
  import torch
6
+ import torch.nn as nn
7
  from einops import rearrange
8
  from torch import nn
9
+ from .norm import LPLayerNorm
10
 
11
+ def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
 
 
 
 
12
  if original_is_causal and num_query_tokens != num_key_tokens:
13
  if num_query_tokens != 1:
14
+ raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
 
 
15
  else:
16
  return False
17
  return original_is_causal
18
 
19
+ def scaled_multihead_dot_product_attention(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
21
+ k = rearrange(key, 'b s (h d) -> b h d s', h=1 if multiquery else n_heads)
22
+ v = rearrange(value, 'b s (h d) -> b h s d', h=1 if multiquery else n_heads)
 
23
  min_val = torch.finfo(q.dtype).min
24
+ (b, _, s_q, d) = q.shape
 
25
  s_k = k.size(-1)
 
26
  if softmax_scale is None:
27
  softmax_scale = 1 / math.sqrt(d)
 
28
  attn_weight = q.matmul(k) * softmax_scale
 
29
  if attn_bias is not None:
30
+ if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
31
+ raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
 
 
 
 
32
  attn_weight = attn_weight + attn_bias
 
33
  if key_padding_mask is not None:
34
  if attn_bias is not None:
35
+ warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
36
+ attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
 
 
 
 
 
 
 
 
37
  if is_causal:
38
  s = max(s_q, s_k)
39
  causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
41
  causal_mask = causal_mask.to(torch.bool)
42
  causal_mask = ~causal_mask
43
  causal_mask = causal_mask[-s_q:, -s_k:]
44
+ attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
 
 
45
  attn_weight = torch.softmax(attn_weight, dim=-1)
 
46
  if dropout_p:
47
+ attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
 
 
 
 
48
  out = attn_weight.matmul(v)
49
  out = rearrange(out, 'b h s d -> b s (h d)')
 
50
  if needs_weights:
51
+ return (out, attn_weight)
52
+ return (out, None)
 
53
 
def check_valid_inputs(*tensors, valid_dtypes=None):
    """Validate that every tensor has an accepted dtype and lives on CUDA.

    Args:
        *tensors: Tensors to validate; each must expose ``dtype`` and
            ``is_cuda``.
        valid_dtypes: Accepted dtypes. Defaults to
            ``[torch.float16, torch.bfloat16]`` when omitted or ``None``.

    Raises:
        TypeError: If a tensor's dtype is not in ``valid_dtypes`` or the
            tensor is not a CUDA tensor.
    """
    # Build the default per call rather than sharing one mutable list object
    # across calls through the signature.
    if valid_dtypes is None:
        valid_dtypes = [torch.float16, torch.bfloat16]
    for tensor in tensors:
        if tensor.dtype not in valid_dtypes:
            raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
        if not tensor.is_cuda:
            raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
 
 
60
 
61
def flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Compute attention with the flash-attn unpadded kernel.

    Args:
        query, key, value: Input tensors. The code reads ``query.shape[:2]``
            as ``(batch_size, seqlen)`` and splits the last dimension into
            heads -- assumes (batch, seq, heads * head_dim); TODO confirm
            against callers.
        n_heads: Number of query heads.
        softmax_scale: Forwarded to the flash-attn kernel.
        attn_bias: Must be ``None``; not supported by this implementation.
        key_padding_mask: Optional boolean mask over key positions. When
            ``None`` an all-ones mask shaped like ``key[:, :, 0]`` is used.
        is_causal: Requested causal flag (adjusted by `_reset_is_causal`).
        dropout_p: Dropout probability; forced to 0.0 unless ``training``.
        training: Whether dropout should be active.
        needs_weights: Forwarded to the kernel as ``return_attn_probs``.
        multiquery: When true, key/value carry a single head that is
            broadcast to ``n_heads``.

    Returns:
        Tuple ``(output, None)``; attention weights are never returned.

    Raises:
        RuntimeError: If ``flash_attn`` cannot be imported.
        NotImplementedError: If ``attn_bias`` is not ``None``.
    """
    try:
        from flash_attn import bert_padding, flash_attn_interface
    except Exception as err:
        # `Exception` instead of a bare `except` so KeyboardInterrupt and
        # SystemExit propagate; the original failure is kept as the cause.
        raise RuntimeError('Please install flash-attn==1.0.3.post0') from err
    check_valid_inputs(query, key, value)
    if attn_bias is not None:
        raise NotImplementedError('attn_bias not implemented for flash attn.')
    (batch_size, seqlen) = query.shape[:2]
    if key_padding_mask is None:
        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
    # The query mask is the trailing slice of the key mask, one column per
    # query position.
    query_padding_mask = key_padding_mask[:, -query.size(1):]

    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
    query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)

    kv_heads = 1 if multiquery else n_heads
    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
    key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=kv_heads)
    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
    value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=kv_heads)
    if multiquery:
        # Broadcast the single key/value head across all query heads.
        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
        value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))

    dropout_p = dropout_p if training else 0.0
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    # NOTE(review): `needs_weights` is forwarded as `return_attn_probs`, yet
    # the result is used as a single tensor below -- confirm the behaviour
    # when needs_weights=True.
    output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
    output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
    return (output, None)
87
 
88
def triton_flash_attn_fn(query, key, value, n_heads, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
    """Compute attention with the Triton flash-attention kernel.

    Args:
        query, key, value: Input tensors whose last dimension is split into
            heads -- assumes (batch, seq, heads * head_dim); TODO confirm
            against callers.
        n_heads: Number of query heads.
        softmax_scale: Forwarded to the kernel.
        attn_bias: Optional additive bias. When ``key_padding_mask`` is given
            the masked key positions are filled with the dtype minimum; a
            zero bias is created first if none was supplied.
        key_padding_mask: Optional boolean mask over key positions; its
            first two dimensions are read as ``(b_size, s_k)``.
        is_causal: Requested causal flag (adjusted by `_reset_is_causal`).
        dropout_p: Must be falsy; dropout is not supported here.
        training: Unused by this implementation.
        needs_weights: Must be falsy; weights cannot be returned.
        multiquery: When true, key/value carry a single head that is
            broadcast to ``n_heads``.

    Returns:
        Tuple ``(output, None)`` where ``output`` has the head and head_dim
        axes of the kernel result flattened into one.

    Raises:
        RuntimeError: If ``flash_attn.flash_attn_triton`` cannot be imported.
        NotImplementedError: If ``dropout_p`` or ``needs_weights`` is truthy.
    """
    try:
        from flash_attn import flash_attn_triton
    except Exception as err:
        # `Exception` instead of a bare `except` so KeyboardInterrupt and
        # SystemExit propagate; the original failure is kept as the cause.
        raise RuntimeError('Please install flash-attn==1.0.3.post0 and triton==2.0.0.dev20221202') from err
    check_valid_inputs(query, key, value)
    if dropout_p:
        raise NotImplementedError('Dropout not implemented for attn_impl: triton.')
    if needs_weights:
        raise NotImplementedError('attn_impl: triton cannot return attn weights.')
    if key_padding_mask is not None:
        warnings.warn(
            'Propagating key_padding_mask to the attention module '
            'and applying it within the attention module can cause '
            'unnecessary computation/memory usage. Consider integrating '
            'into attn_bias once and passing that to each attention '
            'module instead.'
        )
        (b_size, s_k) = key_padding_mask.shape[:2]
        if attn_bias is None:
            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
        # Fold the padding mask into the additive bias: masked keys receive
        # the most negative value representable in the query dtype.
        attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)

    kv_heads = 1 if multiquery else n_heads
    query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
    key = rearrange(key, 'b s (h d) -> b s h d', h=kv_heads)
    value = rearrange(value, 'b s (h d) -> b s h d', h=kv_heads)
    if multiquery:
        # Broadcast the single key/value head across all query heads.
        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
    attn_output = flash_attn_triton.flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
    # Merge the trailing (heads, head_dim) axes back into a single axis.
    output = attn_output.view(*attn_output.shape[:2], -1)
    return (output, None)
 
 
114
 
115
  class MultiheadAttention(nn.Module):
116
  """Multi-head self attention.
119
  additive bias.
120
  """
121
 
122
+ def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
 
 
 
 
 
 
 
 
 
 
 
123
  super().__init__()
 
124
  self.attn_impl = attn_impl
125
+ self.clip_qkv = clip_qkv
126
+ self.qk_ln = qk_ln
 
127
  self.d_model = d_model
128
  self.n_heads = n_heads
129
  self.softmax_scale = softmax_scale
130
  if self.softmax_scale is None:
131
  self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
132
  self.attn_dropout_p = attn_pdrop
 
133
  self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
 
134
  fuse_splits = (d_model, 2 * d_model)
135
+ self.Wqkv._fused = (0, fuse_splits)
136
+ if self.qk_ln:
 
137
  layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
138
  self.q_ln = layernorm_class(self.d_model, device=device)
139
  self.k_ln = layernorm_class(self.d_model, device=device)
 
140
  if self.attn_impl == 'flash':
141
  self.attn_fn = flash_attn_fn
142
  elif self.attn_impl == 'triton':
143
  self.attn_fn = triton_flash_attn_fn
144
+ if verbose:
145
+ warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
 
 
 
146
  elif self.attn_impl == 'torch':
147
  self.attn_fn = scaled_multihead_dot_product_attention
148
+ if torch.cuda.is_available() and verbose:
149
+ warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
 
 
 
 
150
  else:
151
+ raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
 
152
  self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
153
+ self.out_proj._is_residual = True
154
 
155
+ def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
 
 
 
 
 
 
156
  qkv = self.Wqkv(x)
 
157
  if self.clip_qkv:
158
  qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
159
+ (query, key, value) = qkv.chunk(3, dim=2)
 
 
160
  key_padding_mask = attention_mask
161
+ if self.qk_ln:
 
 
162
  dtype = query.dtype
163
  query = self.q_ln(query).to(dtype)
164
  key = self.k_ln(key).to(dtype)
 
165
  if past_key_value is not None:
166
  if len(past_key_value) != 0:
167
  key = torch.cat([past_key_value[0], key], dim=1)
168
  value = torch.cat([past_key_value[1], value], dim=1)
 
169
  past_key_value = (key, value)
 
170
  if attn_bias is not None:
171
  attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
172
+ (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
173
+ return (self.out_proj(context), attn_weights, past_key_value)
174
 
175
class MultiQueryAttention(nn.Module):
    """Multi-Query self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.

    The fused ``Wqkv`` projection produces ``d_model`` query features plus a
    single ``head_dim``-wide key and a single ``head_dim``-wide value, which
    the attention function shares across all ``n_heads`` query heads.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
        """Build the projections and select the attention implementation.

        Args:
            d_model: Model width.
            n_heads: Number of query heads; ``head_dim = d_model // n_heads``.
            attn_impl: One of ``'flash'``, ``'triton'`` or ``'torch'``.
            clip_qkv: If truthy, the fused projection output is clamped to
                ``[-clip_qkv, clip_qkv]`` in `forward`.
            qk_ln: Whether to apply layer norm to query and key.
            softmax_scale: Attention scale; defaults to
                ``1 / sqrt(head_dim)``.
            attn_pdrop: Attention dropout probability.
            low_precision_layernorm: Use ``LPLayerNorm`` instead of
                ``nn.LayerNorm`` when ``qk_ln`` is set.
            verbose: When truthy, emit advisory warnings about ``attn_impl``.
            device: Device passed to the created layers.

        Raises:
            ValueError: If ``attn_impl`` is not a recognised setting.
        """
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.softmax_scale = softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.head_dim)
        self.attn_dropout_p = attn_pdrop
        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
        # Split points of the fused projection (end of query, end of key);
        # recorded on the layer for the parameter-init function so fused
        # layers can be initialised by shape.
        fuse_splits = (d_model, d_model + self.head_dim)
        self.Wqkv._fused = (0, fuse_splits)
        if self.qk_ln:
            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = layernorm_class(d_model, device=device)
            self.k_ln = layernorm_class(self.head_dim, device=device)
        if self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            if verbose:
                warnings.warn(
                    'While `attn_impl: triton` can be faster than `attn_impl: flash` '
                    'it uses more memory. When training larger models this can trigger '
                    'alloc retries which hurts performance. If encountered, we recommend '
                    'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.'
                )
        elif self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available() and verbose:
                warnings.warn(
                    'Using `attn_impl: torch`. If your model does not use `alibi` or '
                    '`prefix_lm` we recommend using `attn_impl: flash` otherwise '
                    'we recommend using `attn_impl: triton`.'
                )
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        # Marker attribute; presumably read by parameter-initialisation code
        # elsewhere -- verify against the init function.
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        """Apply multi-query attention to ``x``.

        Args:
            x: Input tensor; the fused projection is split along ``dim=2``.
            past_key_value: Optional cached ``(key, value)`` pair. A non-empty
                cache is prepended to the new key/value along ``dim=1``.
            attn_bias: Optional additive bias; cropped to the trailing
                ``query`` rows and ``key`` columns before use.
            attention_mask: Passed to the attention function as
                ``key_padding_mask``.
            is_causal: Causal flag forwarded to the attention function.
            needs_weights: Whether attention weights are requested.

        Returns:
            Tuple ``(output, attn_weights, past_key_value)``. When a cache
            was supplied (even an empty one) the returned cache is the
            updated ``(key, value)``; otherwise it is ``None``.
        """
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
        key_padding_mask = attention_mask
        if self.qk_ln:
            # Cast back afterwards so the norm layer cannot change the dtype.
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        if past_key_value is not None:
            if len(past_key_value) != 0:
                key = torch.cat([past_key_value[0], key], dim=1)
                value = torch.cat([past_key_value[1], value], dim=1)
            past_key_value = (key, value)
        if attn_bias is not None:
            attn_bias = attn_bias[:, :, -query.size(1):, -key.size(1):]
        (context, attn_weights) = self.attn_fn(query, key, value, self.n_heads, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
        return (self.out_proj(context), attn_weights, past_key_value)
235
+
236
+ def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
237
  if attn_impl == 'flash':
238
  return None
239
  elif attn_impl in ['torch', 'triton']:
245
  return (1, 1, seq_len, seq_len)
246
  return None
247
  else:
248
+ raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
 
249
 
250
def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
    """Return the additive attention bias for the chosen implementation.

    Args:
        attn_impl: ``'flash'``, ``'torch'`` or ``'triton'``.
        attn_bias: Existing bias tensor; its device and dtype are used for
            the ALiBi term when ``alibi`` is set.
        n_heads: Number of attention heads.
        seq_len: Sequence length for the ALiBi term.
        causal: When false, the full (non-causal) ALiBi bias is built.
        alibi: Whether to add an ALiBi bias to ``attn_bias``.
        alibi_bias_max: Forwarded to `build_alibi_bias`.

    Returns:
        ``None`` for ``'flash'``; otherwise ``attn_bias``, with the ALiBi
        bias added (out of place) when ``alibi`` is set.

    Raises:
        ValueError: If ``attn_impl`` is not a recognised setting.
    """
    if attn_impl == 'flash':
        return None
    elif attn_impl in ['torch', 'triton']:
        if alibi:
            (device, dtype) = (attn_bias.device, attn_bias.dtype)
            attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
        return attn_bias
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
260
+
261
def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    """Generate the per-head slopes used to scale the ALiBi bias.

    Slopes are computed for the next power of two at or above ``n_heads`` as
    ``1 / 2 ** (i * alibi_bias_max / _n_heads)`` for ``i = 1.._n_heads``.
    When ``n_heads`` is not itself a power of two, the odd-indexed slopes
    followed by the even-indexed ones are concatenated and the first
    ``n_heads`` entries are kept.

    Args:
        n_heads: Number of attention heads.
        alibi_bias_max: Largest exponent used for the slopes.
        device: Device on which to create the tensor.

    Returns:
        A float32 tensor viewed as ``(1, n_heads, 1, 1)``.
    """
    _n_heads = 2 ** math.ceil(math.log2(n_heads))
    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
    m = m.mul(alibi_bias_max / _n_heads)
    slopes = 1.0 / torch.pow(2, m)
    if _n_heads != n_heads:
        # `torch.cat` matches the concatenation call used elsewhere in this
        # file.
        slopes = torch.cat([slopes[1::2], slopes[::2]])[:n_heads]
    return slopes.view(1, n_heads, 1, 1)
269
+
270
def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
    """Build the ALiBi additive attention bias.

    Args:
        n_heads: Number of attention heads.
        seq_len: Sequence length.
        full: When true, generate a ``1 x heads x seq_len x seq_len`` bias;
            otherwise the bias is ``1 x heads x 1 x seq_len`` and relies on
            broadcasting.
        alibi_bias_max: Forwarded to `gen_slopes`.
        device: Device on which to create the tensors.
        dtype: Dtype of the returned bias.

    Returns:
        The position offsets multiplied by the per-head slopes, cast to
        ``dtype``.
    """
    # Positions are built as int32 and only cast to `dtype` at the very end.
    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
    if full:
        # Pairwise offsets between key (last axis) and query (third axis)
        # positions, negated in absolute value.
        alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
        alibi_bias = alibi_bias.abs().mul(-1)
    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
    alibi_bias = alibi_bias * slopes
    return alibi_bias.to(dtype=dtype)
278
+ ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention}
 
blocks.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GPT Blocks used for the GPT Model."""
2
+ from typing import Dict, Optional, Tuple
3
+ import torch
4
+ import torch.nn as nn
5
+ from .attention import ATTN_CLASS_REGISTRY
6
+ from .norm import NORM_CLASS_REGISTRY
7
+
8
class MPTMLP(nn.Module):
    """Feed-forward sub-layer: linear up-projection, exact GELU, linear down-projection."""

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
        """Create the two projections.

        Args:
            d_model: Input and output feature size.
            expansion_ratio: The hidden size is ``expansion_ratio * d_model``.
            device: Device passed to both ``nn.Linear`` layers.
        """
        super().__init__()
        hidden_size = expansion_ratio * d_model
        self.up_proj = nn.Linear(d_model, hidden_size, device=device)
        self.act = nn.GELU(approximate='none')
        self.down_proj = nn.Linear(hidden_size, d_model, device=device)
        # Flag the output projection as feeding a residual connection.
        self.down_proj._is_residual = True

    def forward(self, x):
        """Apply up-projection, activation and down-projection to ``x``."""
        expanded = self.up_proj(x)
        activated = self.act(expanded)
        return self.down_proj(activated)
19
+
20
class MPTBlock(nn.Module):
    """Transformer block with pre-normalised attention and feed-forward sub-layers.

    Each sub-layer's output goes through dropout and is added back onto its
    input (residual connection).
    """

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        expansion_ratio: int,
        attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8},
        resid_pdrop: float=0.0,
        norm_type: str='low_precision_layernorm',
        verbose: int=0,
        device: Optional[str]=None,
        **kwargs,
    ):
        """Build the norm, attention, feed-forward and dropout modules.

        Args:
            d_model: Model (embedding) dimension.
            n_heads: Number of attention heads.
            expansion_ratio: Hidden-size multiplier for the feed-forward layer.
            attn_config: Attention settings; the keys 'attn_type', 'attn_impl',
                'clip_qkv', 'qk_ln', 'softmax_scale' and 'attn_pdrop' are read.
            resid_pdrop: Dropout probability applied to each sub-layer output.
            norm_type: Key into ``NORM_CLASS_REGISTRY`` (lower-cased first).
            verbose: Forwarded to the attention module.
            device: Device forwarded to every sub-module that accepts one.
            **kwargs: Accepted and discarded.
        """
        del kwargs
        super().__init__()
        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
        # Only this subset of the attention config is passed to the attention class.
        attn_kwargs = {
            key: attn_config[key]
            for key in ('attn_impl', 'clip_qkv', 'qk_ln', 'softmax_scale', 'attn_pdrop')
        }
        self.norm_1 = norm_class(d_model, device=device)
        self.attn = attn_class(d_model=d_model, n_heads=n_heads, verbose=verbose, device=device, **attn_kwargs)
        self.norm_2 = norm_class(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(
        self,
        x: torch.Tensor,
        past_key_value: Optional[Tuple[torch.Tensor]]=None,
        attn_bias: Optional[torch.Tensor]=None,
        attention_mask: Optional[torch.ByteTensor]=None,
        is_causal: bool=True,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Run the block and return ``(hidden_states, past_key_value)``.

        ``past_key_value`` in the result is whatever the attention module
        returned as the third element of its output.
        """
        normed_input = self.norm_1(x)
        (attn_out, _, past_key_value) = self.attn(
            normed_input,
            past_key_value=past_key_value,
            attn_bias=attn_bias,
            attention_mask=attention_mask,
            is_causal=is_causal,
        )
        x = x + self.resid_attn_dropout(attn_out)
        ffn_out = self.ffn(self.norm_2(x))
        x = x + self.resid_ffn_dropout(ffn_out)
        return (x, past_key_value)
config.json CHANGED
@@ -1,45 +1,51 @@
1
  {
2
- "_name_or_path": "replit/replit-code-v1-3b",
3
- "alibi": true,
4
- "alibi_bias_max": 8,
5
  "architectures": [
6
- "ReplitLM"
7
  ],
8
- "attn_clip_qkv": null,
9
- "attn_impl": "torch",
10
- "attn_pdrop": 0,
11
- "attn_qk_ln": false,
12
- "attn_uses_sequence_id": false,
 
 
 
 
 
 
 
13
  "auto_map": {
14
- "AutoConfig": "configuration_replit_lm.ReplitLMConfig",
15
- "AutoModelForCausalLM": "replit_lm.ReplitLM"
16
  },
17
  "d_model": 2560,
18
- "emb_init_std": null,
19
- "emb_init_uniform_lim": null,
20
  "emb_pdrop": 0,
21
  "embedding_fraction": 1.0,
22
- "fan_mode": "fan_in",
 
 
 
 
 
 
 
 
 
 
 
23
  "init_device": "cpu",
24
- "init_div_is_residual": true,
25
- "init_gain": 0,
26
- "init_nonlinearity": "relu",
27
- "init_std": 0.02,
28
  "logit_scale": null,
29
- "low_precision_layernorm": true,
30
  "max_seq_len": 2048,
31
- "mlp_ratio": 4,
32
- "model_type": "replit_lm",
33
  "n_heads": 32,
34
  "n_layers": 32,
35
  "no_bias": true,
36
- "param_init_fn": "kaiming_normal_",
37
- "prefix_lm": false,
38
  "resid_pdrop": 0,
39
- "softmax_scale": null,
40
  "tokenizer_name": "replit/replit-code-v1-3b",
41
  "torch_dtype": "float32",
42
- "transformers_version": "4.27.4",
43
  "use_cache": false,
44
  "verbose": 0,
45
  "vocab_size": 32768
1
  {
 
 
 
2
  "architectures": [
3
+ "MPTForCausalLM"
4
  ],
5
+ "attn_config": {
6
+ "alibi": true,
7
+ "alibi_bias_max": 8,
8
+ "attn_impl": "torch",
9
+ "attn_pdrop": 0,
10
+ "attn_type": "multihead_attention",
11
+ "attn_uses_sequence_id": false,
12
+ "clip_qkv": null,
13
+ "prefix_lm": false,
14
+ "qk_ln": false,
15
+ "softmax_scale": null
16
+ },
17
  "auto_map": {
18
+ "AutoConfig": "configuration_mpt.MPTConfig",
19
+ "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
20
  },
21
  "d_model": 2560,
 
 
22
  "emb_pdrop": 0,
23
  "embedding_fraction": 1.0,
24
+ "expansion_ratio": 4,
25
+ "init_config": {
26
+ "emb_init_std": null,
27
+ "emb_init_uniform_lim": null,
28
+ "fan_mode": "fan_in",
29
+ "init_div_is_residual": true,
30
+ "init_gain": 0,
31
+ "init_nonlinearity": "relu",
32
+ "init_std": 0.02,
33
+ "name": "kaiming_normal_",
34
+ "verbose": 0
35
+ },
36
  "init_device": "cpu",
37
+ "learned_pos_emb": true,
 
 
 
38
  "logit_scale": null,
 
39
  "max_seq_len": 2048,
40
+ "model_type": "mpt",
 
41
  "n_heads": 32,
42
  "n_layers": 32,
43
  "no_bias": true,
44
+ "norm_type": "low_precision_layernorm",
 
45
  "resid_pdrop": 0,
 
46
  "tokenizer_name": "replit/replit-code-v1-3b",
47
  "torch_dtype": "float32",
48
+ "transformers_version": "4.28.1",
49
  "use_cache": false,
50
  "verbose": 0,
51
  "vocab_size": 32768
configuration_mpt.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A HuggingFace-style model configuration."""
2
+ from typing import Dict, Optional, Union
3
+ from transformers import PretrainedConfig
4
# Fallback values for every attention option.
attn_config_defaults: Dict = {
    'attn_type': 'multihead_attention',
    'attn_pdrop': 0.0,
    'attn_impl': 'triton',
    'qk_ln': False,
    'clip_qkv': None,
    'softmax_scale': None,
    'prefix_lm': False,
    'attn_uses_sequence_id': False,
    'alibi': False,
    'alibi_bias_max': 8,
}
# Fallback values for every parameter-initialisation option.
init_config_defaults: Dict = {
    'name': 'kaiming_normal_',
    'fan_mode': 'fan_in',
    'init_nonlinearity': 'relu',
    'init_div_is_residual': True,
    'emb_init_std': None,
    'emb_init_uniform_lim': None,
    'init_std': None,
    'init_gain': 0.0,
}
6
+
7
class MPTConfig(PretrainedConfig):
    """HuggingFace-style configuration object for MPT models."""
    model_type = 'mpt'

    def __init__(
        self,
        d_model: int=2048,
        n_heads: int=16,
        n_layers: int=24,
        expansion_ratio: int=4,
        max_seq_len: int=2048,
        vocab_size: int=50368,
        resid_pdrop: float=0.0,
        emb_pdrop: float=0.0,
        learned_pos_emb: bool=True,
        attn_config: Dict=attn_config_defaults,
        init_device: str='cpu',
        logit_scale: Optional[Union[float, str]]=None,
        no_bias: bool=False,
        verbose: int=0,
        embedding_fraction: float=1.0,
        norm_type: str='low_precision_layernorm',
        use_cache: bool=False,
        init_config: Dict=init_config_defaults,
        **kwargs,
    ):
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (int): The ratio of the up/down scale in the MLP.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings
            attn_config (Dict): A dictionary used to configure the model's attention module:
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                    this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                    use the default scale of ``1/sqrt(d_keys)``.
                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                    which sub-sequence each token belongs to.
                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): Whether to use bias in all layers.
            verbose (int): The verbosity level. 0 is silent.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization:
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model,
                    if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
            **kwargs: Forwarded to ``PretrainedConfig.__init__`` after the
                'name' and 'loss_fn' entries (if any) have been removed.

        Raises:
            ValueError: If a setting fails the checks in ``_validate_config``.
            NotImplementedError: If prefix_lm, alibi or attn_uses_sequence_id is
                requested with an attention implementation other than 'torch' or 'triton'.
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        # These two keys are dropped instead of being handed to PretrainedConfig.
        kwargs.pop('name', None)
        kwargs.pop('loss_fn', None)
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(self, config, config_defaults):
        """Return a copy of ``config`` with missing keys filled from ``config_defaults``.

        A new dict is returned and neither argument is modified. This keeps the
        module-level default dicts (which double as the constructor's default
        arguments) and any caller-supplied dict from being shared with, or
        mutated through, a config instance.
        """
        merged = dict(config)
        for (k, v) in config_defaults.items():
            merged.setdefault(k, v)
        return merged

    def _validate_config(self):
        """Fill in sub-config defaults and reject inconsistent settings.

        Raises:
            ValueError: On an invalid numeric range, unknown ``attn_impl``,
                unrecognised string ``logit_scale``, missing init scheme name,
                or when neither learned position embeddings nor alibi is enabled.
            NotImplementedError: When a feature is requested with an attention
                implementation that does not support it.
        """
        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            raise ValueError('Positional information must be provided to the model using either learned_pos_emb or alibi.')
generation_config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
  "_from_model_config": true,
3
- "transformers_version": "4.27.4",
4
  "use_cache": false
5
  }
1
  {
2
  "_from_model_config": true,
3
+ "transformers_version": "4.28.1",
4
  "use_cache": false
5
  }
hf_prefixlm_converter.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Converts Huggingface Causal LM to Prefix LM.
2
+
3
+ Conversion does lightweight surgery on a HuggingFace
4
+ Causal LM to convert it to a Prefix LM.
5
+
6
+ Prefix LMs accept a `bidirectional_mask` input in `forward`
7
+ and treat the input prompt as the prefix in `generate`.
8
+ """
9
+ import math
10
+ import warnings
11
+ from types import MethodType
12
+ from typing import Any, Dict, List, Optional, Tuple, Union
13
+ import torch
14
+ from transformers.models.bloom.modeling_bloom import BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel, CausalLMOutputWithCrossAttentions, CrossEntropyLoss
15
+ from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom
16
+ from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom
17
+ from transformers.models.bloom.modeling_bloom import logging
18
+ from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
19
+ from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
20
+ from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
21
+ from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
22
+ from transformers.models.opt.modeling_opt import OPTForCausalLM
23
+ from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt
24
+ from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
25
# Module-level logger, created through the `logging` helper imported from modeling_bloom.
logger = logging.get_logger(__name__)
# Model classes accepted by `_convert_gpt_causal_lm_to_prefix_lm` (checked there with isinstance).
_SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
# The same four classes as a typing Union, for use in annotations.
CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM]
28
+
29
def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
    """Converts a GPT-style Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `GPT2LMHeadModel`
        - `GPTNeoForCausalLM`
        - `GPTNeoXForCausalLM`
        - `GPTJForCausalLM`

    See `convert_hf_causal_lm_to_prefix_lm` for more details.

    The model is patched in place: its original `forward` and `generate` are
    kept as `_original_forward` / `_original_generate` and replaced by wrappers
    that temporarily edit the attention modules' `bias` buffers. A model that
    already carries the `_prefix_lm_converted` attribute is returned untouched.

    Args:
        model: The model instance to convert.

    Returns:
        The same `model` object, converted.
    """
    if hasattr(model, '_prefix_lm_converted'):
        return model
    assert isinstance(model, _SUPPORTED_GPT_MODELS)
    assert model.config.add_cross_attention == False, 'Only supports GPT-style decoder-only models'

    def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
        """Helper that gets a list of the model's attention modules.

        Each module has a `bias` buffer used for causal masking. The Prefix LM
        conversion adds logic to dynamically manipulate these biases to support
        Prefix LM attention masking.
        """
        attn_modules = []
        if isinstance(model, GPTNeoXForCausalLM):
            blocks = model.gpt_neox.layers
        else:
            blocks = model.transformer.h
        for block in blocks:
            if isinstance(model, GPTNeoForCausalLM):
                # Only GPT-Neo layers whose attention type is 'global' are collected.
                if block.attn.attention_type != 'global':
                    continue
                attn_module = block.attn.attention
            elif isinstance(model, GPTNeoXForCausalLM):
                attn_module = block.attention
            else:
                attn_module = block.attn
            attn_modules.append(attn_module)
        return attn_modules

    def _restore_causal_bias(attn_modules: List[torch.nn.Module]) -> None:
        """Reset every `bias` buffer to a lower-triangular (1, 1, L, L) mask."""
        for attn_module in attn_modules:
            attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]

    setattr(model, '_original_forward', getattr(model, 'forward'))
    setattr(model, '_original_generate', getattr(model, 'generate'))

    def forward(self: CAUSAL_GPT_TYPES, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]]=None, attention_mask: Optional[torch.FloatTensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, token_type_ids: Optional[torch.LongTensor]=None, position_ids: Optional[torch.LongTensor]=None, head_mask: Optional[torch.FloatTensor]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
        """Wraps original forward to enable PrefixLM attention."""

        def call_og_forward():
            # The GPT-NeoX call omits token_type_ids and position_ids.
            if isinstance(self, GPTNeoXForCausalLM):
                return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
            else:
                return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)

        # Without a bidirectional mask the model behaves exactly as before conversion.
        if bidirectional_mask is None:
            return call_og_forward()
        assert isinstance(bidirectional_mask, torch.Tensor)
        attn_modules = _get_attn_modules(model)
        (b, s) = bidirectional_mask.shape
        max_length = attn_modules[0].bias.shape[-1]
        if s > max_length:
            raise ValueError(f'bidirectional_mask sequence length (={s}) exceeds the ' + f'max length allowed by the model ({max_length}).')
        if s < max_length:
            # Right-pad the mask with zeros up to the width of the bias buffer.
            pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device)
            bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
        bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
        try:
            for attn_module in attn_modules:
                attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional)
            return call_og_forward()
        finally:
            # Always put the causal mask back, even if the wrapped forward raised;
            # otherwise the model would stay non-causal for every later call.
            _restore_causal_bias(attn_modules)

    def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]):
        """Wraps original generate to enable PrefixLM attention."""
        attn_modules = _get_attn_modules(model)
        try:
            # Open the mask completely for the duration of generation.
            for attn_module in attn_modules:
                attn_module.bias.data[:] = 1
            return self._original_generate(*args, **kwargs)
        finally:
            # Restore causal masking whether or not generation succeeded.
            _restore_causal_bias(attn_modules)

    setattr(model, 'forward', MethodType(forward, model))
    setattr(model, 'generate', MethodType(generate, model))
    setattr(model, '_prefix_lm_converted', True)
    return model
112
+
113
def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
    """Converts a BLOOM Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `BloomForCausalLM`

    See `convert_hf_causal_lm_to_prefix_lm` for more details.

    The model is patched in place: `_prepare_attn_mask`, `_build_alibi_tensor`
    and `forward` are replaced on `model.transformer`, and `forward` and
    `prepare_inputs_for_generation` are replaced on `model` itself. A model that
    already carries the `_prefix_lm_converted` attribute is returned untouched.
    """
    if hasattr(model, '_prefix_lm_converted'):
        return model
    assert isinstance(model, BloomForCausalLM)
    assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models'

    def _prepare_attn_mask(self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor:
        """Combine the causal, bidirectional (prefix) and padding masks."""
        # NOTE(review): presumably True marks a position that must NOT be attended
        # to (the convention of the imported HF Bloom helpers) - confirm.
        combined_attention_mask = None
        device = attention_mask.device
        (_, src_length) = input_shape
        # A causal mask is only built when more than one query token is present.
        if src_length > 1:
            combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length)
            if bidirectional_mask is not None:
                assert attention_mask.shape == bidirectional_mask.shape
                expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length)
                combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask)
        expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
        combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
        return combined_attention_mask

    def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        """Build an ALiBi bias from negative absolute query/key distances.

        Returns a tensor of shape (batch_size * num_heads, query_length, key_length)
        cast to `dtype`.
        """
        num_heads = self.config.n_head
        # Slopes are generated for the largest power of two not exceeding num_heads...
        closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
        base = torch.tensor(2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3))), device=device, dtype=torch.float32)
        powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32)
        slopes = torch.pow(base, powers)
        # ...and any remaining heads get slopes from the odd powers of a second base.
        if closest_power_of_2 != num_heads:
            extra_base = torch.tensor(2 ** (-2 ** (-(math.log2(2 * closest_power_of_2) - 3))), device=device, dtype=torch.float32)
            num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
            extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32)
            slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
        qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1)
        ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1)
        # Offset query positions by (key_length - query_length) before differencing.
        diffs = qa - ka + key_length - query_length
        diffs = -diffs.abs()
        alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length)
        alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length)
        return alibi.to(dtype)
    KeyValueT = Tuple[torch.Tensor, torch.Tensor]

    def forward(self: BloomModel, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.LongTensor]=None, inputs_embeds: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
        """Replacement forward for the BloomModel that accepts `bidirectional_mask`.

        Raises:
            ValueError: On unexpected keyword arguments, or when both or neither
                of `input_ids` and `inputs_embeds` are given.
        """
        # `position_ids` is accepted for compatibility and ignored with a warning.
        if deprecated_arguments.pop('position_ids', False) is not False:
            warnings.warn('`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. ' + 'You can safely ignore passing `position_ids`.', FutureWarning)
        if len(deprecated_arguments) > 0:
            raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
        # Unset flags fall back to the values stored on the model config.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
        elif input_ids is not None:
            (batch_size, seq_length) = input_ids.shape
        elif inputs_embeds is not None:
            (batch_size, seq_length, _) = inputs_embeds.shape
        else:
            raise ValueError('You have to specify either input_ids or inputs_embeds')
        if past_key_values is None:
            past_key_values = tuple([None] * len(self.h))
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
        # Output accumulators; None when the corresponding output is not requested.
        presents = () if use_cache else None
        all_self_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        seq_length_with_past = seq_length
        past_key_values_length = 0
        if past_key_values[0] is not None:
            tmp = past_key_values[0][0]
            # assumes dim 2 of the cached tensor is the past sequence length - TODO confirm
            past_key_values_length = tmp.shape[2]
            seq_length_with_past = seq_length_with_past + past_key_values_length
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
        else:
            attention_mask = attention_mask.to(hidden_states.device)
        alibi = self._build_alibi_tensor(batch_size=batch_size, query_length=seq_length, key_length=seq_length_with_past, dtype=hidden_states.dtype, device=hidden_states.device)
        causal_mask = self._prepare_attn_mask(attention_mask, bidirectional_mask, input_shape=(batch_size, seq_length), past_key_values_length=past_key_values_length)
        for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)):
            if output_hidden_states:
                hst = (hidden_states,)
                all_hidden_states = all_hidden_states + hst
            if self.gradient_checkpointing and self.training:
                if use_cache:
                    logger.warning('`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...')
                    use_cache = False

                def create_custom_forward(module):
                    # Wrap the block so the checkpointed call still receives the keyword flags.

                    def custom_forward(*inputs):
                        return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)
                    return custom_forward
                outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i])
            else:
                outputs = block(hidden_states, layer_past=layer_past, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi)
            hidden_states = outputs[0]
            if use_cache is True:
                presents = presents + (outputs[1],)
            if output_attentions:
                # The attention entry sits after the cache entry when caching is on.
                oa = (outputs[2 if use_cache else 1],)
                all_self_attentions = all_self_attentions + oa
        hidden_states = self.ln_f(hidden_states)
        if output_hidden_states:
            hst = (hidden_states,)
            all_hidden_states = all_hidden_states + hst
        if not return_dict:
            # Tuple output omits every entry that is None.
            return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None))
        return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions)
    # Bind the three replacements onto the inner transformer.
    setattr(model.transformer, '_prepare_attn_mask', MethodType(_prepare_attn_mask, model.transformer))
    setattr(model.transformer, '_build_alibi_tensor', MethodType(_build_alibi_tensor, model.transformer))
    setattr(model.transformer, 'forward', MethodType(forward, model.transformer))
    KeyValueT = Tuple[torch.Tensor, torch.Tensor]

    def forward(self: BloomForCausalLM, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.Tensor]=None, inputs_embeds: Optional[torch.Tensor]=None, labels: Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        """Replacement forward method for BloomCausalLM."""
        if deprecated_arguments.pop('position_ids', False) is not False:
            warnings.warn('`position_ids` have no functionality in BLOOM and will be removed ' + 'in v5.0.0. You can safely ignore passing `position_ids`.', FutureWarning)
        if len(deprecated_arguments) > 0:
            raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        transformer_outputs = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask, bidirectional_mask=bidirectional_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
        hidden_states = transformer_outputs[0]
        lm_logits = self.lm_head(hidden_states)
        loss = None
        if labels is not None:
            # Shift by one so the logits at position t are scored against the label at t + 1.
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            (batch_size, seq_length, vocab_size) = shift_logits.shape
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return (loss,) + output if loss is not None else output
        return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions)

    def prepare_inputs_for_generation(self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, **kwargs) -> dict:
        """Build the forward kwargs for one generation step."""
        if past:
            # With a cache present only the most recent token is fed and no
            # bidirectional mask is passed.
            input_ids = input_ids[:, -1].unsqueeze(-1)
            bidirectional_mask = None
            # NOTE(review): looks like a cache whose leading dim equals the batch
            # size is treated as being in the non-Bloom layout - confirm.
            if past[0][0].shape[0] == input_ids.shape[0]:
                past = self._convert_to_bloom_cache(past)
        else:
            # First step: every input token is marked bidirectional.
            bidirectional_mask = torch.ones_like(input_ids)
        return {'input_ids': input_ids, 'past_key_values': past, 'use_cache': True, 'attention_mask': attention_mask, 'bidirectional_mask': bidirectional_mask}
    setattr(model, 'forward', MethodType(forward, model))
    setattr(model, 'prepare_inputs_for_generation', MethodType(prepare_inputs_for_generation, model))
    setattr(model, '_prefix_lm_converted', True)
    return model
268
+
269
def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
    """Converts an OPT Causal LM to a Prefix LM.

    Supported HuggingFace model classes:
        - `OPTForCausalLM`

    The conversion patches three things on the given instance:
        - the decoder's `_prepare_decoder_attention_mask`, so the causal mask
          can be relaxed by a bidirectional (prefix) mask;
        - `forward`, which accepts an extra `bidirectional_mask` argument;
        - `generate`, which runs the original `generate` in fully
          bidirectional mode.

    The original methods stay reachable as `_original_forward` and
    `_original_generate`. A model that already carries the
    `_prefix_lm_converted` attribute is returned untouched.

    See `convert_hf_causal_lm_to_prefix_lm` for more details.

    Args:
        model: The OPT causal LM to convert in place.

    Returns:
        The same `model` object, converted.

    Raises:
        AssertionError: If `model` is not an `OPTForCausalLM` or is configured
            with cross attention.
    """
    if hasattr(model, '_prefix_lm_converted'):
        return model
    assert isinstance(model, OPTForCausalLM)
    assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models'
    setattr(model, '_original_forward', getattr(model, 'forward'))
    setattr(model, '_original_generate', getattr(model, 'generate'))
    # The decoder reads this attribute while building its attention mask. It is
    # only non-None for the duration of a wrapped `forward`/`generate` call.
    model.model.decoder.bidirectional_mask = None

    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
        """Builds the decoder attention mask, honouring `self.bidirectional_mask`."""
        combined_attention_mask = None
        if input_shape[-1] > 1:
            # 'g' is the sentinel stored by the wrapped `generate` below: the
            # causal mask is replaced by zeros (nothing is masked out).
            if self.bidirectional_mask == 'g':
                (bsz, src_length) = input_shape
                combined_attention_mask = torch.zeros((bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
            else:
                combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(inputs_embeds.device)
                if self.bidirectional_mask is not None:
                    # Both masks must describe the same token grid.
                    assert attention_mask.shape == self.bidirectional_mask.shape
                    expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
                    # Element-wise maximum merges the expanded prefix mask
                    # into the causal mask.
                    combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask)
        if attention_mask is not None:
            expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
            combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
        return combined_attention_mask
    setattr(model.model.decoder, '_prepare_decoder_attention_mask', MethodType(_prepare_decoder_attention_mask, model.model.decoder))

    def forward(self: OPTForCausalLM, input_ids: Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.ByteTensor]=None, head_mask: Optional[torch.Tensor]=None, past_key_values: Optional[List[torch.FloatTensor]]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
        """Wraps original forward to accept an optional `bidirectional_mask`."""

        def call_og_forward():
            return self._original_forward(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
        if bidirectional_mask is None:
            return call_og_forward()
        self.model.decoder.bidirectional_mask = bidirectional_mask
        try:
            return call_og_forward()
        finally:
            # Always clear the mask, whether the call returned or raised, so
            # it cannot leak into a later call.
            self.model.decoder.bidirectional_mask = None

    def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]):
        """Wraps original generate to enable PrefixLM-style attention."""
        self.model.decoder.bidirectional_mask = 'g'
        try:
            return self._original_generate(*args, **kwargs)
        finally:
            # Leave generation mode on both the success and failure paths.
            self.model.decoder.bidirectional_mask = None
    setattr(model, 'forward', MethodType(forward, model))
    setattr(model, 'generate', MethodType(generate, model))
    setattr(model, '_prefix_lm_converted', True)
    return model
332
+ _SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
333
+ CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]
334
+
335
def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
    """Turn a HuggingFace causal language model into a Prefix LM.

    Model classes that can be converted:
        - `GPT2LMHeadModel`
        - `GPTNeoForCausalLM`
        - `GPTNeoXForCausalLM`
        - `GPTJForCausalLM`
        - `BloomForCausalLM`
        - `OPTForCausalLM`

    The conversion replaces the `forward` method and, depending on the model
    class, may also replace `generate` and/or selected underlying methods.
    The model API is preserved, with one addition: `forward` gains a new
    input named "bidirectional_mask".

    Training:
        Training the converted model as a Prefix LM requires every batch to
        describe its prefix/target structure through `bidirectional_mask`.

        **This is not a standard input and requires custom layers either
        within or after your dataloader.**

        The same custom code should also rewrite `labels` so that
        `batch['labels'][batch['bidirectional_mask'] == 1] == -100`, i.e. the
        prefix part of a sequence contributes no loss and only the target
        part does.

    `GPTNeoForCausalLM`:
        For simplicity, "global" and "local" attention layers are treated
        differently. "Global" layers are converted as described here.
        "Local" layers, which apply a causal mask inside a restricted local
        window, keep their masking unchanged.

    Converted `forward`:
        Accepts `bidirectional_mask`, a [batch_size, seq_length] byte tensor
        in which 1 marks prefix positions (prefix tokens attend to each other
        bidirectionally) and 0 marks target positions.

        When `bidirectional_mask` is given, the new `forward` folds it into
        the existing causal mask, calls the original `forward`, and (if the
        causal mask is a buffer) restores the causal masks before returning.

    Converted `generate`:
        Keeps its signature. Internally it makes all causal masks purely
        bidirectional, calls the original `generate`, and (where appropriate)
        restores the causal masks before returning.

        This relies on how the HuggingFace `generate` API works: the token
        "prompt" passed to `generate` (treated as the prefix) is encoded
        first, then new tokens are produced one at a time. Encodings are
        cached along the way, so prefix tokens attend to one another, while
        generated tokens only attend to the prefix and to previously
        generated tokens, as expected of a Prefix LM.

    To preserve the API, the original methods are kept under the names
    `_original_forward` and `_original_generate`, and the new `forward` and
    `generate` wrap them. Implementation details vary by model class.

    Raises:
        TypeError: If `model` is not an instance of a supported class.
    """
    # Each supported family has its own converter; return as soon as one matches.
    if isinstance(model, _SUPPORTED_GPT_MODELS):
        return _convert_gpt_causal_lm_to_prefix_lm(model)
    if isinstance(model, BloomForCausalLM):
        return _convert_bloom_causal_lm_to_prefix_lm(model)
    if isinstance(model, OPTForCausalLM):
        return _convert_opt_causal_lm_to_prefix_lm(model)
    raise TypeError(f'Cannot convert model to Prefix LM. ' + f'Model does not belong to set of supported HF models:' + f'\n{_SUPPORTED_HF_MODELS}')
400
+
401
def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
    """Fill in `batch['bidirectional_mask']` in place when it is absent.

    A batch that already has the key is left untouched. Otherwise the mask is
    derived in one of two ways:
        - `batch['mode'] == 'icl_task'`: start from a copy of
          `attention_mask` and zero out each row's `continuation_indices`.
        - `labels` and `attention_mask` both present: mark positions where
          `attention_mask` equals 1 and `labels` equals -100, cast to the
          dtype of `attention_mask`.

    Raises:
        KeyError if bidirectional_mask is missing and can't be inferred
    """
    if 'bidirectional_mask' in batch:
        return
    if batch.get('mode', None) == 'icl_task':
        # Store the clone first, then carve the continuation positions out of it.
        batch['bidirectional_mask'] = batch['attention_mask'].clone()
        for row, continuation_indices in enumerate(batch['continuation_indices']):
            batch['bidirectional_mask'][row, continuation_indices] = 0
        return
    if 'labels' in batch and 'attention_mask' in batch:
        is_attended = torch.eq(batch['attention_mask'], 1)
        is_unlabelled = torch.eq(batch['labels'], -100)
        batch['bidirectional_mask'] = torch.logical_and(is_attended, is_unlabelled).type_as(batch['attention_mask'])
        return
    raise KeyError('No bidirectional_mask in batch and not sure how to construct one.')
meta_init_context.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import contextmanager
2
+ import torch
3
+ import torch.nn as nn
4
+
5
@contextmanager
def init_empty_weights(include_buffers: bool=False):
    """Context manager that creates model parameters on the meta device.

    Modules built inside the `with` block get their parameters on
    `torch.device('meta')`, which yields an "empty" model. This is useful
    when a regular initialization would exhaust the available RAM.

    Args:
        include_buffers (`bool`, *optional*, defaults to `False`): Whether
            buffers should also be placed on the meta device.

    Example:
    ```python
    import torch.nn as nn

    # Build a model with 100 billion parameters instantly and without using any RAM.
    with init_empty_weights():
        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
    ```

    Warning:
        A model created under this context manager has no weights, so calls
        such as `model.to(some_device)` will not work on it. Weights have to
        be loaded into the empty model separately.
    """
    meta_device = torch.device('meta')
    with init_on_device(meta_device, include_buffers=include_buffers) as ctx:
        yield ctx
35
+
36
@contextmanager
def init_on_device(device: torch.device, include_buffers: bool=False):
    """Device initialization context manager.

    A context manager under which models are initialized with all parameters
    on the specified device. While the context is active,
    `nn.Module.register_parameter` is patched globally; with
    `include_buffers=True`, `nn.Module.register_buffer` and the tensor
    constructors `torch.empty`, `torch.zeros`, `torch.ones` and `torch.full`
    are patched as well. All patches are undone on exit, including when the
    body raises.

    Args:
        device (`torch.device`): Device to initialize all parameters on.
        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
            not to also put all buffers on the specified device while
            initializing.

    Example:
    ```python
    import torch.nn as nn

    with init_on_device(device=torch.device("cuda")):
        tst = nn.Linear(100, 100)  # on `cuda` device
    ```
    """
    old_register_parameter = nn.Module.register_parameter
    if include_buffers:
        old_register_buffer = nn.Module.register_buffer

    def register_empty_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            registered = module._parameters[name]
            param_cls = type(registered)
            # Copy the instance attributes so the original parameter's
            # `__dict__` is not mutated, and carry `requires_grad` across
            # explicitly: the parameter constructor would otherwise fall back
            # to its default and silently unfreeze frozen parameters.
            kwargs = dict(registered.__dict__)
            kwargs['requires_grad'] = param.requires_grad
            module._parameters[name] = param_cls(registered.to(device), **kwargs)

    def register_empty_buffer(module, name, buffer, persistent=True):
        # Forward `persistent` so `register_buffer(..., persistent=False)`
        # keeps working while the patch is installed.
        old_register_buffer(module, name, buffer, persistent=persistent)
        if buffer is not None:
            module._buffers[name] = module._buffers[name].to(device)
    if include_buffers:
        tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
    else:
        tensor_constructors_to_patch = {}

    def patch_tensor_constructor(fn):

        def wrapper(*args, **kwargs):
            # Force the target device, overriding any caller-supplied `device`.
            kwargs['device'] = device
            return fn(*args, **kwargs)
        return wrapper
    try:
        nn.Module.register_parameter = register_empty_parameter
        if include_buffers:
            nn.Module.register_buffer = register_empty_buffer
        for torch_function_name in tensor_constructors_to_patch.keys():
            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
        yield
    finally:
        # Restore every patched attribute, even if the body raised.
        nn.Module.register_parameter = old_register_parameter
        if include_buffers:
            nn.Module.register_buffer = old_register_buffer
        for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
            setattr(torch, torch_function_name, old_torch_function)
modeling_mpt.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """A simple, flexible implementation of a GPT model.
2
+
3
+ Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
4
+ """
5
+ import math
6
+ import warnings
7
+ from typing import List, Optional, Tuple, Union
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
12
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
13
+ from .attention import attn_bias_shape, build_attn_bias
14
+ from .blocks import MPTBlock
15
+ from .norm import NORM_CLASS_REGISTRY
16
+ from .configuration_mpt import MPTConfig
17
+ from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
18
+ from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
19
+ from .meta_init_context import init_empty_weights
20
+ from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
21
+ Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
22
+
23
+ class MPTPreTrainedModel(PreTrainedModel):
24
+ config_class = MPTConfig
25
+ base_model_prefix = 'model'
26
+
27
+ class MPTModel(MPTPreTrainedModel):
28
+
29
+ def __init__(self, config: MPTConfig):
30
+ config._validate_config()
31
+ super().__init__(config)
32
+ self.attn_impl = config.attn_config['attn_impl']
33
+ self.prefix_lm = config.attn_config['prefix_lm']
34
+ self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
35
+ self.alibi = config.attn_config['alibi']
36
+ self.alibi_bias_max = config.attn_config['alibi_bias_max']
37
+ if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
38
+ norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
39
+ raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
40
+ norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
41
+ self.embedding_fraction = config.embedding_fraction
42
+ self.wte = nn.Embedding(config.vocab_size, config.d_model, device=config.init_device)
43
+ if not self.alibi:
44
+ self.wpe = nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
45
+ self.emb_drop = nn.Dropout(config.emb_pdrop)
46
+ self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
47
+ self.norm_f = norm_class(config.d_model, device=config.init_device)
48
+ if config.init_device != 'meta':
49
+ print(f'You are using config.init_device={config.init_device!r}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.')
50
+ self.apply(self.param_init_fn)
51
+ self.is_causal = not self.prefix_lm
52
+ self._attn_bias_initialized = False
53
+ self.attn_bias = None
54
+ self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
55
+ if config.no_bias:
56
+ for module in self.modules():
57
+ if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
58
+ if config.verbose:
59
+ warnings.warn(f'Removing bias ({module.bias}) from {module}.')
60
+ module.register_parameter('bias', None)
61
+ if config.verbose and config.verbose > 2:
62
+ print(self)
63
+ if 'verbose' not in self.config.init_config:
64
+ self.config.init_config['verbose'] = self.config.verbose
65
+ if self.config.init_config['verbose'] > 1:
66
+ init_fn_name = self.config.init_config['name']
67
+ warnings.warn(f'Using {init_fn_name} initialization.')
68
+
69
+ def get_input_embeddings(self):
70
+ return self.wte
71
+
72
+ def set_input_embeddings(self, value):
73
+ self.wte = value
74
+
75
+ @torch.no_grad()
76
+ def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
77
+ if not self._attn_bias_initialized:
78
+ if self.attn_bias_shape:
79
+ self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
80
+ self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
81
+ self._attn_bias_initialized = True
82
+ if self.attn_impl == 'flash':
83
+ return (self.attn_bias, attention_mask)
84
+ if self.attn_bias is not None:
85
+ self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
86
+ attn_bias = self.attn_bias
87
+ if self.prefix_lm:
88
+ assert isinstance(attn_bias, torch.Tensor)
89
+ assert isinstance(prefix_mask, torch.Tensor)
90
+ attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
91
+ if self.attn_uses_sequence_id and sequence_id is not None:
92
+ assert isinstance(attn_bias, torch.Tensor)
93
+ attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
94
+ if attention_mask is not None:
95
+ s_k = attention_mask.shape[-1]
96
+ if attn_bias is None:
97
+ attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
98
+ else:
99
+ attn_bias = attn_bias[:, :, :, -s_k:]
100
+ if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
101
+ raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
102
+ min_val = torch.finfo(attn_bias.dtype).min
103
+ attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
104
+ return (attn_bias, None)
105
+
106
+ def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
107
+ (s_k, s_q) = attn_bias.shape[-2:]
108
+ if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
109
+ raise ValueError('attn_bias does not match the expected shape. ' + f'The last two dimensions should both be {self.config.max_length} ' + f'but are {s_k} and {s_q}.')
110
+ seq_len = prefix_mask.shape[-1]
111
+ if seq_len > self.config.max_seq_len:
112
+ raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
113
+ attn_bias = attn_bias[..., :seq_len, :seq_len]
114
+ causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
115
+ prefix = prefix_mask.view(-1, 1, 1, seq_len)
116
+ cannot_attend = ~torch.logical_or(causal, prefix.bool())
117
+ min_val = torch.finfo(attn_bias.dtype).min
118
+ attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
119
+ return attn_bias
120
+
121
+ def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
122
+ seq_len = sequence_id.shape[-1]
123
+ if seq_len > self.config.max_seq_len:
124
+ raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
125
+ attn_bias = attn_bias[..., :seq_len, :seq_len]
126
+ cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
127
+ min_val = torch.finfo(attn_bias.dtype).min
128
+ attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
129
+ return attn_bias
130
+
131
+ def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
132
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
133
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
134
+ if attention_mask is not None:
135
+ attention_mask = attention_mask.bool()
136
+ if prefix_mask is not None:
137
+ prefix_mask = prefix_mask.bool()
138
+ if not return_dict:
139
+ raise NotImplementedError('return_dict False is not implemented yet for MPT')
140
+ if output_attentions:
141
+ raise NotImplementedError('output_attentions is not implemented yet for MPT')
142
+ if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
143
+ raise NotImplementedError('MPT does not support training with left padding.')
144
+ if self.prefix_lm and prefix_mask is None:
145
+ raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
146
+ if self.training:
147
+ if self.attn_uses_sequence_id and sequence_id is None:
148
+ raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
149
+ elif self.attn_uses_sequence_id is False and sequence_id is not None:
150
+ warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
151
+ S = input_ids.size(1)
152
+ assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
153
+ tok_emb = self.wte(input_ids)
154
+ if self.alibi:
155
+ x = tok_emb
156
+ else:
157
+ past_position = 0
158
+ if past_key_values is not None:
159
+ if len(past_key_values) != self.config.n_layers:
160
+ raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
161
+ past_position = past_key_values[0][0].size(1)
162
+ if S + past_position > self.config.max_seq_len:
163
+ raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
164
+ pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
165
+ if attention_mask is not None:
166
+ pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
167
+ pos_emb = self.wpe(pos)
168
+ x = tok_emb + pos_emb
169
+ if self.embedding_fraction == 1:
170
+ x = self.emb_drop(x)
171
+ else:
172
+ x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
173
+ assert isinstance(self.emb_drop, nn.Module)
174
+ x = self.emb_drop(x_shrunk)
175
+ (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
176
+ if use_cache and past_key_values is None:
177
+ past_key_values = [() for _ in range(self.config.n_layers)]
178
+ all_hidden_states = () if output_hidden_states else None
179
+ for (b_idx, block) in enumerate(self.blocks):
180
+ if output_hidden_states:
181
+ assert all_hidden_states is not None
182
+ all_hidden_states = all_hidden_states + (x,)
183
+ past_key_value = past_key_values[b_idx] if past_key_values is not None else None
184
+ (x, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
185
+ if past_key_values is not None:
186
+ past_key_values[b_idx] = past_key_value
187
+ x = self.norm_f(x)
188
+ return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states)
189
+
190
+ def param_init_fn(self, module):
191
+ init_fn_name = self.config.init_config['name']
192
+ MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
193
+
194
+ def fsdp_wrap_fn(self, module):
195
+ return isinstance(module, MPTBlock)
196
+
197
+ def activation_checkpointing_fn(self, module):
198
+ return isinstance(module, MPTBlock)
199
+
200
+ class MPTForCausalLM(MPTPreTrainedModel):
201
+
202
+ def __init__(self, config: MPTConfig):
203
+ super().__init__(config)
204
+ if not config.tie_word_embeddings:
205
+ raise ValueError('MPTForCausalLM only supports tied word embeddings')
206
+ self.transformer = MPTModel(config)
207
+ self.logit_scale = None
208
+ if config.logit_scale is not None:
209
+ logit_scale = config.logit_scale
210
+ if isinstance(logit_scale, str):
211
+ if logit_scale == 'inv_sqrt_d_model':
212
+ logit_scale = 1 / math.sqrt(config.d_model)
213
+ else:
214
+ raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
215
+ self.logit_scale = logit_scale
216
+
217
+ def get_input_embeddings(self):
218
+ return self.transformer.wte
219
+
220
+ def set_input_embeddings(self, value):
221
+ self.transformer.wte = value
222
+
223
+ def get_output_embeddings(self):
224
+ return self.transformer.wte
225
+
226
+ def set_output_embeddings(self, new_embeddings):
227
+ self.transformer.wte = new_embeddings
228
+
229
+ def set_decoder(self, decoder):
230
+ self.transformer = decoder
231
+
232
+ def get_decoder(self):
233
+ return self.transformer
234
+
235
+ def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
236
+ return_dict = return_dict if return_dict is not None else self.config.return_dict
237
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
238
+ outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
239
+ logits = F.linear(outputs.last_hidden_state, self.transformer.wte.weight)
240
+ if self.logit_scale is not None:
241
+ if self.logit_scale == 0:
242
+ warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
243
+ logits *= self.logit_scale
244
+ loss = None
245
+ if labels is not None:
246
+ labels = torch.roll(labels, shifts=-1)
247
+ labels[:, -1] = -100
248
+ loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
249
+ return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states)
250
+
251
+ def param_init_fn(self, module):
252
+ init_fn_name = self.config.init_config['name']
253
+ MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
254
+
255
+ def fsdp_wrap_fn(self, module):
256
+ return isinstance(module, MPTBlock)
257
+
258
+ def activation_checkpointing_fn(self, module):
259
+ return isinstance(module, MPTBlock)
260
+
261
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
262
+ if inputs_embeds is not None:
263
+ raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
264
+ attention_mask = kwargs['attention_mask'].bool()
265
+ if attention_mask[:, -1].sum() != attention_mask.shape[0]:
266
+ raise NotImplementedError('MPT does not support generation with right padding.')
267
+ if self.transformer.attn_uses_sequence_id and self.training:
268
+ sequence_id = torch.zeros_like(input_ids[:1])
269
+ else:
270
+ sequence_id = None
271
+ if past_key_values is not None:
272
+ input_ids = input_ids[:, -1].unsqueeze(-1)
273
+ if self.transformer.prefix_lm:
274
+ prefix_mask = torch.ones_like(attention_mask)
275
+ if kwargs.get('use_cache') == False:
276
+ raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
277
+ else:
278
+ prefix_mask = None
279
+ return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)}
280
+
281
+ @staticmethod
282
+ def _reorder_cache(past_key_values, beam_idx):
283
+ """Used by HuggingFace generate when using beam search with kv-caching.
284
+
285
+ See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
286
+ for an example in transformers.
287
+ """
288
+ reordered_past = []
289
+ for layer_past in past_key_values:
290
+ reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))]
291
+ return reordered_past
norm.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ def _cast_if_autocast_enabled(tensor):
4
+ if torch.is_autocast_enabled():
5
+ if tensor.device.type == 'cuda':
6
+ dtype = torch.get_autocast_gpu_dtype()
7
+ elif tensor.device.type == 'cpu':
8
+ dtype = torch.get_autocast_cpu_dtype()
9
+ else:
10
+ raise NotImplementedError()
11
+ return tensor.to(dtype=dtype)
12
+ return tensor
13
+
14
class LPLayerNorm(torch.nn.LayerNorm):
    """LayerNorm that runs on inputs and parameters cast to the autocast dtype."""

    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
        super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)

    def forward(self, x):
        """Apply layer norm to `x` with autocast disabled around the kernel."""
        input_device = x.device
        cast_input = _cast_if_autocast_enabled(x)
        # Weight and bias may be None; only cast the ones that exist.
        cast_weight = self.weight
        if cast_weight is not None:
            cast_weight = _cast_if_autocast_enabled(cast_weight)
        cast_bias = self.bias
        if cast_bias is not None:
            cast_bias = _cast_if_autocast_enabled(cast_bias)
        # The operands are already cast, so autocast is switched off here.
        with torch.autocast(enabled=False, device_type=input_device.type):
            return torch.nn.functional.layer_norm(cast_input, self.normalized_shape, cast_weight, cast_bias, self.eps)
26
+
27
def rms_norm(x, weight=None, eps=1e-05):
    """Root-mean-square normalization over the last dimension.

    Computes `x * rsqrt(mean(x ** 2, dim=-1) + eps)`, i.e. each vector along
    the last dimension is divided by its root mean square.

    Args:
        x: Input tensor.
        weight: Optional scale multiplied onto the normalized output.
        eps: Value added to the mean square before the reciprocal square root.

    Returns:
        The normalized tensor, scaled by `weight` when one is given.
    """
    # Multiply by the reciprocal square root; dividing by it would scale `x`
    # *up* by its RMS instead of normalizing it.
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    if weight is not None:
        return output * weight
    return output
32
+
33
class RMSNorm(torch.nn.Module):
    """Module wrapper around `rms_norm` with an optional learnable scale."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__()
        self.eps = eps
        if not weight:
            # Register the name so `self.weight` exists and is None.
            self.register_parameter('weight', None)
            return
        initial_scale = torch.ones(normalized_shape, dtype=dtype, device=device)
        self.weight = torch.nn.Parameter(initial_scale)

    def forward(self, x):
        """Normalize `x` in float32 and cast the result back to `x.dtype`."""
        normalized = rms_norm(x.float(), self.weight, self.eps)
        return normalized.to(dtype=x.dtype)
45
+
46
class LPRMSNorm(RMSNorm):
    """RMSNorm that runs on inputs and weight cast to the autocast dtype."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)

    def forward(self, x):
        """Normalize `x` with autocast disabled and return it in `x.dtype`."""
        cast_input = _cast_if_autocast_enabled(x)
        # The weight may be None; only cast it when it exists.
        cast_weight = self.weight
        if cast_weight is not None:
            cast_weight = _cast_if_autocast_enabled(cast_weight)
        with torch.autocast(enabled=False, device_type=x.device.type):
            normalized = rms_norm(cast_input, cast_weight, self.eps)
            return normalized.to(dtype=x.dtype)
56
+ NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
param_init_fns.py CHANGED
@@ -1,464 +1,181 @@
1
- # Copyright 2022 MosaicML Examples authors
2
- # SPDX-License-Identifier: Apache-2.0
3
  import math
4
  import warnings
5
  from collections.abc import Sequence
6
  from functools import partial
7
  from typing import Optional, Tuple, Union
8
-
9
  import torch
10
  from torch import nn
 
11
 
12
-
13
- def torch_default_param_init_fn_(
14
- module: nn.Module,
15
- verbose: int = 0,
16
- **kwargs,
17
- ):
18
- del kwargs # unused, just to capture any extra args from the config
19
  if verbose > 1:
20
- warnings.warn(
21
- f"Initializing network using module's reset_parameters attribute")
22
-
23
  if hasattr(module, 'reset_parameters'):
24
- module.reset_parameters() # type: ignore
25
-
26
 
27
  def fused_init_helper_(module: nn.Module, init_fn_):
28
- # parameter initialization is often based on the parameters shape.
29
- # If a layer is fused, initialization should be based on the shapes
30
- # of the original tensor instead of the shape of the fused tensor.
31
- # Layers which are fused should have the _fused attibute defined.
32
- # The first element of _fused is the dimension along which the tensor is fused.
33
- # This is followed by an iterable of split indices."
34
-
35
  _fused = getattr(module, '_fused', None)
36
-
37
  if _fused is None:
38
  raise RuntimeError(f'Internal logic error')
39
-
40
- dim, splits = _fused
41
- splits = (0, *splits, module.weight.size(dim)) # type: ignore
42
- for s, e in zip(splits[:-1], splits[1:]):
43
- slice_indices = [slice(None)] * module.weight.ndim # type: ignore
44
  slice_indices[dim] = slice(s, e)
45
- init_fn_(module.weight[slice_indices]) # type: ignore
46
-
47
 
48
- def generic_param_init_fn_(
49
- module: nn.Module,
50
- init_fn_,
51
- n_layers: int,
52
- d_model: Optional[int] = None,
53
- init_div_is_residual: Union[int, float, str, bool] = True,
54
- emb_init_std: Optional[float] = None,
55
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
56
- verbose: int = 0,
57
- **kwargs,
58
- ):
59
- del kwargs # unused, just to capture any extra args from the config
60
  if verbose > 1:
61
- warnings.warn(
62
- f'If model has bias parameters they are initialized to 0.')
63
-
64
- # enable user to divide _is_residual weights by
65
- # a value which defaults to math.sqrt(2 * cfg.n_layers)
66
  init_div_is_residual = init_div_is_residual
67
-
68
  if init_div_is_residual is False:
69
- # not used, for pyright
70
  div_is_residual = 1.0
71
  elif init_div_is_residual is True:
72
  div_is_residual = math.sqrt(2 * n_layers)
73
- elif isinstance(init_div_is_residual, float) or isinstance(
74
- init_div_is_residual, int):
75
  div_is_residual = init_div_is_residual
76
- elif isinstance(init_div_is_residual,
77
- str) and init_div_is_residual.isnumeric():
78
- # do not trust YAML parsing to always convert numbers to numbers
79
  div_is_residual = float(init_div_is_residual)
80
  else:
81
- # not used, for pyright
82
  div_is_residual = 1.0
83
- raise ValueError(
84
- f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}'
85
- )
86
-
87
  if init_div_is_residual is not False:
88
  if verbose > 1:
89
- warnings.warn(
90
- f'Initializing _is_residual layers then dividing them by {div_is_residual}.' +
91
- f'set `init_div_is_residual: false` in model config to disable this.'
92
- )
93
-
94
  if isinstance(module, nn.Linear):
95
- # Linear
96
  if hasattr(module, '_fused'):
97
  fused_init_helper_(module, init_fn_)
98
  else:
99
  init_fn_(module.weight)
100
  if module.bias is not None:
101
  torch.nn.init.zeros_(module.bias)
102
-
103
- if init_div_is_residual is not False and getattr(
104
- module, '_is_residual', False):
105
  with torch.no_grad():
106
  module.weight.div_(div_is_residual)
107
-
108
  elif isinstance(module, nn.Embedding):
109
- # Embedding
110
  if emb_init_std is not None:
111
  std = emb_init_std
112
  if std == 0:
113
  warnings.warn(f'Embedding layer initialized to 0.')
114
  emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
115
  if verbose > 1:
116
- warnings.warn(
117
- f'Embedding layer initialized using normal distribution with mean=0 and {std=}.'
118
- )
119
  elif emb_init_uniform_lim is not None:
120
  lim = emb_init_uniform_lim
121
  if isinstance(lim, Sequence):
122
  if len(lim) > 2:
123
- raise ValueError(
124
- f'Uniform init requires a min and a max limit. User input: {lim}.'
125
- )
126
  if lim[0] == lim[1]:
127
  warnings.warn(f'Embedding layer initialized to {lim[0]}.')
128
  else:
129
  if lim == 0:
130
  warnings.warn(f'Embedding layer initialized to 0.')
131
  lim = [-lim, lim]
132
- a, b = lim
133
  emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
134
  if verbose > 1:
135
- warnings.warn(
136
- f'Embedding layer initialized using uniform distribution in range {lim}.'
137
- )
138
  else:
139
  emb_init_fn_ = init_fn_
140
-
141
  emb_init_fn_(module.weight)
142
-
143
- elif isinstance(module, nn.LayerNorm):
144
- # LayerNorm
145
  if verbose > 1:
146
- warnings.warn(
147
- f'LayerNorm gamma weights are set to 1. If the layer has a bias it is initialized to 0.'
148
- )
149
- torch.nn.init.ones_(module.weight)
150
- if module.bias is not None:
151
  torch.nn.init.zeros_(module.bias)
152
-
153
  elif isinstance(module, nn.MultiheadAttention):
154
- # torch's MultiheadAttention
155
  if module._qkv_same_embed_dim:
156
  assert module.in_proj_weight is not None
157
- assert module.q_proj_weight is None and module.k_proj_weight is None and module.v_proj_weight is None
158
  assert d_model is not None
159
- # in_proj_weight is actually 3 layers and should be split up for width based init
160
  _d = d_model
161
  splits = (0, _d, 2 * _d, 3 * _d)
162
- for s, e in zip(splits[:-1], splits[1:]):
163
  init_fn_(module.in_proj_weight[s:e])
164
  else:
165
- assert module.q_proj_weight is not None and module.k_proj_weight is not None and module.v_proj_weight is not None
166
  assert module.in_proj_weight is None
167
  init_fn_(module.q_proj_weight)
168
  init_fn_(module.k_proj_weight)
169
  init_fn_(module.v_proj_weight)
170
-
171
- # bias
172
  if module.in_proj_bias is not None:
173
  torch.nn.init.zeros_(module.in_proj_bias)
174
  if module.bias_k is not None:
175
  torch.nn.init.zeros_(module.bias_k)
176
  if module.bias_v is not None:
177
  torch.nn.init.zeros_(module.bias_v)
178
-
179
- # out proj
180
  init_fn_(module.out_proj.weight)
181
- if init_div_is_residual is not False and getattr(
182
- module.out_proj, '_is_residual', False):
183
  with torch.no_grad():
184
  module.out_proj.weight.div_(div_is_residual)
185
  if module.out_proj.bias is not None:
186
  torch.nn.init.zeros_(module.out_proj.bias)
187
-
188
  else:
189
  for _ in module.parameters(recurse=False):
190
- # raise error if uninitialized module has any parameters
191
- raise NotImplementedError(
192
- f'{module.__class__.__name__} parameters are not initialized by param_init_fn.'
193
- )
194
-
195
 
196
  def _normal_init_(std, mean=0.0):
197
  return partial(torch.nn.init.normal_, mean=mean, std=std)
198
 
199
-
200
- def _normal_param_init_fn_(
201
- module: nn.Module,
202
- std: float,
203
- n_layers: int,
204
- d_model: Optional[int] = None,
205
- init_div_is_residual: Union[int, float, str, bool] = True,
206
- emb_init_std: Optional[float] = None,
207
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
208
- verbose: int = 0,
209
- **kwargs,
210
- ):
211
- del kwargs # unused, just to capture any extra args from the config
212
  init_fn_ = _normal_init_(std=std)
213
-
214
  if verbose > 1:
215
- warnings.warn(
216
- f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
217
-
218
- generic_param_init_fn_(
219
- module=module,
220
- init_fn_=init_fn_,
221
- d_model=d_model,
222
- n_layers=n_layers,
223
- init_div_is_residual=init_div_is_residual,
224
- emb_init_std=emb_init_std,
225
- emb_init_uniform_lim=emb_init_uniform_lim,
226
- verbose=verbose,
227
- )
228
-
229
 
230
- def baseline_param_init_fn_(
231
- module: nn.Module,
232
- init_std: float,
233
- n_layers: int,
234
- d_model: Optional[int] = None,
235
- init_div_is_residual: Union[int, float, str, bool] = True,
236
- emb_init_std: Optional[float] = None,
237
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
238
- verbose: int = 0,
239
- **kwargs,
240
- ):
241
- del kwargs # unused, just to capture any extra args from the config
242
  if init_std is None:
243
- raise ValueError(
244
- 'You must set model.init_std to a float value to use the default initialization scheme.'
245
- )
246
- _normal_param_init_fn_(
247
- module=module,
248
- std=init_std,
249
- d_model=d_model,
250
- n_layers=n_layers,
251
- init_div_is_residual=init_div_is_residual,
252
- emb_init_std=emb_init_std,
253
- emb_init_uniform_lim=emb_init_uniform_lim,
254
- verbose=verbose,
255
- )
256
 
257
-
258
- def small_param_init_fn_(
259
- module: nn.Module,
260
- n_layers: int,
261
- d_model: int,
262
- init_div_is_residual: Union[int, float, str, bool] = True,
263
- emb_init_std: Optional[float] = None,
264
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
265
- verbose: int = 0,
266
- **kwargs,
267
- ):
268
- del kwargs # unused, just to capture any extra args from the config
269
- # very close to kaiming normal
270
- # from Transformers without Tears (2019) - Nguyen & Salazar
271
  std = math.sqrt(2 / (5 * d_model))
272
- _normal_param_init_fn_(
273
- module=module,
274
- std=std,
275
- d_model=d_model,
276
- n_layers=n_layers,
277
- init_div_is_residual=init_div_is_residual,
278
- emb_init_std=emb_init_std,
279
- emb_init_uniform_lim=emb_init_uniform_lim,
280
- verbose=verbose,
281
- )
282
-
283
 
284
- def neox_param_init_fn_(
285
- module: nn.Module,
286
- n_layers: int,
287
- d_model: int,
288
- emb_init_std: Optional[float] = None,
289
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
290
- verbose: int = 0,
291
- **kwargs,
292
- ):
293
  """From section 2.3.1 of GPT-NeoX-20B:
294
 
295
  An Open-Source AutoregressiveLanguage Model — Black et. al. (2022)
296
  see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
297
  and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
298
  """
299
- del kwargs # unused, just to capture any extra args from the config
300
- residual_div = n_layers / math.sqrt(10) # small std / wang std
301
-
302
  if verbose > 1:
303
  warnings.warn(f'setting init_div_is_residual to {residual_div}')
 
304
 
305
- small_param_init_fn_(
306
- module=module,
307
- d_model=d_model,
308
- n_layers=n_layers,
309
- init_div_is_residual=residual_div,
310
- emb_init_std=emb_init_std,
311
- emb_init_uniform_lim=emb_init_uniform_lim,
312
- verbose=verbose,
313
- )
314
-
315
-
316
- def kaiming_uniform_param_init_fn_(
317
- module: nn.Module,
318
- n_layers: int,
319
- d_model: Optional[int] = None,
320
- init_div_is_residual: Union[int, float, str, bool] = True,
321
- emb_init_std: Optional[float] = None,
322
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
323
- init_gain: float = 0,
324
- fan_mode: str = 'fan_in',
325
- init_nonlinearity: str = 'leaky_relu',
326
- verbose: int = 0,
327
- **kwargs,
328
- ):
329
- del kwargs # unused, just to capture any extra args from the config
330
-
331
  if verbose > 1:
332
- warnings.warn(
333
- f'Using nn.init.kaiming_uniform_ init fn with parameters: ' +
334
- f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
335
- )
336
-
337
- kaiming_uniform_ = partial(nn.init.kaiming_uniform_,
338
- a=init_gain,
339
- mode=fan_mode,
340
- nonlinearity=init_nonlinearity)
341
-
342
- generic_param_init_fn_(
343
- module=module,
344
- init_fn_=kaiming_uniform_,
345
- d_model=d_model,
346
- n_layers=n_layers,
347
- init_div_is_residual=init_div_is_residual,
348
- emb_init_std=emb_init_std,
349
- emb_init_uniform_lim=emb_init_uniform_lim,
350
- verbose=verbose,
351
- )
352
-
353
-
354
- def kaiming_normal_param_init_fn_(
355
- module: nn.Module,
356
- n_layers: int,
357
- d_model: Optional[int] = None,
358
- init_div_is_residual: Union[int, float, str, bool] = True,
359
- emb_init_std: Optional[float] = None,
360
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
361
- init_gain: float = 0,
362
- fan_mode: str = 'fan_in',
363
- init_nonlinearity: str = 'leaky_relu',
364
- verbose: int = 0,
365
- **kwargs,
366
- ):
367
- del kwargs # unused, just to capture any extra args from the config
368
 
 
 
369
  if verbose > 1:
370
- warnings.warn(
371
- f'Using nn.init.kaiming_normal_ init fn with parameters: ' +
372
- f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}'
373
- )
374
-
375
- kaiming_normal_ = partial(torch.nn.init.kaiming_normal_,
376
- a=init_gain,
377
- mode=fan_mode,
378
- nonlinearity=init_nonlinearity)
379
-
380
- generic_param_init_fn_(
381
- module=module,
382
- init_fn_=kaiming_normal_,
383
- d_model=d_model,
384
- n_layers=n_layers,
385
- init_div_is_residual=init_div_is_residual,
386
- emb_init_std=emb_init_std,
387
- emb_init_uniform_lim=emb_init_uniform_lim,
388
- verbose=verbose,
389
- )
390
 
391
-
392
- def xavier_uniform_param_init_fn_(
393
- module: nn.Module,
394
- n_layers: int,
395
- d_model: Optional[int] = None,
396
- init_div_is_residual: Union[int, float, str, bool] = True,
397
- emb_init_std: Optional[float] = None,
398
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
399
- init_gain: float = 0,
400
- verbose: int = 0,
401
- **kwargs,
402
- ):
403
- del kwargs # unused, just to capture any extra args from the config
404
  xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
405
-
406
  if verbose > 1:
407
- warnings.warn(
408
- f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' +
409
- f'gain={init_gain}'
410
- )
411
-
412
- generic_param_init_fn_(
413
- module=module,
414
- init_fn_=xavier_uniform_,
415
- d_model=d_model,
416
- n_layers=n_layers,
417
- init_div_is_residual=init_div_is_residual,
418
- emb_init_std=emb_init_std,
419
- emb_init_uniform_lim=emb_init_uniform_lim,
420
- verbose=verbose,
421
- )
422
 
423
-
424
- def xavier_normal_param_init_fn_(
425
- module: nn.Module,
426
- n_layers: int,
427
- d_model: Optional[int] = None,
428
- init_div_is_residual: Union[int, float, str, bool] = True,
429
- emb_init_std: Optional[float] = None,
430
- emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
431
- init_gain: float = 0,
432
- verbose: int = 0,
433
- **kwargs,
434
- ):
435
  xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
436
-
437
  if verbose > 1:
438
- warnings.warn(
439
- f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' +
440
- f'gain={init_gain}'
441
- )
442
-
443
- generic_param_init_fn_(
444
- module=module,
445
- init_fn_=xavier_normal_,
446
- d_model=d_model,
447
- n_layers=n_layers,
448
- init_div_is_residual=init_div_is_residual,
449
- emb_init_std=emb_init_std,
450
- emb_init_uniform_lim=emb_init_uniform_lim,
451
- verbose=verbose,
452
- )
453
-
454
-
455
- MODEL_INIT_REGISTRY = {
456
- 'default_': torch_default_param_init_fn_,
457
- 'baseline_': baseline_param_init_fn_,
458
- 'kaiming_uniform_': kaiming_uniform_param_init_fn_,
459
- 'kaiming_normal_': kaiming_normal_param_init_fn_,
460
- 'neox_init_': neox_param_init_fn_,
461
- 'small_init_': small_param_init_fn_,
462
- 'xavier_uniform_': xavier_uniform_param_init_fn_,
463
- 'xavier_normal_': xavier_normal_param_init_fn_,
464
- }
 
 
1
  import math
2
  import warnings
3
  from collections.abc import Sequence
4
  from functools import partial
5
  from typing import Optional, Tuple, Union
 
6
  import torch
7
  from torch import nn
8
+ from .norm import NORM_CLASS_REGISTRY
9
 
10
def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
    """Re-initialize ``module`` via its own ``reset_parameters`` method.

    Modules that do not define ``reset_parameters`` are left untouched.

    Args:
        module: The module to initialize.
        verbose: Emits a warning describing the scheme when greater than 1.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    if verbose > 1:
        warnings.warn("Initializing network using module's reset_parameters attribute")
    if not hasattr(module, 'reset_parameters'):
        return
    module.reset_parameters()
 
16
 
17
def fused_init_helper_(module: nn.Module, init_fn_):
    """Initialize each constituent slice of a fused weight separately.

    Initializers are often shape dependent, so a fused weight is initialized
    piece by piece rather than as one tensor. ``module._fused`` must be a
    pair ``(dim, splits)``: the dimension along which the weight is fused and
    an iterable of split indices along that dimension.

    Args:
        module: Module with a ``weight`` tensor and a ``_fused`` attribute.
        init_fn_: In-place initializer called once per slice.

    Raises:
        RuntimeError: If ``module`` has no ``_fused`` attribute (or it is None).
    """
    _fused = getattr(module, '_fused', None)
    if _fused is None:
        raise RuntimeError('Internal logic error')
    dim, splits = _fused
    bounds = (0, *splits, module.weight.size(dim))
    for start, stop in zip(bounds[:-1], bounds[1:]):
        index = [slice(None)] * module.weight.ndim
        index[dim] = slice(start, stop)
        # Index with a tuple: a list of slices only works through legacy
        # "sequence treated as tuple" indexing.
        init_fn_(module.weight[tuple(index)])
 
27
 
28
def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Initialize the parameters owned directly by ``module``.

    Handles ``nn.Linear``, ``nn.Embedding``, every class registered in
    ``NORM_CLASS_REGISTRY`` and ``nn.MultiheadAttention``. Weights are
    initialized with ``init_fn_`` (embeddings may use a dedicated scheme),
    biases are zeroed, norm weights are set to one, and weights of layers
    flagged ``_is_residual`` are divided by a configurable factor.

    Args:
        module: Module to initialize (not recursed into).
        init_fn_: In-place weight initializer, called as ``init_fn_(tensor)``.
        n_layers: Layer count; used for the default residual divisor
            ``sqrt(2 * n_layers)``.
        d_model: Width used to split ``in_proj_weight`` of a fused
            ``nn.MultiheadAttention``; required in that case.
        init_div_is_residual: ``False`` disables residual scaling, ``True``
            uses the default divisor, a number or numeric string sets it.
        emb_init_std: If set, embeddings use a normal init with this std.
        emb_init_uniform_lim: If set (and ``emb_init_std`` is not), embeddings
            use a uniform init on ``(a, b)`` or ``(-lim, lim)`` for a scalar.
        verbose: Emits descriptive warnings when greater than 1.
        **kwargs: Ignored; accepted so extra config entries do not raise.

    Raises:
        ValueError: For a non-boolean, non-numeric ``init_div_is_residual`` or
            a uniform limit sequence that does not have exactly two entries.
        NotImplementedError: If ``module`` is of an unhandled type but owns
            parameters.
    """
    del kwargs
    if verbose > 1:
        warnings.warn('If model has bias parameters they are initialized to 0.')

    div_is_residual = _resolve_residual_divisor(init_div_is_residual, n_layers)
    scale_residual = init_div_is_residual is not False
    if scale_residual and verbose > 1:
        warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')

    if isinstance(module, nn.Linear):
        if hasattr(module, '_fused'):
            fused_init_helper_(module, init_fn_)
        else:
            init_fn_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        if scale_residual and getattr(module, '_is_residual', False):
            with torch.no_grad():
                module.weight.div_(div_is_residual)
    elif isinstance(module, nn.Embedding):
        emb_init_fn_ = _select_embedding_init_fn(init_fn_, emb_init_std, emb_init_uniform_lim, verbose)
        emb_init_fn_(module.weight)
    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
        if verbose > 1:
            warnings.warn('Norm weights are set to 1. If norm layer has a bias it is initialized to 0.')
        # Some registered norms can have no weight and/or no bias.
        if hasattr(module, 'weight') and module.weight is not None:
            torch.nn.init.ones_(module.weight)
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.MultiheadAttention):
        if module._qkv_same_embed_dim:
            assert module.in_proj_weight is not None
            assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
            assert d_model is not None
            # in_proj_weight stacks the q, k and v projections; initialize
            # each d_model-sized chunk on its own.
            splits = (0, d_model, 2 * d_model, 3 * d_model)
            for start, stop in zip(splits[:-1], splits[1:]):
                init_fn_(module.in_proj_weight[start:stop])
        else:
            assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
            assert module.in_proj_weight is None
            init_fn_(module.q_proj_weight)
            init_fn_(module.k_proj_weight)
            init_fn_(module.v_proj_weight)
        if module.in_proj_bias is not None:
            torch.nn.init.zeros_(module.in_proj_bias)
        if module.bias_k is not None:
            torch.nn.init.zeros_(module.bias_k)
        if module.bias_v is not None:
            torch.nn.init.zeros_(module.bias_v)
        init_fn_(module.out_proj.weight)
        if scale_residual and getattr(module.out_proj, '_is_residual', False):
            with torch.no_grad():
                module.out_proj.weight.div_(div_is_residual)
        if module.out_proj.bias is not None:
            torch.nn.init.zeros_(module.out_proj.bias)
    else:
        # Any directly-owned parameter on an unhandled module type is an error.
        for _ in module.parameters(recurse=False):
            raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')


def _resolve_residual_divisor(init_div_is_residual, n_layers):
    """Turn the ``init_div_is_residual`` setting into the divisor to apply."""
    if init_div_is_residual is False:
        return 1.0
    if init_div_is_residual is True:
        return math.sqrt(2 * n_layers)
    if isinstance(init_div_is_residual, (int, float)):
        return init_div_is_residual
    if isinstance(init_div_is_residual, str):
        # Config parsing may hand numbers over as strings. Parse with float()
        # so values such as '2.5' are accepted (str.isnumeric rejects them).
        try:
            return float(init_div_is_residual)
        except ValueError:
            pass
    raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')


def _select_embedding_init_fn(init_fn_, emb_init_std, emb_init_uniform_lim, verbose):
    """Pick the embedding initializer: normal, uniform, or the generic one."""
    if emb_init_std is not None:
        std = emb_init_std
        if std == 0:
            warnings.warn('Embedding layer initialized to 0.')
        emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
        if verbose > 1:
            warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
        return emb_init_fn_
    if emb_init_uniform_lim is not None:
        lim = emb_init_uniform_lim
        if isinstance(lim, Sequence):
            # Exactly (min, max) is required; shorter input used to slip past
            # the check and fail later with an unrelated IndexError.
            if len(lim) != 2:
                raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
            if lim[0] == lim[1]:
                warnings.warn(f'Embedding layer initialized to {lim[0]}.')
        else:
            if lim == 0:
                warnings.warn('Embedding layer initialized to 0.')
            lim = [-lim, lim]
        a, b = lim
        emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
        if verbose > 1:
            warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
        return emb_init_fn_
    return init_fn_
 
 
 
 
120
 
121
  def _normal_init_(std, mean=0.0):
122
  return partial(torch.nn.init.normal_, mean=mean, std=std)
123
 
124
def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with a zero-mean normal initializer.

    Args:
        module: Module to initialize.
        std: Standard deviation of the normal distribution.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: Forwarded unchanged to
            ``generic_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    init_fn_ = _normal_init_(std=std)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    generic_param_init_fn_(module=module, init_fn_=init_fn_, **forwarded)
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Normal initialization with a user-supplied standard deviation.

    Args:
        module: Module to initialize.
        init_std: Standard deviation for the normal initializer; required.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: Forwarded unchanged to
            ``_normal_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.

    Raises:
        ValueError: If ``init_std`` is ``None``.
    """
    del kwargs
    if init_std is None:
        raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    _normal_param_init_fn_(module=module, std=init_std, **forwarded)
 
 
 
 
 
 
 
 
 
 
 
136
 
137
def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Normal initialization with ``std = sqrt(2 / (5 * d_model))``.

    This is the "small init" of Transformers without Tears (Nguyen &
    Salazar, 2019), which is very close to Kaiming normal.

    Args:
        module: Module to initialize.
        n_layers: Forwarded to ``_normal_param_init_fn_``.
        d_model: Model width; determines the standard deviation.
        init_div_is_residual, emb_init_std, emb_init_uniform_lim, verbose:
            Forwarded unchanged to ``_normal_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    small_std = math.sqrt(2 / (5 * d_model))
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    _normal_param_init_fn_(module=module, std=small_std, **forwarded)
 
 
 
 
 
 
 
 
 
 
141
 
142
def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """Initialization from section 2.3.1 of GPT-NeoX-20B.

    GPT-NeoX-20B: An Open-Source Autoregressive Language Model -- Black et
    al. (2022); see
    https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
    and
    https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py

    Implemented as ``small_param_init_fn_`` with the residual divisor fixed
    to ``n_layers / sqrt(10)`` (the ratio small std / wang std).

    Args:
        module: Module to initialize.
        n_layers: Layer count; determines the residual divisor.
        d_model, emb_init_std, emb_init_uniform_lim, verbose: Forwarded
            unchanged to ``small_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    residual_div = n_layers / math.sqrt(10)
    if verbose > 1:
        warnings.warn(f'setting init_div_is_residual to {residual_div}')
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    small_param_init_fn_(module=module, init_div_is_residual=residual_div, **forwarded)
154
 
155
def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with ``nn.init.kaiming_uniform_``.

    Args:
        module: Module to initialize.
        init_gain: Passed to ``kaiming_uniform_`` as ``a``.
        fan_mode: Passed to ``kaiming_uniform_`` as ``mode``.
        init_nonlinearity: Passed to ``kaiming_uniform_`` as ``nonlinearity``.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: Forwarded unchanged to
            ``generic_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(f'Using nn.init.kaiming_uniform_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
    kaiming_kwargs = {'a': init_gain, 'mode': fan_mode, 'nonlinearity': init_nonlinearity}
    kaiming_uniform_ = partial(nn.init.kaiming_uniform_, **kaiming_kwargs)
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, **forwarded)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with ``torch.nn.init.kaiming_normal_``.

    Args:
        module: Module to initialize.
        init_gain: Passed to ``kaiming_normal_`` as ``a``.
        fan_mode: Passed to ``kaiming_normal_`` as ``mode``.
        init_nonlinearity: Passed to ``kaiming_normal_`` as ``nonlinearity``.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: Forwarded unchanged to
            ``generic_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    if verbose > 1:
        warnings.warn(f'Using nn.init.kaiming_normal_ init fn with parameters: a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
    kaiming_kwargs = {'a': init_gain, 'mode': fan_mode, 'nonlinearity': init_nonlinearity}
    kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, **kaiming_kwargs)
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, **forwarded)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with ``torch.nn.init.xavier_uniform_``.

    Args:
        module: Module to initialize.
        init_gain: Passed to ``xavier_uniform_`` as ``gain``.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: Forwarded unchanged to
            ``generic_param_init_fn_``.
        **kwargs: Ignored; accepted so extra config entries do not raise.
    """
    del kwargs
    # NOTE(review): the default init_gain=0 looks like it yields all-zero
    # weights, since xavier bounds scale with gain -- confirm callers always
    # pass a non-zero gain.
    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.xavier_uniform_ init fn with parameters: gain={init_gain}')
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, **forwarded)
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
    """Run ``generic_param_init_fn_`` with ``torch.nn.init.xavier_normal_``.

    Args:
        module: Module to initialize.
        init_gain: Passed to ``xavier_normal_`` as ``gain``.
        n_layers, d_model, init_div_is_residual, emb_init_std,
        emb_init_uniform_lim, verbose: Forwarded unchanged to
            ``generic_param_init_fn_``.
        **kwargs: Accepted so extra config entries do not raise; not used.
    """
    # NOTE(review): the default init_gain=0 looks like it yields all-zero
    # weights, since the xavier std scales with gain -- confirm callers always
    # pass a non-zero gain.
    xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.xavier_normal_ init fn with parameters: gain={init_gain}')
    forwarded = dict(
        d_model=d_model,
        n_layers=n_layers,
        init_div_is_residual=init_div_is_residual,
        emb_init_std=emb_init_std,
        emb_init_uniform_lim=emb_init_uniform_lim,
        verbose=verbose,
    )
    generic_param_init_fn_(module=module, init_fn_=xavier_normal_, **forwarded)
181
# Maps the initialization-scheme name used in configuration to its function.
MODEL_INIT_REGISTRY = {
    'default_': torch_default_param_init_fn_,
    'baseline_': baseline_param_init_fn_,
    'kaiming_uniform_': kaiming_uniform_param_init_fn_,
    'kaiming_normal_': kaiming_normal_param_init_fn_,
    'neox_init_': neox_param_init_fn_,
    'small_init_': small_param_init_fn_,
    'xavier_uniform_': xavier_uniform_param_init_fn_,
    'xavier_normal_': xavier_normal_param_init_fn_,
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
replit_lm_tokenizer.py CHANGED
@@ -16,19 +16,15 @@
16
  Forked from the file src/transformers/models/bert_generation/tokenization_bert_generation.py from the HuggingFace Transformers library.
17
  Permalink: https://github.com/huggingface/transformers/blob/04ab5605fbb4ef207b10bf2772d88c53fc242e83/src/transformers/models/bert_generation/tokenization_bert_generation.py
18
 
19
- Class is modified for compatibility with custom vocabulary and to achieve desired encode/decode behavior for Replit Code v1.3b model.
 
20
  """
21
-
22
- """ Tokenizer class for ReplitLM"""
23
-
24
-
25
  import os
26
  import sentencepiece as spm
27
  from shutil import copyfile
28
  from transformers import PreTrainedTokenizer
29
  from typing import Any, Dict, List, Optional, Tuple
30
- VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
31
-
32
 
33
  class ReplitLMTokenizer(PreTrainedTokenizer):
34
  """
@@ -61,37 +57,14 @@ class ReplitLMTokenizer(PreTrainedTokenizer):
61
  - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
62
  BPE-dropout.
63
  """
64
-
65
  vocab_files_names = VOCAB_FILES_NAMES
66
  prefix_tokens: List[int] = []
67
- model_input_names = ["input_ids", "attention_mask"]
68
 
69
- def __init__(
70
- self,
71
- vocab_file,
72
- bos_token=None,
73
- eos_token="<|endoftext|>",
74
- unk_token="<|unk|>",
75
- pad_token="<|pad|>",
76
- sep_token=None,
77
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
78
- **kwargs,
79
- ) -> None:
80
  self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
81
-
82
- # Add extra_ids to the special token list
83
- super().__init__(
84
- bos_token=bos_token,
85
- eos_token=eos_token,
86
- unk_token=unk_token,
87
- pad_token=pad_token,
88
- sep_token=sep_token,
89
- sp_model_kwargs=self.sp_model_kwargs,
90
- **kwargs,
91
- )
92
-
93
  self.vocab_file = vocab_file
94
-
95
  self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
96
  self.sp_model.Load(vocab_file)
97
 
@@ -100,23 +73,19 @@ class ReplitLMTokenizer(PreTrainedTokenizer):
100
  return self.sp_model.get_piece_size()
101
 
102
  def get_vocab(self):
103
- vocab = {self.convert_ids_to_tokens(
104
- i): i for i in range(self.vocab_size)}
105
  vocab.update(self.added_tokens_encoder)
106
  return vocab
107
 
108
  def __getstate__(self):
109
  state = self.__dict__.copy()
110
- state["sp_model"] = None
111
  return state
112
 
113
  def __setstate__(self, d):
114
  self.__dict__ = d
115
-
116
- # for backward compatibility
117
- if not hasattr(self, "sp_model_kwargs"):
118
  self.sp_model_kwargs = {}
119
-
120
  self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
121
  self.sp_model.load(self.vocab_file)
122
 
@@ -137,25 +106,14 @@ class ReplitLMTokenizer(PreTrainedTokenizer):
137
  """Converts a sequence of tokens (string) in a single string."""
138
  return self.sp_model.decode(tokens)
139
 
140
- def save_vocabulary(self,
141
- save_directory: str,
142
- filename_prefix: Optional[str] = None) -> Tuple[str]:
143
-
144
  if not os.path.isdir(save_directory):
145
- raise ValueError(
146
- f"Vocabulary path ({save_directory}) should be a directory")
147
-
148
- out_vocab_file = os.path.join(
149
- save_directory, (filename_prefix + "-" if filename_prefix else "") +
150
- VOCAB_FILES_NAMES["vocab_file"])
151
-
152
- if os.path.abspath(
153
- self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(
154
- self.vocab_file):
155
  copyfile(self.vocab_file, out_vocab_file)
156
  elif not os.path.isfile(self.vocab_file):
157
- with open(out_vocab_file, "wb") as fi:
158
  content_spiece_model = self.sp_model.serialized_model_proto()
159
  fi.write(content_spiece_model)
160
-
161
- return (out_vocab_file, )
16
  Forked from the file src/transformers/models/bert_generation/tokenization_bert_generation.py from the HuggingFace Transformers library.
17
  Permalink: https://github.com/huggingface/transformers/blob/04ab5605fbb4ef207b10bf2772d88c53fc242e83/src/transformers/models/bert_generation/tokenization_bert_generation.py
18
 
19
+ Tokenizer class for ReplitLM
20
+ Class is modified for compatibility with custom vocabulary and to achieve desired encode/decode behavior for Replit Code V1 3B model.
21
  """
 
 
 
 
22
  import os
23
  import sentencepiece as spm
24
  from shutil import copyfile
25
  from transformers import PreTrainedTokenizer
26
  from typing import Any, Dict, List, Optional, Tuple
27
+ VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
 
28
 
29
  class ReplitLMTokenizer(PreTrainedTokenizer):
30
  """
57
  - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
58
  BPE-dropout.
59
  """
 
60
  vocab_files_names = VOCAB_FILES_NAMES
61
  prefix_tokens: List[int] = []
62
+ model_input_names = ['input_ids', 'attention_mask']
63
 
64
+ def __init__(self, vocab_file, bos_token=None, eos_token='<|endoftext|>', unk_token='<|unk|>', pad_token='<|pad|>', sep_token=None, sp_model_kwargs: Optional[Dict[str, Any]]=None, **kwargs) -> None:
 
 
 
 
 
 
 
 
 
 
65
  self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
66
+ super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, sp_model_kwargs=self.sp_model_kwargs, **kwargs)
 
 
 
 
 
 
 
 
 
 
 
67
  self.vocab_file = vocab_file
 
68
  self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
69
  self.sp_model.Load(vocab_file)
70
 
73
  return self.sp_model.get_piece_size()
74
 
75
  def get_vocab(self):
76
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
 
77
  vocab.update(self.added_tokens_encoder)
78
  return vocab
79
 
80
  def __getstate__(self):
81
  state = self.__dict__.copy()
82
+ state['sp_model'] = None
83
  return state
84
 
85
  def __setstate__(self, d):
86
  self.__dict__ = d
87
+ if not hasattr(self, 'sp_model_kwargs'):
 
 
88
  self.sp_model_kwargs = {}
 
89
  self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
90
  self.sp_model.load(self.vocab_file)
91
 
106
  """Converts a sequence of tokens (string) in a single string."""
107
  return self.sp_model.decode(tokens)
108
 
109
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]:
 
 
 
110
  if not os.path.isdir(save_directory):
111
+ raise ValueError(f'Vocabulary path ({save_directory}) should be a directory')
112
+ out_vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'])
113
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
 
 
 
 
 
 
 
114
  copyfile(self.vocab_file, out_vocab_file)
115
  elif not os.path.isfile(self.vocab_file):
116
+ with open(out_vocab_file, 'wb') as fi:
117
  content_spiece_model = self.sp_model.serialized_model_proto()
118
  fi.write(content_spiece_model)
119
+ return (out_vocab_file,)
 
special_tokens_map.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "eos_token": "<|endoftext|>",
3
- "pad_token": "<|pad|>",
4
- "unk_token": "<|unk|>"
5
- }
1
  {
2
+ "eos_token": "<|endoftext|>",
3
+ "pad_token": "<|pad|>",
4
+ "unk_token": "<|unk|>"
5
+ }
tokenizer_config.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
- "auto_map": {
3
- "AutoTokenizer": [
4
- "replit_lm_tokenizer.ReplitLMTokenizer",
5
- null
6
- ]
7
- },
8
- "bos_token": null,
9
- "clean_up_tokenization_spaces": false,
10
- "eos_token": "<|endoftext|>",
11
- "model_max_length": 2048,
12
- "pad_token": "<|pad|>",
13
- "padding_side": "right",
14
- "sep_token": null,
15
- "sp_model_kwargs": {},
16
- "tokenizer_class": "ReplitLMTokenizer",
17
- "unk_token": "<|unk|>"
18
- }
1
  {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "replit_lm_tokenizer.ReplitLMTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "bos_token": null,
9
+ "clean_up_tokenization_spaces": false,
10
+ "eos_token": "<|endoftext|>",
11
+ "model_max_length": 2048,
12
+ "pad_token": "<|pad|>",
13
+ "padding_side": "right",
14
+ "sep_token": null,
15
+ "sp_model_kwargs": {},
16
+ "tokenizer_class": "ReplitLMTokenizer",
17
+ "unk_token": "<|unk|>"
18
+ }