pirroh committed on
Commit
0ef6093
1 Parent(s): 1c6453a

Delete replit_lm.py

Browse files
Files changed (1) hide show
  1. replit_lm.py +0 -453
replit_lm.py DELETED
@@ -1,453 +0,0 @@
1
- # Copyright 2022 MosaicML Examples authors
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- """Forked from the MosaicGPT model class from the Mosaic Examples codebase of date May 1st, 2023.
5
- Permalink: https://github.com/mosaicml/examples/blob/52cd4fef69497f225a034fcd10692f8613732d10/examples/llm/src/models/mosaic_gpt/mosaic_gpt.py
6
- """
7
-
8
- """A simple, flexible implementation of a GPT model.
9
-
10
- Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
11
- """
12
-
13
- import math
14
- import torch
15
- import torch.nn as nn
16
- import torch.nn.functional as F
17
- import warnings
18
-
19
- from transformers import PreTrainedModel
20
- from transformers.modeling_outputs import CausalLMOutputWithPast
21
- from typing import List, Optional, Tuple
22
-
23
- from .attention import attn_bias as module_attn_bias, attn_bias_shape as module_attn_bias_shape
24
- from .gpt_blocks import GPTBlock
25
- from .configuration_replit_lm import \
26
- ReplitLMConfig
27
- from .param_init_fns import MODEL_INIT_REGISTRY
28
- from .low_precision_layernorm import LPLayerNorm
29
-
30
-
31
class ReplitLM(PreTrainedModel):
    """Decoder-only transformer language model exposed through the HuggingFace API.

    The model is assembled from a token embedding (``wte``), an optional learned
    position embedding (``wpe``, only created when ALiBi is disabled), an
    embedding dropout, ``config.n_layers`` ``GPTBlock`` modules and a final
    layer norm (``ln_f``). Output logits are produced by re-using the token
    embedding matrix (weight tying).
    """

    config_class = ReplitLMConfig
    base_model_prefix = 'replit_lm'

    def __init__(self, config: ReplitLMConfig):
        """Build the network described by ``config``.

        Args:
            config: Model hyper-parameters. The fields read here are
                ``attn_impl``, ``prefix_lm``, ``attn_uses_sequence_id``,
                ``alibi``, ``alibi_bias_max``, ``low_precision_layernorm``,
                ``embedding_fraction``, ``vocab_size``, ``d_model``,
                ``init_device``, ``max_seq_len``, ``emb_pdrop``, ``n_layers``,
                ``n_heads``, ``logit_scale``, ``no_bias`` and ``verbose``.

        Raises:
            RuntimeError: If flash attention is requested together with ALiBi.
            ValueError: If ``config.logit_scale`` is an unrecognized string.
        """
        super().__init__(config)

        if config.attn_impl == 'flash' and config.alibi:
            raise RuntimeError("ALiBi is not supported with flash attention. Please use triton or torch.")

        self.attn_impl = config.attn_impl
        self.prefix_lm = config.prefix_lm
        self.attn_uses_sequence_id = config.attn_uses_sequence_id
        self.alibi = config.alibi
        self.alibi_bias_max = config.alibi_bias_max

        layernorm_class = LPLayerNorm if config.low_precision_layernorm else nn.LayerNorm

        # CogView (https://arxiv.org/abs/2105.13290) and GLM-130B (https://arxiv.org/abs/2210.02414)
        # both report this helping with stabilizing training
        self.embedding_fraction = config.embedding_fraction

        self.transformer = nn.ModuleDict({
            'wte':
                nn.Embedding(config.vocab_size,
                             config.d_model,
                             device=config.init_device)
        })
        # Learned absolute positions are only needed when ALiBi is not
        # providing the positional signal.
        if not self.alibi:
            self.transformer.update({
                'wpe':
                    nn.Embedding(config.max_seq_len,
                                 config.d_model,
                                 device=config.init_device)
            })
        self.transformer.update({'emb_drop': nn.Dropout(config.emb_pdrop)})
        self.transformer.update({
            'blocks':
                nn.ModuleList([
                    GPTBlock(device=config.init_device,
                             **config.to_dict())
                    for _ in range(config.n_layers)
                ])
        })
        self.transformer.update({
            'ln_f': layernorm_class(config.d_model, device=config.init_device)
        })

        # enables scaling output logits; similar to a softmax "temperature"
        # PaLM paper uses scale 1/sqrt(config.d_model)
        self.logit_scale = None
        if config.logit_scale is not None:
            logit_scale = config.logit_scale
            if isinstance(logit_scale, str):
                if logit_scale == 'inv_sqrt_d_model':
                    logit_scale = 1 / math.sqrt(config.d_model)
                else:
                    raise ValueError(
                        f"{logit_scale=} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
                    )
            self.logit_scale = logit_scale

        # With a real (non-meta) device the parameters exist already, so they
        # are initialized eagerly here.
        if config.init_device != 'meta':
            print(
                f'You are using {config.init_device=}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
            )
            self.apply(self.param_init_fn)

        self.is_causal = not self.prefix_lm

        # define attn mask
        # The bias tensor itself is built lazily on the first forward pass
        # (see `_attn_bias`); only its shape is computed up front.
        self._attn_bias_initialized = False
        self.attn_bias = None
        self.attn_bias_shape = module_attn_bias_shape(
            self.attn_impl,
            config.n_heads,
            config.max_seq_len,
            self.alibi,
            prefix_lm=self.prefix_lm,
            causal=self.is_causal,
            use_sequence_id=self.attn_uses_sequence_id)

        if config.no_bias:
            # Strip every learnable `bias` parameter from all submodules.
            for module in self.modules():
                if hasattr(module, 'bias') and isinstance(
                        module.bias, nn.Parameter):
                    if config.verbose:
                        print(f'Removing bias ({module.bias}) from {module}.')
                    module.register_parameter('bias', None)

        if config.verbose and config.verbose > 2:
            print(self)

    @torch.no_grad()
    def _attn_bias(self,
                   device,
                   dtype,
                   attention_mask: Optional[torch.ByteTensor] = None,
                   prefix_mask: Optional[torch.ByteTensor] = None,
                   sequence_id: Optional[torch.LongTensor] = None):
        """Return the additive attention bias and the mask the blocks still need.

        The base bias is created once (on the first call) and cached on the
        module. For the torch/triton implementations the prefix mask, sequence
        ids and padding mask are folded into the bias, so ``None`` is returned
        in place of ``attention_mask``. For flash attention the cached bias and
        the untouched ``attention_mask`` are returned.

        Args:
            device: Device on which the bias should live.
            dtype: Dtype of the bias.
            attention_mask: Boolean padding mask; it is reshaped to
                ``(-1, 1, 1, s_k)`` and must therefore be a bool tensor.
            prefix_mask: Prefix mask; required when ``prefix_lm`` is enabled.
            sequence_id: Per-token sequence ids, used only when
                ``attn_uses_sequence_id`` is enabled.

        Returns:
            Tuple ``(attn_bias, attention_mask)``.

        Raises:
            ValueError: If ``attention_mask`` and ``prefix_mask`` differ in shape.
        """
        if not self._attn_bias_initialized:
            if self.attn_bias_shape:
                self.attn_bias = torch.zeros(self.attn_bias_shape,
                                             device=device,
                                             dtype=dtype)
                self.attn_bias = module_attn_bias(
                    self.attn_impl,
                    self.attn_bias,
                    self.config.n_heads,
                    self.config.max_seq_len,
                    causal=self.is_causal,
                    alibi=self.alibi,
                    alibi_bias_max=self.alibi_bias_max)
            self._attn_bias_initialized = True

        # flash does not support prefix_lm and will incorporate any
        # attention_mask inside the attention module
        if self.attn_impl == 'flash':
            return self.attn_bias, attention_mask

        # The cache was created with the device/dtype of the first call; keep
        # it in sync if the model has since been moved or cast. This is a
        # no-op when nothing changed.
        if self.attn_bias is not None:
            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)

        attn_bias = self.attn_bias

        # If using torch or triton, we incorporate the prefix_mask (if appropriate)
        if self.prefix_lm:
            assert isinstance(attn_bias, torch.Tensor)  # pyright
            assert isinstance(prefix_mask, torch.Tensor)  # pyright
            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)

        # If using torch or triton, we incorporate sequence_id (if appropriate)
        if self.attn_uses_sequence_id and sequence_id is not None:
            assert isinstance(attn_bias, torch.Tensor)  # pyright
            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)

        # If using torch or triton, we incorporate attention_mask. This will output
        # None in place of attention_mask since it will not be further needed in the
        # attention modules.
        if attention_mask is not None:
            s_k = attention_mask.shape[-1]
            if attn_bias is None:
                attn_bias = torch.zeros((1, 1, 1, s_k),
                                        device=device,
                                        dtype=dtype)
            else:
                # Keep only the last s_k key positions of the cached bias.
                attn_bias = attn_bias[:, :, :, -s_k:]
            if prefix_mask is not None and (attention_mask.shape !=
                                            prefix_mask.shape):
                raise ValueError(
                    f'attention_mask shape={attention_mask.shape} ' +\
                    f'and prefix_mask shape={prefix_mask.shape} are not equal.'
                )
            # Padded key positions get the most negative representable value.
            min_val = torch.finfo(attn_bias.dtype).min
            attn_bias = attn_bias.masked_fill(
                ~attention_mask.view(-1, 1, 1, s_k), min_val)

        return attn_bias, None

    def _apply_prefix_mask(self, attn_bias: torch.Tensor,
                           prefix_mask: torch.Tensor):
        """Allow bidirectional attention over the prefix on top of causal attention.

        Args:
            attn_bias: Bias whose last two dimensions must both equal
                ``config.max_seq_len``.
            prefix_mask: Mask whose last dimension is the sequence length;
                truthy entries mark prefix tokens.

        Returns:
            The bias cropped to ``seq_len x seq_len`` with every position that
            is neither causal nor a prefix key filled with the dtype minimum.

        Raises:
            ValueError: If ``attn_bias`` has an unexpected shape or the prefix
                mask is longer than ``config.max_seq_len``.
        """
        s_q, s_k = attn_bias.shape[-2:]
        if (s_q != self.config.max_seq_len) or (s_k != self.config.max_seq_len):
            # Report the limit that is actually being checked (max_seq_len).
            raise ValueError(
                'attn_bias does not match the expected shape. ' +\
                f'The last two dimensions should both be {self.config.max_seq_len} ' +\
                f'but are {s_q} and {s_k}.'
            )
        seq_len = prefix_mask.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
            )

        # select seq_len subset of attn mask
        attn_bias = attn_bias[..., :seq_len, :seq_len]

        # Mix the causal mask and the bidirectional mask to get the full
        # allowable attention (i.e. full = not accounting for padding yet)
        causal = torch.tril(
            torch.ones((seq_len, seq_len),
                       dtype=torch.bool,
                       device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
        prefix = prefix_mask.view(-1, 1, 1, seq_len)
        cannot_attend = ~torch.logical_or(causal, prefix.bool())

        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)

        return attn_bias

    def _apply_sequence_id(self, attn_bias: torch.Tensor,
                           sequence_id: torch.LongTensor):
        """Block attention between tokens that carry different sequence ids.

        Args:
            attn_bias: Bias to restrict; cropped to ``seq_len x seq_len``.
            sequence_id: Ids whose last dimension is the sequence length.

        Returns:
            The cropped bias with cross-sequence positions filled with the
            dtype minimum.

        Raises:
            ValueError: If ``sequence_id`` is longer than ``config.max_seq_len``.
        """
        seq_len = sequence_id.shape[-1]
        if seq_len > self.config.max_seq_len:
            raise ValueError(
                f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}'
            )

        # select seq_len subset of attn mask
        attn_bias = attn_bias[..., :seq_len, :seq_len]

        # Restrict attention to tokens that share the same value
        # in sequence_id
        cannot_attend = torch.logical_not(
            torch.eq(sequence_id.view(-1, seq_len, 1),
                     sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
        min_val = torch.finfo(attn_bias.dtype).min
        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)

        return attn_bias

    def forward(
            self,
            input_ids: torch.LongTensor,
            past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
            attention_mask: Optional[torch.ByteTensor] = None,
            prefix_mask: Optional[torch.ByteTensor] = None,
            sequence_id: Optional[torch.LongTensor] = None,
            return_dict: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            use_cache: Optional[bool] = None):
        """Run the model and return logits (plus optional cache / hidden states).

        Args:
            input_ids: Token ids; dimension 1 is the sequence length.
            past_key_values: One cached entry per layer, or ``None``.
            attention_mask: Padding mask (any dtype; converted to bool).
            prefix_mask: Prefix mask, required when ``prefix_lm`` is enabled
                (any dtype; converted to bool).
            sequence_id: Per-token sequence ids, required in training when
                ``attn_uses_sequence_id`` is enabled.
            return_dict: Must resolve to a truthy value.
            output_attentions: Must be falsy.
            output_hidden_states: If truthy, the input of every block is
                collected and returned.
            use_cache: If truthy, a key/value cache is created and returned.

        Returns:
            ``CausalLMOutputWithPast`` with ``logits``, ``past_key_values`` and
            ``hidden_states``.

        Raises:
            NotImplementedError: For ``return_dict=False``,
                ``output_attentions=True`` or left padding during training.
            ValueError: For a missing ``prefix_mask`` / ``sequence_id``, a
                ``past_key_values`` of the wrong length, or a total sequence
                length above ``config.max_seq_len``.
            AssertionError: If the input alone is longer than
                ``config.max_seq_len``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        # The masks are negated with `~` and passed to `masked_fill` further
        # down; both operations need boolean tensors, while tokenizers commonly
        # provide integer masks.
        if attention_mask is not None:
            attention_mask = attention_mask.bool()
        if prefix_mask is not None:
            prefix_mask = prefix_mask.bool()

        # These args are passed in by keyword in huggingface's generate function
        # https://github.com/huggingface/transformers/blob/68287689f2f0d8b7063c400230b3766987abf18d/src/transformers/generation/utils.py#L2201-L2206
        # but have not yet been fully implemented in ReplitLM
        if not return_dict:
            raise NotImplementedError(
                'return_dict False is not implemented yet for ReplitLM')
        if output_attentions:
            raise NotImplementedError(
                'output_attentions is not implemented yet for ReplitLM')

        # Left padding shows up as at least one masked-out first position.
        if attention_mask is not None and attention_mask[:, 0].sum(
        ) != attention_mask.shape[0] and self.training:
            raise NotImplementedError(
                'ReplitLM does not support training with left padding.')

        if self.prefix_lm and prefix_mask is None:
            raise ValueError(
                'prefix_mask is a required argument when ReplitLM is configured with prefix_lm=True.'
            )

        if self.training:
            if self.attn_uses_sequence_id and sequence_id is None:
                raise ValueError(
                    'sequence_id is a required argument when ReplitLM is configured with attn_uses_sequence_id=True ' +\
                    'and the model is in train mode.'
                )
            elif (self.attn_uses_sequence_id is False) and (sequence_id
                                                            is not None):
                warnings.warn(
                    'ReplitLM received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' +\
                    'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.'
                )

        S = input_ids.size(1)

        assert (
            S <= self.config.max_seq_len
        ), f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'

        tok_emb = self.transformer.wte(input_ids)  # type: ignore
        if self.alibi:
            x = tok_emb
        else:
            past_position = 0
            if past_key_values is not None:
                if len(past_key_values) != self.config.n_layers:
                    raise ValueError(
                        f'past_key_values must provide a past_key_value for each attention ' +\
                        f'layer in the network ({len(past_key_values)=}; {self.config.n_layers=}).'
                    )
                # get the key tensor whose spec should be (batch, seq, dim), and
                # collect the `seq`, so that the position embedding is shifted
                past_position = past_key_values[0][0].size(1)

            if S + past_position > self.config.max_seq_len:
                raise ValueError(
                    f'Cannot forward input with past sequence length {past_position} and current sequence length '
                    f'{S}, this model only supports total sequence length <= {self.config.max_seq_len}.'
                )
            pos = torch.arange(past_position,
                               S + past_position,
                               dtype=torch.long,
                               device=input_ids.device).unsqueeze(0)
            if attention_mask is not None:
                # adjust the position indices to account for padding tokens
                pos = torch.clamp(pos - torch.cumsum(
                    (~attention_mask).to(torch.int32), dim=1)[:,
                                                              past_position:],
                                  min=0)

            pos_emb = self.transformer.wpe(pos)  # type: ignore
            x = tok_emb + pos_emb

        if self.embedding_fraction == 1:
            x = self.transformer.emb_drop(x)  # type: ignore
        else:
            # this implementation is proposed on page 7 of the GLM-130B paper https://arxiv.org/abs/2210.02414
            # Only `embedding_fraction` of the gradient reaches the embeddings;
            # the forward value is unchanged.
            x_shrunk = (x * self.embedding_fraction) + (
                x.detach() * (1 - self.embedding_fraction))
            assert isinstance(self.transformer.emb_drop, nn.Module)  # pyright
            x = self.transformer.emb_drop(x_shrunk)

        attn_bias, attention_mask = self._attn_bias(
            device=x.device,
            dtype=x.dtype,
            attention_mask=attention_mask,
            prefix_mask=prefix_mask,
            sequence_id=sequence_id)

        # initialize the past key values cache if it should be used
        if use_cache and past_key_values is None:
            past_key_values = [() for _ in range(self.config.n_layers)
                              ]  # type: ignore

        all_hidden_states = () if output_hidden_states else None
        for b_idx, block in enumerate(self.transformer.blocks):  # type: ignore
            if output_hidden_states:
                assert all_hidden_states is not None  # pyright
                all_hidden_states = all_hidden_states + (x,)
            past_key_value = past_key_values[
                b_idx] if past_key_values is not None else None
            x, past_key_value = block(x,
                                      past_key_value=past_key_value,
                                      attn_bias=attn_bias,
                                      attention_mask=attention_mask,
                                      is_causal=self.is_causal)
            if past_key_values is not None:
                past_key_values[b_idx] = past_key_value

        x = self.transformer.ln_f(x)  # type: ignore

        # output embedding weight tied to input embedding
        assert isinstance(self.transformer.wte, nn.Module)  # pyright
        assert isinstance(self.transformer.wte.weight, torch.Tensor)  # pyright
        logits = F.linear(x, self.transformer.wte.weight, None)

        if self.logit_scale is not None:
            if self.logit_scale == 0:
                warnings.warn(
                    f'Multiplying logits by {self.logit_scale=}. This will produce uniform (uninformative) outputs.'
                )
            logits *= self.logit_scale

        return CausalLMOutputWithPast(logits=logits,
                                      past_key_values=past_key_values,
                                      hidden_states=all_hidden_states)

    # Param Initialization, needed for device='meta' fast initialization
    def param_init_fn(self, module):
        """Initialize ``module`` with the function named by ``config.param_init_fn``.

        The function is looked up in ``MODEL_INIT_REGISTRY`` and receives the
        module plus every config entry as keyword arguments.
        """
        init_fn_name = self.config.param_init_fn
        if self.config.verbose > 1:
            warnings.warn(f'Using {init_fn_name} initialization.')
        MODEL_INIT_REGISTRY[init_fn_name](module=module,
                                          **self.config.to_dict())

    # FSDP Wrap function
    def fsdp_wrap_fn(self, module):
        """Return ``True`` when ``module`` is a ``GPTBlock``."""
        return isinstance(module, GPTBlock)

    # Activation Checkpointing
    def activation_checkpointing_fn(self, module):
        """Return ``True`` when ``module`` is a ``GPTBlock``."""
        return isinstance(module, GPTBlock)

    def prepare_inputs_for_generation(self,
                                      input_ids,
                                      past_key_values=None,
                                      inputs_embeds=None,
                                      **kwargs):
        """Assemble the keyword arguments for one generation step.

        Args:
            input_ids: Token ids generated so far.
            past_key_values: Cache from the previous step, if any. When given,
                only the last token of ``input_ids`` is forwarded.
            inputs_embeds: Not supported; must be ``None``.
            **kwargs: Must contain ``attention_mask``; ``use_cache`` is
                optional and defaults to ``True`` in the returned dict.

        Returns:
            Dict with ``input_ids``, ``attention_mask``, ``prefix_mask``,
            ``sequence_id``, ``past_key_values`` and ``use_cache``.

        Raises:
            NotImplementedError: For ``inputs_embeds``, right-padded input, or
                ``prefix_lm`` combined with ``use_cache=False``.
        """
        if inputs_embeds is not None:
            raise NotImplementedError(
                'inputs_embeds is not implemented for ReplitLM yet')

        attention_mask = kwargs['attention_mask'].bool()
        # Right padding shows up as at least one masked-out last position.
        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
            raise NotImplementedError(
                'ReplitLM does not support generation with right padding.')

        if self.attn_uses_sequence_id and self.training:
            sequence_id = torch.zeros_like(input_ids[:1])
        else:
            sequence_id = None

        if past_key_values is not None:
            input_ids = input_ids[:, -1].unsqueeze(-1)

        if self.prefix_lm:
            # Leverage a convenience of sequential generation!
            prefix_mask = torch.ones_like(attention_mask)
            # This requires that we're using the cache
            if kwargs.get('use_cache') == False:
                raise NotImplementedError(
                    'ReplitLM with prefix_lm=True does not support use_cache=False.'
                )
        else:
            prefix_mask = None

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'prefix_mask': prefix_mask,
            'sequence_id': sequence_id,
            'past_key_values': past_key_values,
            'use_cache': kwargs.get('use_cache', True),
        }

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Used by HuggingFace generate when using beam search with kv-caching.

        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
        for an example in transformers.

        Every cached tensor of every layer is re-indexed along dimension 0 with
        ``beam_idx``; the result is a list with one tuple per layer.
        """
        reordered_past = []
        for layer_past in past_key_values:
            reordered_past += [
                tuple(
                    past_state.index_select(0, beam_idx)
                    for past_state in layer_past)
            ]
        return reordered_past