Tags: Text Generation · Transformers · PyTorch · mpt · Composer · MosaicML · llm-foundry · conversational · custom_code · text-generation-inference
Commit a59f066 by sam-mosaic (parent: 7756256)

Upload folder using huggingface_hub
attention.py CHANGED
@@ -55,7 +55,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
     attn_weight = torch.softmax(attn_weight, dim=-1)
     if dropout_p:
         attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
-    out = attn_weight.matmul(v)
+    out = attn_weight.to(v.dtype).matmul(v)
     out = rearrange(out, 'b h s d -> b s (h d)')
     if needs_weights:
        return (out, attn_weight, past_key_value)
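This cast pairs with the `_attn_bias` change in modeling_mpt.py below: once the attention bias is built in float32, PyTorch's type promotion carries `attn_weight` out of the softmax in float32 while the value tensor `v` stays in the model's half precision, and a plain `matmul` between the two raises a dtype-mismatch error. A minimal sketch of the failure and the fix; shapes and names are illustrative, not taken from the repo:

```python
import torch

# Half-precision values, as produced by a bf16 forward pass.
v = torch.randn(1, 8, 16, 64, dtype=torch.bfloat16)      # (b, h, s, d)
# Attention scores promoted to float32 by the float32 attention bias.
scores = torch.randn(1, 8, 16, 16, dtype=torch.float32)  # (b, h, s, s)
attn_weight = torch.softmax(scores, dim=-1)

# attn_weight.matmul(v) would raise a RuntimeError (mismatched dtypes);
# casting the weights back to v's dtype restores the matmul.
out = attn_weight.to(v.dtype).matmul(v)
print(out.dtype)  # torch.bfloat16
```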
config.json CHANGED
@@ -8,8 +8,8 @@
     "attn_impl": "torch",
     "attn_pdrop": 0,
     "attn_type": "multihead_attention",
-    "attn_uses_sequence_id": true,
-    "clip_qkv": 6,
+    "attn_uses_sequence_id": false,
+    "clip_qkv": null,
     "prefix_lm": false,
     "qk_ln": false,
     "softmax_scale": null
@@ -36,7 +36,7 @@
   "init_device": "cpu",
   "learned_pos_emb": true,
   "logit_scale": null,
-  "max_seq_len": 2048,
+  "max_seq_len": 8192,
   "model_type": "mpt",
   "n_heads": 64,
   "n_layers": 48,
modeling_mpt.py CHANGED
@@ -40,6 +40,11 @@ class MPTModel(MPTPreTrainedModel):
         self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
         self.alibi = config.attn_config['alibi']
         self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        if config.init_device == 'mixed':
+            if dist.get_local_rank() == 0:
+                config.init_device = 'cpu'
+            else:
+                config.init_device = 'meta'
         if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
             norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
             raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
@@ -182,7 +187,7 @@ class MPTModel(MPTPreTrainedModel):
         x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
         assert isinstance(self.emb_drop, nn.Module)
         x = self.emb_drop(x_shrunk)
-        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=x.dtype, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
+        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
         if use_cache and past_key_values is None:
             past_key_values = [() for _ in range(self.config.n_layers)]
         all_hidden_states = () if output_hidden_states else None
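The second hunk pins the attention bias to float32 regardless of the activation dtype, which is what makes the cast in attention.py above necessary. The first hunk's new 'mixed' option matters for a model of this size: only local rank 0 materializes real weights on CPU, while every other rank builds the module on PyTorch's meta device (shapes only, no storage), so a multi-GPU node never holds one full checkpoint copy per process. A minimal sketch of the same pattern, assuming processes are launched with torchrun (which sets the LOCAL_RANK environment variable; `dist.get_local_rank()` in the diff resolves the same value):

```python
import os
import torch
import torch.nn as nn

def build_block(init_device: str) -> nn.Module:
    # Sketch of the 'mixed' policy: real weights on one process per node,
    # storage-free meta tensors everywhere else.
    if init_device == 'mixed':
        init_device = 'cpu' if int(os.environ.get('LOCAL_RANK', '0')) == 0 else 'meta'
    with torch.device(init_device):  # PyTorch 2.x device context manager
        # On 'meta' this allocates no storage; weights are loaded/broadcast later.
        return nn.Linear(4096, 4096, bias=False)

block = build_block('mixed')
print(next(block.parameters()).device)  # cpu on local rank 0, meta elsewhere
```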
pytorch_model-00001-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:81689887c0b6b69e85ba724f5c5f2b723ab9af1e41ffcdf5978417023abfb567
+oid sha256:bb8229f99d31f14643324ae69c945bfc5c2548dee813ef23b7260b8d22ff3d82
 size 9766157965
pytorch_model-00002-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:06b7ba1da683bc45b768d48eafa1ee4faf82b61091525a8f7ea1a60cc2ec34ec
+oid sha256:f4612b6e2b09839df9f3babfb1d00a49ff6eef3a8e6126a1e37732f59b9539c7
 size 9865248775
pytorch_model-00003-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:966f90ff885aa3349fefb60b6d67bf90054ba64c51d4603db6cfce62e1d7a625
+oid sha256:abf88070285848375c85b1fc5328b8b10485286b7c453a7a3e66e36f1be19aa8
 size 9865248775
pytorch_model-00004-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9187a71cc8dbc91e7fe4abe99609eda468027c98983c4314ef3deb18a923aec9
+oid sha256:028048cf61661c4c9173918053dad5aab752767f63d3e11133aee4f2c0431fbd
 size 9865248775
pytorch_model-00005-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:788d60e164191f46fe72d24191df4da1cc2efafb6055f47dc52642f18f4db639
+oid sha256:5990a41d6a5a2418977a75622b00106043b68786cf85f221361b3711522290e8
 size 9865248775
pytorch_model-00006-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0bab4477c25ad39e73ace5a9ad79a3d9cdee8939ffd06bfc26843d54c6b0a956
+oid sha256:c7f090b3e9c31fef6b85569d537c86f1648bc61631e7d1fa83d498124bb7e752
 size 9865248775
pytorch_model-00007-of-00007.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33f1b1ad786d84e232016512b0831c1e032dbcdf7250c767ac7d9c068c647425
+oid sha256:d1ab59fada51f526990510c5601e82d4f606214cfad59ebf5a333f6d12d63f1f
 size 822099468
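All seven shard pointers change their sha256 oid while every byte size stays identical: the weights were re-serialized and re-uploaded with unchanged tensor shapes. A small sketch for checking a downloaded shard against its Git LFS pointer; the oid below is the new one for shard 1 of this commit:

```python
import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so a ~10 GB shard never has to fit in memory.
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        while block := f.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()

assert sha256_of('pytorch_model-00001-of-00007.bin') == (
    'bb8229f99d31f14643324ae69c945bfc5c2548dee813ef23b7260b8d22ff3d82')
```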
tokenizer.json CHANGED
@@ -1,7 +1,14 @@
 {
   "version": "1.0",
   "truncation": null,
-  "padding": null,
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Left",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "<|endoftext|>"
+  },
   "added_tokens": [
     {
       "id": 0,
tokenizer_config.json CHANGED
@@ -3,7 +3,7 @@
   "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
-  "model_max_length": 2048,
+  "model_max_length": 8192,
   "tokenizer_class": "GPTNeoXTokenizer",
   "unk_token": "<|endoftext|>"
 }
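This keeps `model_max_length` in lockstep with the model's new `max_seq_len`, so tokenizer-side truncation clips at the 8192-token context window rather than the old 2048. For example (repo id again assumed):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('mosaicml/mpt-30b-chat', trust_remote_code=True)
print(tok.model_max_length)  # 8192 after this commit
ids = tok('word ' * 10000, truncation=True)['input_ids']
assert len(ids) <= 8192      # truncated to the model's context window
```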