Text Generation · Transformers · PyTorch · mpt · Composer · MosaicML · llm-foundry · custom_code · text-generation-inference

sam-mosaic committed commit e913229 (1 parent: 996ffc5)

Upload folder using huggingface_hub

attention.py CHANGED
@@ -46,7 +46,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
         attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
     if is_causal and (not q.size(2) == 1):
         s = max(s_q, s_k)
-        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)
         causal_mask = causal_mask.tril()
         causal_mask = causal_mask.to(torch.bool)
         causal_mask = ~causal_mask
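Note on this hunk: the mask is cast to bool before use, so switching the scratch buffer from float16 to float32 does not change the resulting mask; it plausibly just avoids an fp16 intermediate (for example on backends where half-precision tril is unsupported). Below is a minimal, self-contained sketch of the construction with illustrative shapes; the final slice and masked_fill are an assumption about how the mask is applied later in the function, included only to make the sketch runnable end to end.

import torch

# Illustrative sizes; in the real function these come from the query/key tensors.
s_q, s_k = 4, 6
attn_weight = torch.zeros(1, 1, s_q, s_k)                       # stand-in for attention scores
min_val = torch.finfo(attn_weight.dtype).min

s = max(s_q, s_k)
causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32)   # scratch buffer (dtype changed in this commit)
causal_mask = causal_mask.tril()                                # ones on and below the diagonal
causal_mask = causal_mask.to(torch.bool)
causal_mask = ~causal_mask                                      # True marks positions to block
# Assumed application step (not shown in the hunk): align to (s_q, s_k) and mask the scores.
causal_mask = causal_mask[-s_q:, -s_k:]
attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
print(attn_weight[0, 0])                                        # upper-triangle entries filled with min_val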
config.json CHANGED
@@ -27,9 +27,9 @@
     "emb_init_uniform_lim": null,
     "fan_mode": "fan_in",
     "init_div_is_residual": true,
-    "init_gain": 0,
+    "init_gain": 0.0,
     "init_nonlinearity": "relu",
-    "init_std": 0.02,
+    "init_std": null,
     "name": "kaiming_normal_",
     "verbose": 0
   },
@@ -45,7 +45,7 @@
   "resid_pdrop": 0,
   "tokenizer_name": "EleutherAI/gpt-neox-20b",
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.28.1",
+  "transformers_version": "4.30.2",
   "use_cache": false,
   "verbose": 0,
   "vocab_size": 50432
generation_config.json CHANGED
@@ -1,5 +1,5 @@
 {
   "_from_model_config": true,
-  "transformers_version": "4.28.1",
+  "transformers_version": "4.30.2",
   "use_cache": false
 }
modeling_mpt.py CHANGED
@@ -18,7 +18,7 @@ from .configuration_mpt import MPTConfig
 from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
 from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
 from .meta_init_context import init_empty_weights
-from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
+from .param_init_fns import generic_param_init_fn_, MODEL_INIT_REGISTRY
 try:
     from .flash_attn_triton import flash_attn_func
 except:
@@ -80,7 +80,7 @@ class MPTModel(MPTPreTrainedModel):
     def get_input_embeddings(self):
         return self.wte
 
-    def set_input_embeddings(self, value):
+    def set_input_embeddings(self, value: nn.Embedding):
         self.wte = value
 
     @torch.no_grad()
@@ -140,7 +140,7 @@ class MPTModel(MPTPreTrainedModel):
         attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
         return attn_bias
 
-    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
+    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.Tensor]=None):
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         if attention_mask is not None:
@@ -156,6 +156,8 @@ class MPTModel(MPTPreTrainedModel):
             raise NotImplementedError('MPT does not support training with left padding.')
         if self.prefix_lm and prefix_mask is None:
             raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
+        if inputs_embeds is not None:
+            raise NotImplementedError('inputs_embeds is not implemented for MPT.')
         if self.training:
             if self.attn_uses_sequence_id and sequence_id is None:
                 raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
@@ -225,7 +227,8 @@ class MPTForCausalLM(MPTPreTrainedModel):
         super().__init__(config)
         if not config.tie_word_embeddings:
             raise ValueError('MPTForCausalLM only supports tied word embeddings')
-        self.transformer = MPTModel(config)
+        print(f'Instantiating an MPTForCausalLM model from {__file__}')
+        self.transformer: MPTModel = MPTModel(config)
         for child in self.transformer.children():
             if isinstance(child, torch.nn.ModuleList):
                 continue
@@ -259,9 +262,11 @@ class MPTForCausalLM(MPTPreTrainedModel):
     def get_decoder(self):
         return self.transformer
 
-    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None):
+    def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor]=None):
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if inputs_embeds is not None:
+            raise NotImplementedError('inputs_embeds has to be None (for hf/peft support).')
         outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
         logits = self.transformer.wte(outputs.last_hidden_state.to(self.transformer.wte.weight.device), True)
         if self.logit_scale is not None:
@@ -270,9 +275,9 @@ class MPTForCausalLM(MPTPreTrainedModel):
             logits *= self.logit_scale
         loss = None
         if labels is not None:
-            labels = torch.roll(labels, shifts=-1)
-            labels[:, -1] = -100
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
+            _labels = torch.roll(labels, shifts=-1)
+            _labels[:, -1] = -100
+            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), _labels.to(logits.device).view(-1))
         return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
 
     def param_init_fn(self, module):
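Notes on these hunks: both forward signatures gain an inputs_embeds argument that is explicitly rejected, so the method signature matches what Hugging Face/PEFT wrappers pass even though embedding inputs remain unsupported. In the loss hunk, the shifted targets now go into a fresh _labels tensor instead of mutating the caller's labels in place, and the wrapped-around last position is set to -100, which F.cross_entropy ignores by default. A minimal sketch of that loss computation with illustrative sizes (the real logits come from the tied wte head):

import torch
import torch.nn.functional as F

batch, seq, vocab = 2, 5, 11                         # illustrative sizes
logits = torch.randn(batch, seq, vocab)              # stand-in for the model's output logits
labels = torch.randint(0, vocab, (batch, seq))

_labels = torch.roll(labels, shifts=-1)              # position t is supervised by token t+1
_labels[:, -1] = -100                                # wrapped-around position is ignored
loss = F.cross_entropy(logits.view(-1, vocab), _labels.view(-1))   # ignore_index defaults to -100
print(loss.item())
print(labels[:, -1])                                 # caller's labels are left untouched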
norm.py CHANGED
@@ -1,3 +1,4 @@
+from typing import Dict, Type
 import torch
 
 def _cast_if_autocast_enabled(tensor):
@@ -25,7 +26,7 @@ class LPLayerNorm(torch.nn.LayerNorm):
             return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
 
 def rms_norm(x, weight=None, eps=1e-05):
-    output = x / torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
     if weight is not None:
         return output * weight
     return output
@@ -53,4 +54,4 @@ class LPRMSNorm(RMSNorm):
         downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
         with torch.autocast(enabled=False, device_type=x.device.type):
             return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
-NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
+NORM_CLASS_REGISTRY: Dict[str, Type[torch.nn.Module]] = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
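On the rms_norm hunk: x * torch.rsqrt(mean(x^2) + eps) divides by the root-mean-square, whereas the old x / torch.rsqrt(...) multiplied by it. A small self-contained check of the corrected formula against an explicit reference:

import torch

def rms_norm(x, weight=None, eps=1e-05):
    # Corrected formula from the hunk: multiply by the reciprocal RMS.
    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
    if weight is not None:
        return output * weight
    return output

x = torch.randn(2, 8)
eps = 1e-05
reference = x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
print(torch.allclose(rms_norm(x, eps=eps), reference))   # True: x * rsqrt(.) == x / sqrt(.)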
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4e96cf543cf2dbb5579abe2ca1f69e75ed159ff5d3cbad4b5fd406617d80ef44
+oid sha256:6003cd1c33b5a661320c11225b54fb0cdfd931f73241ed810c57dc9e32163146
 size 9943040275
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f5fb9462a15a43819a0e2dd63faef50cea728f78d3de37721bcd2efe0d43439
+oid sha256:234b5d739ed88a00dcf1e28932158157418d386837d2345f0ec8a0b218e7d823
 size 3355599187