DanielHesslow committed
Commit ae25659
1 Parent(s): e2f0366
Files changed (4)
  1. config.json +5 -3
  2. pytorch_model.bin +2 -2
  3. rita_configuration.py +3 -1
  4. rita_modeling.py +217 -15
config.json CHANGED
@@ -1,17 +1,19 @@
 {
-  "_name_or_path": "Seledorn/RITA_m",
+  "_name_or_path": "Seledorn/RITA_m_2",
   "architectures": [
-    "RITAModel"
+    "RITAModelForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "rita_configuration.RITAConfig",
     "AutoModel": "rita_modeling.RITAModel",
-    "AutoModelForCausalLM": "rita_modeling.RITAModel"
+    "AutoModelForCausalLM": "rita_modeling.RITAModelForCausalLM",
+    "AutoModelForSequenceClassification": "rita_modeling.RITAModelForSequenceClassification"
   },
   "d_feedforward": 4096,
   "d_model": 1024,
   "dropout": 0.0,
   "eos_token_id": 2,
+  "initializer_range": 0.02,
   "max_seq_len": 1024,
   "model_type": "rita",
   "num_heads": 16,
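With the expanded auto_map, the Hugging Face Auto classes resolve to the custom classes shipped in this repository when loading with trust_remote_code=True. A minimal loading sketch, assuming the repository id recorded in "_name_or_path" above ("Seledorn/RITA_m_2") is the one being loaded from:

from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification

repo_id = "Seledorn/RITA_m_2"  # assumption: taken from "_name_or_path"; substitute the actual repo id

# auto_map routes each Auto class to the code files in the repo,
# so trust_remote_code=True is required.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)                        # -> RITAConfig
backbone = AutoModel.from_pretrained(repo_id, trust_remote_code=True)                       # -> RITAModel
lm = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)                  # -> RITAModelForCausalLM
clf = AutoModelForSequenceClassification.from_pretrained(repo_id, trust_remote_code=True)   # -> RITAModelForSequenceClassification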
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d65d9e6f6e9d88059d230f690d3d56daa3c1d88da3282f9e5ac1cbf0d6d6f18c
-size 604802635
+oid sha256:f27acd85e3cdaf1f803995d7ae653e9b00dac37cca4dd6048c5839c92df93548
+size 604861001
rita_configuration.py CHANGED
@@ -16,6 +16,7 @@ class RITAConfig(PretrainedConfig):
         dropout=0.,
         ff_ratio=4,
         eos_token_id=2,
+        initializer_range=0.02,
         **kwargs,
     ):
         super().__init__(eos_token_id=eos_token_id, **kwargs)
@@ -26,4 +27,5 @@ class RITAConfig(PretrainedConfig):
         self.num_layers = num_layers
         self.max_seq_len=max_seq_len
         self.dropout = dropout
-        self.eos_token_id=eos_token_id
+        self.eos_token_id=eos_token_id
+        self.initializer_range=0.02
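The configuration now exposes initializer_range, which the new _init_weights hooks in rita_modeling.py read. A minimal sketch, assuming rita_configuration.py is importable locally (e.g. downloaded from this repo) and that the remaining constructor arguments keep their defaults:

from rita_configuration import RITAConfig

config = RITAConfig()
print(config.initializer_range)  # 0.02
# Note: as committed, __init__ assigns the literal 0.02, so passing a different
# initializer_range argument does not change the stored attribute.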
rita_modeling.py CHANGED
@@ -6,14 +6,12 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, BCEWithLogitsLoss, MSELoss
 
 from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    BaseModelOutputWithPastAndCrossAttentions,
-    CausalLMOutputWithCrossAttentions,
-    CausalLMOutputWithPast,
+    BaseModelOutput,
     CausalLMOutput,
+    SequenceClassifierOutput
 )
 
 from transformers.modeling_utils import PreTrainedModel
@@ -210,9 +208,12 @@ class DecoderLayer(nn.Module):
         y = self.mlp(y)
         x = x + self.mlp_dropout(y)
         return x
-
+
 class RITAModel(PreTrainedModel):
     config_class = RITAConfig
+    base_model_prefix = "transformer"
+    is_parallelizable = False
+
     def __init__(
         self,
         config
@@ -221,7 +222,6 @@ class RITAModel(PreTrainedModel):
         self.embedding = nn.Embedding(config.vocab_size, config.d_model)
         self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_layers)])
         self.final_norm = nn.LayerNorm(config.d_model)
-        self.projector = nn.Linear(config.d_model, config.vocab_size, bias = False)
 
     def forward(
         self,
@@ -251,7 +251,78 @@ class RITAModel(PreTrainedModel):
             x = layer(x, attn_mask=attention_mask)
         x = self.final_norm(x) # N x L x D
 
-        logits = self.projector(x)
+        return BaseModelOutput(
+            hidden_states=x,
+        )
+
+    #Some common HF functions.
+    def get_input_embeddings(self):
+        return self.embedding
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embedding = new_embeddings
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class RITAModelForCausalLM(PreTrainedModel):
+    config_class = RITAConfig
+    base_model_prefix = "transformer"
+    is_parallelizable = False
+
+    def __init__(
+        self,
+        config
+    ):
+        super().__init__(config)
+        self.transformer = RITAModel(config)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+
+    def forward(
+        self,
+        input_ids=None,
+        past_key_values=None, # NOT USED
+        attention_mask=None,
+        token_type_ids=None, # NOT USED
+        position_ids=None, # NOT USED
+        head_mask=None, # NOT USED
+        inputs_embeds=None,
+        encoder_hidden_states=None, # NOT USED
+        encoder_attention_mask=None, # NOT USED
+        labels=None,
+        use_cache=None, # NOT USED
+        output_attentions=None, # NOT USED
+        output_hidden_states=None, # NOT USED
+        return_dict=None # NOT USED
+    ) -> torch.FloatTensor:
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        logits = self.lm_head(transformer_outputs.hidden_states)
         loss = None
         if labels is not None:
             # Shift so that tokens < n predict n
@@ -264,19 +335,150 @@ class RITAModel(PreTrainedModel):
         return CausalLMOutput(
             loss=loss,
             logits=logits,
-            hidden_states=x,
+            hidden_states=transformer_outputs.hidden_states,
         )
 
-
     #Some common HF functions.
     def get_input_embeddings(self):
-        return self.embedding
+        return self.transformer.embedding
 
     def set_input_embeddings(self, new_embeddings):
-        self.embedding = new_embeddings
+        self.transformer.embedding = new_embeddings
 
     def get_output_embeddings(self):
-        return self.projector
+        return self.lm_head
+
+    def set_output_embeddings(self, lm_head):
+        self.lm_head = lm_head
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class RITAModelForSequenceClassification(PreTrainedModel):
+    config_class = RITAConfig
+    base_model_prefix = "transformer"
+    is_parallelizable = False
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = RITAModel(config)
+        self.score = nn.Linear(config.d_model, self.num_labels, bias=False)
+
+    def forward(
+        self,
+        input_ids=None,
+        past_key_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-    def set_output_embeddings(self, new_projector):
-        self.projector = new_projector
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        assert (
+            self.config.pad_token_id is not None or batch_size == 1
+        ), "Cannot handle batch sizes > 1 if no padding token is defined."
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
+            else:
+                sequence_lengths = -1
+                logger.warning(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+
+        pooled_logits = logits[torch.arange(batch_size, device=self.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=pooled_logits,
+        )
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
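The refactor splits the previous single RITAModel (which owned the output projection) into a bare RITAModel backbone that returns a BaseModelOutput, plus two task heads: RITAModelForCausalLM (lm_head over the vocabulary) and RITAModelForSequenceClassification (score head pooled at the last non-padding token). A hedged usage sketch of the two heads through the Auto API; the repo id and the dummy inputs are assumptions, not part of the commit:

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForSequenceClassification

repo_id = "Seledorn/RITA_m_2"  # assumption: substitute the actual hub repo id
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)

# Dummy token ids, to avoid assuming anything about the tokenizer files in the repo.
input_ids = torch.randint(0, config.vocab_size, (1, 16))

# Causal LM head: RITAModel backbone under `.transformer`, vocabulary logits from `.lm_head`.
lm = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
lm_out = lm(input_ids=input_ids, labels=input_ids)
print(lm_out.logits.shape, lm_out.loss)    # (1, 16, vocab_size), shifted next-token loss

# Sequence-classification head: pools the logit at the last non-padding position,
# so config.pad_token_id must be set for batch sizes greater than 1.
clf = AutoModelForSequenceClassification.from_pretrained(repo_id, trust_remote_code=True, num_labels=2)
clf_out = clf(input_ids=input_ids, labels=torch.tensor([1]))
print(clf_out.logits.shape, clf_out.loss)  # (1, 2), cross-entropy loss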