vince62s committed
Commit
948f54b
1 Parent(s): 38b08d7

Upload 3 files

Files changed (3)
  1. config.json +4 -1
  2. modelling_xlm_roberta.py +132 -5
  3. pytorch_model.bin +1 -1
config.json CHANGED
@@ -22,5 +22,8 @@
   "output_past": true,
   "pad_token_id": 1,
   "type_vocab_size": 1,
-  "vocab_size": 250002
+  "vocab_size": 250002,
+  "layer_transformation": "softmax",
+  "layer_norm": false,
+  "dropout": 0.1
 }
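
As a hedged illustration (not part of the commit), the three new keys simply become attributes on the loaded config, which is how the updated `XLMRobertaForEstimation.__init__` below reads them; this sketch assumes the standard `transformers` behaviour of storing unknown config kwargs as attributes:

```python
# Illustrative only: extra config.json keys surface as plain attributes.
from transformers import XLMRobertaConfig

config = XLMRobertaConfig(
    vocab_size=250002,
    layer_transformation="softmax",
    layer_norm=False,
    dropout=0.1,
)
print(config.layer_transformation, config.layer_norm, config.dropout)
# softmax False 0.1
```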
modelling_xlm_roberta.py CHANGED
@@ -22,6 +22,7 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import Parameter, ParameterList
 
 from transformers.activations import ACT2FN, gelu
 from transformers.modeling_outputs import (
@@ -1344,6 +1345,117 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
         )
 
 
+class LayerwiseAttention(torch.nn.Module):
+    def __init__(
+        self,
+        num_hidden_layers: int,
+        layer_norm: bool = False,
+        layer_weights: Optional[List[int]] = None,
+        dropout: float = None,
+        layer_transformation: str = "softmax",
+    ) -> None:
+        super(LayerwiseAttention, self).__init__()
+        self.num_layers = num_hidden_layers + 1
+        self.layer_norm = layer_norm
+        self.dropout = dropout
+
+        self.transform_fn = torch.softmax
+        if layer_transformation == "sparsemax":
+            from entmax import sparsemax
+
+            self.transform_fn = sparsemax
+
+        if layer_weights is None:
+            layer_weights = [0.0] * self.num_layers
+        elif len(layer_weights) != self.num_layers:
+            raise Exception(
+                "Length of layer_weights {} differs \
+                from num_layers {}".format(
+                    layer_weights, self.num_layers
+                )
+            )
+        self.gam = Parameter(torch.FloatTensor([1.0]), requires_grad=True)
+        self.scalar_parameters = ParameterList(
+            [
+                Parameter(
+                    torch.FloatTensor([layer_weights[i]]),
+                    requires_grad=True,
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+
+
+
+        if self.dropout:
+            dropout_mask = torch.zeros(len(self.scalar_parameters))
+            dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(-1e20)
+            self.register_buffer("dropout_mask", dropout_mask)
+            self.register_buffer("dropout_fill", dropout_fill)
+
+    def forward(
+        self,
+        tensors: List[torch.Tensor],  # pylint: disable=arguments-differ
+        mask: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if len(tensors) != self.num_layers:
+            raise Exception(
+                "{} tensors were passed, but the module was initialized to \
+                mix {} tensors.".format(
+                    len(tensors), self.num_layers
+                )
+            )
+
+        def _layer_norm(tensor, broadcast_mask, mask):
+            tensor_masked = tensor * broadcast_mask
+            batch_size, _, input_dim = tensors[0].size()
+
+            # mean for each sentence
+            num_elements_not_masked = mask.sum(1) * input_dim
+            mean = tensor_masked.view(batch_size, -1).sum(1)
+            mean = (mean / num_elements_not_masked).view(batch_size, 1, 1)
+
+            variance = (((tensor_masked - mean) * broadcast_mask) ** 2).view(
+                batch_size, -1
+            ).sum(1) / num_elements_not_masked
+            normalized_tensor = (tensor - mean) / torch.sqrt(variance + 1e-12).view(
+                batch_size, 1, 1
+            )
+            return normalized_tensor
+
+        # BUG: Pytorch bug fix when Parameters are not well copied across GPUs
+        # https://github.com/pytorch/pytorch/issues/36035
+        if len([parameter for parameter in self.scalar_parameters]) != self.num_layers:
+            weights = torch.tensor(self.weights, device=tensors[0].device)
+            gamma = torch.tensor(self.gam, device=tensors[0].device)
+        else:
+            weights = torch.cat([parameter for parameter in self.scalar_parameters])
+            gamma = self.gam
+
+        if self.training and self.dropout:
+            weights = torch.where(
+                self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill
+            )
+
+        normed_weights = self.transform_fn(weights, dim=0)
+        normed_weights = torch.split(normed_weights, split_size_or_sections=1)
+
+        if not self.layer_norm:
+            pieces = []
+            for weight, tensor in zip(normed_weights, tensors):
+                pieces.append(weight * tensor)
+            return gamma * sum(pieces)
+
+        else:
+            mask_float = mask.float()
+            broadcast_mask = mask_float.unsqueeze(-1)
+
+            pieces = []
+            for weight, tensor in zip(normed_weights, tensors):
+                pieces.append(weight * _layer_norm(tensor, broadcast_mask, mask_float))
+            return gamma * sum(pieces)
+
+
 class FeedForward(nn.Module):
     """Feed Forward Neural Network.
 
@@ -1364,7 +1476,7 @@ class FeedForward(nn.Module):
         hidden_sizes: List[int] = [3072, 1024],
         activations: str = "Tanh",
         final_activation: Optional[str] = None,
-        dropout: float = 0.1,
+        dropout: float = 0.0,
     ) -> None:
         super().__init__()
         modules = []
@@ -1406,7 +1518,13 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
         super().__init__(config)
 
         self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
-        print("toto")
+        self.layerwise_attention = LayerwiseAttention(
+            layer_transformation=config.layer_transformation,
+            num_hidden_layers=config.num_hidden_layers,
+            dropout=config.dropout,
+            layer_norm=config.layer_norm
+        )
+
         self.estimator = FeedForward()
 
         # Initialize weights and apply final processing
@@ -1431,7 +1549,8 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
             num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
             `input_ids` above)
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = False
+        output_hidden_states = True
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1456,10 +1575,18 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
             return_dict=return_dict,
         )
 
-        CLS_tok = outputs[0][:, 0, :] # for some reason at sentence level we take the first token score cf Comet
+        if self.layerwise_attention:
+            embeddings = self.layerwise_attention(
+                outputs[2], attention_mask
+            )
+        else:
+            embeddings = outputs[0]
+
+        CLS_tok = embeddings[:, 0, :] # for some reason at sentence level we take the first token score cf Comet
+
         logits = self.estimator(CLS_tok)
         reshaped_logits = logits #.view(-1, num_choices)
-
+
         loss = None
         if labels is not None:
             # move labels to correct device to enable model parallelism
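
For orientation, here is a minimal sketch of the new `LayerwiseAttention` module on its own; the toy shapes and the import path are assumptions, not code from this repository:

```python
# Illustrative only: toy shapes; assumes modelling_xlm_roberta.py is importable.
import torch
from modelling_xlm_roberta import LayerwiseAttention  # hypothetical import path

num_hidden_layers = 4              # the module mixes num_hidden_layers + 1 hidden states
batch, seq_len, hidden = 2, 6, 8

# One tensor per hidden state, mirroring what outputs[2] of the encoder returns.
hidden_states = [torch.randn(batch, seq_len, hidden) for _ in range(num_hidden_layers + 1)]
attention_mask = torch.ones(batch, seq_len)

mixer = LayerwiseAttention(
    num_hidden_layers=num_hidden_layers,
    layer_transformation="softmax",
    layer_norm=False,
    dropout=0.1,
)
mixed = mixer(hidden_states, attention_mask)  # gamma-scaled, softmax-weighted sum of the layers
print(mixed.shape)                            # torch.Size([2, 6, 8])
```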
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9851dca7395338e75587c6e869542cd7cc23159c9b3e4e0e65e7303a672aeb5
+oid sha256:ce50d1ef923a3464e6f8909eae487ec378da304a1a0ad489186b2ae51b9fede0
 size 1130454122
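
Relating the forward changes above to the encoder outputs: with `return_dict=False` and `output_hidden_states=True`, `outputs[2]` is the tuple of per-layer hidden states fed to `LayerwiseAttention`, while `outputs[0]` stays the last hidden state used when the mixer is disabled. A hedged sketch, using the public `xlm-roberta-base` checkpoint as a stand-in for this repository's weights:

```python
# Sketch only: xlm-roberta-base stands in for this repo's checkpoint.
import torch
from transformers import AutoTokenizer, XLMRobertaModel

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
enc = XLMRobertaModel.from_pretrained("xlm-roberta-base")

batch = tok(["a short test sentence"], return_tensors="pt")
with torch.no_grad():
    outputs = enc(**batch, output_hidden_states=True, return_dict=False)

last_hidden = outputs[0]    # (batch, seq_len, hidden): used when layerwise attention is off
hidden_states = outputs[2]  # tuple of num_hidden_layers + 1 tensors: input to LayerwiseAttention
print(len(hidden_states), last_hidden.shape)  # 13 for the base model
```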