vince62s committed
Commit
948f54b
1 Parent(s): 38b08d7

Upload 3 files

Files changed (3)
  1. config.json +4 -1
  2. modelling_xlm_roberta.py +132 -5
  3. pytorch_model.bin +1 -1
config.json CHANGED
@@ -22,5 +22,8 @@
   "output_past": true,
   "pad_token_id": 1,
   "type_vocab_size": 1,
-  "vocab_size": 250002
+  "vocab_size": 250002,
+  "layer_transformation": "softmax",
+  "layer_norm": false,
+  "dropout": 0.1
 }
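
As a hedged illustration (not part of the commit), the three new keys simply become attributes on the loaded config, which is how the updated `XLMRobertaForEstimation.__init__` below reads them; this sketch assumes the standard `transformers` behaviour of storing unknown config kwargs as attributes:

```python
# Illustrative only: extra config.json keys surface as plain attributes.
from transformers import XLMRobertaConfig

config = XLMRobertaConfig(
    vocab_size=250002,
    layer_transformation="softmax",
    layer_norm=False,
    dropout=0.1,
)
print(config.layer_transformation, config.layer_norm, config.dropout)
# softmax False 0.1
```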
modelling_xlm_roberta.py CHANGED
@@ -22,6 +22,7 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import Parameter, ParameterList
 
 from transformers.activations import ACT2FN, gelu
 from transformers.modeling_outputs import (
@@ -1344,6 +1345,117 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
         )
 
 
+class LayerwiseAttention(torch.nn.Module):
+    def __init__(
+        self,
+        num_hidden_layers: int,
+        layer_norm: bool = False,
+        layer_weights: Optional[List[int]] = None,
+        dropout: float = None,
+        layer_transformation: str = "softmax",
+    ) -> None:
+        super(LayerwiseAttention, self).__init__()
+        self.num_layers = num_hidden_layers + 1
+        self.layer_norm = layer_norm
+        self.dropout = dropout
+
+        self.transform_fn = torch.softmax
+        if layer_transformation == "sparsemax":
+            from entmax import sparsemax
+
+            self.transform_fn = sparsemax
+
+        if layer_weights is None:
+            layer_weights = [0.0] * self.num_layers
+        elif len(layer_weights) != self.num_layers:
+            raise Exception(
+                "Length of layer_weights {} differs \
+                from num_layers {}".format(
+                    layer_weights, self.num_layers
+                )
+            )
+        self.gam = Parameter(torch.FloatTensor([1.0]), requires_grad=True)
+        self.scalar_parameters = ParameterList(
+            [
+                Parameter(
+                    torch.FloatTensor([layer_weights[i]]),
+                    requires_grad=True,
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+
+
+
+        if self.dropout:
+            dropout_mask = torch.zeros(len(self.scalar_parameters))
+            dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(-1e20)
+            self.register_buffer("dropout_mask", dropout_mask)
+            self.register_buffer("dropout_fill", dropout_fill)
+
+    def forward(
+        self,
+        tensors: List[torch.Tensor],  # pylint: disable=arguments-differ
+        mask: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if len(tensors) != self.num_layers:
+            raise Exception(
+                "{} tensors were passed, but the module was initialized to \
+                mix {} tensors.".format(
+                    len(tensors), self.num_layers
+                )
+            )
+
+        def _layer_norm(tensor, broadcast_mask, mask):
+            tensor_masked = tensor * broadcast_mask
+            batch_size, _, input_dim = tensors[0].size()
+
+            # mean for each sentence
+            num_elements_not_masked = mask.sum(1) * input_dim
+            mean = tensor_masked.view(batch_size, -1).sum(1)
+            mean = (mean / num_elements_not_masked).view(batch_size, 1, 1)
+
+            variance = (((tensor_masked - mean) * broadcast_mask) ** 2).view(
+                batch_size, -1
+            ).sum(1) / num_elements_not_masked
+            normalized_tensor = (tensor - mean) / torch.sqrt(variance + 1e-12).view(
+                batch_size, 1, 1
+            )
+            return normalized_tensor
+
+        # BUG: Pytorch bug fix when Parameters are not well copied across GPUs
+        # https://github.com/pytorch/pytorch/issues/36035
+        if len([parameter for parameter in self.scalar_parameters]) != self.num_layers:
+            weights = torch.tensor(self.weights, device=tensors[0].device)
+            gamma = torch.tensor(self.gam, device=tensors[0].device)
+        else:
+            weights = torch.cat([parameter for parameter in self.scalar_parameters])
+            gamma = self.gam
+
+        if self.training and self.dropout:
+            weights = torch.where(
+                self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill
+            )
+
+        normed_weights = self.transform_fn(weights, dim=0)
+        normed_weights = torch.split(normed_weights, split_size_or_sections=1)
+
+        if not self.layer_norm:
+            pieces = []
+            for weight, tensor in zip(normed_weights, tensors):
+                pieces.append(weight * tensor)
+            return gamma * sum(pieces)
+
+        else:
+            mask_float = mask.float()
+            broadcast_mask = mask_float.unsqueeze(-1)
+
+            pieces = []
+            for weight, tensor in zip(normed_weights, tensors):
+                pieces.append(weight * _layer_norm(tensor, broadcast_mask, mask_float))
+            return gamma * sum(pieces)
+
+
 class FeedForward(nn.Module):
     """Feed Forward Neural Network.
 
@@ -1364,7 +1476,7 @@ class FeedForward(nn.Module):
         hidden_sizes: List[int] = [3072, 1024],
         activations: str = "Tanh",
         final_activation: Optional[str] = None,
-        dropout: float = 0.1,
+        dropout: float = 0.0,
     ) -> None:
         super().__init__()
         modules = []
@@ -1406,7 +1518,13 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
         super().__init__(config)
 
         self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
-        print("toto")
+        self.layerwise_attention = LayerwiseAttention(
+            layer_transformation=config.layer_transformation,
+            num_hidden_layers=config.num_hidden_layers,
+            dropout=config.dropout,
+            layer_norm=config.layer_norm
+        )
+
         self.estimator = FeedForward()
 
         # Initialize weights and apply final processing
@@ -1431,7 +1549,8 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
             num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
             `input_ids` above)
         """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        return_dict = False
+        output_hidden_states = True
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
 
         flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1456,10 +1575,18 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
             return_dict=return_dict,
         )
 
-        CLS_tok = outputs[0][:, 0, :] # for some reason at sentence level we take the first token score cf Comet
+        if self.layerwise_attention:
+            embeddings = self.layerwise_attention(
+                outputs[2], attention_mask
+            )
+        else:
+            embeddings = outputs[0]
+
+        CLS_tok = embeddings[:, 0, :] # for some reason at sentence level we take the first token score cf Comet
+
         logits = self.estimator(CLS_tok)
         reshaped_logits = logits #.view(-1, num_choices)
-
+
         loss = None
         if labels is not None:
             # move labels to correct device to enable model parallelism
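
For orientation, here is a minimal sketch of the new `LayerwiseAttention` module on its own; the toy shapes and the import path are assumptions, not code from this repository:

```python
# Illustrative only: toy shapes; assumes modelling_xlm_roberta.py is importable.
import torch
from modelling_xlm_roberta import LayerwiseAttention  # hypothetical import path

num_hidden_layers = 4              # the module mixes num_hidden_layers + 1 hidden states
batch, seq_len, hidden = 2, 6, 8

# One tensor per hidden state, mirroring what outputs[2] of the encoder returns.
hidden_states = [torch.randn(batch, seq_len, hidden) for _ in range(num_hidden_layers + 1)]
attention_mask = torch.ones(batch, seq_len)

mixer = LayerwiseAttention(
    num_hidden_layers=num_hidden_layers,
    layer_transformation="softmax",
    layer_norm=False,
    dropout=0.1,
)
mixed = mixer(hidden_states, attention_mask)  # gamma-scaled, softmax-weighted sum of the layers
print(mixed.shape)                            # torch.Size([2, 6, 8])
```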
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9851dca7395338e75587c6e869542cd7cc23159c9b3e4e0e65e7303a672aeb5
+oid sha256:ce50d1ef923a3464e6f8909eae487ec378da304a1a0ad489186b2ae51b9fede0
 size 1130454122
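
Relating the forward changes above to the encoder outputs: with `return_dict=False` and `output_hidden_states=True`, `outputs[2]` is the tuple of per-layer hidden states fed to `LayerwiseAttention`, while `outputs[0]` stays the last hidden state used when the mixer is disabled. A hedged sketch, using the public `xlm-roberta-base` checkpoint as a stand-in for this repository's weights:

```python
# Sketch only: xlm-roberta-base stands in for this repo's checkpoint.
import torch
from transformers import AutoTokenizer, XLMRobertaModel

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
enc = XLMRobertaModel.from_pretrained("xlm-roberta-base")

batch = tok(["a short test sentence"], return_tensors="pt")
with torch.no_grad():
    outputs = enc(**batch, output_hidden_states=True, return_dict=False)

last_hidden = outputs[0]    # (batch, seq_len, hidden): used when layerwise attention is off
hidden_states = outputs[2]  # tuple of num_hidden_layers + 1 tensors: input to LayerwiseAttention
print(len(hidden_states), last_hidden.shape)  # 13 for the base model
```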