Upload 3 files
- config.json +4 -1
- modelling_xlm_roberta.py +132 -5
- pytorch_model.bin +1 -1
config.json CHANGED
@@ -22,5 +22,8 @@
   "output_past": true,
   "pad_token_id": 1,
   "type_vocab_size": 1,
-  "vocab_size": 250002
+  "vocab_size": 250002,
+  "layer_transformation": "softmax",
+  "layer_norm": false,
+  "dropout": 0.1
 }
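The three new keys (`layer_transformation`, `layer_norm`, `dropout`) mirror the constructor arguments of the LayerwiseAttention module introduced in modelling_xlm_roberta.py below. A minimal sketch of how the values flow from the config into that module; the direct json loading and the import path are assumptions for illustration, not part of the commit:

import json

# Hypothetical import: LayerwiseAttention is defined in the modelling_xlm_roberta.py
# added in this commit; how it is packaged/imported is an assumption here.
from modelling_xlm_roberta import LayerwiseAttention

# Read the updated config.json directly; in practice the model __init__ receives
# the same values through the transformers config object (config.dropout, ...).
with open("config.json") as f:
    cfg = json.load(f)

layerwise_attention = LayerwiseAttention(
    layer_transformation=cfg["layer_transformation"],  # "softmax"
    num_hidden_layers=cfg["num_hidden_layers"],
    dropout=cfg["dropout"],                             # 0.1
    layer_norm=cfg["layer_norm"],                       # false
)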
modelling_xlm_roberta.py CHANGED
@@ -22,6 +22,7 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import Parameter, ParameterList

 from transformers.activations import ACT2FN, gelu
 from transformers.modeling_outputs import (
@@ -1344,6 +1345,117 @@ class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel):
         )


+class LayerwiseAttention(torch.nn.Module):
+    def __init__(
+        self,
+        num_hidden_layers: int,
+        layer_norm: bool = False,
+        layer_weights: Optional[List[int]] = None,
+        dropout: float = None,
+        layer_transformation: str = "softmax",
+    ) -> None:
+        super(LayerwiseAttention, self).__init__()
+        self.num_layers = num_hidden_layers + 1
+        self.layer_norm = layer_norm
+        self.dropout = dropout
+
+        self.transform_fn = torch.softmax
+        if layer_transformation == "sparsemax":
+            from entmax import sparsemax
+
+            self.transform_fn = sparsemax
+
+        if layer_weights is None:
+            layer_weights = [0.0] * self.num_layers
+        elif len(layer_weights) != self.num_layers:
+            raise Exception(
+                "Length of layer_weights {} differs \
+                from num_layers {}".format(
+                    layer_weights, self.num_layers
+                )
+            )
+        self.gam = Parameter(torch.FloatTensor([1.0]), requires_grad=True)
+        self.scalar_parameters = ParameterList(
+            [
+                Parameter(
+                    torch.FloatTensor([layer_weights[i]]),
+                    requires_grad=True,
+                )
+                for i in range(self.num_layers)
+            ]
+        )
+
+
+
+        if self.dropout:
+            dropout_mask = torch.zeros(len(self.scalar_parameters))
+            dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(-1e20)
+            self.register_buffer("dropout_mask", dropout_mask)
+            self.register_buffer("dropout_fill", dropout_fill)
+
+    def forward(
+        self,
+        tensors: List[torch.Tensor],  # pylint: disable=arguments-differ
+        mask: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if len(tensors) != self.num_layers:
+            raise Exception(
+                "{} tensors were passed, but the module was initialized to \
+                mix {} tensors.".format(
+                    len(tensors), self.num_layers
+                )
+            )
+
+        def _layer_norm(tensor, broadcast_mask, mask):
+            tensor_masked = tensor * broadcast_mask
+            batch_size, _, input_dim = tensors[0].size()
+
+            # mean for each sentence
+            num_elements_not_masked = mask.sum(1) * input_dim
+            mean = tensor_masked.view(batch_size, -1).sum(1)
+            mean = (mean / num_elements_not_masked).view(batch_size, 1, 1)
+
+            variance = (((tensor_masked - mean) * broadcast_mask) ** 2).view(
+                batch_size, -1
+            ).sum(1) / num_elements_not_masked
+            normalized_tensor = (tensor - mean) / torch.sqrt(variance + 1e-12).view(
+                batch_size, 1, 1
+            )
+            return normalized_tensor
+
+        # BUG: Pytorch bug fix when Parameters are not well copied across GPUs
+        # https://github.com/pytorch/pytorch/issues/36035
+        if len([parameter for parameter in self.scalar_parameters]) != self.num_layers:
+            weights = torch.tensor(self.weights, device=tensors[0].device)
+            gamma = torch.tensor(self.gam, device=tensors[0].device)
+        else:
+            weights = torch.cat([parameter for parameter in self.scalar_parameters])
+            gamma = self.gam
+
+        if self.training and self.dropout:
+            weights = torch.where(
+                self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill
+            )
+
+        normed_weights = self.transform_fn(weights, dim=0)
+        normed_weights = torch.split(normed_weights, split_size_or_sections=1)
+
+        if not self.layer_norm:
+            pieces = []
+            for weight, tensor in zip(normed_weights, tensors):
+                pieces.append(weight * tensor)
+            return gamma * sum(pieces)
+
+        else:
+            mask_float = mask.float()
+            broadcast_mask = mask_float.unsqueeze(-1)
+
+            pieces = []
+            for weight, tensor in zip(normed_weights, tensors):
+                pieces.append(weight * _layer_norm(tensor, broadcast_mask, mask_float))
+            return gamma * sum(pieces)
+
+
 class FeedForward(nn.Module):
     """Feed Forward Neural Network.

@@ -1364,7 +1476,7 @@ class FeedForward(nn.Module):
         hidden_sizes: List[int] = [3072, 1024],
         activations: str = "Tanh",
         final_activation: Optional[str] = None,
-        dropout: float = 0.
+        dropout: float = 0.0,
     ) -> None:
         super().__init__()
         modules = []
@@ -1406,7 +1518,13 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
         super().__init__(config)

         self.roberta = XLMRobertaModel(config, add_pooling_layer=False)
-
+        self.layerwise_attention = LayerwiseAttention(
+            layer_transformation=config.layer_transformation,
+            num_hidden_layers=config.num_hidden_layers,
+            dropout=config.dropout,
+            layer_norm=config.layer_norm
+        )
+
         self.estimator = FeedForward()

         # Initialize weights and apply final processing
@@ -1431,7 +1549,8 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
             num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
             `input_ids` above)
         """
-        return_dict =
+        return_dict = False
+        output_hidden_states = True
         num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

         flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
@@ -1456,10 +1575,18 @@ class XLMRobertaForEstimation(XLMRobertaPreTrainedModel):
             return_dict=return_dict,
         )

-
+        if self.layerwise_attention:
+            embeddings = self.layerwise_attention(
+                outputs[2], attention_mask
+            )
+        else:
+            embeddings = outputs[0]
+
+        CLS_tok = embeddings[:, 0, :]  # for some reason at sentence level we take the first token score cf Comet
+
         logits = self.estimator(CLS_tok)
         reshaped_logits = logits #.view(-1, num_choices)
-
+
         loss = None
         if labels is not None:
             # move labels to correct device to enable model parallelism
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:ce50d1ef923a3464e6f8909eae487ec378da304a1a0ad489186b2ae51b9fede0
 size 1130454122
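The weights are tracked with Git LFS, so only the pointer file changes here: oid is the SHA-256 of the new binary and size is its byte count. A small sketch for checking a downloaded copy against this pointer; the local file path is an assumption for illustration:

import hashlib
import os

path = "pytorch_model.bin"
expected_oid = "ce50d1ef923a3464e6f8909eae487ec378da304a1a0ad489186b2ae51b9fede0"
expected_size = 1130454122

# Stream the file in 1 MiB chunks and hash it.
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha256.hexdigest() == expected_oid, "sha256 mismatch"
print("pytorch_model.bin matches the LFS pointer")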