# -*- coding: utf-8 -*-
# @Time    : 2021/12/30 8:35 PM
# @Author  : JianingWang
# @File    : mlm.py
import logging
from typing import Union, Tuple, Optional

import torch
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import MaskedLMOutput
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel, BertOnlyMLMHead
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaLMHead
from transformers.models.albert.modeling_albert import AlbertPreTrainedModel, AlbertModel, AlbertMLMHead
from transformers.models.roformer.modeling_roformer import RoFormerPreTrainedModel, RoFormerModel, RoFormerOnlyMLMHead

logger = logging.getLogger(__name__)
""" | |
Function: Use MLM to pre-train BERT | |
Notes: | |
- The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
""" | |


class BertForMaskedLM(BertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        kwargs (`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,                  # ()
            logits=prediction_scores,             # (batch_size, seq_len, vocab_size)
            hidden_states=outputs.hidden_states,  # (batch_size, seq_len, hidden_size)
            attentions=outputs.attentions,
        )
""" | |
Function: Use MLM to pre-train RoBERTa | |
Notes: | |
- The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
""" | |
class RobertaForMaskedLM(RobertaPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.lm_head = RobertaLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        kwargs (`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,                  # ()
            logits=prediction_scores,             # (batch_size, seq_len, vocab_size)
            hidden_states=outputs.hidden_states,  # (batch_size, seq_len, hidden_size)
            attentions=outputs.attentions,
        )
""" | |
Function: Use MLM to pre-train ALBERT | |
Notes: | |
- The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
""" | |
class AlbertForMaskedLM(AlbertPreTrainedModel):

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.albert = AlbertModel(config, add_pooling_layer=False)
        self.predictions = AlbertMLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> import torch
        >>> from transformers import AlbertTokenizer, AlbertForMaskedLM

        >>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
        >>> model = AlbertForMaskedLM.from_pretrained("albert-base-v2")

        >>> # add mask_token
        >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt")
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> # retrieve index of [MASK]
        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
        >>> tokenizer.decode(predicted_token_id)
        'france'
        ```

        ```python
        >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

        >>> outputs = model(**inputs, labels=labels)
        >>> round(outputs.loss.item(), 2)
        0.81
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.albert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_outputs = outputs[0]
        prediction_scores = self.predictions(sequence_outputs)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
""" | |
Function: Use MLM to pre-train RoFormer | |
Notes: | |
- The label of non-masked token is -100, which can be used for cross-entropy function (only calculate loss at not -100) | |
""" | |
class RoFormerForMaskedLM(RoFormerPreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        if config.is_decoder:
            logger.warning(
                "If you want to use `RoFormerForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )
        self.roformer = RoFormerModel(config)
        self.cls = RoFormerOnlyMLMHead(config)
        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.Tensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.roformer(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


if __name__ == "__main__":
    from transformers.models.bert.tokenization_bert import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    input_text = "Today is a nice day, I will [MASK] to play [MASK] with my friends."
    inputs = tokenizer(input_text, return_tensors="pt")
    masked_positions = inputs["input_ids"] == tokenizer.mask_token_id
    print("inputs=", inputs)
    """
    inputs= {'input_ids': tensor([[ 101, 2651, 2003, 1037, 3835, 2154, 1010, 1045, 2097,  103, 2000, 2377,
              103, 2007, 2026, 2814, 1012,  102]]),
             'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
             'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
    """
    outputs = model(**inputs)
    masked_results = outputs.logits.argmax(-1)[masked_positions]
    masked_results = tokenizer.convert_ids_to_tokens(masked_results)
    print("masked_results=", masked_results)
    """
    masked_results= ['have', 'football']
    """