# coding=utf-8
# Copyright 2018 T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model T5."""


import os
import re
import warnings
from shutil import copyfile
from typing import List, Optional

from .tokenization_utils import BatchEncoding, PreTrainedTokenizer
from .utils import logging


logger = logging.get_logger(__name__)

####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to file names for serializing Tokenizer instances
####################################################
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

####################################################
# Mapping from the keyword arguments names of Tokenizer `__init__`
# to pretrained vocabulary URL for all the model shortcut names.
####################################################
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "t5-small": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-base": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-large": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-3b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
        "t5-11b": "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
    }
}

####################################################
# Mapping from model shortcut names to max length of inputs
####################################################
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "t5-small": 512,
    "t5-base": 512,
    "t5-large": 512,
    "t5-3b": 512,
    "t5-11b": 512,
}
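
# Illustrative sketch (not part of the original module): the two mappings above are consumed by
# ``from_pretrained``, which resolves a shortcut name to its vocabulary URL and sets
# ``model_max_length`` from ``PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES``. Assuming network access:
#
#   >>> from transformers import T5Tokenizer
#   >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
#   >>> tokenizer.model_max_length
#   512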


class T5Tokenizer(PreTrainedTokenizer):
    """
    Constructs a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
    should refer to the superclass for more information regarding methods.

    Args:
        vocab_file (:obj:`string`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (:obj:`string`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`string`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (:obj:`string`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to :obj:`100`):
            Number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are accessible
            as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed from the
            end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary, like in T5
            preprocessing, see
            https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117).
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        **kwargs
    ):
        # Add extra_ids to the special token list
        if extra_ids > 0:
            if additional_special_tokens is None:
                additional_special_tokens = []
            additional_special_tokens.extend(["<extra_id_{}>".format(i) for i in range(extra_ids)])

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use T5Tokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size() + self._extra_ids

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab
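
    # Usage sketch (illustrative, not part of the original class), assuming a local SentencePiece
    # model file ``spiece.model`` is available: the ``extra_ids`` sentinels occupy the top of the
    # id space, so ``vocab_size`` is the SentencePiece piece count plus ``extra_ids``.
    #
    #   >>> tokenizer = T5Tokenizer("spiece.model")
    #   >>> tokenizer.vocab_size == tokenizer.sp_model.get_piece_size() + 100
    #   True
    #   >>> tokenizer._convert_token_to_id("<extra_id_0>") == tokenizer.vocab_size - 1
    #   True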
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
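
    # Example of the mask layout produced above (illustrative, assuming a tokenizer instance as
    # sketched earlier): the appended eos positions are marked with 1.
    #
    #   >>> tokenizer.get_special_tokens_mask([10, 11, 12])
    #   [0, 0, 0, 1]
    #   >>> tokenizer.get_special_tokens_mask([10, 11, 12], [20, 21])
    #   [0, 0, 0, 1, 0, 0, 1]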
    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. For some T5 tasks, ``model.config.prefix`` is specified and must be prepended
        before tokenization. A sequence has the following format:

        - single sequence: ``X </s>``
        - pair of sequences: ``A </s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1
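
    # Example of the formats described above (illustrative), assuming ``eos_token_id`` is 1, as in
    # the released T5 SentencePiece vocabularies:
    #
    #   >>> tokenizer.build_inputs_with_special_tokens([10, 11])
    #   [10, 11, 1]
    #   >>> tokenizer.build_inputs_with_special_tokens([10, 11], [20, 21])
    #   [10, 11, 1, 20, 21, 1]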
    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning(
                "You need to install SentencePiece to use T5Tokenizer: "
                "https://github.com/google/sentencepiece "
                "pip install sentencepiece"
            )
            raise
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text, sample=False):
        """Take as input a string and return a list of strings (tokens) for words/sub-words."""
        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        return pieces

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token.startswith("<extra_id_"):
            match = re.match(r"<extra_id_(\d+)>", token)
            num = int(match.group(1))
            return self.vocab_size - num - 1
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index < self.sp_model.get_piece_size():
            token = self.sp_model.IdToPiece(index)
        else:
            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string."""
        out_string = self.sp_model.decode_pieces(tokens)
        return out_string
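
    # Worked example of the sentinel indexing above (illustrative): for the released T5 models the
    # SentencePiece vocabulary has 32000 pieces, so with the default 100 extra ids ``vocab_size``
    # is 32100 and the sentinels map to the last 100 ids in reverse order.
    #
    #   >>> tokenizer._convert_token_to_id("<extra_id_7>")   # 32100 - 7 - 1
    #   32092
    #   >>> tokenizer._convert_id_to_token(32092)            # "<extra_id_{}>".format(32100 - 1 - 32092)
    #   '<extra_id_7>'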
    def save_vocabulary(self, save_directory):
        """Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory."""
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
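
    # Save/reload sketch (illustrative): ``save_pretrained`` calls ``save_vocabulary`` under the
    # hood and copies ``spiece.model`` into the target directory.
    #
    #   >>> import tempfile
    #   >>> save_dir = tempfile.mkdtemp()
    #   >>> tokenizer.save_pretrained(save_dir)
    #   >>> reloaded = T5Tokenizer.from_pretrained(save_dir)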
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        tgt_texts: Optional[List[str]] = None,
        max_length: Optional[int] = None,
        max_target_length: Optional[int] = None,
        padding: str = "longest",
        return_tensors: str = None,
        truncation: bool = True,
        **kwargs,
    ) -> BatchEncoding:
        r"""
        Prepare a batch that can be passed directly to an instance of :class:`~transformers.T5Model`.

        Args:
            src_texts (:obj:`List[str]`):
                List of documents to summarize or source language texts.
            tgt_texts (:obj:`List[str]`, `optional`):
                List of summaries or target language texts.
            max_length (:obj:`int`, `optional`):
                Controls the maximum length for encoder inputs (documents to summarize or source language texts). If
                left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
            max_target_length (:obj:`int`, `optional`):
                Controls the maximum length of decoder inputs (target language texts or summaries). If left unset or
                set to :obj:`None`, this will use the max_length value.
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`'longest'`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
                  different lengths).
            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
                Activates and controls truncation. Accepts the following values:

                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
                  provided. This will truncate token by token, removing a token from the longest sequence in the pair
                  if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
                  to the maximum acceptable input length for the model if that argument is not provided. This will
                  only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is
                  provided.
                * :obj:`False` or :obj:`'do_not_truncate'`: No truncation (i.e., can output a batch with sequence
                  lengths greater than the model maximum admissible input size).
            **kwargs:
                Additional keyword arguments passed along to :obj:`self.__call__`.

        Returns:
            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:

            - **input_ids** -- List of token ids to be fed to the encoder.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
            - **labels** -- List of token ids for tgt_texts.

            The full set of keys ``[input_ids, attention_mask, labels]`` will only be returned if tgt_texts is
            passed. Otherwise, input_ids and attention_mask will be the only keys.
        """
        if max_length is None:
            max_length = self.max_len
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            **kwargs,
        )
        if tgt_texts is None:
            return model_inputs
        # Process tgt_texts
        if max_target_length is None:
            max_target_length = max_length
        labels_and_decoder_mask = self(
            tgt_texts,
            add_special_tokens=True,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_target_length,
            truncation=truncation,
            **kwargs,
        )
        model_inputs["labels"] = labels_and_decoder_mask["input_ids"]
        return model_inputs
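

# End-to-end sketch (illustrative, not part of the original module), assuming the "t5-small"
# checkpoint can be downloaded: ``prepare_seq2seq_batch`` tokenizes sources and targets in one
# call and exposes the target ids under ``labels``.
#
#   >>> from transformers import T5Tokenizer
#   >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
#   >>> batch = tokenizer.prepare_seq2seq_batch(
#   ...     src_texts=["translate English to German: The house is wonderful."],
#   ...     tgt_texts=["Das Haus ist wunderbar."],
#   ...     return_tensors="pt",
#   ... )
#   >>> sorted(batch.keys())
#   ['attention_mask', 'input_ids', 'labels']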