| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ Base classes common to both the slow and the fast tokenization classes: |
| | PreTrainedTokenizerBase (hosts all the user-facing encoding methods), |
| | SpecialTokensMixin (hosts the special tokens logic) and |
| | BatchEncoding (wraps the dictionary of outputs with special methods for the fast tokenizers) |
| | """ |
| |
|
| | import copy |
| | import json |
| | import logging |
| | import os |
| | import warnings |
| | from collections import UserDict |
| | from enum import Enum |
| | from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union |
| |
|
| | import numpy as np |
| | from tokenizers import AddedToken |
| | from tokenizers import Encoding as EncodingFast |
| |
|
| | from .file_utils import ( |
| | add_end_docstrings, |
| | cached_path, |
| | hf_bucket_url, |
| | is_remote_url, |
| | is_tf_available, |
| | is_torch_available, |
| | torch_required, |
| | ) |
| |
|
| |
|
| | if is_tf_available(): |
| | import tensorflow as tf |
| | if is_torch_available(): |
| | import torch |
| |
|
| |
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Very large integer constants.
# NOTE(review): not referenced in this section; presumably used elsewhere as
# "effectively unlimited" length sentinels -- confirm against callers.
VERY_LARGE_INTEGER = int(1e30)
LARGE_INTEGER = int(1e20)

# Type aliases named after the kinds of tokenizer input they describe.
TextInput = str
PreTokenizedInput = List[str]
EncodedInput = List[int]
# Pair variants: a 2-tuple of the corresponding single-input alias.
TextInputPair = Tuple[TextInput, TextInput]
PreTokenizedInputPair = Tuple[PreTokenizedInput, PreTokenizedInput]
EncodedInputPair = Tuple[EncodedInput, EncodedInput]

# File names of tokenizer-related JSON files.
# NOTE(review): presumably the names used when saving/loading a tokenizer -- confirm.
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# NOTE(review): presumably the single-file serialization of fast tokenizers -- confirm.
FULL_TOKENIZER_FILE = "tokenizer.json"
| |
|
| |
|
class ExplicitEnum(Enum):
    """Enum base class whose failed value lookups list the accepted values."""

    @classmethod
    def _missing_(cls, value):
        """Raise a ``ValueError`` naming ``value``, the enum class and its valid values.

        ``Enum`` calls this hook when ``cls(value)`` matches no member.
        """
        accepted_values = str(list(cls._value2member_map_.keys()))
        message = "%r is not a valid %s, please select one of %s" % (value, cls.__name__, accepted_values)
        raise ValueError(message)
| |
|
| |
|
class TruncationStrategy(ExplicitEnum):
    """Names of the supported truncation strategies.

    Members can be looked up from their string value, e.g.
    ``TruncationStrategy("only_first")``; unknown values raise the explicit
    ``ValueError`` provided by :class:`ExplicitEnum`.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"
| |
|
| |
|
class PaddingStrategy(ExplicitEnum):
    """Names of the supported padding strategies.

    Members can be looked up from their string value, e.g.
    ``PaddingStrategy("longest")``; unknown values raise the explicit
    ``ValueError`` provided by :class:`ExplicitEnum`.
    """

    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"
| |
|
| |
|
class TensorType(ExplicitEnum):
    """Short codes of the tensor frameworks outputs can be converted to.

    Members can be looked up from their string value, e.g. ``TensorType("pt")``;
    unknown values raise the explicit ``ValueError`` provided by
    :class:`ExplicitEnum`.
    """

    PYTORCH = "pt"
    TENSORFLOW = "tf"
    NUMPY = "np"
| |
|
| |
|
class CharSpan(NamedTuple):
    """Half-open range of characters in the original string.

    Args:
        start: index of the first character of the span in the original string
        end: index of the character just after the last character of the span
    """

    start: int
    end: int
| |
|
| |
|
class TokenSpan(NamedTuple):
    """Half-open range of tokens in an encoded string (list of tokens).

    Args:
        start: index of the first token of the span
        end: index of the token just after the last token of the span
    """

    start: int
    end: int
| |
|
| |
|
class BatchEncoding(UserDict):
    """Hold the output of the encode and batch_encode methods (tokens, attention_masks, etc).

    This class is derived from a Python dictionary and can be used as a dictionary.
    In addition, it exposes utility methods to map from word/char space to token space.

    Args:
        data (:obj:`dict`):
            Dictionary of lists/arrays returned by the encode/batch_encode methods
            ('input_ids', 'attention_mask'...)
        encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`):
            If the tokenizer is a fast tokenizer which outputs additional information like the mapping
            from word/char space to token space, the `EncodingFast` instance or list of instances
            (for batches) holds this information.
        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`, defaults to :obj:`None`):
            You can give a tensor_type here to convert the lists of integers into PyTorch/TF/Numpy
            tensors at initialization.
        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Set to True to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
    """

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
        tensor_type: Union[None, str, TensorType] = None,
        prepend_batch_axis: bool = False,
    ):
        super().__init__(data)

        # A single backend encoding is stored as a one-element list so that
        # integer indexing always goes through a sequence.
        if isinstance(encoding, EncodingFast):
            encoding = [encoding]
        self._encodings = encoding

        self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

    @property
    def is_fast(self):
        """Indicate whether backend encodings are attached to this BatchEncoding.

        Returns:
            :obj:`bool`: True if encodings were provided (i.e. the output presumably comes from a
            fast tokenizer), False otherwise.
        """
        return self._encodings is not None

    def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
        """If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...).
        If the key is an integer, get the EncodingFast for the batch item with index `key`.

        Raises:
            KeyError: if a string key is missing, or if a non-string key is used while no
                backend encodings are available.
        """
        if isinstance(item, str):
            return self.data[item]
        elif self._encodings is not None:
            return self._encodings[item]
        else:
            raise KeyError(
                "Indexing with integers (to access backend Encoding for a given batch index) "
                "is not available when using Python based tokenizers"
            )

    def __getattr__(self, item: str):
        """Expose the entries of the underlying dict as attributes (``enc.input_ids``).

        Only called when normal attribute lookup fails.

        Raises:
            AttributeError: if ``item`` is not a key of the underlying dict.
        """
        # If ``data`` itself is missing (instance created without ``__init__``,
        # e.g. while being copied or unpickled), ``self.data`` below would
        # re-enter this method forever; fail fast instead.
        if item == "data":
            raise AttributeError(item)
        try:
            return self.data[item]
        except KeyError:
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{item}'") from None

    def __getstate__(self):
        """Return the picklable state: the dict content and the backend encodings."""
        return {"data": self.data, "encodings": self._encodings}

    def __setstate__(self, state):
        """Restore the dict content and the backend encodings from ``state`` when present."""
        if "data" in state:
            self.data = state["data"]

        if "encodings" in state:
            self._encodings = state["encodings"]

    def keys(self):
        """Return the keys view of the underlying dict."""
        return self.data.keys()

    def values(self):
        """Return the values view of the underlying dict."""
        return self.data.values()

    def items(self):
        """Return the items view of the underlying dict."""
        return self.data.items()

    @property
    def encodings(self) -> Optional[List[EncodingFast]]:
        """
        Return the list of all encodings from the tokenization process.

        Returns: List[EncodingFast] or None if input was tokenized through Python (i.e. not fast) tokenizer
        """
        return self._encodings

    def _fast_encodings(self, caller: str) -> List[EncodingFast]:
        """Return the backend encodings, or raise ``ValueError`` naming ``caller`` if there are none."""
        if not self._encodings:
            raise ValueError(f"{caller}() is not available when using Python based tokenizers")
        return self._encodings

    @staticmethod
    def _split_batch_index(batch_or_index: int, index: Optional[int]) -> Tuple[int, int]:
        """Resolve the ``(batch_or_x_index, x_index)`` calling convention into ``(batch_index, x_index)``.

        When only one index is given it addresses the first sequence of the batch.
        """
        if index is None:
            return 0, batch_or_index
        return batch_or_index, index

    def tokens(self, batch_index: int = 0) -> List[str]:
        """Return the ``tokens`` of the backend encoding at ``batch_index``.

        Raises:
            ValueError: if no backend encodings are available.
        """
        return self._fast_encodings("tokens")[batch_index].tokens

    def words(self, batch_index: int = 0) -> List[Optional[int]]:
        """Return the ``words`` of the backend encoding at ``batch_index``.

        Raises:
            ValueError: if no backend encodings are available.
        """
        return self._fast_encodings("words")[batch_index].words

    def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
        """
        Get the index of the word corresponding (i.e. comprising) to an encoded token
        in a sequence of the batch.

        Can be called as:

        - ``self.token_to_word(token_index)`` if batch size is 1
        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence. Negative values count from the end of the sequence.

        Returns:
            :obj:`int`:
                index of the word in the input sequence.

        Raises:
            ValueError: if no backend encodings are available.
        """
        encodings = self._fast_encodings("token_to_word")
        batch_index, token_index = self._split_batch_index(batch_or_token_index, token_index)
        # Negative batch indices are handled by regular sequence indexing.
        encoding = encodings[batch_index]
        if token_index < 0:
            token_index += len(encoding.tokens)
        return encoding.token_to_word(token_index)

    def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan:
        """
        Get the encoded token span corresponding to a word in the sequence of the batch.

        Token spans are returned as a TokenSpan NamedTuple with:

        - start: index of the first token
        - end: index of the token following the last token

        Can be called as:

        - ``self.word_to_tokens(word_index)`` if batch size is 1
        - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence. Negative values count from the last word.

        Returns:
            :obj:`TokenSpan`:
                Span of tokens in the encoded sequence.

                :obj:`TokenSpan` are NamedTuple with:

                - start: index of the first token
                - end: index of the token following the last token

        Raises:
            ValueError: if no backend encodings are available.
        """
        encodings = self._fast_encodings("word_to_tokens")
        batch_index, word_index = self._split_batch_index(batch_or_word_index, word_index)
        # Negative batch indices are handled by regular sequence indexing.
        encoding = encodings[batch_index]
        if word_index < 0:
            # Resolve against the number of words, not the number of tokens.
            # Assumes word indices are 0-based and contiguous -- TODO confirm.
            known_words = [word for word in encoding.words if word is not None]
            word_index += (max(known_words) + 1) if known_words else 0
        return TokenSpan(*(encoding.word_to_tokens(word_index)))

    def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span corresponding to an encoded token in a sequence of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string associated to the token
        - end: index of the character following the last character in the original string associated to the token

        Can be called as:

        - ``self.token_to_chars(token_index)`` if batch size is 1
        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_token_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the token in the sequence.
            token_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_token_index`, this can be the index
                of the token in the sequence.

        Returns:
            :obj:`CharSpan`:
                Span of characters in the original string.

                :obj:`CharSpan` are NamedTuple with:

                - start: index of the first character in the original string
                - end: index of the character following the last character in the original string

        Raises:
            ValueError: if no backend encodings are available.
        """
        encodings = self._fast_encodings("token_to_chars")
        batch_index, token_index = self._split_batch_index(batch_or_token_index, token_index)
        return CharSpan(*(encodings[batch_index].token_to_chars(token_index)))

    def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
        Get the index of the token in the encoded output comprising a character
        in the original string for a sequence of the batch.

        Can be called as:

        - ``self.char_to_token(char_index)`` if batch size is 1
        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.

        Returns:
            :obj:`int`: Index of the token.

        Raises:
            ValueError: if no backend encodings are available.
        """
        encodings = self._fast_encodings("char_to_token")
        batch_index, char_index = self._split_batch_index(batch_or_char_index, char_index)
        return encodings[batch_index].char_to_token(char_index)

    def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan:
        """
        Get the character span in the original string corresponding to a given word in a sequence
        of the batch.

        Character spans are returned as a CharSpan NamedTuple with:

        - start: index of the first character in the original string
        - end: index of the character following the last character in the original string

        Can be called as:

        - ``self.word_to_chars(word_index)`` if batch size is 1
        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1

        Args:
            batch_or_word_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the word in the sequence.
            word_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_word_index`, this can be the index
                of the word in the sequence.

        Returns:
            :obj:`CharSpan`:
                Span of the characters associated to the word in the original string.
                CharSpan are NamedTuple with:

                - start: index of the first character associated to the word in the original string
                - end: index of the character following the last character associated to the word

        Raises:
            ValueError: if no backend encodings are available.
        """
        encodings = self._fast_encodings("word_to_chars")
        batch_index, word_index = self._split_batch_index(batch_or_word_index, word_index)
        return CharSpan(*(encodings[batch_index].word_to_chars(word_index)))

    def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int:
        """
        Get the word in the original string corresponding to a character in the original string of
        a sequence of the batch.

        Can be called as:

        - ``self.char_to_word(char_index)`` if batch size is 1
        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1

        This method is particularly suited when the input sequences are provided as
        pre-tokenized sequences (i.e. words are defined by the user). In this case it allows
        to easily associate encoded tokens with provided tokenized words.

        Args:
            batch_or_char_index (:obj:`int`):
                Index of the sequence in the batch. If the batch only comprises one sequence,
                this can be the index of the character in the original string.
            char_index (:obj:`int`, `optional`):
                If a batch index is provided in `batch_or_char_index`, this can be the index
                of the character in the original string.

        Returns:
            :obj:`int`: Index of the word comprising the character.

        Raises:
            ValueError: if no backend encodings are available.
        """
        encodings = self._fast_encodings("char_to_word")
        batch_index, char_index = self._split_batch_index(batch_or_char_index, char_index)
        return encodings[batch_index].char_to_word(char_index)

    def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_batch_axis: bool = False):
        """Convert every value of the dict, in place, to a tensor of the requested framework.

        Args:
            tensor_type: ``None`` (no conversion), a :class:`TensorType` or its string value
                ("pt", "tf" or "np").
            prepend_batch_axis: wrap each value in a list before conversion, which adds a
                leading axis.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ImportError: if the requested framework is not available.
            ValueError: if ``tensor_type`` is not a valid :class:`TensorType` value, or if a value
                cannot be converted (e.g. ragged lists).
        """
        if tensor_type is None:
            return self

        # Accept the plain string codes as well as TensorType members.
        if not isinstance(tensor_type, TensorType):
            tensor_type = TensorType(tensor_type)

        # Pick the tensor constructor of the requested framework.
        if tensor_type == TensorType.TENSORFLOW and is_tf_available():
            as_tensor = tf.constant
        elif tensor_type == TensorType.PYTORCH and is_torch_available():
            as_tensor = torch.tensor
        elif tensor_type == TensorType.NUMPY:
            as_tensor = np.asarray
        else:
            raise ImportError(
                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
                    tensor_type
                )
            )

        # Iterate over a snapshot since entries are reassigned inside the loop.
        for key, value in list(self.items()):
            try:
                if prepend_batch_axis:
                    value = [value]

                tensor = as_tensor(value)

                # Drop a leading axis from tensors with more than 2 dimensions,
                # add one to tensors with fewer than 2.
                if tensor.ndim > 2:
                    tensor = tensor.squeeze(0)
                elif tensor.ndim < 2:
                    tensor = tensor[None, :]

                self[key] = tensor
            except Exception as err:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt/SystemExit are not converted; the original
                # error is kept as the cause for debugging.
                raise ValueError(
                    "Unable to create tensor, you should probably activate truncation and/or padding "
                    "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
                ) from err

        return self

    @torch_required
    def to(self, device: str):
        """Send all values to device by calling v.to(device), and return ``self``."""
        self.data = {k: v.to(device) for k, v in self.data.items()}
        return self
| |
|
| |
|
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| |
|
| | class SpecialTokensMixin: |
| | """ SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and |
| | handles specific behaviors related to special tokens. In particular, this class hold the |
| | attributes which can be used to directly access to these special tokens in a |
| | model-independant manner and allow to set and update the special tokens. |
| | """ |
| |
|
| | SPECIAL_TOKENS_ATTRIBUTES = [ |
| | "bos_token", |
| | "eos_token", |
| | "unk_token", |
| | "sep_token", |
| | "pad_token", |
| | "cls_token", |
| | "mask_token", |
| | "additional_special_tokens", |
| | ] |
| |
|
| | def __init__(self, verbose=True, **kwargs): |
| | self._bos_token = None |
| | self._eos_token = None |
| | self._unk_token = None |
| | self._sep_token = None |
| | self._pad_token = None |
| | self._cls_token = None |
| | self._mask_token = None |
| | self._pad_token_type_id = 0 |
| | self._additional_special_tokens = [] |
| | self.verbose = verbose |
| |
|
| | |
| | |
| | |
| | for key, value in kwargs.items(): |
| | if key in self.SPECIAL_TOKENS_ATTRIBUTES: |
| | if key == "additional_special_tokens": |
| | assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) |
| | setattr(self, key, value) |
| | elif isinstance(value, (str, AddedToken)): |
| | setattr(self, key, value) |
| | else: |
| | raise TypeError( |
| | "special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) |
| | ) |
| |
|
| | def sanitize_special_tokens(self) -> int: |
| | """ Make sure that all the special tokens attributes of the tokenizer (tokenizer.mask_token, tokenizer.cls_token, ...) |
| | are in the vocabulary. Add the missing ones to the vocabulary if needed. |
| | |
| | Return: |
| | Number of tokens added in the vocaulary during the operation. |
| | """ |
| | return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) |
| |
|
| | def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: |
| | """ |
| | Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them |
| | to class attributes. If special tokens are NOT in the vocabulary, they are added |
| | to it (indexed starting from the last index of the current vocabulary). |
| | |
| | Using `add_special_tokens` will ensure your special tokens can be used in several ways: |
| | |
| | - special tokens are carefully handled by the tokenizer (they are never split) |
| | - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. |
| | |
| | When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>') |
| | |
| | Args: |
| | special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: |
| | [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, |
| | ``additional_special_tokens``]. |
| | |
| | Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). |
| | |
| | Returns: |
| | Number of tokens added to the vocabulary. |
| | |
| | Examples:: |
| | |
| | # Let's see how to add a new classification token to GPT-2 |
| | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') |
| | model = GPT2Model.from_pretrained('gpt2') |
| | |
| | special_tokens_dict = {'cls_token': '<CLS>'} |
| | |
| | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) |
| | print('We have added', num_added_toks, 'tokens') |
| | model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. |
| | |
| | assert tokenizer.cls_token == '<CLS>' |
| | """ |
| | if not special_tokens_dict: |
| | return 0 |
| |
|
| | added_tokens = 0 |
| | for key, value in special_tokens_dict.items(): |
| | assert key in self.SPECIAL_TOKENS_ATTRIBUTES |
| |
|
| | if self.verbose: |
| | logger.info("Assigning %s to the %s key of the tokenizer", value, key) |
| | setattr(self, key, value) |
| |
|
| | if key == "additional_special_tokens": |
| | assert isinstance(value, (list, tuple)) and all( |
| | isinstance(t, (str, AddedToken)) for t in value |
| | ), f"Tokens {value} for key {key} should all be str or AddedToken instances" |
| | added_tokens += self.add_tokens(value, special_tokens=True) |
| | else: |
| | assert isinstance( |
| | value, (str, AddedToken) |
| | ), f"Token {value} for key {key} should be a str or an AddedToken instance" |
| | added_tokens += self.add_tokens([value], special_tokens=True) |
| |
|
| | return added_tokens |
| |
|
| | def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedToken]], special_tokens=False) -> int: |
| | """ |
| | Add a list of new tokens to the tokenizer class. If the new tokens are not in the |
| | vocabulary, they are added to it with indices starting from length of the current vocabulary. |
| | |
| | Args: |
| | new_tokens: string or list of string or :class:`~transformers.AddedToken`. Each string is a token to add. |
| | Tokens are only added if they are not already in the vocabulary. AddedToken wrap a string token to |
| | let you personnalize it's behavior (Whether this token should only match against single word, whether |
| | this token should strip all potential whitespaces on the left side, Whether this token should strip |
| | all potential whitespaces on the right side...). |
| | special_token: can be used to specify if the token is a special token. This mostly change the normalization |
| | behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance) |
| | |
| | See details for :class:`~transformers.AddedToken` in HuggingFace tokenizers library. |
| | |
| | Returns: |
| | Number of tokens added to the vocabulary. |
| | |
| | Examples:: |
| | |
| | # Let's see how to increase the vocabulary of Bert model and tokenizer |
| | tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') |
| | model = BertModel.from_pretrained('bert-base-uncased') |
| | |
| | num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) |
| | print('We have added', num_added_toks, 'tokens') |
| | model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. |
| | """ |
| | if not new_tokens: |
| | return 0 |
| |
|
| | if not isinstance(new_tokens, (list, tuple)): |
| | new_tokens = [new_tokens] |
| |
|
| | return self._add_tokens(new_tokens, special_tokens=special_tokens) |
| |
|
| | @property |
| | def bos_token(self): |
| | """ Beginning of sentence token (string). Log an error if used while not having been set. """ |
| | if self._bos_token is None and self.verbose: |
| | logger.error("Using bos_token, but it is not set yet.") |
| | return None |
| | return str(self._bos_token) |
| |
|
| | @property |
| | def eos_token(self): |
| | """ End of sentence token (string). Log an error if used while not having been set. """ |
| | if self._eos_token is None and self.verbose: |
| | logger.error("Using eos_token, but it is not set yet.") |
| | return None |
| | return str(self._eos_token) |
| |
|
| | @property |
| | def unk_token(self): |
| | """ Unknown token (string). Log an error if used while not having been set. """ |
| | if self._unk_token is None and self.verbose: |
| | logger.error("Using unk_token, but it is not set yet.") |
| | return None |
| | return str(self._unk_token) |
| |
|
| | @property |
| | def sep_token(self): |
| | """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ |
| | if self._sep_token is None and self.verbose: |
| | logger.error("Using sep_token, but it is not set yet.") |
| | return None |
| | return str(self._sep_token) |
| |
|
| | @property |
| | def pad_token(self): |
| | """ Padding token (string). Log an error if used while not having been set. """ |
| | if self._pad_token is None and self.verbose: |
| | logger.error("Using pad_token, but it is not set yet.") |
| | return None |
| | return str(self._pad_token) |
| |
|
| | @property |
| | def cls_token(self): |
| | """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ |
| | if self._cls_token is None and self.verbose: |
| | logger.error("Using cls_token, but it is not set yet.") |
| | return None |
| | return str(self._cls_token) |
| |
|
| | @property |
| | def mask_token(self): |
| | """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ |
| | if self._mask_token is None and self.verbose: |
| | logger.error("Using mask_token, but it is not set yet.") |
| | return None |
| | return str(self._mask_token) |
| |
|
| | @property |
| | def additional_special_tokens(self): |
| | """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ |
| | if self._additional_special_tokens is None and self.verbose: |
| | logger.error("Using additional_special_tokens, but it is not set yet.") |
| | return None |
| | return [str(tok) for tok in self._additional_special_tokens] |
| |
|
| | @bos_token.setter |
| | def bos_token(self, value): |
| | self._bos_token = value |
| |
|
| | @eos_token.setter |
| | def eos_token(self, value): |
| | self._eos_token = value |
| |
|
| | @unk_token.setter |
| | def unk_token(self, value): |
| | self._unk_token = value |
| |
|
| | @sep_token.setter |
| | def sep_token(self, value): |
| | self._sep_token = value |
| |
|
| | @pad_token.setter |
| | def pad_token(self, value): |
| | self._pad_token = value |
| |
|
| | @cls_token.setter |
| | def cls_token(self, value): |
| | self._cls_token = value |
| |
|
| | @mask_token.setter |
| | def mask_token(self, value): |
| | self._mask_token = value |
| |
|
| | @additional_special_tokens.setter |
| | def additional_special_tokens(self, value): |
| | self._additional_special_tokens = value |
| |
|
| | @property |
| | def bos_token_id(self): |
| | """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ |
| | if self._bos_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.bos_token) |
| |
|
| | @property |
| | def eos_token_id(self): |
| | """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ |
| | if self._eos_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.eos_token) |
| |
|
| | @property |
| | def unk_token_id(self): |
| | """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ |
| | if self._unk_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.unk_token) |
| |
|
| | @property |
| | def sep_token_id(self): |
| | """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ |
| | if self._sep_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.sep_token) |
| |
|
| | @property |
| | def pad_token_id(self): |
| | """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ |
| | if self._pad_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.pad_token) |
| |
|
| | @property |
| | def pad_token_type_id(self): |
| | """ Id of the padding token type in the vocabulary.""" |
| | return self._pad_token_type_id |
| |
|
| | @property |
| | def cls_token_id(self): |
| | """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ |
| | if self._cls_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.cls_token) |
| |
|
| | @property |
| | def mask_token_id(self): |
| | """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ |
| | if self._mask_token is None: |
| | return None |
| | return self.convert_tokens_to_ids(self.mask_token) |
| |
|
| | @property |
| | def additional_special_tokens_ids(self): |
| | """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ |
| | return self.convert_tokens_to_ids(self.additional_special_tokens) |
| |
|
@property
def special_tokens_map(self):
    """ A dictionary mapping special token class attributes (cls_token, unk_token...) to their
        values ('<unk>', '<cls>'...).

        Tokens of AddedToken type are converted to strings, so every returned token is a string.
        Attributes holding several tokens (a list or tuple, e.g. ``additional_special_tokens``) are
        returned as a container of the same kind whose items are strings.
        Attributes that are unset or empty are omitted.
    """
    set_attr = {}
    for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
        attr_value = getattr(self, "_" + attr)
        if not attr_value:
            continue
        if isinstance(attr_value, list):
            # Convert item by item: str() on the container would return its repr, not the tokens.
            set_attr[attr] = [str(token) for token in attr_value]
        elif isinstance(attr_value, tuple):
            set_attr[attr] = tuple(str(token) for token in attr_value)
        else:
            set_attr[attr] = str(attr_value)
    return set_attr
| |
|
@property
def special_tokens_map_extended(self):
    """ A dictionary mapping special token class attributes (cls_token, unk_token...) to their
        values ('<unk>', '<cls>'...).

        Values are returned exactly as stored: tokens of AddedToken type are kept as AddedToken
        (AddedToken can be used to control more finely how special tokens are tokenized).
        Attributes that are unset or empty are omitted.
    """
    candidates = ((name, getattr(self, "_" + name)) for name in self.SPECIAL_TOKENS_ATTRIBUTES)
    return {name: token for name, token in candidates if token}
| |
|
@property
def all_special_tokens(self):
    """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
        (cls_token, unk_token...).

        Tokens of AddedToken type are converted to strings: all returned tokens are strings.
    """
    return list(map(str, self.all_special_tokens_extended))
| |
|
@property
def all_special_tokens_extended(self):
    """ List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes
        (cls_token, unk_token...), without duplicates.

        Tokens of AddedToken type are kept as AddedToken (AddedToken can be used to control more
        finely how special tokens are tokenized).

        The tokens are returned in the order in which they first appear in
        ``special_tokens_map_extended``, so the result is stable from one run to the next.
    """
    all_toks = []
    seen = set()
    for attr_value in self.special_tokens_map_extended.values():
        # An attribute holds either a single token or a list/tuple of tokens.
        candidates = attr_value if isinstance(attr_value, (list, tuple)) else [attr_value]
        for token in candidates:
            if token not in seen:
                seen.add(token)
                all_toks.append(token)
    return all_toks
| |
|
@property
def all_special_ids(self):
    """ List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to
        class attributes (cls_token, unk_token...), in the same order as ``all_special_tokens``.
    """
    return self.convert_tokens_to_ids(self.all_special_tokens)
| |
|
| |
|
# Shared description of the encoding keyword arguments, appended to method docstrings
# through the `add_end_docstrings` decorator.
ENCODE_KWARGS_DOCSTRING = r"""
            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
                If set to ``True``, the sequences will be encoded with the special tokens relative
                to their model.
            `padding` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`):
                Activate and control padding. Accepts the following values:

                * `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence is provided),
                * `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`)
                * `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths)
            `truncation` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`):
                Activate and control truncation. Accepts the following values:

                * `True` or `'longest_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided,
                * `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided,
                * `'only_second'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided,
                * `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size)
            `max_length` (:obj:`Union[int, None]`, `optional`, defaults to :obj:`None`):
                Control the length for padding/truncation. Accepts the following values

                * `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated.
                * `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters.
            stride (:obj:`int`, `optional`, defaults to ``0``):
                If set to a number along with max_length, the overflowing tokens returned when `return_overflowing_tokens=True`
                will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflowing sequences.
                The value of this argument defines the number of overlapping tokens.
            is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
                Set to True to indicate the input is already tokenized
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                >= 7.5 (Volta).
            return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
                Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
                PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of python integers.
"""
| |
|
# Description of the extra `return_*` keyword arguments and of the returned dictionary,
# appended to method docstrings through the `add_end_docstrings` decorator.
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
            return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`):
                Whether to return token type IDs. If left to the default, will return the token type IDs according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are token type IDs? <../glossary.html#token-type-ids>`__
            return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

                `What are attention masks? <../glossary.html#attention-mask>`__
            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True to return overflowing token sequences (default False).
            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True to return special tokens mask information (default False).
            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True to return (char_start, char_end) for each token (default False).
                If using Python's tokenizer, this method will raise NotImplementedError.
                This one is only available on fast tokenizers inheriting from PreTrainedTokenizerFast.
            **kwargs: passed to the `self.tokenize()` method

        Return:
            A Dictionary of shape::

                {
                    input_ids: list[int],
                    token_type_ids: list[int] if return_token_type_ids is True (default)
                    attention_mask: list[int] if return_attention_mask is True (default)
                    overflowing_tokens: list[int] if the tokenizer is a slow tokenizer, else a List[List[int]] if a ``max_length`` is specified and ``return_overflowing_tokens=True``
                    special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True``
                    and return_special_tokens_mask is True
                }

            With the fields:

            - ``input_ids``: list of token ids to be fed to a model
            - ``token_type_ids``: list of token type ids to be fed to a model
            - ``attention_mask``: list of indices specifying which tokens should be attended to by the model
            - ``overflowing_tokens``: list of overflowing tokens sequences if a max length is specified and ``return_overflowing_tokens=True``.
            - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 1 specifying special added
              tokens and 0 specifying sequence tokens.
"""
| |
|
| |
|
| | class PreTrainedTokenizerBase(SpecialTokensMixin): |
| | """ Base class for slow and fast tokenizers. |
| | |
| | Handle shared (mostly boiler plate) methods for slow and fast tokenizers. |
| | """ |
| |
|
# Class-level configuration, expected to be overridden by concrete tokenizer classes.

# Maps each vocabulary-file identifier (also used as the `__init__` keyword that receives the
# resolved path) to the name of the file to look for.
vocab_files_names: Dict[str, str] = {}
# For each vocabulary-file identifier, maps a model shortcut name to the location of that file.
pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
# Maps a model shortcut name to the default `__init__` keyword arguments for that model.
pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
# Maps a model shortcut name to its maximum input size; the keys are also treated as the list of
# known shortcut names when loading a pretrained tokenizer.
max_model_input_sizes: Dict[str, int] = {}
# Default value of the `model_input_names` attribute (overridable through the constructor).
model_input_names: List[str] = ["token_type_ids", "attention_mask"]


# Default padding side; the constructor only accepts "right" or "left".
padding_side: str = "right"
| |
|
def __init__(self, **kwargs):
    """ Store the construction arguments and read the tokenizer-level options from ``kwargs``.

    Recognized keyword arguments (removed from ``kwargs`` before it is forwarded to the parent class):
        model_max_length (or its older name ``max_len``): maximum input length; when neither is
            given (or the value is ``None``), ``VERY_LARGE_INTEGER`` is used.
        padding_side: ``"right"`` or ``"left"``; defaults to the class attribute.
        model_input_names: defaults to the class attribute.

    All remaining keyword arguments are passed to the parent ``__init__``.
    """
    # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``).
    # Keep a copy: ``kwargs`` is mutated by the ``pop`` calls below, and the stored arguments must
    # still contain ``model_max_length``, ``padding_side``, ... so they are written back on save.
    self.init_inputs = ()
    self.init_kwargs = copy.deepcopy(kwargs)

    # For backward compatibility we fallback to set model_max_length from max_len if provided
    model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
    self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER

    # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
    self.padding_side = kwargs.pop("padding_side", self.padding_side)
    assert self.padding_side in [
        "right",
        "left",
    ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
    self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

    super().__init__(**kwargs)
| |
|
@property
def max_len(self) -> int:
    """ Backward-compatible alias of `model_max_length` (the attribute was renamed to avoid ambiguity). """
    limit = self.model_max_length
    return limit
| |
|
@property
def max_len_single_sentence(self) -> int:
    """ `model_max_length` minus the number of special tokens added to a single sequence. """
    return self.model_max_length - self.num_special_tokens_to_add(pair=False)


@property
def max_len_sentences_pair(self) -> int:
    """ `model_max_length` minus the number of special tokens added to a pair of sequences. """
    return self.model_max_length - self.num_special_tokens_to_add(pair=True)


@max_len_single_sentence.setter
def max_len_single_sentence(self, value) -> None:
    """ For backward compatibility, allow to try to setup 'max_len_single_sentence'.

    The value is computed automatically and cannot really be set: assigning the value the property
    already has is accepted (with a deprecation warning when ``self.verbose`` is true), any other
    value raises ``ValueError``.
    """
    if value != self.model_max_length - self.num_special_tokens_to_add(pair=False):
        raise ValueError(
            "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
        )
    # `verbose` only controls the warning; it must not turn a valid assignment into an error.
    if self.verbose:
        logger.warning(
            "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
        )


@max_len_sentences_pair.setter
def max_len_sentences_pair(self, value) -> None:
    """ For backward compatibility, allow to try to setup 'max_len_sentences_pair'.

    The value is computed automatically and cannot really be set: assigning the value the property
    already has is accepted (with a deprecation warning when ``self.verbose`` is true), any other
    value raises ``ValueError``.
    """
    if value != self.model_max_length - self.num_special_tokens_to_add(pair=True):
        raise ValueError(
            "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
        )
    # `verbose` only controls the warning; it must not turn a valid assignment into an error.
    if self.verbose:
        logger.warning(
            "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
        )
| |
|
@classmethod
def from_pretrained(cls, *inputs, **kwargs):
    r"""
    Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.

    All arguments are forwarded unchanged to ``cls._from_pretrained``.

    Args:
        pretrained_model_name_or_path: one of:

            - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
            - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
            - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
            - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.

        cache_dir: (`optional`) string:
            Path to a directory in which the downloaded vocabulary files of a predefined tokenizer should be cached if the standard cache should not be used.

        force_download: (`optional`) boolean, default False:
            Force to (re-)download the vocabulary files and override the cached versions if they exist.

        resume_download: (`optional`) boolean, default False:
            Do not delete an incompletely received file. Attempt to resume the download if such a file exists.

        proxies: (`optional`) dict, default None:
            A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
            The proxies are used on each request.

        inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.

        kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.

    Examples::

        # The base class `PreTrainedTokenizer` can't be instantiated directly, so the examples use a derived class: BertTokenizer

        # Download vocabulary from S3 and cache.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Download vocabulary from S3 (user-uploaded) and cache.
        tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')

        # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')

        # You can link tokens to special vocabulary when instantiating
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
        # You should be sure '<unk>' is in the vocabulary when doing that.
        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
        assert tokenizer.unk_token == '<unk>'

    """
    loader = cls._from_pretrained
    return loader(*inputs, **kwargs)
| |
|
@classmethod
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
    """ Locate the tokenizer files, instantiate the tokenizer and restore its saved state.

    Steps:
        1. Work out where each file lives: from the class's pretrained maps when the name is a known
           shortcut, otherwise from a single file/url (deprecated), a local directory, or the URL
           built by ``hf_bucket_url`` for a model identifier.
        2. Resolve every location through ``cached_path``.
        3. Build the init kwargs from the saved tokenizer config (or the class's predefined
           configuration), then apply the caller's ``kwargs`` on top.
        4. Instantiate ``cls``, then load the saved special tokens map and added tokens.

    Args:
        pretrained_model_name_or_path: shortcut name, model identifier, directory, file path or url.
        *init_inputs: positional arguments for the tokenizer ``__init__``; when empty, the
            ``init_inputs`` saved in the tokenizer config (if any) are used instead.
        **kwargs: ``cache_dir``, ``force_download``, ``resume_download``, ``proxies`` and
            ``local_files_only`` are consumed here and given to ``cached_path``; everything else
            overrides the saved/predefined init kwargs.

    Returns:
        The instantiated tokenizer.

    Raises:
        ValueError: a single file or url was given but the class needs several vocabulary files.
        EnvironmentError: a file could not be resolved, or no file at all was found.
        OSError: the tokenizer could not be instantiated from the resolved files.
    """
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    local_files_only = kwargs.pop("local_files_only", False)

    s3_models = list(cls.max_model_input_sizes.keys())
    vocab_files = {}
    init_configuration = {}
    if pretrained_model_name_or_path in s3_models:
        # Known shortcut name: file locations come from the class-level pretrained maps.
        for file_id, map_list in cls.pretrained_vocab_files_map.items():
            vocab_files[file_id] = map_list[pretrained_model_name_or_path]
        if (
            cls.pretrained_init_configuration
            and pretrained_model_name_or_path in cls.pretrained_init_configuration
        ):
            init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
    else:
        # Not a known shortcut name: treat the argument as a path, a model identifier or a url.
        logger.info(
            "Model name '{}' not found in model shortcut name list ({}). "
            "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format(
                pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path
            )
        )

        if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
            if len(cls.vocab_files_names) > 1:
                raise ValueError(
                    "Calling {}.from_pretrained() with the path to a single file or url is not supported. "
                    "Use a model identifier or the path to a directory instead.".format(cls.__name__)
                )
            logger.warning(
                "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format(
                    cls.__name__
                )
            )
            file_id = list(cls.vocab_files_names.keys())[0]
            vocab_files[file_id] = pretrained_model_name_or_path
        else:
            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
            additional_files_names = {
                "added_tokens_file": ADDED_TOKENS_FILE,
                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                "full_tokenizer_file": FULL_TOKENIZER_FILE,
            }
            # Look for the tokenizer main vocabulary files + the additional tokens files
            for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
                if os.path.isdir(pretrained_model_name_or_path):
                    full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                    if not os.path.exists(full_file_name):
                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                        full_file_name = None
                else:
                    full_file_name = hf_bucket_url(
                        pretrained_model_name_or_path, filename=file_name, use_cdn=False
                    )

                vocab_files[file_id] = full_file_name

    # Get files from url, cache, or disk depending on the case
    try:
        resolved_vocab_files = {}
        for file_id, file_path in vocab_files.items():
            if file_path is None:
                resolved_vocab_files[file_id] = None
            else:
                resolved_vocab_files[file_id] = cached_path(
                    file_path,
                    cache_dir=cache_dir,
                    force_download=force_download,
                    proxies=proxies,
                    resume_download=resume_download,
                    local_files_only=local_files_only,
                )
    except EnvironmentError:
        if pretrained_model_name_or_path in s3_models:
            # `file_path` is the location whose resolution just failed: only `cached_path` is
            # expected to raise inside the `try` block above.
            msg = "Couldn't reach server at '{}' to download vocabulary files.".format(file_path)
        else:
            msg = (
                "Model name '{}' was not found in tokenizers model name list ({}). "
                "We assumed '{}' was a path or url to a directory containing vocabulary files "
                "named {}, but couldn't find such vocabulary files at this path or url.".format(
                    pretrained_model_name_or_path,
                    ", ".join(s3_models),
                    pretrained_model_name_or_path,
                    list(cls.vocab_files_names.values()),
                )
            )

        raise EnvironmentError(msg)

    if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
        raise EnvironmentError(
            "Model name '{}' was not found in tokenizers model name list ({}). "
            "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files "
            "named {} but couldn't find such vocabulary files at this path or url.".format(
                pretrained_model_name_or_path,
                ", ".join(s3_models),
                pretrained_model_name_or_path,
                list(cls.vocab_files_names.values()),
            )
        )

    for file_id, file_path in vocab_files.items():
        if file_path == resolved_vocab_files[file_id]:
            logger.info("loading file {}".format(file_path))
        else:
            logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id]))

    # Prepare tokenizer initialization kwargs
    # Were some inputs and kwargs saved for reloading?
    tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
    if tokenizer_config_file is not None:
        with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
            init_kwargs = json.load(tokenizer_config_handle)
        saved_init_inputs = init_kwargs.pop("init_inputs", ())
        if not init_inputs:
            init_inputs = saved_init_inputs
    else:
        init_kwargs = init_configuration

    # Update with newly provided kwargs
    init_kwargs.update(kwargs)

    # Set max length if needed
    if pretrained_model_name_or_path in cls.max_model_input_sizes:
        # if we're using a pretrained model, ensure the tokenizer
        # won't index sequences longer than the number of positional embeddings
        model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
        if model_max_length is not None and isinstance(model_max_length, (int, float)):
            init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

    # Merge resolved_vocab_files arguments in init_kwargs.
    added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
    for args_name, file_path in resolved_vocab_files.items():
        if args_name not in init_kwargs:
            init_kwargs[args_name] = file_path

    # Instantiate tokenizer.
    try:
        tokenizer = cls(*init_inputs, **init_kwargs)
    except OSError:
        raise OSError(
            "Unable to load vocabulary from file. "
            "Please check that the provided vocabulary is accessible and not corrupted."
        )

    # Save inputs and kwargs for saving and re-loading with ``save_pretrained``
    tokenizer.init_inputs = init_inputs
    tokenizer.init_kwargs = init_kwargs

    # If there is a complementary special token map, load it
    special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
    if special_tokens_map_file is not None:
        with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
            special_tokens_map = json.load(special_tokens_map_handle)

        for key, value in special_tokens_map.items():
            # A dict value is the saved state of an AddedToken: rebuild the object.
            if isinstance(value, dict):
                value = AddedToken(**value)
            setattr(tokenizer, key, value)

    # Add supplementary tokens.
    special_tokens = tokenizer.all_special_tokens
    if added_tokens_file is not None:
        with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
            added_tok_encoder = json.load(added_tokens_handle)

        # Sort added tokens by index so they are re-added in their original order
        added_tok_encoder_sorted = sorted(added_tok_encoder.items(), key=lambda x: x[1])

        for token, index in added_tok_encoder_sorted:
            assert index == len(tokenizer), (
                f"Non-consecutive added token '{token}' found. "
                f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
            )
            tokenizer.add_tokens(token, special_tokens=token in special_tokens)

    # Let the tokenizer check its special tokens; a truthy result means tokens were added to the vocabulary
    added_tokens = tokenizer.sanitize_special_tokens()
    if added_tokens:
        logger.warning(
            "Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained."
        )

    return tokenizer
| |
|
def save_pretrained(self, save_directory) -> Tuple[str]:
    """ Save the tokenizer vocabulary files together with:
            - added tokens,
            - special-tokens-to-class-attributes-mapping,
            - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert).

        Warning: This won't save modifications you may have applied to the tokenizer after the instantiation
        (e.g. modifying tokenizer.do_lower_case after creation).

        This method makes sure the full tokenizer can then be re-loaded using the
        :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.

        Args:
            save_directory: directory to write to; created if missing. If it is an existing file,
                an error is logged and nothing is saved.

        Returns:
            The paths returned by ``save_vocabulary`` followed by the special tokens map path and the
            added tokens path (the latter is only written when there are added tokens), or ``None``
            when ``save_directory`` is a file.
    """
    if os.path.isfile(save_directory):
        logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
        return
    os.makedirs(save_directory, exist_ok=True)

    special_tokens_map_file, added_tokens_file, tokenizer_config_file = (
        os.path.join(save_directory, file_name)
        for file_name in (SPECIAL_TOKENS_MAP_FILE, ADDED_TOKENS_FILE, TOKENIZER_CONFIG_FILE)
    )

    # Init arguments to persist; vocabulary file paths are dropped from the saved config.
    tokenizer_config = copy.deepcopy(self.init_kwargs)
    if self.init_inputs:
        tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
    for file_id in self.vocab_files_names:
        tokenizer_config.pop(file_id, None)

    with open(tokenizer_config_file, "w", encoding="utf-8") as config_handle:
        config_handle.write(json.dumps(tokenizer_config, ensure_ascii=False))

    with open(special_tokens_map_file, "w", encoding="utf-8") as map_handle:
        # AddedToken objects are stored through their state dict; other values are written as they are.
        serializable_map = {
            key: (value.__getstate__() if isinstance(value, AddedToken) else value)
            for key, value in self.special_tokens_map_extended.items()
        }
        map_handle.write(json.dumps(serializable_map, ensure_ascii=False))

    added_vocab = self.get_added_vocab()
    if added_vocab:
        with open(added_tokens_file, "w", encoding="utf-8") as added_handle:
            added_handle.write(json.dumps(added_vocab, ensure_ascii=False))

    saved_vocab_files = self.save_vocabulary(save_directory)
    return saved_vocab_files + (special_tokens_map_file, added_tokens_file)
| |
|
@add_end_docstrings(
    ENCODE_KWARGS_DOCSTRING,
    """
            **kwargs: passed to the `self.tokenize()` method.
    """,
)
def encode(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs
):
    """
    Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.

    Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.

    All arguments are handed to ``encode_plus``; only the ``"input_ids"`` entry of its result is returned.

    Args:
        text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
            The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
            the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
            method)
        text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
            Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
            string using the `tokenize` method) or a list of integers (tokenized string ids using the
            `convert_tokens_to_ids` method)
    """
    return self.encode_plus(
        text,
        text_pair=text_pair,
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        return_tensors=return_tensors,
        **kwargs,
    )["input_ids"]
| |
|
def num_special_tokens_to_add(self, pair: bool = False) -> int:
    """ Number of special tokens to add, for a single sequence or (``pair=True``) a pair of sequences.

    This base version is a placeholder that always raises ``NotImplementedError``.
    The value is subtracted from ``model_max_length`` by ``max_len_single_sentence`` and
    ``max_len_sentences_pair``.
    """
    raise NotImplementedError
| |
|
def _get_padding_truncation_strategies(
    self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
    """ Find the correct padding/truncation strategy with backward compatibility
        for old arguments (truncation_strategy and pad_to_max_length) and behaviors.

    Args:
        padding: ``False``, ``True``, a strategy name accepted by ``PaddingStrategy`` or a
            ``PaddingStrategy`` member.
        truncation: ``False``, ``True``, a strategy name accepted by ``TruncationStrategy`` or a
            ``TruncationStrategy`` member.
        max_length: target length; when ``None`` and a strategy needs one, ``self.model_max_length``
            is used, unless it is larger than ``LARGE_INTEGER``, in which case that strategy is disabled.
        pad_to_multiple_of: when padding and truncation are both active, ``max_length`` must be a
            multiple of this value.
        verbose: whether to emit the warnings.
        **kwargs: may contain the deprecated ``truncation_strategy`` and ``pad_to_max_length``, which
            are removed from it; they are only honoured when ``truncation`` / ``padding`` is ``False``.

    Returns:
        A tuple ``(padding_strategy, truncation_strategy, max_length, kwargs)``.

    Raises:
        ValueError: padding is requested but the tokenizer has no usable padding token, or
            ``max_length`` is not a multiple of ``pad_to_multiple_of`` while both padding and
            truncation are active.
    """
    old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
    old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

    # Backward compatibility for previous behavior:
    # providing only `max_length` activates truncation to that length
    if max_length is not None and padding is False and truncation is False:
        if verbose:
            logger.warning(
                "Truncation was not explicitely activated but `max_length` is provided a specific value, "
                "please use `truncation=True` to explicitely truncate examples to max length. "
                "Defaulting to 'longest_first' truncation strategy. "
                "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
                "more precisely by providing a specific strategy to `truncation`."
            )
        truncation = "longest_first"

    # Get padding strategy
    if padding is False and old_pad_to_max_length:
        if verbose:
            warnings.warn(
                "The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
                "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
                "use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
                "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
                "maximal input size of the model (e.g. 512 for Bert).",
                DeprecationWarning,
            )
        if max_length is None:
            padding_strategy = PaddingStrategy.LONGEST
        else:
            padding_strategy = PaddingStrategy.MAX_LENGTH
    elif padding is not False:
        if padding is True:
            padding_strategy = PaddingStrategy.LONGEST
        elif not isinstance(padding, PaddingStrategy):
            padding_strategy = PaddingStrategy(padding)
        else:
            # Already a PaddingStrategy member: use it as is.
            padding_strategy = padding
    else:
        padding_strategy = PaddingStrategy.DO_NOT_PAD

    # Get truncation strategy
    if truncation is False and old_truncation_strategy != "do_not_truncate":
        if verbose:
            warnings.warn(
                "The `truncation_strategy` argument is deprecated and will be removed in a future version, "
                "use `truncation=True` to truncate examples to a max length. You can give a specific "
                "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
                "maximal input size of the model (e.g. 512 for Bert). "
                " If you have pairs of inputs, you can give a specific truncation strategy selected among "
                "`truncation='only_first'` (will only truncate the first sentence in the pairs) "
                "`truncation='only_second'` (will only truncate the second sentence in the pairs) "
                "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
                DeprecationWarning,
            )
        truncation_strategy = TruncationStrategy(old_truncation_strategy)
    elif truncation is not False:
        if truncation is True:
            truncation_strategy = TruncationStrategy.LONGEST_FIRST
        elif not isinstance(truncation, TruncationStrategy):
            truncation_strategy = TruncationStrategy(truncation)
        else:
            # Already a TruncationStrategy member: use it as is.
            truncation_strategy = truncation
    else:
        truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

    # Set max length if needed
    if max_length is None:
        if padding_strategy == PaddingStrategy.MAX_LENGTH:
            if self.model_max_length > LARGE_INTEGER:
                if verbose:
                    logger.warning(
                        "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
                        "Default to no padding."
                    )
                padding_strategy = PaddingStrategy.DO_NOT_PAD
            else:
                max_length = self.model_max_length

        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
            if self.model_max_length > LARGE_INTEGER:
                if verbose:
                    logger.warning(
                        "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
                        "Default to no truncation."
                    )
                truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
            else:
                max_length = self.model_max_length

    # Test if we have a padding token
    if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
        raise ValueError(
            "Asking to pad but the tokenizer does not have a padding token. "
            "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
            "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
        )

    # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
    if (
        truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
        and padding_strategy != PaddingStrategy.DO_NOT_PAD
        and pad_to_multiple_of is not None
        and max_length is not None
        and (max_length % pad_to_multiple_of != 0)
    ):
        raise ValueError(
            f"Truncation and padding are both activated but "
            f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
        )

    return padding_strategy, truncation_strategy, max_length, kwargs
| |
|
| | @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
def __call__(
    self,
    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Returns a dictionary containing the encoded sequence or sequence pair and additional information:
    the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.

    The input is treated as a batch (and forwarded to ``batch_encode_plus``) when ``text`` is a list/tuple and
    ``is_pretokenized`` is ``False``, or when ``is_pretokenized`` is ``True`` and ``text`` is a non-empty
    list/tuple whose first element is itself a list/tuple. Otherwise it is forwarded to ``encode_plus``.

    For batched input, ``text_pair`` must be a list/tuple with the same number of examples as ``text``:
    a ``TypeError`` is raised if it is a plain string and a ``ValueError`` if the batch sizes differ.
    An ``AssertionError`` is raised if ``text`` or ``text_pair`` do not have one of the supported types.

    Args:
        text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
            The sequence or batch of sequences to be encoded.
            Each sequence can be a string or a list of strings (pre-tokenized string).
            If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
            (to lift the ambiguity with a batch of sequences)
        text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
            The sequence or batch of sequences to be encoded.
            Each sequence can be a string or a list of strings (pre-tokenized string).
            If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True`
            (to lift the ambiguity with a batch of sequences)
    """
    # Input type checking for clearer error (only the first element of each container is inspected)
    assert isinstance(text, str) or (
        isinstance(text, (list, tuple))
        and (
            len(text) == 0
            or (
                isinstance(text[0], str)
                or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str)))
            )
        )
    ), (
        "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
        "or `List[List[str]]` (batch of pretokenized examples)."
    )

    assert (
        text_pair is None
        or isinstance(text_pair, str)
        or (
            isinstance(text_pair, (list, tuple))
            and (
                len(text_pair) == 0
                or (
                    isinstance(text_pair[0], str)
                    or (
                        isinstance(text_pair[0], (list, tuple))
                        and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str))
                    )
                )
            )
        )
    ), (
        "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
        "or `List[List[str]]` (batch of pretokenized examples)."
    )

    is_batched = bool(
        (not is_pretokenized and isinstance(text, (list, tuple)))
        or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)))
    )

    # Arguments forwarded unchanged to whichever encoding method is selected below.
    shared_kwargs = dict(
        add_special_tokens=add_special_tokens,
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        stride=stride,
        is_pretokenized=is_pretokenized,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
    )

    if not is_batched:
        return self.encode_plus(text=text, text_pair=text_pair, **shared_kwargs, **kwargs)

    if text_pair is None:
        batch_text_or_text_pairs = text
    else:
        # `zip` stops at the shortest iterable and happily iterates over the characters of a string,
        # so mismatched inputs would otherwise be silently truncated or paired with single characters.
        if isinstance(text_pair, str):
            raise TypeError(
                "When tokenizing a batch of texts, `text_pair` must be a list or tuple "
                "with the same length as `text`, not a single string."
            )
        if len(text) != len(text_pair):
            raise ValueError(
                f"Batch length of `text` ({len(text)}) does not match "
                f"batch length of `text_pair` ({len(text_pair)})."
            )
        batch_text_or_text_pairs = list(zip(text, text_pair))

    return self.batch_encode_plus(batch_text_or_text_pairs=batch_text_or_text_pairs, **shared_kwargs, **kwargs)
| |
|
| | @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
def encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Encode a single sequence or a single pair of sequences.

    Returns a dictionary containing the encoded sequence or sequence pair and additional information:
    the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
    The actual encoding is delegated to ``self._encode_plus``.

    Args:
        text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
            The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
            the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
            method)
        text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`):
            Optional second sequence to be encoded. This can be a string, a list of strings (tokenized
            string using the `tokenize` method) or a list of integers (tokenized string ids using the
            `convert_tokens_to_ids` method)
    """
    # Turn the user-facing `padding` / `truncation` arguments into strategy values; the helper also
    # hands back the (possibly adjusted) `max_length` and the kwargs that should be forwarded.
    resolved = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )
    padding_strategy, truncation_strategy, max_length, kwargs = resolved

    return self._encode_plus(
        text=text,
        text_pair=text_pair,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        is_pretokenized=is_pretokenized,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
        **kwargs,
    )
| |
|
def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """ Backend hook called by ``encode_plus`` with already-resolved padding/truncation strategies.

    This base implementation is a placeholder and always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
| |
|
| | @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
def batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        List[TextInput],
        List[TextInputPair],
        List[PreTokenizedInput],
        List[PreTokenizedInputPair],
        List[EncodedInput],
        List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """
    Encode a batch of sequences or a batch of sequence pairs.

    Returns a dictionary containing the encoded sequence or sequence pair and additional information:
    the mask for sequence classification and the overflowing elements if a ``max_length`` is specified.
    The actual encoding is delegated to ``self._batch_encode_plus``.

    Args:
        batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`,
                                  :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`,
                                  and for not-fast tokenizers, also:
                                  :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
            Batch of sequences or pair of sequences to be encoded.
            This can be a list of string/string-sequences/int-sequences or a list of pair of
            string/string-sequences/int-sequence (see details in encode_plus)
    """
    # Turn the user-facing `padding` / `truncation` arguments into strategy values; the helper also
    # hands back the (possibly adjusted) `max_length` and the kwargs that should be forwarded.
    resolved = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )
    padding_strategy, truncation_strategy, max_length, kwargs = resolved

    return self._batch_encode_plus(
        batch_text_or_text_pairs=batch_text_or_text_pairs,
        add_special_tokens=add_special_tokens,
        padding_strategy=padding_strategy,
        truncation_strategy=truncation_strategy,
        max_length=max_length,
        stride=stride,
        is_pretokenized=is_pretokenized,
        pad_to_multiple_of=pad_to_multiple_of,
        return_tensors=return_tensors,
        return_token_type_ids=return_token_type_ids,
        return_attention_mask=return_attention_mask,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_offsets_mapping=return_offsets_mapping,
        return_length=return_length,
        verbose=verbose,
        **kwargs,
    )
| |
|
def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
        List[TextInput],
        List[TextInputPair],
        List[PreTokenizedInput],
        List[PreTokenizedInputPair],
        List[EncodedInput],
        List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_pretokenized: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
) -> BatchEncoding:
    """ Backend hook called by ``batch_encode_plus`` with already-resolved padding/truncation strategies.

    This base implementation is a placeholder and always raises ``NotImplementedError``.
    """
    raise NotImplementedError()
| |
|
def pad(
    self,
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding: Union[bool, str] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    verbose: bool = True,
) -> BatchEncoding:
    """ Pad a single encoded input or a batch of encoded inputs up to a predefined length or to the
    longest sequence in the batch.

    The padding side (left/right) and the padding values are defined at the tokenizer level
    (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``).

    Args:
        encoded_inputs: Dictionary of tokenized inputs (`Dict[str, List[int]]`) or batch of tokenized inputs.
            A batch can be given as a dict of lists or as a list of dicts
            (`Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`), so ``tokenizer.pad()`` can be
            used during pre-processing as well as in a PyTorch Dataloader collate function.
        padding: Boolean or specific strategy to use for padding, among:
            - 'longest' (or `True`): pad to the longest sequence in the batch
            - 'max_length': pad to the max length
            - 'do_not_pad' (or `False`): do not pad
        max_length: target length used when padding to a maximum length.
        pad_to_multiple_of: (optional) Integer; if set, the sequence is padded to a multiple of this value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute
            capability >= 7.5 (Volta).
        return_attention_mask: (optional) Set to False to avoid returning the attention mask
            (default: decided from the model specifics).
        return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`):
            Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`,
            PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of lists of python integers.
        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Set to ``False`` to avoid printing infos and warnings.

    Returns:
        A ``BatchEncoding`` with the padded inputs. When ``input_ids`` is empty, ``encoded_inputs`` itself
        is returned (with an empty ``attention_mask`` added if requested).
    """
    # A list of per-example dicts is first transposed into a single dict of lists.
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
        examples = encoded_inputs
        encoded_inputs = {key: [example[key] for example in examples] for key in examples[0].keys()}

    assert "input_ids" in encoded_inputs, (
        "You should supply an encoding or a list of encodings to this method. "
        "An encoding is the output of one the encoding methods of the tokenizer, i.e. "
        "__call__/encode_plus/batch_encode_plus. "
    )

    # Nothing to pad: hand the (empty) inputs straight back.
    if not encoded_inputs["input_ids"]:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # Only the padding strategy and max_length matter here; truncation outputs are ignored.
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
        padding=padding, max_length=max_length, verbose=verbose
    )

    input_ids = encoded_inputs["input_ids"]

    # Single example: `input_ids` is a flat sequence rather than a sequence of sequences.
    if input_ids and not isinstance(input_ids[0], (list, tuple)):
        padded_example = self._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchEncoding(padded_example, tensor_type=return_tensors)

    batch_size = len(input_ids)
    assert all(
        len(column) == batch_size for column in encoded_inputs.values()
    ), "Some items in the output dictionnary have a different batch size than others."

    # "Longest" is resolved once for the whole batch, then applied as a fixed max length per example.
    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = max(map(len, input_ids))
        padding_strategy = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for index in range(batch_size):
        example = {key: column[index] for key, column in encoded_inputs.items()}
        padded_example = self._pad(
            example,
            max_length=max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        for key, value in padded_example.items():
            batch_outputs.setdefault(key, []).append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
| |
|
def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]:
    """ Build segment ids: one ``0`` per token of the first sequence, then one ``1`` per token of the
    second sequence when a second sequence is given.
    """
    segment_ids = [0] * len(token_ids_0)
    if token_ids_1 is not None:
        segment_ids = segment_ids + [1] * len(token_ids_1)
    return segment_ids
| |
|
def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List:
    """
    Assemble the model input from one sequence or a pair of sequences.

    This implementation adds no special tokens: a single sequence is returned as-is (the same object),
    and a pair is returned as the concatenation of both sequences.
    """
    return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1
| |
|
| | @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) |
def prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str] = False,
    truncation: Union[bool, str] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    prepend_batch_axis: bool = False,
    **kwargs
) -> BatchEncoding:
    """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.
    It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
    manages a moving window (with user defined stride) for overflowing tokens

    The deprecated ``return_lengths`` keyword argument is still honoured (it overrides ``return_length``)
    and triggers a ``FutureWarning`` when ``verbose`` is set.

    Args:
        ids: list of tokenized input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
        pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
    """

    if "return_lengths" in kwargs:
        if verbose:
            warnings.warn(
                "The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. "
                "Please use `return_length` instead.",
                FutureWarning,
            )
        return_length = kwargs["return_lengths"]

    # Resolve the user-facing padding/truncation arguments into strategy values
    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
        padding=padding,
        truncation=truncation,
        max_length=max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        verbose=verbose,
        **kwargs,
    )

    pair = bool(pair_ids is not None)
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    # When not specified by the caller, return these outputs only if the model declares them as inputs
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Total length of the final sequence, special tokens included
    total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

    # Truncation: only when a strategy is active, a max_length is set and the sequence overflows it
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation_strategy,
            stride=stride,
        )
        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens (or plainly concatenate the sequences when they are not requested)
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = ids + pair_ids if pair else ids
        token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

    # Build the output dictionary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Warn when the final sequence is longer than what the model accepts. The reported length is the
    # one that was actually compared (pair and special tokens included), not just the first sequence.
    if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
        logger.warning(
            "Token indices sequence length is longer than the specified maximum sequence length "
            "for this model ({} > {}). Running this sequence through the model will result in "
            "indexing errors".format(len(encoded_inputs["input_ids"]), self.model_max_length)
        )

    # Padding; `pad` is also the step that creates the attention mask, hence the second condition
    if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
        encoded_inputs = self.pad(
            encoded_inputs,
            max_length=max_length,
            padding=padding_strategy.value,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )

    return batch_outputs
| |
|
def truncate_sequences(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    num_tokens_to_remove: int = 0,
    truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
    stride: int = 0,
) -> Tuple[List[int], List[int], List[int]]:
    """ Truncates a sequence or a sequence pair by removing tokens from the end.

    The input lists are not modified: shortened copies are returned.

    Args:
        ids: list of tokenized input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
        pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
        num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``):
            number of tokens to remove using the truncation strategy. Nothing is done if it is <= 0.
        truncation_strategy (:obj:`string`, `optional`, defaults to "longest_first"):
            String (or ``TruncationStrategy``) selected in the following options:

            - 'longest_first' (default): remove tokens one at a time, each time from the end of the
              sequence that is currently the longest (the second sequence on ties).
            - 'only_first': only truncate the first sequence. If the first sequence is not longer than
              ``num_tokens_to_remove``, an error is logged and nothing is truncated.
            - 'only_second': only truncate the second sequence. If the second sequence is not longer than
              ``num_tokens_to_remove``, an error is logged and nothing is truncated.
            - 'do_not_truncate'
        stride (:obj:`int`, `optional`, defaults to ``0``):
            If set to a number along with max_length, the overflowing tokens returned will contain some tokens
            from the main sequence returned. The value of this argument defines the number of additional tokens.

    Returns:
        A tuple ``(ids, pair_ids, overflowing_tokens)``.
    """
    if num_tokens_to_remove <= 0:
        return ids, pair_ids, []

    if not isinstance(truncation_strategy, TruncationStrategy):
        truncation_strategy = TruncationStrategy(truncation_strategy)

    overflowing_tokens = []
    if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
        for _ in range(num_tokens_to_remove):
            # The first removal also captures the `stride` tokens before it; later removals add one token each.
            if pair_ids is None or len(ids) > len(pair_ids):
                if not overflowing_tokens:
                    window_len = min(len(ids), stride + 1)
                else:
                    window_len = 1
                overflowing_tokens.extend(ids[-window_len:])
                ids = ids[:-1]
            else:
                if not overflowing_tokens:
                    window_len = min(len(pair_ids), stride + 1)
                else:
                    window_len = 1
                overflowing_tokens.extend(pair_ids[-window_len:])
                pair_ids = pair_ids[:-1]
    elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
        if len(ids) > num_tokens_to_remove:
            window_len = min(len(ids), stride + num_tokens_to_remove)
            overflowing_tokens = ids[-window_len:]
            ids = ids[:-num_tokens_to_remove]
        else:
            # Best effort: report the problem and return the inputs untouched.
            logger.error(
                f"We need to remove {num_tokens_to_remove} to truncate the input "
                f"but the first sequence has a length {len(ids)}. "
                f"Please select another truncation strategy than {truncation_strategy}, "
                f"for instance 'longest_first' or 'only_second'."
            )
    elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
        if len(pair_ids) > num_tokens_to_remove:
            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
            overflowing_tokens = pair_ids[-window_len:]
            pair_ids = pair_ids[:-num_tokens_to_remove]
        else:
            # Best effort: report the problem and return the inputs untouched.
            logger.error(
                f"We need to remove {num_tokens_to_remove} to truncate the input "
                f"but the second sequence has a length {len(pair_ids)}. "
                f"Please select another truncation strategy than {truncation_strategy}, "
                f"for instance 'longest_first' or 'only_first'."
            )

    return (ids, pair_ids, overflowing_tokens)
| |
|
def _pad(
    self,
    encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
) -> dict:
    """ Pad one encoded example (on the left or right, up to a predefined length or to its own length).

    ``encoded_inputs`` is updated in place and also returned.

    Args:
        encoded_inputs: Dictionary of tokenized inputs; must contain ``"input_ids"``.
        max_length: target length of the padded sequence.
        padding_strategy: PaddingStrategy to use for padding.
            - PaddingStrategy.LONGEST: use the length of ``input_ids`` itself as the target length
            - PaddingStrategy.MAX_LENGTH: pad to ``max_length``
            - PaddingStrategy.DO_NOT_PAD: do not pad
            The padding side is defined in ``self.padding_side``:
            - 'left': pads on the left of the sequences
            - 'right': pads on the right of the sequences
        pad_to_multiple_of: (optional) Integer; if set, the target length is rounded up to a multiple of
            this value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with
            compute capability >= 7.5 (Volta).
        return_attention_mask: (optional) Set to False to avoid returning the attention mask
            (default: returned when ``"attention_mask"`` is in ``self.model_input_names``).

    Raises:
        ValueError: if padding is needed and ``self.padding_side`` is neither ``"right"`` nor ``"left"``.
    """
    # Default: follow what the model declares as its inputs
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    sequence_len = len(encoded_inputs["input_ids"])

    if padding_strategy == PaddingStrategy.LONGEST:
        max_length = sequence_len

    # Round the target length up to the next multiple when requested
    if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    # No padding required: at most an all-ones attention mask is added
    if padding_strategy == PaddingStrategy.DO_NOT_PAD or sequence_len == max_length:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = [1] * sequence_len
        return encoded_inputs

    difference = max_length - sequence_len
    side = self.padding_side
    if side != "right" and side != "left":
        raise ValueError("Invalid padding strategy:" + str(self.padding_side))

    def _extend(values, filler):
        # Attach `difference` copies of `filler` on the configured side of `values`.
        fill = [filler] * difference
        return values + fill if side == "right" else fill + values

    if return_attention_mask:
        encoded_inputs["attention_mask"] = _extend([1] * sequence_len, 0)
    if "token_type_ids" in encoded_inputs:
        encoded_inputs["token_type_ids"] = _extend(encoded_inputs["token_type_ids"], self.pad_token_type_id)
    if "special_tokens_mask" in encoded_inputs:
        encoded_inputs["special_tokens_mask"] = _extend(encoded_inputs["special_tokens_mask"], 1)
    encoded_inputs["input_ids"] = _extend(encoded_inputs["input_ids"], self.pad_token_id)

    return encoded_inputs
| |
|
def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]:
    """ Decode every sequence of ids in ``sequences`` with ``self.decode``, forwarding ``kwargs`` to each call. """
    decoded = []
    for token_ids in sequences:
        decoded.append(self.decode(token_ids, **kwargs))
    return decoded
| |
|
def decode(
    self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
) -> str:
    """
    Converts a sequence of ids (integers) into a string, using the tokenizer and vocabulary,
    with options to remove special tokens and clean up tokenization spaces.
    Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.

    This base implementation is a placeholder and always raises ``NotImplementedError``.

    Args:
        token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
        skip_special_tokens: if set to True, special tokens are left out of the decoded string.
        clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
    """
    raise NotImplementedError()
| |
|
def get_special_tokens_mask(
    self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
) -> List[int]:
    """
    Build a mask flagging which ids of ``token_ids_0`` are special tokens.

    This base implementation only supports sequences that already contain their special tokens:
    it must be called with ``already_has_special_tokens=True`` and without ``token_ids_1``.

    Args:
        token_ids_0: list of ids, already formatted with special tokens
        token_ids_1: must be ``None`` for this implementation
        already_has_special_tokens: must be ``True`` for this implementation

    Returns:
        A list of integers in the range [0, 1]: 1 for an id found in ``self.all_special_ids``,
        0 for any other id.

    Raises:
        AssertionError: if ``already_has_special_tokens`` is falsy or ``token_ids_1`` is not ``None``.
    """
    assert already_has_special_tokens and token_ids_1 is None, (
        "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
        "Please use a slow (full python) tokenizer to activate this argument. "
        "Or set `return_special_tokens_mask=True` when calling the encoding method "
        "to get the special tokens mask in any tokenizer. "
    )

    # Read the attribute once rather than on every membership test
    all_special_ids = self.all_special_ids

    special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

    return special_tokens_mask
| |
|
| | @staticmethod |
def clean_up_tokenization(out_string: str) -> str:
    """ Clean up simple English tokenization artifacts such as spaces before punctuation marks
    and detached contractions.
    """
    # (artifact, replacement) pairs, applied one after the other in this order
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    cleaned = out_string
    for artifact, replacement in substitutions:
        cleaned = cleaned.replace(artifact, replacement)
    return cleaned
| |
|