# coding=utf-8 | |
# Copyright 2020 The HuggingFace Inc. team. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
""" Base classes common to both the slow and the fast tokenization classes: | |
PreTrainedTokenizerBase (host all the user fronting encoding methodes) | |
Special token mixing (host the special tokens logic) and | |
BatchEncoding (wrap the dictionnary of output with special method for the Fast tokenizers) | |
""" | |
import copy | |
import json | |
import logging | |
import os | |
import warnings | |
from collections import UserDict | |
from enum import Enum | |
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union | |
import numpy as np | |
from tokenizers import AddedToken | |
from tokenizers import Encoding as EncodingFast | |
from .file_utils import ( | |
add_end_docstrings, | |
cached_path, | |
hf_bucket_url, | |
is_remote_url, | |
is_tf_available, | |
is_torch_available, | |
torch_required, | |
) | |
if is_tf_available(): | |
import tensorflow as tf | |
if is_torch_available(): | |
import torch | |
logger = logging.getLogger(__name__) | |
VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input | |
LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER | |
# Define type aliases and NamedTuples | |
TextInput = str | |
PreTokenizedInput = List[str] | |
EncodedInput = List[int] | |
TextInputPair = Tuple[str, str] | |
PreTokenizedInputPair = Tuple[List[str], List[str]] | |
EncodedInputPair = Tuple[List[int], List[int]] | |
# Slow tokenizers used to be saved in three separate files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" | |
ADDED_TOKENS_FILE = "added_tokens.json" | |
TOKENIZER_CONFIG_FILE = "tokenizer_config.json" | |
# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json" | |
class ExplicitEnum(Enum): | |
""" Enum with more explicit error message for missing values. | |
""" | |
def _missing_(cls, value): | |
raise ValueError( | |
"%r is not a valid %s, please select one of %s" | |
% (value, cls.__name__, str(list(cls._value2member_map_.keys()))) | |
) | |
class TruncationStrategy(ExplicitEnum): | |
ONLY_FIRST = "only_first" | |
ONLY_SECOND = "only_second" | |
LONGEST_FIRST = "longest_first" | |
DO_NOT_TRUNCATE = "do_not_truncate" | |
class PaddingStrategy(ExplicitEnum): | |
LONGEST = "longest" | |
MAX_LENGTH = "max_length" | |
DO_NOT_PAD = "do_not_pad" | |
class TensorType(ExplicitEnum): | |
PYTORCH = "pt" | |
TENSORFLOW = "tf" | |
NUMPY = "np" | |
class CharSpan(NamedTuple): | |
""" Character span in the original string | |
Args: | |
start: index of the first character in the original string | |
end: index of the character following the last character in the original string | |
""" | |
start: int | |
end: int | |
class TokenSpan(NamedTuple): | |
""" Token span in an encoded string (list of tokens) | |
Args: | |
start: index of the first token in the span | |
end: index of the token following the last token in the span | |
""" | |
start: int | |
end: int | |
class BatchEncoding(UserDict): | |
""" BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc). | |
This class is derived from a python Dictionary and can be used as a dictionnary. | |
In addition, this class expose utility methods to map from word/char space to token space. | |
Args: | |
data (:obj:`dict`): Dictionary of lists/arrays returned by the encode/batch_encode methods ('input_ids', 'attention_mask'...) | |
encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`): | |
If the tokenizer is a fast tokenizer which outputs additional informations like mapping from word/char space to token space | |
the `EncodingFast` instance or list of instance (for batches) hold these informations. | |
tensor_type (:obj:`Union[None, str, TensorType]`, `optional`, defaults to :obj:`None`): | |
You can give a tensor_type here to convert the lists of integers in PyTorch/TF/Numpy Tensors at initialization | |
prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
Set to True to add a batch axis when converting in Tensors (see :obj:`tensor_type` above) | |
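Example (an illustrative sketch of typical usage, assuming a fast tokenizer such as ``BertTokenizerFast`` and the 'bert-base-uncased' checkpoint; exact tokens depend on the model)::
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
batch = tokenizer("Hello world")  # returns a BatchEncoding instance
batch['input_ids']                # dict-style access to the encoded ids
batch.tokens()                    # fast-tokenizer-only helper, e.g. ['[CLS]', 'hello', 'world', '[SEP]']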
""" | |
def __init__( | |
self, | |
data: Optional[Dict[str, Any]] = None, | |
encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, | |
tensor_type: Union[None, str, TensorType] = None, | |
prepend_batch_axis: bool = False, | |
): | |
super().__init__(data) | |
if isinstance(encoding, EncodingFast): | |
encoding = [encoding] | |
self._encodings = encoding | |
self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) | |
def is_fast(self): | |
""" | |
Indicates whether this BatchEncoding was generated from the result of a PreTrainedTokenizerFast.
Returns: True if generated from subclasses of PreTrainedTokenizerFast, False otherwise.
""" | |
return self._encodings is not None | |
def __getitem__(self, item: Union[int, str]) -> EncodingFast: | |
""" If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...) | |
If the key is an integer, get the EncodingFast for batch item with index `key` | |
""" | |
if isinstance(item, str): | |
return self.data[item] | |
elif self._encodings is not None: | |
return self._encodings[item] | |
else: | |
raise KeyError( | |
"Indexing with integers (to access backend Encoding for a given batch index) " | |
"is not available when using Python based tokenizers" | |
) | |
def __getattr__(self, item: str): | |
try: | |
return self.data[item] | |
except KeyError: | |
raise AttributeError | |
def __getstate__(self): | |
return {"data": self.data, "encodings": self._encodings} | |
def __setstate__(self, state): | |
if "data" in state: | |
self.data = state["data"] | |
if "encodings" in state: | |
self._encodings = state["encodings"] | |
def keys(self): | |
return self.data.keys() | |
def values(self): | |
return self.data.values() | |
def items(self): | |
return self.data.items() | |
# After this point: | |
# Extended properties and methods only available for fast (Rust-based) tokenizers | |
# provided by HuggingFace tokenizers library. | |
def encodings(self) -> Optional[List[EncodingFast]]: | |
""" | |
Return the list of all encodings from the tokenization process.
Returns: List[EncodingFast] or None if the input was tokenized through a Python (i.e. not fast) tokenizer.
""" | |
return self._encodings | |
def tokens(self, batch_index: int = 0) -> List[str]: | |
if not self._encodings: | |
raise ValueError("tokens() is not available when using Python based tokenizers") | |
return self._encodings[batch_index].tokens | |
def words(self, batch_index: int = 0) -> List[Optional[int]]: | |
if not self._encodings: | |
raise ValueError("words() is not available when using Python based tokenizers") | |
return self._encodings[batch_index].words | |
def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: | |
""" | |
Get the index of the word corresponding to (i.e. comprising) an encoded token
in a sequence of the batch.
Can be called as:
- ``self.token_to_word(token_index)`` if batch size is 1
- ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it makes it
easy to associate encoded tokens with the provided tokenized words.
Args:
batch_or_token_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence,
this can be the index of the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the token in the sequence.
Returns: | |
:obj:`int`: | |
index of the word in the input sequence. | |
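Example (an illustrative sketch, assuming a fast tokenizer such as ``BertTokenizerFast`` that prepends a special ``[CLS]`` token)::
encoded = tokenizer("hello world")
encoded.token_to_word(1)  # -> 0, the token at index 1 ('hello') comes from the first word
encoded.token_to_word(2)  # -> 1, the token at index 2 ('world') comes from the second word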
""" | |
if not self._encodings: | |
raise ValueError("token_to_word() is not available when using Python based tokenizers") | |
if token_index is not None: | |
batch_index = batch_or_token_index | |
else: | |
batch_index = 0 | |
token_index = batch_or_token_index | |
if batch_index < 0: | |
batch_index = self._batch_size + batch_index | |
if token_index < 0: | |
token_index = self._seq_len + token_index | |
return self._encodings[batch_index].token_to_word(token_index) | |
def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan: | |
""" | |
Get the encoded token span corresponding to a word in the sequence of the batch. | |
Token spans are returned as a TokenSpan NamedTuple with: | |
- start: index of the first token | |
- end: index of the token following the last token | |
Can be called as: | |
- ``self.word_to_tokens(word_index)`` if batch size is 1 | |
- ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 | |
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it makes it
easy to associate encoded tokens with the provided tokenized words.
Args: | |
batch_or_word_index (:obj:`int`): | |
Index of the sequence in the batch. If the batch only comprises one sequence, | |
this can be the index of the word in the sequence | |
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_word_index`, this can be the index
of the word in the sequence.
Returns: | |
:obj:`TokenSpan`: | |
Span of tokens in the encoded sequence. | |
:obj:`TokenSpan` are NamedTuple with: | |
- start: index of the first token | |
- end: index of the token following the last token | |
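Example (an illustrative sketch, assuming a fast tokenizer such as ``BertTokenizerFast`` that prepends a special ``[CLS]`` token)::
encoded = tokenizer("hello world")
encoded.word_to_tokens(0)  # -> TokenSpan(start=1, end=2): word 0 ('hello') is the single token at index 1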
""" | |
if not self._encodings: | |
raise ValueError("word_to_tokens() is not available when using Python based tokenizers") | |
if word_index is not None: | |
batch_index = batch_or_word_index | |
else: | |
batch_index = 0 | |
word_index = batch_or_word_index | |
if batch_index < 0: | |
batch_index = self._batch_size + batch_index | |
if word_index < 0: | |
word_index = self._seq_len + word_index | |
return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index))) | |
def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: | |
""" | |
Get the character span corresponding to an encoded token in a sequence of the batch. | |
Character spans are returned as a CharSpan NamedTuple with: | |
- start: index of the first character in the original string associated to the token | |
- end: index of the character following the last character in the original string associated to the token | |
Can be called as: | |
- ``self.token_to_chars(token_index)`` if batch size is 1 | |
- ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1 | |
Args: | |
batch_or_token_index (:obj:`int`): | |
Index of the sequence in the batch. If the batch only comprises one sequence,
this can be the index of the token in the sequence.
token_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_token_index`, this can be the index
of the token in the sequence.
Returns: | |
:obj:`CharSpan`: | |
Span of characters in the original string. | |
:obj:`CharSpan` are NamedTuple with: | |
- start: index of the first character in the original string | |
- end: index of the character following the last character in the original string | |
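Example (an illustrative sketch, assuming a fast tokenizer such as ``BertTokenizerFast``; exact offsets depend on the tokenizer's normalization)::
encoded = tokenizer("hello world")
encoded.token_to_chars(2)  # -> CharSpan(start=6, end=11), i.e. "hello world"[6:11] == 'world'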
""" | |
if not self._encodings: | |
raise ValueError("token_to_chars() is not available when using Python based tokenizers") | |
if token_index is not None: | |
batch_index = batch_or_token_index | |
else: | |
batch_index = 0 | |
token_index = batch_or_token_index | |
return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) | |
def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: | |
""" | |
Get the index of the token in the encoded output comprising a character | |
in the original string for a sequence of the batch. | |
Can be called as: | |
- ``self.char_to_token(char_index)`` if batch size is 1 | |
- ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1 | |
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it makes it
easy to associate encoded tokens with the provided tokenized words.
Args:
batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence,
this can be the index of the character in the original string.
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_char_index`, this can be the index
of the character in the original string.
Returns: | |
:obj:`int`: Index of the token. | |
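Example (an illustrative sketch, assuming a fast tokenizer such as ``BertTokenizerFast``)::
encoded = tokenizer("hello world")
encoded.char_to_token(7)  # -> 2, character 7 (the 'o' of 'world') falls inside the token at index 2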
""" | |
if not self._encodings: | |
raise ValueError("char_to_token() is not available when using Python based tokenizers") | |
if char_index is not None: | |
batch_index = batch_or_char_index | |
else: | |
batch_index = 0 | |
char_index = batch_or_char_index | |
return self._encodings[batch_index].char_to_token(char_index) | |
def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: | |
""" | |
Get the character span in the original string corresponding to given word in a sequence | |
of the batch. | |
Character spans are returned as a CharSpan NamedTuple with: | |
- start: index of the first character in the original string | |
- end: index of the character following the last character in the original string | |
Can be called as: | |
- ``self.word_to_chars(word_index)`` if batch size is 1 | |
- ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1 | |
Args: | |
batch_or_word_index (:obj:`int`): | |
Index of the sequence in the batch. If the batch only comprises one sequence,
this can be the index of the word in the sequence.
word_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_word_index`, this can be the index
of the word in the sequence.
Returns: | |
:obj:`CharSpan` or :obj:`List[CharSpan]`: | |
Span(s) of the associated character or characters in the string. | |
CharSpan are NamedTuple with: | |
- start: index of the first character associated to the token in the original string | |
- end: index of the character following the last character associated to the token in the original string | |
""" | |
if not self._encodings: | |
raise ValueError("word_to_chars() is not available when using Python based tokenizers") | |
if word_index is not None: | |
batch_index = batch_or_word_index | |
else: | |
batch_index = 0 | |
word_index = batch_or_word_index | |
return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) | |
def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: | |
""" | |
Get the word in the original string corresponding to a character in the original string of | |
a sequence of the batch. | |
Can be called as: | |
- ``self.char_to_word(char_index)`` if batch size is 1 | |
- ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1 | |
This method is particularly suited when the input sequences are provided as
pre-tokenized sequences (i.e. words are defined by the user). In this case it makes it
easy to associate encoded tokens with the provided tokenized words.
Args:
batch_or_char_index (:obj:`int`):
Index of the sequence in the batch. If the batch only comprises one sequence,
this can be the index of the character in the original string.
char_index (:obj:`int`, `optional`):
If a batch index is provided in `batch_or_char_index`, this can be the index
of the character in the original string.
Returns:
:obj:`int` or :obj:`List[int]`:
Index or indices of the corresponding word(s) in the input sequence.
""" | |
if not self._encodings: | |
raise ValueError("char_to_word() is not available when using Python based tokenizers") | |
if char_index is not None: | |
batch_index = batch_or_char_index | |
else: | |
batch_index = 0 | |
char_index = batch_or_char_index | |
return self._encodings[batch_index].char_to_word(char_index) | |
def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_batch_axis: bool = False): | |
if tensor_type is None: | |
return self | |
# Convert to TensorType | |
if not isinstance(tensor_type, TensorType): | |
tensor_type = TensorType(tensor_type) | |
# Get a function reference for the correct framework | |
if tensor_type == TensorType.TENSORFLOW and is_tf_available(): | |
as_tensor = tf.constant | |
elif tensor_type == TensorType.PYTORCH and is_torch_available(): | |
as_tensor = torch.tensor | |
elif tensor_type == TensorType.NUMPY: | |
as_tensor = np.asarray | |
else: | |
raise ImportError( | |
"Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( | |
tensor_type | |
) | |
) | |
# Do the tensor conversion in batch | |
for key, value in self.items(): | |
try: | |
if prepend_batch_axis: | |
value = [value] | |
tensor = as_tensor(value) | |
# ensure the tensor is at least 2d
if tensor.ndim > 2: | |
tensor = tensor.squeeze(0) | |
elif tensor.ndim < 2: | |
tensor = tensor[None, :] | |
self[key] = tensor | |
except: # noqa E722 | |
raise ValueError( | |
"Unable to create tensor, you should probably activate truncation and/or padding " | |
"with 'padding=True' 'truncation=True' to have batched tensors with the same length." | |
) | |
return self | |
def to(self, device: str): | |
"""Send all values to device by calling v.to(device)""" | |
self.data = {k: v.to(device) for k, v in self.data.items()} | |
return self | |
# class AddedToken(UserString): | |
# """ AddedToken represents a token to be added to a Tokenizer | |
# An AddedToken can have special options defining the way it should behave. | |
# Args: | |
# content: str: | |
# The content of the token | |
# single_word: bool | |
# Whether this token should only match against single word. If True, | |
# this token will never match inside of a word. | |
# lstrip: bool | |
# Whether this token should strip all potential whitespaces on the left side. | |
# If True, this token will greedily match any whitespace on the left and then strip | |
# them out. | |
# rstrip: bool | |
# Whether this token should strip all potential whitespaces on the right side. | |
# If True, this token will greedily match any whitespace on the right and then strip | |
# them out. | |
# """ | |
# def __init__( | |
# self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False, | |
# ): | |
# super().__init__(data) | |
# self._single_word = single_word | |
# self._lstrip = lstrip | |
# self._rstrip = rstrip | |
# def lower(self): | |
# return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip) | |
class SpecialTokensMixin: | |
""" SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and | |
handles specific behaviors related to special tokens. In particular, this class hold the | |
attributes which can be used to directly access to these special tokens in a | |
model-independant manner and allow to set and update the special tokens. | |
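Example (an illustrative sketch; the values shown assume the 'bert-base-uncased' checkpoint, '<special>' is just a made-up token)::
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer.cls_token      # '[CLS]'
tokenizer.mask_token_id  # 103 for this checkpoint
tokenizer.add_special_tokens({'additional_special_tokens': ['<special>']})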
""" | |
SPECIAL_TOKENS_ATTRIBUTES = [ | |
"bos_token", | |
"eos_token", | |
"unk_token", | |
"sep_token", | |
"pad_token", | |
"cls_token", | |
"mask_token", | |
"additional_special_tokens", | |
] | |
def __init__(self, verbose=True, **kwargs): | |
self._bos_token = None | |
self._eos_token = None | |
self._unk_token = None | |
self._sep_token = None | |
self._pad_token = None | |
self._cls_token = None | |
self._mask_token = None | |
self._pad_token_type_id = 0 | |
self._additional_special_tokens = [] | |
self.verbose = verbose | |
# We directly set the hidden value to allow initialization with special tokens | |
# which are not yet in the vocabulary. Necessary for serialization/de-serialization
# TODO clean this up at some point (probably by switching to fast tokenizers)
for key, value in kwargs.items(): | |
if key in self.SPECIAL_TOKENS_ATTRIBUTES: | |
if key == "additional_special_tokens": | |
assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) | |
setattr(self, key, value) | |
elif isinstance(value, (str, AddedToken)): | |
setattr(self, key, value) | |
else: | |
raise TypeError( | |
"special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) | |
) | |
def sanitize_special_tokens(self) -> int: | |
""" Make sure that all the special tokens attributes of the tokenizer (tokenizer.mask_token, tokenizer.cls_token, ...) | |
are in the vocabulary. Add the missing ones to the vocabulary if needed. | |
Return: | |
Number of tokens added to the vocabulary during the operation.
""" | |
return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) | |
def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: | |
""" | |
Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them | |
to class attributes. If special tokens are NOT in the vocabulary, they are added | |
to it (indexed starting from the last index of the current vocabulary). | |
Using `add_special_tokens` will ensure your special tokens can be used in several ways: | |
- special tokens are carefully handled by the tokenizer (they are never split) | |
- you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. | |
When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>') | |
Args: | |
special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: | |
[``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, | |
``additional_special_tokens``]. | |
Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
Returns: | |
Number of tokens added to the vocabulary. | |
Examples:: | |
# Let's see how to add a new classification token to GPT-2 | |
tokenizer = GPT2Tokenizer.from_pretrained('gpt2') | |
model = GPT2Model.from_pretrained('gpt2') | |
special_tokens_dict = {'cls_token': '<CLS>'} | |
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) | |
print('We have added', num_added_toks, 'tokens') | |
model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
assert tokenizer.cls_token == '<CLS>' | |
""" | |
if not special_tokens_dict: | |
return 0 | |
added_tokens = 0 | |
for key, value in special_tokens_dict.items(): | |
assert key in self.SPECIAL_TOKENS_ATTRIBUTES | |
if self.verbose: | |
logger.info("Assigning %s to the %s key of the tokenizer", value, key) | |
setattr(self, key, value) | |
if key == "additional_special_tokens": | |
assert isinstance(value, (list, tuple)) and all( | |
isinstance(t, (str, AddedToken)) for t in value | |
), f"Tokens {value} for key {key} should all be str or AddedToken instances" | |
added_tokens += self.add_tokens(value, special_tokens=True) | |
else: | |
assert isinstance( | |
value, (str, AddedToken) | |
), f"Token {value} for key {key} should be a str or an AddedToken instance" | |
added_tokens += self.add_tokens([value], special_tokens=True) | |
return added_tokens | |
def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedToken]], special_tokens=False) -> int: | |
""" | |
Add a list of new tokens to the tokenizer class. If the new tokens are not in the
vocabulary, they are added to it with indices starting from the length of the current vocabulary.
Args:
new_tokens: string or list of strings or :class:`~transformers.AddedToken`. Each string is a token to add.
Tokens are only added if they are not already in the vocabulary. AddedToken wraps a string token to
let you personalize its behavior (whether this token should only match against a single word, whether
this token should strip all potential whitespaces on the left side, whether this token should strip
all potential whitespaces on the right side...).
special_tokens: can be used to specify if the token is a special token. This mostly changes the normalization
behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance).
See details for :class:`~transformers.AddedToken` in the HuggingFace tokenizers library.
Returns: | |
Number of tokens added to the vocabulary. | |
Examples:: | |
# Let's see how to increase the vocabulary of Bert model and tokenizer | |
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') | |
model = BertModel.from_pretrained('bert-base-uncased') | |
num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) | |
print('We have added', num_added_toks, 'tokens') | |
model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
""" | |
if not new_tokens: | |
return 0 | |
if not isinstance(new_tokens, (list, tuple)): | |
new_tokens = [new_tokens] | |
return self._add_tokens(new_tokens, special_tokens=special_tokens) | |
def bos_token(self): | |
""" Beginning of sentence token (string). Log an error if used while not having been set. """ | |
if self._bos_token is None and self.verbose: | |
logger.error("Using bos_token, but it is not set yet.") | |
return None | |
return str(self._bos_token) | |
def eos_token(self): | |
""" End of sentence token (string). Log an error if used while not having been set. """ | |
if self._eos_token is None and self.verbose: | |
logger.error("Using eos_token, but it is not set yet.") | |
return None | |
return str(self._eos_token) | |
def unk_token(self): | |
""" Unknown token (string). Log an error if used while not having been set. """ | |
if self._unk_token is None and self.verbose: | |
logger.error("Using unk_token, but it is not set yet.") | |
return None | |
return str(self._unk_token) | |
def sep_token(self): | |
""" Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ | |
if self._sep_token is None and self.verbose: | |
logger.error("Using sep_token, but it is not set yet.") | |
return None | |
return str(self._sep_token) | |
def pad_token(self): | |
""" Padding token (string). Log an error if used while not having been set. """ | |
if self._pad_token is None and self.verbose: | |
logger.error("Using pad_token, but it is not set yet.") | |
return None | |
return str(self._pad_token) | |
def cls_token(self): | |
""" Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ | |
if self._cls_token is None and self.verbose: | |
logger.error("Using cls_token, but it is not set yet.") | |
return None | |
return str(self._cls_token) | |
def mask_token(self): | |
""" Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ | |
if self._mask_token is None and self.verbose: | |
logger.error("Using mask_token, but it is not set yet.") | |
return None | |
return str(self._mask_token) | |
def additional_special_tokens(self): | |
""" All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ | |
if self._additional_special_tokens is None and self.verbose: | |
logger.error("Using additional_special_tokens, but it is not set yet.") | |
return None | |
return [str(tok) for tok in self._additional_special_tokens] | |
def bos_token(self, value): | |
self._bos_token = value | |
def eos_token(self, value): | |
self._eos_token = value | |
def unk_token(self, value): | |
self._unk_token = value | |
def sep_token(self, value): | |
self._sep_token = value | |
def pad_token(self, value): | |
self._pad_token = value | |
def cls_token(self, value): | |
self._cls_token = value | |
def mask_token(self, value): | |
self._mask_token = value | |
def additional_special_tokens(self, value): | |
self._additional_special_tokens = value | |
def bos_token_id(self): | |
""" Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ | |
if self._bos_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.bos_token) | |
def eos_token_id(self): | |
""" Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ | |
if self._eos_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.eos_token) | |
def unk_token_id(self): | |
""" Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ | |
if self._unk_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.unk_token) | |
def sep_token_id(self): | |
""" Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ | |
if self._sep_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.sep_token) | |
def pad_token_id(self): | |
""" Id of the padding token in the vocabulary. Log an error if used while not having been set. """ | |
if self._pad_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.pad_token) | |
def pad_token_type_id(self): | |
""" Id of the padding token type in the vocabulary.""" | |
return self._pad_token_type_id | |
def cls_token_id(self): | |
""" Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ | |
if self._cls_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.cls_token) | |
def mask_token_id(self): | |
""" Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ | |
if self._mask_token is None: | |
return None | |
return self.convert_tokens_to_ids(self.mask_token) | |
def additional_special_tokens_ids(self): | |
""" Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ | |
return self.convert_tokens_to_ids(self.additional_special_tokens) | |
def special_tokens_map(self): | |
""" A dictionary mapping special token class attribute (cls_token, unk_token...) to their | |
values ('<unk>', '<cls>'...) | |
Convert tokens of AddedToken type in string. | |
All returned tokens are strings | |
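Example (an illustrative sketch; the exact content depends on the tokenizer, here 'bert-base-uncased' is assumed)::
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer.special_tokens_map
# {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
#  'cls_token': '[CLS]', 'mask_token': '[MASK]'}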
""" | |
set_attr = {} | |
for attr in self.SPECIAL_TOKENS_ATTRIBUTES: | |
attr_value = getattr(self, "_" + attr) | |
if attr_value: | |
set_attr[attr] = str(attr_value) | |
return set_attr | |
def special_tokens_map_extended(self): | |
""" A dictionary mapping special token class attribute (cls_token, unk_token...) to their | |
values ('<unk>', '<cls>'...) | |
Keep the tokens as AddedToken if they are of this type. | |
AddedToken can be used to control more finely how special tokens are tokenized. | |
""" | |
set_attr = {} | |
for attr in self.SPECIAL_TOKENS_ATTRIBUTES: | |
attr_value = getattr(self, "_" + attr) | |
if attr_value: | |
set_attr[attr] = attr_value | |
return set_attr | |
def all_special_tokens(self): | |
""" List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes | |
Convert tokens of AddedToken type in string. | |
All returned tokens are strings | |
(cls_token, unk_token...). | |
""" | |
all_toks = [str(s) for s in self.all_special_tokens_extended] | |
return all_toks | |
def all_special_tokens_extended(self): | |
""" List all the special tokens ('<unk>', '<cls>'...) mapped to class attributes | |
Keep the tokens as AddedToken if they are of this type. | |
AddedToken can be used to control more finely how special tokens are tokenized. | |
""" | |
all_toks = [] | |
set_attr = self.special_tokens_map_extended | |
for attr_value in set_attr.values(): | |
all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) | |
all_toks = list(set(all_toks)) | |
return all_toks | |
def all_special_ids(self): | |
""" List the vocabulary indices of the special tokens ('<unk>', '<cls>'...) mapped to | |
class attributes (cls_token, unk_token...). | |
""" | |
all_toks = self.all_special_tokens | |
all_ids = self.convert_tokens_to_ids(all_toks) | |
return all_ids | |
ENCODE_KWARGS_DOCSTRING = r""" | |
add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): | |
If set to ``True``, the sequences will be encoded with the special tokens relative | |
to their model. | |
`padding` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): | |
Activate and control padding. Accepts the following values: | |
* `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence is provided),
* `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`) | |
* `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths) | |
`truncation` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): | |
Activate and control truncation. Accepts the following values: | |
* `True` or `'longest_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided, | |
* `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided, | |
* `'only_second'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided, | |
* `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size) | |
`max_length` (:obj:`Union[int, None]`, `optional`, defaults to :obj:`None`): | |
Control the length for padding/truncation. Accepts the following values | |
* `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated. | |
* `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters. | |
stride (:obj:`int`, `optional`, defaults to ``0``): | |
If set to a number along with max_length, the overflowing tokens returned when `return_overflowing_tokens=True` | |
will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflowing sequences.
The value of this argument defines the number of overlapping tokens. | |
is_pretokenized (:obj:`bool`, defaults to :obj:`False`): | |
Set to True to indicate the input is already tokenized | |
pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability | |
>= 7.5 (Volta). | |
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): | |
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, | |
PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of python integers.
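Example (an illustrative sketch of how padding, truncation and max_length interact; `tokenizer` is assumed to be any pretrained tokenizer instance)::
batch = tokenizer(
["a short sentence", "a slightly longer second sentence"],
padding='longest',    # pad both sequences to the longer of the two
truncation=True,      # truncate to max_length (or to the model maximum if max_length is None)
max_length=16,
return_tensors='pt',  # requires PyTorch; use 'tf' or 'np' otherwise
)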
""" | |
ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" | |
return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): | |
Whether to return token type IDs. If left to the default, will return the token type IDs according | |
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. | |
`What are token type IDs? <../glossary.html#token-type-ids>`_ | |
return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`None`):
Whether to return the attention mask. If left to the default, will return the attention mask according | |
to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. | |
`What are attention masks? <../glossary.html#attention-mask>`__ | |
return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
Set to True to return overflowing token sequences (default False). | |
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
Set to True to return special tokens mask information (default False). | |
return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): | |
Set to True to return (char_start, char_end) for each token (default False). | |
If using a Python (slow) tokenizer, this method will raise NotImplementedError.
This is only available on fast tokenizers inheriting from PreTrainedTokenizerFast.
**kwargs: passed to the `self.tokenize()` method | |
Return: | |
A Dictionary of shape:: | |
{ | |
input_ids: list[int], | |
token_type_ids: list[int] if return_token_type_ids is True (default) | |
attention_mask: list[int] if return_attention_mask is True (default) | |
overflowing_tokens: list[int] if the tokenizer is a slow tokenizer, else a List[List[int]] if a ``max_length`` is specified and ``return_overflowing_tokens=True``
special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True``
and return_special_tokens_mask is True | |
} | |
With the fields: | |
- ``input_ids``: list of token ids to be fed to a model | |
- ``token_type_ids``: list of token type ids to be fed to a model | |
- ``attention_mask``: list of indices specifying which tokens should be attended to by the model | |
- ``overflowing_tokens``: list of overflowing tokens sequences if a max length is specified and ``return_overflowing_tokens=True``. | |
- ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added | |
tokens and 1 specifying sequence tokens. | |
""" | |
class PreTrainedTokenizerBase(SpecialTokensMixin): | |
""" Base class for slow and fast tokenizers. | |
Handles shared (mostly boilerplate) methods for slow and fast tokenizers.
""" | |
vocab_files_names: Dict[str, str] = {} | |
pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} | |
pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} | |
max_model_input_sizes: Dict[str, int] = {} | |
model_input_names: List[str] = ["token_type_ids", "attention_mask"] | |
padding_side: str = "right" | |
def __init__(self, **kwargs): | |
# inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) | |
self.init_inputs = () | |
self.init_kwargs = kwargs | |
# For backward compatibility we fallback to set model_max_length from max_len if provided | |
model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) | |
self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER | |
# Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. | |
self.padding_side = kwargs.pop("padding_side", self.padding_side) | |
assert self.padding_side in [ | |
"right", | |
"left", | |
], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" | |
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) | |
super().__init__(**kwargs) | |
def max_len(self) -> int: | |
""" Kept here for backward compatibility. | |
Now renamed to `model_max_length` to avoid ambiguity. | |
""" | |
return self.model_max_length | |
def max_len_single_sentence(self) -> int: | |
return self.model_max_length - self.num_special_tokens_to_add(pair=False) | |
def max_len_sentences_pair(self) -> int: | |
return self.model_max_length - self.num_special_tokens_to_add(pair=True) | |
def max_len_single_sentence(self, value) -> int: | |
""" For backward compatibility, allow to try to setup 'max_len_single_sentence' """ | |
if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: | |
logger.warning( | |
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." | |
) | |
else: | |
raise ValueError( | |
"Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." | |
) | |
def max_len_sentences_pair(self, value) -> int: | |
""" For backward compatibility, allow to try to setup 'max_len_sentences_pair' """ | |
if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: | |
logger.warning( | |
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." | |
) | |
else: | |
raise ValueError( | |
"Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." | |
) | |
def from_pretrained(cls, *inputs, **kwargs): | |
r""" | |
Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. | |
Args: | |
pretrained_model_name_or_path: either: | |
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. | |
- a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. | |
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. | |
- (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. | |
cache_dir: (`optional`) string: | |
Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. | |
force_download: (`optional`) boolean, default False: | |
Force to (re-)download the vocabulary files and override the cached versions if they exist.
resume_download: (`optional`) boolean, default False:
Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
proxies: (`optional`) dict, default None: | |
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. | |
The proxies are used on each request. | |
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. | |
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. | |
Examples:: | |
# We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer | |
# Download vocabulary from S3 and cache. | |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') | |
# Download vocabulary from S3 (user-uploaded) and cache. | |
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') | |
# If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) | |
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') | |
# If the tokenizer uses a single vocabulary file, you can point directly to this file | |
tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') | |
# You can link tokens to special vocabulary when instantiating | |
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>') | |
# You should be sure '<unk>' is in the vocabulary when doing that. | |
# Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead) | |
assert tokenizer.unk_token == '<unk>' | |
""" | |
return cls._from_pretrained(*inputs, **kwargs) | |
def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): | |
cache_dir = kwargs.pop("cache_dir", None) | |
force_download = kwargs.pop("force_download", False) | |
resume_download = kwargs.pop("resume_download", False) | |
proxies = kwargs.pop("proxies", None) | |
local_files_only = kwargs.pop("local_files_only", False) | |
s3_models = list(cls.max_model_input_sizes.keys()) | |
vocab_files = {} | |
init_configuration = {} | |
if pretrained_model_name_or_path in s3_models: | |
# Get the vocabulary from AWS S3 bucket | |
for file_id, map_list in cls.pretrained_vocab_files_map.items(): | |
vocab_files[file_id] = map_list[pretrained_model_name_or_path] | |
if ( | |
cls.pretrained_init_configuration | |
and pretrained_model_name_or_path in cls.pretrained_init_configuration | |
): | |
init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() | |
else: | |
# Get the vocabulary from local files | |
logger.info( | |
"Model name '{}' not found in model shortcut name list ({}). " | |
"Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( | |
pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path | |
) | |
) | |
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): | |
if len(cls.vocab_files_names) > 1: | |
raise ValueError( | |
"Calling {}.from_pretrained() with the path to a single file or url is not supported." | |
"Use a model identifier or the path to a directory instead.".format(cls.__name__) | |
) | |
logger.warning( | |
"Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( | |
cls.__name__ | |
) | |
) | |
file_id = list(cls.vocab_files_names.keys())[0] | |
vocab_files[file_id] = pretrained_model_name_or_path | |
else: | |
# At this point pretrained_model_name_or_path is either a directory or a model identifier name | |
additional_files_names = { | |
"added_tokens_file": ADDED_TOKENS_FILE, | |
"special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, | |
"tokenizer_config_file": TOKENIZER_CONFIG_FILE, | |
"full_tokenizer_file": FULL_TOKENIZER_FILE, | |
} | |
# Look for the tokenizer files | |
for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): | |
if os.path.isdir(pretrained_model_name_or_path): | |
full_file_name = os.path.join(pretrained_model_name_or_path, file_name) | |
if not os.path.exists(full_file_name): | |
logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) | |
full_file_name = None | |
else: | |
full_file_name = hf_bucket_url( | |
pretrained_model_name_or_path, filename=file_name, use_cdn=False | |
) | |
vocab_files[file_id] = full_file_name | |
# Get files from url, cache, or disk depending on the case | |
try: | |
resolved_vocab_files = {} | |
for file_id, file_path in vocab_files.items(): | |
if file_path is None: | |
resolved_vocab_files[file_id] = None | |
else: | |
resolved_vocab_files[file_id] = cached_path( | |
file_path, | |
cache_dir=cache_dir, | |
force_download=force_download, | |
proxies=proxies, | |
resume_download=resume_download, | |
local_files_only=local_files_only, | |
) | |
except EnvironmentError: | |
if pretrained_model_name_or_path in s3_models: | |
msg = "Couldn't reach server at '{}' to download vocabulary files." | |
else: | |
msg = ( | |
"Model name '{}' was not found in tokenizers model name list ({}). " | |
"We assumed '{}' was a path or url to a directory containing vocabulary files " | |
"named {}, but couldn't find such vocabulary files at this path or url.".format( | |
pretrained_model_name_or_path, | |
", ".join(s3_models), | |
pretrained_model_name_or_path, | |
list(cls.vocab_files_names.values()), | |
) | |
) | |
raise EnvironmentError(msg) | |
if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): | |
raise EnvironmentError( | |
"Model name '{}' was not found in tokenizers model name list ({}). " | |
"We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " | |
"named {} but couldn't find such vocabulary files at this path or url.".format( | |
pretrained_model_name_or_path, | |
", ".join(s3_models), | |
pretrained_model_name_or_path, | |
list(cls.vocab_files_names.values()), | |
) | |
) | |
for file_id, file_path in vocab_files.items(): | |
if file_path == resolved_vocab_files[file_id]: | |
logger.info("loading file {}".format(file_path)) | |
else: | |
logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) | |
# Prepare tokenizer initialization kwargs | |
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) | |
if tokenizer_config_file is not None: | |
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: | |
init_kwargs = json.load(tokenizer_config_handle) | |
saved_init_inputs = init_kwargs.pop("init_inputs", ()) | |
if not init_inputs: | |
init_inputs = saved_init_inputs | |
else: | |
init_kwargs = init_configuration | |
# Update with newly provided kwargs | |
init_kwargs.update(kwargs) | |
# Set max length if needed | |
if pretrained_model_name_or_path in cls.max_model_input_sizes: | |
# if we're using a pretrained model, ensure the tokenizer | |
# won't index sequences longer than the number of positional embeddings
model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] | |
if model_max_length is not None and isinstance(model_max_length, (int, float)): | |
init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) | |
# Merge resolved_vocab_files arguments in init_kwargs. | |
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) | |
for args_name, file_path in resolved_vocab_files.items(): | |
if args_name not in init_kwargs: | |
init_kwargs[args_name] = file_path | |
# Instantiate tokenizer. | |
try: | |
tokenizer = cls(*init_inputs, **init_kwargs) | |
except OSError: | |
raise OSError( | |
"Unable to load vocabulary from file. " | |
"Please check that the provided vocabulary is accessible and not corrupted." | |
) | |
# Save inputs and kwargs for saving and re-loading with ``save_pretrained`` | |
tokenizer.init_inputs = init_inputs | |
tokenizer.init_kwargs = init_kwargs | |
# If there is a complementary special token map, load it | |
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) | |
if special_tokens_map_file is not None: | |
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: | |
special_tokens_map = json.load(special_tokens_map_handle) | |
for key, value in special_tokens_map.items(): | |
if isinstance(value, dict): | |
value = AddedToken(**value) | |
setattr(tokenizer, key, value) | |
# Add supplementary tokens. | |
special_tokens = tokenizer.all_special_tokens | |
if added_tokens_file is not None: | |
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: | |
added_tok_encoder = json.load(added_tokens_handle) | |
# Sort added tokens by index | |
added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) | |
for token, index in added_tok_encoder_sorted: | |
assert index == len(tokenizer), ( | |
f"Non-consecutive added token '{token}' found. " | |
f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." | |
) | |
tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) | |
# Check all our special tokens are registered as "no split" tokens (we don't cut them) and are in the vocab
added_tokens = tokenizer.sanitize_special_tokens() | |
if added_tokens: | |
logger.warning( | |
"Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained." | |
) | |
return tokenizer | |
def save_pretrained(self, save_directory) -> Tuple[str]: | |
""" Save the tokenizer vocabulary files together with: | |
- added tokens, | |
- special-tokens-to-class-attributes-mapping, | |
- tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). | |
Warning: This won't save modifications you may have applied to the tokenizer after the instantiation | |
(e.g. modifying tokenizer.do_lower_case after creation). | |
This method makes sure the full tokenizer can then be re-loaded using the
:func:`~transformers.PreTrainedTokenizer.from_pretrained` class method.
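Example (an illustrative sketch; './my_tokenizer/' is just a placeholder directory)::
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.save_pretrained('./my_tokenizer/')
# the saved vocabulary, added tokens and special tokens can then be reloaded:
reloaded = BertTokenizer.from_pretrained('./my_tokenizer/')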
""" | |
if os.path.isfile(save_directory): | |
logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) | |
return | |
os.makedirs(save_directory, exist_ok=True) | |
special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) | |
added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) | |
tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) | |
tokenizer_config = copy.deepcopy(self.init_kwargs) | |
if len(self.init_inputs) > 0: | |
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) | |
for file_id in self.vocab_files_names.keys(): | |
tokenizer_config.pop(file_id, None) | |
with open(tokenizer_config_file, "w", encoding="utf-8") as f: | |
f.write(json.dumps(tokenizer_config, ensure_ascii=False)) | |
with open(special_tokens_map_file, "w", encoding="utf-8") as f: | |
write_dict = {} | |
for key, value in self.special_tokens_map_extended.items(): | |
if isinstance(value, AddedToken): | |
write_dict[key] = value.__getstate__() | |
else: | |
write_dict[key] = value | |
f.write(json.dumps(write_dict, ensure_ascii=False)) | |
added_vocab = self.get_added_vocab() | |
if added_vocab: | |
with open(added_tokens_file, "w", encoding="utf-8") as f: | |
out_str = json.dumps(added_vocab, ensure_ascii=False) | |
f.write(out_str) | |
vocab_files = self.save_vocabulary(save_directory) | |
return vocab_files + (special_tokens_map_file, added_tokens_file) | |
def encode( | |
self, | |
text: Union[TextInput, PreTokenizedInput, EncodedInput], | |
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, | |
add_special_tokens: bool = True, | |
padding: Union[bool, str] = False, | |
truncation: Union[bool, str] = False, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
**kwargs | |
): | |
""" | |
Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.
Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. | |
Args: | |
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): | |
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using | |
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` | |
method) | |
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): | |
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized | |
string using the `tokenize` method) or a list of integers (tokenized string ids using the | |
`convert_tokens_to_ids` method) | |
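Example (illustrative sketch, assuming a BERT checkpoint has been downloaded):: | |
from transformers import BertTokenizer | |
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") | |
ids = tokenizer.encode("Hello world", add_special_tokens=True) | |
# `ids` is a plain list of vocabulary indices, wrapped in the model's special tokens ([CLS]/[SEP] for Bert) | |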
""" | |
encoded_inputs = self.encode_plus( | |
text, | |
text_pair=text_pair, | |
add_special_tokens=add_special_tokens, | |
padding=padding, | |
truncation=truncation, | |
max_length=max_length, | |
stride=stride, | |
return_tensors=return_tensors, | |
**kwargs, | |
) | |
return encoded_inputs["input_ids"] | |
def num_special_tokens_to_add(self, pair: bool = False) -> int: | |
raise NotImplementedError | |
def _get_padding_truncation_strategies( | |
self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs | |
): | |
""" Find the correct padding/truncation strategy with backward compatibility | |
for old arguments (truncation_strategy and pad_to_max_length) and behaviors. | |
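For instance (illustrative; `tokenizer` is assumed to be an instantiated tokenizer), `padding=True` maps to | |
PaddingStrategy.LONGEST, `truncation=True` maps to TruncationStrategy.LONGEST_FIRST, and a bare `max_length` | |
with `truncation=False` falls back to 'longest_first' truncation with a warning:: | |
padding_strategy, truncation_strategy, max_length, kwargs = tokenizer._get_padding_truncation_strategies( | |
padding=True, truncation=True, max_length=128 | |
) | |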
""" | |
old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") | |
old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) | |
# Backward compatibility for previous behavior, maybe we should deprecate it: | |
# If you only set max_length, it activates truncation for max_length | |
if max_length is not None and padding is False and truncation is False: | |
if verbose: | |
logger.warning( | |
"Truncation was not explicitely activated but `max_length` is provided a specific value, " | |
"please use `truncation=True` to explicitely truncate examples to max length. " | |
"Defaulting to 'longest_first' truncation strategy. " | |
"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy " | |
"more precisely by providing a specific strategy to `truncation`." | |
) | |
truncation = "longest_first" | |
# Get padding strategy | |
if padding is False and old_pad_to_max_length: | |
if verbose: | |
warnings.warn( | |
"The `pad_to_max_length` argument is deprecated and will be removed in a future version, " | |
"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " | |
"use `padding='max_length'` to pad to a max length. In this case, you can give a specific " | |
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " | |
"maximal input size of the model (e.g. 512 for Bert).", | |
DeprecationWarning, | |
) | |
if max_length is None: | |
padding_strategy = PaddingStrategy.LONGEST | |
else: | |
padding_strategy = PaddingStrategy.MAX_LENGTH | |
elif padding is not False: | |
if padding is True: | |
padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch | |
elif not isinstance(padding, PaddingStrategy): | |
padding_strategy = PaddingStrategy(padding) | |
else: | |
padding_strategy = PaddingStrategy.DO_NOT_PAD | |
# Get truncation strategy | |
if truncation is False and old_truncation_strategy != "do_not_truncate": | |
if verbose: | |
warnings.warn( | |
"The `truncation_strategy` argument is deprecated and will be removed in a future version, " | |
"use `truncation=True` to truncate examples to a max length. You can give a specific " | |
"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the " | |
"maximal input size of the model (e.g. 512 for Bert). " | |
" If you have pairs of inputs, you can give a specific truncation strategy selected among " | |
"`truncation='only_first'` (will only truncate the first sentence in the pairs) " | |
"`truncation='only_second'` (will only truncate the second sentence in the pairs) " | |
"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).", | |
DeprecationWarning, | |
) | |
truncation_strategy = TruncationStrategy(old_truncation_strategy) | |
elif truncation is not False: | |
if truncation is True: | |
truncation_strategy = ( | |
TruncationStrategy.LONGEST_FIRST | |
) # Default to truncate the longest sequences in pairs of inputs | |
elif not isinstance(truncation, TruncationStrategy): | |
truncation_strategy = TruncationStrategy(truncation) | |
else: | |
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE | |
# Set max length if needed | |
if max_length is None: | |
if padding_strategy == PaddingStrategy.MAX_LENGTH: | |
if self.model_max_length > LARGE_INTEGER: | |
if verbose: | |
logger.warning( | |
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " | |
"Default to no padding." | |
) | |
padding_strategy = PaddingStrategy.DO_NOT_PAD | |
else: | |
max_length = self.model_max_length | |
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: | |
if self.model_max_length > LARGE_INTEGER: | |
if verbose: | |
logger.warning( | |
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " | |
"Default to no truncation." | |
) | |
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE | |
else: | |
max_length = self.model_max_length | |
# Test if we have a padding token | |
if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0): | |
raise ValueError( | |
"Asking to pad but the tokenizer does not have a padding token. " | |
"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " | |
"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." | |
) | |
# Check that we will truncate to a multiple of pad_to_multiple_of if both are provided | |
if ( | |
truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE | |
and padding_strategy != PaddingStrategy.DO_NOT_PAD | |
and pad_to_multiple_of is not None | |
and max_length is not None | |
and (max_length % pad_to_multiple_of != 0) | |
): | |
raise ValueError( | |
f"Truncation and padding are both activated but " | |
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." | |
) | |
return padding_strategy, truncation_strategy, max_length, kwargs | |
def __call__( | |
self, | |
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], | |
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, | |
add_special_tokens: bool = True, | |
padding: Union[bool, str] = False, | |
truncation: Union[bool, str] = False, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
is_pretokenized: bool = False, | |
pad_to_multiple_of: Optional[int] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
return_token_type_ids: Optional[bool] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_overflowing_tokens: bool = False, | |
return_special_tokens_mask: bool = False, | |
return_offsets_mapping: bool = False, | |
return_length: bool = False, | |
verbose: bool = True, | |
**kwargs | |
) -> BatchEncoding: | |
""" | |
Returns a dictionary containing the encoded sequence or sequence pair and additional information: | |
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. | |
Args: | |
text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): | |
The sequence or batch of sequences to be encoded. | |
Each sequence can be a string or a list of strings (pre-tokenized string). | |
If the sequences are provided as lists of strings (pre-tokenized), you must set `is_pretokenized=True` | |
(to lift the ambiguity with a batch of sequences). | |
text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): | |
The sequence or batch of sequences to be encoded. | |
Each sequence can be a string or a list of strings (pre-tokenized string). | |
If the sequences are provided as lists of strings (pre-tokenized), you must set `is_pretokenized=True` | |
(to lift the ambiguity with a batch of sequences). | |
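Example (illustrative sketch; `tokenizer` is assumed to be any instantiated tokenizer, e.g. a Bert one):: | |
single = tokenizer("Hello world")  # single sequence, dispatched to `encode_plus` | |
batch = tokenizer(["Hello world", "How are you?"], padding=True)  # batch of sequences, dispatched to `batch_encode_plus` | |
pretok = tokenizer([["Hello", "world"]], is_pretokenized=True)  # batch of pre-tokenized examples | |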
""" | |
# Input type checking for clearer error | |
assert isinstance(text, str) or ( | |
isinstance(text, (list, tuple)) | |
and ( | |
len(text) == 0 | |
or ( | |
isinstance(text[0], str) | |
or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str))) | |
) | |
) | |
), ( | |
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " | |
"or `List[List[str]]` (batch of pretokenized examples)." | |
) | |
assert ( | |
text_pair is None | |
or isinstance(text_pair, str) | |
or ( | |
isinstance(text_pair, (list, tuple)) | |
and ( | |
len(text_pair) == 0 | |
or ( | |
isinstance(text_pair[0], str) | |
or ( | |
isinstance(text_pair[0], (list, tuple)) | |
and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) | |
) | |
) | |
) | |
) | |
), ( | |
"text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " | |
"or `List[List[str]]` (batch of pretokenized examples)." | |
) | |
is_batched = bool( | |
(not is_pretokenized and isinstance(text, (list, tuple))) | |
or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))) | |
) | |
if is_batched: | |
batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text | |
return self.batch_encode_plus( | |
batch_text_or_text_pairs=batch_text_or_text_pairs, | |
add_special_tokens=add_special_tokens, | |
padding=padding, | |
truncation=truncation, | |
max_length=max_length, | |
stride=stride, | |
is_pretokenized=is_pretokenized, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_tensors=return_tensors, | |
return_token_type_ids=return_token_type_ids, | |
return_attention_mask=return_attention_mask, | |
return_overflowing_tokens=return_overflowing_tokens, | |
return_special_tokens_mask=return_special_tokens_mask, | |
return_offsets_mapping=return_offsets_mapping, | |
return_length=return_length, | |
verbose=verbose, | |
**kwargs, | |
) | |
else: | |
return self.encode_plus( | |
text=text, | |
text_pair=text_pair, | |
add_special_tokens=add_special_tokens, | |
padding=padding, | |
truncation=truncation, | |
max_length=max_length, | |
stride=stride, | |
is_pretokenized=is_pretokenized, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_tensors=return_tensors, | |
return_token_type_ids=return_token_type_ids, | |
return_attention_mask=return_attention_mask, | |
return_overflowing_tokens=return_overflowing_tokens, | |
return_special_tokens_mask=return_special_tokens_mask, | |
return_offsets_mapping=return_offsets_mapping, | |
return_length=return_length, | |
verbose=verbose, | |
**kwargs, | |
) | |
def encode_plus( | |
self, | |
text: Union[TextInput, PreTokenizedInput, EncodedInput], | |
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, | |
add_special_tokens: bool = True, | |
padding: Union[bool, str] = False, | |
truncation: Union[bool, str] = False, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
is_pretokenized: bool = False, | |
pad_to_multiple_of: Optional[int] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
return_token_type_ids: Optional[bool] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_overflowing_tokens: bool = False, | |
return_special_tokens_mask: bool = False, | |
return_offsets_mapping: bool = False, | |
return_length: bool = False, | |
verbose: bool = True, | |
**kwargs | |
) -> BatchEncoding: | |
""" | |
Returns a dictionary containing the encoded sequence or sequence pair and additional information: | |
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. | |
Args: | |
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for non-fast tokenizers)): | |
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using | |
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` | |
method) | |
text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): | |
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized | |
string using the `tokenize` method) or a list of integers (tokenized string ids using the | |
`convert_tokens_to_ids` method) | |
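Example (illustrative sketch; `tokenizer` is assumed to be an instantiated tokenizer):: | |
encoding = tokenizer.encode_plus("Hello world", "How are you?", truncation=True, max_length=16) | |
# `encoding` is a BatchEncoding holding at least `input_ids` and, depending on the model, | |
# `token_type_ids` and `attention_mask` | |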
""" | |
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length' | |
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( | |
padding=padding, | |
truncation=truncation, | |
max_length=max_length, | |
pad_to_multiple_of=pad_to_multiple_of, | |
verbose=verbose, | |
**kwargs, | |
) | |
return self._encode_plus( | |
text=text, | |
text_pair=text_pair, | |
add_special_tokens=add_special_tokens, | |
padding_strategy=padding_strategy, | |
truncation_strategy=truncation_strategy, | |
max_length=max_length, | |
stride=stride, | |
is_pretokenized=is_pretokenized, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_tensors=return_tensors, | |
return_token_type_ids=return_token_type_ids, | |
return_attention_mask=return_attention_mask, | |
return_overflowing_tokens=return_overflowing_tokens, | |
return_special_tokens_mask=return_special_tokens_mask, | |
return_offsets_mapping=return_offsets_mapping, | |
return_length=return_length, | |
verbose=verbose, | |
**kwargs, | |
) | |
def _encode_plus( | |
self, | |
text: Union[TextInput, PreTokenizedInput, EncodedInput], | |
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, | |
add_special_tokens: bool = True, | |
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, | |
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
is_pretokenized: bool = False, | |
pad_to_multiple_of: Optional[int] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
return_token_type_ids: Optional[bool] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_overflowing_tokens: bool = False, | |
return_special_tokens_mask: bool = False, | |
return_offsets_mapping: bool = False, | |
return_length: bool = False, | |
verbose: bool = True, | |
**kwargs | |
) -> BatchEncoding: | |
raise NotImplementedError | |
def batch_encode_plus( | |
self, | |
batch_text_or_text_pairs: Union[ | |
List[TextInput], | |
List[TextInputPair], | |
List[PreTokenizedInput], | |
List[PreTokenizedInputPair], | |
List[EncodedInput], | |
List[EncodedInputPair], | |
], | |
add_special_tokens: bool = True, | |
padding: Union[bool, str] = False, | |
truncation: Union[bool, str] = False, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
is_pretokenized: bool = False, | |
pad_to_multiple_of: Optional[int] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
return_token_type_ids: Optional[bool] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_overflowing_tokens: bool = False, | |
return_special_tokens_mask: bool = False, | |
return_offsets_mapping: bool = False, | |
return_length: bool = False, | |
verbose: bool = True, | |
**kwargs | |
) -> BatchEncoding: | |
""" | |
Returns a dictionary containing the encoded sequence or sequence pair and additional information: | |
the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. | |
Args: | |
batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, | |
:obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, | |
and for non-fast tokenizers, also: | |
:obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): | |
Batch of sequences or pair of sequences to be encoded. | |
This can be a list of strings/string-sequences/int-sequences or a list of pairs of | |
strings/string-sequences/int-sequences (see details in ``encode_plus``). | |
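Example (illustrative sketch; `tokenizer` is assumed to be an instantiated tokenizer):: | |
batch = tokenizer.batch_encode_plus( | |
[("Hello world", "How are you?"), ("A second pair", "of sentences")], | |
padding=True, | |
truncation=True, | |
) | |
# each key of `batch` (e.g. `input_ids`) maps to one list per example, all padded to the same length | |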
""" | |
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length' | |
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( | |
padding=padding, | |
truncation=truncation, | |
max_length=max_length, | |
pad_to_multiple_of=pad_to_multiple_of, | |
verbose=verbose, | |
**kwargs, | |
) | |
return self._batch_encode_plus( | |
batch_text_or_text_pairs=batch_text_or_text_pairs, | |
add_special_tokens=add_special_tokens, | |
padding_strategy=padding_strategy, | |
truncation_strategy=truncation_strategy, | |
max_length=max_length, | |
stride=stride, | |
is_pretokenized=is_pretokenized, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_tensors=return_tensors, | |
return_token_type_ids=return_token_type_ids, | |
return_attention_mask=return_attention_mask, | |
return_overflowing_tokens=return_overflowing_tokens, | |
return_special_tokens_mask=return_special_tokens_mask, | |
return_offsets_mapping=return_offsets_mapping, | |
return_length=return_length, | |
verbose=verbose, | |
**kwargs, | |
) | |
def _batch_encode_plus( | |
self, | |
batch_text_or_text_pairs: Union[ | |
List[TextInput], | |
List[TextInputPair], | |
List[PreTokenizedInput], | |
List[PreTokenizedInputPair], | |
List[EncodedInput], | |
List[EncodedInputPair], | |
], | |
add_special_tokens: bool = True, | |
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, | |
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
is_pretokenized: bool = False, | |
pad_to_multiple_of: Optional[int] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
return_token_type_ids: Optional[bool] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_overflowing_tokens: bool = False, | |
return_special_tokens_mask: bool = False, | |
return_offsets_mapping: bool = False, | |
return_length: bool = False, | |
verbose: bool = True, | |
**kwargs | |
) -> BatchEncoding: | |
raise NotImplementedError | |
def pad( | |
self, | |
encoded_inputs: Union[ | |
BatchEncoding, | |
List[BatchEncoding], | |
Dict[str, EncodedInput], | |
Dict[str, List[EncodedInput]], | |
List[Dict[str, EncodedInput]], | |
], | |
padding: Union[bool, str] = True, | |
max_length: Optional[int] = None, | |
pad_to_multiple_of: Optional[int] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
verbose: bool = True, | |
) -> BatchEncoding: | |
""" Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. | |
Padding side (left/right) padding token ids are defined at the tokenizer level | |
(with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) | |
Args: | |
encoded_inputs: Dictionary of tokenized inputs (`Dict[str, List[int]]`) or batch of tokenized inputs. | |
Batch of tokenized inputs can be given as dicts of lists or lists of dicts, both work so you can | |
use ``tokenizer.pad()`` during pre-processing as well as in a PyTorch Dataloader collate function. | |
(`Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`). | |
padding: Boolean or specific strategy to use for padding. | |
Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: | |
- 'longest' (or `True`, the default): Pad to the longest sequence in the batch | |
- 'max_length': Pad to a maximum length specified with `max_length`, or to the model's maximum acceptable input length if `max_length` is not provided | |
- 'do_not_pad' (or `False`): Do not pad | |
max_length: maximum length used by the 'max_length' padding strategy (optional). | |
Note that this method only pads and will not truncate inputs longer than `max_length`. | |
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. | |
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability | |
>= 7.5 (Volta). | |
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) | |
return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): | |
Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, | |
PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of python integers. | |
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): | |
Set to ``False`` to avoid printing informational messages and warnings. | |
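Example (illustrative sketch of using ``pad`` as a collate function; assumes an instantiated tokenizer and PyTorch installed):: | |
features = [tokenizer("Hello world"), tokenizer("A longer example sentence")] | |
batch = tokenizer.pad(features, padding=True, return_tensors="pt") | |
# `batch["input_ids"]` is now a rectangular tensor, padded on the side given by `tokenizer.padding_side` | |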
""" | |
# If we have a list of dicts, let's convert it in a dict of lists | |
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)): | |
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} | |
assert "input_ids" in encoded_inputs, ( | |
"You should supply an encoding or a list of encodings to this method. " | |
"An encoding is the output of one the encoding methods of the tokenizer, i.e. " | |
"__call__/encode_plus/batch_encode_plus. " | |
) | |
if not encoded_inputs["input_ids"]: | |
if return_attention_mask: | |
encoded_inputs["attention_mask"] = [] | |
return encoded_inputs | |
# Convert padding_strategy in PaddingStrategy | |
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( | |
padding=padding, max_length=max_length, verbose=verbose | |
) | |
if encoded_inputs["input_ids"] and not isinstance(encoded_inputs["input_ids"][0], (list, tuple)): | |
encoded_inputs = self._pad( | |
encoded_inputs, | |
max_length=max_length, | |
padding_strategy=padding_strategy, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_attention_mask=return_attention_mask, | |
) | |
return BatchEncoding(encoded_inputs, tensor_type=return_tensors) | |
batch_size = len(encoded_inputs["input_ids"]) | |
assert all( | |
len(v) == batch_size for v in encoded_inputs.values() | |
), "Some items in the output dictionnary have a different batch size than others." | |
if padding_strategy == PaddingStrategy.LONGEST: | |
max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"]) | |
padding_strategy = PaddingStrategy.MAX_LENGTH | |
batch_outputs = {} | |
for i in range(batch_size): | |
inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) | |
outputs = self._pad( | |
inputs, | |
max_length=max_length, | |
padding_strategy=padding_strategy, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_attention_mask=return_attention_mask, | |
) | |
for key, value in outputs.items(): | |
if key not in batch_outputs: | |
batch_outputs[key] = [] | |
batch_outputs[key].append(value) | |
return BatchEncoding(batch_outputs, tensor_type=return_tensors) | |
def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]: | |
if token_ids_1 is None: | |
return len(token_ids_0) * [0] | |
return [0] * len(token_ids_0) + [1] * len(token_ids_1) | |
def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List: | |
""" | |
Build model inputs from a sequence or a pair of sequences for sequence classification tasks | |
by concatenating and adding special tokens. This implementation does not add special tokens. | |
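With this base implementation (subclasses typically override it to add e.g. [CLS]/[SEP]):: | |
tokenizer.build_inputs_with_special_tokens([1, 2, 3], [4, 5])  # returns [1, 2, 3, 4, 5] | |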
""" | |
if token_ids_1 is None: | |
return token_ids_0 | |
return token_ids_0 + token_ids_1 | |
def prepare_for_model( | |
self, | |
ids: List[int], | |
pair_ids: Optional[List[int]] = None, | |
add_special_tokens: bool = True, | |
padding: Union[bool, str] = False, | |
truncation: Union[bool, str] = False, | |
max_length: Optional[int] = None, | |
stride: int = 0, | |
pad_to_multiple_of: Optional[int] = None, | |
return_tensors: Optional[Union[str, TensorType]] = None, | |
return_token_type_ids: Optional[bool] = None, | |
return_attention_mask: Optional[bool] = None, | |
return_overflowing_tokens: bool = False, | |
return_special_tokens_mask: bool = False, | |
return_offsets_mapping: bool = False, | |
return_length: bool = False, | |
verbose: bool = True, | |
prepend_batch_axis: bool = False, | |
**kwargs | |
) -> BatchEncoding: | |
""" Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. | |
It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and | |
manages a moving window (with user defined stride) for overflowing tokens | |
Args: | |
ids: list of tokenized input ids. Can be obtained from a string by chaining the | |
`tokenize` and `convert_tokens_to_ids` methods. | |
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the | |
`tokenize` and `convert_tokens_to_ids` methods. | |
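Example (illustrative sketch; `tokenizer` is assumed to be an instantiated tokenizer):: | |
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hello world")) | |
outputs = tokenizer.prepare_for_model(ids, add_special_tokens=True, truncation=True, max_length=8) | |
# `outputs["input_ids"]` now includes the model's special tokens and respects `max_length` | |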
""" | |
if "return_lengths" in kwargs: | |
if verbose: | |
warnings.warn( | |
"The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. " | |
"Please use `return_length` instead.", | |
FutureWarning, | |
) | |
return_length = kwargs["return_lengths"] | |
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length' | |
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( | |
padding=padding, | |
truncation=truncation, | |
max_length=max_length, | |
pad_to_multiple_of=pad_to_multiple_of, | |
verbose=verbose, | |
**kwargs, | |
) | |
pair = bool(pair_ids is not None) | |
len_ids = len(ids) | |
len_pair_ids = len(pair_ids) if pair else 0 | |
# Load from model defaults | |
if return_token_type_ids is None: | |
return_token_type_ids = "token_type_ids" in self.model_input_names | |
if return_attention_mask is None: | |
return_attention_mask = "attention_mask" in self.model_input_names | |
encoded_inputs = {} | |
# Compute the total size of the returned encodings | |
total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) | |
# Truncation: Handle max sequence length | |
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: | |
ids, pair_ids, overflowing_tokens = self.truncate_sequences( | |
ids, | |
pair_ids=pair_ids, | |
num_tokens_to_remove=total_len - max_length, | |
truncation_strategy=truncation_strategy, | |
stride=stride, | |
) | |
if return_overflowing_tokens: | |
encoded_inputs["overflowing_tokens"] = overflowing_tokens | |
encoded_inputs["num_truncated_tokens"] = total_len - max_length | |
# Add special tokens | |
if add_special_tokens: | |
sequence = self.build_inputs_with_special_tokens(ids, pair_ids) | |
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) | |
else: | |
sequence = ids + pair_ids if pair else ids | |
token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) | |
# Build output dictionary | |
encoded_inputs["input_ids"] = sequence | |
if return_token_type_ids: | |
encoded_inputs["token_type_ids"] = token_type_ids | |
if return_special_tokens_mask: | |
if add_special_tokens: | |
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) | |
else: | |
encoded_inputs["special_tokens_mask"] = [0] * len(sequence) | |
# Check lengths | |
if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose: | |
logger.warning( | |
"Token indices sequence length is longer than the specified maximum sequence length " | |
"for this model ({} > {}). Running this sequence through the model will result in " | |
"indexing errors".format(len(ids), self.model_max_length) | |
) | |
# Padding | |
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: | |
encoded_inputs = self.pad( | |
encoded_inputs, | |
max_length=max_length, | |
padding=padding_strategy.value, | |
pad_to_multiple_of=pad_to_multiple_of, | |
return_attention_mask=return_attention_mask, | |
) | |
if return_length: | |
encoded_inputs["length"] = len(encoded_inputs["input_ids"]) | |
batch_outputs = BatchEncoding( | |
encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis | |
) | |
return batch_outputs | |
def truncate_sequences( | |
self, | |
ids: List[int], | |
pair_ids: Optional[List[int]] = None, | |
num_tokens_to_remove: int = 0, | |
truncation_strategy: Union[str, TruncationStrategy] = "longest_first", | |
stride: int = 0, | |
) -> Tuple[List[int], List[int], List[int]]: | |
""" Truncates a sequence pair in place to the maximum length. | |
Args: | |
ids: list of tokenized input ids. Can be obtained from a string by chaining the | |
`tokenize` and `convert_tokens_to_ids` methods. | |
pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the | |
`tokenize` and `convert_tokens_to_ids` methods. | |
num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): | |
number of tokens to remove using the truncation strategy | |
truncation_strategy (:obj:`string`, `optional`, defaults to "longest_first"): | |
String selected in the following options: | |
- 'longest_first' (default): Iteratively remove one token at a time from the longest sequence (when a pair of | |
input sequences is provided) until the input is under max_length. | |
Overflowing tokens only contain the overflow from the first sequence. | |
- 'only_first': Only truncate the first sequence. Logs an error if the first sequence is shorter than or equal to num_tokens_to_remove. | |
- 'only_second': Only truncate the second sequence | |
- 'do_not_truncate' | |
stride (:obj:`int`, `optional`, defaults to ``0``): | |
If set to a number along with max_length, the overflowing tokens returned will contain some tokens | |
from the main sequence returned. The value of this argument defines the number of additional tokens. | |
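Example (illustrative sketch; `tokenizer` is assumed to be an instantiated tokenizer):: | |
ids, pair_ids, overflowing = tokenizer.truncate_sequences( | |
list(range(10)), pair_ids=list(range(4)), num_tokens_to_remove=3, truncation_strategy="longest_first" | |
) | |
# one token is removed at a time from the currently longest sequence; here only the first list shrinks | |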
""" | |
if num_tokens_to_remove <= 0: | |
return ids, pair_ids, [] | |
if not isinstance(truncation_strategy, TruncationStrategy): | |
truncation_strategy = TruncationStrategy(truncation_strategy) | |
overflowing_tokens = [] | |
if truncation_strategy == TruncationStrategy.LONGEST_FIRST: | |
for _ in range(num_tokens_to_remove): | |
if pair_ids is None or len(ids) > len(pair_ids): | |
if not overflowing_tokens: | |
window_len = min(len(ids), stride + 1) | |
else: | |
window_len = 1 | |
overflowing_tokens.extend(ids[-window_len:]) | |
ids = ids[:-1] | |
else: | |
if not overflowing_tokens: | |
window_len = min(len(pair_ids), stride + 1) | |
else: | |
window_len = 1 | |
overflowing_tokens.extend(pair_ids[-window_len:]) | |
pair_ids = pair_ids[:-1] | |
elif truncation_strategy == TruncationStrategy.ONLY_FIRST: | |
if len(ids) > num_tokens_to_remove: | |
window_len = min(len(ids), stride + num_tokens_to_remove) | |
overflowing_tokens = ids[-window_len:] | |
ids = ids[:-num_tokens_to_remove] | |
else: | |
logger.error( | |
f"We need to remove {num_tokens_to_remove} to truncate the input" | |
f"but the first sequence has a length {len(ids)}. " | |
f"Please select another truncation strategy than {truncation_strategy}, " | |
f"for instance 'longest_first' or 'only_second'." | |
) | |
elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: | |
if len(pair_ids) > num_tokens_to_remove: | |
window_len = min(len(pair_ids), stride + num_tokens_to_remove) | |
overflowing_tokens = pair_ids[-window_len:] | |
pair_ids = pair_ids[:-num_tokens_to_remove] | |
else: | |
logger.error( | |
f"We need to remove {num_tokens_to_remove} to truncate the input" | |
f"but the second sequence has a length {len(pair_ids)}. " | |
f"Please select another truncation strategy than {truncation_strategy}, " | |
f"for instance 'longest_first' or 'only_first'." | |
) | |
return (ids, pair_ids, overflowing_tokens) | |
def _pad( | |
self, | |
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], | |
max_length: Optional[int] = None, | |
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, | |
pad_to_multiple_of: Optional[int] = None, | |
return_attention_mask: Optional[bool] = None, | |
) -> dict: | |
""" Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch) | |
Args: | |
encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). | |
max_length: maximum length of the returned list and optionally padding length (see below). | |
Note that this method only pads and will not truncate inputs longer than max_length. | |
padding_strategy: PaddingStrategy to use for padding: | |
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch | |
- PaddingStrategy.MAX_LENGTH: Pad to max_length | |
- PaddingStrategy.DO_NOT_PAD (default): Do not pad | |
The tokenizer padding sides are defined in self.padding_side: | |
- 'left': pads on the left of the sequences | |
- 'right': pads on the right of the sequences | |
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. | |
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability | |
>= 7.5 (Volta). | |
return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) | |
""" | |
# Load from model defaults | |
if return_attention_mask is None: | |
return_attention_mask = "attention_mask" in self.model_input_names | |
if padding_strategy == PaddingStrategy.LONGEST: | |
max_length = len(encoded_inputs["input_ids"]) | |
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): | |
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of | |
needs_to_be_padded = ( | |
padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length | |
) | |
if needs_to_be_padded: | |
difference = max_length - len(encoded_inputs["input_ids"]) | |
if self.padding_side == "right": | |
if return_attention_mask: | |
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference | |
if "token_type_ids" in encoded_inputs: | |
encoded_inputs["token_type_ids"] = ( | |
encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference | |
) | |
if "special_tokens_mask" in encoded_inputs: | |
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference | |
encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference | |
elif self.padding_side == "left": | |
if return_attention_mask: | |
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) | |
if "token_type_ids" in encoded_inputs: | |
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ | |
"token_type_ids" | |
] | |
if "special_tokens_mask" in encoded_inputs: | |
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] | |
encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] | |
else: | |
raise ValueError("Invalid padding strategy:" + str(self.padding_side)) | |
else: | |
if return_attention_mask: | |
encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) | |
return encoded_inputs | |
def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]: | |
return [self.decode(seq, **kwargs) for seq in sequences] | |
def decode( | |
self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True | |
) -> str: | |
""" | |
Converts a sequence of ids (integers) to a string, using the tokenizer and vocabulary | |
with options to remove special tokens and clean up tokenization spaces. | |
Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. | |
Args: | |
token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. | |
skip_special_tokens: if set to True, will remove special tokens from the decoded string. | |
clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. | |
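Example (illustrative sketch, assuming an uncased BERT tokenizer has been instantiated):: | |
ids = tokenizer.encode("Hello world") | |
text = tokenizer.decode(ids, skip_special_tokens=True)  # "hello world" for an uncased model | |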
""" | |
raise NotImplementedError | |
def get_special_tokens_mask( | |
self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False | |
) -> List[int]: | |
""" | |
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding | |
special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. | |
Args: | |
token_ids_0: list of ids (must not contain special tokens) | |
token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids | |
for sequence pairs | |
already_has_special_tokens: (default False) Set to True if the token list is already formatted with | |
special tokens for the model | |
Returns: | |
A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. | |
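Example (illustrative sketch; this base implementation only supports `already_has_special_tokens=True`):: | |
ids = tokenizer.encode("Hello world")  # `ids` already contains the special tokens | |
mask = tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True) | |
# e.g. [1, 0, 0, 1] for Bert: 1 marks [CLS]/[SEP], 0 marks regular tokens | |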
""" | |
assert already_has_special_tokens and token_ids_1 is None, ( | |
"You cannot use ``already_has_special_tokens=False`` with this tokenizer. " | |
"Please use a slow (full python) tokenizer to activate this argument." | |
"Or set `return_special_token_mask=True` when calling the encoding method " | |
"to get the special tokens mask in any tokenizer. " | |
) | |
all_special_ids = self.all_special_ids # cache the property | |
special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] | |
return special_tokens_mask | |
@staticmethod | |
def clean_up_tokenization(out_string: str) -> str: | |
""" Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms. | |
""" | |
out_string = ( | |
out_string.replace(" .", ".") | |
.replace(" ?", "?") | |
.replace(" !", "!") | |
.replace(" ,", ",") | |
.replace(" ' ", "'") | |
.replace(" n't", "n't") | |
.replace(" 'm", "'m") | |
.replace(" 's", "'s") | |
.replace(" 've", "'ve") | |
.replace(" 're", "'re") | |
) | |
return out_string | |