# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for python tokenizers.
    For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
"""
import itertools
import logging
import re
import unicodedata
from typing import Dict, List, Optional, Tuple, Union

from .file_utils import add_end_docstrings
from .tokenization_utils_base import (
    ENCODE_KWARGS_DOCSTRING,
    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
    AddedToken,
    BatchEncoding,
    EncodedInput,
    EncodedInputPair,
    PaddingStrategy,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    TensorType,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)


logger = logging.getLogger(__name__)


def _is_whitespace(char):
    """Checks whether `char` is a whitespace character."""
    # \t, \n, and \r are technically control characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `char` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat.startswith("C"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `char` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        return True
    return False


def _is_end_of_word(text):
    """Checks whether the last character in text is a punctuation, control or whitespace character."""
    last_char = text[-1]
    return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))


def _is_start_of_word(text):
    """Checks whether the first character in text is a punctuation, control or whitespace character."""
    first_char = text[0]
    return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
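
# A minimal sketch of how these character-level helpers behave (illustrative only;
# not part of the public API):
#
#     >>> _is_whitespace(" "), _is_whitespace("\u00a0")   # ASCII space and non-breaking space ("Zs")
#     (True, True)
#     >>> _is_control("\t"), _is_control("\x00")          # tab counts as whitespace, NUL as control
#     (False, True)
#     >>> _is_punctuation("^"), _is_punctuation("a")      # "^" is non-letter/number ASCII, so punctuation
#     (True, False)
#     >>> _is_start_of_word(" hello"), _is_end_of_word("hello!")
#     (True, True)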


class PreTrainedTokenizer(PreTrainedTokenizerBase):
    """ Base class for all slow tokenizers.

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't
    have to handle the specific vocabulary augmentation methods of the various underlying
    dictionary structures (BPE, sentencepiece...).

    Class attributes (overridden by derived classes):

        - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file
          required by the model, and as associated values, the filename for saving the associated file (string).
        - ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys
          being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the
          `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the
          associated pretrained vocabulary file.
        - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained
          models, and as associated values, the maximum length of the sequence inputs of this model, or None if the
          model has no maximum input size.
        - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the
          pretrained models, and as associated values, a dictionary of specific arguments to pass to the
          ``__init__`` method of the tokenizer class for this pretrained model when loading the tokenizer with the
          ``from_pretrained()`` method.

    Args:
        - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model.
          When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated
          model in ``max_model_input_sizes`` (see above). If no value is provided, or if no associated max_length can
          be found in ``max_model_input_sizes``, it will default to VERY_LARGE_INTEGER (``int(1e30)``).
        - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied.
          Should be selected between ['right', 'left'].
        - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the
          model ("token_type_ids", "attention_mask"...).
        - ``bos_token``: (`Optional`) string: a beginning of sentence token.
          Will be associated to ``self.bos_token`` and ``self.bos_token_id``.
        - ``eos_token``: (`Optional`) string: an end of sentence token.
          Will be associated to ``self.eos_token`` and ``self.eos_token_id``.
        - ``unk_token``: (`Optional`) string: an unknown token.
          Will be associated to ``self.unk_token`` and ``self.unk_token_id``.
        - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence).
          Will be associated to ``self.sep_token`` and ``self.sep_token_id``.
        - ``pad_token``: (`Optional`) string: a padding token.
          Will be associated to ``self.pad_token`` and ``self.pad_token_id``.
        - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence
          leveraging self-attention along the full depth of the model).
          Will be associated to ``self.cls_token`` and ``self.cls_token_id``.
        - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language
          modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
        - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens.
          Adding all special tokens here ensures they won't be split by the tokenization process.
          Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``.

    .. automethod:: __call__
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # Added tokens - We store this for both slow and fast tokenizers
        # until the serialization of Fast tokenizers is updated
        self.added_tokens_encoder: Dict[str, int] = {}
        self.added_tokens_decoder: Dict[int, str] = {}
        self.unique_no_split_tokens: List[str] = []

    @property
    def is_fast(self) -> bool:
        return False

    @property
    def vocab_size(self) -> int:
        """ Size of the base vocabulary (without the added tokens) """
        raise NotImplementedError

    def get_vocab(self):
        """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to
            `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """
        raise NotImplementedError()

    def get_added_vocab(self) -> Dict[str, int]:
        """ Returns the added tokens in the vocabulary as a dict of {token: index} pairs. """
        return self.added_tokens_encoder

    def __len__(self):
        """ Size of the full vocabulary with the added tokens """
        return self.vocab_size + len(self.added_tokens_encoder)

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens=False) -> int:
        """
        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
        vocabulary, they are added to it with indices starting from the length of the current vocabulary.

        Args:
            new_tokens: string or list of strings. Each string is a token to add. Tokens are only added if they are not
                already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).

        Returns:
            Number of tokens added to the vocabulary.

        Examples::

            # Let's see how to increase the vocabulary of the Bert model and tokenizer
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
            model = BertModel.from_pretrained('bert-base-uncased')

            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
            print('We have added', num_added_toks, 'tokens')
            # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary,
            # i.e. the length of the tokenizer.
            model.resize_token_embeddings(len(tokenizer))
        """
        new_tokens = [str(tok) for tok in new_tokens]

        tokens_to_add = []
        for token in new_tokens:
            assert isinstance(token, str)
            if not special_tokens and self.init_kwargs.get("do_lower_case", False):
                token = token.lower()
            if (
                token != self.unk_token
                and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
                and token not in tokens_to_add
            ):
                tokens_to_add.append(token)
                if self.verbose:
                    logger.info("Adding %s to the vocabulary", token)

        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
        self.added_tokens_encoder.update(added_tok_encoder)
        self.added_tokens_decoder.update(added_tok_decoder)

        # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
        if special_tokens:
            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
        else:
            # Or on the newly added tokens
            self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))

        return len(tokens_to_add)
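
    # A hedged usage sketch of the add-tokens path, run through the public ``add_tokens`` wrapper
    # ('bert-base-uncased' and the exact ids shown are assumptions, not guaranteed values):
    #
    #     >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    #     >>> tokenizer.add_tokens(['new_tok1'])
    #     1
    #     >>> tokenizer.get_added_vocab()          # new ids start right after the base vocabulary
    #     {'new_tok1': 30522}
    #     >>> len(tokenizer)                       # __len__ = vocab_size + number of added tokens
    #     30523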

    def num_special_tokens_to_add(self, pair=False):
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        Note:
            This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
            inside your training loop.

        Args:
            pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
                number of added tokens in the case of a single sequence if set to False.

        Returns:
            Number of tokens added to sequences
        """
        token_ids_0 = []
        token_ids_1 = []
        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
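
    # Illustrative only (a BERT-like tokenizer is an assumption): such a tokenizer wraps a single
    # sequence with [CLS] and [SEP] and adds one extra [SEP] for a pair, so:
    #
    #     >>> tokenizer.num_special_tokens_to_add()            # [CLS] ... [SEP]
    #     2
    #     >>> tokenizer.num_special_tokens_to_add(pair=True)   # [CLS] ... [SEP] ... [SEP]
    #     3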

    def tokenize(self, text: TextInput, **kwargs):
        """ Converts a string into a sequence of tokens (strings), using the tokenizer.
            Splits into words for word-based vocabularies or sub-words for sub-word-based
            vocabularies (BPE/SentencePiece/WordPiece).

            Takes care of added tokens.

            Args:
                text (:obj:`string`): The sequence to be encoded.
                **kwargs (:obj:`dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method.
        """
        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
        )

        text, kwargs = self.prepare_for_tokenization(text, **kwargs)

        if kwargs:
            logger.warning(f"Keyword arguments {kwargs} not recognized.")

        # TODO: should this be in the base class?
        if self.init_kwargs.get("do_lower_case", False):
            # convert non-special tokens to lowercase
            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

        def split_on_token(tok, text):
            result = []
            tok_extended = all_special_tokens_extended.get(tok, None)
            split_text = text.split(tok)
            full_word = ""
            for i, sub_text in enumerate(split_text):
                # AddedToken can control whitespace stripping around them.
                # We use them for GPT2 and Roberta to have different behavior depending on the special token
                # Cf. https://github.com/huggingface/transformers/pull/2778
                # and https://github.com/huggingface/transformers/issues/3788
                if isinstance(tok_extended, AddedToken):
                    if tok_extended.single_word:
                        # Try to avoid splitting on token
                        if (
                            i < len(split_text) - 1
                            and not _is_end_of_word(sub_text)
                            and not _is_start_of_word(split_text[i + 1])
                        ):
                            # Don't extract the special token
                            full_word += sub_text + tok
                        elif full_word:
                            full_word += sub_text
                            result += [full_word]
                            full_word = ""
                            continue
                    # Strip white spaces on the right
                    if tok_extended.rstrip and i > 0:
                        # A bit counter-intuitive but we strip the left of the string
                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                        sub_text = sub_text.lstrip()
                    # Strip white spaces on the left
                    if tok_extended.lstrip and i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()  # Opposite here
                else:
                    # We strip left and right by default
                    if i < len(split_text) - 1:
                        sub_text = sub_text.rstrip()
                    if i > 0:
                        sub_text = sub_text.lstrip()

                if i == 0 and not sub_text:
                    result += [tok]
                elif i == len(split_text) - 1:
                    if sub_text:
                        result += [sub_text]
                    else:
                        pass
                else:
                    if sub_text:
                        result += [sub_text]
                    result += [tok]
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text)

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.unique_no_split_tokens:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (
                        self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
                        for token in tokenized_text
                    )
                )
            )

        no_split_token = self.unique_no_split_tokens
        tokenized_text = split_on_tokens(no_split_token, text)
        return tokenized_text
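
    # A hedged sketch of how ``tokenize`` protects added tokens from being split (the token
    # string and the output pieces below are assumptions, not guaranteed outputs):
    #
    #     >>> tokenizer.add_tokens(['[SPECIAL]'])
    #     1
    #     >>> tokenizer.tokenize('hello [SPECIAL] world')
    #     ['hello', '[SPECIAL]', 'world']   # '[SPECIAL]' is in unique_no_split_tokens, so it is kept whole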

    def _tokenize(self, text, **kwargs):
        """ Converts a string into a sequence of tokens (strings), using the tokenizer.
            Splits into words for word-based vocabularies or sub-words for sub-word-based
            vocabularies (BPE/SentencePiece/WordPiece).

            Does NOT take care of added tokens.
        """
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens):
        """ Converts a token string (or a sequence of tokens) into a single integer id
            (or a sequence of ids), using the vocabulary.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.append(self._convert_token_to_id_with_added_voc(token))
        return ids
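
    # Illustrative only (the token strings and ids are assumptions for a BERT-like vocabulary):
    #
    #     >>> tokenizer.convert_tokens_to_ids('hello')
    #     7592
    #     >>> tokenizer.convert_tokens_to_ids(['hello', 'world'])
    #     [7592, 2088]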

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]
        return self._convert_token_to_id(token)

    def _convert_token_to_id(self, token):
        raise NotImplementedError

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput, EncodedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                if is_pretokenized:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
                    )
                else:
                    raise ValueError(
                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                    )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast. "
                "More information on available tokenizers at "
                "https://github.com/huggingface/transformers/pull/2674"
            )

        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None

        return self.prepare_for_model(
            first_ids,
            pair_ids=second_ids,
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )
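
    # A hedged usage sketch of the public entry points that funnel into ``_encode_plus``
    # (the exact ids are assumptions for a BERT-like tokenizer):
    #
    #     >>> tokenizer("hello world")              # __call__ -> encode_plus -> _encode_plus
    #     {'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}
    #     >>> tokenizer.encode("hello", "world")    # sequence pair, with special tokens added
    #     [101, 7592, 102, 2088, 102]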

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
            List[PreTokenizedInputPair],
            List[EncodedInput],
            List[EncodedInputPair],
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_pretokenized: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        def get_input_ids(text):
            if isinstance(text, str):
                tokens = self.tokenize(text, **kwargs)
                return self.convert_tokens_to_ids(tokens)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
                if is_pretokenized:
                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
                    return self.convert_tokens_to_ids(tokens)
                else:
                    return self.convert_tokens_to_ids(text)
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                return text
            else:
                raise ValueError(
                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
                )

        if return_offsets_mapping:
            raise NotImplementedError(
                "return_offset_mapping is not available when using Python tokenizers. "
                "To use this feature, change your tokenizer to one deriving from "
                "transformers.PreTrainedTokenizerFast."
            )

        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                ids, pair_ids = ids_or_pair_ids, None
            else:
                ids, pair_ids = ids_or_pair_ids

            first_ids = get_input_ids(ids)
            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
            input_ids.append((first_ids, second_ids))

        batch_outputs = self._batch_prepare_for_model(
            input_ids,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return BatchEncoding(batch_outputs)
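
    # Illustrative batch usage, funnelled through ``__call__``/``batch_encode_plus``
    # (the padded lengths shown are assumptions about the inputs and the configured pad token):
    #
    #     >>> batch = tokenizer(["a short sentence", "a somewhat longer sentence"], padding=True)
    #     >>> [len(ids) for ids in batch["input_ids"]]   # the shorter sequence is padded to the longest one
    #     [6, 6]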

    def _batch_prepare_for_model(
        self,
        batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> BatchEncoding:
        """ Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
            It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
            manages a moving window (with user defined stride) for overflowing tokens.

            Args:
                batch_ids_pairs: list of tokenized input ids or input ids pairs
        """
        batch_outputs = {}
        for first_ids, second_ids in batch_ids_pairs:
            outputs = self.prepare_for_model(
                first_ids,
                second_ids,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

    def prepare_for_tokenization(self, text: str, is_pretokenized=False, **kwargs) -> Tuple[str, dict]:
        """ Performs any necessary transformations before tokenization.

            This method should pop its own arguments from kwargs and return the remaining kwargs as well.
            We test kwargs at the end of the encoding process to be sure all the arguments have been used.
        """
        return (text, kwargs)
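
    # A hedged sketch of how a subclass might override this hook (``XxxTokenizer`` and the
    # ``add_prefix_space`` kwarg are illustrative assumptions; the base implementation above
    # simply returns text and kwargs unchanged):
    #
    #     class XxxTokenizer(PreTrainedTokenizer):
    #         def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
    #             add_prefix_space = kwargs.pop("add_prefix_space", False)
    #             if add_prefix_space and not text.startswith(" "):
    #                 text = " " + text
    #             return (text, kwargs)   # any unused kwargs trigger the warning in ``tokenize``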

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
                special tokens for the model

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
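
    # The base implementation above returns all zeros; subclasses that add special tokens are
    # expected to override it. An assumed BERT-like override would, illustratively, produce:
    #
    #     >>> tokenizer.get_special_tokens_mask([7592, 2088])   # positions of [CLS] ... [SEP]
    #     [1, 0, 0, 1]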

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """ Converts a single index (integer) into a token (str), or a sequence of indices
            into a sequence of tokens, using the vocabulary and added tokens.

            Args:
                skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
        """
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                return self.added_tokens_decoder[ids]
            else:
                return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        raise NotImplementedError

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """ Converts a sequence of tokens (strings) into a single string.
            The simplest way to do it is ``" ".join(tokens)``,
            but we often want to remove sub-word tokenization artifacts at the same time.
        """
        return " ".join(tokens)

    def decode(
        self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
    ) -> str:
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        # To avoid mixing byte-level and unicode for byte-level BPE
        # we need to build the string separately for added tokens and byte-level tokens
        # cf. https://github.com/huggingface/transformers/issues/1133
        sub_texts = []
        current_sub_text = []
        for token in filtered_tokens:
            if skip_special_tokens and token in self.all_special_tokens:
                continue
            if token in self.added_tokens_encoder:
                if current_sub_text:
                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                    current_sub_text = []
                sub_texts.append(token)
            else:
                current_sub_text.append(token)
        if current_sub_text:
            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
        text = " ".join(sub_texts)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
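
    # A hedged encode/decode round trip (the ids and the cleaned-up output are assumptions
    # for a BERT-like tokenizer with ``clean_up_tokenization_spaces=True``):
    #
    #     >>> ids = tokenizer.encode("hello world")
    #     >>> ids
    #     [101, 7592, 2088, 102]
    #     >>> tokenizer.decode(ids, skip_special_tokens=True)
    #     'hello world'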

    def save_vocabulary(self, save_directory) -> Tuple[str]:
        """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
            and special token mappings.

            Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` to save the full
            tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained`
            class method.
        """
        raise NotImplementedError
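
    # A hedged save/reload sketch: ``save_pretrained`` calls ``save_vocabulary`` and additionally
    # writes the added tokens and special token mappings (the directory name is an assumption):
    #
    #     >>> tokenizer.save_pretrained("./my_tokenizer")
    #     >>> reloaded = BertTokenizer.from_pretrained("./my_tokenizer")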