Source code for transformers.models.wav2vec2.tokenization_wav2vec2

# coding=utf-8
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for Wav2Vec2."""

import json
import os
from itertools import groupby
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from ...file_utils import add_end_docstrings
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType
from ...utils import logging


logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}


WAV2VEC2_KWARGS_DOCSTRING = r"""
            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`False`):
                Activates and controls padding. Accepts the following values:

                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
                  maximum acceptable input length for the model if that argument is not provided.
                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                  different lengths).
            max_length (:obj:`int`, `optional`):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
                length is required by one of the truncation/padding parameters. If the model has no specific maximum
                input length (like XLNet) truncation/padding to a maximum length will be deactivated.
            pad_to_multiple_of (:obj:`int`, `optional`):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
            return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
                Whether or not to print more information and warnings.
"""


class Wav2Vec2Tokenizer(PreTrainedTokenizer):
    """
    Constructs a Wav2Vec2 tokenizer.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
    Users should refer to the superclass for more information regarding such methods.

    Args:
        vocab_file (:obj:`str`):
            File containing the vocabulary.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sentence token.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sentence token.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
            The token used for defining the end of a word.
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to lowercase the output when decoding.
        **kwargs
            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = {
        "vocab_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/vocab.json"
        },
        "tokenizer_config_file": {
            "facebook/wav2vec2-base-960h": "https://huggingface.co/facebook/wav2vec2-base-960h/resolve/main/tokenizer.json",
        },
    }
    model_input_names = ["input_values"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        word_delimiter_token="|",
        do_lower_case=False,
        **kwargs
    ):
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            do_lower_case=do_lower_case,
            word_delimiter_token=word_delimiter_token,
            **kwargs,
        )
        self._word_delimiter_token = word_delimiter_token
        self.do_lower_case = do_lower_case

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}

    @property
    def word_delimiter_token(self) -> str:
        """
        :obj:`str`: Word delimiter token. Log an error if used while not having been set.
        """
        if self._word_delimiter_token is None and self.verbose:
            logger.error("Using word_delimiter_token, but it is not set yet.")
            return None
        return str(self._word_delimiter_token)

    @property
    def word_delimiter_token_id(self) -> Optional[int]:
        """
        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
        not been set.
        """
        if self._word_delimiter_token is None:
            return None
        return self.convert_tokens_to_ids(self.word_delimiter_token)

    @word_delimiter_token.setter
    def word_delimiter_token(self, value):
        self._word_delimiter_token = value

    @word_delimiter_token_id.setter
    def word_delimiter_token_id(self, value):
        self._word_delimiter_token = self.convert_tokens_to_ids(value)
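    # NOTE: Illustrative sketch, not part of the original module. It shows the vocab.json layout that
    # `__init__` above expects when the tokenizer is constructed directly from a local file; the file
    # name and the tiny vocabulary are made up for the example.
    #
    #     import json
    #     vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7}
    #     with open("vocab.json", "w", encoding="utf-8") as f:
    #         json.dump(vocab, f)
    #     tokenizer = Wav2Vec2Tokenizer("vocab.json")
    #     assert tokenizer.word_delimiter_token_id == 4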
    @add_end_docstrings(WAV2VEC2_KWARGS_DOCSTRING)
    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Main method to prepare one or several sequence(s) of raw speech values for the model.

        Args:
            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of lists of float values.
        """
        is_batched = bool(
            isinstance(raw_speech, (list, tuple))
            and (isinstance(raw_speech[0], np.ndarray) or isinstance(raw_speech[0], (tuple, list)))
        )

        # make sure input is in list format
        if is_batched and not isinstance(raw_speech[0], np.ndarray):
            raw_speech = [np.asarray(speech) for speech in raw_speech]
        elif not is_batched and not isinstance(raw_speech, np.ndarray):
            raw_speech = np.asarray(raw_speech)

        # always return batch
        if not is_batched:
            raw_speech = [raw_speech]

        # convert into correct format for padding
        encoded_inputs = BatchEncoding({"input_values": raw_speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=False,
            return_tensors=return_tensors,
            verbose=verbose,
        )

        return padded_inputs
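    # NOTE: Illustrative sketch, not part of the original module. `__call__` above accepts either a
    # single sequence or a batch; a single 1-D array is wrapped into a batch of size one before
    # padding. The `tokenizer` instance and the random array are placeholders for this example.
    #
    #     speech = np.random.randn(16000).astype(np.float32)        # pretend 1 second of 16 kHz audio
    #     single = tokenizer(speech)                                 # single["input_values"] holds one sequence
    #     batch = tokenizer([speech, speech[:8000]], padding=True)   # both sequences padded to the longest one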
    @property
    def vocab_size(self) -> int:
        return len(self.decoder)

    def get_vocab(self) -> Dict:
        return dict(self.encoder, **self.added_tokens_encoder)

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (str) into an index (integer) using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) into a token (str) using the vocab."""
        result = self.decoder.get(index, self.unk_token)
        return result

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a list of connectionist-temporal-classification (CTC) output tokens into a single string.
        """
        # group same tokens into non-repeating tokens in CTC style decoding
        grouped_tokens = [token_group[0] for token_group in groupby(tokens)]

        # filter self.pad_token which is used as CTC-blank token
        filtered_tokens = list(filter(lambda token: token != self.pad_token, grouped_tokens))

        # replace delimiter token
        string = "".join([" " if token == self.word_delimiter_token else token for token in filtered_tokens]).strip()

        if self.do_lower_case:
            string = string.lower()
        return string

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = True,
    ) -> str:
        """
        A special _decode function is needed for Wav2Vec2Tokenizer because added tokens should be treated exactly the
        same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called
        on the whole token list and not individually on added tokens.
        """
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        result = []
        for token in filtered_tokens:
            # `filtered_tokens` contains token strings, so compare against the special token strings
            if skip_special_tokens and token in self.all_special_tokens:
                continue
            result.append(token)

        text = self.convert_tokens_to_string(result)

        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
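    # NOTE: Illustrative sketch, not part of the original module. It traces the CTC-style collapsing
    # done by `convert_tokens_to_string` above: consecutive repeats are merged, the pad token acts as
    # the CTC blank, and the word delimiter "|" becomes a space.
    #
    #     tokens = ["H", "H", "E", "<pad>", "L", "L", "<pad>", "L", "O", "|", "W"]
    #     tokenizer.convert_tokens_to_string(tokens)   # -> "HELLO W"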
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        return (vocab_file,)
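
# NOTE: Illustrative usage sketch, not part of the original module; the directory name is made up.
# `save_vocabulary` writes the vocabulary to ``<save_directory>/vocab.json`` (optionally prefixed)
# and expects the directory to already exist.
#
#     import os
#     os.makedirs("wav2vec2-tok", exist_ok=True)
#     (vocab_path,) = tokenizer.save_vocabulary("wav2vec2-tok")
#     # vocab_path ends with "vocab.json"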