NeMo
tts_uk_fastpitch / tokenizer.py
theodotus's picture
Added tokenizer
260c2c0
from nemo.collections.tts.torch.tts_tokenizers import BaseCharsTokenizer
from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import any_locale_text_preprocessing
def lowercase_text_preprocessing(text):
    """Preprocess ``text`` for the tokenizer and return it lowercased.

    The input is first passed through ``any_locale_text_preprocessing``;
    the result of that call is then lowercased with ``str.lower``.

    Args:
        text: Raw input text.

    Returns:
        The preprocessed, lowercased text.
    """
    return any_locale_text_preprocessing(text).lower()
class CharsTokenizer(BaseCharsTokenizer):
    """Character tokenizer that lowercases input and knows two extra punctuation marks.

    Behaves like ``BaseCharsTokenizer`` except that:
      * ``PUNCT_LIST`` is the base list extended with ``'+'`` and ``"—"``;
      * the default ``text_preprocessing_func`` is ``lowercase_text_preprocessing``.
    """

    # Base punctuation followed by the two additional marks.
    # NOTE(review): '+' is presumably used as an in-text stress marker — confirm.
    PUNCT_LIST = BaseCharsTokenizer.PUNCT_LIST + ('+', "—")

    def __init__(
        self,
        chars,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=lowercase_text_preprocessing,
    ):
        """Build the char-based tokenizer by forwarding every option to the base class.

        Args:
            chars: String containing every character the tokenizer may emit.
            punct: Whether graphemes are reserved for basic punctuation.
            apostrophe: Whether the apostrophe is used.
            add_blank_at: Where to add a blank label: ``"last"`` places it in that
                position, any other non-``None`` value places it after the tokens,
                and ``None`` means no blank label is added.
            pad_with_space: Whether text is padded with a space at both the
                beginning and the end.
            non_default_punct_list: Punctuation marks to use in place of the
                default list.
            text_preprocessing_func: Callable applied to the text before
                tokenization; defaults to ``lowercase_text_preprocessing``.
        """
        # Collect the options once so the forwarding call stays a single line;
        # nothing is altered on the way to the base initializer.
        base_options = {
            "chars": chars,
            "punct": punct,
            "apostrophe": apostrophe,
            "add_blank_at": add_blank_at,
            "pad_with_space": pad_with_space,
            "non_default_punct_list": non_default_punct_list,
            "text_preprocessing_func": text_preprocessing_func,
        }
        super().__init__(**base_options)