NeMo
theodotus committed on
Commit
260c2c0
1 Parent(s): 4c2a6ed

Added tokenizer

Browse files
Files changed (2) hide show
  1. README.md +7 -0
  2. tokenizer.py +45 -0
README.md CHANGED
@@ -32,6 +32,13 @@ Note: This model generates only spectrograms and a vocoder is needed to convert
32
  In this example HiFiGAN is used.
33
 
34
  ```python
 
 
 
 
 
 
 
35
  # Load FastPitch
36
  from nemo.collections.tts.models import FastPitchModel
37
  spec_generator = FastPitchModel.from_pretrained("theodotus/tts_uk_fastpitch")
 
32
  In this example HiFiGAN is used.
33
 
34
  ```python
35
+ # Load Tokenizer
36
+ from huggingface_hub import hf_hub_download
37
+ hf_hub_download(
38
+ repo_id="theodotus/tts_uk_fastpitch",
39
+ filename="tokenizer.py",
40
+ local_dir = "./"
41
+ )
42
  # Load FastPitch
43
  from nemo.collections.tts.models import FastPitchModel
44
  spec_generator = FastPitchModel.from_pretrained("theodotus/tts_uk_fastpitch")
tokenizer.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nemo.collections.tts.torch.tts_tokenizers import BaseCharsTokenizer
2
+ from nemo.collections.common.tokenizers.text_to_speech.tokenizer_utils import any_locale_text_preprocessing
3
+
4
+
5
+
6
def lowercase_text_preprocessing(text):
    """Preprocess ``text`` and return it in lowercase.

    The input is first passed through ``any_locale_text_preprocessing``
    and the result is then lowercased with ``str.lower``.

    Args:
        text: Input text to preprocess.

    Returns:
        The preprocessed, lowercased text.
    """
    return any_locale_text_preprocessing(text).lower()
10
+
11
+
12
+
13
class CharsTokenizer(BaseCharsTokenizer):
    """Char-based tokenizer with an extended punctuation list.

    Differs from ``BaseCharsTokenizer`` in two ways visible here:

    * ``PUNCT_LIST`` is the base class's list with ``'+'`` and ``"—"`` appended.
    * ``text_preprocessing_func`` defaults to ``lowercase_text_preprocessing``.

    All constructor arguments are forwarded unchanged to the base class.
    """

    # Base punctuation plus two extra symbols.
    # NOTE(review): '+' presumably marks stress in the input text — confirm.
    PUNCT_LIST = BaseCharsTokenizer.PUNCT_LIST + ('+', "—")

    def __init__(
        self,
        chars,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=False,
        non_default_punct_list=None,
        text_preprocessing_func=lowercase_text_preprocessing,
    ):
        """Build the tokenizer by delegating to ``BaseCharsTokenizer``.

        Args:
            chars: String containing every character the tokenizer may see.
            punct: Whether graphemes are reserved for basic punctuation.
            apostrophe: Whether the apostrophe is used.
            add_blank_at: Where to add the blank label: in the specified
                order ("last") or after tokens (any other non-None value);
                ``None`` means no blank label is added.
            pad_with_space: Whether text is padded with a space at both the
                beginning and the end.
            non_default_punct_list: Punctuation marks to use in place of the
                default list.
            text_preprocessing_func: Function applied to text so that the
                tokenizer operates on correctly prepared input.
        """
        base_kwargs = {
            "chars": chars,
            "punct": punct,
            "apostrophe": apostrophe,
            "add_blank_at": add_blank_at,
            "pad_with_space": pad_with_space,
            "non_default_punct_list": non_default_punct_list,
            "text_preprocessing_func": text_preprocessing_func,
        }
        super().__init__(**base_kwargs)