TeraSpace committed on
Commit
dfc143a
1 Parent(s): d44113b

Upload 13 files

Browse files
app.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from infer_onnx import TTS
3
+ from ruaccent import RUAccent # https://huggingface.co/TeraTTS/accentuator
4
+
5
# Hugging Face repo ids of the voices offered in the UI.
model_ids = ["TeraTTS/natasha-g2p-vits", "TeraTTS/glados2-g2p-vits"]

# One TTS instance per voice, keyed by its repo id.
models = {model_id: TTS(model_id) for model_id in model_ids}

# Accentuation pipeline; load() is called once here at import time.
accentizer = RUAccent(workdir="./model/ruaccent")
accentizer.load(omograph_model_size='medium', dict_load_startup=True)
11
+
12
+
13
def process_text(text: str) -> str:
    """Run *text* through the module-level accentizer and return its output."""
    return accentizer.process_all(text)
16
+
17
def text_to_speech(model_name, text, prep_text):
    """Synthesize *text* with the selected voice and return the WAV path.

    Args:
        model_name: Key into the module-level ``models`` mapping.
        text: Text to synthesize.
        prep_text: When truthy, ``text`` is first passed through
            ``process_text``.

    Returns:
        A ``(wav_path, message)`` tuple: the path of the written WAV file and
        a message echoing the text that was actually synthesized.
    """
    import tempfile

    if prep_text:
        text = process_text(text)

    tts = models[model_name]
    audio = tts(text)

    # Each call gets its own file: a single shared 'temp.wav' lets concurrent
    # requests overwrite each other's audio before it is served.
    # NOTE(review): the files are not deleted here; confirm the hosting
    # environment cleans its temp directory.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        wav_path = handle.name
    tts.save_wav(audio, wav_path)

    return wav_path, f"Обработанный текст: '{text}'"
24
+
25
# Input widgets.
model_choice = gr.Dropdown(
    choices=list(models),
    value="TeraTTS/natasha-g2p-vits",
    label="Выберите модель",
)
input_text = gr.Textbox(label="Введите текст для синтеза речи")
prep_text = gr.Checkbox(
    label="Предобработать",
    info="Хотите пред обработать текст?(Ударения, ё)",
    value=True,
)

# Output widgets.
output_audio = gr.Audio(label="Аудио", type="numpy")
output_text = gr.Textbox(label="Обработанный текст")

# Wire the synthesis function to the widgets and start serving.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[model_choice, input_text, prep_text],
    outputs=[output_audio, output_text],
)
iface.launch()
infer_onnx.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import scipy.io.wavfile
2
+ import os
3
+ import onnxruntime
4
+ import numpy as np
5
+ from huggingface_hub import snapshot_download
6
+
7
class TTS:
    """Text-to-speech wrapper around an ONNX model fetched from the Hugging Face Hub.

    The model files are downloaded into ``<save_path>/<model_name>`` on first
    use. The tokenizer is chosen by the presence of
    ``exported/dictionary.txt`` in the model directory.
    """

    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 0.8) -> None:
        """Download (if needed) and load the model and its tokenizer.

        Args:
            model_name: Hub repo id; also used as the sub-directory name.
            save_path: Root directory for downloaded models.
            add_time_to_end: Seconds of silence appended to every utterance.
        """
        # makedirs copes with nested paths and with the directory already
        # existing, unlike an exists-check followed by os.mkdir.
        os.makedirs(save_path, exist_ok=True)

        model_dir = os.path.join(save_path, model_name)

        if not os.path.exists(model_dir):
            snapshot_download(
                repo_id=model_name,
                allow_patterns=["*.txt", "*.onnx", "*.json"],
                local_dir=model_dir,
                local_dir_use_symlinks=False,
            )

        sess_options = onnxruntime.SessionOptions()
        self.model = onnxruntime.InferenceSession(
            os.path.join(model_dir, "exported/model.onnx"),
            sess_options=sess_options,
        )

        # A dictionary.txt next to the model selects the g2p tokenizer;
        # otherwise the gruut tokenizer is used.
        if os.path.exists(os.path.join(model_dir, "exported/dictionary.txt")):
            from tokenizer import TokenizerG2P
            print("Use g2p")
            self.tokenizer = TokenizerG2P(os.path.join(model_dir, "exported"))
        else:
            from tokenizer import TokenizerGRUUT
            print("Use gruut")
            self.tokenizer = TokenizerGRUUT(os.path.join(model_dir, "exported"))

        self.add_time_to_end = add_time_to_end

    def _add_silent(self, audio, silence_duration: float = 1.0, sample_rate: int = 22050):
        """Return *audio* with ``silence_duration`` seconds of float32 zeros appended."""
        num_samples_silence = int(sample_rate * silence_duration)
        silence_array = np.zeros(num_samples_silence, dtype=np.float32)
        return np.concatenate((audio, silence_array), axis=0)

    def save_wav(self, audio, path: str):
        """Write *audio* to *path* as a WAV file at 22050 Hz."""
        scipy.io.wavfile.write(path, 22050, audio)

    def _intersperse(self, lst, item):
        """Return a list with *item* before, between and after every element of *lst*."""
        result = [item] * (len(lst) * 2 + 1)
        result[1::2] = lst
        return result

    def _get_seq(self, text):
        """Tokenize *text* and intersperse the resulting ids with 0."""
        phoneme_ids = self.tokenizer._get_seq(text)
        return self._intersperse(phoneme_ids, 0)

    def __call__(self, text: str):
        """Synthesize *text* and return the waveform with trailing silence added."""
        phoneme_ids = self._get_seq(text)
        # Add a leading batch axis of size 1.
        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        text_lengths = np.array([text.shape[1]], dtype=np.int64)
        # NOTE(review): presumably the model's noise/length scale settings --
        # confirm against the export script.
        scales = np.array(
            [0.667, 1, 0.8],
            dtype=np.float32,
        )
        audio = self.model.run(
            None,
            {
                "input": text,
                "input_lengths": text_lengths,
                "scales": scales,
                "sid": None,
            },
        )[0][0, 0][0]
        audio = self._add_silent(audio, silence_duration=self.add_time_to_end)
        return audio
ruaccent/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .ruaccent import RUAccent
ruaccent/accent_model.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from .char_tokenizer import CharTokenizer
3
+ from transformers import AutoModelForTokenClassification
4
+
5
class AccentModel:
    """Character-level token classifier that places a stress mark in a word."""

    def __init__(self) -> None:
        # Use the GPU when one is available, otherwise the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def load(self, path):
        """Load the classification model and its character tokenizer from *path*."""
        self.model = AutoModelForTokenClassification.from_pretrained(path).to(self.device)
        self.tokenizer = CharTokenizer.from_pretrained(path)

    def render_stress(self, word, token_classes):
        """Insert ``+`` before the character labelled ``'STRESS'``.

        Args:
            word: The word to annotate.
            token_classes: Per-token labels; index 0 belongs to the leading
                special token, so label ``i`` refers to ``word[i - 1]``.

        Returns:
            The word with ``+`` inserted, or the word unchanged when no
            ``'STRESS'`` label is present or it falls outside the word.
        """
        if 'STRESS' not in token_classes:
            return word

        position = token_classes.index('STRESS') - 1
        # A label on the leading/trailing special token would otherwise wrap
        # around to word[-1] or raise IndexError.
        if not 0 <= position < len(word):
            return word
        return word[:position] + '+' + word[position:]

    def put_accent(self, word):
        """Predict per-character labels for *word* and return it with the stress mark."""
        inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits
        predictions = torch.argmax(logits, dim=2)
        predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
        return self.render_stress(word, predicted_token_class)
ruaccent/char_tokenizer.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Optional, Tuple, List
3
+ from collections import OrderedDict
4
+
5
+ from transformers import PreTrainedTokenizer
6
+
7
+
8
def load_vocab(vocab_file):
    """Read a one-token-per-line file into an ordered token -> line-index mapping."""
    token_to_index = OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as handle:
        for position, raw_line in enumerate(handle):
            # Only the newline is stripped; other whitespace is part of the token.
            token_to_index[raw_line.rstrip("\n")] = position
    return token_to_index
16
+
17
+
18
class CharTokenizer(PreTrainedTokenizer):
    """Tokenizer that splits text into single characters.

    The vocabulary is read from ``vocab.txt`` (one token per line); every
    encoded sequence is wrapped in ``[bos]`` ... ``[eos]``.
    """

    vocab_files_names = {"vocab_file": "vocab.txt"}

    def __init__(
        self,
        vocab_file=None,
        pad_token="[pad]",
        unk_token="[unk]",
        bos_token="[bos]",
        eos_token="[eos]",
        do_lower_case=False,
        *args,
        **kwargs
    ):
        # The vocabulary must exist before the base constructor runs: recent
        # transformers releases look up special tokens (via get_vocab /
        # _convert_token_to_id) inside PreTrainedTokenizer.__init__.
        self.do_lower_case = do_lower_case

        if not vocab_file or not os.path.isfile(vocab_file):
            self.vocab = OrderedDict()
            self.ids_to_tokens = OrderedDict()
        else:
            self.vocab = load_vocab(vocab_file)
            self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            do_lower_case=do_lower_case,
            **kwargs
        )

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token -> id mapping."""
        return self.vocab

    def _convert_token_to_id(self, token):
        """Map *token* to its id, falling back to the id of the unknown token."""
        if self.do_lower_case:
            token = token.lower()
        # Use .get for the fallback too, so a vocabulary without the unknown
        # token does not raise KeyError for tokens that are present.
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Map an id back to its token."""
        return self.ids_to_tokens[index]

    def _tokenize(self, text):
        """Split *text* into a list of single characters."""
        if self.do_lower_case:
            text = text.lower()
        return list(text)

    def convert_tokens_to_string(self, tokens):
        """Concatenate tokens back into a string."""
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Wrap ``token_ids_0`` in bos/eos ids; ``token_ids_1`` is ignored."""
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        return bos + token_ids_0 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return 1 for the bos/eos positions and 0 for every regular token."""
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return all-zero type ids covering the tokens plus bos and eos."""
        return (len(token_ids_0) + 2) * [0]

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the vocabulary, ordered by id, to ``save_directory``.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        assert os.path.isdir(save_directory)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") +
            self.vocab_files_names["vocab_file"]
        )
        index = 0
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # Ids must be consecutive from 0 so line numbers reproduce them on load.
                assert index == token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
ruaccent/omograph_model.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
2
+ import torch
3
+
4
class OmographModel:
    """Sequence-classification model used to pick one variant of a homograph."""

    def __init__(self) -> None:
        # Use the GPU when one is available, otherwise the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def load(self, path):
        """Load the model (in bfloat16) and its tokenizer from *path*."""
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def classify(self, text, hypotheses):
        """Return the entry of *hypotheses* the model scores highest for *text*.

        Each ``(text, hypothesis)`` pair is encoded as one batch row. Only
        logit columns 0 and 2 are compared; the score of a hypothesis is the
        softmax weight of column 2.
        """
        encodings = self.tokenizer.batch_encode_plus(
            [(text, hyp) for hyp in hypotheses],
            return_tensors='pt',
            padding=True,
        )
        input_ids = encodings['input_ids'].to(self.device)
        # The batch is padded, so pass the mask along; without it the padding
        # tokens of shorter rows are attended like real input.
        attention_mask = encodings.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)

        with torch.no_grad():
            logits = self.nli_model(input_ids, attention_mask=attention_mask)[0]

        entail_contradiction_logits = logits[:, [0, 2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = [float(p[1]) for p in probs]

        return hypotheses[prob_label_is_true.index(max(prob_label_is_true))]
ruaccent/ruaccent.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pathlib
3
+ from huggingface_hub import snapshot_download
4
+ import os
5
+ from os.path import join as join_path
6
+ from .omograph_model import OmographModel
7
+ from .accent_model import AccentModel
8
+ import re
9
+
10
+
11
class RUAccent:
    """Pipeline that restores 'ё', resolves homographs and places stress marks.

    Resources (dictionaries and model weights) live under ``workdir`` and are
    downloaded on ``load()`` when missing.
    """

    def __init__(self, workdir=None):
        """Create the sub-models; *workdir* defaults to this file's directory."""
        self.omograph_model = OmographModel()
        self.accent_model = AccentModel()
        if not workdir:
            self.workdir = str(pathlib.Path(__file__).resolve().parent)
        else:
            self.workdir = workdir

    @staticmethod
    def _read_json(path):
        """Parse the UTF-8 JSON file at *path*, closing it afterwards."""
        with open(path, encoding='utf-8') as handle:
            return json.load(handle)

    def load(
        self,
        omograph_model_size="medium",
        dict_load_startup=False,
        disable_accent_dict=False,
        repo="TeraTTS/accentuator",
    ):
        """Download resources if missing, then load dictionaries and models.

        Args:
            omograph_model_size: ``"small"`` or ``"medium"``.
            dict_load_startup: Load the full accent dictionary now instead of
                loading per-letter dictionaries for every processed text.
            disable_accent_dict: Use an empty accent dictionary.
            repo: Hub repo id to download resources from.

        Raises:
            NotImplementedError: If ``omograph_model_size`` is not supported.
        """
        if not os.path.exists(
            join_path(self.workdir, "dictionary")
        ) or not os.path.exists(join_path(self.workdir, "nn")):
            snapshot_download(
                repo_id=repo,
                ignore_patterns=["*.md", "*.gitattributes"],
                local_dir=self.workdir,
                local_dir_use_symlinks=False,
            )
        self.omographs = self._read_json(
            join_path(self.workdir, "dictionary/omographs.json")
        )
        self.yo_words = self._read_json(
            join_path(self.workdir, "dictionary/yo_words.json")
        )
        self.dict_load_startup = dict_load_startup

        if dict_load_startup:
            self.accents = self._read_json(
                join_path(self.workdir, "dictionary/accents.json")
            )
        if disable_accent_dict:
            self.accents = {}
            self.disable_accent_dict = True
        else:
            self.disable_accent_dict = False

        if omograph_model_size not in ["small", "medium"]:
            raise NotImplementedError

        self.omograph_model.load(
            join_path(self.workdir, f"nn/nn_omograph/{omograph_model_size}/")
        )
        self.accent_model.load(join_path(self.workdir, "nn/nn_accent/"))

    def split_by_words(self, string):
        """Lower-case *string* and split it into word and punctuation tokens."""
        # The first alternative can match the empty string; those empty
        # matches are filtered out below.
        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [res for res in result if res]

    def extract_initial_letters(self, text):
        """Return the first character of each token longer than two characters
        that contains neither ``+`` nor a Latin letter (duplicates kept)."""
        words = text
        initial_letters = []
        for word in words:
            if len(word) > 2 and '+' not in word and not bool(re.search('[a-zA-Z]', word)):
                initial_letters.append(word[0])
        return initial_letters

    def load_dict(self, text):
        """Load and merge the per-letter accent dictionaries needed for *text*.

        Each letter file is read at most once, and letters without a
        dictionary file (e.g. digits) are skipped instead of raising.
        """
        out_dict = {}
        # dict.fromkeys de-duplicates while keeping first-seen order.
        # NOTE(review): assumes the per-letter files have disjoint keys, so
        # merge order does not matter -- confirm.
        for char in dict.fromkeys(self.extract_initial_letters(text)):
            path = join_path(self.workdir, f"dictionary/letter_accent/{char}.json")
            if not os.path.isfile(path):
                continue
            out_dict.update(self._read_json(path))
        return out_dict

    def count_vowels(self, text):
        """Count the Russian vowel letters in *text*."""
        vowels = "аеёиоуыэюяАЕЁИОУЫЭЮЯ"
        return sum(1 for char in text if char in vowels)

    def has_punctuation(self, text):
        """Return True if *text* contains any ASCII punctuation character."""
        punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
        return any(char in punctuation for char in text)

    def delete_spaces_before_punc(self, text):
        """Remove the single space preceding each listed punctuation character."""
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text

    def process_yo(self, text):
        """Replace tokens found in the yo-dictionary; mutates and returns *text*."""
        splitted_text = text

        for i, word in enumerate(splitted_text):
            splitted_text[i] = self.yo_words.get(word, word)
        return splitted_text

    def process_omographs(self, text):
        """Resolve every homograph token with the classifier; mutates and returns *text*."""
        splitted_text = text

        founded_omographs = []
        for i, word in enumerate(splitted_text):
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append(
                    {"word": word, "variants": variants, "position": i}
                )
        for omograph in founded_omographs:
            # Mark the word under consideration so the classifier sees which
            # token of the sentence is being disambiguated.
            splitted_text[
                omograph["position"]
            ] = f"<w>{splitted_text[omograph['position']]}</w>"
            cls = self.omograph_model.classify(
                " ".join(splitted_text), omograph["variants"]
            )
            splitted_text[omograph["position"]] = cls
        return splitted_text

    def process_accent(self, text):
        """Stress every token via the dictionary, falling back to the model.

        The model is used only for tokens the dictionary leaves unchanged
        that have no punctuation and more than one vowel.
        """
        if not self.dict_load_startup and not self.disable_accent_dict:
            self.accents = self.load_dict(text)

        splitted_text = text

        for i, word in enumerate(splitted_text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word and not self.has_punctuation(word) and self.count_vowels(word) > 1:
                splitted_text[i] = self.accent_model.put_accent(word)
            else:
                splitted_text[i] = stressed_word
        return splitted_text

    def process_all(self, text):
        """Run the full pipeline on *text* and return the processed string."""
        text = self.split_by_words(text)
        processed_text = self.process_yo(text)
        processed_text = self.process_omographs(processed_text)
        processed_text = self.process_accent(processed_text)
        processed_text = " ".join(processed_text)
        processed_text = self.delete_spaces_before_punc(processed_text)
        return processed_text
tokenizer/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .gruut import Tokenizer as TokenizerGRUUT
2
+ from .g2p import Tokenizer as TokenizerG2P
tokenizer/g2p/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .tokenizer import Tokenizer
tokenizer/g2p/g2p.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Letters that trigger the "j" suffix on a preceding consonant in pallatize().
softletters = set("яёюиье")
# Symbols after which я/ю/е/ё get a leading "j" in convert_vowels().
startsyl = set("#ъьаяоёуюэеиы-")
# Symbols dropped from the final phoneme list in convert().
others = {"#", "+", "-", "ь", "ъ"}

# Consonants that receive a "j" suffix when followed by a soft letter.
softhard_cons = {
    "б": "b", "в": "v", "г": "g", "Г": "g",
    "д": "d", "з": "z", "к": "k", "л": "l",
    "м": "m", "н": "n", "п": "p", "р": "r",
    "с": "s", "т": "t", "ф": "f", "х": "h",
}

# Consonants mapped directly, with no soft/hard distinction.
other_cons = {
    "ж": "zh", "ц": "c", "ч": "ch",
    "ш": "sh", "щ": "sch", "й": "j",
}

# Vowel letters and the base phoneme each maps to.
vowels = {
    "а": "a", "я": "a",
    "у": "u", "ю": "u",
    "о": "o", "ё": "o",
    "э": "e", "е": "e",
    "и": "i", "ы": "y",
}
46
+
47
def pallatize(phones):
    """Replace consonant letters in *phones* with phoneme codes, in place.

    Every entry except the last is examined. Consonants from
    ``softhard_cons`` get a ``"j"`` suffix when the next entry's symbol is
    in ``softletters``; consonants from ``other_cons`` are mapped directly.
    Each replaced entry becomes ``(code, 0)``.
    """
    for idx in range(len(phones) - 1):
        letter = phones[idx][0]
        if letter in softhard_cons:
            suffix = "j" if phones[idx + 1][0] in softletters else ""
            phones[idx] = (softhard_cons[letter] + suffix, 0)
        if letter in other_cons:
            phones[idx] = (other_cons[letter], 0)
56
+
57
def convert_vowels(phones):
    """Turn ``(symbol, stress)`` pairs into a flat list of phoneme strings.

    Vowel letters become their base phoneme followed by the stress digit.
    A ``"j"`` is emitted before я/ю/е/ё when the previous symbol is in
    ``startsyl``. All other symbols pass through unchanged.
    """
    iotated = set("яюеё")
    converted = []
    previous = ""
    for phone in phones:
        symbol = phone[0]
        if previous in startsyl and symbol in iotated:
            converted.append("j")
        if symbol in vowels:
            converted.append(vowels[symbol] + str(phone[1]))
        else:
            converted.append(symbol)
        previous = symbol

    return converted
71
+
72
def convert(stressword):
    """Convert a word with ``+`` stress marks into a space-separated phoneme string."""
    # Pair every character with a stress flag; a '+' is consumed and sets
    # the flag on the character that follows it.
    marked = []
    pending_stress = 0
    for char in "#" + stressword + "#":
        if char == "+":
            pending_stress = 1
            continue
        marked.append((char, pending_stress))
        pending_stress = 0

    # Consonants are rewritten in place first, then vowels are resolved.
    pallatize(marked)
    resolved = convert_vowels(marked)

    # Drop boundary and sign symbols that carry no sound of their own.
    return " ".join(p for p in resolved if p not in others)
tokenizer/g2p/tokenizer.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from .g2p import *
3
+ import json
4
+ import os
5
+
6
class Tokenizer():
    """Dictionary-backed grapheme-to-phoneme tokenizer.

    Reads ``dictionary.txt`` (a word followed by its phonemes on each line)
    and ``config.json`` (which must hold ``phoneme_id_map``) from *data_path*.
    """

    def __init__(self, data_path: str) -> None:
        self.dic = {}
        # Explicit UTF-8 and a context manager: the default encoding is
        # platform-dependent and the handle was previously never closed.
        with open(os.path.join(data_path, "dictionary.txt"), encoding="utf-8") as dictionary:
            for line in dictionary:
                items = line.split()
                if not items:
                    # A blank line would otherwise raise IndexError on items[0].
                    continue
                self.dic[items[0]] = " ".join(items[1:])

        with open(os.path.join(data_path, "config.json"), encoding="utf-8") as config_file:
            self.config = json.load(config_file)

    def g2p(self, text):
        """Convert *text* to phoneme ids.

        Returns:
            A ``(phoneme_ids, phonemes)`` tuple. Phonemes missing from
            ``phoneme_id_map`` are kept in ``phonemes`` but contribute no ids.
        """
        text = re.sub("—", "-", text)
        # Surround punctuation with spaces so split() isolates it. Inside the
        # class, ",-." is a range that covers exactly ',', '-' and '.'.
        text = re.sub("([!'(),-.:;?])", r' \1 ', text)

        phonemes = []
        for word in text.split():
            if re.match("[!'(),-.:;?]", word):
                phonemes.append(word)
                continue

            word = word.lower()
            # Separate words with a space phoneme, except before the first one.
            if len(phonemes) > 0:
                phonemes.append(' ')

            if word in self.dic:
                phonemes.extend(self.dic[word].split())
            else:
                # Fall back to the rule-based conversion for unknown words.
                phonemes.extend(convert(word).split())

        phoneme_id_map = self.config["phoneme_id_map"]
        phoneme_ids = []
        phoneme_ids.extend(phoneme_id_map["^"])
        phoneme_ids.extend(phoneme_id_map["_"])
        for p in phonemes:
            if p in phoneme_id_map:
                phoneme_ids.extend(phoneme_id_map[p])
                phoneme_ids.extend(phoneme_id_map["_"])
        phoneme_ids.extend(phoneme_id_map["$"])

        return phoneme_ids, phonemes

    def _get_seq(self, text: str) -> list[int]:
        """Return only the phoneme ids for *text*."""
        seq = self.g2p(text)[0]
        return seq
tokenizer/gruut/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .tokenizer import Tokenizer
tokenizer/gruut/tokenizer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from gruut import sentences
2
+ import os
3
+ import re
4
+
5
class Tokenizer():
    """Tokenizer that phonemizes Russian text with gruut and maps symbols to ids.

    ``vocab.txt`` in *path* lists one integer code point per line; a symbol's
    id is its position in that file.
    """

    def __init__(self, path) -> None:
        with open(os.path.join(path, "vocab.txt"), "r", encoding="utf-8") as vocab_file:
            # split() instead of split("\n"): a trailing newline or blank
            # line no longer produces '' and a ValueError from int().
            code_points = vocab_file.read().split()
        self.symbols = [chr(int(code)) for code in code_points]

        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}

    def _ru_phonems(self, text: str) -> str:
        """Return the concatenated phonemes of *text* with whitespace collapsed."""
        text = text.lower()
        phonemes = "".join(
            "".join(word.phonemes)
            for sent in sentences(text, lang="ru")
            for word in sent
            if word.phonemes
        )
        return re.sub(r'\s+', ' ', phonemes).strip()

    def _text_to_sequence(self, text: str) -> list[int]:
        """Convert *text* to symbol ids.

        Symbols absent from the vocabulary are skipped instead of raising
        KeyError.
        """
        clean_text = self._ru_phonems(text)
        return [
            self.symbol_to_id[symbol]
            for symbol in clean_text
            if symbol in self.symbol_to_id
        ]

    def _get_seq(self, text: str) -> list[int]:
        """Return the id sequence for *text*."""
        seq = self._text_to_sequence(text)
        return seq