Upload 13 files
Browse files- app.py +33 -0
- infer_onnx.py +79 -0
- ruaccent/__init__.py +1 -0
- ruaccent/accent_model.py +27 -0
- ruaccent/char_tokenizer.py +112 -0
- ruaccent/omograph_model.py +21 -0
- ruaccent/ruaccent.py +153 -0
- tokenizer/__init__.py +2 -0
- tokenizer/g2p/__init__.py +1 -0
- tokenizer/g2p/g2p.py +94 -0
- tokenizer/g2p/tokenizer.py +47 -0
- tokenizer/gruut/__init__.py +1 -0
- tokenizer/gruut/tokenizer.py +36 -0
app.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from infer_onnx import TTS
|
3 |
+
from ruaccent import RUAccent # https://huggingface.co/TeraTTS/accentuator
|
4 |
+
|
5 |
+
# Map each Hugging Face repo id offered in the UI to a TTS instance built from
# it. Constructing a TTS fetches the repo snapshot when its directory is absent.
models = {
    repo_id: TTS(repo_id)
    for repo_id in ("TeraTTS/natasha-g2p-vits", "TeraTTS/glados2-g2p-vits")
}

# Accentuator working out of ./model/ruaccent; the full accent dictionary is
# read once here (dict_load_startup=True) rather than per request.
accentizer = RUAccent(workdir="./model/ruaccent")
accentizer.load(
    omograph_model_size='medium',
    dict_load_startup=True,
)
|
11 |
+
|
12 |
+
|
13 |
+
def process_text(text: str) -> str:
    """Return *text* after passing it through ``accentizer.process_all``."""
    return accentizer.process_all(text)
|
16 |
+
|
17 |
+
def text_to_speech(model_name, text, prep_text):
    """Synthesize *text* with the selected model and write it to a WAV file.

    Args:
        model_name: Key into the module-level ``models`` mapping.
        text: Text to synthesize.
        prep_text: When truthy, *text* is first run through ``process_text``.

    Returns:
        A ``(wav_path, message)`` tuple: the path of the WAV file that was
        written and a string echoing the text that was actually synthesized.

    Raises:
        KeyError: If *model_name* is not a key of ``models``.
    """
    import tempfile  # local import so the block does not rely on file-level imports

    if prep_text:
        text = process_text(text)

    tts = models[model_name]
    audio = tts(text)

    # Give every request its own file: a single shared 'temp.wav' lets
    # overlapping requests overwrite each other's audio before it is served.
    # NOTE(review): files are created with delete=False and are not removed
    # here; add cleanup if temp-dir growth matters for the deployment.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_path = wav_file.name
    tts.save_wav(audio, wav_path)

    return wav_path, f"Обработанный текст: '{text}'"
|
24 |
+
|
25 |
+
# Input widgets: voice selector, text to synthesize, preprocessing toggle.
model_choice = gr.Dropdown(
    choices=list(models),
    value="TeraTTS/natasha-g2p-vits",
    label="Выберите модель",
)
input_text = gr.Textbox(label="Введите текст для синтеза речи")
prep_text = gr.Checkbox(
    label="Предобработать",
    info="Хотите пред обработать текст?(Ударения, ё)",
    value=True,
)

# Output widgets: the synthesized audio and the text that was synthesized.
output_audio = gr.Audio(label="Аудио", type="numpy")
output_text = gr.Textbox(label="Обработанный текст")

# Wire text_to_speech to the widgets above and start the app.
iface = gr.Interface(
    fn=text_to_speech,
    inputs=[model_choice, input_text, prep_text],
    outputs=[output_audio, output_text],
)
iface.launch()
|
infer_onnx.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import scipy.io.wavfile
|
2 |
+
import os
|
3 |
+
import onnxruntime
|
4 |
+
import numpy as np
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
+
|
7 |
+
class TTS:
    """Text-to-speech wrapper around an ONNX model stored in a Hugging Face repo.

    On construction the repo snapshot is downloaded below ``save_path`` (unless
    a directory for it already exists), an ONNX Runtime session is opened on
    ``exported/model.onnx`` and a tokenizer is selected: the G2P tokenizer when
    ``exported/dictionary.txt`` exists, the gruut tokenizer otherwise.
    Calling the instance with a string returns the synthesized waveform.
    """

    # Sample rate in Hz used when writing WAV files and as the default for
    # trailing-silence padding.
    SAMPLE_RATE = 22050

    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 0.8) -> None:
        """Download (if needed) and load the model identified by *model_name*.

        Args:
            model_name: Hugging Face repo id; also used as the sub-directory
                name below *save_path*.
            save_path: Root directory for downloaded models.
            add_time_to_end: Seconds of silence appended to every synthesized
                clip.
        """
        # makedirs(exist_ok=True) also creates missing parent directories and
        # has no check-then-create race, unlike exists() followed by mkdir().
        os.makedirs(save_path, exist_ok=True)

        model_dir = os.path.join(save_path, model_name)

        if not os.path.exists(model_dir):
            snapshot_download(
                repo_id=model_name,
                allow_patterns=["*.txt", "*.onnx", "*.json"],
                local_dir=model_dir,
                local_dir_use_symlinks=False,
            )

        sess_options = onnxruntime.SessionOptions()
        self.model = onnxruntime.InferenceSession(
            os.path.join(model_dir, "exported/model.onnx"),
            sess_options=sess_options,
        )

        # The presence of a pronunciation dictionary decides which tokenizer
        # is used; the imports are deferred so only the needed one is loaded.
        if os.path.exists(os.path.join(model_dir, "exported/dictionary.txt")):
            from tokenizer import TokenizerG2P
            print("Use g2p")
            self.tokenizer = TokenizerG2P(os.path.join(model_dir, "exported"))
        else:
            from tokenizer import TokenizerGRUUT
            print("Use gruut")
            self.tokenizer = TokenizerGRUUT(os.path.join(model_dir, "exported"))

        self.add_time_to_end = add_time_to_end

    def _add_silent(self, audio, silence_duration: float = 1.0, sample_rate: int = SAMPLE_RATE):
        """Return *audio* with ``silence_duration`` seconds of zeros appended.

        The padding is a float32 array of ``int(sample_rate * silence_duration)``
        samples concatenated along axis 0.
        """
        num_samples_silence = int(sample_rate * silence_duration)
        silence_array = np.zeros(num_samples_silence, dtype=np.float32)
        return np.concatenate((audio, silence_array), axis=0)

    def save_wav(self, audio, path: str):
        """Write *audio* to *path* as a WAV file at ``SAMPLE_RATE`` Hz."""
        scipy.io.wavfile.write(path, self.SAMPLE_RATE, audio)

    def _intersperse(self, lst, item):
        """Return *lst* with *item* before, between and after its elements."""
        result = [item] * (len(lst) * 2 + 1)
        result[1::2] = lst
        return result

    def _get_seq(self, text):
        """Tokenize *text* and interleave the resulting ids with ``0``."""
        phoneme_ids = self.tokenizer._get_seq(text)
        return self._intersperse(phoneme_ids, 0)

    def __call__(self, text: str):
        """Synthesize *text* and return the waveform with trailing silence.

        Args:
            text: Text to synthesize.

        Returns:
            The model output followed by ``add_time_to_end`` seconds of zeros.
        """
        phoneme_ids = self._get_seq(text)
        # Batch of one sequence: shape (1, len(phoneme_ids)).
        inputs = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
        input_lengths = np.array([inputs.shape[1]], dtype=np.int64)
        # NOTE(review): presumably noise scale, length scale and noise-w as in
        # Piper-style VITS exports -- confirm against the exported model.
        scales = np.array(
            [0.667, 1, 0.8],
            dtype=np.float32,
        )
        outputs = self.model.run(
            None,
            {
                "input": inputs,
                "input_lengths": input_lengths,
                "scales": scales,
                "sid": None,
            },
        )
        # First output, indexed [0, 0] and then [0]; assumes a 4-D output whose
        # last axis holds the samples -- TODO confirm.
        audio = outputs[0][0, 0][0]
        return self._add_silent(audio, silence_duration=self.add_time_to_end)
|
ruaccent/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .ruaccent import RUAccent
|
ruaccent/accent_model.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from .char_tokenizer import CharTokenizer
|
3 |
+
from transformers import AutoModelForTokenClassification
|
4 |
+
|
5 |
+
class AccentModel:
    """Character-level token-classification model that marks the stressed
    character of a single word with a leading ``+``."""

    def __init__(self) -> None:
        # Use the GPU when one is available, otherwise the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def load(self, path):
        """Load the classifier and its character tokenizer from *path*."""
        self.model = AutoModelForTokenClassification.from_pretrained(path).to(self.device)
        self.tokenizer = CharTokenizer.from_pretrained(path)

    def render_stress(self, word, token_classes):
        """Return *word* with ``+`` inserted before the stressed character.

        Args:
            word: The word that was classified.
            token_classes: One label per model input token. The tokenizer wraps
                the characters in one leading and one trailing special token,
                so the label of ``word[i]`` is ``token_classes[i + 1]``.

        Returns:
            *word* with ``+`` before the first character labelled ``'STRESS'``,
            or *word* unchanged when no character carries that label.
        """
        # Only labels that belong to real characters are considered. A
        # 'STRESS' on the leading special token would otherwise land on
        # word[-1], and one on the trailing token would index past the word.
        char_labels = token_classes[1:len(word) + 1]
        if 'STRESS' not in char_labels:
            return word
        position = char_labels.index('STRESS')
        return word[:position] + '+' + word[position:]

    def put_accent(self, word):
        """Classify the characters of *word* and return it with a stress mark."""
        inputs = self.tokenizer(word, return_tensors="pt").to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits
        predictions = torch.argmax(logits, dim=2)
        predicted_token_class = [self.model.config.id2label[t.item()] for t in predictions[0]]
        return self.render_stress(word, predicted_token_class)
|
ruaccent/char_tokenizer.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import Optional, Tuple, List
|
3 |
+
from collections import OrderedDict
|
4 |
+
|
5 |
+
from transformers import PreTrainedTokenizer
|
6 |
+
|
7 |
+
|
8 |
+
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered mapping.

    Each line, stripped of trailing newlines, becomes a key whose value is the
    zero-based index of that line.
    """
    with open(vocab_file, "r", encoding="utf-8") as handle:
        lines = handle.readlines()
    return OrderedDict(
        (line.rstrip("\n"), position) for position, line in enumerate(lines)
    )
|
16 |
+
|
17 |
+
|
18 |
+
class CharTokenizer(PreTrainedTokenizer):
    """Tokenizer that treats every character of the input as one token.

    The vocabulary is read from ``vocab.txt`` (one token per line); sequences
    are wrapped in the bos and eos tokens.
    """

    vocab_files_names = {"vocab_file": "vocab.txt"}

    def __init__(
        self,
        vocab_file=None,
        pad_token="[pad]",
        unk_token="[unk]",
        bos_token="[bos]",
        eos_token="[eos]",
        do_lower_case=False,
        *args,
        **kwargs
    ):
        """Build the tokenizer.

        Args:
            vocab_file: Path to the vocabulary file; when missing or not a
                file, the tokenizer starts with an empty vocabulary.
            pad_token, unk_token, bos_token, eos_token: Special tokens.
            do_lower_case: Lower-case text and tokens before lookup.
        """
        # The vocabulary has to exist before the base constructor runs: newer
        # transformers releases call get_vocab() from PreTrainedTokenizer's
        # __init__ while registering the special tokens.
        self.do_lower_case = do_lower_case

        if not vocab_file or not os.path.isfile(vocab_file):
            self.vocab = OrderedDict()
            self.ids_to_tokens = OrderedDict()
        else:
            self.vocab = load_vocab(vocab_file)
            self.ids_to_tokens = OrderedDict(
                (ids, tok) for tok, ids in self.vocab.items()
            )

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            do_lower_case=do_lower_case,
            **kwargs
        )

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    def get_vocab(self):
        """Return the token -> id mapping (the internal object, not a copy)."""
        return self.vocab

    def _convert_token_to_id(self, token):
        """Return the id of *token*, falling back to the unk token's id."""
        if self.do_lower_case:
            token = token.lower()
        token_id = self.vocab.get(token)
        if token_id is None:
            # Look the unk token up only when needed, so known tokens resolve
            # even if the vocabulary has no unk entry.
            token_id = self.vocab[self.unk_token]
        return token_id

    def _convert_id_to_token(self, index):
        """Return the token stored under id *index*."""
        return self.ids_to_tokens[index]

    def _tokenize(self, text):
        """Split *text* into its characters (lower-cased when configured)."""
        if self.do_lower_case:
            text = text.lower()
        return list(text)

    def convert_tokens_to_string(self, tokens):
        """Concatenate *tokens* back into one string."""
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return ``[bos] + token_ids_0 + [eos]``; *token_ids_1* is ignored."""
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        return bos + token_ids_0 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return a mask with 1 at the bos/eos positions and 0 elsewhere."""
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return all-zero token type ids covering bos, the tokens and eos."""
        return (len(token_ids_0) + 2) * [0]

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the vocabulary, one token per line in id order.

        Returns:
            A one-element tuple holding the path of the written file.

        Raises:
            AssertionError: If *save_directory* is not a directory or the ids
                are not the consecutive integers starting at 0.
        """
        assert os.path.isdir(save_directory)
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") +
            self.vocab_files_names["vocab_file"]
        )
        index = 0
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # A gap would shift every later id when the file is re-read.
                assert index == token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
|
ruaccent/omograph_model.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
2 |
+
import torch
|
3 |
+
|
4 |
+
class OmographModel:
    """Sequence-classification model used to pick one hypothesis for a text."""

    def __init__(self) -> None:
        # Use the GPU when one is available, otherwise the CPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def load(self, path):
        """Load the model (in bfloat16) and its tokenizer from *path*."""
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def classify(self, text, hypotheses):
        """Return the entry of *hypotheses* the model scores highest for *text*.

        Each ``(text, hypothesis)`` pair is encoded and scored in one padded
        batch. For every pair the logits at indices 0 and 2 are soft-maxed and
        the probability of index 2 is used as the score; the first hypothesis
        with the highest score is returned.

        Args:
            text: The premise text.
            hypotheses: Candidate strings to choose from.
        """
        encodings = self.tokenizer.batch_encode_plus(
            [(text, hyp) for hyp in hypotheses],
            return_tensors='pt',
            padding=True,
        )
        input_ids = encodings['input_ids'].to(self.device)
        # Pairs are padded to a common length, so the mask must be passed too;
        # without it the padding tokens take part in attention and distort the
        # logits of the shorter pairs.
        attention_mask = encodings['attention_mask'].to(self.device)
        with torch.no_grad():
            logits = self.nli_model(input_ids, attention_mask=attention_mask)[0]
        # NOTE(review): indices 0 and 2 are presumably the contradiction and
        # entailment labels of an NLI head -- confirm against the model config.
        entail_contradiction_logits = logits[:, [0, 2]]
        probs = entail_contradiction_logits.softmax(dim=1)
        prob_label_is_true = [float(p[1]) for p in probs]

        return hypotheses[prob_label_is_true.index(max(prob_label_is_true))]
|
ruaccent/ruaccent.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pathlib
|
3 |
+
from huggingface_hub import snapshot_download
|
4 |
+
import os
|
5 |
+
from os.path import join as join_path
|
6 |
+
from .omograph_model import OmographModel
|
7 |
+
from .accent_model import AccentModel
|
8 |
+
import re
|
9 |
+
|
10 |
+
|
11 |
+
class RUAccent:
    """Adds stress marks (``+``) and restores ``ё`` in Russian text.

    Combines JSON dictionaries (``yo_words``, ``omographs``, accents) with two
    models: one that chooses between omograph variants and one that places the
    stress in words the accent dictionary does not cover.
    """

    def __init__(self, workdir=None):
        """Create the models; *workdir* defaults to this file's directory."""
        self.omograph_model = OmographModel()
        self.accent_model = AccentModel()
        if not workdir:
            self.workdir = str(pathlib.Path(__file__).resolve().parent)
        else:
            self.workdir = workdir

    def _read_json(self, relative_path):
        """Load a UTF-8 JSON file located below ``workdir``."""
        with open(join_path(self.workdir, relative_path), encoding='utf-8') as handle:
            return json.load(handle)

    def load(
        self,
        omograph_model_size="medium",
        dict_load_startup=False,
        disable_accent_dict=False,
        repo="TeraTTS/accentuator",
    ):
        """Download resources if needed, then load dictionaries and models.

        Args:
            omograph_model_size: ``"small"`` or ``"medium"``.
            dict_load_startup: Load the whole accent dictionary now instead of
                per-letter files on every ``process_accent`` call.
            disable_accent_dict: Use an empty accent dictionary.
            repo: Hugging Face repo id the resources are downloaded from.

        Raises:
            NotImplementedError: If *omograph_model_size* is not supported.
        """
        # Validate before any download or file I/O so a bad argument fails fast.
        if omograph_model_size not in ["small", "medium"]:
            raise NotImplementedError

        if not os.path.exists(
            join_path(self.workdir, "dictionary")
        ) or not os.path.exists(join_path(self.workdir, "nn")):
            snapshot_download(
                repo_id=repo,
                ignore_patterns=["*.md", "*.gitattributes"],
                local_dir=self.workdir,
                local_dir_use_symlinks=False,
            )

        self.omographs = self._read_json("dictionary/omographs.json")
        self.yo_words = self._read_json("dictionary/yo_words.json")
        self.dict_load_startup = dict_load_startup

        if dict_load_startup:
            self.accents = self._read_json("dictionary/accents.json")
        if disable_accent_dict:
            self.accents = {}
            self.disable_accent_dict = True
        else:
            self.disable_accent_dict = False

        self.omograph_model.load(
            join_path(self.workdir, f"nn/nn_omograph/{omograph_model_size}/")
        )
        self.accent_model.load(join_path(self.workdir, "nn/nn_accent/"))

    def split_by_words(self, string):
        """Lower-case *string* and split it into word and punctuation tokens.

        Words may contain ``+`` stress marks; whitespace is dropped.
        """
        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        # The first alternative can match the empty string; discard those.
        return [res for res in result if res]

    def extract_initial_letters(self, text):
        """Return the first character of every eligible token in *text*.

        A token is eligible when it is longer than two characters, has no
        ``+`` and contains no ASCII letters.
        """
        return [
            word[0]
            for word in text
            if len(word) > 2 and '+' not in word and not bool(re.search('[a-zA-Z]', word))
        ]

    def load_dict(self, text):
        """Merge the per-letter accent dictionaries needed for tokens *text*.

        Initial characters without a dictionary file (digits, punctuation runs
        such as ``...``) are skipped instead of raising ``FileNotFoundError``.
        """
        out_dict = {}
        # dict.fromkeys removes repeated letters while keeping first-seen
        # order, so each per-letter file is read at most once.
        for char in dict.fromkeys(self.extract_initial_letters(text)):
            try:
                out_dict.update(
                    self._read_json(f"dictionary/letter_accent/{char}.json")
                )
            except FileNotFoundError:
                continue
        return out_dict

    def count_vowels(self, text):
        """Return the number of Russian vowel letters in *text*."""
        vowels = "аеёиоуыэюяАЕЁИОУЫЭЮЯ"
        return sum(1 for char in text if char in vowels)

    def has_punctuation(self, text):
        """Return True if *text* contains any ASCII punctuation character.

        ``+`` is part of the set, so words that already carry a stress mark
        count as having punctuation.
        """
        return any(char in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" for char in text)

    def delete_spaces_before_punc(self, text):
        """Remove the single space preceding each punctuation character.

        ``+`` and ``-`` are deliberately absent from the set used here.
        """
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text

    def process_yo(self, text):
        """Replace tokens found in ``yo_words``; mutates and returns *text*."""
        splitted_text = text

        for i, word in enumerate(splitted_text):
            splitted_text[i] = self.yo_words.get(word, word)
        return splitted_text

    def process_omographs(self, text):
        """Resolve omographs with the classifier; mutates and returns *text*.

        Each omograph is wrapped in ``<w>...</w>`` inside the joined sentence
        and replaced by the variant the model selects, one after another, so
        later omographs are classified against already-resolved earlier ones.
        """
        splitted_text = text

        founded_omographs = []
        for i, word in enumerate(splitted_text):
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append(
                    {"word": word, "variants": variants, "position": i}
                )
        for omograph in founded_omographs:
            position = omograph["position"]
            splitted_text[position] = f"<w>{splitted_text[position]}</w>"
            cls = self.omograph_model.classify(
                " ".join(splitted_text), omograph["variants"]
            )
            splitted_text[position] = cls
        return splitted_text

    def process_accent(self, text):
        """Add stress marks to the tokens; mutates and returns *text*.

        Tokens found in the accent dictionary take its value. Tokens that are
        not found, contain no punctuation and have more than one vowel are
        passed to the accent model; everything else is left unchanged.
        """
        if not self.dict_load_startup and not self.disable_accent_dict:
            self.accents = self.load_dict(text)

        splitted_text = text

        for i, word in enumerate(splitted_text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word and not self.has_punctuation(word) and self.count_vowels(word) > 1:
                splitted_text[i] = self.accent_model.put_accent(word)
            else:
                splitted_text[i] = stressed_word
        return splitted_text

    def process_all(self, text):
        """Run the full pipeline on the string *text* and return a string."""
        text = self.split_by_words(text)
        processed_text = self.process_yo(text)
        processed_text = self.process_omographs(processed_text)
        processed_text = self.process_accent(processed_text)
        processed_text = " ".join(processed_text)
        processed_text = self.delete_spaces_before_punc(processed_text)
        return processed_text
|
tokenizer/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .gruut import Tokenizer as TokenizerGRUUT
|
2 |
+
from .g2p import Tokenizer as TokenizerG2P
|
tokenizer/g2p/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .tokenizer import Tokenizer
|
tokenizer/g2p/g2p.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Letters that make a preceding soft/hard consonant soft (see pallatize).
softletters = set("яёюиье")
# Symbols after which я/ю/е/ё get a leading "j" (see convert_vowels).
startsyl = set("#ъьаяоёуюэеиы-")
# Symbols removed from the final phoneme list (see convert).
others = {"#", "+", "-", "ь", "ъ"}

# Consonants with a soft and a hard form; the soft form appends "j".
softhard_cons = {
    "б": "b",
    "в": "v",
    "г": "g",
    "Г": "g",
    "д": "d",
    "з": "z",
    "к": "k",
    "л": "l",
    "м": "m",
    "н": "n",
    "п": "p",
    "р": "r",
    "с": "s",
    "т": "t",
    "ф": "f",
    "х": "h",
}

# Consonants mapped to a single fixed phoneme.
other_cons = {
    "ж": "zh",
    "ц": "c",
    "ч": "ch",
    "ш": "sh",
    "щ": "sch",
    "й": "j",
}

# Vowel letter -> base phoneme; a stress digit is appended in convert_vowels.
vowels = {
    "а": "a",
    "я": "a",
    "у": "u",
    "ю": "u",
    "о": "o",
    "ё": "o",
    "э": "e",
    "е": "e",
    "и": "i",
    "ы": "y",
}
|
46 |
+
|
47 |
+
def pallatize(phones):
    """Replace consonant letters in *phones* with phonemes, in place.

    *phones* is a list of ``(symbol, stress)`` tuples. A consonant from
    ``softhard_cons`` gets a ``"j"`` suffix when the next symbol is in
    ``softletters``; a consonant from ``other_cons`` is mapped directly.
    Replaced entries always carry stress 0. The last entry is never changed.
    Returns None.
    """
    for idx in range(len(phones) - 1):
        letter = phones[idx][0]
        if letter in softhard_cons:
            # The following entry has not been rewritten yet, so it still
            # holds the original letter.
            suffix = "j" if phones[idx + 1][0] in softletters else ""
            phones[idx] = (softhard_cons[letter] + suffix, 0)
        if letter in other_cons:
            phones[idx] = (other_cons[letter], 0)
|
56 |
+
|
57 |
+
def convert_vowels(phones):
    """Turn ``(symbol, stress)`` tuples into a flat list of phoneme strings.

    Vowel letters become their ``vowels`` phoneme followed by the stress
    value; every other symbol is copied as is. A ``"j"`` is emitted before
    я/ю/е/ё when the previous symbol is in ``startsyl``.
    """
    iotated = set("яюеё")
    result = []
    previous = ""
    for phone in phones:
        symbol = phone[0]
        if previous in startsyl and symbol in iotated:
            result.append("j")
        if symbol in vowels:
            result.append(vowels[symbol] + str(phone[1]))
        else:
            result.append(symbol)
        previous = symbol

    return result
|
71 |
+
|
72 |
+
def convert(stressword):
    """Convert a word with ``+`` stress marks into a phoneme string.

    The word is wrapped in ``#`` boundary markers, each ``+`` marks the next
    symbol as stressed, consonants and vowels are converted, the symbols in
    ``others`` are dropped and the rest is joined with spaces.
    """
    # Pair every symbol with a stress flag; "+" itself is consumed and sets
    # the flag for the symbol that follows it.
    marked = []
    pending_stress = 0
    for symbol in "#" + stressword + "#":
        if symbol == "+":
            pending_stress = 1
            continue
        marked.append((symbol, pending_stress))
        pending_stress = 0

    # Consonants are rewritten in place first, then vowels are converted.
    pallatize(marked)
    converted = convert_vowels(marked)

    return " ".join(phone for phone in converted if phone not in others)
|
tokenizer/g2p/tokenizer.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from .g2p import *
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
|
6 |
+
class Tokenizer():
    """Grapheme-to-phoneme tokenizer backed by a pronunciation dictionary.

    Words found in ``dictionary.txt`` use the listed phonemes; all other words
    go through the rule-based ``convert`` function. Phonemes are mapped to ids
    with the ``phoneme_id_map`` from ``config.json``.
    """

    def __init__(self, data_path: str) -> None:
        """Load ``dictionary.txt`` and ``config.json`` from *data_path*."""
        self.dic = {}
        # An explicit encoding keeps the Cyrillic dictionary readable where
        # the locale default is not UTF-8; the context manager closes the file.
        with open(os.path.join(data_path, "dictionary.txt"), encoding="utf-8") as dictionary:
            for line in dictionary:
                items = line.split()
                if not items:
                    # Blank lines carry no entry.
                    continue
                self.dic[items[0]] = " ".join(items[1:])

        with open(os.path.join(data_path, "config.json"), encoding="utf-8") as config_file:
            self.config = json.load(config_file)

    def g2p(self, text):
        """Convert *text* to phoneme ids and phonemes.

        Returns:
            ``(phoneme_ids, phonemes)``. The id list starts with the ids of
            ``"^"`` and ``"_"``, follows every known phoneme with the ids of
            ``"_"`` and ends with the ids of ``"$"``. Phonemes missing from
            the id map are left out of the id list.
        """
        text = re.sub("—", "-", text)
        # Inside the character class ",-." is a range from "," to ".", which
        # covers exactly ",", "-" and ".".
        text = re.sub("([!'(),-.:;?])", r' \1 ', text)

        phonemes = []
        for word in text.split():
            if re.match("[!'(),-.:;?]", word):
                phonemes.append(word)
                continue

            word = word.lower()
            # Separate words with a space phoneme, except before the first.
            if len(phonemes) > 0:
                phonemes.append(' ')

            if word in self.dic:
                phonemes.extend(self.dic[word].split())
            else:
                phonemes.extend(convert(word).split())

        phoneme_id_map = self.config["phoneme_id_map"]
        phoneme_ids = []
        phoneme_ids.extend(phoneme_id_map["^"])
        phoneme_ids.extend(phoneme_id_map["_"])
        for p in phonemes:
            if p in phoneme_id_map:
                phoneme_ids.extend(phoneme_id_map[p])
                phoneme_ids.extend(phoneme_id_map["_"])
        phoneme_ids.extend(phoneme_id_map["$"])

        return phoneme_ids, phonemes

    def _get_seq(self, text: str) -> list[int]:
        """Return only the phoneme ids for *text*."""
        return self.g2p(text)[0]
|
tokenizer/gruut/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .tokenizer import Tokenizer
|
tokenizer/gruut/tokenizer.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from gruut import sentences
|
2 |
+
import os
|
3 |
+
import re
|
4 |
+
|
5 |
+
class Tokenizer():
    """Tokenizer that phonemizes Russian text with gruut and maps every
    phoneme character to the index of its entry in ``vocab.txt``."""

    def __init__(self, path) -> None:
        """Load the symbol table from ``vocab.txt`` in *path*.

        The file holds one integer code point per line; the position of a line
        is the id of the corresponding character.
        """
        with open(os.path.join(path, "vocab.txt"), "r", encoding="utf-8") as vocab_file:
            # split() ignores the trailing newline (and any blank line), which
            # would otherwise reach int('') and raise ValueError.
            code_points = vocab_file.read().split()
        self.symbols = [chr(int(code_point)) for code_point in code_points]

        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}

    def _ru_phonems(self, text: str) -> str:
        """Return the phoneme string gruut produces for lower-cased *text*.

        Phonemes of all words are concatenated, whitespace runs are collapsed
        to one space and the result is stripped.
        """
        text = text.lower()
        parts = []
        for sent in sentences(text, lang="ru"):
            for word in sent:
                if word.phonemes:
                    parts.append("".join(word.phonemes))
        # One join instead of repeated string concatenation.
        return re.sub(r'\s+', ' ', "".join(parts)).strip()

    def _text_to_sequence(self, text: str) -> list[int]:
        """Convert *text* to a list of symbol ids.

        Characters missing from the symbol table are skipped, matching the G2P
        tokenizer, which also leaves out phonemes absent from its id map.
        """
        clean_text = self._ru_phonems(text)
        return [
            self.symbol_to_id[symbol]
            for symbol in clean_text
            if symbol in self.symbol_to_id
        ]

    def _get_seq(self, text: str) -> list[int]:
        """Return the symbol id sequence for *text*."""
        return self._text_to_sequence(text)
|