Add TeraTTS and remove pyttsx3
- .gitignore +1 -1
- app.py +30 -97
- infer_onnx.py +90 -0
- packages.txt +1 -2
- requirements.txt +9 -1
- tokenizer/__init__.py +2 -0
- tokenizer/g2p/__init__.py +1 -0
- tokenizer/g2p/g2p.py +94 -0
- tokenizer/g2p/tokenizer.py +48 -0
- tokenizer/gruut/__init__.py +1 -0
- tokenizer/gruut/tokenizer.py +37 -0
.gitignore
CHANGED
@@ -151,7 +151,7 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-
+model/
 results/
 checkpoints/
 gradio_cached_examples/
app.py
CHANGED
@@ -1,12 +1,31 @@
 import os
 import uuid
-import tempfile
-import pyttsx3
 import gradio as gr
 from src.gradio_demo import SadTalker
-
+from infer_onnx import TTS
 from huggingface_hub import snapshot_download
 
+
+# List of TTS models to choose from
+models = ["TeraTTS/natasha-g2p-vits", "TeraTTS/glados2-g2p-vits", "TeraTTS/glados-g2p-vits", "TeraTTS/girl_nice-g2p-vits"]
+
+# Create a dictionary of the models and initialize them
+models = {k: TTS(k) for k in models}
+
+# Function for speech synthesis
+def text_to_speech(model_name, length_scale, text):
+    time_tag = str(uuid.uuid4())
+    save_dir = './results/voice_input'
+    os.makedirs(save_dir, exist_ok=True)
+    file_name = os.path.join(save_dir, os.path.basename(time_tag + '.wav'))
+
+    open(file_name, "wb").close()
+
+    audio = models[model_name](text, length_scale=length_scale)
+    models[model_name].save_wav(audio, file_name, sample_rate=models[model_name].config["samplerate"])
+
+    return file_name
+
 def get_source_image(image):
     return image
 
@@ -33,92 +52,11 @@ def download_model():
     REPO_ID = 'vinthony/SadTalker-V002rc'
     snapshot_download(REPO_ID)
 
-# language : en_US, de_DE, ...
-# gender : VoiceGenderFemale, VoiceGenderMale
-def change_voice(engine, language='ru_ru', gender='male'):
-
-    selected_voices = []
-
-    language = language.lower() if language else ''
-    gender = gender.lower() if gender else ''
-
-    for voice in engine.getProperty('voices'):
-        voice_appended = False
-
-        for lang in voice.languages:
-
-            lang_str = str(lang, 'utf-8')
-            print("lang", lang_str)
-
-            if lang_str and language in lang_str.lower():
-                selected_voices.append(voice)
-                print("voice appended by lang", voice, lang_str)
-                voice_appended = True
-                break
-
-        if voice_appended:
-            continue
-
-        if voice.id and language in voice.id.lower():
-            selected_voices.append(voice)
-            print("voice appended by id", voice.id)
-            continue
-
-        if voice.name and language in voice.name.lower():
-            selected_voices.append(voice)
-            print("voice appended by name", voice.name)
-            continue
-
-    for voice in selected_voices:
-        if voice.gender and gender in voice.gender.lower():
-            engine.setProperty('voice', voice.id)
-            print("voice selected by gender", voice.gender)
-            return True
-
-        if voice.id and gender in voice.id.lower():
-            engine.setProperty('voice', voice.id)
-            print("voice selected by id", voice.id)
-            return True
-        if voice.name and gender in voice.name.lower():
-            engine.setProperty('voice', voice.id)
-            print("voice selected by name", voice.name)
-            return True
-
-    if len(selected_voices) > 0:
-        engine.setProperty('voice', selected_voices[0].id)
-        print("voice selected by default", selected_voices[0].id)
-        return True
-
-    return False
-
-def play_text_to_speech(text_input, voice_option):
-    engine = pyttsx3.init()
-
-    change_voice(engine, 'ru', voice_option)
-
-    print("text_input", text_input)
-    print("voice_option", voice_option)
-
-    time_tag = str(uuid.uuid4())
-    save_dir = './results/voice_input'
-    os.makedirs(save_dir, exist_ok=True)
-    file_name = os.path.join(save_dir, os.path.basename(time_tag + '.wav'))
-
-    open(file_name, "wb").close()
-    engine.say(text_input)
-    engine.save_to_file(text_input, file_name)
-    engine.runAndWait()
-
-    print("file saved to", file_name)
-
-    return file_name
-
 def sadtalker_demo():
 
     download_model()
 
     sad_talker = SadTalker(lazy_load=True)
-    # tts_talker = TTSTalker()
 
     with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
         with gr.Row():
@@ -131,6 +69,12 @@ def sadtalker_demo():
 
                 with gr.Tabs(elem_id="sadtalker_driven_audio"):
                     with gr.TabItem('Driving Methods'):
+                        with gr.Row():
+                            model_choice = gr.Dropdown(choices=list(models.keys()), value="TeraTTS/natasha-g2p-vits", label="Choose TTS model")
+                        with gr.Row():
+                            length_scale = gr.Slider(minimum=0.1, maximum=2.0, label="Length scale (increase length of sound) Default: 1.2", value=1.2)
+                        with gr.Row():
+                            input_text = gr.Textbox(label="Enter text")
                         with gr.Row():
                             driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
                             driven_audio_no = gr.Audio(label="Use IDLE mode, no audio is required", source="upload", type="filepath", visible=False)
@@ -139,15 +83,10 @@ def sadtalker_demo():
                            use_idle_mode = gr.Checkbox(label="Use Idle Animation", visible=False)
                            length_of_audio = gr.Number(value=5, label="The length(seconds) of the generated video.")
                            use_idle_mode.change(toggle_audio_file, inputs=use_idle_mode, outputs=[driven_audio, driven_audio_no]) # todo
-                        with gr.Row():
-                            text_input = gr.Textbox(label="Enter text", multiline=True)
-                            voice_option = gr.Radio(['Male', 'Female'], label='Voice Option', value='Female')
                         with gr.Row():
                             play_button = gr.Button('Text To Speech', variant='primary')
                             play_button.click(
-                                fn=play_text_to_speech,
-                                inputs=[text_input, voice_option],
-                                outputs=[driven_audio]
+                                fn=text_to_speech, inputs=[model_choice, length_scale, input_text], outputs=[driven_audio]
                             )
                         with gr.Row():
                             ref_video = gr.Video(label="Reference Video", source="upload", type="filepath", elem_id="vidref")
@@ -158,13 +97,10 @@ def sadtalker_demo():
 
                        ref_video.change(ref_video_fn, inputs=ref_video, outputs=[use_ref_video]) # todo
 
-
                with gr.Column(variant='panel'):
                    with gr.Tabs(elem_id="sadtalker_checkbox"):
                        with gr.TabItem('Settings'):
                            with gr.Column(variant='panel'):
-                                # width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
-                                # height = gr.Slider(minimum=64, elem_id="img2img_height", maximum=2048, step=8, label="Manually Crop Height", value=512) # img2img_width
                                with gr.Row():
                                    pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0) #
                                    exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1) #
@@ -187,8 +123,6 @@ def sadtalker_demo():
                with gr.Tabs(elem_id="sadtalker_genearted"):
                    gen_video = gr.Video(label="Generated video", format="mp4")
 
-
-
        submit.click(
            fn=sad_talker.test,
            inputs=[source_image,
@@ -213,7 +147,6 @@ def sadtalker_demo():
 
    return sadtalker_interface
 
-
 if __name__ == "__main__":
 
    demo = sadtalker_demo()
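For context, the replacement pipeline is: text_to_speech synthesizes with the selected TeraTTS model, writes the wav under ./results/voice_input, and returns the path that Gradio loads into the driven_audio component. A minimal sketch of calling the new handler directly (my example, not part of the commit; assumes the models above initialized successfully):

# Hypothetical smoke test for the text_to_speech handler defined in app.py.
wav_path = text_to_speech("TeraTTS/natasha-g2p-vits", 1.2, "Привет, мир!")
print(wav_path)  # ./results/voice_input/<uuid4>.wav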
infer_onnx.py
ADDED
@@ -0,0 +1,90 @@
+import scipy.io.wavfile
+import os
+import onnxruntime
+import numpy as np
+from huggingface_hub import snapshot_download
+from num2words import num2words
+import re
+from transliterate import translit
+import json
+
+class TTS:
+    def __init__(self, model_name: str, save_path: str = "./model", add_time_to_end: float = 0.8) -> None:
+        if not os.path.exists(save_path):
+            os.mkdir(save_path)
+
+        model_dir = os.path.join(save_path, model_name)
+
+        if not os.path.exists(model_dir):
+            snapshot_download(repo_id=model_name,
+                              allow_patterns=["*.txt", "*.onnx", "*.json"],
+                              local_dir=model_dir,
+                              local_dir_use_symlinks=False
+                              )
+
+        self.model = onnxruntime.InferenceSession(os.path.join(model_dir, "exported/model.onnx"), providers=['CPUExecutionProvider'])
+        with open(os.path.join(model_dir, "exported/config.json")) as config_file:
+            self.config = json.load(config_file)["model_config"]
+
+        if os.path.exists(os.path.join(model_dir, "exported/dictionary.txt")):
+            from tokenizer import TokenizerG2P
+            print("Use g2p")
+            self.tokenizer = TokenizerG2P(os.path.join(model_dir, "exported"))
+
+        else:
+            from tokenizer import TokenizerGRUUT
+            print("Use gruut")
+            self.tokenizer = TokenizerGRUUT(os.path.join(model_dir, "exported"))
+
+        self.add_time_to_end = add_time_to_end
+
+
+    def _add_silent(self, audio, silence_duration: float = 1.0, sample_rate: int = 22050):
+        num_samples_silence = int(sample_rate * silence_duration)
+        silence_array = np.zeros(num_samples_silence, dtype=np.float32)
+        audio_with_silence = np.concatenate((audio, silence_array), axis=0)
+        return audio_with_silence
+
+
+    def save_wav(self, audio, path: str, sample_rate: int = 22050):
+        '''save audio to wav'''
+        scipy.io.wavfile.write(path, sample_rate, audio)
+
+
+    def _intersperse(self, lst, item):
+        result = [item] * (len(lst) * 2 + 1)
+        result[1::2] = lst
+        return result
+
+    def _get_seq(self, text):
+        phoneme_ids = self.tokenizer._get_seq(text)
+        phoneme_ids_inter = self._intersperse(phoneme_ids, 0)
+        return phoneme_ids_inter
+
+    def _num2wordsshor(self, match):
+        match = match.group()
+        ret = num2words(match, lang='ru')
+        return ret
+
+    def __call__(self, text: str, length_scale=1.2):
+        text = translit(text, 'ru')
+        text = re.sub(r'\d+', self._num2wordsshor, text)
+        phoneme_ids = self._get_seq(text)
+        text = np.expand_dims(np.array(phoneme_ids, dtype=np.int64), 0)
+        text_lengths = np.array([text.shape[1]], dtype=np.int64)
+        scales = np.array(
+            [0.667, length_scale, 0.8],
+            dtype=np.float32,
+        )
+        audio = self.model.run(
+            None,
+            {
+                "input": text,
+                "input_lengths": text_lengths,
+                "scales": scales,
+                "sid": None,
+            },
+        )[0][0,0][0]
+
+        audio = self._add_silent(audio, silence_duration=self.add_time_to_end, sample_rate=self.config["samplerate"])
+        return audio
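The TTS wrapper is self-contained: it downloads the ONNX export on first use, picks a tokenizer based on whether the export ships a dictionary.txt, expands digits through num2words, and pads the output with add_time_to_end seconds of silence. A minimal usage sketch (my example, not part of the commit; assumes network access for the first snapshot_download):

from infer_onnx import TTS

tts = TTS("TeraTTS/natasha-g2p-vits")               # fetches *.onnx/*.json/*.txt into ./model on first run
audio = tts("Привет, мир! 123", length_scale=1.2)   # "123" is expanded to words via num2words
tts.save_wav(audio, "out.wav", sample_rate=tts.config["samplerate"])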
packages.txt
CHANGED
@@ -1,3 +1,2 @@
 ffmpeg
 libsndfile1
-libespeak1
requirements.txt
CHANGED
@@ -22,4 +22,12 @@ dlib-bin
 gfpgan
 av
 safetensors
-
+gruut
+gruut-lang-ru
+onnxruntime
+huggingface-hub==0.15.1
+transformers
+sentencepiece
+ruaccent
+transliterate
+num2words
tokenizer/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .gruut import Tokenizer as TokenizerGRUUT
+from .g2p import Tokenizer as TokenizerG2P
tokenizer/g2p/__init__.py
ADDED
@@ -0,0 +1 @@
+from .tokenizer import Tokenizer
tokenizer/g2p/g2p.py
ADDED
@@ -0,0 +1,94 @@
+
+softletters=set(u"яёюиье")
+startsyl=set(u"#ъьаяоёуюэеиы-")
+others = set(["#", "+", "-", u"ь", u"ъ"])
+
+softhard_cons = {
+    u"б" : u"b",
+    u"в" : u"v",
+    u"г" : u"g",
+    u"Г" : u"g",
+    u"д" : u"d",
+    u"з" : u"z",
+    u"к" : u"k",
+    u"л" : u"l",
+    u"м" : u"m",
+    u"н" : u"n",
+    u"п" : u"p",
+    u"р" : u"r",
+    u"с" : u"s",
+    u"т" : u"t",
+    u"ф" : u"f",
+    u"х" : u"h"
+}
+
+other_cons = {
+    u"ж" : u"zh",
+    u"ц" : u"c",
+    u"ч" : u"ch",
+    u"ш" : u"sh",
+    u"щ" : u"sch",
+    u"й" : u"j"
+}
+
+vowels = {
+    u"а" : u"a",
+    u"я" : u"a",
+    u"у" : u"u",
+    u"ю" : u"u",
+    u"о" : u"o",
+    u"ё" : u"o",
+    u"э" : u"e",
+    u"е" : u"e",
+    u"и" : u"i",
+    u"ы" : u"y",
+}
+
+def pallatize(phones):
+    for i, phone in enumerate(phones[:-1]):
+        if phone[0] in softhard_cons:
+            if phones[i+1][0] in softletters:
+                phones[i] = (softhard_cons[phone[0]] + "j", 0)
+            else:
+                phones[i] = (softhard_cons[phone[0]], 0)
+        if phone[0] in other_cons:
+            phones[i] = (other_cons[phone[0]], 0)
+
+def convert_vowels(phones):
+    new_phones = []
+    prev = ""
+    for phone in phones:
+        if prev in startsyl:
+            if phone[0] in set(u"яюеё"):
+                new_phones.append("j")
+        if phone[0] in vowels:
+            new_phones.append(vowels[phone[0]] + str(phone[1]))
+        else:
+            new_phones.append(phone[0])
+        prev = phone[0]
+
+    return new_phones
+
+def convert(stressword):
+    phones = ("#" + stressword + "#")
+
+
+    # Assign stress marks
+    stress_phones = []
+    stress = 0
+    for phone in phones:
+        if phone == "+":
+            stress = 1
+        else:
+            stress_phones.append((phone, stress))
+            stress = 0
+
+    # Pallatize
+    pallatize(stress_phones)
+
+    # Assign stress
+    phones = convert_vowels(stress_phones)
+
+    # Filter
+    phones = [x for x in phones if x not in others]
+    return " ".join(phones)
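convert() takes a lowercase word with '+' marking the stressed vowel and emits space-separated phonemes: consonants are latinized (gaining a "j" suffix when palatalized before one of "яёюиье"), and each vowel carries a 0/1 stress flag. A quick illustration (my example, not part of the commit):

from tokenizer.g2p.g2p import convert

print(convert("прив+ет"))  # "p rj i0 vj e1 t" — 'р' and 'в' palatalized, 'е' stressed
print(convert("молок+о"))  # "m o0 l o0 k o1" — the final 'о' carries the stress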
tokenizer/g2p/tokenizer.py
ADDED
@@ -0,0 +1,48 @@
+from __future__ import annotations
+import re
+from .g2p import *
+import json
+import os
+
+class Tokenizer():
+    def __init__(self, data_path: str) -> None:
+        self.dic = {}
+        for line in open(os.path.join(data_path, "dictionary.txt")):
+            items = line.split()
+            self.dic[items[0]] = " ".join(items[1:])
+
+        self.config = json.load(open(os.path.join(data_path, "config.json")))
+
+    def g2p(self, text):
+        text = re.sub("—", "-", text)
+        text = re.sub("([!'(),-.:;?])", r' \1 ', text)
+
+        phonemes = []
+        for word in text.split():
+            if re.match("[!'(),-.:;?]", word):
+                phonemes.append(word)
+                continue
+
+            word = word.lower()
+            if len(phonemes) > 0: phonemes.append(' ')
+
+            if word in self.dic:
+                phonemes.extend(self.dic[word].split())
+            else:
+                phonemes.extend(convert(word).split())
+
+        phoneme_id_map = self.config["phoneme_id_map"]
+        phoneme_ids = []
+        phoneme_ids.extend(phoneme_id_map["^"])
+        phoneme_ids.extend(phoneme_id_map["_"])
+        for p in phonemes:
+            if p in phoneme_id_map:
+                phoneme_ids.extend(phoneme_id_map[p])
+                phoneme_ids.extend(phoneme_id_map["_"])
+        phoneme_ids.extend(phoneme_id_map["$"])
+
+        return phoneme_ids, phonemes
+
+    def _get_seq(self, text: str) -> list[int]:
+        seq = self.g2p(text)[0]
+        return seq
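This tokenizer first consults the exported dictionary.txt (word → phonemes) and only falls back to the rule-based convert() for out-of-vocabulary words; the id sequence is framed by the "^"/"$" markers with "_" separators taken from phoneme_id_map in config.json. A sketch (my example; the path assumes the ./model layout that infer_onnx.py downloads into):

from tokenizer.g2p import Tokenizer

tok = Tokenizer("./model/TeraTTS/natasha-g2p-vits/exported")
ids, phonemes = tok.g2p("Привет, мир!")   # punctuation is kept as standalone tokens
seq = tok._get_seq("Привет, мир!")        # ids only, as consumed by TTS._get_seq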
tokenizer/gruut/__init__.py
ADDED
@@ -0,0 +1 @@
+from .tokenizer import Tokenizer
tokenizer/gruut/tokenizer.py
ADDED
@@ -0,0 +1,37 @@
+from __future__ import annotations
+from gruut import sentences
+import os
+import re
+
+class Tokenizer():
+    def __init__(self, path) -> None:
+        with open(os.path.join(path, "vocab.txt"), "r", encoding="utf-8") as vocab_file:
+            self.symbols = vocab_file.read().split("\n")
+            self.symbols = list(map(chr, list(map(int, self.symbols))))
+
+        self.symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
+
+    def _ru_phonems(self, text: str) -> str:
+        text = text.lower()
+        phonemes = ""
+        for sent in sentences(text, lang="ru"):
+            for word in sent:
+                if word.phonemes:
+                    phonemes += "".join(word.phonemes)
+        phonemes = re.sub(re.compile(r'\s+'), ' ', phonemes).lstrip().rstrip()
+        return phonemes
+
+
+    def _text_to_sequence(self, text: str) -> list[int]:
+        '''convert text to seq'''
+        sequence = []
+        clean_text = self._ru_phonems(text)
+        for symbol in clean_text:
+            symbol_id = self.symbol_to_id[symbol]
+            sequence += [symbol_id]
+        return sequence
+
+
+    def _get_seq(self, text: str) -> list[int]:
+        seq = self._text_to_sequence(text)
+        return seq
|