Spaces:
Running
Running
Yurii Paniv
committed on
Commit
•
a163565
1
Parent(s):
7af6095
Add version 3.0.0
Browse files
- README.md +6 -3
- app.py +21 -15
- config.json +20 -2
README.md
CHANGED
@@ -18,8 +18,10 @@ Link to source code and models -> [https://github.com/robinhad/ukrainian-tts](ht
|
|
18 |
|
19 |
Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
|
20 |
# Support
|
21 |
-
If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
|
22 |
-
|
|
|
|
|
23 |
|
24 |
`Mykyta (male)`:
|
25 |
|
@@ -53,7 +55,8 @@ tts-server --model_path path/to/model.pth \
|
|
53 |
# Attribution 🤝
|
54 |
|
55 |
- Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
|
56 |
-
- Mykyta, Olena
|
|
|
57 |
- Silence cutting using [HMM-GMM](https://github.com/proger/uk) - [Volodymyr Kyrylov @proger](https://github.com/proger)
|
58 |
- Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
|
59 |
- Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
|
|
|
18 |
|
19 |
Code is licensed under `MIT License`, models are under `GNU GPL v3 License`.
|
20 |
# Support
|
21 |
+
If you like my work, please support -> [https://send.monobank.ua/jar/48iHq4xAXm](https://send.monobank.ua/jar/48iHq4xAXm)
|
22 |
+
For collaboration and questions, please contact me here: [Telegram https://t.me/robinhad](https://t.me/robinhad) [Twitter https://twitter.com/robinhad](https://twitter.com/robinhad)
|
23 |
+
You're welcome to join the UA Speech Recognition and Synthesis community: [Telegram https://t.me/speech_recognition_uk](https://t.me/speech_recognition_uk)
|
24 |
+
# Examples
|
25 |
|
26 |
`Mykyta (male)`:
|
27 |
|
|
|
55 |
# Attribution 🤝
|
56 |
|
57 |
- Model training - [Yurii Paniv @robinhad](https://github.com/robinhad)
|
58 |
+
- Mykyta, Olena, Lada, Dmytro, Olha dataset - [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
|
59 |
+
- Dmytro voice - [Dmytro Chaplynskyi @dchaplinsky](https://github.com/dchaplinsky)
|
60 |
- Silence cutting using [HMM-GMM](https://github.com/proger/uk) - [Volodymyr Kyrylov @proger](https://github.com/proger)
|
61 |
- Autostress (with dictionary) using [ukrainian-word-stress](https://github.com/lang-uk/ukrainian-word-stress) - [Oleksiy Syvokon @asivokon](https://github.com/asivokon)
|
62 |
- Autostress (with model) using [ukrainian-accentor](https://github.com/egorsmkv/ukrainian-accentor) - [Bohdan Mykhailenko @NeonBohdan](https://github.com/NeonBohdan) + [Yehor Smoliakov @egorsmkv](https://github.com/egorsmkv)
|
app.py
CHANGED
@@ -20,6 +20,8 @@ class VoiceOption(Enum):
|
|
20 |
Olena = "Олена (жіночий) 👩"
|
21 |
Mykyta = "Микита (чоловічий) 👨"
|
22 |
Lada = "Лада (жіночий) 👩"
|
|
|
|
|
23 |
|
24 |
|
25 |
def download(url, file_name):
|
@@ -33,7 +35,7 @@ def download(url, file_name):
|
|
33 |
|
34 |
|
35 |
print("downloading uk/mykyta/vits-tts")
|
36 |
-
release_number = "v3.0.0
|
37 |
model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
|
38 |
config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
|
39 |
speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
|
@@ -71,11 +73,14 @@ def tts(text: str, voice: str, stress: str):
|
|
71 |
autostress_with_model = (
|
72 |
True if stress == StressOption.AutomaticStressWithModel.value else False
|
73 |
)
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
79 |
text = preprocess_text(text, autostress_with_model)
|
80 |
text_limit = 7200
|
81 |
text = (
|
@@ -98,23 +103,24 @@ with open("README.md") as file:
|
|
98 |
iface = gr.Interface(
|
99 |
fn=tts,
|
100 |
inputs=[
|
101 |
-
gr.
|
102 |
label="Input",
|
103 |
-
|
104 |
),
|
105 |
-
gr.
|
106 |
label="Голос",
|
107 |
choices=[option.value for option in VoiceOption],
|
108 |
-
|
109 |
),
|
110 |
-
gr.
|
111 |
label="Наголоси",
|
112 |
choices=[option.value for option in StressOption],
|
|
|
113 |
),
|
114 |
],
|
115 |
outputs=[
|
116 |
-
gr.
|
117 |
-
gr.
|
118 |
],
|
119 |
title="🐸💬🇺🇦 - Coqui TTS",
|
120 |
description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
|
@@ -132,12 +138,12 @@ iface = gr.Interface(
|
|
132 |
],
|
133 |
[
|
134 |
"Вв+едіть, будь ласка, св+оє реч+ення.",
|
135 |
-
VoiceOption.
|
136 |
StressOption.AutomaticStress.value,
|
137 |
],
|
138 |
[
|
139 |
"Привіт, як тебе звати?",
|
140 |
-
VoiceOption.
|
141 |
StressOption.AutomaticStress.value,
|
142 |
],
|
143 |
[
|
|
|
20 |
Olena = "Олена (жіночий) 👩"
|
21 |
Mykyta = "Микита (чоловічий) 👨"
|
22 |
Lada = "Лада (жіночий) 👩"
|
23 |
+
Dmytro = "Дмитро (чоловічий) 👩"
|
24 |
+
Olga = "Ольга (жіночий) 👩"
|
25 |
|
26 |
|
27 |
def download(url, file_name):
|
|
|
35 |
|
36 |
|
37 |
print("downloading uk/mykyta/vits-tts")
|
38 |
+
release_number = "v3.0.0"
|
39 |
model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
|
40 |
config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
|
41 |
speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
|
|
|
73 |
autostress_with_model = (
|
74 |
True if stress == StressOption.AutomaticStressWithModel.value else False
|
75 |
)
|
76 |
+
voice_mapping = {
|
77 |
+
VoiceOption.Olena.value: "olena",
|
78 |
+
VoiceOption.Mykyta.value: "mykyta",
|
79 |
+
VoiceOption.Lada.value: "lada",
|
80 |
+
VoiceOption.Dmytro.value: "dmytro",
|
81 |
+
VoiceOption.Olga.value: "olga",
|
82 |
+
}
|
83 |
+
speaker_name = voice_mapping[voice]
|
84 |
text = preprocess_text(text, autostress_with_model)
|
85 |
text_limit = 7200
|
86 |
text = (
|
|
|
103 |
iface = gr.Interface(
|
104 |
fn=tts,
|
105 |
inputs=[
|
106 |
+
gr.components.Textbox(
|
107 |
label="Input",
|
108 |
+
value="Введіть, будь ласка, своє р+ечення.",
|
109 |
),
|
110 |
+
gr.components.Radio(
|
111 |
label="Голос",
|
112 |
choices=[option.value for option in VoiceOption],
|
113 |
+
value=VoiceOption.Olena.value,
|
114 |
),
|
115 |
+
gr.components.Radio(
|
116 |
label="Наголоси",
|
117 |
choices=[option.value for option in StressOption],
|
118 |
+
value=StressOption.AutomaticStress.value
|
119 |
),
|
120 |
],
|
121 |
outputs=[
|
122 |
+
gr.components.Audio(label="Output"),
|
123 |
+
gr.components.Textbox(label="Наголошений текст"),
|
124 |
],
|
125 |
title="🐸💬🇺🇦 - Coqui TTS",
|
126 |
description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
|
|
|
138 |
],
|
139 |
[
|
140 |
"Вв+едіть, будь ласка, св+оє реч+ення.",
|
141 |
+
VoiceOption.Dmytro.value,
|
142 |
StressOption.AutomaticStress.value,
|
143 |
],
|
144 |
[
|
145 |
"Привіт, як тебе звати?",
|
146 |
+
VoiceOption.Olga.value,
|
147 |
StressOption.AutomaticStress.value,
|
148 |
],
|
149 |
[
|
config.json
CHANGED
@@ -73,7 +73,7 @@
|
|
73 |
"griffin_lim_iters": 60,
|
74 |
"num_mels": 80,
|
75 |
"mel_fmin": 0,
|
76 |
-
"mel_fmax":
|
77 |
"spec_gain": 6.0,
|
78 |
"do_amp_to_db_linear": true,
|
79 |
"do_amp_to_db_mel": true,
|
@@ -158,11 +158,29 @@
|
|
158 |
null,
|
159 |
null
|
160 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
[
|
162 |
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
163 |
"lada",
|
164 |
null,
|
165 |
null
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
]
|
167 |
],
|
168 |
"eval_split_max_size": null,
|
@@ -243,7 +261,7 @@
|
|
243 |
"init_discriminator": true,
|
244 |
"use_spectral_norm_disriminator": false,
|
245 |
"use_speaker_embedding": true,
|
246 |
-
"num_speakers":
|
247 |
"speakers_file": "speakers.pth",
|
248 |
"d_vector_file": null,
|
249 |
"speaker_embedding_channels": 256,
|
|
|
73 |
"griffin_lim_iters": 60,
|
74 |
"num_mels": 80,
|
75 |
"mel_fmin": 0,
|
76 |
+
"mel_fmax": null,
|
77 |
"spec_gain": 6.0,
|
78 |
"do_amp_to_db_linear": true,
|
79 |
"do_amp_to_db_mel": true,
|
|
|
158 |
null,
|
159 |
null
|
160 |
],
|
161 |
+
[
|
162 |
+
"\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
|
163 |
+
"dmytro",
|
164 |
+
null,
|
165 |
+
null
|
166 |
+
],
|
167 |
[
|
168 |
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
169 |
"lada",
|
170 |
null,
|
171 |
null
|
172 |
+
],
|
173 |
+
[
|
174 |
+
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
175 |
+
"dmytro",
|
176 |
+
null,
|
177 |
+
null
|
178 |
+
],
|
179 |
+
[
|
180 |
+
"\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
|
181 |
+
"olga",
|
182 |
+
null,
|
183 |
+
null
|
184 |
]
|
185 |
],
|
186 |
"eval_split_max_size": null,
|
|
|
261 |
"init_discriminator": true,
|
262 |
"use_spectral_norm_disriminator": false,
|
263 |
"use_speaker_embedding": true,
|
264 |
+
"num_speakers": 5,
|
265 |
"speakers_file": "speakers.pth",
|
266 |
"d_vector_file": null,
|
267 |
"speaker_embedding_channels": 256,
|