Yurii Paniv committed on
Commit
6449e88
1 Parent(s): eb57397

Remove Coqui

Browse files
README.md CHANGED
@@ -4,14 +4,14 @@ emoji: 🐌
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
- sdk_version : 3.3
8
  python_version: 3.9
9
  app_file: app.py
10
  pinned: false
11
  ---
12
 
13
  # Ukrainian TTS 📢🤖
14
- Ukrainian TTS (text-to-speech) using Coqui TTS.
15
 
16
  ![pytest](https://github.com/robinhad/ukrainian-tts/actions/workflows/hf-sync.yml/badge.svg)
17
  [![Open In HF🤗 Space ](https://img.shields.io/badge/Open%20Demo-%F0%9F%A4%97%20Space-yellow)](https://huggingface.co/spaces/robinhad/ukrainian-tts)
@@ -65,33 +65,15 @@ pip install git+https://github.com/robinhad/ukrainian-tts.git
65
  ```python
66
  from ukrainian_tts.tts import TTS, Voices, Stress
67
 
68
- tts = TTS(use_cuda=False)
69
  with open("test.wav", mode="wb") as file:
70
  _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Model.value, file)
71
  print("Accented text:", text)
72
  ```
73
 
74
- ## Run manually:
75
- `Caution: this won't use normalizer and autostress like a web demo. `
76
- 1. `pip install -r requirements.txt`.
77
- 2. Download `model.pth` and `speakers.pth` from "Releases" tab.
78
- 3. Launch as one-time command:
79
- ```
80
- tts --text "Text for TTS" \
81
- --model_path path/to/model.pth \
82
- --config_path path/to/config.json \
83
- --speaker_idx dmytro \
84
- --out_path folder/to/save/output.wav
85
- ```
86
- or alternatively launch web server using:
87
- ```
88
- tts-server --model_path path/to/model.pth \
89
- --config_path path/to/config.json
90
- ```
91
 
92
  # How to train: 🏋️
93
- 1. Refer to ["Nervous beginner guide"](https://tts.readthedocs.io/en/latest/tutorial_for_nervous_beginners.html) in Coqui TTS docs.
94
- 2. Instead of provided `config.json` use one from this repo.
95
 
96
 
97
  # Attribution 🤝
4
  colorFrom: blue
5
  colorTo: yellow
6
  sdk: gradio
7
+ sdk_version : 3.12
8
  python_version: 3.9
9
  app_file: app.py
10
  pinned: false
11
  ---
12
 
13
  # Ukrainian TTS 📢🤖
14
+ Ukrainian TTS (text-to-speech) using ESPNET.
15
 
16
  ![pytest](https://github.com/robinhad/ukrainian-tts/actions/workflows/hf-sync.yml/badge.svg)
17
  [![Open In HF🤗 Space ](https://img.shields.io/badge/Open%20Demo-%F0%9F%A4%97%20Space-yellow)](https://huggingface.co/spaces/robinhad/ukrainian-tts)
65
  ```python
66
  from ukrainian_tts.tts import TTS, Voices, Stress
67
 
68
+ tts = TTS()
69
  with open("test.wav", mode="wb") as file:
70
  _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Model.value, file)
71
  print("Accented text:", text)
72
  ```
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # How to train: 🏋️
76
+ TBD
 
77
 
78
 
79
  # Attribution 🤝
app.py CHANGED
@@ -53,11 +53,8 @@ class VoiceOption(Enum):
53
 
54
  print(f"CUDA available? {is_available()}")
55
 
56
- badge = (
57
- "https://visitor-badge-reloaded.herokuapp.com/badge?page_id=robinhad.ukrainian-tts"
58
- )
59
 
60
- ukr_tts = TTS(use_cuda=is_available())
61
 
62
 
63
  def tts(text: str, voice: str, stress: str):
@@ -121,9 +118,9 @@ iface = gr.Interface(
121
  gr.components.Audio(label="Output"),
122
  gr.components.Textbox(label="Наголошений текст"),
123
  ],
124
- title="🐸💬🇺🇦 - Coqui TTS",
125
- description="Україномовний🇺🇦 TTS за допомогою Coqui TTS (щоб вручну поставити наголос, використовуйте + перед голосною)",
126
- article=article + f'\n <center><img src="{badge}" alt="visitors badge"/></center>',
127
  examples=[
128
  [
129
  "Введіть, будь ласка, своє речення.",
53
 
54
  print(f"CUDA available? {is_available()}")
55
 
 
 
 
56
 
57
+ ukr_tts = TTS()
58
 
59
 
60
  def tts(text: str, voice: str, stress: str):
118
  gr.components.Audio(label="Output"),
119
  gr.components.Textbox(label="Наголошений текст"),
120
  ],
121
+ title="🤖💬🇺🇦 - ESPNET",
122
+ description="Україномовний🇺🇦 TTS за допомогою ESPNET (щоб вручну поставити наголос, використовуйте + перед голосною)",
123
+ article=article,
124
  examples=[
125
  [
126
  "Введіть, будь ласка, своє речення.",
config.json DELETED
@@ -1,319 +0,0 @@
1
- {
2
- "output_path": "/home/robinhad/Projects/TTS",
3
- "logger_uri": null,
4
- "run_name": "vits_mykyta_latest",
5
- "project_name": null,
6
- "run_description": "\ud83d\udc38Coqui trainer run.",
7
- "print_step": 25,
8
- "plot_step": 100,
9
- "model_param_stats": false,
10
- "wandb_entity": null,
11
- "dashboard_logger": "tensorboard",
12
- "log_model_step": 5000,
13
- "save_step": 5000,
14
- "save_n_checkpoints": 5,
15
- "save_checkpoints": true,
16
- "save_all_best": false,
17
- "save_best_after": 10000,
18
- "target_loss": null,
19
- "print_eval": false,
20
- "test_delay_epochs": -1,
21
- "run_eval": true,
22
- "run_eval_steps": null,
23
- "distributed_backend": "nccl",
24
- "distributed_url": "tcp://localhost:54321",
25
- "mixed_precision": true,
26
- "epochs": 1500,
27
- "batch_size": 64,
28
- "eval_batch_size": 16,
29
- "grad_clip": [
30
- 1000,
31
- 1000
32
- ],
33
- "scheduler_after_epoch": true,
34
- "lr": 0.001,
35
- "optimizer": "AdamW",
36
- "optimizer_params": {
37
- "betas": [
38
- 0.8,
39
- 0.99
40
- ],
41
- "eps": 1e-09,
42
- "weight_decay": 0.01
43
- },
44
- "lr_scheduler": "",
45
- "lr_scheduler_params": {},
46
- "use_grad_scaler": false,
47
- "cudnn_enable": true,
48
- "cudnn_deterministic": false,
49
- "cudnn_benchmark": false,
50
- "training_seed": 54321,
51
- "model": "vits",
52
- "num_loader_workers": 12,
53
- "num_eval_loader_workers": 12,
54
- "use_noise_augment": false,
55
- "audio": {
56
- "fft_size": 1024,
57
- "win_length": 1024,
58
- "hop_length": 256,
59
- "frame_shift_ms": null,
60
- "frame_length_ms": null,
61
- "stft_pad_mode": "reflect",
62
- "sample_rate": 22050,
63
- "resample": false,
64
- "preemphasis": 0,
65
- "ref_level_db": 35,
66
- "do_sound_norm": true,
67
- "log_func": "np.log",
68
- "do_trim_silence": false,
69
- "trim_db": 35,
70
- "do_rms_norm": false,
71
- "db_level": -24,
72
- "power": 1.1,
73
- "griffin_lim_iters": 60,
74
- "num_mels": 80,
75
- "mel_fmin": 0,
76
- "mel_fmax": null,
77
- "spec_gain": 6.0,
78
- "do_amp_to_db_linear": true,
79
- "do_amp_to_db_mel": true,
80
- "pitch_fmax": 640.0,
81
- "pitch_fmin": 0.0,
82
- "signal_norm": true,
83
- "min_level_db": -100,
84
- "symmetric_norm": true,
85
- "max_norm": 1.0,
86
- "clip_norm": true,
87
- "stats_path": null
88
- },
89
- "use_phonemes": false,
90
- "phonemizer": null,
91
- "phoneme_language": "uk",
92
- "compute_input_seq_cache": false,
93
- "text_cleaner": "basic_cleaners",
94
- "enable_eos_bos_chars": false,
95
- "test_sentences_file": "",
96
- "phoneme_cache_path": "/home/robinhad/Projects/TTS/phoneme_cache",
97
- "characters": {
98
- "characters_class": "TTS.tts.models.vits.VitsCharacters",
99
- "vocab_dict": null,
100
- "pad": "<PAD>",
101
- "eos": "<EOS>",
102
- "bos": "<BOS>",
103
- "blank": "<BLNK>",
104
- "characters": "!\"'(),-/:;.?\u00ab\u00bb+\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0454\u0456\u0457\u0491\u2013\u2014\u2015\u2018\u2019\u201c\u201d\u201e\u2026 ",
105
- "punctuations": "!\"'(),-/:;?\u00ab\u00bb+\u2013\u2014\u2015\u2018\u2019\u201c\u201d\u201e\u2026 ",
106
- "phonemes": null,
107
- "is_unique": true,
108
- "is_sorted": true
109
- },
110
- "add_blank": true,
111
- "batch_group_size": 0,
112
- "loss_masking": null,
113
- "sort_by_audio_len": true,
114
- "min_audio_len": 32768,
115
- "max_audio_len": 264600,
116
- "min_text_len": 1,
117
- "max_text_len": Infinity,
118
- "compute_f0": false,
119
- "compute_linear_spec": true,
120
- "precompute_num_workers": 16,
121
- "start_by_longest": false,
122
- "datasets": [
123
- {
124
- "name": "mailabs",
125
- "path": "/home/robinhad/Data/Audio/ukr-tts-dataset-mai",
126
- "meta_file_train": "",
127
- "ignored_speakers": null,
128
- "language": "",
129
- "meta_file_val": "",
130
- "meta_file_attn_mask": ""
131
- }
132
- ],
133
- "test_sentences": [
134
- [
135
- "\u0414+\u0435\u0441\u044f\u0442\u044c \u0440\u0430\u0437+\u0456\u0432 \u0432\u0456\u0434\u043c+\u0456\u0440\u044f\u0439, +\u0430 \u0440+\u0430\u0437 - \u0432\u0456\u0434\u0440+\u0456\u0436.",
136
- "olena",
137
- null,
138
- null
139
- ],
140
- [
141
- "\u0413\u043e\u0432\u043e\u0440+\u0438, \u043d+\u0456\u0431\u0438 \u0442+\u0438 \u0436\u0438\u0432+\u0438\u0439!"
142
- ],
143
- [
144
- "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
145
- "lada",
146
- null,
147
- null
148
- ],
149
- [
150
- "\u041f\u0435\u0440\u0435\u043f\u0440+\u043e\u0448\u0443\u044e, \u0414+\u0435\u0439\u0432\u0435, \u043d+\u0430 \u0436+\u0430\u043b\u044c, +\u044f \u043d+\u0435 \u043c+\u043e\u0436\u0443 \u0437\u0440\u043e\u0431+\u0438\u0442\u0438 \u0446+\u0435.",
151
- "mykyta",
152
- null,
153
- null
154
- ],
155
- [
156
- "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
157
- "mykyta",
158
- null,
159
- null
160
- ],
161
- [
162
- "\u0425\u0442+\u043e \u0442+\u0438 \u0442\u0430\u043a+\u0438\u0439 +\u0456 +\u044f\u043a \u0442\u0435\u0431+\u0435 \u0437\u0432+\u0430\u0442\u0438?",
163
- "dmytro",
164
- null,
165
- null
166
- ],
167
- [
168
- "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
169
- "lada",
170
- null,
171
- null
172
- ],
173
- [
174
- "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
175
- "dmytro",
176
- null,
177
- null
178
- ],
179
- [
180
- "\u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u0438\u0439 - \u043c+\u0456\u0441\u0442\u043e \u0432 \u0425\u043c\u0435\u043b\u044c\u043d+\u0438\u0446\u044c\u043a\u0456\u0439 +\u043e\u0431\u043b\u0430\u0441\u0442\u0456 \u0423\u043a\u0440\u0430+\u0457\u043d\u0438, \u0446+\u0435\u043d\u0442\u0440 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0457 \u043c\u0456\u0441\u044c\u043a+\u043e\u0457 \u043e\u0431'+\u0454\u0434\u043d\u0430\u043d\u043e\u0457 \u0442\u0435\u0440\u0438\u0442\u043e\u0440\u0456+\u0430\u043b\u044c\u043d\u043e\u0457 \u0433\u0440\u043e\u043c+\u0430\u0434\u0438 +\u0456 \u041a\u0430\u043c'\u044f\u043d+\u0435\u0446\u044c-\u041f\u043e\u0434+\u0456\u043b\u044c\u0441\u044c\u043a\u043e\u0433\u043e \u0440\u0430\u0439+\u043e\u043d\u0443.",
181
- "olga",
182
- null,
183
- null
184
- ]
185
- ],
186
- "eval_split_max_size": null,
187
- "eval_split_size": 0.01,
188
- "use_speaker_weighted_sampler": false,
189
- "speaker_weighted_sampler_alpha": 1.0,
190
- "use_language_weighted_sampler": false,
191
- "language_weighted_sampler_alpha": 1.0,
192
- "use_length_weighted_sampler": false,
193
- "length_weighted_sampler_alpha": 1.0,
194
- "model_args": {
195
- "num_chars": 84,
196
- "out_channels": 513,
197
- "spec_segment_size": 32,
198
- "hidden_channels": 192,
199
- "hidden_channels_ffn_text_encoder": 768,
200
- "num_heads_text_encoder": 2,
201
- "num_layers_text_encoder": 6,
202
- "kernel_size_text_encoder": 3,
203
- "dropout_p_text_encoder": 0.1,
204
- "dropout_p_duration_predictor": 0.5,
205
- "kernel_size_posterior_encoder": 5,
206
- "dilation_rate_posterior_encoder": 1,
207
- "num_layers_posterior_encoder": 16,
208
- "kernel_size_flow": 5,
209
- "dilation_rate_flow": 1,
210
- "num_layers_flow": 4,
211
- "resblock_type_decoder": "1",
212
- "resblock_kernel_sizes_decoder": [
213
- 3,
214
- 7,
215
- 11
216
- ],
217
- "resblock_dilation_sizes_decoder": [
218
- [
219
- 1,
220
- 3,
221
- 5
222
- ],
223
- [
224
- 1,
225
- 3,
226
- 5
227
- ],
228
- [
229
- 1,
230
- 3,
231
- 5
232
- ]
233
- ],
234
- "upsample_rates_decoder": [
235
- 8,
236
- 8,
237
- 2,
238
- 2
239
- ],
240
- "upsample_initial_channel_decoder": 512,
241
- "upsample_kernel_sizes_decoder": [
242
- 16,
243
- 16,
244
- 4,
245
- 4
246
- ],
247
- "periods_multi_period_discriminator": [
248
- 2,
249
- 3,
250
- 5,
251
- 7,
252
- 11
253
- ],
254
- "use_sdp": true,
255
- "noise_scale": 1.0,
256
- "inference_noise_scale": 0.667,
257
- "length_scale": 1,
258
- "noise_scale_dp": 1.0,
259
- "inference_noise_scale_dp": 1.0,
260
- "max_inference_len": null,
261
- "init_discriminator": true,
262
- "use_spectral_norm_disriminator": false,
263
- "use_speaker_embedding": true,
264
- "num_speakers": 5,
265
- "speakers_file": "speakers.pth",
266
- "d_vector_file": null,
267
- "speaker_embedding_channels": 256,
268
- "use_d_vector_file": false,
269
- "d_vector_dim": 0,
270
- "detach_dp_input": true,
271
- "use_language_embedding": false,
272
- "embedded_language_dim": 4,
273
- "num_languages": 0,
274
- "language_ids_file": null,
275
- "use_speaker_encoder_as_loss": false,
276
- "speaker_encoder_config_path": "",
277
- "speaker_encoder_model_path": "",
278
- "condition_dp_on_speaker": true,
279
- "freeze_encoder": false,
280
- "freeze_DP": false,
281
- "freeze_PE": false,
282
- "freeze_flow_decoder": false,
283
- "freeze_waveform_decoder": false,
284
- "encoder_sample_rate": null,
285
- "interpolate_z": true,
286
- "reinit_DP": false,
287
- "reinit_text_encoder": false
288
- },
289
- "lr_gen": 0.0002,
290
- "lr_disc": 0.0002,
291
- "lr_scheduler_gen": "ExponentialLR",
292
- "lr_scheduler_gen_params": {
293
- "gamma": 0.999875,
294
- "last_epoch": -1
295
- },
296
- "lr_scheduler_disc": "ExponentialLR",
297
- "lr_scheduler_disc_params": {
298
- "gamma": 0.999875,
299
- "last_epoch": -1
300
- },
301
- "kl_loss_alpha": 1.0,
302
- "disc_loss_alpha": 1.0,
303
- "gen_loss_alpha": 1.0,
304
- "feat_loss_alpha": 1.0,
305
- "mel_loss_alpha": 45.0,
306
- "dur_loss_alpha": 1.0,
307
- "speaker_encoder_loss_alpha": 1.0,
308
- "return_wav": true,
309
- "r": 1,
310
- "num_speakers": 0,
311
- "use_speaker_embedding": true,
312
- "speakers_file": "speakers.pth",
313
- "speaker_embedding_channels": 256,
314
- "language_ids_file": null,
315
- "use_language_embedding": false,
316
- "use_d_vector_file": false,
317
- "d_vector_file": null,
318
- "d_vector_dim": 0
319
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.yaml ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_vits.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: /mnt/tts-storage/exp/22k/tts_train_vits_raw_char
7
+ ngpu: 1
8
+ seed: 3407
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: false
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 300
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - train
38
+ - total_count
39
+ - max
40
+ keep_nbest_models: 10
41
+ nbest_averaging_interval: 0
42
+ grad_clip: -1
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: 50
51
+ use_matplotlib: true
52
+ use_tensorboard: true
53
+ create_graph_in_tensorboard: false
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: null
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 1900000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/train/text_shape.char
72
+ - /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/train/speech_shape
73
+ valid_shape_file:
74
+ - /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/valid/text_shape.char
75
+ - /mnt/tts-storage/exp/22k/tts_stats_raw_linear_spectrogram_char/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 204800
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/22k/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - dump/22k/raw/tr_no_dev/wav.scp
92
+ - speech
93
+ - sound
94
+ - - dump/22k/raw/tr_no_dev/utt2sid
95
+ - sids
96
+ - text_int
97
+ valid_data_path_and_name_and_type:
98
+ - - dump/22k/raw/dev/text
99
+ - text
100
+ - text
101
+ - - dump/22k/raw/dev/wav.scp
102
+ - speech
103
+ - sound
104
+ - - dump/22k/raw/dev/utt2sid
105
+ - sids
106
+ - text_int
107
+ allow_variable_data_keys: false
108
+ max_cache_size: 0.0
109
+ max_cache_fd: 32
110
+ valid_max_cache_size: null
111
+ optim: adamw
112
+ optim_conf:
113
+ lr: 0.0002
114
+ betas:
115
+ - 0.8
116
+ - 0.99
117
+ eps: 1.0e-09
118
+ weight_decay: 0.0
119
+ scheduler: exponentiallr
120
+ scheduler_conf:
121
+ gamma: 0.999875
122
+ optim2: adamw
123
+ optim2_conf:
124
+ lr: 0.0002
125
+ betas:
126
+ - 0.8
127
+ - 0.99
128
+ eps: 1.0e-09
129
+ weight_decay: 0.0
130
+ scheduler2: exponentiallr
131
+ scheduler2_conf:
132
+ gamma: 0.999875
133
+ generator_first: false
134
+ token_list:
135
+ - <blank>
136
+ - <unk>
137
+ - +
138
+ - <space>
139
+ - о
140
+ - а
141
+ - и
142
+ - н
143
+ - в
144
+ - і
145
+ - т
146
+ - е
147
+ - с
148
+ - р
149
+ - л
150
+ - у
151
+ - д
152
+ - к
153
+ - м
154
+ - п
155
+ - я
156
+ - з
157
+ - ','
158
+ - б
159
+ - ь
160
+ - г
161
+ - ч
162
+ - й
163
+ - х
164
+ - ж
165
+ - ш
166
+ - ю
167
+ - ц
168
+ - щ
169
+ - —
170
+ - є
171
+ - ї
172
+ - '?'
173
+ - .
174
+ - ф
175
+ - «
176
+ - »
177
+ - '!'
178
+ - ''''
179
+ - ':'
180
+ - …
181
+ - '-'
182
+ - ґ
183
+ - ―
184
+ - –
185
+ - '"'
186
+ - ;
187
+ - “
188
+ - ”
189
+ - <sos/eos>
190
+ odim: null
191
+ model_conf: {}
192
+ use_preprocessor: true
193
+ token_type: char
194
+ bpemodel: null
195
+ non_linguistic_symbols: null
196
+ cleaner: null
197
+ g2p: g2p_en_no_space
198
+ feats_extract: linear_spectrogram
199
+ feats_extract_conf:
200
+ n_fft: 1024
201
+ hop_length: 256
202
+ win_length: null
203
+ normalize: null
204
+ normalize_conf: {}
205
+ tts: vits
206
+ tts_conf:
207
+ generator_type: vits_generator
208
+ generator_params:
209
+ hidden_channels: 192
210
+ spks: 128
211
+ global_channels: 256
212
+ segment_size: 32
213
+ text_encoder_attention_heads: 2
214
+ text_encoder_ffn_expand: 4
215
+ text_encoder_blocks: 6
216
+ text_encoder_positionwise_layer_type: conv1d
217
+ text_encoder_positionwise_conv_kernel_size: 3
218
+ text_encoder_positional_encoding_layer_type: rel_pos
219
+ text_encoder_self_attention_layer_type: rel_selfattn
220
+ text_encoder_activation_type: swish
221
+ text_encoder_normalize_before: true
222
+ text_encoder_dropout_rate: 0.1
223
+ text_encoder_positional_dropout_rate: 0.0
224
+ text_encoder_attention_dropout_rate: 0.1
225
+ use_macaron_style_in_text_encoder: true
226
+ use_conformer_conv_in_text_encoder: false
227
+ text_encoder_conformer_kernel_size: -1
228
+ decoder_kernel_size: 7
229
+ decoder_channels: 512
230
+ decoder_upsample_scales:
231
+ - 8
232
+ - 8
233
+ - 2
234
+ - 2
235
+ decoder_upsample_kernel_sizes:
236
+ - 16
237
+ - 16
238
+ - 4
239
+ - 4
240
+ decoder_resblock_kernel_sizes:
241
+ - 3
242
+ - 7
243
+ - 11
244
+ decoder_resblock_dilations:
245
+ - - 1
246
+ - 3
247
+ - 5
248
+ - - 1
249
+ - 3
250
+ - 5
251
+ - - 1
252
+ - 3
253
+ - 5
254
+ use_weight_norm_in_decoder: true
255
+ posterior_encoder_kernel_size: 5
256
+ posterior_encoder_layers: 16
257
+ posterior_encoder_stacks: 1
258
+ posterior_encoder_base_dilation: 1
259
+ posterior_encoder_dropout_rate: 0.0
260
+ use_weight_norm_in_posterior_encoder: true
261
+ flow_flows: 4
262
+ flow_kernel_size: 5
263
+ flow_base_dilation: 1
264
+ flow_layers: 4
265
+ flow_dropout_rate: 0.0
266
+ use_weight_norm_in_flow: true
267
+ use_only_mean_in_flow: true
268
+ stochastic_duration_predictor_kernel_size: 3
269
+ stochastic_duration_predictor_dropout_rate: 0.5
270
+ stochastic_duration_predictor_flows: 4
271
+ stochastic_duration_predictor_dds_conv_layers: 3
272
+ vocabs: 55
273
+ aux_channels: 513
274
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
275
+ discriminator_params:
276
+ scales: 1
277
+ scale_downsample_pooling: AvgPool1d
278
+ scale_downsample_pooling_params:
279
+ kernel_size: 4
280
+ stride: 2
281
+ padding: 2
282
+ scale_discriminator_params:
283
+ in_channels: 1
284
+ out_channels: 1
285
+ kernel_sizes:
286
+ - 15
287
+ - 41
288
+ - 5
289
+ - 3
290
+ channels: 128
291
+ max_downsample_channels: 1024
292
+ max_groups: 16
293
+ bias: true
294
+ downsample_scales:
295
+ - 2
296
+ - 2
297
+ - 4
298
+ - 4
299
+ - 1
300
+ nonlinear_activation: LeakyReLU
301
+ nonlinear_activation_params:
302
+ negative_slope: 0.1
303
+ use_weight_norm: true
304
+ use_spectral_norm: false
305
+ follow_official_norm: false
306
+ periods:
307
+ - 2
308
+ - 3
309
+ - 5
310
+ - 7
311
+ - 11
312
+ period_discriminator_params:
313
+ in_channels: 1
314
+ out_channels: 1
315
+ kernel_sizes:
316
+ - 5
317
+ - 3
318
+ channels: 32
319
+ downsample_scales:
320
+ - 3
321
+ - 3
322
+ - 3
323
+ - 3
324
+ - 1
325
+ max_downsample_channels: 1024
326
+ bias: true
327
+ nonlinear_activation: LeakyReLU
328
+ nonlinear_activation_params:
329
+ negative_slope: 0.1
330
+ use_weight_norm: true
331
+ use_spectral_norm: false
332
+ generator_adv_loss_params:
333
+ average_by_discriminators: false
334
+ loss_type: mse
335
+ discriminator_adv_loss_params:
336
+ average_by_discriminators: false
337
+ loss_type: mse
338
+ feat_match_loss_params:
339
+ average_by_discriminators: false
340
+ average_by_layers: false
341
+ include_final_outputs: true
342
+ mel_loss_params:
343
+ fs: 22050
344
+ n_fft: 1024
345
+ hop_length: 256
346
+ win_length: null
347
+ window: hann
348
+ n_mels: 80
349
+ fmin: 0
350
+ fmax: null
351
+ log_base: null
352
+ lambda_adv: 1.0
353
+ lambda_mel: 45.0
354
+ lambda_feat_match: 2.0
355
+ lambda_dur: 1.0
356
+ lambda_kl: 1.0
357
+ sampling_rate: 22050
358
+ cache_generator_outputs: true
359
+ pitch_extract: null
360
+ pitch_extract_conf: {}
361
+ pitch_normalize: null
362
+ pitch_normalize_conf: {}
363
+ energy_extract: null
364
+ energy_extract_conf: {}
365
+ energy_normalize: null
366
+ energy_normalize_conf: {}
367
+ required:
368
+ - output_dir
369
+ - token_list
370
+ version: '202209'
371
+ distributed: false
requirements.txt CHANGED
@@ -1,6 +1,9 @@
1
  # requirements for HuggingFace demo. Installs local package.
2
- torch>=1.9
3
- TTS==0.9.0
 
 
4
  ukrainian-word-stress==1.0.1
5
  git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
6
- gradio==3.3
 
1
  # requirements for HuggingFace demo. Installs local package.
2
+ torch
3
+ --extra-index-url https://download.pytorch.org/whl/cpu
4
+ espnet==202209
5
+ num2words==0.5.12
6
  ukrainian-word-stress==1.0.1
7
  git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
8
+ gradio==3.12
9
+ huggingface_hub==0.11.1
setup.py CHANGED
@@ -3,8 +3,8 @@ from setuptools import setup, find_packages
3
 
4
  setup(
5
  name="ukrainian-tts",
6
- version="3.0",
7
- description="Ukrainian TTS using Coqui TTS",
8
  author="Yurii Paniv",
9
  author_email="mr.robinhad@gmail.com",
10
  url="https://github.com/robinhad/ukrainian-tts",
@@ -12,8 +12,8 @@ setup(
12
  packages=find_packages(),
13
  python_requires=">3.6.0",
14
  install_requires=[
15
- "torch>=1.9",
16
- "TTS==0.9.0",
17
  "ukrainian-word-stress==1.0.1",
18
  "ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
19
  ],
3
 
4
  setup(
5
  name="ukrainian-tts",
6
+ version="4.0",
7
+ description="Ukrainian TTS using ESPNET",
8
  author="Yurii Paniv",
9
  author_email="mr.robinhad@gmail.com",
10
  url="https://github.com/robinhad/ukrainian-tts",
12
  packages=find_packages(),
13
  python_requires=">3.6.0",
14
  install_requires=[
15
+ "espnet==202209",
16
+ "num2words==0.5.12",
17
  "ukrainian-word-stress==1.0.1",
18
  "ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
19
  ],
tests/test_formatter.py CHANGED
@@ -3,8 +3,8 @@ from ukrainian_tts.formatter import preprocess_text
3
 
4
  def test_formatter():
5
  examples = [
6
- ("Quality of life update", "КВюаліті оф ліфе юпдате"),
7
- ("Він украв 20000000 $", "Він украв двадцять мільйонів долар"),
8
  (
9
  "111 000 000 000 доларів державного боргу.",
10
  "сто одинадцять мільярдів доларів державного боргу.",
3
 
4
  def test_formatter():
5
  examples = [
6
+ ("Quality of life update", "квюаліті оф ліфе юпдате"),
7
+ ("Він украв 20000000 $", "він украв двадцять мільйонів долар"),
8
  (
9
  "111 000 000 000 доларів державного боргу.",
10
  "сто одинадцять мільярдів доларів державного боргу.",
tests/test_tts.py CHANGED
@@ -3,9 +3,9 @@ from io import BytesIO
3
 
4
 
5
  def test_tts():
6
- tts = TTS(use_cuda=False)
7
  file = BytesIO()
8
  _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Dictionary.value, file)
9
  file.seek(0)
10
- assert text == "Прив+іт"
11
  assert file.getbuffer().nbytes > 1000 # check that file was generated
3
 
4
 
5
  def test_tts():
6
+ tts = TTS()
7
  file = BytesIO()
8
  _, text = tts.tts("Привіт", Voices.Dmytro.value, Stress.Dictionary.value, file)
9
  file.seek(0)
10
+ assert text == "прив+іт"
11
  assert file.getbuffer().nbytes > 1000 # check that file was generated
ukrainian_tts/formatter.py CHANGED
@@ -76,4 +76,5 @@ def preprocess_text(text, use_autostress_model=False):
76
  text = text.replace(english_char.upper(), english[english_char].upper())
77
  text = text.replace(english_char, english[english_char])
78
 
 
79
  return text
76
  text = text.replace(english_char.upper(), english[english_char].upper())
77
  text = text.replace(english_char, english[english_char])
78
 
79
+ text = text.lower()
80
  return text
ukrainian_tts/tts.py CHANGED
@@ -1,21 +1,24 @@
1
  from io import BytesIO
2
  import requests
3
  from os.path import exists, join
4
- from TTS.utils.synthesizer import Synthesizer
5
  from enum import Enum
6
  from .formatter import preprocess_text
7
  from .stress import sentence_to_stress, stress_dict, stress_with_model
8
  from torch import no_grad
 
 
 
9
 
10
 
11
  class Voices(Enum):
12
  """List of available voices for the model."""
13
 
14
- Olena = "olena"
15
- Mykyta = "mykyta"
16
- Lada = "lada"
17
- Dmytro = "dmytro"
18
- Olga = "olga"
19
 
20
 
21
  class Stress(Enum):
@@ -30,14 +33,15 @@ class Stress(Enum):
30
  class TTS:
31
  """ """
32
 
33
- def __init__(self, cache_folder=None, use_cuda=False) -> None:
34
  """
35
  Class to setup a text-to-speech engine, from download to model creation. \n
36
  Downloads or uses files from `cache_folder` directory. \n
37
  By default stores in current directory."""
38
- self.__setup_cache(cache_folder, use_cuda=use_cuda)
 
39
 
40
- def tts(self, text: str, voice: str, stress: str, output_fp=BytesIO()):
41
  """
42
  Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
43
  - `text` - your model input text.
@@ -63,39 +67,50 @@ class TTS:
63
  text = preprocess_text(text, stress)
64
  text = sentence_to_stress(text, stress_with_model if stress else stress_dict)
65
 
 
 
 
 
 
 
 
 
 
 
66
  with no_grad():
67
- wavs = self.synthesizer.tts(text, speaker_name=voice)
68
- self.synthesizer.save_wav(wavs, output_fp)
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  output_fp.seek(0)
71
 
72
  return output_fp, text
73
 
74
- def __setup_cache(self, cache_folder=None, use_cuda=False):
75
  """Downloads models and stores them into `cache_folder`. By default stores in current directory."""
76
  print("downloading uk/mykyta/vits-tts")
77
- release_number = "v3.0.0"
78
- model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model-inference.pth"
79
- config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.json"
80
- speakers_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/speakers.pth"
81
 
82
  if cache_folder is None:
83
  cache_folder = "."
84
 
85
  model_path = join(cache_folder, "model.pth")
86
- config_path = join(cache_folder, "config.json")
87
- speakers_path = join(cache_folder, "speakers.pth")
88
 
89
  self.__download(model_link, model_path)
90
  self.__download(config_link, config_path)
91
- self.__download(speakers_link, speakers_path)
92
-
93
- self.synthesizer = Synthesizer(
94
- model_path, config_path, speakers_path, None, None, use_cuda=use_cuda
95
- )
96
-
97
- if self.synthesizer is None:
98
- raise NameError("Model not found")
99
 
100
  def __download(self, url, file_name):
101
  """Downloads file from `url` into local `file_name` file."""
1
  from io import BytesIO
2
  import requests
3
  from os.path import exists, join
4
+ from espnet2.bin.tts_inference import Text2Speech
5
  from enum import Enum
6
  from .formatter import preprocess_text
7
  from .stress import sentence_to_stress, stress_dict, stress_with_model
8
  from torch import no_grad
9
+ import numpy as np
10
+ import time
11
+ import soundfile as sf
12
 
13
 
14
  class Voices(Enum):
15
  """List of available voices for the model."""
16
 
17
+ Olena = 4
18
+ Mykyta = 3
19
+ Lada = 2
20
+ Dmytro = 1
21
+ Olga = 5
22
 
23
 
24
  class Stress(Enum):
33
  class TTS:
34
  """ """
35
 
36
+ def __init__(self, cache_folder=None, device="cpu") -> None:
37
  """
38
  Class to setup a text-to-speech engine, from download to model creation. \n
39
  Downloads or uses files from `cache_folder` directory. \n
40
  By default stores in current directory."""
41
+ self.device = device
42
+ self.__setup_cache(cache_folder)
43
 
44
+ def tts(self, text: str, voice: int, stress: str, output_fp=BytesIO(), speed=1.0):
45
  """
46
  Run a Text-to-Speech engine and output to `output_fp` BytesIO-like object.
47
  - `text` - your model input text.
67
  text = preprocess_text(text, stress)
68
  text = sentence_to_stress(text, stress_with_model if stress else stress_dict)
69
 
70
+ self.synthesizer = Text2Speech(
71
+ train_config="config.yaml",
72
+ model_file="model.pth",
73
+ device=self.device,
74
+ speed_control_alpha=1 / speed,
75
+ # Only for VITS
76
+ noise_scale=0.333,
77
+ noise_scale_dur=0.333,
78
+ )
79
+ # synthesis
80
  with no_grad():
81
+ start = time.time()
82
+ wav = self.synthesizer(text, sids=np.array(voice))["wav"]
83
+
84
+ rtf = (time.time() - start) / (len(wav) / self.synthesizer.fs)
85
+ print(f"RTF = {rtf:5f}")
86
+
87
+ sf.write(
88
+ output_fp,
89
+ wav.view(-1).cpu().numpy(),
90
+ self.synthesizer.fs,
91
+ "PCM_16",
92
+ format="wav",
93
+ )
94
 
95
  output_fp.seek(0)
96
 
97
  return output_fp, text
98
 
99
+ def __setup_cache(self, cache_folder=None):
100
  """Downloads models and stores them into `cache_folder`. By default stores in current directory."""
101
  print("downloading uk/mykyta/vits-tts")
102
+ release_number = "v4.0.0"
103
+ model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/model.pth"
104
+ config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/{release_number}/config.yaml"
 
105
 
106
  if cache_folder is None:
107
  cache_folder = "."
108
 
109
  model_path = join(cache_folder, "model.pth")
110
+ config_path = join(cache_folder, "config.yaml")
 
111
 
112
  self.__download(model_link, model_path)
113
  self.__download(config_link, config_path)
 
 
 
 
 
 
 
 
114
 
115
  def __download(self, url, file_name):
116
  """Downloads file from `url` into local `file_name` file."""