Yurii Paniv committed on
Commit
cb6b82c
•
1 Parent(s): e35756c

Add VITS model

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. README.md +2 -2
  3. app.py +9 -14
  4. config.json +220 -157
.gitignore CHANGED
@@ -127,3 +127,6 @@ dmypy.json
127
 
128
  # Pyre type checker
129
  .pyre/
 
 
 
127
 
128
  # Pyre type checker
129
  .pyre/
130
+
131
+ # model files
132
+ *.pth.tar
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: "Ukrainian TTS"
3
- emoji: 🐸
4
  colorFrom: green
5
  colorTo: green
6
  sdk: gradio
@@ -15,7 +15,7 @@ Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailab
15
 
16
  Link to online demo -> [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
17
  # Support
18
- If you like my work, please support -> [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
19
  # Example
20
 
21
  https://user-images.githubusercontent.com/5759207/140622395-9e734c95-159c-4d72-9f56-e8d1f1ac66c2.mp4
1
  ---
2
  title: "Ukrainian TTS"
3
+ emoji: 🇺🇦
4
  colorFrom: green
5
  colorTo: green
6
  sdk: gradio
15
 
16
  Link to online demo -> [https://huggingface.co/spaces/robinhad/ukrainian-tts](https://huggingface.co/spaces/robinhad/ukrainian-tts)
17
  # Support
18
+ If you like my work, please support -> ![mono](https://www.monobank.ua/favicon.ico) [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
19
  # Example
20
 
21
  https://user-images.githubusercontent.com/5759207/140622395-9e734c95-159c-4d72-9f56-e8d1f1ac66c2.mp4
app.py CHANGED
@@ -1,8 +1,6 @@
1
  import tempfile
2
- from typing import Optional
3
 
4
  import gradio as gr
5
- import numpy as np
6
 
7
  from TTS.utils.manage import ModelManager
8
  from TTS.utils.synthesizer import Synthesizer
@@ -10,7 +8,7 @@ import requests
10
  from os.path import exists
11
 
12
  MODEL_NAMES = [
13
- "uk/mai/glow-tts"
14
  ]
15
  MODELS = {}
16
 
@@ -29,21 +27,18 @@ def download(url, file_name):
29
 
30
  for MODEL_NAME in MODEL_NAMES:
31
  print(f"downloading {MODEL_NAME}")
32
- model_path, config_path, model_item = manager.download_model(
33
- f"tts_models/{MODEL_NAME}")
34
- vocoder_name: Optional[str] = model_item["default_vocoder"]
35
- release_number = "0.0.1"
36
- vocoder_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder.pth.tar"
37
- vocoder_config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder_config.json"
38
 
39
- vocoder_path = "vocoder.pth.tar"
40
- vocoder_config_path = "vocoder_config.json"
41
 
42
- download(vocoder_link, vocoder_path)
43
- download(vocoder_config_link, vocoder_config_path)
44
 
45
  synthesizer = Synthesizer(
46
- model_path, config_path, None, vocoder_path, vocoder_config_path,
47
  )
48
  MODELS[MODEL_NAME] = synthesizer
49
 
1
  import tempfile
 
2
 
3
  import gradio as gr
 
4
 
5
  from TTS.utils.manage import ModelManager
6
  from TTS.utils.synthesizer import Synthesizer
8
  from os.path import exists
9
 
10
  MODEL_NAMES = [
11
+ "uk/mai/vits-tts"
12
  ]
13
  MODELS = {}
14
 
27
 
28
  for MODEL_NAME in MODEL_NAMES:
29
  print(f"downloading {MODEL_NAME}")
30
+ release_number = "1.0.0"
31
+ model_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/model.pth.tar"
32
+ config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/config.json"
 
 
 
33
 
34
+ model_path = "model.pth.tar"
35
+ config_path = "config.json"
36
 
37
+ download(model_link, model_path)
38
+ download(config_link, config_path)
39
 
40
  synthesizer = Synthesizer(
41
+ model_path, config_path, None, None, None,
42
  )
43
  MODELS[MODEL_NAME] = synthesizer
44
 
config.json CHANGED
@@ -1,158 +1,221 @@
1
- {
2
- "model": "glow_tts",
3
- "run_name": "coqui_tts",
4
- "run_description": "",
5
- "epochs": 1000,
6
- "batch_size": 32,
7
- "eval_batch_size": 16,
8
- "mixed_precision": true,
9
- "scheduler_after_epoch": false,
10
- "run_eval": true,
11
- "test_delay_epochs": -1,
12
- "print_eval": true,
13
- "dashboard_logger": "tensorboard",
14
- "print_step": 25,
15
- "plot_step": 100,
16
- "model_param_stats": false,
17
- "project_name": null,
18
- "log_model_step": null,
19
- "wandb_entity": null,
20
- "save_step": 10000,
21
- "checkpoint": true,
22
- "keep_all_best": false,
23
- "keep_after": 10000,
24
- "num_loader_workers": 1,
25
- "num_eval_loader_workers": 1,
26
- "use_noise_augment": false,
27
- "output_path": "./ukrainian",
28
- "distributed_backend": "nccl",
29
- "distributed_url": "tcp://localhost:54321",
30
- "audio": {
31
- "fft_size": 1024,
32
- "win_length": 1024,
33
- "hop_length": 256,
34
- "frame_shift_ms": null,
35
- "frame_length_ms": null,
36
- "stft_pad_mode": "reflect",
37
- "sample_rate": 16000,
38
- "resample": false,
39
- "preemphasis": 0.0,
40
- "ref_level_db": 20,
41
- "do_sound_norm": false,
42
- "log_func": "np.log10",
43
- "do_trim_silence": true,
44
- "trim_db": 45,
45
- "power": 1.5,
46
- "griffin_lim_iters": 60,
47
- "num_mels": 80,
48
- "mel_fmin": 0.0,
49
- "mel_fmax": null,
50
- "spec_gain": 20,
51
- "do_amp_to_db_linear": true,
52
- "do_amp_to_db_mel": true,
53
- "signal_norm": true,
54
- "min_level_db": -100,
55
- "symmetric_norm": true,
56
- "max_norm": 4.0,
57
- "clip_norm": true,
58
- "stats_path": null
59
- },
60
- "use_phonemes": false,
61
- "use_espeak_phonemes": false,
62
- "phoneme_language": null,
63
- "compute_input_seq_cache": false,
64
- "text_cleaner": "basic_cleaners",
65
- "enable_eos_bos_chars": false,
66
- "test_sentences_file": "",
67
- "phoneme_cache_path": "./phoneme_cache",
68
- "characters": {
69
- "pad": "_",
70
- "eos": "~",
71
- "bos": "^",
72
- "characters": "!',-.:;?ABIMXaceinoprxy\u0404\u0406\u0407\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042c\u042f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f\u0454\u0456\u0457\u0490\u0491 ",
73
- "punctuations": "!',-.:;? ",
74
- "phonemes": null,
75
- "unique": true
76
- },
77
- "batch_group_size": 0,
78
- "loss_masking": null,
79
- "sort_by_audio_len": false,
80
- "min_seq_len": 3,
81
- "max_seq_len": 500,
82
- "compute_f0": false,
83
- "compute_linear_spec": false,
84
- "add_blank": false,
85
- "datasets": [
86
- {
87
- "name": "ljspeech",
88
- "path": "./data/uk_UK/by_book/female/sumska/kaydasheva",
89
- "meta_file_train": "metadata.csv",
90
- "ununsed_speakers": null,
91
- "meta_file_val": "",
92
- "meta_file_attn_mask": ""
93
- },
94
- {
95
- "name": "ljspeech",
96
- "path": "./data/uk_UK/by_book/female/sumska/mykola_djerya",
97
- "meta_file_train": "metadata.csv",
98
- "ununsed_speakers": null,
99
- "meta_file_val": "",
100
- "meta_file_attn_mask": ""
101
- }
102
- ],
103
- "optimizer": "RAdam",
104
- "optimizer_params": {
105
- "betas": [
106
- 0.9,
107
- 0.998
108
- ],
109
- "weight_decay": 1e-06
110
- },
111
- "lr_scheduler": "NoamLR",
112
- "lr_scheduler_params": {
113
- "warmup_steps": 4000
114
- },
115
- "test_sentences": [
116
- "\u0413\u043e\u0432\u043e\u0440\u0438 \u043d\u0456\u0431\u0438 \u0442\u0438 \u0436\u0438\u0432\u0438\u0439!",
117
- "\u041f\u043e\u043b \u043f\u0435\u0440\u0435\u0442\u043d\u0443\u0432 \u043f\u0443\u0441\u0442\u0435\u043b\u044e",
118
- "\u041f\u0440\u0438\u0432\u0456\u0442, \u0441\u0432\u0456\u0442\u0435!"
119
- ],
120
- "use_speaker_embedding": false,
121
- "use_d_vector_file": false,
122
- "d_vector_dim": 0,
123
- "num_chars": null,
124
- "encoder_type": "rel_pos_transformer",
125
- "encoder_params": {
126
- "kernel_size": 3,
127
- "dropout_p": 0.1,
128
- "num_layers": 6,
129
- "num_heads": 2,
130
- "hidden_channels_ffn": 768,
131
- "input_length": null
132
- },
133
- "use_encoder_prenet": true,
134
- "hidden_channels_enc": 192,
135
- "hidden_channels_dec": 192,
136
- "hidden_channels_dp": 256,
137
- "dropout_p_dp": 0.1,
138
- "dropout_p_dec": 0.05,
139
- "mean_only": true,
140
- "out_channels": 80,
141
- "num_flow_blocks_dec": 12,
142
- "inference_noise_scale": 0.0,
143
- "kernel_size_dec": 5,
144
- "dilation_rate": 1,
145
- "num_block_layers": 4,
146
- "num_speakers": 0,
147
- "c_in_channels": 0,
148
- "num_splits": 4,
149
- "num_squeeze": 2,
150
- "sigmoid_scale": false,
151
- "data_dep_init_steps": 10,
152
- "style_wav_for_test": null,
153
- "length_scale": 1.0,
154
- "d_vector_file": false,
155
- "grad_clip": 5.0,
156
- "lr": 0.001,
157
- "r": 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  }
1
+ {
2
+ "model": "vits",
3
+ "run_name": "vits_ljspeech",
4
+ "run_description": "",
5
+ "epochs": 1000,
6
+ "batch_size": 18,
7
+ "eval_batch_size": 16,
8
+ "mixed_precision": true,
9
+ "scheduler_after_epoch": true,
10
+ "run_eval": true,
11
+ "test_delay_epochs": -1,
12
+ "print_eval": true,
13
+ "dashboard_logger": "tensorboard",
14
+ "print_step": 25,
15
+ "plot_step": 100,
16
+ "model_param_stats": false,
17
+ "project_name": null,
18
+ "log_model_step": null,
19
+ "wandb_entity": null,
20
+ "save_step": 10000,
21
+ "checkpoint": true,
22
+ "keep_all_best": false,
23
+ "keep_after": 10000,
24
+ "num_loader_workers": 12,
25
+ "num_eval_loader_workers": 12,
26
+ "use_noise_augment": false,
27
+ "output_path": "./ukrainian-vits",
28
+ "distributed_backend": "nccl",
29
+ "distributed_url": "tcp://localhost:54321",
30
+ "audio": {
31
+ "fft_size": 1024,
32
+ "win_length": 1024,
33
+ "hop_length": 256,
34
+ "frame_shift_ms": null,
35
+ "frame_length_ms": null,
36
+ "stft_pad_mode": "reflect",
37
+ "sample_rate": 16000,
38
+ "resample": false,
39
+ "preemphasis": 0.0,
40
+ "ref_level_db": 20,
41
+ "do_sound_norm": false,
42
+ "log_func": "np.log",
43
+ "do_trim_silence": true,
44
+ "trim_db": 45,
45
+ "power": 1.3,
46
+ "griffin_lim_iters": 60,
47
+ "num_mels": 80,
48
+ "mel_fmin": 0.0,
49
+ "mel_fmax": null,
50
+ "spec_gain": 1,
51
+ "do_amp_to_db_linear": false,
52
+ "do_amp_to_db_mel": true,
53
+ "signal_norm": false,
54
+ "min_level_db": -100,
55
+ "symmetric_norm": true,
56
+ "max_norm": 4.0,
57
+ "clip_norm": true,
58
+ "stats_path": null
59
+ },
60
+ "use_phonemes": false,
61
+ "use_espeak_phonemes": true,
62
+ "phoneme_language": null,
63
+ "compute_input_seq_cache": true,
64
+ "text_cleaner": "basic_cleaners",
65
+ "enable_eos_bos_chars": false,
66
+ "test_sentences_file": "",
67
+ "phoneme_cache_path": "./ukrainian/phoneme_cache",
68
+ "characters": {
69
+ "pad": "_",
70
+ "eos": "~",
71
+ "bos": "^",
72
+ "characters": "!',-.:;?\u0410\u0411\u0412\u0413\u0490\u0414\u0415\u0404\u0416\u0417\u0418\u0406\u0407\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042c\u042e\u042f\u0430\u0431\u0432\u0433\u0491\u0434\u0435\u0454\u0436\u0437\u0438\u0456\u0457\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044c\u044e\u044f ",
73
+ "punctuations": "!',-.:;? ",
74
+ "phonemes": null,
75
+ "unique": true
76
+ },
77
+ "batch_group_size": 5,
78
+ "loss_masking": null,
79
+ "sort_by_audio_len": true,
80
+ "min_seq_len": 0,
81
+ "max_seq_len": 500000,
82
+ "compute_f0": false,
83
+ "compute_linear_spec": true,
84
+ "add_blank": true,
85
+ "datasets": [
86
+ {
87
+ "name": "ljspeech",
88
+ "path": "./Data/uk_UK/by_book/female/sumska/kaydasheva",
89
+ "meta_file_train": "metadata.csv",
90
+ "ununsed_speakers": null,
91
+ "meta_file_val": "",
92
+ "meta_file_attn_mask": ""
93
+ },
94
+ {
95
+ "name": "ljspeech",
96
+ "path": "./Data/uk_UK/by_book/female/sumska/mykola_djerya",
97
+ "meta_file_train": "metadata.csv",
98
+ "ununsed_speakers": null,
99
+ "meta_file_val": "",
100
+ "meta_file_attn_mask": ""
101
+ }
102
+ ],
103
+ "optimizer": "AdamW",
104
+ "optimizer_params": {
105
+ "betas": [
106
+ 0.8,
107
+ 0.99
108
+ ],
109
+ "eps": 1e-09,
110
+ "weight_decay": 0.01
111
+ },
112
+ "lr_scheduler": "",
113
+ "lr_scheduler_params": {},
114
+ "test_sentences": [
115
+ "\u0413\u043e\u0432\u043e\u0440\u0438 \u043d\u0456\u0431\u0438 \u0442\u0438 \u0436\u0438\u0432\u0438\u0439!",
116
+ "\u041f\u043e\u043b \u043f\u0435\u0440\u0435\u0442\u043d\u0443\u0432 \u043f\u0443\u0441\u0442\u0435\u043b\u044e",
117
+ "\u041f\u0440\u0438\u0432\u0456\u0442, \u0441\u0432\u0456\u0442\u0435!"
118
+ ],
119
+ "model_args": {
120
+ "num_chars": 86,
121
+ "out_channels": 513,
122
+ "spec_segment_size": 24,
123
+ "hidden_channels": 192,
124
+ "hidden_channels_ffn_text_encoder": 768,
125
+ "num_heads_text_encoder": 2,
126
+ "num_layers_text_encoder": 6,
127
+ "kernel_size_text_encoder": 3,
128
+ "dropout_p_text_encoder": 0.1,
129
+ "dropout_p_duration_predictor": 0.5,
130
+ "kernel_size_posterior_encoder": 5,
131
+ "dilation_rate_posterior_encoder": 1,
132
+ "num_layers_posterior_encoder": 16,
133
+ "kernel_size_flow": 5,
134
+ "dilation_rate_flow": 1,
135
+ "num_layers_flow": 4,
136
+ "resblock_type_decoder": "1",
137
+ "resblock_kernel_sizes_decoder": [
138
+ 3,
139
+ 7,
140
+ 11
141
+ ],
142
+ "resblock_dilation_sizes_decoder": [
143
+ [
144
+ 1,
145
+ 3,
146
+ 5
147
+ ],
148
+ [
149
+ 1,
150
+ 3,
151
+ 5
152
+ ],
153
+ [
154
+ 1,
155
+ 3,
156
+ 5
157
+ ]
158
+ ],
159
+ "upsample_rates_decoder": [
160
+ 8,
161
+ 8,
162
+ 2,
163
+ 2
164
+ ],
165
+ "upsample_initial_channel_decoder": 512,
166
+ "upsample_kernel_sizes_decoder": [
167
+ 16,
168
+ 16,
169
+ 4,
170
+ 4
171
+ ],
172
+ "use_sdp": true,
173
+ "noise_scale": 1.0,
174
+ "inference_noise_scale": 0.667,
175
+ "length_scale": 1.0,
176
+ "noise_scale_dp": 1.0,
177
+ "inference_noise_scale_dp": 1.0,
178
+ "max_inference_len": null,
179
+ "init_discriminator": true,
180
+ "use_spectral_norm_disriminator": false,
181
+ "use_speaker_embedding": false,
182
+ "num_speakers": 0,
183
+ "speakers_file": null,
184
+ "speaker_embedding_channels": 256,
185
+ "use_d_vector_file": false,
186
+ "d_vector_file": null,
187
+ "d_vector_dim": 0,
188
+ "detach_dp_input": true
189
+ },
190
+ "grad_clip": [
191
+ 1000.0,
192
+ 1000.0
193
+ ],
194
+ "lr_gen": 0.0002,
195
+ "lr_disc": 0.0002,
196
+ "lr_scheduler_gen": "ExponentialLR",
197
+ "lr_scheduler_gen_params": {
198
+ "gamma": 0.999875,
199
+ "last_epoch": -1
200
+ },
201
+ "lr_scheduler_disc": "ExponentialLR",
202
+ "lr_scheduler_disc_params": {
203
+ "gamma": 0.999875,
204
+ "last_epoch": -1
205
+ },
206
+ "kl_loss_alpha": 1.0,
207
+ "disc_loss_alpha": 1.0,
208
+ "gen_loss_alpha": 1.0,
209
+ "feat_loss_alpha": 1.0,
210
+ "mel_loss_alpha": 45.0,
211
+ "dur_loss_alpha": 1.0,
212
+ "return_wav": true,
213
+ "r": 1,
214
+ "num_speakers": 0,
215
+ "use_speaker_embedding": false,
216
+ "speakers_file": null,
217
+ "speaker_embedding_channels": 256,
218
+ "use_d_vector_file": false,
219
+ "d_vector_file": null,
220
+ "d_vector_dim": 0
221
  }