Yurii Paniv committed on
Commit
49fc4a4
1 Parent(s): f826887

Add support for vocoder

Browse files
Files changed (3) hide show
  1. README.md +2 -0
  2. app.py +26 -9
  3. vocoder_config.json +185 -0
README.md CHANGED
@@ -13,6 +13,8 @@ Ukrainian TTS (text-to-speech) using Coqui TTS.
13
 
14
  Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/).
15
 
 
 
16
  # Example
17
 
18
  https://user-images.githubusercontent.com/5759207/139459556-35aa077b-0425-421f-a8d3-4c503315008d.mp4
13
 
14
  Trained on [M-AILABS Ukrainian dataset](https://www.caito.de/2019/01/the-m-ailabs-speech-dataset/).
15
 
16
+ # Support
17
+ If you like my work, please consider supporting it: [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)
18
  # Example
19
 
20
  https://user-images.githubusercontent.com/5759207/139459556-35aa077b-0425-421f-a8d3-4c503315008d.mp4
app.py CHANGED
@@ -6,6 +6,8 @@ import numpy as np
6
 
7
  from TTS.utils.manage import ModelManager
8
  from TTS.utils.synthesizer import Synthesizer
 
 
9
 
10
  MODEL_NAMES = [
11
  "uk/mai/glow-tts"
@@ -14,16 +16,31 @@ MODELS = {}
14
 
15
  manager = ModelManager()
16
 
 
 
 
 
 
 
 
 
 
 
 
17
  for MODEL_NAME in MODEL_NAMES:
18
  print(f"downloading {MODEL_NAME}")
19
  model_path, config_path, model_item = manager.download_model(
20
  f"tts_models/{MODEL_NAME}")
21
  vocoder_name: Optional[str] = model_item["default_vocoder"]
22
- vocoder_path = None
23
- vocoder_config_path = None
24
- if vocoder_name is not None:
25
- vocoder_path, vocoder_config_path, _ = manager.download_model(
26
- vocoder_name)
 
 
 
 
27
 
28
  synthesizer = Synthesizer(
29
  model_path, config_path, None, vocoder_path, vocoder_config_path,
@@ -52,14 +69,14 @@ iface = gr.Interface(
52
  default="Привіт, як твої справи?",
53
  ),
54
  gr.inputs.Radio(
55
- label="Pick a TTS Model",
56
  choices=MODEL_NAMES,
57
  ),
58
  ],
59
  outputs=gr.outputs.Audio(label="Output"),
60
- title="🐸💬 - Coqui TTS",
61
  theme="huggingface",
62
- description="🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production",
63
- article="more info at https://github.com/coqui-ai/TTS",
64
  )
65
  iface.launch()
6
 
7
  from TTS.utils.manage import ModelManager
8
  from TTS.utils.synthesizer import Synthesizer
9
+ import requests
10
+ from os.path import exists
11
 
12
  MODEL_NAMES = [
13
  "uk/mai/glow-tts"
16
 
17
  manager = ModelManager()
18
 
19
+
20
def download(url, file_name):
    """Download ``url`` to ``file_name`` unless that file already exists.

    The payload is streamed to a temporary ``<file_name>.part`` file and
    moved into place only after the transfer completes, so an HTTP error
    or an interrupted transfer never leaves a file at ``file_name`` that
    a later call would mistake for a valid cached download.

    Args:
        url: HTTP(S) address of the file to fetch (redirects are followed).
        file_name: Local path where the downloaded file is stored.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Local import: the module itself only imports ``exists`` from os.path.
    import os

    if exists(file_name):
        print(f"Found {file_name}. Skipping download...")
        return

    print(f"Downloading {file_name}")
    partial_name = f"{file_name}.part"
    try:
        with requests.get(url, allow_redirects=True, stream=True,
                          timeout=60) as r:
            # Fail loudly instead of saving an error page as the model file.
            r.raise_for_status()
            with open(partial_name, 'wb') as file:
                # Stream in 1 MiB chunks so large checkpoints are not held
                # in memory all at once.
                for chunk in r.iter_content(chunk_size=1 << 20):
                    file.write(chunk)
        # Atomic rename: ``file_name`` appears only once fully written.
        os.replace(partial_name, file_name)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up after a failed or interrupted transfer.
        if exists(partial_name):
            os.remove(partial_name)
28
+
29
+
30
  for MODEL_NAME in MODEL_NAMES:
31
  print(f"downloading {MODEL_NAME}")
32
  model_path, config_path, model_item = manager.download_model(
33
  f"tts_models/{MODEL_NAME}")
34
  vocoder_name: Optional[str] = model_item["default_vocoder"]
35
+ release_number = "0.0.1"
36
+ vocoder_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder.pth.tar"
37
+ vocoder_config_link = f"https://github.com/robinhad/ukrainian-tts/releases/download/v{release_number}/vocoder_config.json"
38
+
39
+ vocoder_path = "vocoder.pth.tar"
40
+ vocoder_config_path = "vocoder_config.json"
41
+
42
+ download(vocoder_link, vocoder_path)
43
+ download(vocoder_config_link, vocoder_config_path)
44
 
45
  synthesizer = Synthesizer(
46
  model_path, config_path, None, vocoder_path, vocoder_config_path,
69
  default="Привіт, як твої справи?",
70
  ),
71
  gr.inputs.Radio(
72
+ label="Виберіть TTS модель",
73
  choices=MODEL_NAMES,
74
  ),
75
  ],
76
  outputs=gr.outputs.Audio(label="Output"),
77
+ title="🐸💬🇺🇦 - Coqui TTS",
78
  theme="huggingface",
79
+ description="Україномовний🇺🇦 TTS за допомогою Coqui TTS",
80
+ article="Якщо вам подобається, підтримайте за посиланням: [SUPPORT LINK](https://send.monobank.ua/jar/48iHq4xAXm)",
81
  )
82
  iface.launch()
vocoder_config.json ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "multiband_melgan",
3
+ "run_name": "coqui_tts",
4
+ "run_description": "",
5
+ "epochs": 2000,
6
+ "batch_size": 32,
7
+ "eval_batch_size": 16,
8
+ "mixed_precision": true,
9
+ "scheduler_after_epoch": false,
10
+ "run_eval": true,
11
+ "test_delay_epochs": 5,
12
+ "print_eval": false,
13
+ "dashboard_logger": "tensorboard",
14
+ "print_step": 25,
15
+ "plot_step": 100,
16
+ "model_param_stats": false,
17
+ "project_name": null,
18
+ "log_model_step": null,
19
+ "wandb_entity": null,
20
+ "save_step": 10000,
21
+ "checkpoint": true,
22
+ "keep_all_best": false,
23
+ "keep_after": 10000,
24
+ "num_loader_workers": 12,
25
+ "num_eval_loader_workers": 12,
26
+ "use_noise_augment": true,
27
+ "output_path": "/home/robinhad/Projects/TTS/recipes/ljspeech/multiband_melgan",
28
+ "distributed_backend": "nccl",
29
+ "distributed_url": "tcp://localhost:54321",
30
+ "audio": {
31
+ "fft_size": 1024,
32
+ "win_length": 1024,
33
+ "hop_length": 256,
34
+ "frame_shift_ms": null,
35
+ "frame_length_ms": null,
36
+ "stft_pad_mode": "reflect",
37
+ "sample_rate": 16000,
38
+ "resample": false,
39
+ "preemphasis": 0.0,
40
+ "ref_level_db": 20,
41
+ "do_sound_norm": false,
42
+ "log_func": "np.log10",
43
+ "do_trim_silence": true,
44
+ "trim_db": 45,
45
+ "power": 1.5,
46
+ "griffin_lim_iters": 60,
47
+ "num_mels": 80,
48
+ "mel_fmin": 0.0,
49
+ "mel_fmax": null,
50
+ "spec_gain": 20,
51
+ "do_amp_to_db_linear": true,
52
+ "do_amp_to_db_mel": true,
53
+ "signal_norm": true,
54
+ "min_level_db": -100,
55
+ "symmetric_norm": true,
56
+ "max_norm": 4.0,
57
+ "clip_norm": true,
58
+ "stats_path": null
59
+ },
60
+ "eval_split_size": 10,
61
+ "data_path": "../Data/uk_UK/by_book/female",
62
+ "feature_path": null,
63
+ "seq_len": 8192,
64
+ "pad_short": 2000,
65
+ "conv_pad": 0,
66
+ "use_cache": true,
67
+ "wd": 0.0,
68
+ "optimizer": "AdamW",
69
+ "optimizer_params": {
70
+ "betas": [
71
+ 0.8,
72
+ 0.99
73
+ ],
74
+ "weight_decay": 0.0
75
+ },
76
+ "use_stft_loss": true,
77
+ "use_subband_stft_loss": true,
78
+ "use_mse_gan_loss": true,
79
+ "use_hinge_gan_loss": false,
80
+ "use_feat_match_loss": false,
81
+ "use_l1_spec_loss": false,
82
+ "stft_loss_weight": 0.5,
83
+ "subband_stft_loss_weight": 0,
84
+ "mse_G_loss_weight": 2.5,
85
+ "hinge_G_loss_weight": 0,
86
+ "feat_match_loss_weight": 108,
87
+ "l1_spec_loss_weight": 0,
88
+ "stft_loss_params": {
89
+ "n_ffts": [
90
+ 1024,
91
+ 2048,
92
+ 512
93
+ ],
94
+ "hop_lengths": [
95
+ 120,
96
+ 240,
97
+ 50
98
+ ],
99
+ "win_lengths": [
100
+ 600,
101
+ 1200,
102
+ 240
103
+ ]
104
+ },
105
+ "l1_spec_loss_params": {
106
+ "use_mel": true,
107
+ "sample_rate": 16000,
108
+ "n_fft": 1024,
109
+ "hop_length": 256,
110
+ "win_length": 1024,
111
+ "n_mels": 80,
112
+ "mel_fmin": 0.0,
113
+ "mel_fmax": null
114
+ },
115
+ "target_loss": "loss_0",
116
+ "grad_clip": [
117
+ 5,
118
+ 5
119
+ ],
120
+ "lr_gen": 0.0001,
121
+ "lr_disc": 0.0001,
122
+ "lr_scheduler_gen": "MultiStepLR",
123
+ "lr_scheduler_gen_params": {
124
+ "gamma": 0.5,
125
+ "milestones": [
126
+ 100000,
127
+ 200000,
128
+ 300000,
129
+ 400000,
130
+ 500000,
131
+ 600000
132
+ ]
133
+ },
134
+ "lr_scheduler_disc": "MultiStepLR",
135
+ "lr_scheduler_disc_params": {
136
+ "gamma": 0.5,
137
+ "milestones": [
138
+ 100000,
139
+ 200000,
140
+ 300000,
141
+ 400000,
142
+ 500000,
143
+ 600000
144
+ ]
145
+ },
146
+ "use_pqmf": true,
147
+ "diff_samples_for_G_and_D": false,
148
+ "discriminator_model": "melgan_multiscale_discriminator",
149
+ "discriminator_model_params": {
150
+ "base_channels": 16,
151
+ "max_channels": 512,
152
+ "downsample_factors": [
153
+ 4,
154
+ 4,
155
+ 4
156
+ ]
157
+ },
158
+ "generator_model": "multiband_melgan_generator",
159
+ "generator_model_params": {
160
+ "upsample_factors": [
161
+ 8,
162
+ 4,
163
+ 2
164
+ ],
165
+ "num_res_blocks": 4
166
+ },
167
+ "steps_to_start_discriminator": 200000,
168
+ "subband_stft_loss_params": {
169
+ "n_ffts": [
170
+ 384,
171
+ 683,
172
+ 171
173
+ ],
174
+ "hop_lengths": [
175
+ 30,
176
+ 60,
177
+ 10
178
+ ],
179
+ "win_lengths": [
180
+ 150,
181
+ 300,
182
+ 60
183
+ ]
184
+ }
185
+ }