Shadhil committed
Commit 9b2107c
1 Parent(s): d493428

voice-clone with single audio sample input
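This commit vendors Coqui TTS 0.20.6 (see TTS/VERSION below) so that a voice can be cloned from a single reference recording through the TTS.api.TTS class added in TTS/api.py. A minimal usage sketch follows, assuming an XTTS-v2 entry from the bundled TTS/.models.json; "speaker.wav" and "cloned_output.wav" are placeholder paths, not files shipped with this commit.

# Minimal sketch of single-sample voice cloning with the vendored TTS.api.TTS class.
# The model name comes from TTS/.models.json in this commit; the wav paths are placeholders.
from TTS.api import TTS

tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2", progress_bar=False)
# tts.to("cuda")  # move to GPU if available; the `gpu=True` flag is marked for deprecation in favour of .to(device)
tts.tts_to_file(
    text="This is voice cloning from one short audio sample.",
    speaker_wav="speaker.wav",   # the single reference recording of the target voice
    language="en",               # XTTS is multilingual, so a language code is required
    file_path="cloned_output.wav",
)

Any model in the manifest that accepts speaker_wav (for example your_tts) can be driven the same way.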

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. TTS/.models.json +937 -0
  2. TTS/VERSION +1 -0
  3. TTS/__init__.py +6 -0
  4. TTS/__pycache__/__init__.cpython-39.pyc +0 -0
  5. TTS/__pycache__/api.cpython-39.pyc +0 -0
  6. TTS/__pycache__/cs_api.cpython-39.pyc +0 -0
  7. TTS/__pycache__/model.cpython-39.pyc +0 -0
  8. TTS/api.py +489 -0
  9. TTS/bin/__init__.py +0 -0
  10. TTS/bin/collect_env_info.py +48 -0
  11. TTS/bin/compute_attention_masks.py +165 -0
  12. TTS/bin/compute_embeddings.py +197 -0
  13. TTS/bin/compute_statistics.py +96 -0
  14. TTS/bin/eval_encoder.py +88 -0
  15. TTS/bin/extract_tts_spectrograms.py +287 -0
  16. TTS/bin/find_unique_chars.py +45 -0
  17. TTS/bin/find_unique_phonemes.py +74 -0
  18. TTS/bin/remove_silence_using_vad.py +124 -0
  19. TTS/bin/resample.py +90 -0
  20. TTS/bin/synthesize.py +541 -0
  21. TTS/bin/train_encoder.py +319 -0
  22. TTS/bin/train_tts.py +71 -0
  23. TTS/bin/train_vocoder.py +77 -0
  24. TTS/bin/tune_wavegrad.py +103 -0
  25. TTS/config/__init__.py +138 -0
  26. TTS/config/__pycache__/__init__.cpython-39.pyc +0 -0
  27. TTS/config/__pycache__/shared_configs.cpython-39.pyc +0 -0
  28. TTS/config/shared_configs.py +268 -0
  29. TTS/cs_api.py +317 -0
  30. TTS/encoder/README.md +18 -0
  31. TTS/encoder/__init__.py +0 -0
  32. TTS/encoder/__pycache__/__init__.cpython-39.pyc +0 -0
  33. TTS/encoder/__pycache__/losses.cpython-39.pyc +0 -0
  34. TTS/encoder/configs/base_encoder_config.py +61 -0
  35. TTS/encoder/configs/emotion_encoder_config.py +12 -0
  36. TTS/encoder/configs/speaker_encoder_config.py +11 -0
  37. TTS/encoder/dataset.py +147 -0
  38. TTS/encoder/losses.py +226 -0
  39. TTS/encoder/models/__pycache__/base_encoder.cpython-39.pyc +0 -0
  40. TTS/encoder/models/__pycache__/lstm.cpython-39.pyc +0 -0
  41. TTS/encoder/models/__pycache__/resnet.cpython-39.pyc +0 -0
  42. TTS/encoder/models/base_encoder.py +161 -0
  43. TTS/encoder/models/lstm.py +99 -0
  44. TTS/encoder/models/resnet.py +198 -0
  45. TTS/encoder/requirements.txt +2 -0
  46. TTS/encoder/utils/__init__.py +0 -0
  47. TTS/encoder/utils/__pycache__/__init__.cpython-39.pyc +0 -0
  48. TTS/encoder/utils/__pycache__/generic_utils.cpython-39.pyc +0 -0
  49. TTS/encoder/utils/generic_utils.py +182 -0
  50. TTS/encoder/utils/io.py +38 -0
TTS/.models.json ADDED
@@ -0,0 +1,937 @@
1
+ {
2
+ "tts_models": {
3
+ "multilingual": {
4
+ "multi-dataset": {
5
+ "xtts_v2": {
6
+ "description": "XTTS-v2.0.2 by Coqui with 16 languages.",
7
+ "hf_url": [
8
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth",
9
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/config.json",
10
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
11
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
12
+ ],
13
+ "model_hash": "5ce0502bfe3bc88dc8d9312b12a7558c",
14
+ "default_vocoder": null,
15
+ "commit": "480a6cdf7",
16
+ "license": "CPML",
17
+ "contact": "info@coqui.ai",
18
+ "tos_required": true
19
+ },
20
+ "xtts_v1.1": {
21
+ "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
22
+ "hf_url": [
23
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
24
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
25
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
26
+ "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
27
+ ],
28
+ "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
29
+ "default_vocoder": null,
30
+ "commit": "82910a63",
31
+ "license": "CPML",
32
+ "contact": "info@coqui.ai",
33
+ "tos_required": true
34
+ },
35
+ "your_tts": {
36
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
37
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
38
+ "default_vocoder": null,
39
+ "commit": "e9a1953e",
40
+ "license": "CC BY-NC-ND 4.0",
41
+ "contact": "egolge@coqui.ai"
42
+ },
43
+ "bark": {
44
+ "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
45
+ "hf_url": [
46
+ "https://coqui.gateway.scarf.sh/hf/bark/coarse_2.pt",
47
+ "https://coqui.gateway.scarf.sh/hf/bark/fine_2.pt",
48
+ "https://app.coqui.ai/tts_model/text_2.pt",
49
+ "https://coqui.gateway.scarf.sh/hf/bark/config.json",
50
+ "https://coqui.gateway.scarf.sh/hf/bark/hubert.pt",
51
+ "https://coqui.gateway.scarf.sh/hf/bark/tokenizer.pth"
52
+ ],
53
+ "default_vocoder": null,
54
+ "commit": "e9a1953e",
55
+ "license": "MIT",
56
+ "contact": "https://www.suno.ai/"
57
+ }
58
+ }
59
+ },
60
+ "bg": {
61
+ "cv": {
62
+ "vits": {
63
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--bg--cv--vits.zip",
64
+ "default_vocoder": null,
65
+ "commit": null,
66
+ "author": "@NeonGeckoCom",
67
+ "license": "bsd-3-clause"
68
+ }
69
+ }
70
+ },
71
+ "cs": {
72
+ "cv": {
73
+ "vits": {
74
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--cs--cv--vits.zip",
75
+ "default_vocoder": null,
76
+ "commit": null,
77
+ "author": "@NeonGeckoCom",
78
+ "license": "bsd-3-clause"
79
+ }
80
+ }
81
+ },
82
+ "da": {
83
+ "cv": {
84
+ "vits": {
85
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--da--cv--vits.zip",
86
+ "default_vocoder": null,
87
+ "commit": null,
88
+ "author": "@NeonGeckoCom",
89
+ "license": "bsd-3-clause"
90
+ }
91
+ }
92
+ },
93
+ "et": {
94
+ "cv": {
95
+ "vits": {
96
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--et--cv--vits.zip",
97
+ "default_vocoder": null,
98
+ "commit": null,
99
+ "author": "@NeonGeckoCom",
100
+ "license": "bsd-3-clause"
101
+ }
102
+ }
103
+ },
104
+ "ga": {
105
+ "cv": {
106
+ "vits": {
107
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ga--cv--vits.zip",
108
+ "default_vocoder": null,
109
+ "commit": null,
110
+ "author": "@NeonGeckoCom",
111
+ "license": "bsd-3-clause"
112
+ }
113
+ }
114
+ },
115
+ "en": {
116
+ "ek1": {
117
+ "tacotron2": {
118
+ "description": "EK1 en-rp tacotron2 by NMStoker",
119
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
120
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
121
+ "commit": "c802255",
122
+ "license": "apache 2.0"
123
+ }
124
+ },
125
+ "ljspeech": {
126
+ "tacotron2-DDC": {
127
+ "description": "Tacotron2 with Double Decoder Consistency.",
128
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
129
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
130
+ "commit": "bae2ad0f",
131
+ "author": "Eren Gölge @erogol",
132
+ "license": "apache 2.0",
133
+ "contact": "egolge@coqui.com"
134
+ },
135
+ "tacotron2-DDC_ph": {
136
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
137
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
138
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
139
+ "commit": "3900448",
140
+ "author": "Eren Gölge @erogol",
141
+ "license": "apache 2.0",
142
+ "contact": "egolge@coqui.com"
143
+ },
144
+ "glow-tts": {
145
+ "description": "",
146
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
147
+ "stats_file": null,
148
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
149
+ "commit": "",
150
+ "author": "Eren Gölge @erogol",
151
+ "license": "MPL",
152
+ "contact": "egolge@coqui.com"
153
+ },
154
+ "speedy-speech": {
155
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
156
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
157
+ "stats_file": null,
158
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
159
+ "commit": "4581e3d",
160
+ "author": "Eren Gölge @erogol",
161
+ "license": "apache 2.0",
162
+ "contact": "egolge@coqui.com"
163
+ },
164
+ "tacotron2-DCA": {
165
+ "description": "",
166
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
167
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
168
+ "commit": "",
169
+ "author": "Eren Gölge @erogol",
170
+ "license": "MPL",
171
+ "contact": "egolge@coqui.com"
172
+ },
173
+ "vits": {
174
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
175
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
176
+ "default_vocoder": null,
177
+ "commit": "3900448",
178
+ "author": "Eren Gölge @erogol",
179
+ "license": "apache 2.0",
180
+ "contact": "egolge@coqui.com"
181
+ },
182
+ "vits--neon": {
183
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
184
+ "default_vocoder": null,
185
+ "author": "@NeonGeckoCom",
186
+ "license": "bsd-3-clause",
187
+ "contact": null,
188
+ "commit": null
189
+ },
190
+ "fast_pitch": {
191
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
192
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
193
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
194
+ "commit": "b27b3ba",
195
+ "author": "Eren Gölge @erogol",
196
+ "license": "apache 2.0",
197
+ "contact": "egolge@coqui.com"
198
+ },
199
+ "overflow": {
200
+ "description": "Overflow model trained on LJSpeech",
201
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
202
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
203
+ "commit": "3b1a28f",
204
+ "author": "Eren Gölge @erogol",
205
+ "license": "apache 2.0",
206
+ "contact": "egolge@coqui.ai"
207
+ },
208
+ "neural_hmm": {
209
+ "description": "Neural HMM model trained on LJSpeech",
210
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
211
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
212
+ "commit": "3b1a28f",
213
+ "author": "Shivam Metha @shivammehta25",
214
+ "license": "apache 2.0",
215
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
216
+ }
217
+ },
218
+ "vctk": {
219
+ "vits": {
220
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
221
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
222
+ "default_vocoder": null,
223
+ "commit": "3900448",
224
+ "author": "Eren @erogol",
225
+ "license": "apache 2.0",
226
+ "contact": "egolge@coqui.ai"
227
+ },
228
+ "fast_pitch": {
229
+ "description": "FastPitch model trained on VCTK dataseset.",
230
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
231
+ "default_vocoder": null,
232
+ "commit": "bdab788d",
233
+ "author": "Eren @erogol",
234
+ "license": "CC BY-NC-ND 4.0",
235
+ "contact": "egolge@coqui.ai"
236
+ }
237
+ },
238
+ "sam": {
239
+ "tacotron-DDC": {
240
+ "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
241
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
242
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
243
+ "commit": "bae2ad0f",
244
+ "author": "Eren Gölge @erogol",
245
+ "license": "apache 2.0",
246
+ "contact": "egolge@coqui.com"
247
+ }
248
+ },
249
+ "blizzard2013": {
250
+ "capacitron-t2-c50": {
251
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
252
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
253
+ "commit": "d6284e7",
254
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
255
+ "author": "Adam Froghyar @a-froghyar",
256
+ "license": "apache 2.0",
257
+ "contact": "adamfroghyar@gmail.com"
258
+ },
259
+ "capacitron-t2-c150_v2": {
260
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
261
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
262
+ "commit": "a67039d",
263
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
264
+ "author": "Adam Froghyar @a-froghyar",
265
+ "license": "apache 2.0",
266
+ "contact": "adamfroghyar@gmail.com"
267
+ }
268
+ },
269
+ "multi-dataset": {
270
+ "tortoise-v2": {
271
+ "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
272
+ "github_rls_url": [
273
+ "https://app.coqui.ai/tts_model/autoregressive.pth",
274
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/clvp2.pth",
275
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/cvvp.pth",
276
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/diffusion_decoder.pth",
277
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_auto.pth",
278
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/rlg_diffuser.pth",
279
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/vocoder.pth",
280
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/mel_norms.pth",
281
+ "https://coqui.gateway.scarf.sh/v0.14.1_models/config.json"
282
+ ],
283
+ "commit": "c1875f6",
284
+ "default_vocoder": null,
285
+ "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
286
+ "license": "apache 2.0"
287
+ }
288
+ },
289
+ "jenny": {
290
+ "jenny": {
291
+ "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
292
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.14.0_models/tts_models--en--jenny--jenny.zip",
293
+ "default_vocoder": null,
294
+ "commit": "ba40a1c",
295
+ "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
296
+ "author": "@noml4u"
297
+ }
298
+ }
299
+ },
300
+ "es": {
301
+ "mai": {
302
+ "tacotron2-DDC": {
303
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
304
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
305
+ "commit": "",
306
+ "author": "Eren Gölge @erogol",
307
+ "license": "MPL",
308
+ "contact": "egolge@coqui.com"
309
+ }
310
+ },
311
+ "css10": {
312
+ "vits": {
313
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--es--css10--vits.zip",
314
+ "default_vocoder": null,
315
+ "commit": null,
316
+ "author": "@NeonGeckoCom",
317
+ "license": "bsd-3-clause"
318
+ }
319
+ }
320
+ },
321
+ "fr": {
322
+ "mai": {
323
+ "tacotron2-DDC": {
324
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
325
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
326
+ "commit": null,
327
+ "author": "Eren Gölge @erogol",
328
+ "license": "MPL",
329
+ "contact": "egolge@coqui.com"
330
+ }
331
+ },
332
+ "css10": {
333
+ "vits": {
334
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fr--css10--vits.zip",
335
+ "default_vocoder": null,
336
+ "commit": null,
337
+ "author": "@NeonGeckoCom",
338
+ "license": "bsd-3-clause"
339
+ }
340
+ }
341
+ },
342
+ "uk": {
343
+ "mai": {
344
+ "glow-tts": {
345
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
346
+ "author": "@robinhad",
347
+ "commit": "bdab788d",
348
+ "license": "MIT",
349
+ "contact": "",
350
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
351
+ },
352
+ "vits": {
353
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--uk--mai--vits.zip",
354
+ "default_vocoder": null,
355
+ "commit": null,
356
+ "author": "@NeonGeckoCom",
357
+ "license": "bsd-3-clause"
358
+ }
359
+ }
360
+ },
361
+ "zh-CN": {
362
+ "baker": {
363
+ "tacotron2-DDC-GST": {
364
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
365
+ "commit": "unknown",
366
+ "author": "@kirianguiller",
367
+ "license": "apache 2.0",
368
+ "default_vocoder": null
369
+ }
370
+ }
371
+ },
372
+ "nl": {
373
+ "mai": {
374
+ "tacotron2-DDC": {
375
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
376
+ "author": "@r-dh",
377
+ "license": "apache 2.0",
378
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
379
+ "stats_file": null,
380
+ "commit": "540d811"
381
+ }
382
+ },
383
+ "css10": {
384
+ "vits": {
385
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--nl--css10--vits.zip",
386
+ "default_vocoder": null,
387
+ "commit": null,
388
+ "author": "@NeonGeckoCom",
389
+ "license": "bsd-3-clause"
390
+ }
391
+ }
392
+ },
393
+ "de": {
394
+ "thorsten": {
395
+ "tacotron2-DCA": {
396
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
397
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
398
+ "author": "@thorstenMueller",
399
+ "license": "apache 2.0",
400
+ "commit": "unknown"
401
+ },
402
+ "vits": {
403
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip",
404
+ "default_vocoder": null,
405
+ "author": "@thorstenMueller",
406
+ "license": "apache 2.0",
407
+ "commit": "unknown"
408
+ },
409
+ "tacotron2-DDC": {
410
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
411
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
412
+ "description": "Thorsten-Dec2021-22k-DDC",
413
+ "author": "@thorstenMueller",
414
+ "license": "apache 2.0",
415
+ "commit": "unknown"
416
+ }
417
+ },
418
+ "css10": {
419
+ "vits-neon": {
420
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--css10--vits.zip",
421
+ "default_vocoder": null,
422
+ "author": "@NeonGeckoCom",
423
+ "license": "bsd-3-clause",
424
+ "commit": null
425
+ }
426
+ }
427
+ },
428
+ "ja": {
429
+ "kokoro": {
430
+ "tacotron2-DDC": {
431
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
432
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
433
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
434
+ "author": "@kaiidams",
435
+ "license": "apache 2.0",
436
+ "commit": "401fbd89"
437
+ }
438
+ }
439
+ },
440
+ "tr": {
441
+ "common-voice": {
442
+ "glow-tts": {
443
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
444
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
445
+ "license": "MIT",
446
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
447
+ "author": "Fatih Akademi",
448
+ "commit": null
449
+ }
450
+ }
451
+ },
452
+ "it": {
453
+ "mai_female": {
454
+ "glow-tts": {
455
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
456
+ "default_vocoder": null,
457
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
458
+ "author": "@nicolalandro",
459
+ "license": "apache 2.0",
460
+ "commit": null
461
+ },
462
+ "vits": {
463
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
464
+ "default_vocoder": null,
465
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
466
+ "author": "@nicolalandro",
467
+ "license": "apache 2.0",
468
+ "commit": null
469
+ }
470
+ },
471
+ "mai_male": {
472
+ "glow-tts": {
473
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
474
+ "default_vocoder": null,
475
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
476
+ "author": "@nicolalandro",
477
+ "license": "apache 2.0",
478
+ "commit": null
479
+ },
480
+ "vits": {
481
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
482
+ "default_vocoder": null,
483
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
484
+ "author": "@nicolalandro",
485
+ "license": "apache 2.0",
486
+ "commit": null
487
+ }
488
+ }
489
+ },
490
+ "ewe": {
491
+ "openbible": {
492
+ "vits": {
493
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
494
+ "default_vocoder": null,
495
+ "license": "CC-BY-SA 4.0",
496
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
497
+ "author": "@coqui_ai",
498
+ "commit": "1b22f03"
499
+ }
500
+ }
501
+ },
502
+ "hau": {
503
+ "openbible": {
504
+ "vits": {
505
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--hau--openbible--vits.zip",
506
+ "default_vocoder": null,
507
+ "license": "CC-BY-SA 4.0",
508
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
509
+ "author": "@coqui_ai",
510
+ "commit": "1b22f03"
511
+ }
512
+ }
513
+ },
514
+ "lin": {
515
+ "openbible": {
516
+ "vits": {
517
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--lin--openbible--vits.zip",
518
+ "default_vocoder": null,
519
+ "license": "CC-BY-SA 4.0",
520
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
521
+ "author": "@coqui_ai",
522
+ "commit": "1b22f03"
523
+ }
524
+ }
525
+ },
526
+ "tw_akuapem": {
527
+ "openbible": {
528
+ "vits": {
529
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
530
+ "default_vocoder": null,
531
+ "license": "CC-BY-SA 4.0",
532
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
533
+ "author": "@coqui_ai",
534
+ "commit": "1b22f03"
535
+ }
536
+ }
537
+ },
538
+ "tw_asante": {
539
+ "openbible": {
540
+ "vits": {
541
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
542
+ "default_vocoder": null,
543
+ "license": "CC-BY-SA 4.0",
544
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
545
+ "author": "@coqui_ai",
546
+ "commit": "1b22f03"
547
+ }
548
+ }
549
+ },
550
+ "yor": {
551
+ "openbible": {
552
+ "vits": {
553
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.2_models/tts_models--yor--openbible--vits.zip",
554
+ "default_vocoder": null,
555
+ "license": "CC-BY-SA 4.0",
556
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
557
+ "author": "@coqui_ai",
558
+ "commit": "1b22f03"
559
+ }
560
+ }
561
+ },
562
+ "hu": {
563
+ "css10": {
564
+ "vits": {
565
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hu--css10--vits.zip",
566
+ "default_vocoder": null,
567
+ "commit": null,
568
+ "author": "@NeonGeckoCom",
569
+ "license": "bsd-3-clause"
570
+ }
571
+ }
572
+ },
573
+ "el": {
574
+ "cv": {
575
+ "vits": {
576
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--el--cv--vits.zip",
577
+ "default_vocoder": null,
578
+ "commit": null,
579
+ "author": "@NeonGeckoCom",
580
+ "license": "bsd-3-clause"
581
+ }
582
+ }
583
+ },
584
+ "fi": {
585
+ "css10": {
586
+ "vits": {
587
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--fi--css10--vits.zip",
588
+ "default_vocoder": null,
589
+ "commit": null,
590
+ "author": "@NeonGeckoCom",
591
+ "license": "bsd-3-clause"
592
+ }
593
+ }
594
+ },
595
+ "hr": {
596
+ "cv": {
597
+ "vits": {
598
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--hr--cv--vits.zip",
599
+ "default_vocoder": null,
600
+ "commit": null,
601
+ "author": "@NeonGeckoCom",
602
+ "license": "bsd-3-clause"
603
+ }
604
+ }
605
+ },
606
+ "lt": {
607
+ "cv": {
608
+ "vits": {
609
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lt--cv--vits.zip",
610
+ "default_vocoder": null,
611
+ "commit": null,
612
+ "author": "@NeonGeckoCom",
613
+ "license": "bsd-3-clause"
614
+ }
615
+ }
616
+ },
617
+ "lv": {
618
+ "cv": {
619
+ "vits": {
620
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--lv--cv--vits.zip",
621
+ "default_vocoder": null,
622
+ "commit": null,
623
+ "author": "@NeonGeckoCom",
624
+ "license": "bsd-3-clause"
625
+ }
626
+ }
627
+ },
628
+ "mt": {
629
+ "cv": {
630
+ "vits": {
631
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--mt--cv--vits.zip",
632
+ "default_vocoder": null,
633
+ "commit": null,
634
+ "author": "@NeonGeckoCom",
635
+ "license": "bsd-3-clause"
636
+ }
637
+ }
638
+ },
639
+ "pl": {
640
+ "mai_female": {
641
+ "vits": {
642
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
643
+ "default_vocoder": null,
644
+ "commit": null,
645
+ "author": "@NeonGeckoCom",
646
+ "license": "bsd-3-clause"
647
+ }
648
+ }
649
+ },
650
+ "pt": {
651
+ "cv": {
652
+ "vits": {
653
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--pt--cv--vits.zip",
654
+ "default_vocoder": null,
655
+ "commit": null,
656
+ "author": "@NeonGeckoCom",
657
+ "license": "bsd-3-clause"
658
+ }
659
+ }
660
+ },
661
+ "ro": {
662
+ "cv": {
663
+ "vits": {
664
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--ro--cv--vits.zip",
665
+ "default_vocoder": null,
666
+ "commit": null,
667
+ "author": "@NeonGeckoCom",
668
+ "license": "bsd-3-clause"
669
+ }
670
+ }
671
+ },
672
+ "sk": {
673
+ "cv": {
674
+ "vits": {
675
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sk--cv--vits.zip",
676
+ "default_vocoder": null,
677
+ "commit": null,
678
+ "author": "@NeonGeckoCom",
679
+ "license": "bsd-3-clause"
680
+ }
681
+ }
682
+ },
683
+ "sl": {
684
+ "cv": {
685
+ "vits": {
686
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sl--cv--vits.zip",
687
+ "default_vocoder": null,
688
+ "commit": null,
689
+ "author": "@NeonGeckoCom",
690
+ "license": "bsd-3-clause"
691
+ }
692
+ }
693
+ },
694
+ "sv": {
695
+ "cv": {
696
+ "vits": {
697
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--sv--cv--vits.zip",
698
+ "default_vocoder": null,
699
+ "commit": null,
700
+ "author": "@NeonGeckoCom",
701
+ "license": "bsd-3-clause"
702
+ }
703
+ }
704
+ },
705
+ "ca": {
706
+ "custom": {
707
+ "vits": {
708
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--ca--custom--vits.zip",
709
+ "default_vocoder": null,
710
+ "commit": null,
711
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
712
+ "author": "@gullabi",
713
+ "license": "CC-BY-4.0"
714
+ }
715
+ }
716
+ },
717
+ "fa": {
718
+ "custom": {
719
+ "glow-tts": {
720
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
721
+ "default_vocoder": null,
722
+ "commit": null,
723
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
724
+ "author": "@karim23657",
725
+ "license": "CC-BY-4.0"
726
+ }
727
+ }
728
+ },
729
+ "bn": {
730
+ "custom": {
731
+ "vits-male": {
732
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
733
+ "default_vocoder": null,
734
+ "commit": null,
735
+ "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
736
+ "author": "@mobassir94",
737
+ "license": "Apache 2.0"
738
+ },
739
+ "vits-female": {
740
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
741
+ "default_vocoder": null,
742
+ "commit": null,
743
+ "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
744
+ "author": "@mobassir94",
745
+ "license": "Apache 2.0"
746
+ }
747
+ }
748
+ },
749
+ "be": {
750
+ "common-voice": {
751
+ "glow-tts":{
752
+ "description": "Belarusian GlowTTS model created by @alex73 (Github).",
753
+ "github_rls_url":"https://coqui.gateway.scarf.sh/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
754
+ "default_vocoder": "vocoder_models/be/common-voice/hifigan",
755
+ "commit": "c0aabb85",
756
+ "license": "CC-BY-SA 4.0",
757
+ "contact": "alex73mail@gmail.com"
758
+ }
759
+ }
760
+ }
761
+ },
762
+ "vocoder_models": {
763
+ "universal": {
764
+ "libri-tts": {
765
+ "wavegrad": {
766
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
767
+ "commit": "ea976b0",
768
+ "author": "Eren Gölge @erogol",
769
+ "license": "MPL",
770
+ "contact": "egolge@coqui.com"
771
+ },
772
+ "fullband-melgan": {
773
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
774
+ "commit": "4132240",
775
+ "author": "Eren Gölge @erogol",
776
+ "license": "MPL",
777
+ "contact": "egolge@coqui.com"
778
+ }
779
+ }
780
+ },
781
+ "en": {
782
+ "ek1": {
783
+ "wavegrad": {
784
+ "description": "EK1 en-rp wavegrad by NMStoker",
785
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
786
+ "commit": "c802255",
787
+ "license": "apache 2.0"
788
+ }
789
+ },
790
+ "ljspeech": {
791
+ "multiband-melgan": {
792
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
793
+ "commit": "ea976b0",
794
+ "author": "Eren Gölge @erogol",
795
+ "license": "MPL",
796
+ "contact": "egolge@coqui.com"
797
+ },
798
+ "hifigan_v2": {
799
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
800
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
801
+ "commit": "bae2ad0f",
802
+ "author": "@erogol",
803
+ "license": "apache 2.0",
804
+ "contact": "egolge@coqui.ai"
805
+ },
806
+ "univnet": {
807
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
808
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
809
+ "commit": "4581e3d",
810
+ "author": "Eren @erogol",
811
+ "license": "apache 2.0",
812
+ "contact": "egolge@coqui.ai"
813
+ }
814
+ },
815
+ "blizzard2013": {
816
+ "hifigan_v2": {
817
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
818
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
819
+ "commit": "d6284e7",
820
+ "author": "Adam Froghyar @a-froghyar",
821
+ "license": "apache 2.0",
822
+ "contact": "adamfroghyar@gmail.com"
823
+ }
824
+ },
825
+ "vctk": {
826
+ "hifigan_v2": {
827
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
828
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
829
+ "commit": "2f07160",
830
+ "author": "Edresson Casanova",
831
+ "license": "apache 2.0",
832
+ "contact": ""
833
+ }
834
+ },
835
+ "sam": {
836
+ "hifigan_v2": {
837
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
838
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
839
+ "commit": "2f07160",
840
+ "author": "Eren Gölge @erogol",
841
+ "license": "apache 2.0",
842
+ "contact": "egolge@coqui.ai"
843
+ }
844
+ }
845
+ },
846
+ "nl": {
847
+ "mai": {
848
+ "parallel-wavegan": {
849
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
850
+ "author": "@r-dh",
851
+ "license": "apache 2.0",
852
+ "commit": "unknown"
853
+ }
854
+ }
855
+ },
856
+ "de": {
857
+ "thorsten": {
858
+ "wavegrad": {
859
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
860
+ "author": "@thorstenMueller",
861
+ "license": "apache 2.0",
862
+ "commit": "unknown"
863
+ },
864
+ "fullband-melgan": {
865
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
866
+ "author": "@thorstenMueller",
867
+ "license": "apache 2.0",
868
+ "commit": "unknown"
869
+ },
870
+ "hifigan_v1": {
871
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
872
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
873
+ "author": "@thorstenMueller",
874
+ "license": "apache 2.0",
875
+ "commit": "unknown"
876
+ }
877
+ }
878
+ },
879
+ "ja": {
880
+ "kokoro": {
881
+ "hifigan_v1": {
882
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
883
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
884
+ "author": "@kaiidams",
885
+ "license": "apache 2.0",
886
+ "commit": "3900448"
887
+ }
888
+ }
889
+ },
890
+ "uk": {
891
+ "mai": {
892
+ "multiband-melgan": {
893
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
894
+ "author": "@robinhad",
895
+ "commit": "bdab788d",
896
+ "license": "MIT",
897
+ "contact": ""
898
+ }
899
+ }
900
+ },
901
+ "tr": {
902
+ "common-voice": {
903
+ "hifigan": {
904
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
905
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
906
+ "author": "Fatih Akademi",
907
+ "license": "MIT",
908
+ "commit": null
909
+ }
910
+ }
911
+ },
912
+ "be": {
913
+ "common-voice": {
914
+ "hifigan": {
915
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
916
+ "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
917
+ "author": "@alex73",
918
+ "license": "CC-BY-SA 4.0",
919
+ "commit": "c0aabb85"
920
+ }
921
+ }
922
+ }
923
+ },
924
+ "voice_conversion_models": {
925
+ "multilingual": {
926
+ "vctk": {
927
+ "freevc24": {
928
+ "github_rls_url": "https://coqui.gateway.scarf.sh/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
929
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
930
+ "author": "Jing-Yi Li @OlaWod",
931
+ "license": "MIT",
932
+ "commit": null
933
+ }
934
+ }
935
+ }
936
+ }
937
+ }
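The nesting of this manifest (model type → language → dataset → model) is what produces the model names accepted by TTS/api.py later in this commit: get_models_file_path() points the ModelManager at this file, and list_models() enumerates its entries. A small sketch of that mapping, assuming only what api.py exposes (ModelManager's download internals are not shown in this view); the thorsten name is taken from the api.py docstring.

# Sketch: how a .models.json entry becomes a model name for TTS.api.TTS.
# "tts_models" -> "de" -> "thorsten" -> "tacotron2-DDC" in the JSON above
# corresponds to the name used below.
from TTS.api import TTS

# Lists every entry of this manifest (plus Coqui Studio speakers, if that API is reachable).
print(TTS().list_models())

tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False)
# ModelManager fetches the entry's github_rls_url archive and its default_vocoder
# (vocoder_models/de/thorsten/hifigan_v1) before the Synthesizer is built.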
TTS/VERSION ADDED
@@ -0,0 +1 @@
+ 0.20.6
TTS/__init__.py ADDED
@@ -0,0 +1,6 @@
+ import os
+
+ with open(os.path.join(os.path.dirname(__file__), "VERSION"), "r", encoding="utf-8") as f:
+     version = f.read().strip()
+
+ __version__ = version
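Because __init__.py reads the VERSION file added just above, the vendored package reports its own version at import time:

import TTS
print(TTS.__version__)  # "0.20.6", read from TTS/VERSION when the package is imported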
TTS/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (358 Bytes).
TTS/__pycache__/api.cpython-39.pyc ADDED
Binary file (18.5 kB).
TTS/__pycache__/cs_api.cpython-39.pyc ADDED
Binary file (12 kB).
TTS/__pycache__/model.cpython-39.pyc ADDED
Binary file (2.58 kB).
TTS/api.py ADDED
@@ -0,0 +1,489 @@
1
+ import tempfile
2
+ import warnings
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+ import numpy as np
7
+ from torch import nn
8
+
9
+ from TTS.cs_api import CS_API
10
+ from TTS.utils.audio.numpy_transforms import save_wav
11
+ from TTS.utils.manage import ModelManager
12
+ from TTS.utils.synthesizer import Synthesizer
13
+
14
+
15
+ class TTS(nn.Module):
16
+ """TODO: Add voice conversion and Capacitron support."""
17
+
18
+ def __init__(
19
+ self,
20
+ model_name: str = "",
21
+ model_path: str = None,
22
+ config_path: str = None,
23
+ vocoder_path: str = None,
24
+ vocoder_config_path: str = None,
25
+ progress_bar: bool = True,
26
+ cs_api_model: str = "XTTS",
27
+ gpu=False,
28
+ ):
29
+ """🐸TTS python interface that allows to load and use the released models.
30
+
31
+ Example with a multi-speaker model:
32
+ >>> from TTS.api import TTS
33
+ >>> tts = TTS(TTS.list_models()[0])
34
+ >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
35
+ >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
36
+
37
+ Example with a single-speaker model:
38
+ >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
39
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
40
+
41
+ Example loading a model from a path:
42
+ >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False)
43
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
44
+
45
+ Example voice cloning with YourTTS in English, French and Portuguese:
46
+ >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)
47
+ >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
48
+ >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
49
+ >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
50
+
51
+ Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
52
+ >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False, gpu=True)
53
+ >>> tts.tts_to_file("This is a test.", file_path="output.wav")
54
+
55
+ Args:
56
+ model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
57
+ model_path (str, optional): Path to the model checkpoint. Defaults to None.
58
+ config_path (str, optional): Path to the model config. Defaults to None.
59
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
60
+ vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
61
+ progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
62
+ cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
63
+ "XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
64
+ Defaults to "XTTS".
65
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
66
+ """
67
+ super().__init__()
68
+ self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
69
+
70
+ self.synthesizer = None
71
+ self.voice_converter = None
72
+ self.csapi = None
73
+ self.cs_api_model = cs_api_model
74
+ self.model_name = ""
75
+
76
+ if gpu:
77
+ warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
78
+
79
+ if model_name is not None:
80
+ if "tts_models" in model_name or "coqui_studio" in model_name:
81
+ self.load_tts_model_by_name(model_name, gpu)
82
+ elif "voice_conversion_models" in model_name:
83
+ self.load_vc_model_by_name(model_name, gpu)
84
+
85
+ if model_path:
86
+ self.load_tts_model_by_path(
87
+ model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
88
+ )
89
+
90
+ @property
91
+ def models(self):
92
+ return self.manager.list_tts_models()
93
+
94
+ @property
95
+ def is_multi_speaker(self):
96
+ if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
97
+ return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
98
+ return False
99
+
100
+ @property
101
+ def is_coqui_studio(self):
102
+ if self.model_name is None:
103
+ return False
104
+ return "coqui_studio" in self.model_name
105
+
106
+ @property
107
+ def is_multi_lingual(self):
108
+ # Not sure what sets this to None, but applied a fix to prevent crashing.
109
+ if isinstance(self.model_name, str) and "xtts" in self.model_name:
110
+ return True
111
+ if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
112
+ return self.synthesizer.tts_model.language_manager.num_languages > 1
113
+ return False
114
+
115
+ @property
116
+ def speakers(self):
117
+ if not self.is_multi_speaker:
118
+ return None
119
+ return self.synthesizer.tts_model.speaker_manager.speaker_names
120
+
121
+ @property
122
+ def languages(self):
123
+ if not self.is_multi_lingual:
124
+ return None
125
+ return self.synthesizer.tts_model.language_manager.language_names
126
+
127
+ @staticmethod
128
+ def get_models_file_path():
129
+ return Path(__file__).parent / ".models.json"
130
+
131
+ def list_models(self):
132
+ try:
133
+ csapi = CS_API(model=self.cs_api_model)
134
+ models = csapi.list_speakers_as_tts_models()
135
+ except ValueError as e:
136
+ print(e)
137
+ models = []
138
+ manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
139
+ return manager.list_tts_models() + models
140
+
141
+ def download_model_by_name(self, model_name: str):
142
+ model_path, config_path, model_item = self.manager.download_model(model_name)
143
+ if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
144
+ # return model directory if there are multiple files
145
+ # we assume that the model knows how to load itself
146
+ return None, None, None, None, model_path
147
+ if model_item.get("default_vocoder") is None:
148
+ return model_path, config_path, None, None, None
149
+ vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
150
+ return model_path, config_path, vocoder_path, vocoder_config_path, None
151
+
152
+ def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
153
+ """Load one of the voice conversion models by name.
154
+
155
+ Args:
156
+ model_name (str): Model name to load. You can list models by ```tts.models```.
157
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
158
+ """
159
+ self.model_name = model_name
160
+ model_path, config_path, _, _, _ = self.download_model_by_name(model_name)
161
+ self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
162
+
163
+ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
164
+ """Load one of 🐸TTS models by name.
165
+
166
+ Args:
167
+ model_name (str): Model name to load. You can list models by ```tts.models```.
168
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
169
+
170
+ TODO: Add tests
171
+ """
172
+ self.synthesizer = None
173
+ self.csapi = None
174
+ self.model_name = model_name
175
+
176
+ if "coqui_studio" in model_name:
177
+ self.csapi = CS_API()
178
+ else:
179
+ model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
180
+ model_name
181
+ )
182
+
183
+ # init synthesizer
184
+ # None values are fetch from the model
185
+ self.synthesizer = Synthesizer(
186
+ tts_checkpoint=model_path,
187
+ tts_config_path=config_path,
188
+ tts_speakers_file=None,
189
+ tts_languages_file=None,
190
+ vocoder_checkpoint=vocoder_path,
191
+ vocoder_config=vocoder_config_path,
192
+ encoder_checkpoint=None,
193
+ encoder_config=None,
194
+ model_dir=model_dir,
195
+ use_cuda=gpu,
196
+ )
197
+
198
+ def load_tts_model_by_path(
199
+ self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
200
+ ):
201
+ """Load a model from a path.
202
+
203
+ Args:
204
+ model_path (str): Path to the model checkpoint.
205
+ config_path (str): Path to the model config.
206
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
207
+ vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
208
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
209
+ """
210
+
211
+ self.synthesizer = Synthesizer(
212
+ tts_checkpoint=model_path,
213
+ tts_config_path=config_path,
214
+ tts_speakers_file=None,
215
+ tts_languages_file=None,
216
+ vocoder_checkpoint=vocoder_path,
217
+ vocoder_config=vocoder_config,
218
+ encoder_checkpoint=None,
219
+ encoder_config=None,
220
+ use_cuda=gpu,
221
+ )
222
+
223
+ def _check_arguments(
224
+ self,
225
+ speaker: str = None,
226
+ language: str = None,
227
+ speaker_wav: str = None,
228
+ emotion: str = None,
229
+ speed: float = None,
230
+ **kwargs,
231
+ ) -> None:
232
+ """Check if the arguments are valid for the model."""
233
+ if not self.is_coqui_studio:
234
+ # check for the coqui tts models
235
+ if self.is_multi_speaker and (speaker is None and speaker_wav is None):
236
+ raise ValueError("Model is multi-speaker but no `speaker` is provided.")
237
+ if self.is_multi_lingual and language is None:
238
+ raise ValueError("Model is multi-lingual but no `language` is provided.")
239
+ if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
240
+ raise ValueError("Model is not multi-speaker but `speaker` is provided.")
241
+ if not self.is_multi_lingual and language is not None:
242
+ raise ValueError("Model is not multi-lingual but `language` is provided.")
243
+ if not emotion is None and not speed is None:
244
+ raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
245
+ else:
246
+ if emotion is None:
247
+ emotion = "Neutral"
248
+ if speed is None:
249
+ speed = 1.0
250
+ # check for the studio models
251
+ if speaker_wav is not None:
252
+ raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
253
+ if speaker is not None:
254
+ raise ValueError("Coqui Studio models do not support `speaker` argument.")
255
+ if language is not None and language != "en":
256
+ raise ValueError("Coqui Studio models currently support only `language=en` argument.")
257
+ if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
258
+ raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
259
+
260
+ def tts_coqui_studio(
261
+ self,
262
+ text: str,
263
+ speaker_name: str = None,
264
+ language: str = None,
265
+ emotion: str = None,
266
+ speed: float = 1.0,
267
+ pipe_out=None,
268
+ file_path: str = None,
269
+ ) -> Union[np.ndarray, str]:
270
+ """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
271
+
272
+ Args:
273
+ text (str):
274
+ Input text to synthesize.
275
+ speaker_name (str, optional):
276
+ Speaker name from Coqui Studio. Defaults to None.
277
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
278
+ supported by `XTTS` model.
279
+ emotion (str, optional):
280
+ Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
281
+ with "V1" model. Defaults to None.
282
+ speed (float, optional):
283
+ Speed of the speech. Defaults to 1.0.
284
+ pipe_out (BytesIO, optional):
285
+ Flag to stdout the generated TTS wav file for shell pipe.
286
+ file_path (str, optional):
287
+ Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
288
+
289
+ Returns:
290
+ Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
291
+ """
292
+ speaker_name = self.model_name.split("/")[2]
293
+ if file_path is not None:
294
+ return self.csapi.tts_to_file(
295
+ text=text,
296
+ speaker_name=speaker_name,
297
+ language=language,
298
+ speed=speed,
299
+ pipe_out=pipe_out,
300
+ emotion=emotion,
301
+ file_path=file_path,
302
+ )[0]
303
+ return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
304
+
305
+ def tts(
306
+ self,
307
+ text: str,
308
+ speaker: str = None,
309
+ language: str = None,
310
+ speaker_wav: str = None,
311
+ emotion: str = None,
312
+ speed: float = None,
313
+ **kwargs,
314
+ ):
315
+ """Convert text to speech.
316
+
317
+ Args:
318
+ text (str):
319
+ Input text to synthesize.
320
+ speaker (str, optional):
321
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
322
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
323
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
324
+ supported by `XTTS` model.
325
+ speaker_wav (str, optional):
326
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
327
+ Defaults to None.
328
+ emotion (str, optional):
329
+ Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
330
+ speed (float, optional):
331
+ Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
332
+ Defaults to None.
333
+ """
334
+ self._check_arguments(
335
+ speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
336
+ )
337
+ if self.csapi is not None:
338
+ return self.tts_coqui_studio(
339
+ text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
340
+ )
341
+ wav = self.synthesizer.tts(
342
+ text=text,
343
+ speaker_name=speaker,
344
+ language_name=language,
345
+ speaker_wav=speaker_wav,
346
+ reference_wav=None,
347
+ style_wav=None,
348
+ style_text=None,
349
+ reference_speaker_name=None,
350
+ **kwargs,
351
+ )
352
+ return wav
353
+
354
+ def tts_to_file(
355
+ self,
356
+ text: str,
357
+ speaker: str = None,
358
+ language: str = None,
359
+ speaker_wav: str = None,
360
+ emotion: str = None,
361
+ speed: float = 1.0,
362
+ pipe_out=None,
363
+ file_path: str = "output.wav",
364
+ **kwargs,
365
+ ):
366
+ """Convert text to speech.
367
+
368
+ Args:
369
+ text (str):
370
+ Input text to synthesize.
371
+ speaker (str, optional):
372
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
373
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
374
+ language (str, optional):
375
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
376
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
377
+ speaker_wav (str, optional):
378
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
379
+ Defaults to None.
380
+ emotion (str, optional):
381
+ Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
382
+ speed (float, optional):
383
+ Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to None.
384
+ pipe_out (BytesIO, optional):
385
+ Flag to stdout the generated TTS wav file for shell pipe.
386
+ file_path (str, optional):
387
+ Output file path. Defaults to "output.wav".
388
+ kwargs (dict, optional):
389
+ Additional arguments for the model.
390
+ """
391
+ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
392
+
393
+ if self.csapi is not None:
394
+ return self.tts_coqui_studio(
395
+ text=text,
396
+ speaker_name=speaker,
397
+ language=language,
398
+ emotion=emotion,
399
+ speed=speed,
400
+ file_path=file_path,
401
+ pipe_out=pipe_out,
402
+ )
403
+ wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
404
+ self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
405
+ return file_path
406
+
407
+ def voice_conversion(
408
+ self,
409
+ source_wav: str,
410
+ target_wav: str,
411
+ ):
412
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
413
+
414
+ Args:
415
+ source_wav (str):
416
+ Path to the source wav file.
417
+ target_wav (str):
418
+ Path to the target wav file.
419
+ """
420
+ wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
421
+ return wav
422
+
423
+ def voice_conversion_to_file(
424
+ self,
425
+ source_wav: str,
426
+ target_wav: str,
427
+ file_path: str = "output.wav",
428
+ ):
429
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
430
+
431
+ Args:
432
+ source_wav (str):
433
+ Path to the source wav file.
434
+ target_wav (str):
435
+ Path to the target wav file.
436
+ file_path (str, optional):
437
+ Output file path. Defaults to "output.wav".
438
+ """
439
+ wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
440
+ save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
441
+ return file_path
442
+
443
+ def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
444
+ """Convert text to speech with voice conversion.
445
+
446
+ It combines TTS with voice conversion to approximate voice cloning.
447
+
448
+ - Convert text to speech with tts.
449
+ - Convert the output wav to target speaker with voice conversion.
450
+
451
+ Args:
452
+ text (str):
453
+ Input text to synthesize.
454
+ language (str, optional):
455
+ Language code for multi-lingual models. You can check whether the loaded model is multi-lingual by
456
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
457
+ speaker_wav (str, optional):
458
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
459
+ Defaults to None.
460
+ """
461
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
462
+ # Lazy code... save it to a temp file to resample it while reading it for VC
463
+ self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name, speaker_wav=speaker_wav)
464
+ if self.voice_converter is None:
465
+ self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
466
+ wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
467
+ return wav
468
+
469
+ def tts_with_vc_to_file(
470
+ self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
471
+ ):
472
+ """Convert text to speech with voice conversion and save to file.
473
+
474
+ Check `tts_with_vc` for more details.
475
+
476
+ Args:
477
+ text (str):
478
+ Input text to synthesize.
479
+ language (str, optional):
480
+ Language code for multi-lingual models. You can check whether the loaded model is multi-lingual by
481
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
482
+ speaker_wav (str, optional):
483
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
484
+ Defaults to None.
485
+ file_path (str, optional):
486
+ Output file path. Defaults to "output.wav".
487
+ """
488
+ wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
489
+ save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
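The API methods documented above are easiest to see end to end. Below is a minimal usage sketch of this `TTS.api.TTS` class; the model ids and the wav paths (`reference.wav`, `source.wav`, `target.wav`) are placeholders, and the listed model names are assumed to be available via `TTS().list_models()`.

```python
# Minimal sketch of the API methods defined above (model ids and wav paths are
# placeholders; check TTS().list_models() for what is actually available).
from TTS.api import TTS

# Voice cloning from a single reference clip with a multi-lingual, multi-speaker model.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts")
tts.tts_to_file(
    text="Hello, this is a cloned voice.",
    speaker_wav="reference.wav",  # one short, clean audio sample of the target voice
    language="en",
    file_path="cloned.wav",
)

# Pure voice conversion: re-speak source.wav in the voice of target.wav (FreeVC).
vc = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(
    source_wav="source.wav",
    target_wav="target.wav",
    file_path="converted.wav",
)
```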
TTS/bin/__init__.py ADDED
File without changes
TTS/bin/collect_env_info.py ADDED
@@ -0,0 +1,48 @@
1
+ """Get detailed info about the working environment."""
2
+ import os
3
+ import platform
4
+ import sys
5
+
6
+ import numpy
7
+ import torch
8
+
9
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
10
+ import json
11
+
12
+ import TTS
13
+
14
+
15
+ def system_info():
16
+ return {
17
+ "OS": platform.system(),
18
+ "architecture": platform.architecture(),
19
+ "version": platform.version(),
20
+ "processor": platform.processor(),
21
+ "python": platform.python_version(),
22
+ }
23
+
24
+
25
+ def cuda_info():
26
+ return {
27
+ "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
28
+ "available": torch.cuda.is_available(),
29
+ "version": torch.version.cuda,
30
+ }
31
+
32
+
33
+ def package_info():
34
+ return {
35
+ "numpy": numpy.__version__,
36
+ "PyTorch_version": torch.__version__,
37
+ "PyTorch_debug": torch.version.debug,
38
+ "TTS": TTS.__version__,
39
+ }
40
+
41
+
42
+ def main():
43
+ details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
44
+ print(json.dumps(details, indent=4, sort_keys=True))
45
+
46
+
47
+ if __name__ == "__main__":
48
+ main()
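Since the helpers above are plain functions behind a `__main__` guard, they can also be imported into your own diagnostics; a small sketch:

```python
# Sketch: reusing the helpers from collect_env_info.py programmatically.
from TTS.bin.collect_env_info import cuda_info, package_info, system_info

info = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
print(info["System"]["python"], info["CUDA"]["available"], info["Packages"]["TTS"])
```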
TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,165 @@
1
+ import argparse
2
+ import importlib
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+ from tqdm import tqdm
10
+
11
+ from TTS.config import load_config
12
+ from TTS.tts.datasets.TTSDataset import TTSDataset
13
+ from TTS.tts.models import setup_model
14
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
15
+ from TTS.utils.audio import AudioProcessor
16
+ from TTS.utils.io import load_checkpoint
17
+
18
+ if __name__ == "__main__":
19
+ # pylint: disable=bad-option-value
20
+ parser = argparse.ArgumentParser(
21
+ description="""Extract attention masks from trained Tacotron/Tacotron2 models.
22
+ These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
23
+ """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
24
+ (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
25
+ """
26
+ Example run:
27
+ CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
28
+ --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
29
+ --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
30
+ --dataset_metafile metadata.csv
31
+ --data_path /root/LJSpeech-1.1/
32
+ --batch_size 32
33
+ --dataset ljspeech
34
+ --use_cuda True
35
+ """,
36
+ formatter_class=RawTextHelpFormatter,
37
+ )
38
+ parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
39
+ parser.add_argument(
40
+ "--config_path",
41
+ type=str,
42
+ required=True,
43
+ help="Path to Tacotron/Tacotron2 config file.",
44
+ )
45
+ parser.add_argument(
46
+ "--dataset",
47
+ type=str,
48
+ default="",
49
+ required=True,
50
+ help="Target dataset processor name from TTS.tts.dataset.preprocess.",
51
+ )
52
+
53
+ parser.add_argument(
54
+ "--dataset_metafile",
55
+ type=str,
56
+ default="",
57
+ required=True,
58
+ help="Dataset metafile inclusing file paths with transcripts.",
59
+ )
60
+ parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
61
+ parser.add_argument("--use_cuda", type=bool, default=False, help="enable/disable cuda.")
62
+
63
+ parser.add_argument(
64
+ "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
65
+ )
66
+ args = parser.parse_args()
67
+
68
+ C = load_config(args.config_path)
69
+ ap = AudioProcessor(**C.audio)
70
+
71
+ # if the vocabulary was passed, replace the default
72
+ if "characters" in C.keys():
73
+ symbols, phonemes = make_symbols(**C.characters)
74
+
75
+ # load the model
76
+ num_chars = len(phonemes) if C.use_phonemes else len(symbols)
77
+ # TODO: handle multi-speaker
78
+ model = setup_model(C)
79
+ model, _ = load_checkpoint(model, args.model_path, args.use_cuda, True)
80
+
81
+ # data loader
82
+ preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
83
+ preprocessor = getattr(preprocessor, args.dataset)
84
+ meta_data = preprocessor(args.data_path, args.dataset_metafile)
85
+ dataset = TTSDataset(
86
+ model.decoder.r,
87
+ C.text_cleaner,
88
+ compute_linear_spec=False,
89
+ ap=ap,
90
+ meta_data=meta_data,
91
+ characters=C.characters if "characters" in C.keys() else None,
92
+ add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
93
+ use_phonemes=C.use_phonemes,
94
+ phoneme_cache_path=C.phoneme_cache_path,
95
+ phoneme_language=C.phoneme_language,
96
+ enable_eos_bos=C.enable_eos_bos_chars,
97
+ )
98
+
99
+ dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
100
+ loader = DataLoader(
101
+ dataset,
102
+ batch_size=args.batch_size,
103
+ num_workers=4,
104
+ collate_fn=dataset.collate_fn,
105
+ shuffle=False,
106
+ drop_last=False,
107
+ )
108
+
109
+ # compute attentions
110
+ file_paths = []
111
+ with torch.no_grad():
112
+ for data in tqdm(loader):
113
+ # setup input data
114
+ text_input = data[0]
115
+ text_lengths = data[1]
116
+ linear_input = data[3]
117
+ mel_input = data[4]
118
+ mel_lengths = data[5]
119
+ stop_targets = data[6]
120
+ item_idxs = data[7]
121
+
122
+ # dispatch data to GPU
123
+ if args.use_cuda:
124
+ text_input = text_input.cuda()
125
+ text_lengths = text_lengths.cuda()
126
+ mel_input = mel_input.cuda()
127
+ mel_lengths = mel_lengths.cuda()
128
+
129
+ model_outputs = model.forward(text_input, text_lengths, mel_input)
130
+
131
+ alignments = model_outputs["alignments"].detach()
132
+ for idx, alignment in enumerate(alignments):
133
+ item_idx = item_idxs[idx]
134
+ # interpolate if r > 1
135
+ alignment = (
136
+ torch.nn.functional.interpolate(
137
+ alignment.transpose(0, 1).unsqueeze(0),
138
+ size=None,
139
+ scale_factor=model.decoder.r,
140
+ mode="nearest",
141
+ align_corners=None,
142
+ recompute_scale_factor=None,
143
+ )
144
+ .squeeze(0)
145
+ .transpose(0, 1)
146
+ )
147
+ # remove paddings
148
+ alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
149
+ # set file paths
150
+ wav_file_name = os.path.basename(item_idx)
151
+ align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
152
+ file_path = item_idx.replace(wav_file_name, align_file_name)
153
+ # save output
154
+ wav_file_abs_path = os.path.abspath(item_idx)
155
+ file_abs_path = os.path.abspath(file_path)
156
+ file_paths.append([wav_file_abs_path, file_abs_path])
157
+ np.save(file_path, alignment)
158
+
159
+ # output metafile
160
+ metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
161
+
162
+ with open(metafile, "w", encoding="utf-8") as f:
163
+ for p in file_paths:
164
+ f.write(f"{p[0]}|{p[1]}\n")
165
+ print(f" >> Metafile created: {metafile}")
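Each saved `*_attn.npy` holds the de-padded alignment for one clip; a quick way to inspect one (the path is a placeholder):

```python
# Sketch: inspecting a saved attention mask (path is a placeholder).
import numpy as np

attn = np.load("/root/LJSpeech-1.1/wavs/LJ001-0001_attn.npy")
# Shape is (mel_frames, text_length) after the padding is stripped above.
print(attn.shape)
```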
TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,197 @@
1
+ import argparse
2
+ import os
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ import torch
6
+ from tqdm import tqdm
7
+
8
+ from TTS.config import load_config
9
+ from TTS.config.shared_configs import BaseDatasetConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.utils.managers import save_file
12
+ from TTS.tts.utils.speakers import SpeakerManager
13
+
14
+
15
+ def compute_embeddings(
16
+ model_path,
17
+ config_path,
18
+ output_path,
19
+ old_speakers_file=None,
20
+ old_append=False,
21
+ config_dataset_path=None,
22
+ formatter_name=None,
23
+ dataset_name=None,
24
+ dataset_path=None,
25
+ meta_file_train=None,
26
+ meta_file_val=None,
27
+ disable_cuda=False,
28
+ no_eval=False,
29
+ ):
30
+ use_cuda = torch.cuda.is_available() and not disable_cuda
31
+
32
+ if config_dataset_path is not None:
33
+ c_dataset = load_config(config_dataset_path)
34
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
35
+ else:
36
+ c_dataset = BaseDatasetConfig()
37
+ c_dataset.formatter = formatter_name
38
+ c_dataset.dataset_name = dataset_name
39
+ c_dataset.path = dataset_path
40
+ if meta_file_train is not None:
41
+ c_dataset.meta_file_train = meta_file_train
42
+ if meta_file_val is not None:
43
+ c_dataset.meta_file_val = meta_file_val
44
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
45
+
46
+ if meta_data_eval is None:
47
+ samples = meta_data_train
48
+ else:
49
+ samples = meta_data_train + meta_data_eval
50
+
51
+ encoder_manager = SpeakerManager(
52
+ encoder_model_path=model_path,
53
+ encoder_config_path=config_path,
54
+ d_vectors_file_path=old_speakers_file,
55
+ use_cuda=use_cuda,
56
+ )
57
+
58
+ class_name_key = encoder_manager.encoder_config.class_name_key
59
+
60
+ # compute speaker embeddings
61
+ if old_speakers_file is not None and old_append:
62
+ speaker_mapping = encoder_manager.embeddings
63
+ else:
64
+ speaker_mapping = {}
65
+
66
+ for fields in tqdm(samples):
67
+ class_name = fields[class_name_key]
68
+ audio_file = fields["audio_file"]
69
+ embedding_key = fields["audio_unique_name"]
70
+
71
+ # Only update the speaker name when the embedding is already in the old file.
72
+ if embedding_key in speaker_mapping:
73
+ speaker_mapping[embedding_key]["name"] = class_name
74
+ continue
75
+
76
+ if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
77
+ # get the embedding from the old file
78
+ embedd = encoder_manager.get_embedding_by_clip(embedding_key)
79
+ else:
80
+ # extract the embedding
81
+ embedd = encoder_manager.compute_embedding_from_clip(audio_file)
82
+
83
+ # create speaker_mapping if target dataset is defined
84
+ speaker_mapping[embedding_key] = {}
85
+ speaker_mapping[embedding_key]["name"] = class_name
86
+ speaker_mapping[embedding_key]["embedding"] = embedd
87
+
88
+ if speaker_mapping:
89
+ # save speaker_mapping if target dataset is defined
90
+ if os.path.isdir(output_path):
91
+ mapping_file_path = os.path.join(output_path, "speakers.pth")
92
+ else:
93
+ mapping_file_path = output_path
94
+
95
+ if os.path.dirname(mapping_file_path) != "":
96
+ os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
97
+
98
+ save_file(speaker_mapping, mapping_file_path)
99
+ print("Speaker embeddings saved at:", mapping_file_path)
100
+
101
+
102
+ if __name__ == "__main__":
103
+ parser = argparse.ArgumentParser(
104
+ description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
105
+ """
106
+ Example runs:
107
+ python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
108
+
109
+ python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
110
+ """,
111
+ formatter_class=RawTextHelpFormatter,
112
+ )
113
+ parser.add_argument(
114
+ "--model_path",
115
+ type=str,
116
+ help="Path to model checkpoint file. It defaults to the released speaker encoder.",
117
+ default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
118
+ )
119
+ parser.add_argument(
120
+ "--config_path",
121
+ type=str,
122
+ help="Path to model config file. It defaults to the released speaker encoder config.",
123
+ default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
124
+ )
125
+ parser.add_argument(
126
+ "--config_dataset_path",
127
+ type=str,
128
+ help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
129
+ default=None,
130
+ )
131
+ parser.add_argument(
132
+ "--output_path",
133
+ type=str,
134
+ help="Path for output `pth` or `json` file.",
135
+ default="speakers.pth",
136
+ )
137
+ parser.add_argument(
138
+ "--old_file",
139
+ type=str,
140
+ help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
141
+ default=None,
142
+ )
143
+ parser.add_argument(
144
+ "--old_append",
145
+ help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
146
+ default=False,
147
+ action="store_true",
148
+ )
149
+ parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
150
+ parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
151
+ parser.add_argument(
152
+ "--formatter_name",
153
+ type=str,
154
+ help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
155
+ default=None,
156
+ )
157
+ parser.add_argument(
158
+ "--dataset_name",
159
+ type=str,
160
+ help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
161
+ default=None,
162
+ )
163
+ parser.add_argument(
164
+ "--dataset_path",
165
+ type=str,
166
+ help="Path to the dataset. You either need to provide this or `config_dataset_path`",
167
+ default=None,
168
+ )
169
+ parser.add_argument(
170
+ "--meta_file_train",
171
+ type=str,
172
+ help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
173
+ default=None,
174
+ )
175
+ parser.add_argument(
176
+ "--meta_file_val",
177
+ type=str,
178
+ help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
179
+ default=None,
180
+ )
181
+ args = parser.parse_args()
182
+
183
+ compute_embeddings(
184
+ args.model_path,
185
+ args.config_path,
186
+ args.output_path,
187
+ old_speakers_file=args.old_file,
188
+ old_append=args.old_append,
189
+ config_dataset_path=args.config_dataset_path,
190
+ formatter_name=args.formatter_name,
191
+ dataset_name=args.dataset_name,
192
+ dataset_path=args.dataset_path,
193
+ meta_file_train=args.meta_file_train,
194
+ meta_file_val=args.meta_file_val,
195
+ disable_cuda=args.disable_cuda,
196
+ no_eval=args.no_eval,
197
+ )
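Because the CLI simply forwards its arguments to `compute_embeddings`, the same flow can be driven from Python; a sketch with placeholder paths and an assumed `ljspeech` formatter:

```python
# Sketch: calling compute_embeddings() directly (paths and formatter are placeholders).
from TTS.bin.compute_embeddings import compute_embeddings

compute_embeddings(
    model_path="model_se.pth.tar",        # speaker encoder checkpoint
    config_path="config_se.json",         # matching encoder config
    output_path="speakers.pth",           # where the d-vector mapping is written
    formatter_name="ljspeech",            # any formatter from TTS.tts.datasets.formatters
    dataset_name="ljspeech",
    dataset_path="/path/to/LJSpeech-1.1",
    meta_file_train="metadata.csv",
    no_eval=True,                         # skip the eval split if the metafile has none
)
```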
TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import os
7
+
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+
11
+ # from TTS.utils.io import load_config
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import load_tts_samples
14
+ from TTS.utils.audio import AudioProcessor
15
+
16
+
17
+ def main():
18
+ """Run preprocessing process."""
19
+ parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogram features.")
20
+ parser.add_argument("config_path", type=str, help="TTS config file path to define audio processing parameters.")
21
+ parser.add_argument("out_path", type=str, help="save path (directory and filename).")
22
+ parser.add_argument(
23
+ "--data_path",
24
+ type=str,
25
+ required=False,
26
+ help="folder including the target set of wavs overriding dataset config.",
27
+ )
28
+ args, overrides = parser.parse_known_args()
29
+
30
+ CONFIG = load_config(args.config_path)
31
+ CONFIG.parse_known_args(overrides, relaxed_parser=True)
32
+
33
+ # load config
34
+ CONFIG.audio.signal_norm = False # do not apply earlier normalization
35
+ CONFIG.audio.stats_path = None # discard pre-defined stats
36
+
37
+ # load audio processor
38
+ ap = AudioProcessor(**CONFIG.audio.to_dict())
39
+
40
+ # load the meta data of target dataset
41
+ if args.data_path:
42
+ dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
43
+ else:
44
+ dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
45
+ print(f" > There are {len(dataset_items)} files.")
46
+
47
+ mel_sum = 0
48
+ mel_square_sum = 0
49
+ linear_sum = 0
50
+ linear_square_sum = 0
51
+ N = 0
52
+ for item in tqdm(dataset_items):
53
+ # compute features
54
+ wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
55
+ linear = ap.spectrogram(wav)
56
+ mel = ap.melspectrogram(wav)
57
+
58
+ # compute stats
59
+ N += mel.shape[1]
60
+ mel_sum += mel.sum(1)
61
+ linear_sum += linear.sum(1)
62
+ mel_square_sum += (mel**2).sum(axis=1)
63
+ linear_square_sum += (linear**2).sum(axis=1)
64
+
65
+ mel_mean = mel_sum / N
66
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
67
+ linear_mean = linear_sum / N
68
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
69
+
70
+ output_file_path = args.out_path
71
+ stats = {}
72
+ stats["mel_mean"] = mel_mean
73
+ stats["mel_std"] = mel_scale
74
+ stats["linear_mean"] = linear_mean
75
+ stats["linear_std"] = linear_scale
76
+
77
+ print(f" > Avg mel spec mean: {mel_mean.mean()}")
78
+ print(f" > Avg mel spec scale: {mel_scale.mean()}")
79
+ print(f" > Avg linear spec mean: {linear_mean.mean()}")
80
+ print(f" > Avg linear spec scale: {linear_scale.mean()}")
81
+
82
+ # set default config values for mean-var scaling
83
+ CONFIG.audio.stats_path = output_file_path
84
+ CONFIG.audio.signal_norm = True
85
+ # remove redundant values
86
+ del CONFIG.audio.max_norm
87
+ del CONFIG.audio.min_level_db
88
+ del CONFIG.audio.symmetric_norm
89
+ del CONFIG.audio.clip_norm
90
+ stats["audio_config"] = CONFIG.audio.to_dict()
91
+ np.save(output_file_path, stats, allow_pickle=True)
92
+ print(f" > stats saved to {output_file_path}")
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
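The resulting `.npy` file stores a pickled dict; a sketch of reading it back (the filename is a placeholder, key names follow the `stats` dict above):

```python
# Sketch: loading the stats file written by this script (filename is a placeholder).
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(sorted(stats.keys()))      # ['audio_config', 'linear_mean', 'linear_std', 'mel_mean', 'mel_std']
print(stats["mel_mean"].shape)   # one value per mel band
```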
TTS/bin/eval_encoder.py ADDED
@@ -0,0 +1,88 @@
1
+ import argparse
2
+ from argparse import RawTextHelpFormatter
3
+
4
+ import torch
5
+ from tqdm import tqdm
6
+
7
+ from TTS.config import load_config
8
+ from TTS.tts.datasets import load_tts_samples
9
+ from TTS.tts.utils.speakers import SpeakerManager
10
+
11
+
12
+ def compute_encoder_accuracy(dataset_items, encoder_manager):
13
+ class_name_key = encoder_manager.encoder_config.class_name_key
14
+ map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
15
+
16
+ class_acc_dict = {}
17
+
18
+ # compute embeddings for all wav_files
19
+ for item in tqdm(dataset_items):
20
+ class_name = item[class_name_key]
21
+ wav_file = item["audio_file"]
22
+
23
+ # extract the embedding
24
+ embedd = encoder_manager.compute_embedding_from_clip(wav_file)
25
+ if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
26
+ embedding = torch.FloatTensor(embedd).unsqueeze(0)
27
+ if encoder_manager.use_cuda:
28
+ embedding = embedding.cuda()
29
+
30
+ class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
31
+ predicted_label = map_classid_to_classname[str(class_id)]
32
+ else:
33
+ predicted_label = None
34
+
35
+ if class_name is not None and predicted_label is not None:
36
+ is_equal = int(class_name == predicted_label)
37
+ if class_name not in class_acc_dict:
38
+ class_acc_dict[class_name] = [is_equal]
39
+ else:
40
+ class_acc_dict[class_name].append(is_equal)
41
+ else:
42
+ raise RuntimeError("Error: class_name or/and predicted_label are None")
43
+
44
+ acc_avg = 0
45
+ for key, values in class_acc_dict.items():
46
+ acc = sum(values) / len(values)
47
+ print("Class", key, "Accuracy:", acc)
48
+ acc_avg += acc
49
+
50
+ print("Average Accuracy:", acc_avg / len(class_acc_dict))
51
+
52
+
53
+ if __name__ == "__main__":
54
+ parser = argparse.ArgumentParser(
55
+ description="""Compute the accuracy of the encoder.\n\n"""
56
+ """
57
+ Example runs:
58
+ python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
59
+ """,
60
+ formatter_class=RawTextHelpFormatter,
61
+ )
62
+ parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
63
+ parser.add_argument(
64
+ "config_path",
65
+ type=str,
66
+ help="Path to model config file.",
67
+ )
68
+
69
+ parser.add_argument(
70
+ "config_dataset_path",
71
+ type=str,
72
+ help="Path to dataset config file.",
73
+ )
74
+ parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
75
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
76
+
77
+ args = parser.parse_args()
78
+
79
+ c_dataset = load_config(args.config_dataset_path)
80
+
81
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
82
+ items = meta_data_train + meta_data_eval
83
+
84
+ enc_manager = SpeakerManager(
85
+ encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
86
+ )
87
+
88
+ compute_encoder_accuracy(items, enc_manager)
TTS/bin/extract_tts_spectrograms.py ADDED
@@ -0,0 +1,287 @@
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import os
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+ from tqdm import tqdm
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
14
+ from TTS.tts.models import setup_model
15
+ from TTS.tts.utils.speakers import SpeakerManager
16
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.audio.numpy_transforms import quantize
19
+ from TTS.utils.generic_utils import count_parameters
20
+
21
+ use_cuda = torch.cuda.is_available()
22
+
23
+
24
+ def setup_loader(ap, r, verbose=False):
25
+ tokenizer, _ = TTSTokenizer.init_from_config(c)
26
+ dataset = TTSDataset(
27
+ outputs_per_step=r,
28
+ compute_linear_spec=False,
29
+ samples=meta_data,
30
+ tokenizer=tokenizer,
31
+ ap=ap,
32
+ batch_group_size=0,
33
+ min_text_len=c.min_text_len,
34
+ max_text_len=c.max_text_len,
35
+ min_audio_len=c.min_audio_len,
36
+ max_audio_len=c.max_audio_len,
37
+ phoneme_cache_path=c.phoneme_cache_path,
38
+ precompute_num_workers=0,
39
+ use_noise_augment=False,
40
+ verbose=verbose,
41
+ speaker_id_mapping=speaker_manager.name_to_id if c.use_speaker_embedding else None,
42
+ d_vector_mapping=speaker_manager.embeddings if c.use_d_vector_file else None,
43
+ )
44
+
45
+ if c.use_phonemes and c.compute_input_seq_cache:
46
+ # precompute phonemes to have a better estimate of sequence lengths.
47
+ dataset.compute_input_seq(c.num_loader_workers)
48
+ dataset.preprocess_samples()
49
+
50
+ loader = DataLoader(
51
+ dataset,
52
+ batch_size=c.batch_size,
53
+ shuffle=False,
54
+ collate_fn=dataset.collate_fn,
55
+ drop_last=False,
56
+ sampler=None,
57
+ num_workers=c.num_loader_workers,
58
+ pin_memory=False,
59
+ )
60
+ return loader
61
+
62
+
63
+ def set_filename(wav_path, out_path):
64
+ wav_file = os.path.basename(wav_path)
65
+ file_name = wav_file.split(".")[0]
66
+ os.makedirs(os.path.join(out_path, "quant"), exist_ok=True)
67
+ os.makedirs(os.path.join(out_path, "mel"), exist_ok=True)
68
+ os.makedirs(os.path.join(out_path, "wav_gl"), exist_ok=True)
69
+ os.makedirs(os.path.join(out_path, "wav"), exist_ok=True)
70
+ wavq_path = os.path.join(out_path, "quant", file_name)
71
+ mel_path = os.path.join(out_path, "mel", file_name)
72
+ wav_gl_path = os.path.join(out_path, "wav_gl", file_name + ".wav")
73
+ wav_path = os.path.join(out_path, "wav", file_name + ".wav")
74
+ return file_name, wavq_path, mel_path, wav_gl_path, wav_path
75
+
76
+
77
+ def format_data(data):
78
+ # setup input data
79
+ text_input = data["token_id"]
80
+ text_lengths = data["token_id_lengths"]
81
+ mel_input = data["mel"]
82
+ mel_lengths = data["mel_lengths"]
83
+ item_idx = data["item_idxs"]
84
+ d_vectors = data["d_vectors"]
85
+ speaker_ids = data["speaker_ids"]
86
+ attn_mask = data["attns"]
87
+ avg_text_length = torch.mean(text_lengths.float())
88
+ avg_spec_length = torch.mean(mel_lengths.float())
89
+
90
+ # dispatch data to GPU
91
+ if use_cuda:
92
+ text_input = text_input.cuda(non_blocking=True)
93
+ text_lengths = text_lengths.cuda(non_blocking=True)
94
+ mel_input = mel_input.cuda(non_blocking=True)
95
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
96
+ if speaker_ids is not None:
97
+ speaker_ids = speaker_ids.cuda(non_blocking=True)
98
+ if d_vectors is not None:
99
+ d_vectors = d_vectors.cuda(non_blocking=True)
100
+ if attn_mask is not None:
101
+ attn_mask = attn_mask.cuda(non_blocking=True)
102
+ return (
103
+ text_input,
104
+ text_lengths,
105
+ mel_input,
106
+ mel_lengths,
107
+ speaker_ids,
108
+ d_vectors,
109
+ avg_text_length,
110
+ avg_spec_length,
111
+ attn_mask,
112
+ item_idx,
113
+ )
114
+
115
+
116
+ @torch.no_grad()
117
+ def inference(
118
+ model_name,
119
+ model,
120
+ ap,
121
+ text_input,
122
+ text_lengths,
123
+ mel_input,
124
+ mel_lengths,
125
+ speaker_ids=None,
126
+ d_vectors=None,
127
+ ):
128
+ if model_name == "glow_tts":
129
+ speaker_c = None
130
+ if speaker_ids is not None:
131
+ speaker_c = speaker_ids
132
+ elif d_vectors is not None:
133
+ speaker_c = d_vectors
134
+ outputs = model.inference_with_MAS(
135
+ text_input,
136
+ text_lengths,
137
+ mel_input,
138
+ mel_lengths,
139
+ aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
140
+ )
141
+ model_output = outputs["model_outputs"]
142
+ model_output = model_output.detach().cpu().numpy()
143
+
144
+ elif "tacotron" in model_name:
145
+ aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
146
+ outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
147
+ postnet_outputs = outputs["model_outputs"]
148
+ # normalize tacotron output
149
+ if model_name == "tacotron":
150
+ mel_specs = []
151
+ postnet_outputs = postnet_outputs.data.cpu().numpy()
152
+ for b in range(postnet_outputs.shape[0]):
153
+ postnet_output = postnet_outputs[b]
154
+ mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
155
+ model_output = torch.stack(mel_specs).cpu().numpy()
156
+
157
+ elif model_name == "tacotron2":
158
+ model_output = postnet_outputs.detach().cpu().numpy()
159
+ return model_output
160
+
161
+
162
+ def extract_spectrograms(
163
+ data_loader, model, ap, output_path, quantize_bits=0, save_audio=False, debug=False, metada_name="metada.txt"
164
+ ):
165
+ model.eval()
166
+ export_metadata = []
167
+ for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
168
+ # format data
169
+ (
170
+ text_input,
171
+ text_lengths,
172
+ mel_input,
173
+ mel_lengths,
174
+ speaker_ids,
175
+ d_vectors,
176
+ _,
177
+ _,
178
+ _,
179
+ item_idx,
180
+ ) = format_data(data)
181
+
182
+ model_output = inference(
183
+ c.model.lower(),
184
+ model,
185
+ ap,
186
+ text_input,
187
+ text_lengths,
188
+ mel_input,
189
+ mel_lengths,
190
+ speaker_ids,
191
+ d_vectors,
192
+ )
193
+
194
+ for idx in range(text_input.shape[0]):
195
+ wav_file_path = item_idx[idx]
196
+ wav = ap.load_wav(wav_file_path)
197
+ _, wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
198
+
199
+ # quantize and save wav
200
+ if quantize_bits > 0:
201
+ wavq = quantize(wav, quantize_bits)
202
+ np.save(wavq_path, wavq)
203
+
204
+ # save TTS mel
205
+ mel = model_output[idx]
206
+ mel_length = mel_lengths[idx]
207
+ mel = mel[:mel_length, :].T
208
+ np.save(mel_path, mel)
209
+
210
+ export_metadata.append([wav_file_path, mel_path])
211
+ if save_audio:
212
+ ap.save_wav(wav, wav_path)
213
+
214
+ if debug:
215
+ print("Audio for debug saved at:", wav_gl_path)
216
+ wav = ap.inv_melspectrogram(mel)
217
+ ap.save_wav(wav, wav_gl_path)
218
+
219
+ with open(os.path.join(output_path, metada_name), "w", encoding="utf-8") as f:
220
+ for data in export_metadata:
221
+ f.write(f"{data[0]}|{data[1]+'.npy'}\n")
222
+
223
+
224
+ def main(args): # pylint: disable=redefined-outer-name
225
+ # pylint: disable=global-variable-undefined
226
+ global meta_data, speaker_manager
227
+
228
+ # Audio processor
229
+ ap = AudioProcessor(**c.audio)
230
+
231
+ # load data instances
232
+ meta_data_train, meta_data_eval = load_tts_samples(
233
+ c.datasets, eval_split=args.eval, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
234
+ )
235
+
236
+ # use eval and training partitions
237
+ meta_data = meta_data_train + meta_data_eval
238
+
239
+ # init speaker manager
240
+ if c.use_speaker_embedding:
241
+ speaker_manager = SpeakerManager(data_items=meta_data)
242
+ elif c.use_d_vector_file:
243
+ speaker_manager = SpeakerManager(d_vectors_file_path=c.d_vector_file)
244
+ else:
245
+ speaker_manager = None
246
+
247
+ # setup model
248
+ model = setup_model(c)
249
+
250
+ # restore model
251
+ model.load_checkpoint(c, args.checkpoint_path, eval=True)
252
+
253
+ if use_cuda:
254
+ model.cuda()
255
+
256
+ num_params = count_parameters(model)
257
+ print("\n > Model has {} parameters".format(num_params), flush=True)
258
+ # set r
259
+ r = 1 if c.model.lower() == "glow_tts" else model.decoder.r
260
+ own_loader = setup_loader(ap, r, verbose=True)
261
+
262
+ extract_spectrograms(
263
+ own_loader,
264
+ model,
265
+ ap,
266
+ args.output_path,
267
+ quantize_bits=args.quantize_bits,
268
+ save_audio=args.save_audio,
269
+ debug=args.debug,
270
+ metada_name="metada.txt",
271
+ )
272
+
273
+
274
+ if __name__ == "__main__":
275
+ parser = argparse.ArgumentParser()
276
+ parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
277
+ parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
278
+ parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
279
+ parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
280
+ parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
281
+ parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
282
+ parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
283
+ args = parser.parse_args()
284
+
285
+ c = load_config(args.config_path)
286
+ c.audio.trim_silence = False
287
+ main(args)
TTS/bin/find_unique_chars.py ADDED
@@ -0,0 +1,45 @@
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ from argparse import RawTextHelpFormatter
4
+
5
+ from TTS.config import load_config
6
+ from TTS.tts.datasets import load_tts_samples
7
+
8
+
9
+ def main():
10
+ # pylint: disable=bad-option-value
11
+ parser = argparse.ArgumentParser(
12
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
13
+ """
14
+ Example runs:
15
+
16
+ python TTS/bin/find_unique_chars.py --config_path config.json
17
+ """,
18
+ formatter_class=RawTextHelpFormatter,
19
+ )
20
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
21
+ args = parser.parse_args()
22
+
23
+ c = load_config(args.config_path)
24
+
25
+ # load all datasets
26
+ train_items, eval_items = load_tts_samples(
27
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
28
+ )
29
+
30
+ items = train_items + eval_items
31
+
32
+ texts = "".join(item["text"] for item in items)
33
+ chars = set(texts)
34
+ lower_chars = filter(lambda c: c.islower(), chars)
35
+ chars_force_lower = [c.lower() for c in chars]
36
+ chars_force_lower = set(chars_force_lower)
37
+
38
+ print(f" > Number of unique characters: {len(chars)}")
39
+ print(f" > Unique characters: {''.join(sorted(chars))}")
40
+ print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
41
+ print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
TTS/bin/find_unique_phonemes.py ADDED
@@ -0,0 +1,74 @@
1
+ """Find all the unique characters in a dataset"""
2
+ import argparse
3
+ import multiprocessing
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ from tqdm.contrib.concurrent import process_map
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.utils.text.phonemizers import Gruut
11
+
12
+
13
+ def compute_phonemes(item):
14
+ text = item["text"]
15
+ ph = phonemizer.phonemize(text).replace("|", "")
16
+ return set(list(ph))
17
+
18
+
19
+ def main():
20
+ # pylint: disable=W0601
21
+ global c, phonemizer
22
+ # pylint: disable=bad-option-value
23
+ parser = argparse.ArgumentParser(
24
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
25
+ """
26
+ Example runs:
27
+
28
+ python TTS/bin/find_unique_phonemes.py --config_path config.json
29
+ """,
30
+ formatter_class=RawTextHelpFormatter,
31
+ )
32
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
33
+ args = parser.parse_args()
34
+
35
+ c = load_config(args.config_path)
36
+
37
+ # load all datasets
38
+ train_items, eval_items = load_tts_samples(
39
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
40
+ )
41
+ items = train_items + eval_items
42
+ print("Num items:", len(items))
43
+
44
+ language_list = [item["language"] for item in items]
45
+ is_lang_def = all(language_list)
46
+
47
+ if not c.phoneme_language or not is_lang_def:
48
+ raise ValueError("Phoneme language must be defined in config.")
49
+
50
+ if not language_list.count(language_list[0]) == len(language_list):
51
+ raise ValueError(
52
+ "Currently, only one phoneme language per config file is supported. Please split the dataset config into separate configs and run each language individually."
53
+ )
54
+
55
+ phonemizer = Gruut(language=language_list[0], keep_puncs=True)
56
+
57
+ phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
58
+ phones = []
59
+ for ph in phonemes:
60
+ phones.extend(ph)
61
+
62
+ phones = set(phones)
63
+ lower_phones = filter(lambda c: c.islower(), phones)
64
+ phones_force_lower = [c.lower() for c in phones]
65
+ phones_force_lower = set(phones_force_lower)
66
+
67
+ print(f" > Number of unique phonemes: {len(phones)}")
68
+ print(f" > Unique phonemes: {''.join(sorted(phones))}")
69
+ print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
70
+ print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
71
+
72
+
73
+ if __name__ == "__main__":
74
+ main()
TTS/bin/remove_silence_using_vad.py ADDED
@@ -0,0 +1,124 @@
1
+ import argparse
2
+ import glob
3
+ import multiprocessing
4
+ import os
5
+ import pathlib
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
11
+
12
+ torch.set_num_threads(1)
13
+
14
+
15
+ def adjust_path_and_remove_silence(audio_path):
16
+ output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
17
+ # ignore if the file exists
18
+ if os.path.exists(output_path) and not args.force:
19
+ return output_path, False
20
+
21
+ # create all directory structure
22
+ pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
23
+ # remove the silence and save the audio
24
+ output_path, is_speech = remove_silence(
25
+ model_and_utils,
26
+ audio_path,
27
+ output_path,
28
+ trim_just_beginning_and_end=args.trim_just_beginning_and_end,
29
+ use_cuda=args.use_cuda,
30
+ )
31
+ return output_path, is_speech
32
+
33
+
34
+ def preprocess_audios():
35
+ files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
36
+ print("> Number of files: ", len(files))
37
+ if not args.force:
38
+ print("> Ignoring files that already exist in the output directory.")
39
+
40
+ if args.trim_just_beginning_and_end:
41
+ print("> Trimming just the beginning and the end with nonspeech parts.")
42
+ else:
43
+ print("> Trimming all nonspeech parts.")
44
+
45
+ filtered_files = []
46
+ if files:
47
+ # create threads
48
+ # num_threads = multiprocessing.cpu_count()
49
+ # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
50
+
51
+ if args.num_processes > 1:
52
+ with multiprocessing.Pool(processes=args.num_processes) as pool:
53
+ results = list(
54
+ tqdm(
55
+ pool.imap_unordered(adjust_path_and_remove_silence, files),
56
+ total=len(files),
57
+ desc="Processing audio files",
58
+ )
59
+ )
60
+ for output_path, is_speech in results:
61
+ if not is_speech:
62
+ filtered_files.append(output_path)
63
+ else:
64
+ for f in tqdm(files):
65
+ output_path, is_speech = adjust_path_and_remove_silence(f)
66
+ if not is_speech:
67
+ filtered_files.append(output_path)
68
+
69
+ # write files that do not have speech
70
+ with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
71
+ for file in filtered_files:
72
+ f.write(str(file) + "\n")
73
+ else:
74
+ print("> No files Found !")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ parser = argparse.ArgumentParser(
79
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
80
+ )
81
+ parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
82
+ parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
83
+ parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
84
+ parser.add_argument(
85
+ "-g",
86
+ "--glob",
87
+ type=str,
88
+ default="**/*.wav",
89
+ help="path in glob format for accessing wavs from input_dir. ex: wav48/*/*.wav",
90
+ )
91
+ parser.add_argument(
92
+ "-t",
93
+ "--trim_just_beginning_and_end",
94
+ type=bool,
95
+ default=True,
96
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed. Default True",
97
+ )
98
+ parser.add_argument(
99
+ "-c",
100
+ "--use_cuda",
101
+ type=bool,
102
+ default=False,
103
+ help="If True use cuda",
104
+ )
105
+ parser.add_argument(
106
+ "--use_onnx",
107
+ type=bool,
108
+ default=False,
109
+ help="If True use onnx",
110
+ )
111
+ parser.add_argument(
112
+ "--num_processes",
113
+ type=int,
114
+ default=1,
115
+ help="Number of processes to use",
116
+ )
117
+ args = parser.parse_args()
118
+
119
+ if args.output_dir == "":
120
+ args.output_dir = args.input_dir
121
+
122
+ # load the model and utils
123
+ model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
124
+ preprocess_audios()
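To trim a single file without the CLI, the same two helpers can be used directly; the arguments below mirror the call in `adjust_path_and_remove_silence` above, and the wav paths are placeholders:

```python
# Sketch: trimming one file with the helpers this script uses (paths are placeholders).
from TTS.utils.vad import get_vad_model_and_utils, remove_silence

model_and_utils = get_vad_model_and_utils(use_cuda=False, use_onnx=False)
out_path, is_speech = remove_silence(
    model_and_utils,
    "input.wav",
    "trimmed.wav",
    trim_just_beginning_and_end=True,
    use_cuda=False,
)
print(out_path, is_speech)  # is_speech is False when no speech was detected
```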
TTS/bin/resample.py ADDED
@@ -0,0 +1,90 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from multiprocessing import Pool
6
+ from shutil import copytree
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
+ def resample_file(func_args):
14
+ filename, output_sr = func_args
15
+ y, sr = librosa.load(filename, sr=output_sr)
16
+ sf.write(filename, y, sr)
17
+
18
+
19
+ def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20
+ if output_dir:
21
+ print("Recursively copying the input folder...")
22
+ copytree(input_dir, output_dir)
23
+ input_dir = output_dir
24
+
25
+ print("Resampling the audio files...")
26
+ audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27
+ print(f"Found {len(audio_files)} files...")
28
+ audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29
+ with Pool(processes=n_jobs) as p:
30
+ with tqdm(total=len(audio_files)) as pbar:
31
+ for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32
+ pbar.update()
33
+
34
+ print("Done !")
35
+
36
+
37
+ if __name__ == "__main__":
38
+ parser = argparse.ArgumentParser(
39
+ description="""Resample a folder recursively with librosa
40
+ Can be used in place or to create a copy of the folder as an output.\n\n
41
+ Example run:
42
+ python TTS/bin/resample.py
43
+ --input_dir /root/LJSpeech-1.1/
44
+ --output_sr 22050
45
+ --output_dir /root/resampled_LJSpeech-1.1/
46
+ --file_ext wav
47
+ --n_jobs 24
48
+ """,
49
+ formatter_class=RawTextHelpFormatter,
50
+ )
51
+
52
+ parser.add_argument(
53
+ "--input_dir",
54
+ type=str,
55
+ default=None,
56
+ required=True,
57
+ help="Path of the folder containing the audio files to resample",
58
+ )
59
+
60
+ parser.add_argument(
61
+ "--output_sr",
62
+ type=int,
63
+ default=22050,
64
+ required=False,
65
+ help="Sample rate to which the audio files should be resampled",
66
+ )
67
+
68
+ parser.add_argument(
69
+ "--output_dir",
70
+ type=str,
71
+ default=None,
72
+ required=False,
73
+ help="Path of the destination folder. If not defined, the operation is done in place",
74
+ )
75
+
76
+ parser.add_argument(
77
+ "--file_ext",
78
+ type=str,
79
+ default="wav",
80
+ required=False,
81
+ help="Extension of the audio files to resample",
82
+ )
83
+
84
+ parser.add_argument(
85
+ "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86
+ )
87
+
88
+ args = parser.parse_args()
89
+
90
+ resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
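The resampling can also be invoked from Python; a sketch (the directory is a placeholder, and omitting `output_dir` resamples in place):

```python
# Sketch: in-place resampling from Python instead of the CLI (path is a placeholder).
from TTS.bin.resample import resample_files

resample_files("/path/to/LJSpeech-1.1", 22050, output_dir=None, file_ext="wav", n_jobs=8)
```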
TTS/bin/synthesize.py ADDED
@@ -0,0 +1,541 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import contextlib
6
+ import sys
7
+ from argparse import RawTextHelpFormatter
8
+
9
+ # pylint: disable=redefined-outer-name, unused-argument
10
+ from pathlib import Path
11
+
12
+ description = """
13
+ Synthesize speech on the command line.
14
+
15
+ You can either use your trained model or choose a model from the provided list.
16
+
17
+ If you don't specify any models, then it uses an LJSpeech-based English model.
18
+
19
+ #### Single Speaker Models
20
+
21
+ - List provided models:
22
+
23
+ ```
24
+ $ tts --list_models
25
+ ```
26
+
27
+ - Get model info (for both tts_models and vocoder_models):
28
+
29
+ - Query by type/name:
30
+ The model_info_by_name uses the name as it appears in --list_models.
31
+ ```
32
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
33
+ ```
34
+ For example:
35
+ ```
36
+ $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
37
+ $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
38
+ ```
39
+ - Query by type/idx:
40
+ The model_query_idx uses the corresponding idx from --list_models.
41
+
42
+ ```
43
+ $ tts --model_info_by_idx "<model_type>/<model_query_idx>"
44
+ ```
45
+
46
+ For example:
47
+
48
+ ```
49
+ $ tts --model_info_by_idx tts_models/3
50
+ ```
51
+
52
+ - Query model info by full name:
53
+ ```
54
+ $ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
55
+ ```
56
+
57
+ - Run TTS with default models:
58
+
59
+ ```
60
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav
61
+ ```
62
+
63
+ - Run TTS and pipe out the generated TTS wav file data:
64
+
65
+ ```
66
+ $ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
67
+ ```
68
+
69
+ - Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
70
+
71
+ ```
72
+ $ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
73
+ ```
74
+
75
+ - Run a TTS model with its default vocoder model:
76
+
77
+ ```
78
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
79
+ ```
80
+
81
+ For example:
82
+
83
+ ```
84
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav
85
+ ```
86
+
87
+ - Run with specific TTS and vocoder models from the list:
88
+
89
+ ```
90
+ $ tts --text "Text for TTS" --model_name "<model_type>/<language>/<dataset>/<model_name>" --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" --out_path output/path/speech.wav
91
+ ```
92
+
93
+ For example:
94
+
95
+ ```
96
+ $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav
97
+ ```
98
+
99
+ - Run your own TTS model (Using Griffin-Lim Vocoder):
100
+
101
+ ```
102
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
103
+ ```
104
+
105
+ - Run your own TTS and Vocoder models:
106
+
107
+ ```
108
+ $ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
109
+ --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
110
+ ```
111
+
112
+ #### Multi-speaker Models
113
+
114
+ - List the available speakers and choose a <speaker_id> among them:
115
+
116
+ ```
117
+ $ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
118
+ ```
119
+
120
+ - Run the multi-speaker TTS model with the target speaker ID:
121
+
122
+ ```
123
+ $ tts --text "Text for TTS." --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
124
+ ```
125
+
126
+ - Run your own multi-speaker TTS model:
127
+
128
+ ```
129
+ $ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/model.pth --config_path path/to/config.json --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
130
+ ```
131
+
132
+ ### Voice Conversion Models
133
+
134
+ ```
135
+ $ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
136
+ ```
137
+ """
138
+
139
+
140
+ def str2bool(v):
141
+ if isinstance(v, bool):
142
+ return v
143
+ if v.lower() in ("yes", "true", "t", "y", "1"):
144
+ return True
145
+ if v.lower() in ("no", "false", "f", "n", "0"):
146
+ return False
147
+ raise argparse.ArgumentTypeError("Boolean value expected.")
148
+
149
+
150
+ def main():
151
+ parser = argparse.ArgumentParser(
152
+ description=description.replace(" ```\n", ""),
153
+ formatter_class=RawTextHelpFormatter,
154
+ )
155
+
156
+ parser.add_argument(
157
+ "--list_models",
158
+ type=str2bool,
159
+ nargs="?",
160
+ const=True,
161
+ default=False,
162
+ help="list available pre-trained TTS and vocoder models.",
163
+ )
164
+
165
+ parser.add_argument(
166
+ "--model_info_by_idx",
167
+ type=str,
168
+ default=None,
169
+ help="model info using query format: <model_type>/<model_query_idx>",
170
+ )
171
+
172
+ parser.add_argument(
173
+ "--model_info_by_name",
174
+ type=str,
175
+ default=None,
176
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
177
+ )
178
+
179
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
180
+
181
+ # Args for running pre-trained TTS models.
182
+ parser.add_argument(
183
+ "--model_name",
184
+ type=str,
185
+ default="tts_models/en/ljspeech/tacotron2-DDC",
186
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
187
+ )
188
+ parser.add_argument(
189
+ "--vocoder_name",
190
+ type=str,
191
+ default=None,
192
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
193
+ )
194
+
195
+ # Args for running custom models
196
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
197
+ parser.add_argument(
198
+ "--model_path",
199
+ type=str,
200
+ default=None,
201
+ help="Path to model file.",
202
+ )
203
+ parser.add_argument(
204
+ "--out_path",
205
+ type=str,
206
+ default="tts_output.wav",
207
+ help="Output wav file path.",
208
+ )
209
+ parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False)
210
+ parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
211
+ parser.add_argument(
212
+ "--vocoder_path",
213
+ type=str,
214
+ help="Path to vocoder model file. If it is not defined, the model uses GL (Griffin-Lim) as the vocoder. Make sure you have installed the corresponding vocoder library (e.g. WaveRNN) beforehand.",
215
+ default=None,
216
+ )
217
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
218
+ parser.add_argument(
219
+ "--encoder_path",
220
+ type=str,
221
+ help="Path to speaker encoder model file.",
222
+ default=None,
223
+ )
224
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
225
+
226
+ # args for coqui studio
227
+ parser.add_argument(
228
+ "--cs_model",
229
+ type=str,
230
+ help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
231
+ )
232
+ parser.add_argument(
233
+ "--emotion",
234
+ type=str,
235
+ help="Emotion to condition the model with. Only available for 🐸Coqui Studio `V1` model.",
236
+ default=None,
237
+ )
238
+ parser.add_argument(
239
+ "--language",
240
+ type=str,
241
+ help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
242
+ default=None,
243
+ )
244
+ parser.add_argument(
245
+ "--pipe_out",
246
+ help="stdout the generated TTS wav file for shell pipe.",
247
+ type=str2bool,
248
+ nargs="?",
249
+ const=True,
250
+ default=False,
251
+ )
252
+ parser.add_argument(
253
+ "--speed",
254
+ type=float,
255
+ help="Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0.",
256
+ default=None,
257
+ )
258
+
259
+ # args for multi-speaker synthesis
260
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
261
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
262
+ parser.add_argument(
263
+ "--speaker_idx",
264
+ type=str,
265
+ help="Target speaker ID for a multi-speaker TTS model.",
266
+ default=None,
267
+ )
268
+ parser.add_argument(
269
+ "--language_idx",
270
+ type=str,
271
+ help="Target language ID for a multi-lingual TTS model.",
272
+ default=None,
273
+ )
274
+ parser.add_argument(
275
+ "--speaker_wav",
276
+ nargs="+",
277
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
278
+ default=None,
279
+ )
280
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
281
+ parser.add_argument(
282
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
283
+ )
284
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
285
+ parser.add_argument(
286
+ "--list_speaker_idxs",
287
+ help="List available speaker ids for the defined multi-speaker model.",
288
+ type=str2bool,
289
+ nargs="?",
290
+ const=True,
291
+ default=False,
292
+ )
293
+ parser.add_argument(
294
+ "--list_language_idxs",
295
+ help="List available language ids for the defined multi-lingual model.",
296
+ type=str2bool,
297
+ nargs="?",
298
+ const=True,
299
+ default=False,
300
+ )
301
+ # aux args
302
+ parser.add_argument(
303
+ "--save_spectogram",
304
+ type=bool,
305
+ help="If true save raw spectogram for further (vocoder) processing in out_path.",
306
+ default=False,
307
+ )
308
+ parser.add_argument(
309
+ "--reference_wav",
310
+ type=str,
311
+ help="Reference wav file to convert in the voice of the speaker_idx or speaker_wav",
312
+ default=None,
313
+ )
314
+ parser.add_argument(
315
+ "--reference_speaker_idx",
316
+ type=str,
317
+ help="speaker ID of the reference_wav speaker (If not provided the embedding will be computed using the Speaker Encoder).",
318
+ default=None,
319
+ )
320
+ parser.add_argument(
321
+ "--progress_bar",
322
+ type=str2bool,
323
+ help="If true shows a progress bar for the model download. Defaults to True",
324
+ default=True,
325
+ )
326
+
327
+ # voice conversion args
328
+ parser.add_argument(
329
+ "--source_wav",
330
+ type=str,
331
+ default=None,
332
+ help="Original audio file to convert in the voice of the target_wav",
333
+ )
334
+ parser.add_argument(
335
+ "--target_wav",
336
+ type=str,
337
+ default=None,
338
+ help="Target audio file to convert in the voice of the source_wav",
339
+ )
340
+
341
+ parser.add_argument(
342
+ "--voice_dir",
343
+ type=str,
344
+ default=None,
345
+ help="Voice dir for tortoise model",
346
+ )
347
+
348
+ args = parser.parse_args()
349
+
350
+ # print the help message if none of the action-triggering arguments is set
351
+ check_args = [
352
+ args.text,
353
+ args.list_models,
354
+ args.list_speaker_idxs,
355
+ args.list_language_idxs,
356
+ args.reference_wav,
357
+ args.model_info_by_idx,
358
+ args.model_info_by_name,
359
+ args.source_wav,
360
+ args.target_wav,
361
+ ]
362
+ if not any(check_args):
363
+ parser.parse_args(["-h"])
364
+
365
+ pipe_out = sys.stdout if args.pipe_out else None
366
+
367
+ with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
368
+ # Late-import to make things load faster
369
+ from TTS.api import TTS
370
+ from TTS.utils.manage import ModelManager
371
+ from TTS.utils.synthesizer import Synthesizer
372
+
373
+ # load model manager
374
+ path = Path(__file__).parent / "../.models.json"
375
+ manager = ModelManager(path, progress_bar=args.progress_bar)
376
+ api = TTS()
377
+
378
+ tts_path = None
379
+ tts_config_path = None
380
+ speakers_file_path = None
381
+ language_ids_file_path = None
382
+ vocoder_path = None
383
+ vocoder_config_path = None
384
+ encoder_path = None
385
+ encoder_config_path = None
386
+ vc_path = None
387
+ vc_config_path = None
388
+ model_dir = None
389
+
390
+ # CASE1 #list : list pre-trained TTS models
391
+ if args.list_models:
392
+ manager.add_cs_api_models(api.list_models())
393
+ manager.list_models()
394
+ sys.exit()
395
+
396
+ # CASE2 #info : model info for pre-trained TTS models
397
+ if args.model_info_by_idx:
398
+ model_query = args.model_info_by_idx
399
+ manager.model_info_by_idx(model_query)
400
+ sys.exit()
401
+
402
+ if args.model_info_by_name:
403
+ model_query_full_name = args.model_info_by_name
404
+ manager.model_info_by_full_name(model_query_full_name)
405
+ sys.exit()
406
+
407
+ # CASE3: TTS with coqui studio models
408
+ if "coqui_studio" in args.model_name:
409
+ print(" > Using 🐸Coqui Studio model: ", args.model_name)
410
+ api = TTS(model_name=args.model_name, cs_api_model=args.cs_model)
411
+ api.tts_to_file(
412
+ text=args.text,
413
+ emotion=args.emotion,
414
+ file_path=args.out_path,
415
+ language=args.language,
416
+ speed=args.speed,
417
+ pipe_out=pipe_out,
418
+ )
419
+ print(" > Saving output to ", args.out_path)
420
+ return
421
+
422
+ # CASE4: load pre-trained model paths
423
+ if args.model_name is not None and not args.model_path:
424
+ model_path, config_path, model_item = manager.download_model(args.model_name)
425
+ # tts model
426
+ if model_item["model_type"] == "tts_models":
427
+ tts_path = model_path
428
+ tts_config_path = config_path
429
+ if "default_vocoder" in model_item:
430
+ args.vocoder_name = (
431
+ model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name
432
+ )
433
+
434
+ # voice conversion model
435
+ if model_item["model_type"] == "voice_conversion_models":
436
+ vc_path = model_path
437
+ vc_config_path = config_path
438
+
439
+ # tts model with multiple files to be loaded from the directory path
440
+ if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list):
441
+ model_dir = model_path
442
+ tts_path = None
443
+ tts_config_path = None
444
+ args.vocoder_name = None
445
+
446
+ # load vocoder
447
+ if args.vocoder_name is not None and not args.vocoder_path:
448
+ vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
449
+
450
+ # CASE5: set custom model paths
451
+ if args.model_path is not None:
452
+ tts_path = args.model_path
453
+ tts_config_path = args.config_path
454
+ speakers_file_path = args.speakers_file_path
455
+ language_ids_file_path = args.language_ids_file_path
456
+
457
+ if args.vocoder_path is not None:
458
+ vocoder_path = args.vocoder_path
459
+ vocoder_config_path = args.vocoder_config_path
460
+
461
+ if args.encoder_path is not None:
462
+ encoder_path = args.encoder_path
463
+ encoder_config_path = args.encoder_config_path
464
+
465
+ device = args.device
466
+ if args.use_cuda:
467
+ device = "cuda"
468
+
469
+ # load models
470
+ synthesizer = Synthesizer(
471
+ tts_path,
472
+ tts_config_path,
473
+ speakers_file_path,
474
+ language_ids_file_path,
475
+ vocoder_path,
476
+ vocoder_config_path,
477
+ encoder_path,
478
+ encoder_config_path,
479
+ vc_path,
480
+ vc_config_path,
481
+ model_dir,
482
+ args.voice_dir,
483
+ ).to(device)
484
+
485
+ # query speaker ids of a multi-speaker model.
486
+ if args.list_speaker_idxs:
487
+ print(
488
+ " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model."
489
+ )
490
+ print(synthesizer.tts_model.speaker_manager.name_to_id)
491
+ return
492
+
493
+ # query language ids of a multi-lingual model.
494
+ if args.list_language_idxs:
495
+ print(
496
+ " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model."
497
+ )
498
+ print(synthesizer.tts_model.language_manager.name_to_id)
499
+ return
500
+
501
+ # check the arguments against a multi-speaker model.
502
+ if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav):
503
+ print(
504
+ " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to "
505
+ "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`."
506
+ )
507
+ return
508
+
509
+ # RUN THE SYNTHESIS
510
+ if args.text:
511
+ print(" > Text: {}".format(args.text))
512
+
513
+ # kick it
514
+ if tts_path is not None:
515
+ wav = synthesizer.tts(
516
+ args.text,
517
+ speaker_name=args.speaker_idx,
518
+ language_name=args.language_idx,
519
+ speaker_wav=args.speaker_wav,
520
+ reference_wav=args.reference_wav,
521
+ style_wav=args.capacitron_style_wav,
522
+ style_text=args.capacitron_style_text,
523
+ reference_speaker_name=args.reference_speaker_idx,
524
+ )
525
+ elif vc_path is not None:
526
+ wav = synthesizer.voice_conversion(
527
+ source_wav=args.source_wav,
528
+ target_wav=args.target_wav,
529
+ )
530
+ elif model_dir is not None:
531
+ wav = synthesizer.tts(
532
+ args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav
533
+ )
534
+
535
+ # save the results
536
+ print(" > Saving output to {}".format(args.out_path))
537
+ synthesizer.save_wav(wav, args.out_path, pipe_out=pipe_out)
538
+
539
+
540
+ if __name__ == "__main__":
541
+ main()
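The CLI above can also be driven from Python through `TTS.api.TTS` (added in this commit). Below is a minimal sketch of single-reference voice cloning, assuming an XTTS-style multi-lingual checkpoint; the model name and file paths are placeholders, not values taken from this diff:

```python
# Sketch: voice cloning from one reference wav via the Python API,
# mirroring `--model_name ... --speaker_wav ... --language_idx ...` on the CLI.
from TTS.api import TTS

tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v1")  # placeholder model id
tts.tts_to_file(
    text="Hello, this is a cloned voice.",
    speaker_wav="reference_speaker.wav",  # single audio sample used for cloning
    language="en",
    file_path="cloned_output.wav",
)
```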
TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import time
7
+ import traceback
8
+
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from trainer.torch import NoamLR
12
+ from trainer.trainer_utils import get_optimizer
13
+
14
+ from TTS.encoder.dataset import EncoderDataset
15
+ from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
16
+ from TTS.encoder.utils.training import init_training
17
+ from TTS.encoder.utils.visual import plot_embeddings
18
+ from TTS.tts.datasets import load_tts_samples
19
+ from TTS.utils.audio import AudioProcessor
20
+ from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
21
+ from TTS.utils.io import copy_model_files
22
+ from TTS.utils.samplers import PerfectBatchSampler
23
+ from TTS.utils.training import check_update
24
+
25
+ torch.backends.cudnn.enabled = True
26
+ torch.backends.cudnn.benchmark = True
27
+ torch.manual_seed(54321)
28
+ use_cuda = torch.cuda.is_available()
29
+ num_gpus = torch.cuda.device_count()
30
+ print(" > Using CUDA: ", use_cuda)
31
+ print(" > Number of GPUs: ", num_gpus)
32
+
33
+
34
+ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False):
35
+ num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
36
+ num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
37
+
38
+ dataset = EncoderDataset(
39
+ c,
40
+ ap,
41
+ meta_data_eval if is_val else meta_data_train,
42
+ voice_len=c.voice_len,
43
+ num_utter_per_class=num_utter_per_class,
44
+ num_classes_in_batch=num_classes_in_batch,
45
+ verbose=verbose,
46
+ augmentation_config=c.audio_augmentation if not is_val else None,
47
+ use_torch_spec=c.model_params.get("use_torch_spec", False),
48
+ )
49
+ # get classes list
50
+ classes = dataset.get_class_list()
51
+
52
+ sampler = PerfectBatchSampler(
53
+ dataset.items,
54
+ classes,
55
+ batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
56
+ num_classes_in_batch=num_classes_in_batch,
57
+ num_gpus=1,
58
+ shuffle=not is_val,
59
+ drop_last=True,
60
+ )
61
+
62
+ if len(classes) < num_classes_in_batch:
63
+ if is_val:
64
+ raise RuntimeError(
65
+ f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
66
+ )
67
+ raise RuntimeError(
68
+ f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
69
+ )
70
+
71
+ # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes differ
72
+ if is_val:
73
+ dataset.set_classes(train_classes)
74
+
75
+ loader = DataLoader(
76
+ dataset,
77
+ num_workers=c.num_loader_workers,
78
+ batch_sampler=sampler,
79
+ collate_fn=dataset.collate_fn,
80
+ )
81
+
82
+ return loader, classes, dataset.get_map_classid_to_classname()
83
+
84
+
85
+ def evaluation(model, criterion, data_loader, global_step):
86
+ eval_loss = 0
87
+ for _, data in enumerate(data_loader):
88
+ with torch.no_grad():
89
+ # setup input data
90
+ inputs, labels = data
91
+
92
+ # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1], but we need [3,3,2,2,1,1]
93
+ labels = torch.transpose(
94
+ labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
95
+ ).reshape(labels.shape)
96
+ inputs = torch.transpose(
97
+ inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
98
+ ).reshape(inputs.shape)
99
+
100
+ # dispatch data to GPU
101
+ if use_cuda:
102
+ inputs = inputs.cuda(non_blocking=True)
103
+ labels = labels.cuda(non_blocking=True)
104
+
105
+ # forward pass model
106
+ outputs = model(inputs)
107
+
108
+ # loss computation
109
+ loss = criterion(
110
+ outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
111
+ )
112
+
113
+ eval_loss += loss.item()
114
+
115
+ eval_avg_loss = eval_loss / len(data_loader)
116
+ # save stats
117
+ dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
118
+ # plot the last batch in the evaluation
119
+ figures = {
120
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
121
+ }
122
+ dashboard_logger.eval_figures(global_step, figures)
123
+ return eval_avg_loss
124
+
125
+
126
+ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
127
+ model.train()
128
+ best_loss = float("inf")
129
+ avg_loader_time = 0
130
+ end_time = time.time()
131
+ for epoch in range(c.epochs):
132
+ tot_loss = 0
133
+ epoch_time = 0
134
+ for _, data in enumerate(data_loader):
135
+ start_time = time.time()
136
+
137
+ # setup input data
138
+ inputs, labels = data
139
+ # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1], but we need [3,3,2,2,1,1]
140
+ labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
141
+ labels.shape
142
+ )
143
+ inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
144
+ inputs.shape
145
+ )
146
+ # ToDo: move it to a unit test
147
+ # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
148
+ # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
149
+ # idx = 0
150
+ # for j in range(0, c.num_classes_in_batch, 1):
151
+ # for i in range(j, len(labels), c.num_classes_in_batch):
152
+ # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
153
+ # print("Invalid")
154
+ # print(labels)
155
+ # exit()
156
+ # idx += 1
157
+ # labels = labels_converted
158
+ # inputs = inputs_converted
159
+
160
+ loader_time = time.time() - end_time
161
+ global_step += 1
162
+
163
+ # setup lr
164
+ if c.lr_decay:
165
+ scheduler.step()
166
+ optimizer.zero_grad()
167
+
168
+ # dispatch data to GPU
169
+ if use_cuda:
170
+ inputs = inputs.cuda(non_blocking=True)
171
+ labels = labels.cuda(non_blocking=True)
172
+
173
+ # forward pass model
174
+ outputs = model(inputs)
175
+
176
+ # loss computation
177
+ loss = criterion(
178
+ outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
179
+ )
180
+ loss.backward()
181
+ grad_norm, _ = check_update(model, c.grad_clip)
182
+ optimizer.step()
183
+
184
+ step_time = time.time() - start_time
185
+ epoch_time += step_time
186
+
187
+ # accumulate the total epoch loss
188
+ tot_loss += loss.item()
189
+
190
+ # Averaged Loader Time
191
+ num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
192
+ avg_loader_time = (
193
+ 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
194
+ if avg_loader_time != 0
195
+ else loader_time
196
+ )
197
+ current_lr = optimizer.param_groups[0]["lr"]
198
+
199
+ if global_step % c.steps_plot_stats == 0:
200
+ # Plot Training Epoch Stats
201
+ train_stats = {
202
+ "loss": loss.item(),
203
+ "lr": current_lr,
204
+ "grad_norm": grad_norm,
205
+ "step_time": step_time,
206
+ "avg_loader_time": avg_loader_time,
207
+ }
208
+ dashboard_logger.train_epoch_stats(global_step, train_stats)
209
+ figures = {
210
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
211
+ }
212
+ dashboard_logger.train_figures(global_step, figures)
213
+
214
+ if global_step % c.print_step == 0:
215
+ print(
216
+ " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
217
+ "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
218
+ global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
219
+ ),
220
+ flush=True,
221
+ )
222
+
223
+ if global_step % c.save_step == 0:
224
+ # save model
225
+ save_checkpoint(model, optimizer, criterion, loss.item(), OUT_PATH, global_step, epoch)
226
+
227
+ end_time = time.time()
228
+
229
+ print("")
230
+ print(
231
+ ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
232
+ "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
233
+ epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
234
+ ),
235
+ flush=True,
236
+ )
237
+ # evaluation
238
+ if c.run_eval:
239
+ model.eval()
240
+ eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
241
+ print("\n\n")
242
+ print("--> EVAL PERFORMANCE")
243
+ print(
244
+ " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
245
+ flush=True,
246
+ )
247
+ # save the best checkpoint
248
+ best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
249
+ model.train()
250
+
251
+ return best_loss, global_step
252
+
253
+
254
+ def main(args): # pylint: disable=redefined-outer-name
255
+ # pylint: disable=global-variable-undefined
256
+ global meta_data_train
257
+ global meta_data_eval
258
+ global train_classes
259
+
260
+ ap = AudioProcessor(**c.audio)
261
+ model = setup_encoder_model(c)
262
+
263
+ optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
264
+
265
+ # pylint: disable=redefined-outer-name
266
+ meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
267
+
268
+ train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False, verbose=True)
269
+ if c.run_eval:
270
+ eval_data_loader, _, _ = setup_loader(ap, is_val=True, verbose=True)
271
+ else:
272
+ eval_data_loader = None
273
+
274
+ num_classes = len(train_classes)
275
+ criterion = model.get_criterion(c, num_classes)
276
+
277
+ if c.loss == "softmaxproto" and c.model != "speaker_encoder":
278
+ c.map_classid_to_classname = map_classid_to_classname
279
+ copy_model_files(c, OUT_PATH)
280
+
281
+ if args.restore_path:
282
+ criterion, args.restore_step = model.load_checkpoint(
283
+ c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
284
+ )
285
+ print(" > Model restored from step %d" % args.restore_step, flush=True)
286
+ else:
287
+ args.restore_step = 0
288
+
289
+ if c.lr_decay:
290
+ scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
291
+ else:
292
+ scheduler = None
293
+
294
+ num_params = count_parameters(model)
295
+ print("\n > Model has {} parameters".format(num_params), flush=True)
296
+
297
+ if use_cuda:
298
+ model = model.cuda()
299
+ criterion.cuda()
300
+
301
+ global_step = args.restore_step
302
+ _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
303
+
304
+
305
+ if __name__ == "__main__":
306
+ args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
307
+
308
+ try:
309
+ main(args)
310
+ except KeyboardInterrupt:
311
+ remove_experiment_folder(OUT_PATH)
312
+ try:
313
+ sys.exit(0)
314
+ except SystemExit:
315
+ os._exit(0) # pylint: disable=protected-access
316
+ except Exception: # pylint: disable=broad-except
317
+ remove_experiment_folder(OUT_PATH)
318
+ traceback.print_exc()
319
+ sys.exit(1)
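The transpose/reshape used before the loss in `train()` and `evaluation()` is easiest to see on toy tensors. A small self-contained sketch (toy sizes, not the config values) showing how the sampler's interleaved order is regrouped per class:

```python
# Toy check of the regrouping in train()/evaluation():
# PerfectBatchSampler yields utterances interleaved by class, e.g. [3, 2, 1, 3, 2, 1],
# while the loss expects them grouped per class, e.g. [3, 3, 2, 2, 1, 1].
import torch

num_utter_per_class = 2
num_classes_in_batch = 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])  # order produced by the sampler

grouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)
print(grouped.tolist())  # -> [3, 3, 2, 2, 1, 1]
```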
TTS/bin/train_tts.py ADDED
@@ -0,0 +1,71 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.models import setup_model
9
+
10
+
11
+ @dataclass
12
+ class TrainTTSArgs(TrainerArgs):
13
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
14
+
15
+
16
+ def main():
17
+ """Run `tts` model training directly by a `config.json` file."""
18
+ # init trainer args
19
+ train_args = TrainTTSArgs()
20
+ parser = train_args.init_argparse(arg_prefix="")
21
+
22
+ # override trainer args from command-line args
23
+ args, config_overrides = parser.parse_known_args()
24
+ train_args.parse_args(args)
25
+
26
+ # load config.json and register
27
+ if args.config_path or args.continue_path:
28
+ if args.config_path:
29
+ # init from a file
30
+ config = load_config(args.config_path)
31
+ if len(config_overrides) > 0:
32
+ config.parse_known_args(config_overrides, relaxed_parser=True)
33
+ elif args.continue_path:
34
+ # continue from a prev experiment
35
+ config = load_config(os.path.join(args.continue_path, "config.json"))
36
+ if len(config_overrides) > 0:
37
+ config.parse_known_args(config_overrides, relaxed_parser=True)
38
+ else:
39
+ # init from console args
40
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
41
+
42
+ config_base = BaseTrainingConfig()
43
+ config_base.parse_known_args(config_overrides)
44
+ config = register_config(config_base.model)()
45
+
46
+ # load training samples
47
+ train_samples, eval_samples = load_tts_samples(
48
+ config.datasets,
49
+ eval_split=True,
50
+ eval_split_max_size=config.eval_split_max_size,
51
+ eval_split_size=config.eval_split_size,
52
+ )
53
+
54
+ # init the model from config
55
+ model = setup_model(config, train_samples + eval_samples)
56
+
57
+ # init the trainer and 🚀
58
+ trainer = Trainer(
59
+ train_args,
60
+ model.config,
61
+ config.output_path,
62
+ model=model,
63
+ train_samples=train_samples,
64
+ eval_samples=eval_samples,
65
+ parse_command_line_args=False,
66
+ )
67
+ trainer.fit()
68
+
69
+
70
+ if __name__ == "__main__":
71
+ main()
TTS/bin/train_vocoder.py ADDED
@@ -0,0 +1,77 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config import load_config, register_config
7
+ from TTS.utils.audio import AudioProcessor
8
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
9
+ from TTS.vocoder.models import setup_model
10
+
11
+
12
+ @dataclass
13
+ class TrainVocoderArgs(TrainerArgs):
14
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
15
+
16
+
17
+ def main():
18
+ """Run `tts` model training directly by a `config.json` file."""
19
+ # init trainer args
20
+ train_args = TrainVocoderArgs()
21
+ parser = train_args.init_argparse(arg_prefix="")
22
+
23
+ # override trainer args from command-line args
24
+ args, config_overrides = parser.parse_known_args()
25
+ train_args.parse_args(args)
26
+
27
+ # load config.json and register
28
+ if args.config_path or args.continue_path:
29
+ if args.config_path:
30
+ # init from a file
31
+ config = load_config(args.config_path)
32
+ if len(config_overrides) > 0:
33
+ config.parse_known_args(config_overrides, relaxed_parser=True)
34
+ elif args.continue_path:
35
+ # continue from a prev experiment
36
+ config = load_config(os.path.join(args.continue_path, "config.json"))
37
+ if len(config_overrides) > 0:
38
+ config.parse_known_args(config_overrides, relaxed_parser=True)
39
+ else:
40
+ # init from console args
41
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
42
+
43
+ config_base = BaseTrainingConfig()
44
+ config_base.parse_known_args(config_overrides)
45
+ config = register_config(config_base.model)()
46
+
47
+ # load training samples
48
+ if "feature_path" in config and config.feature_path:
49
+ # load pre-computed features
50
+ print(f" > Loading features from: {config.feature_path}")
51
+ eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
52
+ else:
53
+ # load data raw wav files
54
+ eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
55
+
56
+ # setup audio processor
57
+ ap = AudioProcessor(**config.audio)
58
+
59
+ # init the model from config
60
+ model = setup_model(config)
61
+
62
+ # init the trainer and 🚀
63
+ trainer = Trainer(
64
+ train_args,
65
+ config,
66
+ config.output_path,
67
+ model=model,
68
+ train_samples=train_samples,
69
+ eval_samples=eval_samples,
70
+ training_assets={"audio_processor": ap},
71
+ parse_command_line_args=False,
72
+ )
73
+ trainer.fit()
74
+
75
+
76
+ if __name__ == "__main__":
77
+ main()
TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,103 @@
1
+ """Search a good noise schedule for WaveGrad for a given number of inference iterations"""
2
+ import argparse
3
+ from itertools import product as cartesian_product
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.utils.data import DataLoader
8
+ from tqdm import tqdm
9
+
10
+ from TTS.config import load_config
11
+ from TTS.utils.audio import AudioProcessor
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data
13
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
14
+ from TTS.vocoder.models import setup_model
15
+
16
+ if __name__ == "__main__":
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
19
+ parser.add_argument("--config_path", type=str, help="Path to model config file.")
20
+ parser.add_argument("--data_path", type=str, help="Path to data directory.")
21
+ parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
22
+ parser.add_argument(
23
+ "--num_iter",
24
+ type=int,
25
+ help="Number of model inference iterations that you like to optimize noise schedule for.",
26
+ )
27
+ parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
28
+ parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.")
29
+ parser.add_argument(
30
+ "--search_depth",
31
+ type=int,
32
+ default=3,
33
+ help="Search granularity. Increasing this increases the run-time exponentially.",
34
+ )
35
+
36
+ # load config
37
+ args = parser.parse_args()
38
+ config = load_config(args.config_path)
39
+
40
+ # setup audio processor
41
+ ap = AudioProcessor(**config.audio)
42
+
43
+ # load dataset
44
+ _, train_data = load_wav_data(args.data_path, 0)
45
+ train_data = train_data[: args.num_samples]
46
+ dataset = WaveGradDataset(
47
+ ap=ap,
48
+ items=train_data,
49
+ seq_len=-1,
50
+ hop_len=ap.hop_length,
51
+ pad_short=config.pad_short,
52
+ conv_pad=config.conv_pad,
53
+ is_training=True,
54
+ return_segments=False,
55
+ use_noise_augment=False,
56
+ use_cache=False,
57
+ verbose=True,
58
+ )
59
+ loader = DataLoader(
60
+ dataset,
61
+ batch_size=1,
62
+ shuffle=False,
63
+ collate_fn=dataset.collate_full_clips,
64
+ drop_last=False,
65
+ num_workers=config.num_loader_workers,
66
+ pin_memory=False,
67
+ )
68
+
69
+ # setup the model
70
+ model = setup_model(config)
71
+ if args.use_cuda:
72
+ model.cuda()
73
+
74
+ # setup optimization parameters
75
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
76
+ print(f" > base values: {base_values}")
77
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
78
+ best_error = float("inf")
79
+ best_schedule = None # pylint: disable=C0103
80
+ total_search_iter = len(base_values) ** args.num_iter
81
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
82
+ beta = exponents * base
83
+ model.compute_noise_level(beta)
84
+ for data in loader:
85
+ mel, audio = data
86
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
87
+
88
+ if args.use_cuda:
89
+ y_hat = y_hat.cpu()
90
+ y_hat = y_hat.numpy()
91
+
92
+ mel_hat = []
93
+ for i in range(y_hat.shape[0]):
94
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
95
+ mel_hat.append(torch.from_numpy(m))
96
+
97
+ mel_hat = torch.stack(mel_hat)
98
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
99
+ if mse.item() < best_error:
100
+ best_error = mse.item()
101
+ best_schedule = {"beta": beta}
102
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
103
+ np.save(args.output_path, best_schedule)
TTS/config/__init__.py ADDED
@@ -0,0 +1,138 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Dict
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
+ def read_json_with_comments(json_path):
15
+ """for backward compat."""
16
+ # fallback to json
17
+ with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
+ input_str = f.read()
19
+ # handle comments
20
+ input_str = re.sub(r"\\\n", "", input_str)
21
+ input_str = re.sub(r"//.*\n", "\n", input_str)
22
+ data = json.loads(input_str)
23
+ return data
24
+
25
+
26
+ def register_config(model_name: str) -> Coqpit:
27
+ """Find the right config for the given model name.
28
+
29
+ Args:
30
+ model_name (str): Model name.
31
+
32
+ Raises:
33
+ ModuleNotFoundError: No matching config for the model name.
34
+
35
+ Returns:
36
+ Coqpit: config class.
37
+ """
38
+ config_class = None
39
+ config_name = model_name + "_config"
40
+
41
+ # TODO: fix this
42
+ if model_name == "xtts":
43
+ from TTS.tts.configs.xtts_config import XttsConfig
44
+
45
+ config_class = XttsConfig
46
+ paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
47
+ for path in paths:
48
+ try:
49
+ config_class = find_module(path, config_name)
50
+ except ModuleNotFoundError:
51
+ pass
52
+ if config_class is None:
53
+ raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
54
+ return config_class
55
+
56
+
57
+ def _process_model_name(config_dict: Dict) -> str:
58
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
59
+
60
+ Args:
61
+ config_dict (Dict): A dictionary including the config fields.
62
+
63
+ Returns:
64
+ str: Formatted modelname.
65
+ """
66
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
67
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
68
+ return model_name
69
+
70
+
71
+ def load_config(config_path: str) -> Coqpit:
72
+ """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
73
+ to find the corresponding Config class. Then initialize the Config.
74
+
75
+ Args:
76
+ config_path (str): path to the config file.
77
+
78
+ Raises:
79
+ TypeError: given config file has an unknown type.
80
+
81
+ Returns:
82
+ Coqpit: TTS config object.
83
+ """
84
+ config_dict = {}
85
+ ext = os.path.splitext(config_path)[1]
86
+ if ext in (".yml", ".yaml"):
87
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
88
+ data = yaml.safe_load(f)
89
+ elif ext == ".json":
90
+ try:
91
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
92
+ data = json.load(f)
93
+ except json.decoder.JSONDecodeError:
94
+ # backwards compat.
95
+ data = read_json_with_comments(config_path)
96
+ else:
97
+ raise TypeError(f" [!] Unknown config file type {ext}")
98
+ config_dict.update(data)
99
+ model_name = _process_model_name(config_dict)
100
+ config_class = register_config(model_name.lower())
101
+ config = config_class()
102
+ config.from_dict(config_dict)
103
+ return config
104
+
105
+
106
+ def check_config_and_model_args(config, arg_name, value):
107
+ """Check the give argument in `config.model_args` if exist or in `config` for
108
+ the given value.
109
+
110
+ Return False if the argument does not exist in `config.model_args` or `config`.
111
+ This is to patch up the compatibility between models with and without `model_args`.
112
+
113
+ TODO: Remove this in the future with a unified approach.
114
+ """
115
+ if hasattr(config, "model_args"):
116
+ if arg_name in config.model_args:
117
+ return config.model_args[arg_name] == value
118
+ if hasattr(config, arg_name):
119
+ return config[arg_name] == value
120
+ return False
121
+
122
+
123
+ def get_from_config_or_model_args(config, arg_name):
124
+ """Get the given argument from `config.model_args` if exist or in `config`."""
125
+ if hasattr(config, "model_args"):
126
+ if arg_name in config.model_args:
127
+ return config.model_args[arg_name]
128
+ return config[arg_name]
129
+
130
+
131
+ def get_from_config_or_model_args_with_default(config, arg_name, def_val):
132
+ """Get the given argument from `config.model_args` if exist or in `config`."""
133
+ if hasattr(config, "model_args"):
134
+ if arg_name in config.model_args:
135
+ return config.model_args[arg_name]
136
+ if hasattr(config, arg_name):
137
+ return config[arg_name]
138
+ return def_val
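A short usage sketch for the helpers above; the config path is a placeholder, and any Coqpit-based config with a `model` (or `generator_model`) field should resolve through `register_config`:

```python
# Sketch: load a model config and query a field that may live in `config` or `config.model_args`.
from TTS.config import load_config, check_config_and_model_args

config = load_config("path/to/config.json")  # placeholder path; .json or .yaml
print(type(config).__name__, config.model)

if check_config_and_model_args(config, "use_speaker_embedding", True):
    print("This is a multi-speaker model config.")
```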
TTS/config/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (4.08 kB).
 
TTS/config/__pycache__/shared_configs.cpython-39.pyc ADDED
Binary file (9.52 kB).
 
TTS/config/shared_configs.py ADDED
@@ -0,0 +1,268 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
+ @dataclass
9
+ class BaseAudioConfig(Coqpit):
10
+ """Base config to definge audio processing parameters. It is used to initialize
11
+ ```TTS.utils.audio.AudioProcessor.```
12
+
13
+ Args:
14
+ fft_size (int):
15
+ Number of STFT frequency levels, i.e. the size of the linear spectrogram frame. Defaults to 1024.
16
+
17
+ win_length (int):
18
+ Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
19
+ ```fft_size```. Defaults to 1024.
20
+
21
+ hop_length (int):
22
+ Number of audio samples between adjacent STFT columns. Defaults to 256.
23
+
24
+ frame_shift_ms (int):
25
+ Set ```hop_length``` based on milliseconds and sampling rate.
26
+
27
+ frame_length_ms (int):
28
+ Set ```win_length``` based on milliseconds and sampling rate.
29
+
30
+ stft_pad_mode (str):
31
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
+
33
+ sample_rate (int):
34
+ Audio sampling rate. Defaults to 22050.
35
+
36
+ resample (bool):
37
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
+
39
+ preemphasis (float):
40
+ Preemphasis coefficient. Defaults to 0.0.
41
+
42
+ ref_level_db (int): 20
43
+ Reference dB level to rebase the audio signal and ignore levels below it. 20 dB is assumed to be the level of air sound.
44
+ Defaults to 20.
45
+
46
+ do_sound_norm (bool):
47
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
+
49
+ log_func (str):
50
+ Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
+
52
+ do_trim_silence (bool):
53
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
+
55
+ do_amp_to_db_linear (bool, optional):
56
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
+
58
+ do_amp_to_db_mel (bool, optional):
59
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
+
61
+ pitch_fmax (float, optional):
62
+ Maximum frequency of the F0 frames. Defaults to ```640```.
63
+
64
+ pitch_fmin (float, optional):
65
+ Minimum frequency of the F0 frames. Defaults to ```1```.
66
+
67
+ trim_db (int):
68
+ Silence threshold used for silence trimming. Defaults to 45.
69
+
70
+ do_rms_norm (bool, optional):
71
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
+
73
+ db_level (int, optional):
74
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
+
76
+ power (float):
77
+ Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
78
+ artifacts in the synthesized voice. Defaults to 1.5.
79
+
80
+ griffin_lim_iters (int):
81
+ Number of Griffin-Lim iterations. Defaults to 60.
82
+
83
+ num_mels (int):
84
+ Number of mel-basis frames that defines the frame lengths of each mel-spectrogram frame. Defaults to 80.
85
+
86
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
+ It needs to be adjusted for a dataset. Defaults to 0.
88
+
89
+ mel_fmax (float):
90
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
+
92
+ spec_gain (int):
93
+ Gain applied when converting amplitude to DB. Defaults to 20.
94
+
95
+ signal_norm (bool):
96
+ enable/disable signal normalization. Defaults to True.
97
+
98
+ min_level_db (int):
99
+ minimum db threshold for the computed melspectrograms. Defaults to -100.
100
+
101
+ symmetric_norm (bool):
102
+ enable/disable symmetric normalization. If set to True, normalization is performed in the range [-k, k], else
103
+ [0, k]. Defaults to True.
104
+
105
+ max_norm (float):
106
+ ```k``` defining the normalization range. Defaults to 4.0.
107
+
108
+ clip_norm (bool):
109
+ enable/disable clipping of out-of-range values in the normalized audio signal. Defaults to True.
110
+
111
+ stats_path (str):
112
+ Path to the computed stats file. Defaults to None.
113
+ """
114
+
115
+ # stft parameters
116
+ fft_size: int = 1024
117
+ win_length: int = 1024
118
+ hop_length: int = 256
119
+ frame_shift_ms: int = None
120
+ frame_length_ms: int = None
121
+ stft_pad_mode: str = "reflect"
122
+ # audio processing parameters
123
+ sample_rate: int = 22050
124
+ resample: bool = False
125
+ preemphasis: float = 0.0
126
+ ref_level_db: int = 20
127
+ do_sound_norm: bool = False
128
+ log_func: str = "np.log10"
129
+ # silence trimming
130
+ do_trim_silence: bool = True
131
+ trim_db: int = 45
132
+ # rms volume normalization
133
+ do_rms_norm: bool = False
134
+ db_level: float = None
135
+ # griffin-lim params
136
+ power: float = 1.5
137
+ griffin_lim_iters: int = 60
138
+ # mel-spec params
139
+ num_mels: int = 80
140
+ mel_fmin: float = 0.0
141
+ mel_fmax: float = None
142
+ spec_gain: int = 20
143
+ do_amp_to_db_linear: bool = True
144
+ do_amp_to_db_mel: bool = True
145
+ # f0 params
146
+ pitch_fmax: float = 640.0
147
+ pitch_fmin: float = 1.0
148
+ # normalization params
149
+ signal_norm: bool = True
150
+ min_level_db: int = -100
151
+ symmetric_norm: bool = True
152
+ max_norm: float = 4.0
153
+ clip_norm: bool = True
154
+ stats_path: str = None
155
+
156
+ def check_values(
157
+ self,
158
+ ):
159
+ """Check config fields"""
160
+ c = asdict(self)
161
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
+ check_argument(
165
+ "frame_length_ms",
166
+ c,
167
+ restricted=True,
168
+ min_val=10,
169
+ max_val=1000,
170
+ alternative="win_length",
171
+ )
172
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
+
179
+ # normalization parameters
180
+ check_argument("signal_norm", c, restricted=True)
181
+ check_argument("symmetric_norm", c, restricted=True)
182
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
+ check_argument("clip_norm", c, restricted=True)
184
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
+ check_argument("do_trim_silence", c, restricted=True)
188
+ check_argument("trim_db", c, restricted=True)
189
+
190
+
191
+ @dataclass
192
+ class BaseDatasetConfig(Coqpit):
193
+ """Base config for TTS datasets.
194
+
195
+ Args:
196
+ formatter (str):
197
+ Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
198
+
199
+ dataset_name (str):
200
+ Unique name for the dataset. Defaults to `""`.
201
+
202
+ path (str):
203
+ Root path to the dataset files. Defaults to `""`.
204
+
205
+ meta_file_train (str):
206
+ Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
207
+ Defaults to `""`.
208
+
209
+ ignored_speakers (List):
210
+ List of speakers IDs that are not used at the training. Default None.
211
+
212
+ language (str):
213
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
214
+
215
+ phonemizer (str):
216
+ Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
217
+
218
+ meta_file_val (str):
219
+ Name of the dataset meta file that defines the instances used at validation.
220
+
221
+ meta_file_attn_mask (str):
222
+ Path to the file that lists the attention mask files used with models that require attention masks to
223
+ train the duration predictor.
224
+ """
225
+
226
+ formatter: str = ""
227
+ dataset_name: str = ""
228
+ path: str = ""
229
+ meta_file_train: str = ""
230
+ ignored_speakers: List[str] = None
231
+ language: str = ""
232
+ phonemizer: str = ""
233
+ meta_file_val: str = ""
234
+ meta_file_attn_mask: str = ""
235
+
236
+ def check_values(
237
+ self,
238
+ ):
239
+ """Check config fields"""
240
+ c = asdict(self)
241
+ check_argument("formatter", c, restricted=True)
242
+ check_argument("path", c, restricted=True)
243
+ check_argument("meta_file_train", c, restricted=True)
244
+ check_argument("meta_file_val", c, restricted=False)
245
+ check_argument("meta_file_attn_mask", c, restricted=False)
246
+
247
+
248
+ @dataclass
249
+ class BaseTrainingConfig(TrainerConfig):
250
+ """Base config to define the basic 🐸TTS training parameters that are shared
251
+ among all the models. It is based on ```Trainer.TrainingConfig```.
252
+
253
+ Args:
254
+ model (str):
255
+ Name of the model that is used in the training.
256
+
257
+ num_loader_workers (int):
258
+ Number of workers for training time dataloader.
259
+
260
+ num_eval_loader_workers (int):
261
+ Number of workers for evaluation time dataloader.
262
+ """
263
+
264
+ model: str = None
265
+ # dataloading
266
+ num_loader_workers: int = 0
267
+ num_eval_loader_workers: int = 0
268
+ use_noise_augment: bool = False
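A brief sketch of how these shared configs are typically instantiated and validated; the dataset path, metafile, and formatter name below are placeholders:

```python
# Sketch: build the shared configs defined above and run their sanity checks.
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig

audio_config = BaseAudioConfig(sample_rate=22050, do_trim_silence=True, trim_db=45)
audio_config.check_values()  # raises if a value falls outside the documented ranges

dataset_config = BaseDatasetConfig(
    formatter="ljspeech",            # placeholder formatter name
    meta_file_train="metadata.csv",  # placeholder metafile
    path="path/to/dataset/",         # placeholder dataset root
    language="en",
)
dataset_config.check_values()
```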
TTS/cs_api.py ADDED
@@ -0,0 +1,317 @@
1
+ import http.client
2
+ import json
3
+ import os
4
+ import tempfile
5
+ import urllib.request
6
+ from typing import Tuple
7
+
8
+ import numpy as np
9
+ import requests
10
+ from scipy.io import wavfile
11
+
12
+ from TTS.utils.audio.numpy_transforms import save_wav
13
+
14
+
15
+ class Speaker(object):
16
+ """Convert dict to object."""
17
+
18
+ def __init__(self, d, is_voice=False):
19
+ self.is_voice = is_voice
20
+ for k, v in d.items():
21
+ if isinstance(k, (list, tuple)):
22
+ setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
23
+ else:
24
+ setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
25
+
26
+ def __repr__(self):
27
+ return str(self.__dict__)
28
+
29
+
30
+ class CS_API:
31
+ """🐸Coqui Studio API Wrapper.
32
+
33
+ 🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
34
+ interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
35
+ characteristics. You can use these voices to generate new audio files or use them in your applications.
36
+ You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
37
+ You can signup to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
38
+ https://app.coqui.ai/account. We can either enter the token as an environment variable as
39
+ `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
40
+ Visit https://app.coqui.ai/api for more information.
41
+
42
+
43
+ Args:
44
+ api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
45
+ `COQUI_STUDIO_TOKEN`.
46
+ model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`. Default is `XTTS`.
47
+
48
+
49
+ Example listing all available speakers:
50
+ >>> from TTS.api import CS_API
51
+ >>> tts = CS_API()
52
+ >>> tts.speakers
53
+
54
+ Example listing all emotions:
55
+ >>> # emotions are only available for `V1` model
56
+ >>> from TTS.api import CS_API
57
+ >>> tts = CS_API(model="V1")
58
+ >>> tts.emotions
59
+
60
+ Example with a built-in 🐸 speaker:
61
+ >>> from TTS.api import CS_API
62
+ >>> tts = CS_API()
63
+ >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name)
64
+ >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
65
+
66
+ Example with multi-language model:
67
+ >>> from TTS.api import CS_API
68
+ >>> tts = CS_API(model="XTTS")
69
+ >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
70
+ """
71
+
72
+ MODEL_ENDPOINTS = {
73
+ "V1": {
74
+ "list_speakers": "https://app.coqui.ai/api/v2/speakers",
75
+ "synthesize": "https://app.coqui.ai/api/v2/samples",
76
+ "list_voices": "https://app.coqui.ai/api/v2/voices",
77
+ },
78
+ "XTTS": {
79
+ "list_speakers": "https://app.coqui.ai/api/v2/speakers",
80
+ "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
81
+ "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
82
+ },
83
+ }
84
+
85
+ SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
86
+
87
+ def __init__(self, api_token=None, model="XTTS"):
88
+ self.api_token = api_token
89
+ self.model = model
90
+ self.headers = None
91
+ self._speakers = None
92
+ self._check_token()
93
+
94
+ @staticmethod
95
+ def ping_api():
96
+ URL = "https://coqui.gateway.scarf.sh/tts/api"
97
+ _ = requests.get(URL)
98
+
99
+ @property
100
+ def speakers(self):
101
+ if self._speakers is None:
102
+ self._speakers = self.list_all_speakers()
103
+ return self._speakers
104
+
105
+ @property
106
+ def emotions(self):
107
+ """Return a list of available emotions.
108
+
109
+ TODO: Get this from the API endpoint.
110
+ """
111
+ if self.model == "V1":
112
+ return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
113
+ else:
114
+ raise ValueError(f"❗ Emotions are not available for {self.model}.")
115
+
116
+ def _check_token(self):
117
+ if self.api_token is None:
118
+ self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
119
+ self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
120
+ if not self.api_token:
121
+ raise ValueError(
122
+ "No API token found for 🐸Coqui Studio voices - https://coqui.ai \n"
123
+ "Visit 🔗https://app.coqui.ai/account to get one.\n"
124
+ "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
125
+ ""
126
+ )
127
+
128
+ def list_all_speakers(self):
129
+ """Return both built-in Coqui Studio speakers and custom voices created by the user."""
130
+ return self.list_speakers() + self.list_voices()
131
+
132
+ def list_speakers(self):
133
+ """List built-in Coqui Studio speakers."""
134
+ self._check_token()
135
+ conn = http.client.HTTPSConnection("app.coqui.ai")
136
+ url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
137
+ conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
138
+ res = conn.getresponse()
139
+ data = res.read()
140
+ return [Speaker(s) for s in json.loads(data)["result"]]
141
+
142
+ def list_voices(self):
143
+ """List custom voices created by the user."""
144
+ conn = http.client.HTTPSConnection("app.coqui.ai")
145
+ url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
146
+ conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
147
+ res = conn.getresponse()
148
+ data = res.read()
149
+ return [Speaker(s, True) for s in json.loads(data)["result"]]
150
+
151
+ def list_speakers_as_tts_models(self):
152
+ """List speakers in ModelManager format."""
153
+ models = []
154
+ for speaker in self.speakers:
155
+ model = f"coqui_studio/multilingual/{speaker.name}/{self.model}"
156
+ models.append(model)
157
+ return models
158
+
159
+ def name_to_speaker(self, name):
160
+ for speaker in self.speakers:
161
+ if speaker.name == name:
162
+ return speaker
163
+ raise ValueError(f"Speaker {name} not found in {self.speakers}")
164
+
165
+ def id_to_speaker(self, speaker_id):
166
+ for speaker in self.speakers:
167
+ if speaker.id == speaker_id:
168
+ return speaker
169
+ raise ValueError(f"Speaker {speaker_id} not found.")
170
+
171
+ @staticmethod
172
+ def url_to_np(url):
173
+ tmp_file, _ = urllib.request.urlretrieve(url)
174
+ rate, data = wavfile.read(tmp_file)
175
+ return data, rate
176
+
177
+ @staticmethod
178
+ def _create_payload(model, text, speaker, speed, emotion, language):
179
+ payload = {}
180
+ # if speaker.is_voice:
181
+ payload["voice_id"] = speaker.id
182
+ # else:
183
+ payload["speaker_id"] = speaker.id
184
+
185
+ if model == "V1":
186
+ payload.update(
187
+ {
188
+ "emotion": emotion,
189
+ "name": speaker.name,
190
+ "text": text,
191
+ "speed": speed,
192
+ }
193
+ )
194
+ elif model == "XTTS":
195
+ payload.update(
196
+ {
197
+ "name": speaker.name,
198
+ "text": text,
199
+ "speed": speed,
200
+ "language": language,
201
+ }
202
+ )
203
+ else:
204
+ raise ValueError(f"❗ Unknown model {model}")
205
+ return payload
206
+
207
+ def _check_tts_args(self, text, speaker_name, speaker_id, emotion, speed, language):
208
+ assert text is not None, "❗ text is required for V1 model."
209
+ assert speaker_name is not None, "❗ speaker_name is required for V1 model."
210
+ if self.model == "V1":
211
+ if emotion is None:
212
+ emotion = "Neutral"
213
+ assert language is None, "❗ language is not supported for V1 model."
214
+ elif self.model == "XTTS":
215
+ assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
216
+ assert language is not None, "❗ Language is required for XTTS model."
217
+ assert (
218
+ language in self.SUPPORTED_LANGUAGES
219
+ ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
220
+ return text, speaker_name, speaker_id, emotion, speed, language
221
+
222
+ def tts(
223
+ self,
224
+ text: str,
225
+ speaker_name: str = None,
226
+ speaker_id=None,
227
+ emotion=None,
228
+ speed=1.0,
229
+ language=None, # pylint: disable=unused-argument
230
+ ) -> Tuple[np.ndarray, int]:
231
+ """Synthesize speech from text.
232
+
233
+ Args:
234
+ text (str): Text to synthesize.
235
+ speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
236
+ voices (user generated speakers) with `list_voices()`.
237
+ speaker_id (str): Speaker ID. If None, the speaker name is used.
238
+ emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only
239
+ supported by `V1` model. Defaults to None.
240
+ speed (float): Speed of the speech. 1.0 is normal speed.
241
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
242
+ supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
243
+ """
244
+ self._check_token()
245
+ self.ping_api()
246
+
247
+ if speaker_name is None and speaker_id is None:
248
+ raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
249
+ if speaker_id is None:
250
+ speaker = self.name_to_speaker(speaker_name)
251
+ else:
252
+ speaker = self.id_to_speaker(speaker_id)
253
+
254
+ text, speaker_name, speaker_id, emotion, speed, language = self._check_tts_args(
255
+ text, speaker_name, speaker_id, emotion, speed, language
256
+ )
257
+
258
+ conn = http.client.HTTPSConnection("app.coqui.ai")
259
+ payload = self._create_payload(self.model, text, speaker, speed, emotion, language)
260
+ url = self.MODEL_ENDPOINTS[self.model]["synthesize"]
261
+ conn.request("POST", url, json.dumps(payload), self.headers)
262
+ res = conn.getresponse()
263
+ data = res.read()
264
+ try:
265
+ wav, sr = self.url_to_np(json.loads(data)["audio_url"])
266
+ except KeyError as e:
267
+ raise ValueError(f" [!] 🐸 API returned error: {data}") from e
268
+ return wav, sr
269
+
270
+ def tts_to_file(
271
+ self,
272
+ text: str,
273
+ speaker_name: str,
274
+ speaker_id=None,
275
+ emotion=None,
276
+ speed=1.0,
277
+ pipe_out=None,
278
+ language=None,
279
+ file_path: str = None,
280
+ ) -> str:
281
+ """Synthesize speech from text and save it to a file.
282
+
283
+ Args:
284
+ text (str): Text to synthesize.
285
+ speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
286
+ voices (user generated speakers) with `list_voices()`.
287
+ speaker_id (str): Speaker ID. If None, the speaker name is used.
288
+ emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
289
+ speed (float): Speed of the speech. 1.0 is normal speed.
290
+ pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
291
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
292
+ supported by the `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to None.
293
+ file_path (str): Path to save the file. If None, a temporary file is created.
294
+ """
295
+ if file_path is None:
296
+ file_path = tempfile.mktemp(".wav")
297
+ wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
298
+ save_wav(wav=wav, path=file_path, sample_rate=sr, pipe_out=pipe_out)
299
+ return file_path
300
+
301
+
302
+ if __name__ == "__main__":
303
+ import time
304
+
305
+ api = CS_API()
306
+ print(api.speakers)
307
+ print(api.list_speakers_as_tts_models())
308
+
309
+ ts = time.time()
310
+ wav, sr = api.tts(
311
+ "It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name
312
+ )
313
+ print(f" [i] XTTS took {time.time() - ts:.2f}s")
314
+
315
+ filepath = api.tts_to_file(
316
+ text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav"
317
+ )
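The `__main__` block above exercises the XTTS path. For the V1 path the argument rules flip: `emotion` is accepted (defaulting to "Neutral") and `language` must stay unset, as `_check_tts_args` enforces. Below is a minimal hedged sketch of that path; the `model="V1"` constructor argument and the token setup are assumptions not shown in this excerpt.

```python
# Hedged sketch of the V1 path; assumes CS_API accepts a `model` argument and
# that a valid Coqui Studio API token is already configured (see _check_token()).
from TTS.cs_api import CS_API

api = CS_API(model="V1")          # assumption: constructor selects the model family
speaker = api.speakers[0]         # first available studio speaker

# V1 accepts an emotion and rejects a language (see _check_tts_args above).
wav, sr = api.tts(
    "This sentence should sound happy.",
    speaker_name=speaker.name,
    emotion="Happy",
)

out_path = api.tts_to_file(
    text="Same idea, written straight to disk.",
    speaker_name=speaker.name,
    emotion="Happy",
    file_path="v1_output.wav",
)
print(out_path, sr, len(wav))
```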
TTS/encoder/README.md ADDED
@@ -0,0 +1,18 @@
1
+ ### Speaker Encoder
2
+
3
+ This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
+
5
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
+
7
+ Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
+
9
+ ![](umap.png)
10
+
11
+ Download a pretrained model from the [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
+
13
+ To run the code, you need to follow the same flow as in TTS.
14
+
15
+ - Define 'config.json' for your needs. Note that the audio parameters should match those of your TTS model.
16
+ - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This command parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files (a per-clip version of this flow is sketched below).
18
+ - Watch training on Tensorboard as in TTS
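For orientation, the per-clip embedding flow behind that script looks roughly like the following. This is a hedged sketch, not the script itself: the paths and config layout are assumptions, while the model API (`setup_encoder_model`, `load_checkpoint`, `inference`) is the one added in this commit.

```python
# Hedged sketch of extracting a d-vector for a single clip with a trained
# speaker encoder; paths are illustrative.
import torch

from TTS.config import load_config
from TTS.encoder.utils.generic_utils import setup_encoder_model
from TTS.utils.audio import AudioProcessor

config = load_config("speaker_encoder/config.json")   # illustrative path
ap = AudioProcessor(**config.audio)

model = setup_encoder_model(config)
model.load_checkpoint(config, "best_model.pth", eval=True, use_cuda=False)

wav = ap.load_wav("sample.wav", sr=ap.sample_rate)     # illustrative clip
mel = ap.melspectrogram(wav)                           # (num_mels, T)
feats = torch.FloatTensor(mel).unsqueeze(0)            # (1, num_mels, T)

d_vector = model.inference(feats, l2_norm=True)        # (1, proj_dim)
print(d_vector.shape)
```

If the config enables `use_torch_spec`, the model consumes raw waveforms instead of precomputed mel spectrograms (see the encoder model files further down).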
TTS/encoder/__init__.py ADDED
File without changes
TTS/encoder/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (160 Bytes). View file
 
TTS/encoder/__pycache__/losses.cpython-39.pyc ADDED
Binary file (7.83 kB). View file
 
TTS/encoder/configs/base_encoder_config.py ADDED
@@ -0,0 +1,61 @@
1
+ from dataclasses import asdict, dataclass, field
2
+ from typing import Dict, List
3
+
4
+ from coqpit import MISSING
5
+
6
+ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
+
8
+
9
+ @dataclass
10
+ class BaseEncoderConfig(BaseTrainingConfig):
11
+ """Defines parameters for a Generic Encoder model."""
12
+
13
+ model: str = None
14
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16
+ # model params
17
+ model_params: Dict = field(
18
+ default_factory=lambda: {
19
+ "model_name": "lstm",
20
+ "input_dim": 80,
21
+ "proj_dim": 256,
22
+ "lstm_dim": 768,
23
+ "num_lstm_layers": 3,
24
+ "use_lstm_with_projection": True,
25
+ }
26
+ )
27
+
28
+ audio_augmentation: Dict = field(default_factory=lambda: {})
29
+
30
+ # training params
31
+ epochs: int = 10000
32
+ loss: str = "angleproto"
33
+ grad_clip: float = 3.0
34
+ lr: float = 0.0001
35
+ optimizer: str = "radam"
36
+ optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37
+ lr_decay: bool = False
38
+ warmup_steps: int = 4000
39
+
40
+ # logging params
41
+ tb_model_param_stats: bool = False
42
+ steps_plot_stats: int = 10
43
+ save_step: int = 1000
44
+ print_step: int = 20
45
+ run_eval: bool = False
46
+
47
+ # data loader
48
+ num_classes_in_batch: int = MISSING
49
+ num_utter_per_class: int = MISSING
50
+ eval_num_classes_in_batch: int = None
51
+ eval_num_utter_per_class: int = None
52
+
53
+ num_loader_workers: int = MISSING
54
+ voice_len: float = 1.6
55
+
56
+ def check_values(self):
57
+ super().check_values()
58
+ c = asdict(self)
59
+ assert (
60
+ c["model_params"]["input_dim"] == self.audio.num_mels
61
+ ), " [!] model input dimension must be equal to melspectrogram dimension."
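Several data-loader fields above are declared `MISSING`, so a concrete config has to fill them before training. A minimal hedged sketch using `SpeakerEncoderConfig` (defined a few files below) follows; the numeric values are illustrative, not recommendations, and `check_values()` also runs the base training-config checks.

```python
# Minimal sketch: defaults give model_params["input_dim"] == 80, so keep
# audio.num_mels at 80 (or change both together) to satisfy check_values().
from TTS.config.shared_configs import BaseAudioConfig
from TTS.encoder.configs.speaker_encoder_config import SpeakerEncoderConfig

config = SpeakerEncoderConfig(
    audio=BaseAudioConfig(num_mels=80),
    # the MISSING data-loader fields must be set explicitly
    num_classes_in_batch=32,
    num_utter_per_class=4,
    num_loader_workers=4,
    loss="angleproto",
)

config.check_values()  # raises if model_params["input_dim"] != audio.num_mels
print(config.model_params["proj_dim"], config.loss)
```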
TTS/encoder/configs/emotion_encoder_config.py ADDED
@@ -0,0 +1,12 @@
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
+ @dataclass
7
+ class EmotionEncoderConfig(BaseEncoderConfig):
8
+ """Defines parameters for Emotion Encoder model."""
9
+
10
+ model: str = "emotion_encoder"
11
+ map_classid_to_classname: dict = None
12
+ class_name_key: str = "emotion_name"
TTS/encoder/configs/speaker_encoder_config.py ADDED
@@ -0,0 +1,11 @@
1
+ from dataclasses import asdict, dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
+ @dataclass
7
+ class SpeakerEncoderConfig(BaseEncoderConfig):
8
+ """Defines parameters for Speaker Encoder model."""
9
+
10
+ model: str = "speaker_encoder"
11
+ class_name_key: str = "speaker_name"
TTS/encoder/dataset.py ADDED
@@ -0,0 +1,147 @@
1
+ import random
2
+
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+
6
+ from TTS.encoder.utils.generic_utils import AugmentWAV
7
+
8
+
9
+ class EncoderDataset(Dataset):
10
+ def __init__(
11
+ self,
12
+ config,
13
+ ap,
14
+ meta_data,
15
+ voice_len=1.6,
16
+ num_classes_in_batch=64,
17
+ num_utter_per_class=10,
18
+ verbose=False,
19
+ augmentation_config=None,
20
+ use_torch_spec=None,
21
+ ):
22
+ """
23
+ Args:
24
+ ap (TTS.tts.utils.AudioProcessor): audio processor object.
25
+ meta_data (list): list of dataset instances.
26
+ seq_len (int): voice segment length in seconds.
27
+ verbose (bool): print diagnostic information.
28
+ """
29
+ super().__init__()
30
+ self.config = config
31
+ self.items = meta_data
32
+ self.sample_rate = ap.sample_rate
33
+ self.seq_len = int(voice_len * self.sample_rate)
34
+ self.num_utter_per_class = num_utter_per_class
35
+ self.ap = ap
36
+ self.verbose = verbose
37
+ self.use_torch_spec = use_torch_spec
38
+ self.classes, self.items = self.__parse_items()
39
+
40
+ self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
41
+
42
+ # Data Augmentation
43
+ self.augmentator = None
44
+ self.gaussian_augmentation_config = None
45
+ if augmentation_config:
46
+ self.data_augmentation_p = augmentation_config["p"]
47
+ if self.data_augmentation_p and ("additive" in augmentation_config or "rir" in augmentation_config):
48
+ self.augmentator = AugmentWAV(ap, augmentation_config)
49
+
50
+ if "gaussian" in augmentation_config.keys():
51
+ self.gaussian_augmentation_config = augmentation_config["gaussian"]
52
+
53
+ if self.verbose:
54
+ print("\n > DataLoader initialization")
55
+ print(f" | > Classes per Batch: {num_classes_in_batch}")
56
+ print(f" | > Number of instances : {len(self.items)}")
57
+ print(f" | > Sequence length: {self.seq_len}")
58
+ print(f" | > Num Classes: {len(self.classes)}")
59
+ print(f" | > Classes: {self.classes}")
60
+
61
+ def load_wav(self, filename):
62
+ audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
63
+ return audio
64
+
65
+ def __parse_items(self):
66
+ class_to_utters = {}
67
+ for item in self.items:
68
+ path_ = item["audio_file"]
69
+ class_name = item[self.config.class_name_key]
70
+ if class_name in class_to_utters.keys():
71
+ class_to_utters[class_name].append(path_)
72
+ else:
73
+ class_to_utters[class_name] = [
74
+ path_,
75
+ ]
76
+
77
+ # keep only classes that have at least self.num_utter_per_class samples
78
+ class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
79
+
80
+ classes = list(class_to_utters.keys())
81
+ classes.sort()
82
+
83
+ new_items = []
84
+ for item in self.items:
85
+ path_ = item["audio_file"]
86
+ class_name = item["emotion_name"] if self.config.model == "emotion_encoder" else item["speaker_name"]
87
+ # ignore filtered classes
88
+ if class_name not in classes:
89
+ continue
90
+ # ignore small audios
91
+ if self.load_wav(path_).shape[0] - self.seq_len <= 0:
92
+ continue
93
+
94
+ new_items.append({"wav_file_path": path_, "class_name": class_name})
95
+
96
+ return classes, new_items
97
+
98
+ def __len__(self):
99
+ return len(self.items)
100
+
101
+ def get_num_classes(self):
102
+ return len(self.classes)
103
+
104
+ def get_class_list(self):
105
+ return self.classes
106
+
107
+ def set_classes(self, classes):
108
+ self.classes = classes
109
+ self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
110
+
111
+ def get_map_classid_to_classname(self):
112
+ return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
113
+
114
+ def __getitem__(self, idx):
115
+ return self.items[idx]
116
+
117
+ def collate_fn(self, batch):
118
+ # get the batch class_ids
119
+ labels = []
120
+ feats = []
121
+ for item in batch:
122
+ utter_path = item["wav_file_path"]
123
+ class_name = item["class_name"]
124
+
125
+ # get classid
126
+ class_id = self.classname_to_classid[class_name]
127
+ # load wav file
128
+ wav = self.load_wav(utter_path)
129
+ offset = random.randint(0, wav.shape[0] - self.seq_len)
130
+ wav = wav[offset : offset + self.seq_len]
131
+
132
+ if self.augmentator is not None and self.data_augmentation_p:
133
+ if random.random() < self.data_augmentation_p:
134
+ wav = self.augmentator.apply_one(wav)
135
+
136
+ if not self.use_torch_spec:
137
+ mel = self.ap.melspectrogram(wav)
138
+ feats.append(torch.FloatTensor(mel))
139
+ else:
140
+ feats.append(torch.FloatTensor(wav))
141
+
142
+ labels.append(class_id)
143
+
144
+ feats = torch.stack(feats)
145
+ labels = torch.LongTensor(labels)
146
+
147
+ return feats, labels
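`EncoderDataset.__getitem__` only returns the raw item dicts; the actual cropping, augmentation, and feature extraction happen in `collate_fn`, so a `DataLoader` must be pointed at that function. The training script pairs it with a class-balanced sampler; the hedged sketch below just shuffles to stay self-contained, and it assumes a `config`/`ap` built as in the earlier sketches plus `meta_data` items with `"audio_file"` and `"speaker_name"` keys.

```python
# Hedged sketch: wiring EncoderDataset into a torch DataLoader.
from torch.utils.data import DataLoader

from TTS.encoder.dataset import EncoderDataset

meta_data = [
    {"audio_file": "/data/spk1/a.wav", "speaker_name": "spk1"},  # illustrative entries;
    {"audio_file": "/data/spk1/b.wav", "speaker_name": "spk1"},  # each speaker needs at least
    # ...                                                        # num_utter_per_class clips to survive filtering
]

dataset = EncoderDataset(
    config,
    ap,
    meta_data,
    voice_len=config.voice_len,
    num_classes_in_batch=config.num_classes_in_batch,
    num_utter_per_class=config.num_utter_per_class,
    verbose=True,
    augmentation_config=config.audio_augmentation or None,
    use_torch_spec=config.model_params.get("use_torch_spec", False),
)

loader = DataLoader(
    dataset,
    batch_size=config.num_classes_in_batch * config.num_utter_per_class,
    shuffle=True,                      # the training script uses a class-balanced sampler instead
    collate_fn=dataset.collate_fn,
    num_workers=config.num_loader_workers,
)

feats, labels = next(iter(loader))     # feats: (B, num_mels, T), or (B, T) with use_torch_spec
```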
TTS/encoder/losses.py ADDED
@@ -0,0 +1,226 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
+ # adapted from https://github.com/cvqluu/GE2E-Loss
7
+ class GE2ELoss(nn.Module):
8
+ def __init__(self, init_w=10.0, init_b=-5.0, loss_method="softmax"):
9
+ """
10
+ Implementation of the Generalized End-to-End loss defined in https://arxiv.org/abs/1710.10467 [1]
11
+ Accepts an input of size (N, M, D)
12
+ where N is the number of speakers in the batch,
13
+ M is the number of utterances per speaker,
14
+ and D is the dimensionality of the embedding vector (e.g. d-vector)
15
+ Args:
16
+ - init_w (float): defines the initial value of w in Equation (5) of [1]
17
+ - init_b (float): defines the initial value of b in Equation (5) of [1]
18
+ """
19
+ super().__init__()
20
+ # pylint: disable=E1102
21
+ self.w = nn.Parameter(torch.tensor(init_w))
22
+ # pylint: disable=E1102
23
+ self.b = nn.Parameter(torch.tensor(init_b))
24
+ self.loss_method = loss_method
25
+
26
+ print(" > Initialized Generalized End-to-End loss")
27
+
28
+ assert self.loss_method in ["softmax", "contrast"]
29
+
30
+ if self.loss_method == "softmax":
31
+ self.embed_loss = self.embed_loss_softmax
32
+ if self.loss_method == "contrast":
33
+ self.embed_loss = self.embed_loss_contrast
34
+
35
+ # pylint: disable=R0201
36
+ def calc_new_centroids(self, dvecs, centroids, spkr, utt):
37
+ """
38
+ Calculates the new centroids excluding the reference utterance
39
+ """
40
+ excl = torch.cat((dvecs[spkr, :utt], dvecs[spkr, utt + 1 :]))
41
+ excl = torch.mean(excl, 0)
42
+ new_centroids = []
43
+ for i, centroid in enumerate(centroids):
44
+ if i == spkr:
45
+ new_centroids.append(excl)
46
+ else:
47
+ new_centroids.append(centroid)
48
+ return torch.stack(new_centroids)
49
+
50
+ def calc_cosine_sim(self, dvecs, centroids):
51
+ """
52
+ Make the cosine similarity matrix with dims (N,M,N)
53
+ """
54
+ cos_sim_matrix = []
55
+ for spkr_idx, speaker in enumerate(dvecs):
56
+ cs_row = []
57
+ for utt_idx, utterance in enumerate(speaker):
58
+ new_centroids = self.calc_new_centroids(dvecs, centroids, spkr_idx, utt_idx)
59
+ # vector based cosine similarity for speed
60
+ cs_row.append(
61
+ torch.clamp(
62
+ torch.mm(
63
+ utterance.unsqueeze(1).transpose(0, 1),
64
+ new_centroids.transpose(0, 1),
65
+ )
66
+ / (torch.norm(utterance) * torch.norm(new_centroids, dim=1)),
67
+ 1e-6,
68
+ )
69
+ )
70
+ cs_row = torch.cat(cs_row, dim=0)
71
+ cos_sim_matrix.append(cs_row)
72
+ return torch.stack(cos_sim_matrix)
73
+
74
+ # pylint: disable=R0201
75
+ def embed_loss_softmax(self, dvecs, cos_sim_matrix):
76
+ """
77
+ Calculates the loss on each embedding $L(e_{ji})$ by taking softmax
78
+ """
79
+ N, M, _ = dvecs.shape
80
+ L = []
81
+ for j in range(N):
82
+ L_row = []
83
+ for i in range(M):
84
+ L_row.append(-F.log_softmax(cos_sim_matrix[j, i], 0)[j])
85
+ L_row = torch.stack(L_row)
86
+ L.append(L_row)
87
+ return torch.stack(L)
88
+
89
+ # pylint: disable=R0201
90
+ def embed_loss_contrast(self, dvecs, cos_sim_matrix):
91
+ """
92
+ Calculates the loss on each embedding $L(e_{ji})$ by contrast loss with closest centroid
93
+ """
94
+ N, M, _ = dvecs.shape
95
+ L = []
96
+ for j in range(N):
97
+ L_row = []
98
+ for i in range(M):
99
+ centroids_sigmoids = torch.sigmoid(cos_sim_matrix[j, i])
100
+ excl_centroids_sigmoids = torch.cat((centroids_sigmoids[:j], centroids_sigmoids[j + 1 :]))
101
+ L_row.append(1.0 - torch.sigmoid(cos_sim_matrix[j, i, j]) + torch.max(excl_centroids_sigmoids))
102
+ L_row = torch.stack(L_row)
103
+ L.append(L_row)
104
+ return torch.stack(L)
105
+
106
+ def forward(self, x, _label=None):
107
+ """
108
+ Calculates the GE2E loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
109
+ """
110
+
111
+ assert x.size()[1] >= 2
112
+
113
+ centroids = torch.mean(x, 1)
114
+ cos_sim_matrix = self.calc_cosine_sim(x, centroids)
115
+ torch.clamp(self.w, 1e-6)
116
+ cos_sim_matrix = self.w * cos_sim_matrix + self.b
117
+ L = self.embed_loss(x, cos_sim_matrix)
118
+ return L.mean()
119
+
120
+
121
+ # adapted from https://github.com/clovaai/voxceleb_trainer/blob/master/loss/angleproto.py
122
+ class AngleProtoLoss(nn.Module):
123
+ """
124
+ Implementation of the Angular Prototypical loss defined in https://arxiv.org/abs/2003.11982
125
+ Accepts an input of size (N, M, D)
126
+ where N is the number of speakers in the batch,
127
+ M is the number of utterances per speaker,
128
+ and D is the dimensionality of the embedding vector
129
+ Args:
130
+ - init_w (float): defines the initial value of w
131
+ - init_b (float): defines the initial value of b
132
+ """
133
+
134
+ def __init__(self, init_w=10.0, init_b=-5.0):
135
+ super().__init__()
136
+ # pylint: disable=E1102
137
+ self.w = nn.Parameter(torch.tensor(init_w))
138
+ # pylint: disable=E1102
139
+ self.b = nn.Parameter(torch.tensor(init_b))
140
+ self.criterion = torch.nn.CrossEntropyLoss()
141
+
142
+ print(" > Initialized Angular Prototypical loss")
143
+
144
+ def forward(self, x, _label=None):
145
+ """
146
+ Calculates the AngleProto loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
147
+ """
148
+
149
+ assert x.size()[1] >= 2
150
+
151
+ out_anchor = torch.mean(x[:, 1:, :], 1)
152
+ out_positive = x[:, 0, :]
153
+ num_speakers = out_anchor.size()[0]
154
+
155
+ cos_sim_matrix = F.cosine_similarity(
156
+ out_positive.unsqueeze(-1).expand(-1, -1, num_speakers),
157
+ out_anchor.unsqueeze(-1).expand(-1, -1, num_speakers).transpose(0, 2),
158
+ )
159
+ torch.clamp(self.w, 1e-6)
160
+ cos_sim_matrix = cos_sim_matrix * self.w + self.b
161
+ label = torch.arange(num_speakers).to(cos_sim_matrix.device)
162
+ L = self.criterion(cos_sim_matrix, label)
163
+ return L
164
+
165
+
166
+ class SoftmaxLoss(nn.Module):
167
+ """
168
+ Implementation of the Softmax loss as defined in https://arxiv.org/abs/2003.11982
169
+ Args:
170
+ - embedding_dim (float): speaker embedding dim
171
+ - n_speakers (float): number of speakers
172
+ """
173
+
174
+ def __init__(self, embedding_dim, n_speakers):
175
+ super().__init__()
176
+
177
+ self.criterion = torch.nn.CrossEntropyLoss()
178
+ self.fc = nn.Linear(embedding_dim, n_speakers)
179
+
180
+ print("Initialised Softmax Loss")
181
+
182
+ def forward(self, x, label=None):
183
+ # reshape for compatibility
184
+ x = x.reshape(-1, x.size()[-1])
185
+ label = label.reshape(-1)
186
+
187
+ x = self.fc(x)
188
+ L = self.criterion(x, label)
189
+
190
+ return L
191
+
192
+ def inference(self, embedding):
193
+ x = self.fc(embedding)
194
+ activations = torch.nn.functional.softmax(x, dim=1).squeeze(0)
195
+ class_id = torch.argmax(activations)
196
+ return class_id
197
+
198
+
199
+ class SoftmaxAngleProtoLoss(nn.Module):
200
+ """
201
+ Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
202
+ Args:
203
+ - embedding_dim (float): speaker embedding dim
204
+ - n_speakers (float): number of speakers
205
+ - init_w (float): defines the initial value of w
206
+ - init_b (float): defines the initial value of b
207
+ """
208
+
209
+ def __init__(self, embedding_dim, n_speakers, init_w=10.0, init_b=-5.0):
210
+ super().__init__()
211
+
212
+ self.softmax = SoftmaxLoss(embedding_dim, n_speakers)
213
+ self.angleproto = AngleProtoLoss(init_w, init_b)
214
+
215
+ print("Initialised SoftmaxAnglePrototypical Loss")
216
+
217
+ def forward(self, x, label=None):
218
+ """
219
+ Calculates the SoftmaxAnglePrototypical loss for an input of dimensions (num_speakers, num_utts_per_speaker, dvec_feats)
220
+ """
221
+
222
+ Lp = self.angleproto(x)
223
+
224
+ Ls = self.softmax(x, label)
225
+
226
+ return Ls + Lp
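All of these losses share the `(N, M, D)` input convention: N speakers per batch, M ≥ 2 utterances per speaker, D embedding dimensions, with the softmax-based losses additionally taking one class id per utterance. A tiny smoke test with random embeddings makes the expected shapes concrete (the numbers are arbitrary and carry no training meaning).

```python
# Shape smoke test for the encoder losses on random, L2-normalized embeddings.
import torch

from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss

N, M, D = 4, 3, 256   # speakers per batch, utterances per speaker, embedding dim
x = torch.nn.functional.normalize(torch.randn(N, M, D), dim=-1)

print(GE2ELoss(loss_method="softmax")(x))   # scalar loss
print(AngleProtoLoss()(x))                  # scalar loss

# SoftmaxAngleProtoLoss also needs a class id per utterance; (N, M) is flattened internally.
labels = torch.arange(N).unsqueeze(1).repeat(1, M)
print(SoftmaxAngleProtoLoss(embedding_dim=D, n_speakers=N)(x, labels))
```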
TTS/encoder/models/__pycache__/base_encoder.cpython-39.pyc ADDED
Binary file (4.52 kB). View file
 
TTS/encoder/models/__pycache__/lstm.cpython-39.pyc ADDED
Binary file (3.62 kB). View file
 
TTS/encoder/models/__pycache__/resnet.cpython-39.pyc ADDED
Binary file (5.84 kB). View file
 
TTS/encoder/models/base_encoder.py ADDED
@@ -0,0 +1,161 @@
1
+ import numpy as np
2
+ import torch
3
+ import torchaudio
4
+ from coqpit import Coqpit
5
+ from torch import nn
6
+
7
+ from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
8
+ from TTS.utils.generic_utils import set_init_dict
9
+ from TTS.utils.io import load_fsspec
10
+
11
+
12
+ class PreEmphasis(nn.Module):
13
+ def __init__(self, coefficient=0.97):
14
+ super().__init__()
15
+ self.coefficient = coefficient
16
+ self.register_buffer("filter", torch.FloatTensor([-self.coefficient, 1.0]).unsqueeze(0).unsqueeze(0))
17
+
18
+ def forward(self, x):
19
+ assert len(x.size()) == 2
20
+
21
+ x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
22
+ return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
23
+
24
+
25
+ class BaseEncoder(nn.Module):
26
+ """Base `encoder` class. Every new `encoder` model must inherit this.
27
+
28
+ It defines common `encoder` specific functions.
29
+ """
30
+
31
+ # pylint: disable=W0102
32
+ def __init__(self):
33
+ super(BaseEncoder, self).__init__()
34
+
35
+ def get_torch_mel_spectrogram_class(self, audio_config):
36
+ return torch.nn.Sequential(
37
+ PreEmphasis(audio_config["preemphasis"]),
38
+ # TorchSTFT(
39
+ # n_fft=audio_config["fft_size"],
40
+ # hop_length=audio_config["hop_length"],
41
+ # win_length=audio_config["win_length"],
42
+ # sample_rate=audio_config["sample_rate"],
43
+ # window="hamming_window",
44
+ # mel_fmin=0.0,
45
+ # mel_fmax=None,
46
+ # use_htk=True,
47
+ # do_amp_to_db=False,
48
+ # n_mels=audio_config["num_mels"],
49
+ # power=2.0,
50
+ # use_mel=True,
51
+ # mel_norm=None,
52
+ # )
53
+ torchaudio.transforms.MelSpectrogram(
54
+ sample_rate=audio_config["sample_rate"],
55
+ n_fft=audio_config["fft_size"],
56
+ win_length=audio_config["win_length"],
57
+ hop_length=audio_config["hop_length"],
58
+ window_fn=torch.hamming_window,
59
+ n_mels=audio_config["num_mels"],
60
+ ),
61
+ )
62
+
63
+ @torch.no_grad()
64
+ def inference(self, x, l2_norm=True):
65
+ return self.forward(x, l2_norm)
66
+
67
+ @torch.no_grad()
68
+ def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True, l2_norm=True):
69
+ """
70
+ Generate embeddings for a batch of utterances
71
+ x: 1xTxD
72
+ """
73
+ # map to the waveform size
74
+ if self.use_torch_spec:
75
+ num_frames = num_frames * self.audio_config["hop_length"]
76
+
77
+ max_len = x.shape[1]
78
+
79
+ if max_len < num_frames:
80
+ num_frames = max_len
81
+
82
+ offsets = np.linspace(0, max_len - num_frames, num=num_eval)
83
+
84
+ frames_batch = []
85
+ for offset in offsets:
86
+ offset = int(offset)
87
+ end_offset = int(offset + num_frames)
88
+ frames = x[:, offset:end_offset]
89
+ frames_batch.append(frames)
90
+
91
+ frames_batch = torch.cat(frames_batch, dim=0)
92
+ embeddings = self.inference(frames_batch, l2_norm=l2_norm)
93
+
94
+ if return_mean:
95
+ embeddings = torch.mean(embeddings, dim=0, keepdim=True)
96
+ return embeddings
97
+
98
+ def get_criterion(self, c: Coqpit, num_classes=None):
99
+ if c.loss == "ge2e":
100
+ criterion = GE2ELoss(loss_method="softmax")
101
+ elif c.loss == "angleproto":
102
+ criterion = AngleProtoLoss()
103
+ elif c.loss == "softmaxproto":
104
+ criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_classes)
105
+ else:
106
+ raise Exception("The %s loss is not supported" % c.loss)
107
+ return criterion
108
+
109
+ def load_checkpoint(
110
+ self,
111
+ config: Coqpit,
112
+ checkpoint_path: str,
113
+ eval: bool = False,
114
+ use_cuda: bool = False,
115
+ criterion=None,
116
+ cache=False,
117
+ ):
118
+ state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"), cache=cache)
119
+ try:
120
+ self.load_state_dict(state["model"])
121
+ print(" > Model fully restored. ")
122
+ except (KeyError, RuntimeError) as error:
123
+ # If eval raise the error
124
+ if eval:
125
+ raise error
126
+
127
+ print(" > Partial model initialization.")
128
+ model_dict = self.state_dict()
129
+ model_dict = set_init_dict(model_dict, state["model"], config)
130
+ self.load_state_dict(model_dict)
131
+ del model_dict
132
+
133
+ # load the criterion for restore_path
134
+ if criterion is not None and "criterion" in state:
135
+ try:
136
+ criterion.load_state_dict(state["criterion"])
137
+ except (KeyError, RuntimeError) as error:
138
+ print(" > Criterion load ignored because of:", error)
139
+
140
+ # instance and load the criterion for the encoder classifier in inference time
141
+ if (
142
+ eval
143
+ and criterion is None
144
+ and "criterion" in state
145
+ and getattr(config, "map_classid_to_classname", None) is not None
146
+ ):
147
+ criterion = self.get_criterion(config, len(config.map_classid_to_classname))
148
+ criterion.load_state_dict(state["criterion"])
149
+
150
+ if use_cuda:
151
+ self.cuda()
152
+ if criterion is not None:
153
+ criterion = criterion.cuda()
154
+
155
+ if eval:
156
+ self.eval()
157
+ assert not self.training
158
+
159
+ if not eval:
160
+ return criterion, state["step"]
161
+ return criterion
TTS/encoder/models/lstm.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ from TTS.encoder.models.base_encoder import BaseEncoder
5
+
6
+
7
+ class LSTMWithProjection(nn.Module):
8
+ def __init__(self, input_size, hidden_size, proj_size):
9
+ super().__init__()
10
+ self.input_size = input_size
11
+ self.hidden_size = hidden_size
12
+ self.proj_size = proj_size
13
+ self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
14
+ self.linear = nn.Linear(hidden_size, proj_size, bias=False)
15
+
16
+ def forward(self, x):
17
+ self.lstm.flatten_parameters()
18
+ o, (_, _) = self.lstm(x)
19
+ return self.linear(o)
20
+
21
+
22
+ class LSTMWithoutProjection(nn.Module):
23
+ def __init__(self, input_dim, lstm_dim, proj_dim, num_lstm_layers):
24
+ super().__init__()
25
+ self.lstm = nn.LSTM(input_size=input_dim, hidden_size=lstm_dim, num_layers=num_lstm_layers, batch_first=True)
26
+ self.linear = nn.Linear(lstm_dim, proj_dim, bias=True)
27
+ self.relu = nn.ReLU()
28
+
29
+ def forward(self, x):
30
+ _, (hidden, _) = self.lstm(x)
31
+ return self.relu(self.linear(hidden[-1]))
32
+
33
+
34
+ class LSTMSpeakerEncoder(BaseEncoder):
35
+ def __init__(
36
+ self,
37
+ input_dim,
38
+ proj_dim=256,
39
+ lstm_dim=768,
40
+ num_lstm_layers=3,
41
+ use_lstm_with_projection=True,
42
+ use_torch_spec=False,
43
+ audio_config=None,
44
+ ):
45
+ super().__init__()
46
+ self.use_lstm_with_projection = use_lstm_with_projection
47
+ self.use_torch_spec = use_torch_spec
48
+ self.audio_config = audio_config
49
+ self.proj_dim = proj_dim
50
+
51
+ layers = []
52
+ # choose the LSTM layer type
53
+ if use_lstm_with_projection:
54
+ layers.append(LSTMWithProjection(input_dim, lstm_dim, proj_dim))
55
+ for _ in range(num_lstm_layers - 1):
56
+ layers.append(LSTMWithProjection(proj_dim, lstm_dim, proj_dim))
57
+ self.layers = nn.Sequential(*layers)
58
+ else:
59
+ self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim, num_lstm_layers)
60
+
61
+ self.instancenorm = nn.InstanceNorm1d(input_dim)
62
+
63
+ if self.use_torch_spec:
64
+ self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
65
+ else:
66
+ self.torch_spec = None
67
+
68
+ self._init_layers()
69
+
70
+ def _init_layers(self):
71
+ for name, param in self.layers.named_parameters():
72
+ if "bias" in name:
73
+ nn.init.constant_(param, 0.0)
74
+ elif "weight" in name:
75
+ nn.init.xavier_normal_(param)
76
+
77
+ def forward(self, x, l2_norm=True):
78
+ """Forward pass of the model.
79
+
80
+ Args:
81
+ x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
82
+ to compute the spectrogram on-the-fly.
83
+ l2_norm (bool): Whether to L2-normalize the outputs.
84
+
85
+ Shapes:
86
+ - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
87
+ """
88
+ with torch.no_grad():
89
+ with torch.cuda.amp.autocast(enabled=False):
90
+ if self.use_torch_spec:
91
+ x.squeeze_(1)
92
+ x = self.torch_spec(x)
93
+ x = self.instancenorm(x).transpose(1, 2)
94
+ d = self.layers(x)
95
+ if self.use_lstm_with_projection:
96
+ d = d[:, -1]
97
+ if l2_norm:
98
+ d = torch.nn.functional.normalize(d, p=2, dim=1)
99
+ return d
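Without `use_torch_spec`, this encoder expects mel frames laid out as `(N, num_mels, T)`: the `InstanceNorm1d` runs over the mel channels before the sequence is transposed into the LSTM. A short shape check with random inputs:

```python
# Shape check for LSTMSpeakerEncoder fed precomputed mel frames (no torch spec).
import torch

from TTS.encoder.models.lstm import LSTMSpeakerEncoder

model = LSTMSpeakerEncoder(input_dim=80, proj_dim=256, lstm_dim=768, num_lstm_layers=3)
mels = torch.randn(2, 80, 160)          # (batch, num_mels, frames)
d_vectors = model(mels, l2_norm=True)   # -> (batch, proj_dim), L2-normalized
print(d_vectors.shape)                  # torch.Size([2, 256])
```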
TTS/encoder/models/resnet.py ADDED
@@ -0,0 +1,198 @@
1
+ import torch
2
+ from torch import nn
3
+
4
+ # from TTS.utils.audio.torch_transforms import TorchSTFT
5
+ from TTS.encoder.models.base_encoder import BaseEncoder
6
+
7
+
8
+ class SELayer(nn.Module):
9
+ def __init__(self, channel, reduction=8):
10
+ super(SELayer, self).__init__()
11
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
12
+ self.fc = nn.Sequential(
13
+ nn.Linear(channel, channel // reduction),
14
+ nn.ReLU(inplace=True),
15
+ nn.Linear(channel // reduction, channel),
16
+ nn.Sigmoid(),
17
+ )
18
+
19
+ def forward(self, x):
20
+ b, c, _, _ = x.size()
21
+ y = self.avg_pool(x).view(b, c)
22
+ y = self.fc(y).view(b, c, 1, 1)
23
+ return x * y
24
+
25
+
26
+ class SEBasicBlock(nn.Module):
27
+ expansion = 1
28
+
29
+ def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8):
30
+ super(SEBasicBlock, self).__init__()
31
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
32
+ self.bn1 = nn.BatchNorm2d(planes)
33
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False)
34
+ self.bn2 = nn.BatchNorm2d(planes)
35
+ self.relu = nn.ReLU(inplace=True)
36
+ self.se = SELayer(planes, reduction)
37
+ self.downsample = downsample
38
+ self.stride = stride
39
+
40
+ def forward(self, x):
41
+ residual = x
42
+
43
+ out = self.conv1(x)
44
+ out = self.relu(out)
45
+ out = self.bn1(out)
46
+
47
+ out = self.conv2(out)
48
+ out = self.bn2(out)
49
+ out = self.se(out)
50
+
51
+ if self.downsample is not None:
52
+ residual = self.downsample(x)
53
+
54
+ out += residual
55
+ out = self.relu(out)
56
+ return out
57
+
58
+
59
+ class ResNetSpeakerEncoder(BaseEncoder):
60
+ """Implementation of the model H/ASP without batch normalization in speaker embedding. This model was proposed in: https://arxiv.org/abs/2009.14153
61
+ Adapted from: https://github.com/clovaai/voxceleb_trainer
62
+ """
63
+
64
+ # pylint: disable=W0102
65
+ def __init__(
66
+ self,
67
+ input_dim=64,
68
+ proj_dim=512,
69
+ layers=[3, 4, 6, 3],
70
+ num_filters=[32, 64, 128, 256],
71
+ encoder_type="ASP",
72
+ log_input=False,
73
+ use_torch_spec=False,
74
+ audio_config=None,
75
+ ):
76
+ super(ResNetSpeakerEncoder, self).__init__()
77
+
78
+ self.encoder_type = encoder_type
79
+ self.input_dim = input_dim
80
+ self.log_input = log_input
81
+ self.use_torch_spec = use_torch_spec
82
+ self.audio_config = audio_config
83
+ self.proj_dim = proj_dim
84
+
85
+ self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1, padding=1)
86
+ self.relu = nn.ReLU(inplace=True)
87
+ self.bn1 = nn.BatchNorm2d(num_filters[0])
88
+
89
+ self.inplanes = num_filters[0]
90
+ self.layer1 = self.create_layer(SEBasicBlock, num_filters[0], layers[0])
91
+ self.layer2 = self.create_layer(SEBasicBlock, num_filters[1], layers[1], stride=(2, 2))
92
+ self.layer3 = self.create_layer(SEBasicBlock, num_filters[2], layers[2], stride=(2, 2))
93
+ self.layer4 = self.create_layer(SEBasicBlock, num_filters[3], layers[3], stride=(2, 2))
94
+
95
+ self.instancenorm = nn.InstanceNorm1d(input_dim)
96
+
97
+ if self.use_torch_spec:
98
+ self.torch_spec = self.get_torch_mel_spectrogram_class(audio_config)
99
+ else:
100
+ self.torch_spec = None
101
+
102
+ outmap_size = int(self.input_dim / 8)
103
+
104
+ self.attention = nn.Sequential(
105
+ nn.Conv1d(num_filters[3] * outmap_size, 128, kernel_size=1),
106
+ nn.ReLU(),
107
+ nn.BatchNorm1d(128),
108
+ nn.Conv1d(128, num_filters[3] * outmap_size, kernel_size=1),
109
+ nn.Softmax(dim=2),
110
+ )
111
+
112
+ if self.encoder_type == "SAP":
113
+ out_dim = num_filters[3] * outmap_size
114
+ elif self.encoder_type == "ASP":
115
+ out_dim = num_filters[3] * outmap_size * 2
116
+ else:
117
+ raise ValueError("Undefined encoder")
118
+
119
+ self.fc = nn.Linear(out_dim, proj_dim)
120
+
121
+ self._init_layers()
122
+
123
+ def _init_layers(self):
124
+ for m in self.modules():
125
+ if isinstance(m, nn.Conv2d):
126
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
127
+ elif isinstance(m, nn.BatchNorm2d):
128
+ nn.init.constant_(m.weight, 1)
129
+ nn.init.constant_(m.bias, 0)
130
+
131
+ def create_layer(self, block, planes, blocks, stride=1):
132
+ downsample = None
133
+ if stride != 1 or self.inplanes != planes * block.expansion:
134
+ downsample = nn.Sequential(
135
+ nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
136
+ nn.BatchNorm2d(planes * block.expansion),
137
+ )
138
+
139
+ layers = []
140
+ layers.append(block(self.inplanes, planes, stride, downsample))
141
+ self.inplanes = planes * block.expansion
142
+ for _ in range(1, blocks):
143
+ layers.append(block(self.inplanes, planes))
144
+
145
+ return nn.Sequential(*layers)
146
+
147
+ # pylint: disable=R0201
148
+ def new_parameter(self, *size):
149
+ out = nn.Parameter(torch.FloatTensor(*size))
150
+ nn.init.xavier_normal_(out)
151
+ return out
152
+
153
+ def forward(self, x, l2_norm=False):
154
+ """Forward pass of the model.
155
+
156
+ Args:
157
+ x (Tensor): Raw waveform signal or spectrogram frames. If input is a waveform, `torch_spec` must be `True`
158
+ to compute the spectrogram on-the-fly.
159
+ l2_norm (bool): Whether to L2-normalize the outputs.
160
+
161
+ Shapes:
162
+ - x: :math:`(N, 1, T_{in})` or :math:`(N, D_{spec}, T_{in})`
163
+ """
164
+ x.squeeze_(1)
165
+ # if using torch_spec, compute the spectrogram here; otherwise use the mel spec computed by the AP
166
+ if self.use_torch_spec:
167
+ x = self.torch_spec(x)
168
+
169
+ if self.log_input:
170
+ x = (x + 1e-6).log()
171
+ x = self.instancenorm(x).unsqueeze(1)
172
+
173
+ x = self.conv1(x)
174
+ x = self.relu(x)
175
+ x = self.bn1(x)
176
+
177
+ x = self.layer1(x)
178
+ x = self.layer2(x)
179
+ x = self.layer3(x)
180
+ x = self.layer4(x)
181
+
182
+ x = x.reshape(x.size()[0], -1, x.size()[-1])
183
+
184
+ w = self.attention(x)
185
+
186
+ if self.encoder_type == "SAP":
187
+ x = torch.sum(x * w, dim=2)
188
+ elif self.encoder_type == "ASP":
189
+ mu = torch.sum(x * w, dim=2)
190
+ sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-5))
191
+ x = torch.cat((mu, sg), 1)
192
+
193
+ x = x.view(x.size()[0], -1)
194
+ x = self.fc(x)
195
+
196
+ if l2_norm:
197
+ x = torch.nn.functional.normalize(x, p=2, dim=1)
198
+ return x
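The ResNet encoder is typically run directly on waveforms with `use_torch_spec=True`, in which case it needs an `audio_config` dict carrying the keys read by `get_torch_mel_spectrogram_class`. A short sketch with illustrative parameter values:

```python
# ResNetSpeakerEncoder on raw waveforms via the built-in torchaudio mel frontend.
import torch

from TTS.encoder.models.resnet import ResNetSpeakerEncoder

audio_config = {            # only the keys read by get_torch_mel_spectrogram_class
    "preemphasis": 0.97,
    "sample_rate": 16000,
    "fft_size": 512,
    "win_length": 400,
    "hop_length": 160,
    "num_mels": 64,
}

model = ResNetSpeakerEncoder(
    input_dim=64,           # must match num_mels
    proj_dim=512,
    log_input=True,
    use_torch_spec=True,
    audio_config=audio_config,
)

wavs = torch.randn(2, 16000)             # (batch, samples): one second of fake audio
emb = model(wavs, l2_norm=True)          # -> (batch, proj_dim)
print(emb.shape)                         # torch.Size([2, 512])

# compute_embedding() averages over several windows; for torch-spec models it
# also takes waveforms, here a single two-second clip.
single = model.compute_embedding(torch.randn(1, 32000))
print(single.shape)                      # torch.Size([1, 512])
```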
TTS/encoder/requirements.txt ADDED
@@ -0,0 +1,2 @@
 
1
+ umap-learn
2
+ numpy>=1.17.0
TTS/encoder/utils/__init__.py ADDED
File without changes
TTS/encoder/utils/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (166 Bytes). View file
 
TTS/encoder/utils/__pycache__/generic_utils.cpython-39.pyc ADDED
Binary file (5.01 kB). View file
 
TTS/encoder/utils/generic_utils.py ADDED
@@ -0,0 +1,182 @@
1
+ import datetime
2
+ import glob
3
+ import os
4
+ import random
5
+ import re
6
+
7
+ import numpy as np
8
+ from scipy import signal
9
+
10
+ from TTS.encoder.models.lstm import LSTMSpeakerEncoder
11
+ from TTS.encoder.models.resnet import ResNetSpeakerEncoder
12
+ from TTS.utils.io import save_fsspec
13
+
14
+
15
+ class AugmentWAV(object):
16
+ def __init__(self, ap, augmentation_config):
17
+ self.ap = ap
18
+ self.use_additive_noise = False
19
+
20
+ if "additive" in augmentation_config.keys():
21
+ self.additive_noise_config = augmentation_config["additive"]
22
+ additive_path = self.additive_noise_config["sounds_path"]
23
+ if additive_path:
24
+ self.use_additive_noise = True
25
+ # get noise types
26
+ self.additive_noise_types = []
27
+ for key in self.additive_noise_config.keys():
28
+ if isinstance(self.additive_noise_config[key], dict):
29
+ self.additive_noise_types.append(key)
30
+
31
+ additive_files = glob.glob(os.path.join(additive_path, "**/*.wav"), recursive=True)
32
+
33
+ self.noise_list = {}
34
+
35
+ for wav_file in additive_files:
36
+ noise_dir = wav_file.replace(additive_path, "").split(os.sep)[0]
37
+ # ignore not listed directories
38
+ if noise_dir not in self.additive_noise_types:
39
+ continue
40
+ if not noise_dir in self.noise_list:
41
+ self.noise_list[noise_dir] = []
42
+ self.noise_list[noise_dir].append(wav_file)
43
+
44
+ print(
45
+ f" | > Using Additive Noise Augmentation: with {len(additive_files)} audios instances from {self.additive_noise_types}"
46
+ )
47
+
48
+ self.use_rir = False
49
+
50
+ if "rir" in augmentation_config.keys():
51
+ self.rir_config = augmentation_config["rir"]
52
+ if self.rir_config["rir_path"]:
53
+ self.rir_files = glob.glob(os.path.join(self.rir_config["rir_path"], "**/*.wav"), recursive=True)
54
+ self.use_rir = True
55
+
56
+ print(f" | > Using RIR Noise Augmentation: with {len(self.rir_files)} audios instances")
57
+
58
+ self.create_augmentation_global_list()
59
+
60
+ def create_augmentation_global_list(self):
61
+ if self.use_additive_noise:
62
+ self.global_noise_list = self.additive_noise_types
63
+ else:
64
+ self.global_noise_list = []
65
+ if self.use_rir:
66
+ self.global_noise_list.append("RIR_AUG")
67
+
68
+ def additive_noise(self, noise_type, audio):
69
+ clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
70
+
71
+ noise_list = random.sample(
72
+ self.noise_list[noise_type],
73
+ random.randint(
74
+ self.additive_noise_config[noise_type]["min_num_noises"],
75
+ self.additive_noise_config[noise_type]["max_num_noises"],
76
+ ),
77
+ )
78
+
79
+ audio_len = audio.shape[0]
80
+ noises_wav = None
81
+ for noise in noise_list:
82
+ noiseaudio = self.ap.load_wav(noise, sr=self.ap.sample_rate)[:audio_len]
83
+
84
+ if noiseaudio.shape[0] < audio_len:
85
+ continue
86
+
87
+ noise_snr = random.uniform(
88
+ self.additive_noise_config[noise_type]["min_snr_in_db"],
89
+ self.additive_noise_config[noise_type]["max_num_noises"],
90
+ )
91
+ noise_db = 10 * np.log10(np.mean(noiseaudio**2) + 1e-4)
92
+ noise_wav = np.sqrt(10 ** ((clean_db - noise_db - noise_snr) / 10)) * noiseaudio
93
+
94
+ if noises_wav is None:
95
+ noises_wav = noise_wav
96
+ else:
97
+ noises_wav += noise_wav
98
+
99
+ # if all possible files is less than audio, choose other files
100
+ if noises_wav is None:
101
+ return self.additive_noise(noise_type, audio)
102
+
103
+ return audio + noises_wav
104
+
105
+ def reverberate(self, audio):
106
+ audio_len = audio.shape[0]
107
+
108
+ rir_file = random.choice(self.rir_files)
109
+ rir = self.ap.load_wav(rir_file, sr=self.ap.sample_rate)
110
+ rir = rir / np.sqrt(np.sum(rir**2))
111
+ return signal.convolve(audio, rir, mode=self.rir_config["conv_mode"])[:audio_len]
112
+
113
+ def apply_one(self, audio):
114
+ noise_type = random.choice(self.global_noise_list)
115
+ if noise_type == "RIR_AUG":
116
+ return self.reverberate(audio)
117
+
118
+ return self.additive_noise(noise_type, audio)
119
+
120
+
121
+ def to_camel(text):
122
+ text = text.capitalize()
123
+ return re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), text)
124
+
125
+
126
+ def setup_encoder_model(config: "Coqpit"):
127
+ if config.model_params["model_name"].lower() == "lstm":
128
+ model = LSTMSpeakerEncoder(
129
+ config.model_params["input_dim"],
130
+ config.model_params["proj_dim"],
131
+ config.model_params["lstm_dim"],
132
+ config.model_params["num_lstm_layers"],
133
+ use_torch_spec=config.model_params.get("use_torch_spec", False),
134
+ audio_config=config.audio,
135
+ )
136
+ elif config.model_params["model_name"].lower() == "resnet":
137
+ model = ResNetSpeakerEncoder(
138
+ input_dim=config.model_params["input_dim"],
139
+ proj_dim=config.model_params["proj_dim"],
140
+ log_input=config.model_params.get("log_input", False),
141
+ use_torch_spec=config.model_params.get("use_torch_spec", False),
142
+ audio_config=config.audio,
143
+ )
144
+ return model
145
+
146
+
147
+ def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
148
+ checkpoint_path = "checkpoint_{}.pth".format(current_step)
149
+ checkpoint_path = os.path.join(out_path, checkpoint_path)
150
+ print(" | | > Checkpoint saving : {}".format(checkpoint_path))
151
+
152
+ new_state_dict = model.state_dict()
153
+ state = {
154
+ "model": new_state_dict,
155
+ "optimizer": optimizer.state_dict() if optimizer is not None else None,
156
+ "criterion": criterion.state_dict(),
157
+ "step": current_step,
158
+ "epoch": epoch,
159
+ "loss": model_loss,
160
+ "date": datetime.date.today().strftime("%B %d, %Y"),
161
+ }
162
+ save_fsspec(state, checkpoint_path)
163
+
164
+
165
+ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path, current_step, epoch):
166
+ if model_loss < best_loss:
167
+ new_state_dict = model.state_dict()
168
+ state = {
169
+ "model": new_state_dict,
170
+ "optimizer": optimizer.state_dict(),
171
+ "criterion": criterion.state_dict(),
172
+ "step": current_step,
173
+ "epoch": epoch,
174
+ "loss": model_loss,
175
+ "date": datetime.date.today().strftime("%B %d, %Y"),
176
+ }
177
+ best_loss = model_loss
178
+ bestmodel_path = "best_model.pth"
179
+ bestmodel_path = os.path.join(out_path, bestmodel_path)
180
+ print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
181
+ save_fsspec(state, bestmodel_path)
182
+ return best_loss
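`AugmentWAV` is driven entirely by the `audio_augmentation` dict from the encoder config; the keys below are the ones this class actually reads (paths and numbers are illustrative, and real noise/RIR corpora must exist on disk for augmentation to run). Per-noise-type sub-dicts live directly under `"additive"` and are discovered because their values are dicts, so the directory names under `sounds_path` have to match those keys.

```python
# Illustrative augmentation config for AugmentWAV; `ap` is an AudioProcessor
# as in the earlier sketches.
import numpy as np

from TTS.encoder.utils.generic_utils import AugmentWAV

augmentation_config = {
    "p": 0.5,                              # probability used by EncoderDataset
    "additive": {
        "sounds_path": "/data/musan/",     # illustrative path; subfolders "speech", "noise"
        "speech": {"min_snr_in_db": 13, "max_snr_in_db": 20, "min_num_noises": 1, "max_num_noises": 1},
        "noise": {"min_snr_in_db": 0, "max_snr_in_db": 15, "min_num_noises": 1, "max_num_noises": 2},
    },
    "rir": {
        "rir_path": "/data/rirs/",         # illustrative path to RIR wav files
        "conv_mode": "full",
    },
}

augmenter = AugmentWAV(ap, augmentation_config)
noisy = augmenter.apply_one(np.random.randn(16000).astype(np.float32))
```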
TTS/encoder/utils/io.py ADDED
@@ -0,0 +1,38 @@
1
+ import datetime
2
+ import os
3
+
4
+ from TTS.utils.io import save_fsspec
5
+
6
+
7
+ def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
8
+ checkpoint_path = "checkpoint_{}.pth".format(current_step)
9
+ checkpoint_path = os.path.join(out_path, checkpoint_path)
10
+ print(" | | > Checkpoint saving : {}".format(checkpoint_path))
11
+
12
+ new_state_dict = model.state_dict()
13
+ state = {
14
+ "model": new_state_dict,
15
+ "optimizer": optimizer.state_dict() if optimizer is not None else None,
16
+ "step": current_step,
17
+ "loss": model_loss,
18
+ "date": datetime.date.today().strftime("%B %d, %Y"),
19
+ }
20
+ save_fsspec(state, checkpoint_path)
21
+
22
+
23
+ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_step):
24
+ if model_loss < best_loss:
25
+ new_state_dict = model.state_dict()
26
+ state = {
27
+ "model": new_state_dict,
28
+ "optimizer": optimizer.state_dict(),
29
+ "step": current_step,
30
+ "loss": model_loss,
31
+ "date": datetime.date.today().strftime("%B %d, %Y"),
32
+ }
33
+ best_loss = model_loss
34
+ bestmodel_path = "best_model.pth"
35
+ bestmodel_path = os.path.join(out_path, bestmodel_path)
36
+ print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
37
+ save_fsspec(state, bestmodel_path)
38
+ return best_loss
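These helpers mirror the ones in `generic_utils.py` minus the criterion state. A typical loop calls `save_checkpoint` every `save_step` steps and `save_best_model` after each evaluation, feeding the returned `best_loss` back in; a minimal hedged sketch (the loop values and output path are stand-ins for real training state):

```python
# Hedged sketch of calling the io helpers from a training loop; `model` and
# `optimizer` are assumed to come from the encoder training setup.
from TTS.encoder.utils.io import save_best_model, save_checkpoint

best_loss = float("inf")
out_path = "runs/speaker_encoder"        # illustrative output directory

for step, eval_loss in [(1000, 0.42), (2000, 0.37)]:   # stand-ins for real steps/losses
    save_checkpoint(model, optimizer, eval_loss, out_path, step)
    best_loss = save_best_model(model, optimizer, eval_loss, best_loss, out_path, step)
```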