wannaphong committed on
Commit
510b23c
1 Parent(s): 9f357eb
README.md CHANGED
@@ -1,3 +1,24 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ # KhanomTan TTS v1.0
6
+
7
+ KhanomTan TTS (ขนมตาล) is an open-source Thai text-to-speech model that supports multilingual speakers. It supports Thai, English, and other languages.
8
+
9
+ KhanomTan TTS is a YourTTS model trained with Thai support. We added the Thai speech corpora TSync 1* and TSync 2* to the [mbarnig/lb-de-fr-en-pt-12800-TTS-CORPUS](https://huggingface.co/datasets/mbarnig/lb-de-fr-en-pt-12800-TTS-CORPUS) dataset and used the combined data to train a YourTTS model.
10
+
11
+ ## Config
12
+ We added Thai characters to the grapheme config for training the model, and we use the speaker encoder model from [🐸 Coqui-TTS](https://github.com/coqui-ai/TTS/releases/tag/speaker_encoder_model).
13
+
14
+ ## Dataset
15
+ We use the Tsync 1 and Tsync 2 corpora, which are not complete datasets, and add them to the [mbarnig/lb-de-fr-en-pt-12800-TTS-CORPUS](https://huggingface.co/datasets/mbarnig/lb-de-fr-en-pt-12800-TTS-CORPUS) dataset.
16
+
17
+ ## Training the model
18
+ We used the 🐸 Coqui-TTS multilingual VITS-model recipe (version 0.7.1, commit id d46fbc240ccf21797d42ac26cb27eb0b9f8d31c4) to train the model, together with the speaker encoder model from [🐸 Coqui-TTS](https://github.com/coqui-ai/TTS/releases/tag/speaker_encoder_model). We then released the best model publicly.
19
+
20
+ Model cards: ____
21
+ Dataset (Tsync 1 and Tsync 2 only): ____
22
+ GitHub: ____
23
+
24
+ *Note: These are not the complete corpora. We only have access to the publicly available portions.
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc1edb9b669cf2b40c24bb2fabd9cfb904fb6692a4569c5402fb1ea650ce3160
3
+ size 1028070250
checkpoint_440000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12aaa5ec82aea04ad88bc1d78b7f89f0a1fd842cab58bf8073529b7e5095f8af
3
+ size 1028071658
config.json ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts",
3
+ "logger_uri": null,
4
+ "run_name": "vits_vctk",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 32,
28
+ "eval_batch_size": 16,
29
+ "grad_clip": [
30
+ 1000.0,
31
+ 1000.0
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": "",
45
+ "lr_scheduler_params": {},
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "vits",
52
+ "num_loader_workers": 4,
53
+ "num_eval_loader_workers": 4,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "fft_size": 1024,
57
+ "sample_rate": 16000,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "num_mels": 80,
61
+ "mel_fmin": 0,
62
+ "mel_fmax": null
63
+ },
64
+ "use_phonemes": false,
65
+ "phonemizer": null,
66
+ "phoneme_language": "en-us",
67
+ "compute_input_seq_cache": true,
68
+ "text_cleaner": "multilingual_cleaners",
69
+ "enable_eos_bos_chars": false,
70
+ "test_sentences_file": "",
71
+ "phoneme_cache_path": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/phoneme_cache",
72
+ "characters": {
73
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
74
+ "vocab_dict": null,
75
+ "pad": "<PAD>",
76
+ "eos": "<EOS>",
77
+ "bos": "<BOS>",
78
+ "blank": "<BLNK>",
79
+ "characters": "!\u00a1'(),-.:;\u00bf?abcdefghijklmnopqrstuvwxyz\u00b5\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0105\u0107\u0119\u0142\u0144\u0153\u015b\u015f\u017a\u017c\u0192\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u04e7 \u00ab\u00b0\u00b1\u00b5\u00bb$%&\u2018\u2019\u201a\u201c`\u201d\u201e\u0e01\u0e02\u0e03\u0e04\u0e05\u0e06\u0e07\u0e08\u0e09\u0e0a\u0e0b\u0e0c\u0e0d\u0e0e\u0e0f\u0e10\u0e11\u0e12\u0e13\u0e14\u0e15\u0e16\u0e17\u0e18\u0e19\u0e1a\u0e1b\u0e1c\u0e1d\u0e1e\u0e1f\u0e20\u0e21\u0e22\u0e23\u0e25\u0e27\u0e28\u0e29\u0e2a\u0e2b\u0e2c\u0e2d\u0e2e\u0e24\u0e26\u0e30\u0e31\u0e32\u0e33\u0e34\u0e35\u0e36\u0e37\u0e38\u0e39\u0e40\u0e41\u0e42\u0e43\u0e44\u0e45\u0e4d\u0e47\u0e48\u0e49\u0e4a\u0e4b\u0e2f\u0e3a\u0e46\u0e4c\u0e4d\u0e4e",
80
+ "punctuations": "!\u00a1'(),-.:;\u00bf? ",
81
+ "phonemes": null,
82
+ "is_unique": true,
83
+ "is_sorted": true
84
+ },
85
+ "add_blank": true,
86
+ "batch_group_size": 0,
87
+ "loss_masking": null,
88
+ "min_audio_len": 32768,
89
+ "max_audio_len": 160000,
90
+ "min_text_len": 1,
91
+ "max_text_len": Infinity,
92
+ "compute_f0": false,
93
+ "compute_linear_spec": true,
94
+ "precompute_num_workers": 0,
95
+ "start_by_longest": false,
96
+ "datasets": [
97
+ {
98
+ "name": "mailabs",
99
+ "path": "/workspace/wt/tts/mailabs2/x-lb",
100
+ "meta_file_train": null,
101
+ "ignored_speakers": null,
102
+ "language": "x-lb",
103
+ "meta_file_val": "",
104
+ "meta_file_attn_mask": ""
105
+ },
106
+ {
107
+ "name": "mailabs",
108
+ "path": "/workspace/wt/tts/mailabs2/fr-fr",
109
+ "meta_file_train": null,
110
+ "ignored_speakers": null,
111
+ "language": "fr-fr",
112
+ "meta_file_val": "",
113
+ "meta_file_attn_mask": ""
114
+ },
115
+ {
116
+ "name": "mailabs",
117
+ "path": "/workspace/wt/tts/mailabs2/th-th",
118
+ "meta_file_train": null,
119
+ "ignored_speakers": null,
120
+ "language": "th-th",
121
+ "meta_file_val": "",
122
+ "meta_file_attn_mask": ""
123
+ },
124
+ {
125
+ "name": "mailabs",
126
+ "path": "/workspace/wt/tts/mailabs2/pt-br",
127
+ "meta_file_train": null,
128
+ "ignored_speakers": null,
129
+ "language": "pt-br",
130
+ "meta_file_val": "",
131
+ "meta_file_attn_mask": ""
132
+ },
133
+ {
134
+ "name": "mailabs",
135
+ "path": "/workspace/wt/tts/mailabs2/en",
136
+ "meta_file_train": null,
137
+ "ignored_speakers": null,
138
+ "language": "en",
139
+ "meta_file_val": "",
140
+ "meta_file_attn_mask": ""
141
+ },
142
+ {
143
+ "name": "mailabs",
144
+ "path": "/workspace/wt/tts/mailabs2/x-de",
145
+ "meta_file_train": null,
146
+ "ignored_speakers": null,
147
+ "language": "x-de",
148
+ "meta_file_val": "",
149
+ "meta_file_attn_mask": ""
150
+ }
151
+ ],
152
+ "test_sentences": [
153
+ [
154
+ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
155
+ "Linda",
156
+ null,
157
+ "en"
158
+ ],
159
+ [
160
+ "\u0e17\u0e14 \u0e2a\u0e2d\u0e1a \u0e01\u0e32\u0e23 \u0e2d\u0e48\u0e32\u0e19 \u0e20\u0e32\u0e29\u0e32 \u0e44\u0e17\u0e22",
161
+ "Tsynctwo",
162
+ null,
163
+ "th-th"
164
+ ]
165
+ ],
166
+ "eval_split_max_size": null,
167
+ "eval_split_size": 0.01,
168
+ "use_speaker_weighted_sampler": false,
169
+ "speaker_weighted_sampler_alpha": 1.0,
170
+ "use_language_weighted_sampler": true,
171
+ "language_weighted_sampler_alpha": 1.0,
172
+ "use_length_weighted_sampler": false,
173
+ "length_weighted_sampler_alpha": 1.0,
174
+ "model_args": {
175
+ "num_chars": 220,
176
+ "out_channels": 513,
177
+ "spec_segment_size": 32,
178
+ "hidden_channels": 192,
179
+ "hidden_channels_ffn_text_encoder": 768,
180
+ "num_heads_text_encoder": 2,
181
+ "num_layers_text_encoder": 6,
182
+ "kernel_size_text_encoder": 3,
183
+ "dropout_p_text_encoder": 0.1,
184
+ "dropout_p_duration_predictor": 0.5,
185
+ "kernel_size_posterior_encoder": 5,
186
+ "dilation_rate_posterior_encoder": 1,
187
+ "num_layers_posterior_encoder": 16,
188
+ "kernel_size_flow": 5,
189
+ "dilation_rate_flow": 1,
190
+ "num_layers_flow": 4,
191
+ "resblock_type_decoder": "1",
192
+ "resblock_kernel_sizes_decoder": [
193
+ 3,
194
+ 7,
195
+ 11
196
+ ],
197
+ "resblock_dilation_sizes_decoder": [
198
+ [
199
+ 1,
200
+ 3,
201
+ 5
202
+ ],
203
+ [
204
+ 1,
205
+ 3,
206
+ 5
207
+ ],
208
+ [
209
+ 1,
210
+ 3,
211
+ 5
212
+ ]
213
+ ],
214
+ "upsample_rates_decoder": [
215
+ 8,
216
+ 8,
217
+ 2,
218
+ 2
219
+ ],
220
+ "upsample_initial_channel_decoder": 512,
221
+ "upsample_kernel_sizes_decoder": [
222
+ 16,
223
+ 16,
224
+ 4,
225
+ 4
226
+ ],
227
+ "periods_multi_period_discriminator": [
228
+ 2,
229
+ 3,
230
+ 5,
231
+ 7,
232
+ 11
233
+ ],
234
+ "use_sdp": false,
235
+ "noise_scale": 1.0,
236
+ "inference_noise_scale": 0.667,
237
+ "length_scale": 1.0,
238
+ "noise_scale_dp": 1.0,
239
+ "inference_noise_scale_dp": 1.0,
240
+ "max_inference_len": null,
241
+ "init_discriminator": true,
242
+ "use_spectral_norm_disriminator": false,
243
+ "use_speaker_embedding": true,
244
+ "num_speakers": 20,
245
+ "speakers_file": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/vits_vctk-August-13-2022_06+53AM-d46fbc24/speakers.pth",
246
+ "d_vector_file": null,
247
+ "speaker_embedding_channels": 256,
248
+ "use_d_vector_file": false,
249
+ "d_vector_dim": 0,
250
+ "detach_dp_input": true,
251
+ "use_language_embedding": true,
252
+ "embedded_language_dim": 4,
253
+ "num_languages": 6,
254
+ "language_ids_file": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/vits_vctk-August-13-2022_06+53AM-d46fbc24/language_ids.json",
255
+ "use_speaker_encoder_as_loss": false,
256
+ "speaker_encoder_config_path": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/config_se.json",
257
+ "speaker_encoder_model_path": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/model_se.pth",
258
+ "condition_dp_on_speaker": true,
259
+ "freeze_encoder": false,
260
+ "freeze_DP": false,
261
+ "freeze_PE": false,
262
+ "freeze_flow_decoder": false,
263
+ "freeze_waveform_decoder": false,
264
+ "encoder_sample_rate": null,
265
+ "interpolate_z": true,
266
+ "reinit_DP": false,
267
+ "reinit_text_encoder": false
268
+ },
269
+ "lr_gen": 0.0002,
270
+ "lr_disc": 0.0002,
271
+ "lr_scheduler_gen": "ExponentialLR",
272
+ "lr_scheduler_gen_params": {
273
+ "gamma": 0.999875,
274
+ "last_epoch": -1
275
+ },
276
+ "lr_scheduler_disc": "ExponentialLR",
277
+ "lr_scheduler_disc_params": {
278
+ "gamma": 0.999875,
279
+ "last_epoch": -1
280
+ },
281
+ "kl_loss_alpha": 1.0,
282
+ "disc_loss_alpha": 1.0,
283
+ "gen_loss_alpha": 1.0,
284
+ "feat_loss_alpha": 1.0,
285
+ "mel_loss_alpha": 45.0,
286
+ "dur_loss_alpha": 1.0,
287
+ "speaker_encoder_loss_alpha": 1.0,
288
+ "return_wav": true,
289
+ "r": 1,
290
+ "num_speakers": 0,
291
+ "use_speaker_embedding": true,
292
+ "speakers_file": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/vits_vctk-August-13-2022_06+53AM-d46fbc24/speakers.pth",
293
+ "speaker_embedding_channels": 256,
294
+ "language_ids_file": "/workspace/wt/tts/TTS-1/recipes/multilingual/vits_tts/vits_vctk-August-13-2022_06+53AM-d46fbc24/language_ids.json",
295
+ "use_language_embedding": true,
296
+ "use_d_vector_file": false,
297
+ "d_vector_file": null,
298
+ "d_vector_dim": 0
299
+ }
config_se.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "speaker_encoder",
3
+ "run_name": "speaker_encoder",
4
+ "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
5
+ "epochs": 100000,
6
+ "batch_size": null,
7
+ "eval_batch_size": null,
8
+ "mixed_precision": false,
9
+ "run_eval": true,
10
+ "test_delay_epochs": 0,
11
+ "print_eval": false,
12
+ "print_step": 50,
13
+ "tb_plot_step": 100,
14
+ "tb_model_param_stats": false,
15
+ "save_step": 1000,
16
+ "checkpoint": true,
17
+ "keep_all_best": false,
18
+ "keep_after": 10000,
19
+ "num_loader_workers": 8,
20
+ "num_val_loader_workers": 0,
21
+ "use_noise_augment": false,
22
+ "output_path": "../checkpoints/speaker_encoder/language_balanced/normalized/angleproto-4-samples-by-speakers/",
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "audio": {
26
+ "fft_size": 512,
27
+ "win_length": 400,
28
+ "hop_length": 160,
29
+ "frame_shift_ms": null,
30
+ "frame_length_ms": null,
31
+ "stft_pad_mode": "reflect",
32
+ "sample_rate": 16000,
33
+ "resample": false,
34
+ "preemphasis": 0.97,
35
+ "ref_level_db": 20,
36
+ "do_sound_norm": false,
37
+ "do_trim_silence": false,
38
+ "trim_db": 60,
39
+ "power": 1.5,
40
+ "griffin_lim_iters": 60,
41
+ "num_mels": 64,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": 8000.0,
44
+ "spec_gain": 20,
45
+ "signal_norm": false,
46
+ "min_level_db": -100,
47
+ "symmetric_norm": false,
48
+ "max_norm": 4.0,
49
+ "clip_norm": false,
50
+ "stats_path": null,
51
+ "do_rms_norm": true,
52
+ "db_level": -27.0
53
+ },
54
+ "datasets": [
55
+ {
56
+ "name": "voxceleb2",
57
+ "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
58
+ "meta_file_train": null,
59
+ "ununsed_speakers": null,
60
+ "meta_file_val": null,
61
+ "meta_file_attn_mask": "",
62
+ "language": "voxceleb"
63
+ }
64
+ ],
65
+ "model_params": {
66
+ "model_name": "resnet",
67
+ "input_dim": 64,
68
+ "use_torch_spec": true,
69
+ "log_input": true,
70
+ "proj_dim": 512
71
+ },
72
+ "audio_augmentation": {
73
+ "p": 0.5,
74
+ "rir": {
75
+ "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
76
+ "conv_mode": "full"
77
+ },
78
+ "additive": {
79
+ "sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
80
+ "speech": {
81
+ "min_snr_in_db": 13,
82
+ "max_snr_in_db": 20,
83
+ "min_num_noises": 1,
84
+ "max_num_noises": 1
85
+ },
86
+ "noise": {
87
+ "min_snr_in_db": 0,
88
+ "max_snr_in_db": 15,
89
+ "min_num_noises": 1,
90
+ "max_num_noises": 1
91
+ },
92
+ "music": {
93
+ "min_snr_in_db": 5,
94
+ "max_snr_in_db": 15,
95
+ "min_num_noises": 1,
96
+ "max_num_noises": 1
97
+ }
98
+ },
99
+ "gaussian": {
100
+ "p": 0.0,
101
+ "min_amplitude": 0.0,
102
+ "max_amplitude": 1e-05
103
+ }
104
+ },
105
+ "storage": {
106
+ "sample_from_storage_p": 0.5,
107
+ "storage_size": 40
108
+ },
109
+ "max_train_step": 1000000,
110
+ "loss": "angleproto",
111
+ "grad_clip": 3.0,
112
+ "lr": 0.0001,
113
+ "lr_decay": false,
114
+ "warmup_steps": 4000,
115
+ "wd": 1e-06,
116
+ "steps_plot_stats": 100,
117
+ "num_speakers_in_batch": 100,
118
+ "num_utters_per_speaker": 4,
119
+ "skip_speakers": true,
120
+ "voice_len": 2.0
121
+ }
language_ids.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": 0,
3
+ "fr-fr": 1,
4
+ "pt-br": 2,
5
+ "th-th": 3,
6
+ "x-de": 4,
7
+ "x-lb": 5
8
+ }
model-training/events.out.tfevents.1660373622.29615ef310df ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f296dcd5b7a02a7cde7813cc362814ee19d170fb4aca22a4a1361142e83867b2
3
+ size 592190465
model_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
3
+ size 44610930
speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f76ff12945402e4be4a1c3c0396399a60fb08a14cf2c43b9814088a6a81ee0fc
3
+ size 687