saefro991 committed on
Commit
5e16dfd
1 Parent(s): 3cffb04

Update model

Browse files
README.md ADDED
@@ -0,0 +1,826 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: multilingual
7
+ datasets:
8
+ - masmultts
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `saefro991/tts_ipa_css10_7lang_textpretrain_residual_freeze`
15
+
16
+ This model was trained by Takaaki-Saeki using the masmultts recipe in [ESPnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 11a7d61312439111d4996d55935ede718d494262
26
+ pip install -e .
27
+ cd egs2/masmultts/tts_phn_css10_adap_residual_freeze
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model saefro991/tts_ipa_css10_7lang_textpretrain_residual_freeze
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_train_raw_phn_none
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 1
47
+ num_att_plot: 1
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 200
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ - - train
78
+ - loss
79
+ - min
80
+ keep_nbest_models: 3
81
+ nbest_averaging_interval: 0
82
+ grad_clip: 2.0
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 4
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: null
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ create_graph_in_tensorboard: false
94
+ use_wandb: false
95
+ wandb_project: null
96
+ wandb_id: null
97
+ wandb_entity: null
98
+ wandb_name: null
99
+ wandb_model_log_interval: -1
100
+ detect_anomaly: false
101
+ pretrain_path: null
102
+ init_param:
103
+ - ../tts_pretrain_phn_residual/exp/tts_train_phn_none/2epoch.pth:tts_pretrain.encoder:tts.encoder
104
+ - ../tts_pretrain_phn_residual/exp/tts_train_phn_none/2epoch.pth:tts_pretrain.lid_emb:tts.lid_emb
105
+ ignore_init_mismatch: false
106
+ freeze_param:
107
+ - tts.encoder.adapter
108
+ - tts.encoder.embed
109
+ - tts.lid_emb
110
+ num_iters_per_epoch: null
111
+ batch_size: 20
112
+ valid_batch_size: null
113
+ batch_bins: 400000
114
+ valid_batch_bins: null
115
+ train_shape_file:
116
+ - exp/tts_stats_raw_phn_none/train/text_shape.phn
117
+ - exp/tts_stats_raw_phn_none/train/speech_shape
118
+ valid_shape_file:
119
+ - exp/tts_stats_raw_phn_none/valid/text_shape.phn
120
+ - exp/tts_stats_raw_phn_none/valid/speech_shape
121
+ batch_type: numel
122
+ valid_batch_type: null
123
+ fold_length:
124
+ - 150
125
+ - 204800
126
+ sort_in_batch: descending
127
+ sort_batch: descending
128
+ multiple_iterator: false
129
+ chunk_length: 500
130
+ chunk_shift_ratio: 0.5
131
+ num_cache_chunks: 1024
132
+ train_data_path_and_name_and_type:
133
+ - - /local/11454483.1.gpu/dump/raw/train/text
134
+ - text
135
+ - text
136
+ - - /local/11454483.1.gpu/dump/raw/train/wav.scp
137
+ - speech
138
+ - sound
139
+ - - /local/11454483.1.gpu/dump/xvector/train/xvector.scp
140
+ - spembs
141
+ - kaldi_ark
142
+ - - /local/11454483.1.gpu/dump/raw/train/utt2lid
143
+ - lids
144
+ - text_int
145
+ valid_data_path_and_name_and_type:
146
+ - - /local/11454483.1.gpu/dump/raw/dev/text
147
+ - text
148
+ - text
149
+ - - /local/11454483.1.gpu/dump/raw/dev/wav.scp
150
+ - speech
151
+ - sound
152
+ - - /local/11454483.1.gpu/dump/xvector/dev/xvector.scp
153
+ - spembs
154
+ - kaldi_ark
155
+ - - /local/11454483.1.gpu/dump/raw/dev/utt2lid
156
+ - lids
157
+ - text_int
158
+ allow_variable_data_keys: false
159
+ max_cache_size: 0.0
160
+ max_cache_fd: 32
161
+ valid_max_cache_size: null
162
+ optim: adam
163
+ optim_conf:
164
+ lr: 1.0
165
+ scheduler: noamlr
166
+ scheduler_conf:
167
+ model_size: 512
168
+ warmup_steps: 50000
169
+ token_list:
170
+ - <blank>
171
+ - <unk>
172
+ - n
173
+ - t
174
+ - s
175
+ - l
176
+ - a
177
+ - e
178
+ - k
179
+ - d
180
+ - m
181
+ - ə
182
+ - r
183
+ - i
184
+ - p
185
+ - o
186
+ - v
187
+ - ɪ
188
+ - ˈa
189
+ - ɾ
190
+ - j
191
+ - z
192
+ - ˈɛ
193
+ - ˈe
194
+ - ɛ
195
+ - b
196
+ - ˈo
197
+ - f
198
+ - ˈi
199
+ - u
200
+ - ð
201
+ - ʁ
202
+ - h
203
+ - ɡ
204
+ - ɔ
205
+ - ʃ
206
+ - ˈu
207
+ - w
208
+ - ˌe
209
+ - ts
210
+ - ŋ
211
+ - ˌa
212
+ - æ
213
+ - iː
214
+ - ˈɪ
215
+ - ˈiː
216
+ - ˈaː
217
+ - ɹ
218
+ - ʊ
219
+ - ɑ
220
+ - ˈeː
221
+ - ˈɔ
222
+ - x
223
+ - aː
224
+ - tʃ
225
+ - ˌi
226
+ - ˌo
227
+ - tː
228
+ - oː
229
+ - ɣ
230
+ - ˈoː
231
+ - eː
232
+ - y
233
+ - θ
234
+ - ɲ
235
+ - ə-
236
+ - ʋ
237
+ - ʒ
238
+ - ˌɛ
239
+ - ˈɑ
240
+ - β
241
+ - uː
242
+ - ˈuː
243
+ - ˈaɪ
244
+ - ç
245
+ - ˈɑ̃
246
+ - ˈɔ̃
247
+ - ˈæ
248
+ - ɚ
249
+ - ˌɪ
250
+ - ɑ̃
251
+ - ˌu
252
+ - ˌɔ
253
+ - ˈy
254
+ - ɜ
255
+ - tʲ
256
+ - ˈeɪ
257
+ - ˈɑː
258
+ - ˌeː
259
+ - ʌ
260
+ - ᵻ
261
+ - ɐ
262
+ - ˌɑ
263
+ - ɨ
264
+ - ɔ̃
265
+ - dʒ
266
+ - e-
267
+ - ˌiː
268
+ - a-
269
+ - ˈʌ
270
+ - ˌʊ
271
+ - əl
272
+ - ʎ
273
+ - ˌaɪ
274
+ - aɪ
275
+ - ˈɔː
276
+ - ss
277
+ - ˈaʊ
278
+ - rʲ
279
+ - kː
280
+ - ˈoʊ
281
+ - ˌaː
282
+ - ɑː
283
+ - nʲ
284
+ - ˌoː
285
+ - ø
286
+ - ˈɛɪ
287
+ - ɛɪ
288
+ - ˌæ
289
+ - ʂ
290
+ - ɲʲ
291
+ - ˌɑː
292
+ - ɕ
293
+ - ˈai
294
+ - vʲ
295
+ - dʲ
296
+ - ai
297
+ - ei
298
+ - ɛ̃
299
+ - mʲ
300
+ - ˈø
301
+ - ɭ
302
+ - ˈɵ
303
+ - pː
304
+ - ˈɛ̃
305
+ - ɔː
306
+ - oʊ
307
+ - ˈɜː
308
+ - ˈʊ
309
+ - tɕ
310
+ - ɟ
311
+ - ˌaʊ
312
+ - ˈœ
313
+ - kʲ
314
+ - ˈuo
315
+ - ˈoi
316
+ - æː
317
+ - dʑ
318
+ - l̩
319
+ - ˈie
320
+ - ɪː
321
+ - ie
322
+ - oi
323
+ - ˌeɪ
324
+ - ˈɨ
325
+ - yː
326
+ - ˈɪː
327
+ - ˌy
328
+ - øː
329
+ - ˈʏ
330
+ - ˈɛː
331
+ - ˈoːɹ
332
+ - ˌuː
333
+ - ˌʌ
334
+ - ˈeu
335
+ - ˈei
336
+ - aʊ
337
+ - ˌoi
338
+ - bː
339
+ - ˌai
340
+ - ˈœy
341
+ - ˈøː
342
+ - ˈɑːɹ
343
+ - œ̃
344
+ - ˈæː
345
+ - au
346
+ - y-
347
+ - r̝̊
348
+ - ɵ
349
+ - ˌɵ
350
+ - c
351
+ - ˌɛɪ
352
+ - ˈɔø
353
+ - ˈyː
354
+ - ee
355
+ - pʲ
356
+ - ˈee
357
+ - bʲ
358
+ - ˈyø
359
+ - iə
360
+ - ˈiə
361
+ - ˌɨ
362
+ - ˌøː
363
+ - ɔːɹ
364
+ - ɔø
365
+ - eɪ
366
+ - ʑ
367
+ - ˈau
368
+ - ˈʊɹ
369
+ - r̝
370
+ - dʒː
371
+ - ˌeʊ
372
+ - ˈɔːɹ
373
+ - ˌoʊ
374
+ - ˌʊɹ
375
+ - ɑːɹ
376
+ - ˈæy
377
+ - ˌyː
378
+ - s^
379
+ - eu
380
+ - ˌə
381
+ - tʃː
382
+ - ˈə
383
+ - ˌei
384
+ - ea
385
+ - tsʲ
386
+ - ẽ
387
+ - ʌʊ
388
+ - œy
389
+ - ˈʌʊ
390
+ - nʲʲ
391
+ - ˌæi
392
+ - ˌʏ
393
+ - ˌɛː
394
+ - ˈɪɹ
395
+ - æi
396
+ - ˈɛɹ
397
+ - ˈæi
398
+ - ˈɔɪ
399
+ - ã
400
+ - dzː
401
+ - r̩
402
+ - ˈẽ
403
+ - ou
404
+ - œ
405
+ - ɜː
406
+ - uo
407
+ - tʲʲ
408
+ - ˌø
409
+ - ɛɹ
410
+ - ɭʲ
411
+ - iɪ
412
+ - (en)
413
+ - ʂʲ
414
+ - tsː
415
+ - ˌuo
416
+ - ˌʌʊ
417
+ - oːɹ
418
+ - ˈou
419
+ - ˌɛ̃
420
+ - ʝ
421
+ - eʊ
422
+ - ɨ̃
423
+ - ˈɔa
424
+ - ɟː
425
+ - ʊɐ
426
+ - ˈr̩
427
+ - tʃʲ
428
+ - uɪ
429
+ - ɡʲ
430
+ - ˈea
431
+ - ˌʊɐ
432
+ - ˈʊɐ
433
+ - ɛː
434
+ - ˌyi
435
+ - t^
436
+ - tɕʲ
437
+ - ˌea
438
+ - (fr)
439
+ - ɕʲ
440
+ - ʀ
441
+ - ˌɔø
442
+ - ʏ
443
+ - ˌœ
444
+ - ˈoɪ
445
+ - ˌau
446
+ - eɑ
447
+ - ˌɪː
448
+ - ˈeʊ
449
+ - ˈiɪ
450
+ - ˈã
451
+ - ˌɔː
452
+ - ˌã
453
+ - sʲ
454
+ - ˈaɪɚ
455
+ - ˌɑ̃
456
+ - ˌæː
457
+ - ey
458
+ - ˌœy
459
+ - ˈaɪə
460
+ - d̪
461
+ - ɾʲ
462
+ - ˌøi
463
+ - dː
464
+ - ˌie
465
+ - ui
466
+ - fʲ
467
+ - n̩
468
+ - ʔ
469
+ - ˌou
470
+ - yi
471
+ - ˌɑːɹ
472
+ - tsʲʲ
473
+ - ˌɐ
474
+ - ˈœ̃
475
+ - ˌyø
476
+ - dz
477
+ - ɡː
478
+ - ɾʲʲ
479
+ - ˈl̩
480
+ - ˈøy
481
+ - ˌæy
482
+ - cː
483
+ - æy
484
+ - ʊɹ
485
+ - ʑʲ
486
+ - ˌɜː
487
+ - yʊ
488
+ - ˌɛɹ
489
+ - pf
490
+ - dʑʲ
491
+ - ˌoːɹ
492
+ - ˈɨ̃
493
+ - ˈiʊ
494
+ - õ
495
+ - ɔa
496
+ - ˌɔa
497
+ - ˌee
498
+ - ˈĩ
499
+ - ˌiɪ
500
+ - ˌɔːɹ
501
+ - ˈɒ
502
+ - ja
503
+ - ĩ
504
+ - ˈũ
505
+ - ɒ
506
+ - ũ
507
+ - ʃʲ
508
+ - ɪɹ
509
+ - ju
510
+ - (de)
511
+ - yø
512
+ - ˌeu
513
+ - d^
514
+ - ˈiu
515
+ - ˈja
516
+ - øi
517
+ - ˈeɑ
518
+ - ˈyi
519
+ - ɾʲˌʲ
520
+ - ʃʲʲ
521
+ - ʃʲˌʲ
522
+ - aɪə
523
+ - ˈuɪ
524
+ - iu
525
+ - ˈõ
526
+ - iɐ
527
+ - ˌẽ
528
+ - iʊ
529
+ - ˌr̩
530
+ - ˈui
531
+ - əʊ
532
+ - u"
533
+ - ˌɔ̃
534
+ - ˈəʊ
535
+ - iy
536
+ - ʲ
537
+ - zʲˌʲ
538
+ - (it)
539
+ - ˌɒ
540
+ - ɔɪ
541
+ - ˌɪɹ
542
+ - ˈɵː
543
+ - ˈu"
544
+ - nʲˌʲ
545
+ - (nl)
546
+ - ˌl̩
547
+ - ˈey
548
+ - βː
549
+ - lʲʲ
550
+ - oɪ
551
+ - ˈiɐ
552
+ - ˌiɐ
553
+ - lʲ
554
+ - tsʲˌʲ
555
+ - xʲ
556
+ - ˌũ
557
+ - mʲʲ
558
+ - dʒʲ
559
+ - ˌeo
560
+ - ˈju
561
+ - r̩ː
562
+ - lʲˌʲ
563
+ - ˈøi
564
+ - t^ː
565
+ - əɪ
566
+ - l̩ː
567
+ - tʃˌʲ
568
+ - eo
569
+ - zʲʲ
570
+ - ˌiy
571
+ - aʲ
572
+ - ˌoɪ
573
+ - tl#
574
+ - ˈyɪ
575
+ - ˌiə
576
+ - ˌey
577
+ - øy
578
+ - dʲʲ
579
+ - ˈl̩ː
580
+ - ˈyʊ
581
+ - ˌɨ̃
582
+ - ʀʲ
583
+ - ɣː
584
+ - ˈeo
585
+ - ˈʊə
586
+ - ˌiu
587
+ - ˌøy
588
+ - ˈəɪ
589
+ - ˈeə
590
+ - aɪɚ
591
+ - ɪ^
592
+ - eə
593
+ - ˌĩ
594
+ - t̪
595
+ - vʲʲ
596
+ - (es)
597
+ - (gn)
598
+ - zʲ
599
+ - ˌõ
600
+ - əː
601
+ - bʲʲ
602
+ - (base)
603
+ - ˌəʊ
604
+ - ˈə-
605
+ - (ru)
606
+ - ˌɔɪ
607
+ - ˈæiː
608
+ - tsˌʲ
609
+ - ˈr̩ː
610
+ - ə--
611
+ - ˌn̩
612
+ - uʲ
613
+ - ˈw
614
+ - hʲ
615
+ - ˌeə
616
+ - yɪ
617
+ - fʲʲ
618
+ - ˌyʊ
619
+ - (el)
620
+ - ˌaɪɚ
621
+ - ˈəː
622
+ - ˌʊə
623
+ - ɵː
624
+ - t̪ː
625
+ - w-
626
+ - (sl)
627
+ - eʲ
628
+ - ˈa-
629
+ - ˌr̩ː
630
+ - mʲˌʲ
631
+ - (fi)
632
+ - ʒʲʲ
633
+ - çʲ
634
+ - ˌaɪə
635
+ - ˈɚ
636
+ - (lt)
637
+ - pʲʲ
638
+ - ˈɜ
639
+ - ˌuɪ
640
+ - ˌja
641
+ - (pl)
642
+ - ˈe-
643
+ - ˌe-
644
+ - (et)
645
+ - ˈoːʲ
646
+ - (kl)
647
+ - ˈõː
648
+ - (hu)
649
+ - ˈiy
650
+ - ʊə
651
+ - ˈaʲ
652
+ - ˌl̩ː
653
+ - lˌʲ
654
+ - '1'
655
+ - ʒʲ
656
+ - (cs)
657
+ - ˈææ
658
+ - ˈts-
659
+ - ts-
660
+ - ˌʊː
661
+ - ˌy"
662
+ - cʲ
663
+ - wʲ
664
+ - ˈãː
665
+ - ˈuʲ
666
+ - (ro)
667
+ - ˌɜ
668
+ - (sk)
669
+ - oːʲ
670
+ - ʊː
671
+ - ˈtl#tl#
672
+ - ʃˈʲ
673
+ - ɬ
674
+ - ˌə-
675
+ - (hr)
676
+ - tl#tl#
677
+ - ˌœ̃
678
+ - ˈʊː
679
+ - l̩ʲ
680
+ - dʒˌʲ
681
+ - tsˈʲ
682
+ - pʲˌʲ
683
+ - ˈʌː
684
+ - ˈeʲ
685
+ - aːʲ
686
+ - vʲˌʲ
687
+ - ˈj
688
+ - ()
689
+ - eːː
690
+ - ˌãː
691
+ - ˈuːʲ
692
+ - ˈeeʲ
693
+ - <sos/eos>
694
+ odim: null
695
+ model_conf: {}
696
+ use_preprocessor: true
697
+ token_type: phn
698
+ bpemodel: null
699
+ non_linguistic_symbols: null
700
+ cleaner: null
701
+ g2p: null
702
+ feats_extract: fbank
703
+ feats_extract_conf:
704
+ n_fft: 1024
705
+ hop_length: 256
706
+ win_length: null
707
+ fs: 16000
708
+ fmin: 80
709
+ fmax: 7600
710
+ n_mels: 80
711
+ normalize: global_mvn
712
+ normalize_conf:
713
+ stats_file: exp/tts_stats_raw_phn_none/train/feats_stats.npz
714
+ tts: transformer
715
+ tts_conf:
716
+ embed_dim: 0
717
+ eprenet_conv_layers: 0
718
+ eprenet_conv_filts: 0
719
+ eprenet_conv_chans: 0
720
+ dprenet_layers: 2
721
+ dprenet_units: 256
722
+ adim: 512
723
+ aheads: 8
724
+ elayers: 6
725
+ eunits: 1024
726
+ dlayers: 6
727
+ dunits: 1024
728
+ positionwise_layer_type: conv1d
729
+ positionwise_conv_kernel_size: 1
730
+ postnet_layers: 5
731
+ postnet_filts: 5
732
+ postnet_chans: 256
733
+ spk_embed_dim: 192
734
+ spk_embed_integration_type: add
735
+ use_gst: true
736
+ gst_heads: 4
737
+ gst_tokens: 16
738
+ use_masking: true
739
+ bce_pos_weight: 5.0
740
+ use_scaled_pos_enc: true
741
+ encoder_normalize_before: true
742
+ decoder_normalize_before: true
743
+ reduction_factor: 1
744
+ init_type: xavier_uniform
745
+ init_enc_alpha: 1.0
746
+ init_dec_alpha: 1.0
747
+ eprenet_dropout_rate: 0.0
748
+ dprenet_dropout_rate: 0.5
749
+ postnet_dropout_rate: 0.5
750
+ transformer_enc_dropout_rate: 0.1
751
+ transformer_enc_positional_dropout_rate: 0.1
752
+ transformer_enc_attn_dropout_rate: 0.1
753
+ transformer_dec_dropout_rate: 0.1
754
+ transformer_dec_positional_dropout_rate: 0.1
755
+ transformer_dec_attn_dropout_rate: 0.1
756
+ transformer_enc_dec_attn_dropout_rate: 0.1
757
+ use_guided_attn_loss: true
758
+ num_heads_applied_guided_attn: 2
759
+ num_layers_applied_guided_attn: 2
760
+ modules_applied_guided_attn:
761
+ - encoder-decoder
762
+ guided_attn_loss_sigma: 0.4
763
+ guided_attn_loss_lambda: 10.0
764
+ langs: 21
765
+ lang_family_encoding: false
766
+ num_lang_family: 7
767
+ use_adapter: true
768
+ adapter_type: residual
769
+ use_encoder_w_lid: true
770
+ pitch_extract: null
771
+ pitch_extract_conf: {}
772
+ pitch_normalize: null
773
+ pitch_normalize_conf: {}
774
+ energy_extract: null
775
+ energy_extract_conf: {}
776
+ energy_normalize: null
777
+ energy_normalize_conf: {}
778
+ required:
779
+ - output_dir
780
+ - token_list
781
+ version: '202209'
782
+ distributed: false
783
+ ```
784
+
785
+ </details>
786
+
787
+
788
+
789
+ ### Citing ESPnet
790
+
791
+ ```bibtex
792
+ @inproceedings{watanabe2018espnet,
793
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
794
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
795
+ year={2018},
796
+ booktitle={Proceedings of Interspeech},
797
+ pages={2207--2211},
798
+ doi={10.21437/Interspeech.2018-1456},
799
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
800
+ }
801
+
802
+
803
+
804
+
805
+ @inproceedings{hayashi2020espnet,
806
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
807
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
808
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
809
+ pages={7654--7658},
810
+ year={2020},
811
+ organization={IEEE}
812
+ }
813
+ ```
814
+
815
+ or arXiv:
816
+
817
+ ```bibtex
818
+ @misc{watanabe2018espnet,
819
+ title={ESPnet: End-to-End Speech Processing Toolkit},
820
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
821
+ year={2018},
822
+ eprint={1804.00015},
823
+ archivePrefix={arXiv},
824
+ primaryClass={cs.CL}
825
+ }
826
+ ```
dump/raw/org/train/lang2lid ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <unk> 0
2
+ cs_cz 1
3
+ de_de 2
4
+ el_gr 3
5
+ en_uk 4
6
+ en_us 5
7
+ es_419 6
8
+ et_ee 7
9
+ fi_fi 8
10
+ fr_fr 9
11
+ hr_hr 10
12
+ hu_hu 11
13
+ it_it 12
14
+ lt_lt 13
15
+ nl_nl 14
16
+ pl_pl 15
17
+ ro_ro 16
18
+ ru_ru 17
19
+ sk_sk 18
20
+ sl_si 19
21
+ uk_ua 20
dump/xvector/test/spk_xvector.ark ADDED
Binary file (5.54 kB). View file
 
dump/xvector/test/spk_xvector.scp ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ css10_de dump/xvector/test/spk_xvector.ark:9
2
+ css10_el dump/xvector/test/spk_xvector.ark:801
3
+ css10_fi dump/xvector/test/spk_xvector.ark:1593
4
+ css10_fr dump/xvector/test/spk_xvector.ark:2385
5
+ css10_hu dump/xvector/test/spk_xvector.ark:3177
6
+ css10_nl dump/xvector/test/spk_xvector.ark:3969
7
+ css10_ru dump/xvector/test/spk_xvector.ark:4761
dump/xvector/train/spk_xvector.ark ADDED
Binary file (5.54 kB). View file
 
dump/xvector/train/spk_xvector.scp ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ css10_de dump/xvector/train/spk_xvector.ark:9
2
+ css10_el dump/xvector/train/spk_xvector.ark:801
3
+ css10_fi dump/xvector/train/spk_xvector.ark:1593
4
+ css10_fr dump/xvector/train/spk_xvector.ark:2385
5
+ css10_hu dump/xvector/train/spk_xvector.ark:3177
6
+ css10_nl dump/xvector/train/spk_xvector.ark:3969
7
+ css10_ru dump/xvector/train/spk_xvector.ark:4761
exp/tts_stats_raw_phn_none/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed1b26fa9a899031a8727e1a4607db371c918145e6dc39d89cc0556ca2e65237
3
+ size 1402
exp/tts_train_raw_phn_none/config.yaml ADDED
@@ -0,0 +1,745 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_raw_phn_none
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 1
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 3
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 2.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 4
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param:
66
+ - ../tts_pretrain_phn_residual/exp/tts_train_phn_none/2epoch.pth:tts_pretrain.encoder:tts.encoder
67
+ - ../tts_pretrain_phn_residual/exp/tts_train_phn_none/2epoch.pth:tts_pretrain.lid_emb:tts.lid_emb
68
+ ignore_init_mismatch: false
69
+ freeze_param:
70
+ - tts.encoder.adapter
71
+ - tts.encoder.embed
72
+ - tts.lid_emb
73
+ num_iters_per_epoch: null
74
+ batch_size: 20
75
+ valid_batch_size: null
76
+ batch_bins: 400000
77
+ valid_batch_bins: null
78
+ train_shape_file:
79
+ - exp/tts_stats_raw_phn_none/train/text_shape.phn
80
+ - exp/tts_stats_raw_phn_none/train/speech_shape
81
+ valid_shape_file:
82
+ - exp/tts_stats_raw_phn_none/valid/text_shape.phn
83
+ - exp/tts_stats_raw_phn_none/valid/speech_shape
84
+ batch_type: numel
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 150
88
+ - 204800
89
+ sort_in_batch: descending
90
+ sort_batch: descending
91
+ multiple_iterator: false
92
+ chunk_length: 500
93
+ chunk_shift_ratio: 0.5
94
+ num_cache_chunks: 1024
95
+ train_data_path_and_name_and_type:
96
+ - - /local/11454483.1.gpu/dump/raw/train/text
97
+ - text
98
+ - text
99
+ - - /local/11454483.1.gpu/dump/raw/train/wav.scp
100
+ - speech
101
+ - sound
102
+ - - /local/11454483.1.gpu/dump/xvector/train/xvector.scp
103
+ - spembs
104
+ - kaldi_ark
105
+ - - /local/11454483.1.gpu/dump/raw/train/utt2lid
106
+ - lids
107
+ - text_int
108
+ valid_data_path_and_name_and_type:
109
+ - - /local/11454483.1.gpu/dump/raw/dev/text
110
+ - text
111
+ - text
112
+ - - /local/11454483.1.gpu/dump/raw/dev/wav.scp
113
+ - speech
114
+ - sound
115
+ - - /local/11454483.1.gpu/dump/xvector/dev/xvector.scp
116
+ - spembs
117
+ - kaldi_ark
118
+ - - /local/11454483.1.gpu/dump/raw/dev/utt2lid
119
+ - lids
120
+ - text_int
121
+ allow_variable_data_keys: false
122
+ max_cache_size: 0.0
123
+ max_cache_fd: 32
124
+ valid_max_cache_size: null
125
+ optim: adam
126
+ optim_conf:
127
+ lr: 1.0
128
+ scheduler: noamlr
129
+ scheduler_conf:
130
+ model_size: 512
131
+ warmup_steps: 50000
132
+ token_list:
133
+ - <blank>
134
+ - <unk>
135
+ - n
136
+ - t
137
+ - s
138
+ - l
139
+ - a
140
+ - e
141
+ - k
142
+ - d
143
+ - m
144
+ - ə
145
+ - r
146
+ - i
147
+ - p
148
+ - o
149
+ - v
150
+ - ɪ
151
+ - ˈa
152
+ - ɾ
153
+ - j
154
+ - z
155
+ - ˈɛ
156
+ - ˈe
157
+ - ɛ
158
+ - b
159
+ - ˈo
160
+ - f
161
+ - ˈi
162
+ - u
163
+ - ð
164
+ - ʁ
165
+ - h
166
+ - ɡ
167
+ - ɔ
168
+ - ʃ
169
+ - ˈu
170
+ - w
171
+ - ˌe
172
+ - ts
173
+ - ŋ
174
+ - ˌa
175
+ - æ
176
+ - iː
177
+ - ˈɪ
178
+ - ˈiː
179
+ - ˈaː
180
+ - ɹ
181
+ - ʊ
182
+ - ɑ
183
+ - ˈeː
184
+ - ˈɔ
185
+ - x
186
+ - aː
187
+ - tʃ
188
+ - ˌi
189
+ - ˌo
190
+ - tː
191
+ - oː
192
+ - ɣ
193
+ - ˈoː
194
+ - eː
195
+ - y
196
+ - θ
197
+ - ɲ
198
+ - ə-
199
+ - ʋ
200
+ - ʒ
201
+ - ˌɛ
202
+ - ˈɑ
203
+ - β
204
+ - uː
205
+ - ˈuː
206
+ - ˈaɪ
207
+ - ç
208
+ - ˈɑ̃
209
+ - ˈɔ̃
210
+ - ˈæ
211
+ - ɚ
212
+ - ˌɪ
213
+ - ɑ̃
214
+ - ˌu
215
+ - ˌɔ
216
+ - ˈy
217
+ - ɜ
218
+ - tʲ
219
+ - ˈeɪ
220
+ - ˈɑː
221
+ - ˌeː
222
+ - ʌ
223
+ - ᵻ
224
+ - ɐ
225
+ - ˌɑ
226
+ - ɨ
227
+ - ɔ̃
228
+ - dʒ
229
+ - e-
230
+ - ˌiː
231
+ - a-
232
+ - ˈʌ
233
+ - ˌʊ
234
+ - əl
235
+ - ʎ
236
+ - ˌaɪ
237
+ - aɪ
238
+ - ˈɔː
239
+ - ss
240
+ - ˈaʊ
241
+ - rʲ
242
+ - kː
243
+ - ˈoʊ
244
+ - ˌaː
245
+ - ɑː
246
+ - nʲ
247
+ - ˌoː
248
+ - ø
249
+ - ˈɛɪ
250
+ - ɛɪ
251
+ - ˌæ
252
+ - ʂ
253
+ - ɲʲ
254
+ - ˌɑː
255
+ - ɕ
256
+ - ˈai
257
+ - vʲ
258
+ - dʲ
259
+ - ai
260
+ - ei
261
+ - ɛ̃
262
+ - mʲ
263
+ - ˈø
264
+ - ɭ
265
+ - ˈɵ
266
+ - pː
267
+ - ˈɛ̃
268
+ - ɔː
269
+ - oʊ
270
+ - ˈɜː
271
+ - ˈʊ
272
+ - tɕ
273
+ - ɟ
274
+ - ˌaʊ
275
+ - ˈœ
276
+ - kʲ
277
+ - ˈuo
278
+ - ˈoi
279
+ - æː
280
+ - dʑ
281
+ - l̩
282
+ - ˈie
283
+ - ɪː
284
+ - ie
285
+ - oi
286
+ - ˌeɪ
287
+ - ˈɨ
288
+ - yː
289
+ - ˈɪː
290
+ - ˌy
291
+ - øː
292
+ - ˈʏ
293
+ - ˈɛː
294
+ - ˈoːɹ
295
+ - ˌuː
296
+ - ˌʌ
297
+ - ˈeu
298
+ - ˈei
299
+ - aʊ
300
+ - ˌoi
301
+ - bː
302
+ - ˌai
303
+ - ˈœy
304
+ - ˈøː
305
+ - ˈɑːɹ
306
+ - œ̃
307
+ - ˈæː
308
+ - au
309
+ - y-
310
+ - r̝̊
311
+ - ɵ
312
+ - ˌɵ
313
+ - c
314
+ - ˌɛɪ
315
+ - ˈɔø
316
+ - ˈyː
317
+ - ee
318
+ - pʲ
319
+ - ˈee
320
+ - bʲ
321
+ - ˈyø
322
+ - iə
323
+ - ˈiə
324
+ - ˌɨ
325
+ - ˌøː
326
+ - ɔːɹ
327
+ - ɔø
328
+ - eɪ
329
+ - ʑ
330
+ - ˈau
331
+ - ˈʊɹ
332
+ - r̝
333
+ - dʒː
334
+ - ˌeʊ
335
+ - ˈɔːɹ
336
+ - ˌoʊ
337
+ - ˌʊɹ
338
+ - ɑːɹ
339
+ - ˈæy
340
+ - ˌyː
341
+ - s^
342
+ - eu
343
+ - ˌə
344
+ - tʃː
345
+ - ˈə
346
+ - ˌei
347
+ - ea
348
+ - tsʲ
349
+ - ẽ
350
+ - ʌʊ
351
+ - œy
352
+ - ˈʌʊ
353
+ - nʲʲ
354
+ - ˌæi
355
+ - ˌʏ
356
+ - ˌɛː
357
+ - ˈɪɹ
358
+ - æi
359
+ - ˈɛɹ
360
+ - ˈæi
361
+ - ˈɔɪ
362
+ - ã
363
+ - dzː
364
+ - r̩
365
+ - ˈẽ
366
+ - ou
367
+ - œ
368
+ - ɜː
369
+ - uo
370
+ - tʲʲ
371
+ - ˌø
372
+ - ɛɹ
373
+ - ɭʲ
374
+ - iɪ
375
+ - (en)
376
+ - ʂʲ
377
+ - tsː
378
+ - ˌuo
379
+ - ˌʌʊ
380
+ - oːɹ
381
+ - ˈou
382
+ - ˌɛ̃
383
+ - ʝ
384
+ - eʊ
385
+ - ɨ̃
386
+ - ˈɔa
387
+ - ɟː
388
+ - ʊɐ
389
+ - ˈr̩
390
+ - tʃʲ
391
+ - uɪ
392
+ - ɡʲ
393
+ - ˈea
394
+ - ˌʊɐ
395
+ - ˈʊɐ
396
+ - ɛː
397
+ - ˌyi
398
+ - t^
399
+ - tɕʲ
400
+ - ˌea
401
+ - (fr)
402
+ - ɕʲ
403
+ - ʀ
404
+ - ˌɔø
405
+ - ʏ
406
+ - ˌœ
407
+ - ˈoɪ
408
+ - ˌau
409
+ - eɑ
410
+ - ˌɪː
411
+ - ˈeʊ
412
+ - ˈiɪ
413
+ - ˈã
414
+ - ˌɔː
415
+ - ˌã
416
+ - sʲ
417
+ - ˈaɪɚ
418
+ - ˌɑ̃
419
+ - ˌæː
420
+ - ey
421
+ - ˌœy
422
+ - ˈaɪə
423
+ - d̪
424
+ - ɾʲ
425
+ - ˌøi
426
+ - dː
427
+ - ˌie
428
+ - ui
429
+ - fʲ
430
+ - n̩
431
+ - ʔ
432
+ - ˌou
433
+ - yi
434
+ - ˌɑːɹ
435
+ - tsʲʲ
436
+ - ˌɐ
437
+ - ˈœ̃
438
+ - ˌyø
439
+ - dz
440
+ - ɡː
441
+ - ɾʲʲ
442
+ - ˈl̩
443
+ - ˈøy
444
+ - ˌæy
445
+ - cː
446
+ - æy
447
+ - ʊɹ
448
+ - ʑʲ
449
+ - ˌɜː
450
+ - yʊ
451
+ - ˌɛɹ
452
+ - pf
453
+ - dʑʲ
454
+ - ˌoːɹ
455
+ - ˈɨ̃
456
+ - ˈiʊ
457
+ - õ
458
+ - ɔa
459
+ - ˌɔa
460
+ - ˌee
461
+ - ˈĩ
462
+ - ˌiɪ
463
+ - ˌɔːɹ
464
+ - ˈɒ
465
+ - ja
466
+ - ĩ
467
+ - ˈũ
468
+ - ɒ
469
+ - ũ
470
+ - ʃʲ
471
+ - ɪɹ
472
+ - ju
473
+ - (de)
474
+ - yø
475
+ - ˌeu
476
+ - d^
477
+ - ˈiu
478
+ - ˈja
479
+ - øi
480
+ - ˈeɑ
481
+ - ˈyi
482
+ - ɾʲˌʲ
483
+ - ʃʲʲ
484
+ - ʃʲˌʲ
485
+ - aɪə
486
+ - ˈuɪ
487
+ - iu
488
+ - ˈõ
489
+ - iɐ
490
+ - ˌẽ
491
+ - iʊ
492
+ - ˌr̩
493
+ - ˈui
494
+ - əʊ
495
+ - u"
496
+ - ˌɔ̃
497
+ - ˈəʊ
498
+ - iy
499
+ - ʲ
500
+ - zʲˌʲ
501
+ - (it)
502
+ - ˌɒ
503
+ - ɔɪ
504
+ - ˌɪɹ
505
+ - ˈɵː
506
+ - ˈu"
507
+ - nʲˌʲ
508
+ - (nl)
509
+ - ˌl̩
510
+ - ˈey
511
+ - βː
512
+ - lʲʲ
513
+ - oɪ
514
+ - ˈiɐ
515
+ - ˌiɐ
516
+ - lʲ
517
+ - tsʲˌʲ
518
+ - xʲ
519
+ - ˌũ
520
+ - mʲʲ
521
+ - dʒʲ
522
+ - ˌeo
523
+ - ˈju
524
+ - r̩ː
525
+ - lʲˌʲ
526
+ - ˈøi
527
+ - t^ː
528
+ - əɪ
529
+ - l̩ː
530
+ - tʃˌʲ
531
+ - eo
532
+ - zʲʲ
533
+ - ˌiy
534
+ - aʲ
535
+ - ˌoɪ
536
+ - tl#
537
+ - ˈyɪ
538
+ - ˌiə
539
+ - ˌey
540
+ - øy
541
+ - dʲʲ
542
+ - ˈl̩ː
543
+ - ˈyʊ
544
+ - ˌɨ̃
545
+ - ʀʲ
546
+ - ɣː
547
+ - ˈeo
548
+ - ˈʊə
549
+ - ˌiu
550
+ - ˌøy
551
+ - ˈəɪ
552
+ - ˈeə
553
+ - aɪɚ
554
+ - ɪ^
555
+ - eə
556
+ - ˌĩ
557
+ - t̪
558
+ - vʲʲ
559
+ - (es)
560
+ - (gn)
561
+ - zʲ
562
+ - ˌõ
563
+ - əː
564
+ - bʲʲ
565
+ - (base)
566
+ - ˌəʊ
567
+ - ˈə-
568
+ - (ru)
569
+ - ˌɔɪ
570
+ - ˈæiː
571
+ - tsˌʲ
572
+ - ˈr̩ː
573
+ - ə--
574
+ - ˌn̩
575
+ - uʲ
576
+ - ˈw
577
+ - hʲ
578
+ - ˌeə
579
+ - yɪ
580
+ - fʲʲ
581
+ - ˌyʊ
582
+ - (el)
583
+ - ˌaɪɚ
584
+ - ˈəː
585
+ - ˌʊə
586
+ - ɵː
587
+ - t̪ː
588
+ - w-
589
+ - (sl)
590
+ - eʲ
591
+ - ˈa-
592
+ - ˌr̩ː
593
+ - mʲˌʲ
594
+ - (fi)
595
+ - ʒʲʲ
596
+ - çʲ
597
+ - ˌaɪə
598
+ - ˈɚ
599
+ - (lt)
600
+ - pʲʲ
601
+ - ˈɜ
602
+ - ˌuɪ
603
+ - ˌja
604
+ - (pl)
605
+ - ˈe-
606
+ - ˌe-
607
+ - (et)
608
+ - ˈoːʲ
609
+ - (kl)
610
+ - ˈõː
611
+ - (hu)
612
+ - ˈiy
613
+ - ʊə
614
+ - ˈaʲ
615
+ - ˌl̩ː
616
+ - lˌʲ
617
+ - '1'
618
+ - ʒʲ
619
+ - (cs)
620
+ - ˈææ
621
+ - ˈts-
622
+ - ts-
623
+ - ˌʊː
624
+ - ˌy"
625
+ - cʲ
626
+ - wʲ
627
+ - ˈãː
628
+ - ˈuʲ
629
+ - (ro)
630
+ - ˌɜ
631
+ - (sk)
632
+ - oːʲ
633
+ - ʊː
634
+ - ˈtl#tl#
635
+ - ʃˈʲ
636
+ - ɬ
637
+ - ˌə-
638
+ - (hr)
639
+ - tl#tl#
640
+ - ˌœ̃
641
+ - ˈʊː
642
+ - l̩ʲ
643
+ - dʒˌʲ
644
+ - tsˈʲ
645
+ - pʲˌʲ
646
+ - ˈʌː
647
+ - ˈeʲ
648
+ - aːʲ
649
+ - vʲˌʲ
650
+ - ˈj
651
+ - ()
652
+ - eːː
653
+ - ˌãː
654
+ - ˈuːʲ
655
+ - ˈeeʲ
656
+ - <sos/eos>
657
+ odim: null
658
+ model_conf: {}
659
+ use_preprocessor: true
660
+ token_type: phn
661
+ bpemodel: null
662
+ non_linguistic_symbols: null
663
+ cleaner: null
664
+ g2p: null
665
+ feats_extract: fbank
666
+ feats_extract_conf:
667
+ n_fft: 1024
668
+ hop_length: 256
669
+ win_length: null
670
+ fs: 16000
671
+ fmin: 80
672
+ fmax: 7600
673
+ n_mels: 80
674
+ normalize: global_mvn
675
+ normalize_conf:
676
+ stats_file: exp/tts_stats_raw_phn_none/train/feats_stats.npz
677
+ tts: transformer
678
+ tts_conf:
679
+ embed_dim: 0
680
+ eprenet_conv_layers: 0
681
+ eprenet_conv_filts: 0
682
+ eprenet_conv_chans: 0
683
+ dprenet_layers: 2
684
+ dprenet_units: 256
685
+ adim: 512
686
+ aheads: 8
687
+ elayers: 6
688
+ eunits: 1024
689
+ dlayers: 6
690
+ dunits: 1024
691
+ positionwise_layer_type: conv1d
692
+ positionwise_conv_kernel_size: 1
693
+ postnet_layers: 5
694
+ postnet_filts: 5
695
+ postnet_chans: 256
696
+ spk_embed_dim: 192
697
+ spk_embed_integration_type: add
698
+ use_gst: true
699
+ gst_heads: 4
700
+ gst_tokens: 16
701
+ use_masking: true
702
+ bce_pos_weight: 5.0
703
+ use_scaled_pos_enc: true
704
+ encoder_normalize_before: true
705
+ decoder_normalize_before: true
706
+ reduction_factor: 1
707
+ init_type: xavier_uniform
708
+ init_enc_alpha: 1.0
709
+ init_dec_alpha: 1.0
710
+ eprenet_dropout_rate: 0.0
711
+ dprenet_dropout_rate: 0.5
712
+ postnet_dropout_rate: 0.5
713
+ transformer_enc_dropout_rate: 0.1
714
+ transformer_enc_positional_dropout_rate: 0.1
715
+ transformer_enc_attn_dropout_rate: 0.1
716
+ transformer_dec_dropout_rate: 0.1
717
+ transformer_dec_positional_dropout_rate: 0.1
718
+ transformer_dec_attn_dropout_rate: 0.1
719
+ transformer_enc_dec_attn_dropout_rate: 0.1
720
+ use_guided_attn_loss: true
721
+ num_heads_applied_guided_attn: 2
722
+ num_layers_applied_guided_attn: 2
723
+ modules_applied_guided_attn:
724
+ - encoder-decoder
725
+ guided_attn_loss_sigma: 0.4
726
+ guided_attn_loss_lambda: 10.0
727
+ langs: 21
728
+ lang_family_encoding: false
729
+ num_lang_family: 7
730
+ use_adapter: true
731
+ adapter_type: residual
732
+ use_encoder_w_lid: true
733
+ pitch_extract: null
734
+ pitch_extract_conf: {}
735
+ pitch_normalize: null
736
+ pitch_normalize_conf: {}
737
+ energy_extract: null
738
+ energy_extract_conf: {}
739
+ energy_normalize: null
740
+ energy_normalize_conf: {}
741
+ required:
742
+ - output_dir
743
+ - token_list
744
+ version: '202209'
745
+ distributed: false
exp/tts_train_raw_phn_none/images/backward_time.png ADDED
exp/tts_train_raw_phn_none/images/bce_loss.png ADDED
exp/tts_train_raw_phn_none/images/decoder_alpha.png ADDED
exp/tts_train_raw_phn_none/images/enc_dec_attn_loss.png ADDED
exp/tts_train_raw_phn_none/images/encoder_alpha.png ADDED
exp/tts_train_raw_phn_none/images/forward_time.png ADDED
exp/tts_train_raw_phn_none/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_raw_phn_none/images/iter_time.png ADDED
exp/tts_train_raw_phn_none/images/l1_loss.png ADDED
exp/tts_train_raw_phn_none/images/l2_loss.png ADDED
exp/tts_train_raw_phn_none/images/lid_loss.png ADDED
exp/tts_train_raw_phn_none/images/lid_loss_mlm.png ADDED
exp/tts_train_raw_phn_none/images/loss.png ADDED
exp/tts_train_raw_phn_none/images/mlm_acc.png ADDED
exp/tts_train_raw_phn_none/images/mlm_loss.png ADDED
exp/tts_train_raw_phn_none/images/optim0_lr0.png ADDED
exp/tts_train_raw_phn_none/images/optim_step_time.png ADDED
exp/tts_train_raw_phn_none/images/train_time.png ADDED
exp/tts_train_raw_phn_none/latest.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b78279f273076f6c3b0ba49725dfb890a7b74bf886258ff2cc8cc68038abd9ce
3
+ size 138451573
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202209'
2
+ files:
3
+ model_file: exp/tts_train_raw_phn_none/latest.pth
4
+ python: "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
5
+ timestamp: 1691375968.669351
6
+ torch: 1.11.0+cu113
7
+ yaml_files:
8
+ train_config: exp/tts_train_raw_phn_none/config.yaml