KevinGeng committed on
Commit
671e149
•
1 Parent(s): 1b3fb80

add TTS_models and vocoder lfs support

Browse files
Files changed (36) hide show
  1. .gitattributes +2 -1
  2. {TTS_model → TTS_models/libritts_xvector_vits}/config.yaml +0 -0
  3. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_backward_time.png +0 -0
  4. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_fake_loss.png +0 -0
  5. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_forward_time.png +0 -0
  6. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_loss.png +0 -0
  7. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_optim_step_time.png +0 -0
  8. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_real_loss.png +0 -0
  9. {TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_train_time.png +0 -0
  10. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_adv_loss.png +0 -0
  11. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_backward_time.png +0 -0
  12. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_dur_loss.png +0 -0
  13. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_feat_match_loss.png +0 -0
  14. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_forward_time.png +0 -0
  15. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_kl_loss.png +0 -0
  16. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_loss.png +0 -0
  17. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_mel_loss.png +0 -0
  18. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_optim_step_time.png +0 -0
  19. {TTS_model → TTS_models/libritts_xvector_vits}/images/generator_train_time.png +0 -0
  20. {TTS_model → TTS_models/libritts_xvector_vits}/images/gpu_max_cached_mem_GB.png +0 -0
  21. {TTS_model → TTS_models/libritts_xvector_vits}/images/iter_time.png +0 -0
  22. {TTS_model → TTS_models/libritts_xvector_vits}/images/optim0_lr0.png +0 -0
  23. {TTS_model → TTS_models/libritts_xvector_vits}/images/optim1_lr0.png +0 -0
  24. {TTS_model → TTS_models/libritts_xvector_vits}/images/train_time.png +0 -0
  25. {TTS_model → TTS_models/libritts_xvector_vits}/train.total_count.ave_10best.pth +0 -0
  26. app.py +3 -3
  27. vocoders/libritts_hifigan.v1/checkpoint-2500000steps.pkl +3 -0
  28. vocoders/libritts_hifigan.v1/config.yml +191 -0
  29. vocoders/libritts_hifigan.v1/stats.h5 +3 -0
  30. vocoders/vctk_parallel_wavegan.v1.long/._checkpoint-1000000steps.pkl +3 -0
  31. vocoders/vctk_parallel_wavegan.v1.long/._config.yml +0 -0
  32. vocoders/vctk_parallel_wavegan.v1.long/._stats.h5 +3 -0
  33. vocoders/vctk_parallel_wavegan.v1.long/._train_nodev_all_vctk_parallel_wavegan.v1.long +0 -0
  34. vocoders/vctk_parallel_wavegan.v1.long/checkpoint-1000000steps.pkl +3 -0
  35. vocoders/vctk_parallel_wavegan.v1.long/config.yml +104 -0
  36. vocoders/vctk_parallel_wavegan.v1.long/stats.h5 +3 -0
.gitattributes CHANGED
@@ -33,4 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  xvector filter=lfs diff=lfs merge=lfs -text
36
- TTS_model filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
  xvector filter=lfs diff=lfs merge=lfs -text
36
+ TTS_models filter=lfs diff=lfs merge=lfs -text
37
+ vocoders filter=lfs diff=lfs merge=lfs -text
{TTS_model → TTS_models/libritts_xvector_vits}/config.yaml RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_backward_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_fake_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_forward_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_optim_step_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_real_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/discriminator_train_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_adv_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_backward_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_dur_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_feat_match_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_forward_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_kl_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_mel_loss.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_optim_step_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/generator_train_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/gpu_max_cached_mem_GB.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/iter_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/optim0_lr0.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/optim1_lr0.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/images/train_time.png RENAMED
File without changes
{TTS_model → TTS_models/libritts_xvector_vits}/train.total_count.ave_10best.pth RENAMED
File without changes
app.py CHANGED
@@ -64,8 +64,8 @@ from espnet2.utils.types import str_or_none
64
 
65
  # local import
66
  text2speech = Text2Speech.from_pretrained(
67
- train_config = "TTS_model/config.yaml",
68
- model_file="TTS_model/train.total_count.ave_10best.pth",
69
  vocoder_tag=str_or_none(vocoder_tag),
70
  device="cuda",
71
  use_att_constraint=False,
@@ -106,7 +106,7 @@ transformer_text2speech = Text2Speech.from_pretrained(
106
  forward_window=3,
107
  speed_control_alpha=1.0,
108
  )
109
-
110
  # from google.cloud import texttospeech
111
  # Google_TTS_client = texttospeech.TextToSpeechClient()
112
 
 
64
 
65
  # local import
66
  text2speech = Text2Speech.from_pretrained(
67
+ train_config = "TTS_models/libritts_xvector_vits/config.yaml",
68
+ model_file="TTS_models/libritts_xvector_vits/train.total_count.ave_10best.pth",
69
  vocoder_tag=str_or_none(vocoder_tag),
70
  device="cuda",
71
  use_att_constraint=False,
 
106
  forward_window=3,
107
  speed_control_alpha=1.0,
108
  )
109
+ pdb.set_trace()
110
  # from google.cloud import texttospeech
111
  # Google_TTS_client = texttospeech.TextToSpeechClient()
112
 
vocoders/libritts_hifigan.v1/checkpoint-2500000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:859f68c82afdf0f22746db6a6b7fb3cee3010a94db09f831c9e5f1a41eb9b0b0
3
+ size 1004606893
vocoders/libritts_hifigan.v1/config.yml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: false
2
+ batch_max_steps: 8400
3
+ batch_size: 16
4
+ config: conf/hifigan.v1.yaml
5
+ dev_dumpdir: dump/dev_clean/norm
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_adv_loss_params:
10
+ average_by_discriminators: false
11
+ discriminator_grad_norm: -1
12
+ discriminator_optimizer_params:
13
+ betas:
14
+ - 0.5
15
+ - 0.9
16
+ lr: 0.0002
17
+ weight_decay: 0.0
18
+ discriminator_optimizer_type: Adam
19
+ discriminator_params:
20
+ follow_official_norm: true
21
+ period_discriminator_params:
22
+ bias: true
23
+ channels: 32
24
+ downsample_scales:
25
+ - 3
26
+ - 3
27
+ - 3
28
+ - 3
29
+ - 1
30
+ in_channels: 1
31
+ kernel_sizes:
32
+ - 5
33
+ - 3
34
+ max_downsample_channels: 1024
35
+ nonlinear_activation: LeakyReLU
36
+ nonlinear_activation_params:
37
+ negative_slope: 0.1
38
+ out_channels: 1
39
+ use_spectral_norm: false
40
+ use_weight_norm: true
41
+ periods:
42
+ - 2
43
+ - 3
44
+ - 5
45
+ - 7
46
+ - 11
47
+ scale_discriminator_params:
48
+ bias: true
49
+ channels: 128
50
+ downsample_scales:
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 1
56
+ in_channels: 1
57
+ kernel_sizes:
58
+ - 15
59
+ - 41
60
+ - 5
61
+ - 3
62
+ max_downsample_channels: 1024
63
+ max_groups: 16
64
+ nonlinear_activation: LeakyReLU
65
+ nonlinear_activation_params:
66
+ negative_slope: 0.1
67
+ out_channels: 1
68
+ scale_downsample_pooling: AvgPool1d
69
+ scale_downsample_pooling_params:
70
+ kernel_size: 4
71
+ padding: 2
72
+ stride: 2
73
+ scales: 3
74
+ discriminator_scheduler_params:
75
+ gamma: 0.5
76
+ milestones:
77
+ - 200000
78
+ - 400000
79
+ - 600000
80
+ - 800000
81
+ discriminator_scheduler_type: MultiStepLR
82
+ discriminator_train_start_steps: 0
83
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
84
+ distributed: false
85
+ eval_interval_steps: 1000
86
+ feat_match_loss_params:
87
+ average_by_discriminators: false
88
+ average_by_layers: false
89
+ include_final_outputs: false
90
+ fft_size: 2048
91
+ fmax: 7600
92
+ fmin: 80
93
+ format: hdf5
94
+ generator_adv_loss_params:
95
+ average_by_discriminators: false
96
+ generator_grad_norm: -1
97
+ generator_optimizer_params:
98
+ betas:
99
+ - 0.5
100
+ - 0.9
101
+ lr: 0.0002
102
+ weight_decay: 0.0
103
+ generator_optimizer_type: Adam
104
+ generator_params:
105
+ bias: true
106
+ channels: 512
107
+ in_channels: 80
108
+ kernel_size: 7
109
+ nonlinear_activation: LeakyReLU
110
+ nonlinear_activation_params:
111
+ negative_slope: 0.1
112
+ out_channels: 1
113
+ resblock_dilations:
114
+ - - 1
115
+ - 3
116
+ - 5
117
+ - - 1
118
+ - 3
119
+ - 5
120
+ - - 1
121
+ - 3
122
+ - 5
123
+ resblock_kernel_sizes:
124
+ - 3
125
+ - 7
126
+ - 11
127
+ upsample_kernal_sizes:
128
+ - 10
129
+ - 10
130
+ - 8
131
+ - 6
132
+ upsample_scales:
133
+ - 5
134
+ - 5
135
+ - 4
136
+ - 3
137
+ use_additional_convs: true
138
+ use_weight_norm: true
139
+ generator_scheduler_params:
140
+ gamma: 0.5
141
+ milestones:
142
+ - 200000
143
+ - 400000
144
+ - 600000
145
+ - 800000
146
+ generator_scheduler_type: MultiStepLR
147
+ generator_train_start_steps: 1
148
+ generator_type: HiFiGANGenerator
149
+ global_gain_scale: 1.0
150
+ hop_size: 300
151
+ lambda_adv: 1.0
152
+ lambda_aux: 45.0
153
+ lambda_feat_match: 2.0
154
+ log_interval_steps: 100
155
+ mel_loss_params:
156
+ fft_size: 2048
157
+ fmax: 12000
158
+ fmin: 0
159
+ fs: 24000
160
+ hop_size: 300
161
+ log_base: null
162
+ num_mels: 80
163
+ win_length: 1200
164
+ window: hann
165
+ num_mels: 80
166
+ num_save_intermediate_results: 4
167
+ num_workers: 2
168
+ outdir: exp/train_nodev_clean_libritts_hifigan.v1
169
+ pin_memory: true
170
+ pretrain: ''
171
+ rank: 0
172
+ remove_short_samples: false
173
+ resume: exp/train_nodev_clean_libritts_hifigan.v1/checkpoint-1890000steps.pkl
174
+ sampling_rate: 24000
175
+ save_interval_steps: 10000
176
+ train_dumpdir: dump/train_nodev_clean/norm
177
+ train_feats_scp: null
178
+ train_max_steps: 2500000
179
+ train_segments: null
180
+ train_wav_scp: null
181
+ trim_frame_size: 1024
182
+ trim_hop_size: 256
183
+ trim_silence: false
184
+ trim_threshold_in_db: 20
185
+ use_feat_match_loss: true
186
+ use_mel_loss: true
187
+ use_stft_loss: false
188
+ verbose: 1
189
+ version: 0.5.1
190
+ win_length: 1200
191
+ window: hann
vocoders/libritts_hifigan.v1/stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94d55b1ea02f33cb24aa5d1fbc4087519d36ca89ca032e3dc0ca6627df4ed190
3
+ size 4736
vocoders/vctk_parallel_wavegan.v1.long/._checkpoint-1000000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb8cfc739515054284e6ac7c75afdab0c771eba7d132c4e19efff528147a1a1
3
+ size 223
vocoders/vctk_parallel_wavegan.v1.long/._config.yml ADDED
Binary file (223 Bytes). View file
 
vocoders/vctk_parallel_wavegan.v1.long/._stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f688626155bff6853a6045408be9bb248828abc482e2218ee0d93183cede5062
3
+ size 223
vocoders/vctk_parallel_wavegan.v1.long/._train_nodev_all_vctk_parallel_wavegan.v1.long ADDED
Binary file (187 Bytes). View file
 
vocoders/vctk_parallel_wavegan.v1.long/checkpoint-1000000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3e92516432be8da4dbcc9e284b0e8f5c0273fd8d7be939ca4c0db3f7c8c73e7
3
+ size 17498016
vocoders/vctk_parallel_wavegan.v1.long/config.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 24000
3
+ batch_size: 6
4
+ config: conf/parallel_wavegan.v1.long.yaml
5
+ dev_dumpdir: dump/dev_all/norm
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_grad_norm: 1
10
+ discriminator_optimizer_params:
11
+ eps: 1.0e-06
12
+ lr: 5.0e-05
13
+ weight_decay: 0.0
14
+ discriminator_params:
15
+ bias: true
16
+ conv_channels: 64
17
+ in_channels: 1
18
+ kernel_size: 3
19
+ layers: 10
20
+ nonlinear_activation: LeakyReLU
21
+ nonlinear_activation_params:
22
+ negative_slope: 0.2
23
+ out_channels: 1
24
+ use_weight_norm: true
25
+ discriminator_scheduler_params:
26
+ gamma: 0.5
27
+ step_size: 200000
28
+ discriminator_train_start_steps: 100000
29
+ distributed: false
30
+ eval_interval_steps: 1000
31
+ fft_size: 2048
32
+ fmax: 7600
33
+ fmin: 80
34
+ format: hdf5
35
+ generator_grad_norm: 10
36
+ generator_optimizer_params:
37
+ eps: 1.0e-06
38
+ lr: 0.0001
39
+ weight_decay: 0.0
40
+ generator_params:
41
+ aux_channels: 80
42
+ aux_context_window: 2
43
+ dropout: 0.0
44
+ gate_channels: 128
45
+ in_channels: 1
46
+ kernel_size: 3
47
+ layers: 30
48
+ out_channels: 1
49
+ residual_channels: 64
50
+ skip_channels: 64
51
+ stacks: 3
52
+ upsample_net: ConvInUpsampleNetwork
53
+ upsample_params:
54
+ upsample_scales:
55
+ - 4
56
+ - 5
57
+ - 3
58
+ - 5
59
+ use_weight_norm: true
60
+ generator_scheduler_params:
61
+ gamma: 0.5
62
+ step_size: 200000
63
+ global_gain_scale: 1.0
64
+ hop_size: 300
65
+ lambda_adv: 4.0
66
+ log_interval_steps: 100
67
+ num_mels: 80
68
+ num_save_intermediate_results: 4
69
+ num_workers: 2
70
+ outdir: exp/train_nodev_all_vctk_parallel_wavegan.v1.long
71
+ pin_memory: true
72
+ pretrain: ''
73
+ rank: 0
74
+ remove_short_samples: true
75
+ resume: exp/train_nodev_all_vctk_parallel_wavegan.v1.long/checkpoint-970000steps.pkl
76
+ sampling_rate: 24000
77
+ save_interval_steps: 5000
78
+ stft_loss_params:
79
+ fft_sizes:
80
+ - 1024
81
+ - 2048
82
+ - 512
83
+ hop_sizes:
84
+ - 120
85
+ - 240
86
+ - 50
87
+ win_lengths:
88
+ - 600
89
+ - 1200
90
+ - 240
91
+ window: hann_window
92
+ train_dumpdir: dump/train_nodev_all/norm
93
+ train_feats_scp: null
94
+ train_max_steps: 1000000
95
+ train_segments: null
96
+ train_wav_scp: null
97
+ trim_frame_size: 1024
98
+ trim_hop_size: 256
99
+ trim_silence: false
100
+ trim_threshold_in_db: 20
101
+ verbose: 1
102
+ version: 0.3.4
103
+ win_length: 1200
104
+ window: hann
vocoders/vctk_parallel_wavegan.v1.long/stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3144070fcf475d49873d9ee9c5433ae222587e569d38dd6fee8c6fff91d69c94
3
+ size 4736