te-ch committed
Commit d9c1056
Parents: 23ce701 0ff4340

removed junk 2

app.py CHANGED
@@ -39,7 +39,6 @@ iface = gr.Interface(
  value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
  ),
  gr.Dropdown(label="dialect", choices="")
-
  ],
  outputs=[
  gr.Markdown(label="Fonemes")
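
For context, a minimal sketch of the Gradio interface this hunk touches, assuming a hypothetical get_phonemes callback (only the Textbox default, the dialect Dropdown and the "Fonemes" Markdown output are taken from the diff; the rest is illustrative):

import gradio as gr

def get_phonemes(text, dialect):
    # Hypothetical stand-in for the app's real callback, which phonemizes `text`
    # for the selected dialect.
    return text

iface = gr.Interface(
    fn=get_phonemes,
    inputs=[
        gr.Textbox(value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila."),
        gr.Dropdown(label="dialect", choices=""),
    ],
    outputs=[gr.Markdown(label="Fonemes")],
)
iface.launch()
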
models/bsc/best_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
- size 1038659133
models/bsc/config.json DELETED
@@ -1,262 +0,0 @@
- {
- "output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
- "logger_uri": null,
- "run_name": "multispeaker_vits_ca_1e4_1e4_32",
- "project_name": null,
- "run_description": "\ud83d\udc38Coqui trainer run.",
- "print_step": 25,
- "plot_step": 100,
- "model_param_stats": false,
- "wandb_entity": null,
- "dashboard_logger": "tensorboard",
- "log_model_step": 1000,
- "save_step": 1000,
- "save_n_checkpoints": 5,
- "save_checkpoints": true,
- "save_all_best": true,
- "save_best_after": 10000,
- "target_loss": null,
- "print_eval": true,
- "test_delay_epochs": -1,
- "run_eval": true,
- "run_eval_steps": null,
- "distributed_backend": "nccl",
- "distributed_url": "tcp://localhost:54321",
- "mixed_precision": false,
- "epochs": 1000,
- "batch_size": 16,
- "eval_batch_size": 8,
- "grad_clip": [
- 1000.0,
- 1000.0
- ],
- "scheduler_after_epoch": true,
- "lr": 0.001,
- "optimizer": "AdamW",
- "optimizer_params": {
- "betas": [
- 0.8,
- 0.99
- ],
- "eps": 1e-09,
- "weight_decay": 0.01
- },
- "lr_scheduler": "",
- "lr_scheduler_params": null,
- "use_grad_scaler": false,
- "cudnn_enable": true,
- "cudnn_deterministic": false,
- "cudnn_benchmark": false,
- "training_seed": 54321,
- "model": "vits",
- "num_loader_workers": 4,
- "num_eval_loader_workers": 4,
- "use_noise_augment": false,
- "audio": {
- "fft_size": 1024,
- "sample_rate": 22050,
- "win_length": 1024,
- "hop_length": 256,
- "num_mels": 80,
- "mel_fmin": 0,
- "mel_fmax": null
- },
- "use_phonemes": true,
- "phonemizer": "espeak",
- "phoneme_language": "ca",
- "compute_input_seq_cache": true,
- "text_cleaner": "multilingual_cleaners",
- "enable_eos_bos_chars": false,
- "test_sentences_file": "",
- "phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
- "characters": {
- "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
- "vocab_dict": null,
- "pad": "<PAD>",
- "eos": "<EOS>",
- "bos": "<BOS>",
- "blank": "<BLNK>",
- "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
- "punctuations": "!'(),-.:;? ",
- "phonemes": null,
- "is_unique": false,
- "is_sorted": true
- },
- "add_blank": true,
- "batch_group_size": 5,
- "loss_masking": null,
- "min_audio_len": 1,
- "max_audio_len": Infinity,
- "min_text_len": 1,
- "max_text_len": 325,
- "compute_f0": false,
- "compute_linear_spec": true,
- "precompute_num_workers": 0,
- "start_by_longest": false,
- "datasets": [
- {
- "formatter": "vctk_old",
- "dataset_name": "vctk_old",
- "path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
- "meta_file_train": "",
- "ignored_speakers": [
- "uri",
- "09796",
- "05450"
- ],
- "language": "ca",
- "meta_file_val": "",
- "meta_file_attn_mask": ""
- }
- ],
- "test_sentences": [
- [
- "Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
- ],
- [
- "Preguntin-se si aix\u00f2 era necessari."
- ],
- [
- "La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
- ],
- [
- "\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
- ]
- ],
- "eval_split_max_size": null,
- "eval_split_size": 0.01,
- "use_speaker_weighted_sampler": false,
- "speaker_weighted_sampler_alpha": 1.0,
- "use_language_weighted_sampler": false,
- "language_weighted_sampler_alpha": 1.0,
- "use_length_weighted_sampler": false,
- "length_weighted_sampler_alpha": 1.0,
- "model_args": {
- "num_chars": 131,
- "out_channels": 513,
- "spec_segment_size": 32,
- "hidden_channels": 192,
- "hidden_channels_ffn_text_encoder": 768,
- "num_heads_text_encoder": 2,
- "num_layers_text_encoder": 6,
- "kernel_size_text_encoder": 3,
- "dropout_p_text_encoder": 0.1,
- "dropout_p_duration_predictor": 0.5,
- "kernel_size_posterior_encoder": 5,
- "dilation_rate_posterior_encoder": 1,
- "num_layers_posterior_encoder": 16,
- "kernel_size_flow": 5,
- "dilation_rate_flow": 1,
- "num_layers_flow": 4,
- "resblock_type_decoder": "1",
- "resblock_kernel_sizes_decoder": [
- 3,
- 7,
- 11
- ],
- "resblock_dilation_sizes_decoder": [
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ]
- ],
- "upsample_rates_decoder": [
- 8,
- 8,
- 2,
- 2
- ],
- "upsample_initial_channel_decoder": 512,
- "upsample_kernel_sizes_decoder": [
- 16,
- 16,
- 4,
- 4
- ],
- "periods_multi_period_discriminator": [
- 2,
- 3,
- 5,
- 7,
- 11
- ],
- "use_sdp": true,
- "noise_scale": 1.0,
- "inference_noise_scale": 0.667,
- "length_scale": 1.0,
- "noise_scale_dp": 1.0,
- "inference_noise_scale_dp": 1.0,
- "max_inference_len": null,
- "init_discriminator": true,
- "use_spectral_norm_disriminator": false,
- "use_speaker_embedding": true,
- "num_speakers": 257,
- "speakers_file": "/home/user/app/models/bsc/speakers.pth",
- "d_vector_file": null,
- "speaker_embedding_channels": 256,
- "use_d_vector_file": false,
- "d_vector_dim": 0,
- "detach_dp_input": true,
- "use_language_embedding": false,
- "embedded_language_dim": 4,
- "num_languages": 0,
- "language_ids_file": null,
- "use_speaker_encoder_as_loss": false,
- "speaker_encoder_config_path": "",
- "speaker_encoder_model_path": "",
- "condition_dp_on_speaker": true,
- "freeze_encoder": false,
- "freeze_DP": false,
- "freeze_PE": false,
- "freeze_flow_decoder": false,
- "freeze_waveform_decoder": false,
- "encoder_sample_rate": null,
- "interpolate_z": true,
- "reinit_DP": false,
- "reinit_text_encoder": false
- },
- "lr_gen": 0.0001,
- "lr_disc": 0.0001,
- "lr_scheduler_gen": "ExponentialLR",
- "lr_scheduler_gen_params": {
- "gamma": 0.999875,
- "last_epoch": -1
- },
- "lr_scheduler_disc": "ExponentialLR",
- "lr_scheduler_disc_params": {
- "gamma": 0.999875,
- "last_epoch": -1
- },
- "kl_loss_alpha": 1.0,
- "disc_loss_alpha": 1.0,
- "gen_loss_alpha": 1.0,
- "feat_loss_alpha": 1.0,
- "mel_loss_alpha": 45.0,
- "dur_loss_alpha": 1.0,
- "speaker_encoder_loss_alpha": 1.0,
- "return_wav": true,
- "use_weighted_sampler": false,
- "weighted_sampler_attrs": null,
- "weighted_sampler_multipliers": null,
- "r": 1,
- "num_speakers": 257,
- "use_speaker_embedding": true,
- "speakers_file": "/home/user/app/models/bsc/speakers.pth",
- "speaker_embedding_channels": 256,
- "language_ids_file": null,
- "use_language_embedding": false,
- "use_d_vector_file": false,
- "d_vector_file": null,
- "d_vector_dim": 0
- }
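
A rough sketch of how a checkpoint/config/speakers trio like the deleted models/bsc files can be loaded with Coqui TTS's Synthesizer (an assumption about usage, not the Space's actual loading code; the test sentence is one of the config's own test_sentences and "05739" is an id taken from speaker_map.json below):

from TTS.utils.synthesizer import Synthesizer

synth = Synthesizer(
    tts_checkpoint="models/bsc/best_model.pth",
    tts_config_path="models/bsc/config.json",
    tts_speakers_file="models/bsc/speakers.pth",
    use_cuda=False,
)
wav = synth.tts("Preguntin-se si això era necessari.", speaker_name="05739")
synth.save_wav(wav, "bsc_vits.wav")
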
models/bsc/speaker_map.json DELETED
@@ -1,10 +0,0 @@
- {
- "f_cen_05": "05739",
- "f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
- "f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
- "f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
- "f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
- "m_cen_08": "08935",
- "m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
- "m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
- }
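
The map presumably ties human-readable voice names to the speaker ids stored in speakers.pth; a minimal lookup sketch (assumed usage, the app's own code may differ):

import json

with open("models/bsc/speaker_map.json", encoding="utf-8") as f:
    speaker_map = json.load(f)

speaker_id = speaker_map["f_cen_05"]  # -> "05739", usable as a speaker_name
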
models/bsc/speakers.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
- size 30191
models/collectivat/catotron-ona-TTS-API-entry.json DELETED
@@ -1,10 +0,0 @@
- {
- "voice": "ona-fast-hifigan",
- "lang": "ca",
- "model_type": "coqui",
- "tts_config_path": "fast-speech_config.json",
- "tts_model_path": "fast-speech_best_model.pth",
- "vocoder_config_path": "ljspeech--hifigan_v2_config.json",
- "vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
- "load": true
- }
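
A sketch of the wiring this entry describes for a "coqui" model type: the fast-speech acoustic model plus the LJSpeech HiFiGAN v2 vocoder, loaded here through Coqui's Synthesizer as an assumption rather than the original API server code (the test sentence comes from fast-speech_config.json below):

from TTS.utils.synthesizer import Synthesizer

synth = Synthesizer(
    tts_checkpoint="models/collectivat/fast-speech_best_model.pth",
    tts_config_path="models/collectivat/fast-speech_config.json",
    vocoder_checkpoint="models/collectivat/ljspeech--hifigan_v2_model_file.pth",
    vocoder_config="models/collectivat/ljspeech--hifigan_v2_config.json",
    use_cuda=False,
)
wav = synth.tts("Hola Barcelona!")
synth.save_wav(wav, "catotron_ona.wav")
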
models/collectivat/fast-speech_best_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
- size 457921637
models/collectivat/fast-speech_config.json DELETED
@@ -1,213 +0,0 @@
- {
- "output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
- "logger_uri": null,
- "run_name": "fast_pitch_ljspeech",
- "project_name": null,
- "run_description": "\ud83d\udc38Coqui trainer run.",
- "print_step": 50,
- "plot_step": 100,
- "model_param_stats": false,
- "wandb_entity": null,
- "dashboard_logger": "tensorboard",
- "log_model_step": null,
- "save_step": 10000,
- "save_n_checkpoints": 5,
- "save_checkpoints": true,
- "save_all_best": false,
- "save_best_after": 1000,
- "target_loss": null,
- "print_eval": false,
- "test_delay_epochs": -1,
- "run_eval": true,
- "run_eval_steps": null,
- "distributed_backend": "nccl",
- "distributed_url": "tcp://localhost:54321",
- "mixed_precision": false,
- "epochs": 1000,
- "batch_size": 16,
- "eval_batch_size": 16,
- "grad_clip": 5.0,
- "scheduler_after_epoch": true,
- "lr": 0.0001,
- "optimizer": "Adam",
- "optimizer_params": {
- "betas": [
- 0.9,
- 0.998
- ],
- "weight_decay": 1e-06
- },
- "lr_scheduler": "NoamLR",
- "lr_scheduler_params": {
- "warmup_steps": 4000
- },
- "use_grad_scaler": false,
- "cudnn_enable": true,
- "cudnn_deterministic": false,
- "cudnn_benchmark": false,
- "training_seed": 54321,
- "model": "fast_pitch",
- "num_loader_workers": 8,
- "num_eval_loader_workers": 4,
- "use_noise_augment": false,
- "audio": {
- "fft_size": 1024,
- "win_length": 1024,
- "hop_length": 256,
- "frame_shift_ms": null,
- "frame_length_ms": null,
- "stft_pad_mode": "reflect",
- "sample_rate": 22050,
- "resample": false,
- "preemphasis": 0.0,
- "ref_level_db": 20,
- "do_sound_norm": false,
- "log_func": "np.log",
- "do_trim_silence": true,
- "trim_db": 60.0,
- "do_rms_norm": false,
- "db_level": null,
- "power": 1.5,
- "griffin_lim_iters": 60,
- "num_mels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": 8000,
- "spec_gain": 1.0,
- "do_amp_to_db_linear": true,
- "do_amp_to_db_mel": true,
- "pitch_fmax": 640.0,
- "pitch_fmin": 0.0,
- "signal_norm": false,
- "min_level_db": -100,
- "symmetric_norm": true,
- "max_norm": 4.0,
- "clip_norm": true,
- "stats_path": null
- },
- "use_phonemes": false,
- "phonemizer": null,
- "phoneme_language": "ca-es",
- "compute_input_seq_cache": true,
- "text_cleaner": "multilingual_cleaners",
- "enable_eos_bos_chars": false,
- "test_sentences_file": "",
- "phoneme_cache_path": null,
- "characters": {
- "characters_class": "TTS.tts.utils.text.characters.Graphemes",
- "vocab_dict": null,
- "pad": "_",
- "eos": "*",
- "bos": "^",
- "blank": null,
- "characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
- "punctuations": "!'(),-.:;?\u00b7 ",
- "phonemes": "",
- "is_unique": true,
- "is_sorted": true
- },
- "add_blank": false,
- "batch_group_size": 0,
- "loss_masking": null,
- "min_audio_len": 1,
- "max_audio_len": Infinity,
- "min_text_len": 1,
- "max_text_len": Infinity,
- "compute_f0": true,
- "compute_linear_spec": false,
- "precompute_num_workers": 4,
- "start_by_longest": false,
- "datasets": [
- {
- "name": "custom_turkish",
- "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
- "meta_file_train": "upc_ona_train.txt",
- "ignored_speakers": null,
- "language": "",
- "meta_file_val": "",
- "meta_file_attn_mask": ""
- },
- {
- "name": "custom_turkish",
- "path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
- "meta_file_train": "upc_ona_val.txt",
- "ignored_speakers": null,
- "language": "",
- "meta_file_val": "",
- "meta_file_attn_mask": ""
- }
- ],
- "test_sentences": [
- "Hola Barcelona!",
- "Escriviu al text."
- ],
- "eval_split_max_size": null,
- "eval_split_size": 0.01,
- "use_speaker_weighted_sampler": false,
- "speaker_weighted_sampler_alpha": 1.0,
- "use_language_weighted_sampler": false,
- "language_weighted_sampler_alpha": 1.0,
- "use_length_weighted_sampler": false,
- "length_weighted_sampler_alpha": 1.0,
- "base_model": "forward_tts",
- "model_args": {
- "num_chars": 89,
- "out_channels": 80,
- "hidden_channels": 384,
- "use_aligner": true,
- "use_pitch": true,
- "pitch_predictor_hidden_channels": 256,
- "pitch_predictor_kernel_size": 3,
- "pitch_predictor_dropout_p": 0.1,
- "pitch_embedding_kernel_size": 3,
- "duration_predictor_hidden_channels": 256,
- "duration_predictor_kernel_size": 3,
- "duration_predictor_dropout_p": 0.1,
- "positional_encoding": true,
- "poisitonal_encoding_use_scale": true,
- "length_scale": 1,
- "encoder_type": "fftransformer",
- "encoder_params": {
- "hidden_channels_ffn": 1024,
- "num_heads": 1,
- "num_layers": 6,
- "dropout_p": 0.1
- },
- "decoder_type": "fftransformer",
- "decoder_params": {
- "hidden_channels_ffn": 1024,
- "num_heads": 1,
- "num_layers": 6,
- "dropout_p": 0.1
- },
- "detach_duration_predictor": false,
- "max_duration": 75,
- "num_speakers": 1,
- "use_speaker_embedding": false,
- "speakers_file": null,
- "use_d_vector_file": false,
- "d_vector_dim": null,
- "d_vector_file": null
- },
- "num_speakers": 0,
- "speakers_file": null,
- "use_speaker_embedding": false,
- "use_d_vector_file": false,
- "d_vector_file": false,
- "d_vector_dim": 0,
- "spec_loss_type": "mse",
- "duration_loss_type": "mse",
- "use_ssim_loss": true,
- "ssim_loss_alpha": 1.0,
- "spec_loss_alpha": 1.0,
- "aligner_loss_alpha": 1.0,
- "pitch_loss_alpha": 0.1,
- "dur_loss_alpha": 0.1,
- "binary_align_loss_alpha": 0.1,
- "binary_loss_warmup_epochs": 150,
- "min_seq_len": 13,
- "max_seq_len": 500000,
- "r": 1,
- "f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
- "restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
- "github_branch": "* dev"
- }
models/collectivat/ljspeech--hifigan_v2_config.json DELETED
@@ -1,158 +0,0 @@
- {
- "run_name": "hifigan",
- "run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
-
-
- // AUDIO PARAMETERS
- "audio":{
- "fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
- "win_length": 1024, // stft window length in ms.
- "hop_length": 256, // stft window hop-lengh in ms.
- "frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
- "frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
-
- // Audio processing parameters
- "sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
- "preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
- "ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
- "log_func": "np.log",
-
- // Silence trimming
- "do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
- "trim_db": 60, // threshold for timming silence. Set this according to your dataset.
-
- // MelSpectrogram parameters
- "num_mels": 80, // size of the mel spec frame.
- "mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
- "mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
- "spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
-
- // Normalization parameters
- "signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
- "min_level_db": -100, // lower bound for normalization
- "symmetric_norm": true, // move normalization to range [-1, 1]
- "max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
- "clip_norm": true, // clip normalized values into the range.
- "stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
- },
-
- // DISTRIBUTED TRAINING
- "distributed":{
- "backend": "nccl",
- "url": "tcp:\/\/localhost:54324"
- },
-
- // MODEL PARAMETERS
- "use_pqmf": false,
-
- // LOSS PARAMETERS
- "use_stft_loss": false,
- "use_subband_stft_loss": false,
- "use_mse_gan_loss": true,
- "use_hinge_gan_loss": false,
- "use_feat_match_loss": true, // use only with melgan discriminators
- "use_l1_spec_loss": true,
-
- // loss weights
- "stft_loss_weight": 0,
- "subband_stft_loss_weight": 0,
- "mse_G_loss_weight": 1,
- "hinge_G_loss_weight": 0,
- "feat_match_loss_weight": 10,
- "l1_spec_loss_weight": 45,
-
- // multiscale stft loss parameters
- // "stft_loss_params": {
- // "n_ffts": [1024, 2048, 512],
- // "hop_lengths": [120, 240, 50],
- // "win_lengths": [600, 1200, 240]
- // },
-
- "l1_spec_loss_params": {
- "use_mel": true,
- "sample_rate": 16000,
- "n_fft": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null
- },
-
- "target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
-
- // DISCRIMINATOR
- "discriminator_model": "hifigan_discriminator",
- //"discriminator_model_params":{
- // "peroids": [2, 3, 5, 7, 11],
- // "base_channels": 16,
- // "max_channels":512,
- // "downsample_factors":[4, 4, 4]
- //},
- "steps_to_start_discriminator": 0, // steps required to start GAN trainining.1
-
- // GENERATOR
- "generator_model": "hifigan_generator",
- "generator_model_params": {
- "resblock_type": "1",
- "upsample_factors": [8,8,2,2],
- "upsample_kernel_sizes": [16,16,4,4],
- "upsample_initial_channel": 128,
- "resblock_kernel_sizes": [3,7,11],
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
- },
-
- // DATASET
- "data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
- "feature_path": null,
- // "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
- "seq_len": 8192,
- "pad_short": 2000,
- "conv_pad": 0,
- "use_noise_augment": false,
- "use_cache": true,
- "reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
-
- // TRAINING
- "batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
-
- // VALIDATION
- "run_eval": true,
- "test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
- "test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
-
- // OPTIMIZER
- "epochs": 10000, // total number of epochs to train.
- "wd": 0.0, // Weight decay weight.
- "gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
- "disc_clip_grad": -1, // Discriminator gradient clipping threshold.
- // "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
- // "lr_scheduler_gen_params": {
- // "gamma": 0.999,
- // "last_epoch": -1
- // },
- // "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
- // "lr_scheduler_disc_params": {
- // "gamma": 0.999,
- // "last_epoch": -1
- // },
- "lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
- "lr_disc": 0.00001,
-
- // TENSORBOARD and LOGGING
- "print_step": 25, // Number of steps to log traning on console.
- "print_eval": false, // If True, it prints loss values for each step in eval run.
- "save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
- "checkpoint": true, // If true, it saves checkpoints per "save_step"
- "tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
-
- // DATA LOADING
- "num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
- "num_val_loader_workers": 4, // number of evaluation data loader processes.
- "eval_split_size": 10,
-
- // PATHS
- "output_path": "/home/erogol/gdrive/Trainings/sam/"
- }
-
-
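
Unlike the other configs in this commit, this vocoder config is JSON with //-style comments, so a strict json.loads() rejects it. A rough comment-stripping sketch (Coqui's own config reader does something similar but more carefully; the regex assumes no unescaped "//" inside string values, which holds for this file):

import json
import re

with open("models/collectivat/ljspeech--hifigan_v2_config.json", encoding="utf-8") as f:
    raw = f.read()

config = json.loads(re.sub(r"//.*", "", raw))  # drop line comments, then parse
print(config["generator_model_params"]["upsample_factors"])  # [8, 8, 2, 2]
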
models/collectivat/ljspeech--hifigan_v2_model_file.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
- size 3794153
models/mms/G_100000.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0382edd70333f8ddc663177e672c8a66312e1b30f7929a8f9d458ef66f6b5349
- size 436622793
models/mms/config.json DELETED
@@ -1,87 +0,0 @@
- {
- "train": {
- "log_interval": 200,
- "eval_interval": 1000,
- "seed": 1234,
- "epochs": 20000,
- "learning_rate": 0.0002,
- "betas": [
- 0.8,
- 0.99
- ],
- "eps": 1e-09,
- "batch_size": 64,
- "fp16_run": true,
- "lr_decay": 0.999875,
- "segment_size": 8192,
- "init_lr_ratio": 1,
- "warmup_epochs": 0,
- "c_mel": 45,
- "c_kl": 1.0
- },
- "data": {
- "training_files": "train.ltr",
- "validation_files": "dev.ltr",
- "text_cleaners": [
- "transliteration_cleaners"
- ],
- "max_wav_value": 32768.0,
- "sampling_rate": 16000,
- "filter_length": 1024,
- "hop_length": 256,
- "win_length": 1024,
- "n_mel_channels": 80,
- "mel_fmin": 0.0,
- "mel_fmax": null,
- "add_blank": true,
- "n_speakers": 0,
- "cleaned_text": true
- },
- "model": {
- "inter_channels": 192,
- "hidden_channels": 192,
- "filter_channels": 768,
- "n_heads": 2,
- "n_layers": 6,
- "kernel_size": 3,
- "p_dropout": 0.1,
- "resblock": "1",
- "resblock_kernel_sizes": [
- 3,
- 7,
- 11
- ],
- "resblock_dilation_sizes": [
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ]
- ],
- "upsample_rates": [
- 8,
- 8,
- 2,
- 2
- ],
- "upsample_initial_channel": 512,
- "upsample_kernel_sizes": [
- 16,
- 16,
- 4,
- 4
- ],
- "n_layers_q": 3,
- "use_spectral_norm": false
- }
- }
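
G_100000.pth above is, by its naming and this config, a VITS generator checkpoint in the Meta MMS style; a minimal inspection sketch (for illustration only, not how the app consumes it):

import json

with open("models/mms/config.json", encoding="utf-8") as f:
    hps = json.load(f)

print(hps["data"]["sampling_rate"])    # 16000
print(hps["model"]["inter_channels"])  # 192
print(hps["model"]["upsample_rates"])  # [8, 8, 2, 2]
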
models/mms/vocab.txt DELETED
@@ -1,39 +0,0 @@
- z
- f
- i
- g
- m
- o
- r
- è
- h
- l
- v
- à
- u
- d
- ú
- ç
- p
- s
- '
- é
- _
- -
- e
- a
-
- x
- ü
- q
- t
- b
- í
- ó
- ï
- ò
-
- c
- j
- n
- y
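
vocab.txt lists one symbol per line (two lines render blank above, presumably whitespace symbols); a sketch of the assumed character-to-id mapping used by MMS/VITS-style text front-ends:

with open("models/mms/vocab.txt", encoding="utf-8") as f:
    # Strip only the newline: blank-looking lines are still real symbols here.
    symbols = [line.rstrip("\n") for line in f]

symbol_to_id = {s: i for i, s in enumerate(symbols)}
print(symbol_to_id["a"], symbol_to_id["ç"])
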
models/piper/MODEL_CARD DELETED
@@ -1,15 +0,0 @@
- # Model card for upc_ona (x-low)
-
- * Language: ca (Catalan)
- * Speakers: 1
- * Quality: x-low
- * Samplerate: 16,000Hz
-
- ## Dataset
-
- * URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
- * License: CC BY-SA 3.0 ES
-
- ## Training
-
- Trained from scratch.
models/piper/ca-upc_ona-x-low.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
- size 20628813
models/piper/ca-upc_ona-x-low.onnx.json DELETED
@@ -1,409 +0,0 @@
- {
- "audio": {
- "sample_rate": 16000
- },
- "espeak": {
- "voice": "ca"
- },
- "inference": {
- "noise_scale": 0.667,
- "length_scale": 1,
- "noise_w": 0.8
- },
- "phoneme_map": {},
- "phoneme_id_map": {
- "_": [
- 0
- ],
- "^": [
- 1
- ],
- "$": [
- 2
- ],
- " ": [
- 3
- ],
- "!": [
- 4
- ],
- "'": [
- 5
- ],
- "(": [
- 6
- ],
- ")": [
- 7
- ],
- ",": [
- 8
- ],
- "-": [
- 9
- ],
- ".": [
- 10
- ],
- ":": [
- 11
- ],
- ";": [
- 12
- ],
- "?": [
- 13
- ],
- "a": [
- 14
- ],
- "b": [
- 15
- ],
- "c": [
- 16
- ],
- "d": [
- 17
- ],
- "e": [
- 18
- ],
- "f": [
- 19
- ],
- "h": [
- 20
- ],
- "i": [
- 21
- ],
- "j": [
- 22
- ],
- "k": [
- 23
- ],
- "l": [
- 24
- ],
- "m": [
- 25
- ],
- "n": [
- 26
- ],
- "o": [
- 27
- ],
- "p": [
- 28
- ],
- "q": [
- 29
- ],
- "r": [
- 30
- ],
- "s": [
- 31
- ],
- "t": [
- 32
- ],
- "u": [
- 33
- ],
- "v": [
- 34
- ],
- "w": [
- 35
- ],
- "x": [
- 36
- ],
- "y": [
- 37
- ],
- "z": [
- 38
- ],
- "æ": [
- 39
- ],
- "ç": [
- 40
- ],
- "ð": [
- 41
- ],
- "ø": [
- 42
- ],
- "ħ": [
- 43
- ],
- "ŋ": [
- 44
- ],
- "œ": [
- 45
- ],
- "ǀ": [
- 46
- ],
- "ǁ": [
- 47
- ],
- "ǂ": [
- 48
- ],
- "ǃ": [
- 49
- ],
- "ɐ": [
- 50
- ],
- "ɑ": [
- 51
- ],
- "ɒ": [
- 52
- ],
- "ɓ": [
- 53
- ],
- "ɔ": [
- 54
- ],
- "ɕ": [
- 55
- ],
- "ɖ": [
- 56
- ],
- "ɗ": [
- 57
- ],
- "ɘ": [
- 58
- ],
- "ə": [
- 59
- ],
- "ɚ": [
- 60
- ],
- "ɛ": [
- 61
- ],
- "ɜ": [
- 62
- ],
- "ɞ": [
- 63
- ],
- "ɟ": [
- 64
- ],
- "ɠ": [
- 65
- ],
- "ɡ": [
- 66
- ],
- "ɢ": [
- 67
- ],
- "ɣ": [
- 68
- ],
- "ɤ": [
- 69
- ],
- "ɥ": [
- 70
- ],
- "ɦ": [
- 71
- ],
- "ɧ": [
- 72
- ],
- "ɨ": [
- 73
- ],
- "ɪ": [
- 74
- ],
- "ɫ": [
- 75
- ],
- "ɬ": [
- 76
- ],
- "ɭ": [
- 77
- ],
- "ɮ": [
- 78
- ],
- "ɯ": [
- 79
- ],
- "ɰ": [
- 80
- ],
- "ɱ": [
- 81
- ],
- "ɲ": [
- 82
- ],
- "ɳ": [
- 83
- ],
- "ɴ": [
- 84
- ],
- "ɵ": [
- 85
- ],
- "ɶ": [
- 86
- ],
- "ɸ": [
- 87
- ],
- "ɹ": [
- 88
- ],
- "ɺ": [
- 89
- ],
- "ɻ": [
- 90
- ],
- "ɽ": [
- 91
- ],
- "ɾ": [
- 92
- ],
- "ʀ": [
- 93
- ],
- "ʁ": [
- 94
- ],
- "ʂ": [
- 95
- ],
- "ʃ": [
- 96
- ],
- "ʄ": [
- 97
- ],
- "ʈ": [
- 98
- ],
- "ʉ": [
- 99
- ],
- "ʊ": [
- 100
- ],
- "ʋ": [
- 101
- ],
- "ʌ": [
- 102
- ],
- "ʍ": [
- 103
- ],
- "ʎ": [
- 104
- ],
- "ʏ": [
- 105
- ],
- "ʐ": [
- 106
- ],
- "ʑ": [
- 107
- ],
- "ʒ": [
- 108
- ],
- "ʔ": [
- 109
- ],
- "ʕ": [
- 110
- ],
- "ʘ": [
- 111
- ],
- "ʙ": [
- 112
- ],
- "ʛ": [
- 113
- ],
- "ʜ": [
- 114
- ],
- "ʝ": [
- 115
- ],
- "ʟ": [
- 116
- ],
- "ʡ": [
- 117
- ],
- "ʢ": [
- 118
- ],
- "ʲ": [
- 119
- ],
- "ˈ": [
- 120
- ],
- "ˌ": [
- 121
- ],
- "ː": [
- 122
- ],
- "ˑ": [
- 123
- ],
- "˞": [
- 124
- ],
- "β": [
- 125
- ],
- "θ": [
- 126
- ],
- "χ": [
- 127
- ],
- "ᵻ": [
- 128
- ],
- "ⱱ": [
- 129
- ]
- },
- "num_symbols": 130,
- "num_speakers": 1,
- "speaker_id_map": {}
- }
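
A rough sketch of how this sidecar config is consumed, simplified relative to Piper's real front-end (an assumption): phonemize with the espeak voice "ca", then map each IPA phoneme to its id before running the ONNX model. The phoneme string below is illustrative only.

import json

with open("models/piper/ca-upc_ona-x-low.onnx.json", encoding="utf-8") as f:
    cfg = json.load(f)

id_map = cfg["phoneme_id_map"]
phonemes = "ˈɔlə"  # illustrative espeak-style IPA for a short Catalan word
ids = [id_map[p][0] for p in phonemes]
print(ids)  # ids to feed (with BOS/EOS/pad handling) into ca-upc_ona-x-low.onnx
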
requirements.txt CHANGED
@@ -1,2 +1,2 @@
  gradio
- espeak-phonemizer>=1.1.0,<2
+ espeak-phonemizer>=1.1.0,<2
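
The espeak-phonemizer pin kept here is presumably what backs the "Fonemes" output in app.py; a minimal usage sketch (assumes the espeak-ng data for Catalan is installed on the system):

from espeak_phonemizer import Phonemizer

phonemizer = Phonemizer(default_voice="ca")
print(phonemizer.phonemize("L'Èlia i l'Alí a l'aula. L'oli i l'ou."))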