Spaces:
Runtime error
Runtime error
removed junk 2
Browse files- app.py +0 -1
- models/bsc/best_model.pth +0 -3
- models/bsc/config.json +0 -262
- models/bsc/speaker_map.json +0 -10
- models/bsc/speakers.pth +0 -3
- models/collectivat/catotron-ona-TTS-API-entry.json +0 -10
- models/collectivat/fast-speech_best_model.pth +0 -3
- models/collectivat/fast-speech_config.json +0 -213
- models/collectivat/ljspeech--hifigan_v2_config.json +0 -158
- models/collectivat/ljspeech--hifigan_v2_model_file.pth +0 -3
- models/mms/G_100000.pth +0 -3
- models/mms/config.json +0 -87
- models/mms/vocab.txt +0 -39
- models/piper/MODEL_CARD +0 -15
- models/piper/ca-upc_ona-x-low.onnx +0 -3
- models/piper/ca-upc_ona-x-low.onnx.json +0 -409
- requirements.txt +1 -1
app.py
CHANGED
@@ -39,7 +39,6 @@ iface = gr.Interface(
|
|
39 |
value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
|
40 |
),
|
41 |
gr.Dropdown(label="dialect", choices="")
|
42 |
-
|
43 |
],
|
44 |
outputs=[
|
45 |
gr.Markdown(label="Fonemes")
|
|
|
39 |
value="L'Èlia i l'Alí a l'aula. L'oli i l'ou. Lulú olorava la lila.",
|
40 |
),
|
41 |
gr.Dropdown(label="dialect", choices="")
|
|
|
42 |
],
|
43 |
outputs=[
|
44 |
gr.Markdown(label="Fonemes")
|
models/bsc/best_model.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b15fa7d2052bada1cf421e49d2d03b00e95b49fcd0e42b7af1d92da2880cdecc
|
3 |
-
size 1038659133
|
|
|
|
|
|
|
|
models/bsc/config.json
DELETED
@@ -1,262 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"output_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/experiments_from_previous",
|
3 |
-
"logger_uri": null,
|
4 |
-
"run_name": "multispeaker_vits_ca_1e4_1e4_32",
|
5 |
-
"project_name": null,
|
6 |
-
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
-
"print_step": 25,
|
8 |
-
"plot_step": 100,
|
9 |
-
"model_param_stats": false,
|
10 |
-
"wandb_entity": null,
|
11 |
-
"dashboard_logger": "tensorboard",
|
12 |
-
"log_model_step": 1000,
|
13 |
-
"save_step": 1000,
|
14 |
-
"save_n_checkpoints": 5,
|
15 |
-
"save_checkpoints": true,
|
16 |
-
"save_all_best": true,
|
17 |
-
"save_best_after": 10000,
|
18 |
-
"target_loss": null,
|
19 |
-
"print_eval": true,
|
20 |
-
"test_delay_epochs": -1,
|
21 |
-
"run_eval": true,
|
22 |
-
"run_eval_steps": null,
|
23 |
-
"distributed_backend": "nccl",
|
24 |
-
"distributed_url": "tcp://localhost:54321",
|
25 |
-
"mixed_precision": false,
|
26 |
-
"epochs": 1000,
|
27 |
-
"batch_size": 16,
|
28 |
-
"eval_batch_size": 8,
|
29 |
-
"grad_clip": [
|
30 |
-
1000.0,
|
31 |
-
1000.0
|
32 |
-
],
|
33 |
-
"scheduler_after_epoch": true,
|
34 |
-
"lr": 0.001,
|
35 |
-
"optimizer": "AdamW",
|
36 |
-
"optimizer_params": {
|
37 |
-
"betas": [
|
38 |
-
0.8,
|
39 |
-
0.99
|
40 |
-
],
|
41 |
-
"eps": 1e-09,
|
42 |
-
"weight_decay": 0.01
|
43 |
-
},
|
44 |
-
"lr_scheduler": "",
|
45 |
-
"lr_scheduler_params": null,
|
46 |
-
"use_grad_scaler": false,
|
47 |
-
"cudnn_enable": true,
|
48 |
-
"cudnn_deterministic": false,
|
49 |
-
"cudnn_benchmark": false,
|
50 |
-
"training_seed": 54321,
|
51 |
-
"model": "vits",
|
52 |
-
"num_loader_workers": 4,
|
53 |
-
"num_eval_loader_workers": 4,
|
54 |
-
"use_noise_augment": false,
|
55 |
-
"audio": {
|
56 |
-
"fft_size": 1024,
|
57 |
-
"sample_rate": 22050,
|
58 |
-
"win_length": 1024,
|
59 |
-
"hop_length": 256,
|
60 |
-
"num_mels": 80,
|
61 |
-
"mel_fmin": 0,
|
62 |
-
"mel_fmax": null
|
63 |
-
},
|
64 |
-
"use_phonemes": true,
|
65 |
-
"phonemizer": "espeak",
|
66 |
-
"phoneme_language": "ca",
|
67 |
-
"compute_input_seq_cache": true,
|
68 |
-
"text_cleaner": "multilingual_cleaners",
|
69 |
-
"enable_eos_bos_chars": false,
|
70 |
-
"test_sentences_file": "",
|
71 |
-
"phoneme_cache_path": "/gpfs/projects/bsc88/speech/tts/TTS_v0.8.0/recipes/multispeaker/phoneme_cache",
|
72 |
-
"characters": {
|
73 |
-
"characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
|
74 |
-
"vocab_dict": null,
|
75 |
-
"pad": "<PAD>",
|
76 |
-
"eos": "<EOS>",
|
77 |
-
"bos": "<BOS>",
|
78 |
-
"blank": "<BLNK>",
|
79 |
-
"characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
|
80 |
-
"punctuations": "!'(),-.:;? ",
|
81 |
-
"phonemes": null,
|
82 |
-
"is_unique": false,
|
83 |
-
"is_sorted": true
|
84 |
-
},
|
85 |
-
"add_blank": true,
|
86 |
-
"batch_group_size": 5,
|
87 |
-
"loss_masking": null,
|
88 |
-
"min_audio_len": 1,
|
89 |
-
"max_audio_len": Infinity,
|
90 |
-
"min_text_len": 1,
|
91 |
-
"max_text_len": 325,
|
92 |
-
"compute_f0": false,
|
93 |
-
"compute_linear_spec": true,
|
94 |
-
"precompute_num_workers": 0,
|
95 |
-
"start_by_longest": false,
|
96 |
-
"datasets": [
|
97 |
-
{
|
98 |
-
"formatter": "vctk_old",
|
99 |
-
"dataset_name": "vctk_old",
|
100 |
-
"path": "/gpfs/scratch/bsc88/bsc88474/data/multispeaker_ca",
|
101 |
-
"meta_file_train": "",
|
102 |
-
"ignored_speakers": [
|
103 |
-
"uri",
|
104 |
-
"09796",
|
105 |
-
"05450"
|
106 |
-
],
|
107 |
-
"language": "ca",
|
108 |
-
"meta_file_val": "",
|
109 |
-
"meta_file_attn_mask": ""
|
110 |
-
}
|
111 |
-
],
|
112 |
-
"test_sentences": [
|
113 |
-
[
|
114 |
-
"Per exemple, dels nostres bancs que inverteixen en armament de les nostres empreses."
|
115 |
-
],
|
116 |
-
[
|
117 |
-
"Preguntin-se si aix\u00f2 era necessari."
|
118 |
-
],
|
119 |
-
[
|
120 |
-
"La suposada ocultaci\u00f3 dels informes que advertien de risc s\u00edsmic."
|
121 |
-
],
|
122 |
-
[
|
123 |
-
"\u00c9s de 633 milions d'euros quan es far\u00e0 la publicaci\u00f3 detallada."
|
124 |
-
]
|
125 |
-
],
|
126 |
-
"eval_split_max_size": null,
|
127 |
-
"eval_split_size": 0.01,
|
128 |
-
"use_speaker_weighted_sampler": false,
|
129 |
-
"speaker_weighted_sampler_alpha": 1.0,
|
130 |
-
"use_language_weighted_sampler": false,
|
131 |
-
"language_weighted_sampler_alpha": 1.0,
|
132 |
-
"use_length_weighted_sampler": false,
|
133 |
-
"length_weighted_sampler_alpha": 1.0,
|
134 |
-
"model_args": {
|
135 |
-
"num_chars": 131,
|
136 |
-
"out_channels": 513,
|
137 |
-
"spec_segment_size": 32,
|
138 |
-
"hidden_channels": 192,
|
139 |
-
"hidden_channels_ffn_text_encoder": 768,
|
140 |
-
"num_heads_text_encoder": 2,
|
141 |
-
"num_layers_text_encoder": 6,
|
142 |
-
"kernel_size_text_encoder": 3,
|
143 |
-
"dropout_p_text_encoder": 0.1,
|
144 |
-
"dropout_p_duration_predictor": 0.5,
|
145 |
-
"kernel_size_posterior_encoder": 5,
|
146 |
-
"dilation_rate_posterior_encoder": 1,
|
147 |
-
"num_layers_posterior_encoder": 16,
|
148 |
-
"kernel_size_flow": 5,
|
149 |
-
"dilation_rate_flow": 1,
|
150 |
-
"num_layers_flow": 4,
|
151 |
-
"resblock_type_decoder": "1",
|
152 |
-
"resblock_kernel_sizes_decoder": [
|
153 |
-
3,
|
154 |
-
7,
|
155 |
-
11
|
156 |
-
],
|
157 |
-
"resblock_dilation_sizes_decoder": [
|
158 |
-
[
|
159 |
-
1,
|
160 |
-
3,
|
161 |
-
5
|
162 |
-
],
|
163 |
-
[
|
164 |
-
1,
|
165 |
-
3,
|
166 |
-
5
|
167 |
-
],
|
168 |
-
[
|
169 |
-
1,
|
170 |
-
3,
|
171 |
-
5
|
172 |
-
]
|
173 |
-
],
|
174 |
-
"upsample_rates_decoder": [
|
175 |
-
8,
|
176 |
-
8,
|
177 |
-
2,
|
178 |
-
2
|
179 |
-
],
|
180 |
-
"upsample_initial_channel_decoder": 512,
|
181 |
-
"upsample_kernel_sizes_decoder": [
|
182 |
-
16,
|
183 |
-
16,
|
184 |
-
4,
|
185 |
-
4
|
186 |
-
],
|
187 |
-
"periods_multi_period_discriminator": [
|
188 |
-
2,
|
189 |
-
3,
|
190 |
-
5,
|
191 |
-
7,
|
192 |
-
11
|
193 |
-
],
|
194 |
-
"use_sdp": true,
|
195 |
-
"noise_scale": 1.0,
|
196 |
-
"inference_noise_scale": 0.667,
|
197 |
-
"length_scale": 1.0,
|
198 |
-
"noise_scale_dp": 1.0,
|
199 |
-
"inference_noise_scale_dp": 1.0,
|
200 |
-
"max_inference_len": null,
|
201 |
-
"init_discriminator": true,
|
202 |
-
"use_spectral_norm_disriminator": false,
|
203 |
-
"use_speaker_embedding": true,
|
204 |
-
"num_speakers": 257,
|
205 |
-
"speakers_file": "/home/user/app/models/bsc/speakers.pth",
|
206 |
-
"d_vector_file": null,
|
207 |
-
"speaker_embedding_channels": 256,
|
208 |
-
"use_d_vector_file": false,
|
209 |
-
"d_vector_dim": 0,
|
210 |
-
"detach_dp_input": true,
|
211 |
-
"use_language_embedding": false,
|
212 |
-
"embedded_language_dim": 4,
|
213 |
-
"num_languages": 0,
|
214 |
-
"language_ids_file": null,
|
215 |
-
"use_speaker_encoder_as_loss": false,
|
216 |
-
"speaker_encoder_config_path": "",
|
217 |
-
"speaker_encoder_model_path": "",
|
218 |
-
"condition_dp_on_speaker": true,
|
219 |
-
"freeze_encoder": false,
|
220 |
-
"freeze_DP": false,
|
221 |
-
"freeze_PE": false,
|
222 |
-
"freeze_flow_decoder": false,
|
223 |
-
"freeze_waveform_decoder": false,
|
224 |
-
"encoder_sample_rate": null,
|
225 |
-
"interpolate_z": true,
|
226 |
-
"reinit_DP": false,
|
227 |
-
"reinit_text_encoder": false
|
228 |
-
},
|
229 |
-
"lr_gen": 0.0001,
|
230 |
-
"lr_disc": 0.0001,
|
231 |
-
"lr_scheduler_gen": "ExponentialLR",
|
232 |
-
"lr_scheduler_gen_params": {
|
233 |
-
"gamma": 0.999875,
|
234 |
-
"last_epoch": -1
|
235 |
-
},
|
236 |
-
"lr_scheduler_disc": "ExponentialLR",
|
237 |
-
"lr_scheduler_disc_params": {
|
238 |
-
"gamma": 0.999875,
|
239 |
-
"last_epoch": -1
|
240 |
-
},
|
241 |
-
"kl_loss_alpha": 1.0,
|
242 |
-
"disc_loss_alpha": 1.0,
|
243 |
-
"gen_loss_alpha": 1.0,
|
244 |
-
"feat_loss_alpha": 1.0,
|
245 |
-
"mel_loss_alpha": 45.0,
|
246 |
-
"dur_loss_alpha": 1.0,
|
247 |
-
"speaker_encoder_loss_alpha": 1.0,
|
248 |
-
"return_wav": true,
|
249 |
-
"use_weighted_sampler": false,
|
250 |
-
"weighted_sampler_attrs": null,
|
251 |
-
"weighted_sampler_multipliers": null,
|
252 |
-
"r": 1,
|
253 |
-
"num_speakers": 257,
|
254 |
-
"use_speaker_embedding": true,
|
255 |
-
"speakers_file": "/home/user/app/models/bsc/speakers.pth",
|
256 |
-
"speaker_embedding_channels": 256,
|
257 |
-
"language_ids_file": null,
|
258 |
-
"use_language_embedding": false,
|
259 |
-
"use_d_vector_file": false,
|
260 |
-
"d_vector_file": null,
|
261 |
-
"d_vector_dim": 0
|
262 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/bsc/speaker_map.json
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"f_cen_05": "05739",
|
3 |
-
"f_cen_81": "8162d651b6211f06f655a69cd7fdd383d6b4287e9ba132b9898ef9ac8687349e777626333d23bed93f9264aae965efb14ed650cb64fd0ad90494aff903eaef11",
|
4 |
-
"f_occ_31": "31535cb2ece4710d08fdbeefb6f8f75ed093fee4cf8573bd601d960f8c6156f0fd0a85712761691e86e31160b993ee0eacb10c4c8aed000cc394cf7c7d207a7e",
|
5 |
-
"f_occ_de": "dee065b956b99b10db4763759d64c41791af1a7e77f1864f90a2b0847a12633dcf9bc108db7eaf73cc8d0e750f5c37383a56cd77cc2276d3960104c6bebe6346",
|
6 |
-
"f_sep_31": "31e6f3a011661320b2e59b6f8be43f6db2243e9feabc2b9787c1413788e13eb0e5810bed983bf7ff66e46417d183a91ed50b3b9be9d89e4f51aada72293b9881",
|
7 |
-
"m_cen_08": "08935",
|
8 |
-
"m_occ_44": "30b1f81c579755895581259d79a8a5a3ca45b908b0bd14ad1c6418f39aa1e2f47cb4749c69b5440cdb92e3bafb772e19e7bc2b16d196b061addd173a1309e491",
|
9 |
-
"m_val_89": "896256329fbeb5b8116349c31d8a39a7d36d5f970d48558e1db5417d611e240e4dbf473f6e49137f7aa6116394b7deabb0bbec4a014896cdc9484ee91458117d"
|
10 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/bsc/speakers.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:6dacda0b8dd3e111c5072f8f33c08b4a29b92ac79aaf22ceca912d01e7deb905
|
3 |
-
size 30191
|
|
|
|
|
|
|
|
models/collectivat/catotron-ona-TTS-API-entry.json
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"voice": "ona-fast-hifigan",
|
3 |
-
"lang": "ca",
|
4 |
-
"model_type": "coqui",
|
5 |
-
"tts_config_path": "fast-speech_config.json",
|
6 |
-
"tts_model_path": "fast-speech_best_model.pth",
|
7 |
-
"vocoder_config_path": "ljspeech--hifigan_v2_config.json",
|
8 |
-
"vocoder_model_path": "ljspeech--hifigan_v2_model_file.pth",
|
9 |
-
"load": true
|
10 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/collectivat/fast-speech_best_model.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:3a5aefb9f49f6172e34b816e1de8f5234012f0a9a05747973f6610e40869983f
|
3 |
-
size 457921637
|
|
|
|
|
|
|
|
models/collectivat/fast-speech_config.json
DELETED
@@ -1,213 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"output_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron",
|
3 |
-
"logger_uri": null,
|
4 |
-
"run_name": "fast_pitch_ljspeech",
|
5 |
-
"project_name": null,
|
6 |
-
"run_description": "\ud83d\udc38Coqui trainer run.",
|
7 |
-
"print_step": 50,
|
8 |
-
"plot_step": 100,
|
9 |
-
"model_param_stats": false,
|
10 |
-
"wandb_entity": null,
|
11 |
-
"dashboard_logger": "tensorboard",
|
12 |
-
"log_model_step": null,
|
13 |
-
"save_step": 10000,
|
14 |
-
"save_n_checkpoints": 5,
|
15 |
-
"save_checkpoints": true,
|
16 |
-
"save_all_best": false,
|
17 |
-
"save_best_after": 1000,
|
18 |
-
"target_loss": null,
|
19 |
-
"print_eval": false,
|
20 |
-
"test_delay_epochs": -1,
|
21 |
-
"run_eval": true,
|
22 |
-
"run_eval_steps": null,
|
23 |
-
"distributed_backend": "nccl",
|
24 |
-
"distributed_url": "tcp://localhost:54321",
|
25 |
-
"mixed_precision": false,
|
26 |
-
"epochs": 1000,
|
27 |
-
"batch_size": 16,
|
28 |
-
"eval_batch_size": 16,
|
29 |
-
"grad_clip": 5.0,
|
30 |
-
"scheduler_after_epoch": true,
|
31 |
-
"lr": 0.0001,
|
32 |
-
"optimizer": "Adam",
|
33 |
-
"optimizer_params": {
|
34 |
-
"betas": [
|
35 |
-
0.9,
|
36 |
-
0.998
|
37 |
-
],
|
38 |
-
"weight_decay": 1e-06
|
39 |
-
},
|
40 |
-
"lr_scheduler": "NoamLR",
|
41 |
-
"lr_scheduler_params": {
|
42 |
-
"warmup_steps": 4000
|
43 |
-
},
|
44 |
-
"use_grad_scaler": false,
|
45 |
-
"cudnn_enable": true,
|
46 |
-
"cudnn_deterministic": false,
|
47 |
-
"cudnn_benchmark": false,
|
48 |
-
"training_seed": 54321,
|
49 |
-
"model": "fast_pitch",
|
50 |
-
"num_loader_workers": 8,
|
51 |
-
"num_eval_loader_workers": 4,
|
52 |
-
"use_noise_augment": false,
|
53 |
-
"audio": {
|
54 |
-
"fft_size": 1024,
|
55 |
-
"win_length": 1024,
|
56 |
-
"hop_length": 256,
|
57 |
-
"frame_shift_ms": null,
|
58 |
-
"frame_length_ms": null,
|
59 |
-
"stft_pad_mode": "reflect",
|
60 |
-
"sample_rate": 22050,
|
61 |
-
"resample": false,
|
62 |
-
"preemphasis": 0.0,
|
63 |
-
"ref_level_db": 20,
|
64 |
-
"do_sound_norm": false,
|
65 |
-
"log_func": "np.log",
|
66 |
-
"do_trim_silence": true,
|
67 |
-
"trim_db": 60.0,
|
68 |
-
"do_rms_norm": false,
|
69 |
-
"db_level": null,
|
70 |
-
"power": 1.5,
|
71 |
-
"griffin_lim_iters": 60,
|
72 |
-
"num_mels": 80,
|
73 |
-
"mel_fmin": 0.0,
|
74 |
-
"mel_fmax": 8000,
|
75 |
-
"spec_gain": 1.0,
|
76 |
-
"do_amp_to_db_linear": true,
|
77 |
-
"do_amp_to_db_mel": true,
|
78 |
-
"pitch_fmax": 640.0,
|
79 |
-
"pitch_fmin": 0.0,
|
80 |
-
"signal_norm": false,
|
81 |
-
"min_level_db": -100,
|
82 |
-
"symmetric_norm": true,
|
83 |
-
"max_norm": 4.0,
|
84 |
-
"clip_norm": true,
|
85 |
-
"stats_path": null
|
86 |
-
},
|
87 |
-
"use_phonemes": false,
|
88 |
-
"phonemizer": null,
|
89 |
-
"phoneme_language": "ca-es",
|
90 |
-
"compute_input_seq_cache": true,
|
91 |
-
"text_cleaner": "multilingual_cleaners",
|
92 |
-
"enable_eos_bos_chars": false,
|
93 |
-
"test_sentences_file": "",
|
94 |
-
"phoneme_cache_path": null,
|
95 |
-
"characters": {
|
96 |
-
"characters_class": "TTS.tts.utils.text.characters.Graphemes",
|
97 |
-
"vocab_dict": null,
|
98 |
-
"pad": "_",
|
99 |
-
"eos": "*",
|
100 |
-
"bos": "^",
|
101 |
-
"blank": null,
|
102 |
-
"characters": "A\u00c0\u00c1BC\u00c7DE\u00c9\u00c8FGHI\u00cd\u00cfJKLMNO\u00d3\u00d2PQRSTU\u00dc\u00daVWXYZa\u00e0\u00e1bc\u00e7de\u00e9\u00e8fghi\u00ed\u00efjklmno\u00f3\u00f2pqrstu\u00fc\u00favwxyz",
|
103 |
-
"punctuations": "!'(),-.:;?\u00b7 ",
|
104 |
-
"phonemes": "",
|
105 |
-
"is_unique": true,
|
106 |
-
"is_sorted": true
|
107 |
-
},
|
108 |
-
"add_blank": false,
|
109 |
-
"batch_group_size": 0,
|
110 |
-
"loss_masking": null,
|
111 |
-
"min_audio_len": 1,
|
112 |
-
"max_audio_len": Infinity,
|
113 |
-
"min_text_len": 1,
|
114 |
-
"max_text_len": Infinity,
|
115 |
-
"compute_f0": true,
|
116 |
-
"compute_linear_spec": false,
|
117 |
-
"precompute_num_workers": 4,
|
118 |
-
"start_by_longest": false,
|
119 |
-
"datasets": [
|
120 |
-
{
|
121 |
-
"name": "custom_turkish",
|
122 |
-
"path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
|
123 |
-
"meta_file_train": "upc_ona_train.txt",
|
124 |
-
"ignored_speakers": null,
|
125 |
-
"language": "",
|
126 |
-
"meta_file_val": "",
|
127 |
-
"meta_file_attn_mask": ""
|
128 |
-
},
|
129 |
-
{
|
130 |
-
"name": "custom_turkish",
|
131 |
-
"path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/upc_ona",
|
132 |
-
"meta_file_train": "upc_ona_val.txt",
|
133 |
-
"ignored_speakers": null,
|
134 |
-
"language": "",
|
135 |
-
"meta_file_val": "",
|
136 |
-
"meta_file_attn_mask": ""
|
137 |
-
}
|
138 |
-
],
|
139 |
-
"test_sentences": [
|
140 |
-
"Hola Barcelona!",
|
141 |
-
"Escriviu al text."
|
142 |
-
],
|
143 |
-
"eval_split_max_size": null,
|
144 |
-
"eval_split_size": 0.01,
|
145 |
-
"use_speaker_weighted_sampler": false,
|
146 |
-
"speaker_weighted_sampler_alpha": 1.0,
|
147 |
-
"use_language_weighted_sampler": false,
|
148 |
-
"language_weighted_sampler_alpha": 1.0,
|
149 |
-
"use_length_weighted_sampler": false,
|
150 |
-
"length_weighted_sampler_alpha": 1.0,
|
151 |
-
"base_model": "forward_tts",
|
152 |
-
"model_args": {
|
153 |
-
"num_chars": 89,
|
154 |
-
"out_channels": 80,
|
155 |
-
"hidden_channels": 384,
|
156 |
-
"use_aligner": true,
|
157 |
-
"use_pitch": true,
|
158 |
-
"pitch_predictor_hidden_channels": 256,
|
159 |
-
"pitch_predictor_kernel_size": 3,
|
160 |
-
"pitch_predictor_dropout_p": 0.1,
|
161 |
-
"pitch_embedding_kernel_size": 3,
|
162 |
-
"duration_predictor_hidden_channels": 256,
|
163 |
-
"duration_predictor_kernel_size": 3,
|
164 |
-
"duration_predictor_dropout_p": 0.1,
|
165 |
-
"positional_encoding": true,
|
166 |
-
"poisitonal_encoding_use_scale": true,
|
167 |
-
"length_scale": 1,
|
168 |
-
"encoder_type": "fftransformer",
|
169 |
-
"encoder_params": {
|
170 |
-
"hidden_channels_ffn": 1024,
|
171 |
-
"num_heads": 1,
|
172 |
-
"num_layers": 6,
|
173 |
-
"dropout_p": 0.1
|
174 |
-
},
|
175 |
-
"decoder_type": "fftransformer",
|
176 |
-
"decoder_params": {
|
177 |
-
"hidden_channels_ffn": 1024,
|
178 |
-
"num_heads": 1,
|
179 |
-
"num_layers": 6,
|
180 |
-
"dropout_p": 0.1
|
181 |
-
},
|
182 |
-
"detach_duration_predictor": false,
|
183 |
-
"max_duration": 75,
|
184 |
-
"num_speakers": 1,
|
185 |
-
"use_speaker_embedding": false,
|
186 |
-
"speakers_file": null,
|
187 |
-
"use_d_vector_file": false,
|
188 |
-
"d_vector_dim": null,
|
189 |
-
"d_vector_file": null
|
190 |
-
},
|
191 |
-
"num_speakers": 0,
|
192 |
-
"speakers_file": null,
|
193 |
-
"use_speaker_embedding": false,
|
194 |
-
"use_d_vector_file": false,
|
195 |
-
"d_vector_file": false,
|
196 |
-
"d_vector_dim": 0,
|
197 |
-
"spec_loss_type": "mse",
|
198 |
-
"duration_loss_type": "mse",
|
199 |
-
"use_ssim_loss": true,
|
200 |
-
"ssim_loss_alpha": 1.0,
|
201 |
-
"spec_loss_alpha": 1.0,
|
202 |
-
"aligner_loss_alpha": 1.0,
|
203 |
-
"pitch_loss_alpha": 0.1,
|
204 |
-
"dur_loss_alpha": 0.1,
|
205 |
-
"binary_align_loss_alpha": 0.1,
|
206 |
-
"binary_loss_warmup_epochs": 150,
|
207 |
-
"min_seq_len": 13,
|
208 |
-
"max_seq_len": 500000,
|
209 |
-
"r": 1,
|
210 |
-
"f0_cache_path": "/home/twbgmy/play/TTS-play/TTS/recipes/catotron/f0_cache",
|
211 |
-
"restore_path": "/home/twbgmy/.local/share/tts/tts_models--en--ljspeech--fast_pitch/model_file.pth",
|
212 |
-
"github_branch": "* dev"
|
213 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/collectivat/ljspeech--hifigan_v2_config.json
DELETED
@@ -1,158 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"run_name": "hifigan",
|
3 |
-
"run_description": "universal hifigan trained on LibriTTS with no spectrogram normalization and using log() for scaling instead of log10()",
|
4 |
-
|
5 |
-
|
6 |
-
// AUDIO PARAMETERS
|
7 |
-
"audio":{
|
8 |
-
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
|
9 |
-
"win_length": 1024, // stft window length in ms.
|
10 |
-
"hop_length": 256, // stft window hop-lengh in ms.
|
11 |
-
"frame_length_ms": null, // stft window length in ms.If null, 'win_length' is used.
|
12 |
-
"frame_shift_ms": null, // stft window hop-lengh in ms. If null, 'hop_length' is used.
|
13 |
-
|
14 |
-
// Audio processing parameters
|
15 |
-
"sample_rate": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.
|
16 |
-
"preemphasis": 0.0, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
|
17 |
-
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
|
18 |
-
"log_func": "np.log",
|
19 |
-
|
20 |
-
// Silence trimming
|
21 |
-
"do_trim_silence": false,// enable trimming of slience of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)
|
22 |
-
"trim_db": 60, // threshold for timming silence. Set this according to your dataset.
|
23 |
-
|
24 |
-
// MelSpectrogram parameters
|
25 |
-
"num_mels": 80, // size of the mel spec frame.
|
26 |
-
"mel_fmin": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
|
27 |
-
"mel_fmax": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!
|
28 |
-
"spec_gain": 1.0, // scaler value appplied after log transform of spectrogram.
|
29 |
-
|
30 |
-
// Normalization parameters
|
31 |
-
"signal_norm": false, // normalize spec values. Mean-Var normalization if 'stats_path' is defined otherwise range normalization defined by the other params.
|
32 |
-
"min_level_db": -100, // lower bound for normalization
|
33 |
-
"symmetric_norm": true, // move normalization to range [-1, 1]
|
34 |
-
"max_norm": 4.0, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
|
35 |
-
"clip_norm": true, // clip normalized values into the range.
|
36 |
-
"stats_path": null // DO NOT USE WITH MULTI_SPEAKER MODEL. scaler stats file computed by 'compute_statistics.py'. If it is defined, mean-std based notmalization is used and other normalization params are ignored
|
37 |
-
},
|
38 |
-
|
39 |
-
// DISTRIBUTED TRAINING
|
40 |
-
"distributed":{
|
41 |
-
"backend": "nccl",
|
42 |
-
"url": "tcp:\/\/localhost:54324"
|
43 |
-
},
|
44 |
-
|
45 |
-
// MODEL PARAMETERS
|
46 |
-
"use_pqmf": false,
|
47 |
-
|
48 |
-
// LOSS PARAMETERS
|
49 |
-
"use_stft_loss": false,
|
50 |
-
"use_subband_stft_loss": false,
|
51 |
-
"use_mse_gan_loss": true,
|
52 |
-
"use_hinge_gan_loss": false,
|
53 |
-
"use_feat_match_loss": true, // use only with melgan discriminators
|
54 |
-
"use_l1_spec_loss": true,
|
55 |
-
|
56 |
-
// loss weights
|
57 |
-
"stft_loss_weight": 0,
|
58 |
-
"subband_stft_loss_weight": 0,
|
59 |
-
"mse_G_loss_weight": 1,
|
60 |
-
"hinge_G_loss_weight": 0,
|
61 |
-
"feat_match_loss_weight": 10,
|
62 |
-
"l1_spec_loss_weight": 45,
|
63 |
-
|
64 |
-
// multiscale stft loss parameters
|
65 |
-
// "stft_loss_params": {
|
66 |
-
// "n_ffts": [1024, 2048, 512],
|
67 |
-
// "hop_lengths": [120, 240, 50],
|
68 |
-
// "win_lengths": [600, 1200, 240]
|
69 |
-
// },
|
70 |
-
|
71 |
-
"l1_spec_loss_params": {
|
72 |
-
"use_mel": true,
|
73 |
-
"sample_rate": 16000,
|
74 |
-
"n_fft": 1024,
|
75 |
-
"hop_length": 256,
|
76 |
-
"win_length": 1024,
|
77 |
-
"n_mels": 80,
|
78 |
-
"mel_fmin": 0.0,
|
79 |
-
"mel_fmax": null
|
80 |
-
},
|
81 |
-
|
82 |
-
"target_loss": "avg_G_loss", // loss value to pick the best model to save after each epoch
|
83 |
-
|
84 |
-
// DISCRIMINATOR
|
85 |
-
"discriminator_model": "hifigan_discriminator",
|
86 |
-
//"discriminator_model_params":{
|
87 |
-
// "peroids": [2, 3, 5, 7, 11],
|
88 |
-
// "base_channels": 16,
|
89 |
-
// "max_channels":512,
|
90 |
-
// "downsample_factors":[4, 4, 4]
|
91 |
-
//},
|
92 |
-
"steps_to_start_discriminator": 0, // steps required to start GAN trainining.1
|
93 |
-
|
94 |
-
// GENERATOR
|
95 |
-
"generator_model": "hifigan_generator",
|
96 |
-
"generator_model_params": {
|
97 |
-
"resblock_type": "1",
|
98 |
-
"upsample_factors": [8,8,2,2],
|
99 |
-
"upsample_kernel_sizes": [16,16,4,4],
|
100 |
-
"upsample_initial_channel": 128,
|
101 |
-
"resblock_kernel_sizes": [3,7,11],
|
102 |
-
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]]
|
103 |
-
},
|
104 |
-
|
105 |
-
// DATASET
|
106 |
-
"data_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/vo_voice_quality_transformation/",
|
107 |
-
"feature_path": null,
|
108 |
-
// "feature_path": "/home/erogol/gdrive/Datasets/non-binary-voice-files/tacotron-DCA/",
|
109 |
-
"seq_len": 8192,
|
110 |
-
"pad_short": 2000,
|
111 |
-
"conv_pad": 0,
|
112 |
-
"use_noise_augment": false,
|
113 |
-
"use_cache": true,
|
114 |
-
"reinit_layers": [], // give a list of layer names to restore from the given checkpoint. If not defined, it reloads all heuristically matching layers.
|
115 |
-
|
116 |
-
// TRAINING
|
117 |
-
"batch_size": 16, // Batch size for training. Lower values than 32 might cause hard to learn attention. It is overwritten by 'gradual_training'.
|
118 |
-
|
119 |
-
// VALIDATION
|
120 |
-
"run_eval": true,
|
121 |
-
"test_delay_epochs": 10, //Until attention is aligned, testing only wastes computation time.
|
122 |
-
"test_sentences_file": null, // set a file to load sentences to be used for testing. If it is null then we use default english sentences.
|
123 |
-
|
124 |
-
// OPTIMIZER
|
125 |
-
"epochs": 10000, // total number of epochs to train.
|
126 |
-
"wd": 0.0, // Weight decay weight.
|
127 |
-
"gen_clip_grad": -1, // Generator gradient clipping threshold. Apply gradient clipping if > 0
|
128 |
-
"disc_clip_grad": -1, // Discriminator gradient clipping threshold.
|
129 |
-
// "lr_scheduler_gen": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
130 |
-
// "lr_scheduler_gen_params": {
|
131 |
-
// "gamma": 0.999,
|
132 |
-
// "last_epoch": -1
|
133 |
-
// },
|
134 |
-
// "lr_scheduler_disc": "ExponentialLR", // one of the schedulers from https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
|
135 |
-
// "lr_scheduler_disc_params": {
|
136 |
-
// "gamma": 0.999,
|
137 |
-
// "last_epoch": -1
|
138 |
-
// },
|
139 |
-
"lr_gen": 0.00001, // Initial learning rate. If Noam decay is active, maximum learning rate.
|
140 |
-
"lr_disc": 0.00001,
|
141 |
-
|
142 |
-
// TENSORBOARD and LOGGING
|
143 |
-
"print_step": 25, // Number of steps to log traning on console.
|
144 |
-
"print_eval": false, // If True, it prints loss values for each step in eval run.
|
145 |
-
"save_step": 25000, // Number of training steps expected to plot training stats on TB and save model checkpoints.
|
146 |
-
"checkpoint": true, // If true, it saves checkpoints per "save_step"
|
147 |
-
"tb_model_param_stats": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
|
148 |
-
|
149 |
-
// DATA LOADING
|
150 |
-
"num_loader_workers": 8, // number of training data loader processes. Don't set it too big. 4-8 are good values.
|
151 |
-
"num_val_loader_workers": 4, // number of evaluation data loader processes.
|
152 |
-
"eval_split_size": 10,
|
153 |
-
|
154 |
-
// PATHS
|
155 |
-
"output_path": "/home/erogol/gdrive/Trainings/sam/"
|
156 |
-
}
|
157 |
-
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/collectivat/ljspeech--hifigan_v2_model_file.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4047e93886faa1aba11948efa71f59dcb0ec9117e286660e59b91892ef98d129
|
3 |
-
size 3794153
|
|
|
|
|
|
|
|
models/mms/G_100000.pth
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:0382edd70333f8ddc663177e672c8a66312e1b30f7929a8f9d458ef66f6b5349
|
3 |
-
size 436622793
|
|
|
|
|
|
|
|
models/mms/config.json
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"train": {
|
3 |
-
"log_interval": 200,
|
4 |
-
"eval_interval": 1000,
|
5 |
-
"seed": 1234,
|
6 |
-
"epochs": 20000,
|
7 |
-
"learning_rate": 0.0002,
|
8 |
-
"betas": [
|
9 |
-
0.8,
|
10 |
-
0.99
|
11 |
-
],
|
12 |
-
"eps": 1e-09,
|
13 |
-
"batch_size": 64,
|
14 |
-
"fp16_run": true,
|
15 |
-
"lr_decay": 0.999875,
|
16 |
-
"segment_size": 8192,
|
17 |
-
"init_lr_ratio": 1,
|
18 |
-
"warmup_epochs": 0,
|
19 |
-
"c_mel": 45,
|
20 |
-
"c_kl": 1.0
|
21 |
-
},
|
22 |
-
"data": {
|
23 |
-
"training_files": "train.ltr",
|
24 |
-
"validation_files": "dev.ltr",
|
25 |
-
"text_cleaners": [
|
26 |
-
"transliteration_cleaners"
|
27 |
-
],
|
28 |
-
"max_wav_value": 32768.0,
|
29 |
-
"sampling_rate": 16000,
|
30 |
-
"filter_length": 1024,
|
31 |
-
"hop_length": 256,
|
32 |
-
"win_length": 1024,
|
33 |
-
"n_mel_channels": 80,
|
34 |
-
"mel_fmin": 0.0,
|
35 |
-
"mel_fmax": null,
|
36 |
-
"add_blank": true,
|
37 |
-
"n_speakers": 0,
|
38 |
-
"cleaned_text": true
|
39 |
-
},
|
40 |
-
"model": {
|
41 |
-
"inter_channels": 192,
|
42 |
-
"hidden_channels": 192,
|
43 |
-
"filter_channels": 768,
|
44 |
-
"n_heads": 2,
|
45 |
-
"n_layers": 6,
|
46 |
-
"kernel_size": 3,
|
47 |
-
"p_dropout": 0.1,
|
48 |
-
"resblock": "1",
|
49 |
-
"resblock_kernel_sizes": [
|
50 |
-
3,
|
51 |
-
7,
|
52 |
-
11
|
53 |
-
],
|
54 |
-
"resblock_dilation_sizes": [
|
55 |
-
[
|
56 |
-
1,
|
57 |
-
3,
|
58 |
-
5
|
59 |
-
],
|
60 |
-
[
|
61 |
-
1,
|
62 |
-
3,
|
63 |
-
5
|
64 |
-
],
|
65 |
-
[
|
66 |
-
1,
|
67 |
-
3,
|
68 |
-
5
|
69 |
-
]
|
70 |
-
],
|
71 |
-
"upsample_rates": [
|
72 |
-
8,
|
73 |
-
8,
|
74 |
-
2,
|
75 |
-
2
|
76 |
-
],
|
77 |
-
"upsample_initial_channel": 512,
|
78 |
-
"upsample_kernel_sizes": [
|
79 |
-
16,
|
80 |
-
16,
|
81 |
-
4,
|
82 |
-
4
|
83 |
-
],
|
84 |
-
"n_layers_q": 3,
|
85 |
-
"use_spectral_norm": false
|
86 |
-
}
|
87 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/mms/vocab.txt
DELETED
@@ -1,39 +0,0 @@
|
|
1 |
-
z
|
2 |
-
f
|
3 |
-
i
|
4 |
-
g
|
5 |
-
m
|
6 |
-
o
|
7 |
-
r
|
8 |
-
è
|
9 |
-
h
|
10 |
-
l
|
11 |
-
v
|
12 |
-
à
|
13 |
-
u
|
14 |
-
d
|
15 |
-
ú
|
16 |
-
ç
|
17 |
-
p
|
18 |
-
s
|
19 |
-
'
|
20 |
-
é
|
21 |
-
_
|
22 |
-
-
|
23 |
-
e
|
24 |
-
a
|
25 |
-
—
|
26 |
-
x
|
27 |
-
ü
|
28 |
-
q
|
29 |
-
t
|
30 |
-
b
|
31 |
-
í
|
32 |
-
ó
|
33 |
-
ï
|
34 |
-
ò
|
35 |
-
|
36 |
-
c
|
37 |
-
j
|
38 |
-
n
|
39 |
-
y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/piper/MODEL_CARD
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
# Model card for upc_ona (x-low)
|
2 |
-
|
3 |
-
* Language: ca (Catalan)
|
4 |
-
* Speakers: 1
|
5 |
-
* Quality: x-low
|
6 |
-
* Samplerate: 16,000Hz
|
7 |
-
|
8 |
-
## Dataset
|
9 |
-
|
10 |
-
* URL: https://collectivat.cat/asr#upc-festcat-tts-corpora
|
11 |
-
* License: CC BY-SA 3.0 ES
|
12 |
-
|
13 |
-
## Training
|
14 |
-
|
15 |
-
Trained from scratch.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
models/piper/ca-upc_ona-x-low.onnx
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:13661d26423e0c791823823a5971f4e1aaf644a62e65e0e94d299c0e70560e14
|
3 |
-
size 20628813
|
|
|
|
|
|
|
|
models/piper/ca-upc_ona-x-low.onnx.json
DELETED
@@ -1,409 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"audio": {
|
3 |
-
"sample_rate": 16000
|
4 |
-
},
|
5 |
-
"espeak": {
|
6 |
-
"voice": "ca"
|
7 |
-
},
|
8 |
-
"inference": {
|
9 |
-
"noise_scale": 0.667,
|
10 |
-
"length_scale": 1,
|
11 |
-
"noise_w": 0.8
|
12 |
-
},
|
13 |
-
"phoneme_map": {},
|
14 |
-
"phoneme_id_map": {
|
15 |
-
"_": [
|
16 |
-
0
|
17 |
-
],
|
18 |
-
"^": [
|
19 |
-
1
|
20 |
-
],
|
21 |
-
"$": [
|
22 |
-
2
|
23 |
-
],
|
24 |
-
" ": [
|
25 |
-
3
|
26 |
-
],
|
27 |
-
"!": [
|
28 |
-
4
|
29 |
-
],
|
30 |
-
"'": [
|
31 |
-
5
|
32 |
-
],
|
33 |
-
"(": [
|
34 |
-
6
|
35 |
-
],
|
36 |
-
")": [
|
37 |
-
7
|
38 |
-
],
|
39 |
-
",": [
|
40 |
-
8
|
41 |
-
],
|
42 |
-
"-": [
|
43 |
-
9
|
44 |
-
],
|
45 |
-
".": [
|
46 |
-
10
|
47 |
-
],
|
48 |
-
":": [
|
49 |
-
11
|
50 |
-
],
|
51 |
-
";": [
|
52 |
-
12
|
53 |
-
],
|
54 |
-
"?": [
|
55 |
-
13
|
56 |
-
],
|
57 |
-
"a": [
|
58 |
-
14
|
59 |
-
],
|
60 |
-
"b": [
|
61 |
-
15
|
62 |
-
],
|
63 |
-
"c": [
|
64 |
-
16
|
65 |
-
],
|
66 |
-
"d": [
|
67 |
-
17
|
68 |
-
],
|
69 |
-
"e": [
|
70 |
-
18
|
71 |
-
],
|
72 |
-
"f": [
|
73 |
-
19
|
74 |
-
],
|
75 |
-
"h": [
|
76 |
-
20
|
77 |
-
],
|
78 |
-
"i": [
|
79 |
-
21
|
80 |
-
],
|
81 |
-
"j": [
|
82 |
-
22
|
83 |
-
],
|
84 |
-
"k": [
|
85 |
-
23
|
86 |
-
],
|
87 |
-
"l": [
|
88 |
-
24
|
89 |
-
],
|
90 |
-
"m": [
|
91 |
-
25
|
92 |
-
],
|
93 |
-
"n": [
|
94 |
-
26
|
95 |
-
],
|
96 |
-
"o": [
|
97 |
-
27
|
98 |
-
],
|
99 |
-
"p": [
|
100 |
-
28
|
101 |
-
],
|
102 |
-
"q": [
|
103 |
-
29
|
104 |
-
],
|
105 |
-
"r": [
|
106 |
-
30
|
107 |
-
],
|
108 |
-
"s": [
|
109 |
-
31
|
110 |
-
],
|
111 |
-
"t": [
|
112 |
-
32
|
113 |
-
],
|
114 |
-
"u": [
|
115 |
-
33
|
116 |
-
],
|
117 |
-
"v": [
|
118 |
-
34
|
119 |
-
],
|
120 |
-
"w": [
|
121 |
-
35
|
122 |
-
],
|
123 |
-
"x": [
|
124 |
-
36
|
125 |
-
],
|
126 |
-
"y": [
|
127 |
-
37
|
128 |
-
],
|
129 |
-
"z": [
|
130 |
-
38
|
131 |
-
],
|
132 |
-
"æ": [
|
133 |
-
39
|
134 |
-
],
|
135 |
-
"ç": [
|
136 |
-
40
|
137 |
-
],
|
138 |
-
"ð": [
|
139 |
-
41
|
140 |
-
],
|
141 |
-
"ø": [
|
142 |
-
42
|
143 |
-
],
|
144 |
-
"ħ": [
|
145 |
-
43
|
146 |
-
],
|
147 |
-
"ŋ": [
|
148 |
-
44
|
149 |
-
],
|
150 |
-
"œ": [
|
151 |
-
45
|
152 |
-
],
|
153 |
-
"ǀ": [
|
154 |
-
46
|
155 |
-
],
|
156 |
-
"ǁ": [
|
157 |
-
47
|
158 |
-
],
|
159 |
-
"ǂ": [
|
160 |
-
48
|
161 |
-
],
|
162 |
-
"ǃ": [
|
163 |
-
49
|
164 |
-
],
|
165 |
-
"ɐ": [
|
166 |
-
50
|
167 |
-
],
|
168 |
-
"ɑ": [
|
169 |
-
51
|
170 |
-
],
|
171 |
-
"ɒ": [
|
172 |
-
52
|
173 |
-
],
|
174 |
-
"ɓ": [
|
175 |
-
53
|
176 |
-
],
|
177 |
-
"ɔ": [
|
178 |
-
54
|
179 |
-
],
|
180 |
-
"ɕ": [
|
181 |
-
55
|
182 |
-
],
|
183 |
-
"ɖ": [
|
184 |
-
56
|
185 |
-
],
|
186 |
-
"ɗ": [
|
187 |
-
57
|
188 |
-
],
|
189 |
-
"ɘ": [
|
190 |
-
58
|
191 |
-
],
|
192 |
-
"ə": [
|
193 |
-
59
|
194 |
-
],
|
195 |
-
"ɚ": [
|
196 |
-
60
|
197 |
-
],
|
198 |
-
"ɛ": [
|
199 |
-
61
|
200 |
-
],
|
201 |
-
"ɜ": [
|
202 |
-
62
|
203 |
-
],
|
204 |
-
"ɞ": [
|
205 |
-
63
|
206 |
-
],
|
207 |
-
"ɟ": [
|
208 |
-
64
|
209 |
-
],
|
210 |
-
"ɠ": [
|
211 |
-
65
|
212 |
-
],
|
213 |
-
"ɡ": [
|
214 |
-
66
|
215 |
-
],
|
216 |
-
"ɢ": [
|
217 |
-
67
|
218 |
-
],
|
219 |
-
"ɣ": [
|
220 |
-
68
|
221 |
-
],
|
222 |
-
"ɤ": [
|
223 |
-
69
|
224 |
-
],
|
225 |
-
"ɥ": [
|
226 |
-
70
|
227 |
-
],
|
228 |
-
"ɦ": [
|
229 |
-
71
|
230 |
-
],
|
231 |
-
"ɧ": [
|
232 |
-
72
|
233 |
-
],
|
234 |
-
"ɨ": [
|
235 |
-
73
|
236 |
-
],
|
237 |
-
"ɪ": [
|
238 |
-
74
|
239 |
-
],
|
240 |
-
"ɫ": [
|
241 |
-
75
|
242 |
-
],
|
243 |
-
"ɬ": [
|
244 |
-
76
|
245 |
-
],
|
246 |
-
"ɭ": [
|
247 |
-
77
|
248 |
-
],
|
249 |
-
"ɮ": [
|
250 |
-
78
|
251 |
-
],
|
252 |
-
"ɯ": [
|
253 |
-
79
|
254 |
-
],
|
255 |
-
"ɰ": [
|
256 |
-
80
|
257 |
-
],
|
258 |
-
"ɱ": [
|
259 |
-
81
|
260 |
-
],
|
261 |
-
"ɲ": [
|
262 |
-
82
|
263 |
-
],
|
264 |
-
"ɳ": [
|
265 |
-
83
|
266 |
-
],
|
267 |
-
"ɴ": [
|
268 |
-
84
|
269 |
-
],
|
270 |
-
"ɵ": [
|
271 |
-
85
|
272 |
-
],
|
273 |
-
"ɶ": [
|
274 |
-
86
|
275 |
-
],
|
276 |
-
"ɸ": [
|
277 |
-
87
|
278 |
-
],
|
279 |
-
"ɹ": [
|
280 |
-
88
|
281 |
-
],
|
282 |
-
"ɺ": [
|
283 |
-
89
|
284 |
-
],
|
285 |
-
"ɻ": [
|
286 |
-
90
|
287 |
-
],
|
288 |
-
"ɽ": [
|
289 |
-
91
|
290 |
-
],
|
291 |
-
"ɾ": [
|
292 |
-
92
|
293 |
-
],
|
294 |
-
"ʀ": [
|
295 |
-
93
|
296 |
-
],
|
297 |
-
"ʁ": [
|
298 |
-
94
|
299 |
-
],
|
300 |
-
"ʂ": [
|
301 |
-
95
|
302 |
-
],
|
303 |
-
"ʃ": [
|
304 |
-
96
|
305 |
-
],
|
306 |
-
"ʄ": [
|
307 |
-
97
|
308 |
-
],
|
309 |
-
"ʈ": [
|
310 |
-
98
|
311 |
-
],
|
312 |
-
"ʉ": [
|
313 |
-
99
|
314 |
-
],
|
315 |
-
"ʊ": [
|
316 |
-
100
|
317 |
-
],
|
318 |
-
"ʋ": [
|
319 |
-
101
|
320 |
-
],
|
321 |
-
"ʌ": [
|
322 |
-
102
|
323 |
-
],
|
324 |
-
"ʍ": [
|
325 |
-
103
|
326 |
-
],
|
327 |
-
"ʎ": [
|
328 |
-
104
|
329 |
-
],
|
330 |
-
"ʏ": [
|
331 |
-
105
|
332 |
-
],
|
333 |
-
"ʐ": [
|
334 |
-
106
|
335 |
-
],
|
336 |
-
"ʑ": [
|
337 |
-
107
|
338 |
-
],
|
339 |
-
"ʒ": [
|
340 |
-
108
|
341 |
-
],
|
342 |
-
"ʔ": [
|
343 |
-
109
|
344 |
-
],
|
345 |
-
"ʕ": [
|
346 |
-
110
|
347 |
-
],
|
348 |
-
"ʘ": [
|
349 |
-
111
|
350 |
-
],
|
351 |
-
"ʙ": [
|
352 |
-
112
|
353 |
-
],
|
354 |
-
"ʛ": [
|
355 |
-
113
|
356 |
-
],
|
357 |
-
"ʜ": [
|
358 |
-
114
|
359 |
-
],
|
360 |
-
"ʝ": [
|
361 |
-
115
|
362 |
-
],
|
363 |
-
"ʟ": [
|
364 |
-
116
|
365 |
-
],
|
366 |
-
"ʡ": [
|
367 |
-
117
|
368 |
-
],
|
369 |
-
"ʢ": [
|
370 |
-
118
|
371 |
-
],
|
372 |
-
"ʲ": [
|
373 |
-
119
|
374 |
-
],
|
375 |
-
"ˈ": [
|
376 |
-
120
|
377 |
-
],
|
378 |
-
"ˌ": [
|
379 |
-
121
|
380 |
-
],
|
381 |
-
"ː": [
|
382 |
-
122
|
383 |
-
],
|
384 |
-
"ˑ": [
|
385 |
-
123
|
386 |
-
],
|
387 |
-
"˞": [
|
388 |
-
124
|
389 |
-
],
|
390 |
-
"β": [
|
391 |
-
125
|
392 |
-
],
|
393 |
-
"θ": [
|
394 |
-
126
|
395 |
-
],
|
396 |
-
"χ": [
|
397 |
-
127
|
398 |
-
],
|
399 |
-
"ᵻ": [
|
400 |
-
128
|
401 |
-
],
|
402 |
-
"ⱱ": [
|
403 |
-
129
|
404 |
-
]
|
405 |
-
},
|
406 |
-
"num_symbols": 130,
|
407 |
-
"num_speakers": 1,
|
408 |
-
"speaker_id_map": {}
|
409 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
|
|
1 |
gradio
|
2 |
-
espeak-phonemizer>=1.1.0,<2
|
|
|
1 |
gradio
|
2 |
+
espeak-phonemizer>=1.1.0,<2
|