jvision commited on
Commit
7cdafc6
1 Parent(s): 319f608

First commit

Browse files
.gitattributes CHANGED
@@ -32,3 +32,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ checkpoint_85000.pth filter=lfs diff=lfs merge=lfs -text
36
+ supplemental/speakers-combined.pth filter=lfs diff=lfs merge=lfs -text
37
+ supplemental/speakers-dataset.pth filter=lfs diff=lfs merge=lfs -text
38
+ supplemental/model_se.pth.tar filter=lfs diff=lfs merge=lfs -text
39
+ samples/** filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,33 @@
1
  ---
2
  license: cc-by-4.0
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: cc-by-4.0
3
+ language:
4
+ - en
5
+ pipeline_tag: text-to-speech
6
+ tags:
7
+ - tts
8
+ - audio
9
+ - text to speech
10
+ - text-to-speech
11
+ - coqui
12
+ - coqui-tts
13
  ---
14
+
15
+ <p>VCTK_p226: 22 year old male, English accent (Surrey)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p226.wav" type="audio/wav"></audio> </p>
16
+ <p>VCTK_p227: 38 year old male, English accent (Cumbria)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p227.wav" type="audio/wav"></audio> </p>
17
+ <p>VCTK_p232: 23 year old male, English accent (Southern England)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p232.wav" type="audio/wav"></audio> </p>
18
+ <p>VCTK_p243: 22 year old male, English accent (London)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p243.wav" type="audio/wav"></audio> </p>
19
+ <p>VCTK_p254: 21 year old male, English accent (Surrey)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p254.wav" type="audio/wav"></audio> </p>
20
+ <p>VCTK_p256: 24 year old male, English accent (Birmingham)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p256.wav" type="audio/wav"></audio> </p>
21
+ <p>VCTK_p258: 22 year old male, English accent (Southern England)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p258.wav" type="audio/wav"></audio> </p>
22
+ <p>VCTK_p259: 23 year old male, English accent (Nottingham)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p259.wav" type="audio/wav"></audio> </p>
23
+ <p>VCTK_p270: 21 year old male, English accent (Yorkshire)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p270.wav" type="audio/wav"></audio> </p>
24
+ <p>VCTK_p273: 23 year old male, English accent (Suffolk)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p273.wav" type="audio/wav"></audio> </p>
25
+ <p>VCTK_p274: 22 year old male, English accent (Essex)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p274.wav" type="audio/wav"></audio> </p>
26
+ <p>VCTK_p278: 22 year old male, English accent (Cheshire)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p278.wav" type="audio/wav"></audio> </p>
27
+ <p>VCTK_p279: 23 year old male, English accent (Leicester)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p279.wav" type="audio/wav"></audio> </p>
28
+ <p>VCTK_p286: 23 year old male, English accent (Newcastle)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p286.wav" type="audio/wav"></audio> </p>
29
+ <p>VCTK_p287: 23 year old male, English accent (York)<audio controls><source src="https://huggingface.co/voices/VCTK_European_English_Females/resolve/main/samples/p287.wav" type="audio/wav"></audio> </p>
30
+
31
+ Built with [YourTTS](https://github.com/Edresson/YourTTS)
32
+
33
+ Dataset: https://datashare.ed.ac.uk/handle/10283/2950
checkpoint_85000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3803c8193b214c7657fcd1dde8177e2c23c0c2d7345582d9a40649d05f7e19c4
3
+ size 1043206989
config.json ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/home/iguana/projects/java/tts-voicetrain/projects/art/output",
3
+ "logger_uri": null,
4
+ "run_name": "VCTK_European_English_Males",
5
+ "project_name": "YourTTS",
6
+ "run_description": "\n - Original YourTTS trained using VCTK dataset\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": "loss_1",
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": true,
27
+ "epochs": 1000,
28
+ "batch_size": 16,
29
+ "eval_batch_size": 16,
30
+ "grad_clip": [
31
+ 1000.0,
32
+ 1000.0
33
+ ],
34
+ "scheduler_after_epoch": true,
35
+ "lr": 0.001,
36
+ "optimizer": "AdamW",
37
+ "optimizer_params": {
38
+ "betas": [
39
+ 0.8,
40
+ 0.99
41
+ ],
42
+ "eps": 1e-09,
43
+ "weight_decay": 0.01
44
+ },
45
+ "lr_scheduler": null,
46
+ "lr_scheduler_params": {},
47
+ "use_grad_scaler": false,
48
+ "cudnn_enable": true,
49
+ "cudnn_deterministic": false,
50
+ "cudnn_benchmark": false,
51
+ "training_seed": 54321,
52
+ "model": "vits",
53
+ "num_loader_workers": 8,
54
+ "num_eval_loader_workers": 0,
55
+ "use_noise_augment": false,
56
+ "audio": {
57
+ "fft_size": 1024,
58
+ "sample_rate": 16000,
59
+ "win_length": 1024,
60
+ "hop_length": 256,
61
+ "num_mels": 80,
62
+ "mel_fmin": 0,
63
+ "mel_fmax": null
64
+ },
65
+ "use_phonemes": false,
66
+ "phonemizer": "espeak",
67
+ "phoneme_language": "en-us",
68
+ "compute_input_seq_cache": true,
69
+ "text_cleaner": "multilingual_cleaners",
70
+ "enable_eos_bos_chars": false,
71
+ "test_sentences_file": "",
72
+ "phoneme_cache_path": "projects/art/phoneme_cache",
73
+ "characters": {
74
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
75
+ "vocab_dict": null,
76
+ "pad": "_",
77
+ "eos": "&",
78
+ "bos": "*",
79
+ "blank": null,
80
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
81
+ "punctuations": "!'(),-.:;? ",
82
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
83
+ "is_unique": true,
84
+ "is_sorted": true
85
+ },
86
+ "add_blank": true,
87
+ "batch_group_size": 48,
88
+ "loss_masking": null,
89
+ "min_audio_len": 1,
90
+ "max_audio_len": 160000,
91
+ "min_text_len": 1,
92
+ "max_text_len": Infinity,
93
+ "compute_f0": false,
94
+ "compute_energy": false,
95
+ "compute_linear_spec": true,
96
+ "precompute_num_workers": 12,
97
+ "start_by_longest": true,
98
+ "shuffle": false,
99
+ "drop_last": false,
100
+ "datasets": [
101
+ {
102
+ "formatter": "vctk",
103
+ "language": "en"
104
+ }
105
+ ],
106
+ "test_sentences": [
107
+ [
108
+ "This is an example of a sentence to be synthesized.",
109
+ "VCTK_p226",
110
+ null,
111
+ "en"
112
+ ],
113
+ [
114
+ "This is an example of a sentence to be synthesized.",
115
+ "VCTK_p227",
116
+ null,
117
+ "en"
118
+ ]
119
+ ],
120
+ "eval_split_max_size": 256,
121
+ "eval_split_size": 0.01,
122
+ "use_speaker_weighted_sampler": false,
123
+ "speaker_weighted_sampler_alpha": 1.0,
124
+ "use_language_weighted_sampler": false,
125
+ "language_weighted_sampler_alpha": 1.0,
126
+ "use_length_weighted_sampler": false,
127
+ "length_weighted_sampler_alpha": 1.0,
128
+ "model_args": {
129
+ "num_chars": 284,
130
+ "out_channels": 513,
131
+ "spec_segment_size": 32,
132
+ "hidden_channels": 192,
133
+ "hidden_channels_ffn_text_encoder": 768,
134
+ "num_heads_text_encoder": 2,
135
+ "num_layers_text_encoder": 10,
136
+ "kernel_size_text_encoder": 3,
137
+ "dropout_p_text_encoder": 0.1,
138
+ "dropout_p_duration_predictor": 0.5,
139
+ "kernel_size_posterior_encoder": 5,
140
+ "dilation_rate_posterior_encoder": 1,
141
+ "num_layers_posterior_encoder": 16,
142
+ "kernel_size_flow": 5,
143
+ "dilation_rate_flow": 1,
144
+ "num_layers_flow": 4,
145
+ "resblock_type_decoder": "2",
146
+ "resblock_kernel_sizes_decoder": [
147
+ 3,
148
+ 7,
149
+ 11
150
+ ],
151
+ "resblock_dilation_sizes_decoder": [
152
+ [
153
+ 1,
154
+ 3,
155
+ 5
156
+ ],
157
+ [
158
+ 1,
159
+ 3,
160
+ 5
161
+ ],
162
+ [
163
+ 1,
164
+ 3,
165
+ 5
166
+ ]
167
+ ],
168
+ "upsample_rates_decoder": [
169
+ 8,
170
+ 8,
171
+ 2,
172
+ 2
173
+ ],
174
+ "upsample_initial_channel_decoder": 512,
175
+ "upsample_kernel_sizes_decoder": [
176
+ 16,
177
+ 16,
178
+ 4,
179
+ 4
180
+ ],
181
+ "periods_multi_period_discriminator": [
182
+ 2,
183
+ 3,
184
+ 5,
185
+ 7,
186
+ 11
187
+ ],
188
+ "use_sdp": true,
189
+ "noise_scale": 1.0,
190
+ "inference_noise_scale": 0.667,
191
+ "length_scale": 1.0,
192
+ "noise_scale_dp": 1.0,
193
+ "inference_noise_scale_dp": 1.0,
194
+ "max_inference_len": null,
195
+ "init_discriminator": true,
196
+ "use_spectral_norm_disriminator": false,
197
+ "use_speaker_embedding": false,
198
+ "num_speakers": 21,
199
+ "speakers_file": "supplemental/speakers.pth",
200
+ "d_vector_file": [
201
+ "supplemental/speakers-base.json",
202
+ "supplemental/speakers-dataset.pth"
203
+ ],
204
+ "speaker_embedding_channels": 256,
205
+ "use_d_vector_file": true,
206
+ "d_vector_dim": 512,
207
+ "detach_dp_input": true,
208
+ "use_language_embedding": true,
209
+ "embedded_language_dim": 4,
210
+ "num_languages": 3,
211
+ "language_ids_file": "supplemental/language_ids.json",
212
+ "use_speaker_encoder_as_loss": true,
213
+ "speaker_encoder_config_path": "supplemental/config_se.json",
214
+ "speaker_encoder_model_path": "supplemental/model_se.pth.tar",
215
+ "condition_dp_on_speaker": true,
216
+ "freeze_encoder": false,
217
+ "freeze_DP": false,
218
+ "freeze_PE": false,
219
+ "freeze_flow_decoder": false,
220
+ "freeze_waveform_decoder": false,
221
+ "encoder_sample_rate": null,
222
+ "interpolate_z": true,
223
+ "reinit_DP": false,
224
+ "reinit_text_encoder": false
225
+ },
226
+ "lr_gen": 0.0002,
227
+ "lr_disc": 0.0002,
228
+ "lr_scheduler_gen": "ExponentialLR",
229
+ "lr_scheduler_gen_params": {
230
+ "gamma": 0.999875,
231
+ "last_epoch": -1
232
+ },
233
+ "lr_scheduler_disc": "ExponentialLR",
234
+ "lr_scheduler_disc_params": {
235
+ "gamma": 0.999875,
236
+ "last_epoch": -1
237
+ },
238
+ "kl_loss_alpha": 1.0,
239
+ "disc_loss_alpha": 1.0,
240
+ "gen_loss_alpha": 1.0,
241
+ "feat_loss_alpha": 1.0,
242
+ "mel_loss_alpha": 45.0,
243
+ "dur_loss_alpha": 1.0,
244
+ "speaker_encoder_loss_alpha": 9.0,
245
+ "return_wav": true,
246
+ "use_weighted_sampler": false,
247
+ "weighted_sampler_attrs": {},
248
+ "weighted_sampler_multipliers": {},
249
+ "r": 1,
250
+ "num_speakers": 21,
251
+ "use_speaker_embedding": false,
252
+ "speakers_file": "supplemental/speakers.pth",
253
+ "speaker_embedding_channels": 256,
254
+ "language_ids_file": "supplemental/language_ids.json",
255
+ "use_language_embedding": true,
256
+ "use_d_vector_file": true,
257
+ "d_vector_file": [
258
+ "supplemental/speakers-base.json",
259
+ "supplemental/speakers-dataset.pth"
260
+ ],
261
+ "d_vector_dim": 512
262
+ }
prepare_model.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import ast
import json
import os
import subprocess


def generate_html_output(data, repository_path):
    """Append one HTML <p> line per speaker to speakers.md.

    Args:
        data: mapping of speaker id (e.g. "p226") to a dict with keys
            'age', 'gender' ('M'/'F'), 'accents', and 'region'.
        repository_path: base URL prepended to "samples/<id>.wav" in the
            rendered <audio> source.

    The file is opened in append mode so repeated calls accumulate entries.
    """
    with open('speakers.md', 'a') as file:
        for speaker_id, speaker_info in data.items():
            out_path = f"{repository_path}/samples/{speaker_id}.wav"
            age = speaker_info['age']
            gender = speaker_info['gender']
            if gender == 'F':
                gender = 'female'
            elif gender == "M":
                gender = 'male'
            accents = speaker_info['accents']
            region = speaker_info['region']

            file.write(f"<p>VCTK_{speaker_id}: {age} year old {gender}, {accents} accent ({region})<audio controls><source src=\"{out_path}\" type=\"audio/wav\"></audio> </p>\n")


# Speaker metadata for the VCTK voices included in this checkpoint.
data = {
    "p226": {"age": 22, "gender": "M", "accents": "English", "region": "Surrey"},
    "p227": {"age": 38, "gender": "M", "accents": "English", "region": "Cumbria"},
    "p232": {"age": 23, "gender": "M", "accents": "English", "region": "Southern England"},
    "p243": {"age": 22, "gender": "M", "accents": "English", "region": "London"},
    "p254": {"age": 21, "gender": "M", "accents": "English", "region": "Surrey"},
    "p256": {"age": 24, "gender": "M", "accents": "English", "region": "Birmingham"},
    "p258": {"age": 22, "gender": "M", "accents": "English", "region": "Southern England"},
    "p259": {"age": 23, "gender": "M", "accents": "English", "region": "Nottingham"},
    "p270": {"age": 21, "gender": "M", "accents": "English", "region": "Yorkshire"},
    "p273": {"age": 23, "gender": "M", "accents": "English", "region": "Suffolk"},
    "p274": {"age": 22, "gender": "M", "accents": "English", "region": "Essex"},
    "p278": {"age": 22, "gender": "M", "accents": "English", "region": "Cheshire"},
    "p279": {"age": 23, "gender": "M", "accents": "English", "region": "Leicester"},
    "p286": {"age": 23, "gender": "M", "accents": "English", "region": "Newcastle"},
    "p287": {"age": 23, "gender": "M", "accents": "English", "region": "York"}
}


def main():
    """Render one sample wav per speaker and build the speakers.md model card."""
    # Save the speaker metadata to a JSON file for reference.
    with open('speakers-log.json', 'w') as file:
        file.write(json.dumps(data, indent=2))

    # Ask the TTS CLI for the model's speaker indices, filtering out its
    # progress/log lines. Raw string avoids invalid "\s" escape warnings
    # while keeping the exact same command text.
    command = r"tts --model_path checkpoint_85000.pth --config_path config.json --list_speaker_idxs | grep -vE '^(\s*\||\s*>|\s*$)'"
    output = subprocess.check_output(command, shell=True, text=True)

    # The CLI prints a Python dict repr, not strict JSON. literal_eval parses
    # it safely; eval() here would execute arbitrary text from a subprocess.
    speaker_indices = ast.literal_eval(output)

    # Load the speaker metadata back from the file we just wrote.
    with open('speakers-log.json', 'r') as file:
        speaker_ids = json.load(file)

    # Start speakers.md from scratch; generate_html_output appends per speaker.
    open('speakers.md', 'w').close()

    # Ensure the samples directory exists before synthesizing any audio.
    os.makedirs("samples", exist_ok=True)

    for speaker_idx in speaker_indices:
        # Model speaker ids carry a 'VCTK_' prefix; metadata keys do not.
        speaker_id = speaker_idx.replace('VCTK_', '')

        # Skip speakers the model knows about but we have no metadata for.
        if speaker_id not in speaker_ids:
            continue
        speaker_id_json = speaker_ids[speaker_id]

        # Synthesize one introduction sample for this speaker.
        text = f"Hello, I am from {speaker_id_json['region']}. I hope that you will select my voice for your project. Thank you."
        out_path = f"samples/{speaker_id}.wav"
        tts_command = f"tts --text \"{text}\" --model_path checkpoint_85000.pth --language_idx en --config_path config.json --speaker_idx \"VCTK_{speaker_id}\" --out_path {out_path}"
        os.system(tts_command)

        # Append this speaker's entry to speakers.md.
        # NOTE(review): URL says VCTK_European_English_Females while the run is
        # the Males model — looks like a copy-paste from a sibling repo; confirm
        # the hosting repository name before changing it.
        generate_html_output({speaker_id: speaker_id_json}, "https://huggingface.co/voices/VCTK_European_English_Females/resolve/main")


if __name__ == "__main__":
    main()
samples/p226.wav ADDED
Binary file (238 kB). View file
 
samples/p227.wav ADDED
Binary file (240 kB). View file
 
samples/p232.wav ADDED
Binary file (215 kB). View file
 
samples/p243.wav ADDED
Binary file (249 kB). View file
 
samples/p254.wav ADDED
Binary file (220 kB). View file
 
samples/p256.wav ADDED
Binary file (250 kB). View file
 
samples/p258.wav ADDED
Binary file (230 kB). View file
 
samples/p259.wav ADDED
Binary file (239 kB). View file
 
samples/p270.wav ADDED
Binary file (251 kB). View file
 
samples/p273.wav ADDED
Binary file (212 kB). View file
 
samples/p274.wav ADDED
Binary file (259 kB). View file
 
samples/p278.wav ADDED
Binary file (211 kB). View file
 
samples/p279.wav ADDED
Binary file (217 kB). View file
 
samples/p286.wav ADDED
Binary file (232 kB). View file
 
samples/p287.wav ADDED
Binary file (221 kB). View file
 
speakers-log.json ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "p226": {
3
+ "age": 22,
4
+ "gender": "M",
5
+ "accents": "English",
6
+ "region": "Surrey"
7
+ },
8
+ "p227": {
9
+ "age": 38,
10
+ "gender": "M",
11
+ "accents": "English",
12
+ "region": "Cumbria"
13
+ },
14
+ "p232": {
15
+ "age": 23,
16
+ "gender": "M",
17
+ "accents": "English",
18
+ "region": "Southern England"
19
+ },
20
+ "p243": {
21
+ "age": 22,
22
+ "gender": "M",
23
+ "accents": "English",
24
+ "region": "London"
25
+ },
26
+ "p254": {
27
+ "age": 21,
28
+ "gender": "M",
29
+ "accents": "English",
30
+ "region": "Surrey"
31
+ },
32
+ "p256": {
33
+ "age": 24,
34
+ "gender": "M",
35
+ "accents": "English",
36
+ "region": "Birmingham"
37
+ },
38
+ "p258": {
39
+ "age": 22,
40
+ "gender": "M",
41
+ "accents": "English",
42
+ "region": "Southern England"
43
+ },
44
+ "p259": {
45
+ "age": 23,
46
+ "gender": "M",
47
+ "accents": "English",
48
+ "region": "Nottingham"
49
+ },
50
+ "p270": {
51
+ "age": 21,
52
+ "gender": "M",
53
+ "accents": "English",
54
+ "region": "Yorkshire"
55
+ },
56
+ "p273": {
57
+ "age": 23,
58
+ "gender": "M",
59
+ "accents": "English",
60
+ "region": "Suffolk"
61
+ },
62
+ "p274": {
63
+ "age": 22,
64
+ "gender": "M",
65
+ "accents": "English",
66
+ "region": "Essex"
67
+ },
68
+ "p278": {
69
+ "age": 22,
70
+ "gender": "M",
71
+ "accents": "English",
72
+ "region": "Cheshire"
73
+ },
74
+ "p279": {
75
+ "age": 23,
76
+ "gender": "M",
77
+ "accents": "English",
78
+ "region": "Leicester"
79
+ },
80
+ "p286": {
81
+ "age": 23,
82
+ "gender": "M",
83
+ "accents": "English",
84
+ "region": "Newcastle"
85
+ },
86
+ "p287": {
87
+ "age": 23,
88
+ "gender": "M",
89
+ "accents": "English",
90
+ "region": "York"
91
+ }
92
+ }
supplemental/config_se.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "speaker_encoder",
3
+ "run_name": "speaker_encoder",
4
+ "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
5
+ "epochs": 100000,
6
+ "batch_size": null,
7
+ "eval_batch_size": null,
8
+ "mixed_precision": false,
9
+ "run_eval": true,
10
+ "test_delay_epochs": 0,
11
+ "print_eval": false,
12
+ "print_step": 50,
13
+ "tb_plot_step": 100,
14
+ "tb_model_param_stats": false,
15
+ "save_step": 1000,
16
+ "checkpoint": true,
17
+ "keep_all_best": false,
18
+ "keep_after": 10000,
19
+ "num_loader_workers": 8,
20
+ "num_val_loader_workers": 0,
21
+ "use_noise_augment": false,
22
+ "output_path": "../checkpoints/speaker_encoder/language_balanced/normalized/angleproto-4-samples-by-speakers/",
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "audio": {
26
+ "fft_size": 512,
27
+ "win_length": 400,
28
+ "hop_length": 160,
29
+ "frame_shift_ms": null,
30
+ "frame_length_ms": null,
31
+ "stft_pad_mode": "reflect",
32
+ "sample_rate": 16000,
33
+ "resample": false,
34
+ "preemphasis": 0.97,
35
+ "ref_level_db": 20,
36
+ "do_sound_norm": false,
37
+ "do_trim_silence": false,
38
+ "trim_db": 60,
39
+ "power": 1.5,
40
+ "griffin_lim_iters": 60,
41
+ "num_mels": 64,
42
+ "mel_fmin": 0.0,
43
+ "mel_fmax": 8000.0,
44
+ "spec_gain": 20,
45
+ "signal_norm": false,
46
+ "min_level_db": -100,
47
+ "symmetric_norm": false,
48
+ "max_norm": 4.0,
49
+ "clip_norm": false,
50
+ "stats_path": null,
51
+ "do_rms_norm": true,
52
+ "db_level": -27.0
53
+ },
54
+ "datasets": [
55
+ {
56
+ "name": "voxceleb2",
57
+ "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
58
+ "meta_file_train": null,
59
+ "ununsed_speakers": null,
60
+ "meta_file_val": null,
61
+ "meta_file_attn_mask": "",
62
+ "language": "voxceleb"
63
+ }
64
+ ],
65
+ "model_params": {
66
+ "model_name": "resnet",
67
+ "input_dim": 64,
68
+ "use_torch_spec": true,
69
+ "log_input": true,
70
+ "proj_dim": 512
71
+ },
72
+ "audio_augmentation": {
73
+ "p": 0.5,
74
+ "rir": {
75
+ "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
76
+ "conv_mode": "full"
77
+ },
78
+ "additive": {
79
+ "sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
80
+ "speech": {
81
+ "min_snr_in_db": 13,
82
+ "max_snr_in_db": 20,
83
+ "min_num_noises": 1,
84
+ "max_num_noises": 1
85
+ },
86
+ "noise": {
87
+ "min_snr_in_db": 0,
88
+ "max_snr_in_db": 15,
89
+ "min_num_noises": 1,
90
+ "max_num_noises": 1
91
+ },
92
+ "music": {
93
+ "min_snr_in_db": 5,
94
+ "max_snr_in_db": 15,
95
+ "min_num_noises": 1,
96
+ "max_num_noises": 1
97
+ }
98
+ },
99
+ "gaussian": {
100
+ "p": 0.0,
101
+ "min_amplitude": 0.0,
102
+ "max_amplitude": 1e-05
103
+ }
104
+ },
105
+ "storage": {
106
+ "sample_from_storage_p": 0.5,
107
+ "storage_size": 40
108
+ },
109
+ "max_train_step": 1000000,
110
+ "loss": "angleproto",
111
+ "grad_clip": 3.0,
112
+ "lr": 0.0001,
113
+ "lr_decay": false,
114
+ "warmup_steps": 4000,
115
+ "wd": 1e-06,
116
+ "steps_plot_stats": 100,
117
+ "num_speakers_in_batch": 100,
118
+ "num_utters_per_speaker": 4,
119
+ "skip_speakers": true,
120
+ "voice_len": 2.0
121
+ }
supplemental/language_ids.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "en": 0,
3
+ "fr-fr": 1,
4
+ "pt-br": 2
5
+ }
supplemental/model_se.pth.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
3
+ size 44610930
supplemental/speaker_ids.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "female-en-5": 0,
3
+ "female-en-5\n": 1,
4
+ "female-pt-4\n": 2,
5
+ "male-en-2": 3,
6
+ "male-en-2\n": 4,
7
+ "male-pt-3\n": 5,
8
+ "VCTK_p226": 6,
9
+ "VCTK_p227": 7,
10
+ "VCTK_p232": 8,
11
+ "VCTK_p243": 9,
12
+ "VCTK_p254": 10,
13
+ "VCTK_p256": 11,
14
+ "VCTK_p258": 12,
15
+ "VCTK_p259": 13,
16
+ "VCTK_p270": 14,
17
+ "VCTK_p273": 15,
18
+ "VCTK_p274": 16,
19
+ "VCTK_p278": 17,
20
+ "VCTK_p279": 18,
21
+ "VCTK_p286": 19,
22
+ "VCTK_p287": 20
23
+ }
supplemental/speakers-base.json ADDED
The diff for this file is too large to render. See raw diff
 
supplemental/speakers-combined.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ea3adb812f223fd0b49d864c830b25223c69eb100d194c82daef16d16a47a87
3
+ size 29383279
supplemental/speakers-dataset.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:593bd22570f52af32925f83724178dc1739da586db6b5b1f58ac803f035c5478
3
+ size 29202479
supplemental/speakers.json ADDED
Binary file (110 MB). View file