Kevin676 ramkamal2000 committed on
Commit
ec78133
0 Parent(s):

Duplicate from ramkamal2000/voice-conversion-yourtts

Co-authored-by: Ramkamal T B <ramkamal2000@users.noreply.huggingface.co>

Files changed (12)
  1. .gitattributes +36 -0
  2. README.md +14 -0
  3. SE_checkpoint.pth.tar +3 -0
  4. app.py +180 -0
  5. best_model.pth.tar +3 -0
  6. config.json +373 -0
  7. config_se.json +119 -0
  8. language_ids.json +5 -0
  9. ntr.wav +3 -0
  10. requirements.txt +4 -0
  11. speakers.json +0 -0
  12. timcast1.wav +3 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Voice Conversion Yourtts
+ emoji: 😻
+ colorFrom: yellow
+ colorTo: green
+ sdk: gradio
+ sdk_version: 3.17.0
+ app_file: app.py
+ pinned: false
+ license: unknown
+ duplicated_from: ramkamal2000/voice-conversion-yourtts
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
SE_checkpoint.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
+ size 44610930
app.py ADDED
@@ -0,0 +1,180 @@
+ # !git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS
+
+ import os
+ import shutil
+ import gradio as gr
+
+ import sys
+
+ import string
+ import time
+ import argparse
+ import json
+
+ import numpy as np
+ # import IPython
+ # from IPython.display import Audio
+
+ import torch
+
+ from TTS.tts.utils.synthesis import synthesis
+ from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
+ from TTS.utils.audio import AudioProcessor
+
+ from TTS.tts.models import setup_model
+ from TTS.config import load_config
+ from TTS.tts.models.vits import *
+
+ from TTS.tts.utils.speakers import SpeakerManager
+ from pydub import AudioSegment
+
+ # from google.colab import files
+ import librosa
+
+ from scipy.io.wavfile import write, read
+
+ import subprocess
+
+ '''
+ from google.colab import drive
+ drive.mount('/content/drive')
+
+ src_path = os.path.join(os.path.join(os.path.join(os.path.join(os.getcwd(), 'drive'), 'MyDrive'), 'Colab Notebooks'), 'best_model_latest.pth.tar')
+ dst_path = os.path.join(os.getcwd(), 'best_model.pth.tar')
+
+ shutil.copy(src_path, dst_path)
+ '''
+
+ TTS_PATH = "TTS/"
+
+ # add the cloned TTS repo to the import path (needed if TTS is not installed globally)
+ sys.path.append(TTS_PATH)
+
+ # Path definitions
+
+ OUT_PATH = 'out/'
+
+ # create output path
+ os.makedirs(OUT_PATH, exist_ok=True)
+
+ # model vars
+ MODEL_PATH = 'best_model.pth.tar'
+ CONFIG_PATH = 'config.json'
+ TTS_LANGUAGES = "language_ids.json"
+ TTS_SPEAKERS = "speakers.json"
+ USE_CUDA = torch.cuda.is_available()
+
+ # load the config
+ C = load_config(CONFIG_PATH)
+
+ # load the audio processor
+ ap = AudioProcessor(**C.audio)
+
+ speaker_embedding = None
+
+ C.model_args['d_vector_file'] = TTS_SPEAKERS
+ C.model_args['use_speaker_encoder_as_loss'] = False
+
+ model = setup_model(C)
+ model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
+ # print(model.language_manager.num_languages, model.embedded_language_dim)
+ # print(model.emb_l)
+ cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
+ # remove the speaker encoder weights; they are only needed as a training loss
+ model_weights = cp['model'].copy()
+ for key in list(model_weights.keys()):
+     if "speaker_encoder" in key:
+         del model_weights[key]
+
+ model.load_state_dict(model_weights)
+
+ model.eval()
+
+ if USE_CUDA:
+     model = model.cuda()
+
+ # synthesize voice
+ use_griffin_lim = False
+
+ # Path definitions
+
+ CONFIG_SE_PATH = "config_se.json"
+ CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
+
+ # Load the speaker encoder
+
+ SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
+
+ # Define helper function
+
+ def compute_spec(ref_file):
+     y, sr = librosa.load(ref_file, sr=ap.sample_rate)
+     spec = ap.spectrogram(y)
+     spec = torch.FloatTensor(spec).unsqueeze(0)
+     return spec
+
+
+ def voice_conversion(ta, ra, da):
+     # ta, ra, da are Gradio audio tuples: (sample_rate, numpy samples)
+     target_audio = 'target.wav'
+     reference_audio = 'reference.wav'
+     driving_audio = 'driving.wav'
+
+     write(target_audio, ta[0], ta[1])
+     write(reference_audio, ra[0], ra[1])
+     write(driving_audio, da[0], da[1])
+
+     # normalize loudness and resample all three clips to 16 kHz
+     files = [target_audio, reference_audio, driving_audio]
+     for file in files:
+         subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])
+
+     # d-vector of the target speaker
+     target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
+     target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
+
+     # d-vector of the input speaker, computed from their reference clip
+     driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
+     driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)
+
+     # Convert the voice
+     driving_spec = compute_spec(driving_audio)
+     y_lengths = torch.tensor([driving_spec.size(-1)])
+     if USE_CUDA:
+         ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
+         ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
+     else:
+         ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
+         ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
+
+     return (ap.sample_rate, ref_wav_voc)
+
+ c3 = gr.Interface(
+     fn=voice_conversion,
+     inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Clip To Convert')],
+     outputs=gr.Audio(label='Target Speaker - Converted Clip'),
+     examples=[['ntr.wav', 'timcast1.wav', 'timcast1.wav']],
+     description="Use this cool tool to convert your voice to another person's!\nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require audio files from the person whose voice you want to convert."
+ )
+
+ c1_m2 = gr.Interface(
+     fn=voice_conversion,
+     inputs=[gr.Audio(label='Target Speaker - Reference Clip'), gr.Audio(label='Input Speaker - Reference Clip', source='microphone'), gr.Audio(label='Input Speaker - Clip To Convert', source='microphone')],
+     outputs=gr.Audio(label='Target Speaker - Converted Clip'),
+     description="Use this cool tool to convert your voice to another person's!\nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require live recordings from the person whose voice you want to convert."
+ )
+
+ demo = gr.TabbedInterface([c3, c1_m2], ["Pre-Recorded", "Microphone"], title="Voice Conversion")
+ demo.launch(debug=True)
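
For reference, `voice_conversion` above consumes and returns Gradio-style `(sample_rate, samples)` tuples, so it can also be driven without the UI. A minimal sketch, assuming the definitions in app.py are already in scope (e.g., run in the same session before `demo.launch`) and with hypothetical .wav file names:

# Hypothetical driver for voice_conversion(); the three .wav paths are placeholders.
from scipy.io.wavfile import read

ta = read("target_speaker.wav")    # target voice: (sample_rate, np.ndarray)
ra = read("input_reference.wav")   # input speaker's reference clip
da = read("clip_to_convert.wav")   # input speaker's clip to convert

sr, converted = voice_conversion(ta, ra, da)  # sr is ap.sample_rate (16000)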
best_model.pth.tar ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:017bfd8907c80bb5857d65d0223f0e4e4b9d699ef52e2a853d9cc7eb7e308cf0
+ size 379957289
config.json ADDED
@@ -0,0 +1,373 @@
+ {
+     "model": "vits",
+     "run_name": "vits_tts-portuguese",
+     "run_description": "",
+     "epochs": 1000,
+     "batch_size": 52,
+     "eval_batch_size": 52,
+     "mixed_precision": false,
+     "scheduler_after_epoch": true,
+     "run_eval": true,
+     "test_delay_epochs": -1,
+     "print_eval": true,
+     "dashboard_logger": "tensorboard",
+     "print_step": 25,
+     "plot_step": 100,
+     "model_param_stats": false,
+     "project_name": null,
+     "log_model_step": 10000,
+     "wandb_entity": null,
+     "save_step": 10000,
+     "checkpoint": true,
+     "keep_all_best": false,
+     "keep_after": 10000,
+     "num_loader_workers": 4,
+     "num_eval_loader_workers": 4,
+     "use_noise_augment": false,
+     "use_language_weighted_sampler": true,
+     "output_path": "../checkpoints/VITS-multilingual/VITS_fixes/new/new-SE/use_noise_aument_false/xlarge-ZS-PT-VCTK/pt-en+LibriTTS-fr/speaker_encoder_as_loss_9_alpha/mixed-p-false-bug-SDP-fixed/",
+     "distributed_backend": "nccl",
+     "distributed_url": "tcp://localhost:54321",
+     "audio": {
+         "fft_size": 1024,
+         "win_length": 1024,
+         "hop_length": 256,
+         "frame_shift_ms": null,
+         "frame_length_ms": null,
+         "stft_pad_mode": "reflect",
+         "sample_rate": 16000,
+         "resample": false,
+         "preemphasis": 0.0,
+         "ref_level_db": 20,
+         "do_sound_norm": false,
+         "log_func": "np.log",
+         "do_trim_silence": true,
+         "trim_db": 45,
+         "power": 1.5,
+         "griffin_lim_iters": 60,
+         "num_mels": 80,
+         "mel_fmin": 0.0,
+         "mel_fmax": null,
+         "spec_gain": 1,
+         "do_amp_to_db_linear": false,
+         "do_amp_to_db_mel": true,
+         "signal_norm": false,
+         "min_level_db": -100,
+         "symmetric_norm": true,
+         "max_norm": 4.0,
+         "clip_norm": true,
+         "stats_path": null
+     },
+     "use_phonemes": false,
+     "use_espeak_phonemes": false,
+     "phoneme_language": "pt-br",
+     "compute_input_seq_cache": false,
+     "text_cleaner": "multilingual_cleaners",
+     "enable_eos_bos_chars": false,
+     "test_sentences_file": "",
+     "phoneme_cache_path": null,
+     "characters": {
+         "pad": "_",
+         "eos": "&",
+         "bos": "*",
+         "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
+         "punctuations": "!'(),-.:;? ",
+         "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+         "unique": true
+     },
+     "batch_group_size": 0,
+     "loss_masking": null,
+     "min_seq_len": 90,
+     "max_seq_len": 270,
+     "compute_f0": false,
+     "compute_linear_spec": true,
+     "add_blank": true,
+     "datasets": [
+         {
+             "name": "vctk",
+             "path": "../../datasets/VCTK-Corpus-removed-silence_16Khz/",
+             "meta_file_train": null,
+             "ununsed_speakers": [
+                 "p225",
+                 "p234",
+                 "p238",
+                 "p245",
+                 "p248",
+                 "p261",
+                 "p294",
+                 "p302",
+                 "p326",
+                 "p335",
+                 "p347"
+             ],
+             "language": "en",
+             "meta_file_val": null,
+             "meta_file_attn_mask": ""
+         },
+         {
+             "name": "libri_tts",
+             "path": "../../datasets/LibriTTS/LibriTTS/dataset-preprocessed-clean-100-and-360/dataset-22k/",
+             "meta_file_train": "metadata_all.csv",
+             "ununsed_speakers": null,
+             "language": "en",
+             "meta_file_val": "dev-clean_500.csv",
+             "meta_file_attn_mask": ""
+         },
+         {
+             "name": "brspeech",
+             "path": "../../datasets/TTS-Portuguese-Corpus_16khz/",
+             "meta_file_train": "train_TTS-Portuguese_Corpus_metadata.csv",
+             "ununsed_speakers": null,
+             "language": "pt-br",
+             "meta_file_val": "eval_TTS-Portuguese_Corpus_metadata.csv",
+             "meta_file_attn_mask": ""
+         },
+         {
+             "name": "mailabs",
+             "path": "../../datasets/M-AILABS/fr_FR",
+             "meta_file_train": "",
+             "ununsed_speakers": null,
+             "language": "fr-fr",
+             "meta_file_val": null,
+             "meta_file_attn_mask": null
+         }
+     ],
+     "optimizer": "AdamW",
+     "optimizer_params": {
+         "betas": [
+             0.8,
+             0.99
+         ],
+         "eps": 1e-09,
+         "weight_decay": 0.01
+     },
+     "lr_scheduler": "",
+     "lr_scheduler_params": null,
+     "test_sentences": [
+         [
+             "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+             "VCTK_p225",
+             null,
+             "en"
+         ],
+         [
+             "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+             "ED",
+             null,
+             "en"
+         ],
+         [
+             "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+             "bernard",
+             null,
+             "en"
+         ],
+         [
+             "This cake is great. It's so delicious and moist.",
+             "VCTK_p234",
+             null,
+             "en"
+         ],
+         [
+             "This cake is great. It's so delicious and moist.",
+             "ED",
+             null,
+             "en"
+         ],
+         [
+             "This cake is great. It's so delicious and moist.",
+             "ezwa",
+             null,
+             "en"
+         ],
+         [
+             "Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
+             "ED",
+             null,
+             "pt-br"
+         ],
+         [
+             "Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
+             "VCTK_p238",
+             null,
+             "pt-br"
+         ],
+         [
+             "Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
+             "gilles_g_le_blanc",
+             null,
+             "pt-br"
+         ],
+         [
+             "Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
+             "ED",
+             null,
+             "pt-br"
+         ],
+         [
+             "Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
+             "VCTK_p245",
+             null,
+             "pt-br"
+         ],
+         [
+             "Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
+             "nadine_eckert_boulet",
+             null,
+             "pt-br"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "VCTK_p245",
+             null,
+             "fr-fr"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "ED",
+             null,
+             "fr-fr"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "ezwa",
+             null,
+             "fr-fr"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "bernard",
+             null,
+             "fr-fr"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "gilles_g_le_blanc",
+             null,
+             "fr-fr"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "nadine_eckert_boulet",
+             null,
+             "fr-fr"
+         ],
+         [
+             "Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
+             "zeckou",
+             null,
+             "fr-fr"
+         ]
+     ],
+     "use_speaker_embedding": true,
+     "use_d_vector_file": true,
+     "d_vector_dim": 512,
+     "model_args": {
+         "num_chars": 165,
+         "out_channels": 513,
+         "spec_segment_size": 62,
+         "hidden_channels": 192,
+         "hidden_channels_ffn_text_encoder": 768,
+         "num_heads_text_encoder": 2,
+         "num_layers_text_encoder": 10,
+         "kernel_size_text_encoder": 3,
+         "dropout_p_text_encoder": 0.1,
+         "dropout_p_duration_predictor": 0.5,
+         "kernel_size_posterior_encoder": 5,
+         "dilation_rate_posterior_encoder": 1,
+         "num_layers_posterior_encoder": 16,
+         "kernel_size_flow": 5,
+         "dilation_rate_flow": 1,
+         "num_layers_flow": 4,
+         "resblock_type_decoder": 1,
+         "resblock_kernel_sizes_decoder": [
+             3,
+             7,
+             11
+         ],
+         "resblock_dilation_sizes_decoder": [
+             [
+                 1,
+                 3,
+                 5
+             ],
+             [
+                 1,
+                 3,
+                 5
+             ],
+             [
+                 1,
+                 3,
+                 5
+             ]
+         ],
+         "upsample_rates_decoder": [
+             8,
+             8,
+             2,
+             2
+         ],
+         "upsample_initial_channel_decoder": 512,
+         "upsample_kernel_sizes_decoder": [
+             16,
+             16,
+             4,
+             4
+         ],
+         "use_sdp": true,
+         "noise_scale": 1.0,
+         "inference_noise_scale": 0.667,
+         "length_scale": 1,
+         "noise_scale_dp": 1.0,
+         "inference_noise_scale_dp": 0.8,
+         "max_inference_len": null,
+         "init_discriminator": true,
+         "use_spectral_norm_disriminator": false,
+         "use_speaker_embedding": true,
+         "num_speakers": 1244,
+         "speakers_file": null,
+         "d_vector_file": "../speaker_embeddings/new-SE/VCTK-LibriTTS+TTS-PT+MAILABS-FR/speakers.json",
+         "speaker_embedding_channels": 512,
+         "use_d_vector_file": true,
+         "d_vector_dim": 512,
+         "detach_dp_input": true,
+         "use_language_embedding": true,
+         "embedded_language_dim": 4,
+         "num_languages": 3,
+         "use_speaker_encoder_as_loss": true,
+         "speaker_encoder_config_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/config.json",
+         "speaker_encoder_model_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/converted_checkpoint.pth.tar",
+         "fine_tuning_mode": 0,
+         "freeze_encoder": false,
+         "freeze_DP": false,
+         "freeze_PE": false,
+         "freeze_flow_decoder": false,
+         "freeze_waveform_decoder": false
+     },
+     "grad_clip": [
+         5.0,
+         5.0
+     ],
+     "lr_gen": 0.0002,
+     "lr_disc": 0.0002,
+     "lr_scheduler_gen": "ExponentialLR",
+     "lr_scheduler_gen_params": {
+         "gamma": 0.999875,
+         "last_epoch": -1
+     },
+     "lr_scheduler_disc": "ExponentialLR",
+     "lr_scheduler_disc_params": {
+         "gamma": 0.999875,
+         "last_epoch": -1
+     },
+     "kl_loss_alpha": 1.0,
+     "disc_loss_alpha": 1.0,
+     "gen_loss_alpha": 1.0,
+     "feat_loss_alpha": 1.0,
+     "mel_loss_alpha": 45.0,
+     "dur_loss_alpha": 1.0,
+     "speaker_encoder_loss_alpha": 9.0,
+     "return_wav": true,
+     "r": 1
+ }
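
This config is what app.py hands to `load_config`; a minimal sketch of inspecting the inference-relevant settings, assuming the Coqui-TTS fork from requirements.txt is installed and config.json is in the working directory:

# Sketch: peek at the settings that matter at inference time.
from TTS.config import load_config

C = load_config("config.json")
print(C.audio["sample_rate"])          # 16000, the rate app.py normalizes clips to
print(C.model_args["d_vector_dim"])    # 512, matches the speaker encoder's proj_dim
print(C.model_args["num_languages"])   # 3, matches language_ids.json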
config_se.json ADDED
@@ -0,0 +1,119 @@
+ {
+     "model": "speaker_encoder",
+     "run_name": "speaker_encoder",
+     "run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
+     "epochs": 100000,
+     "batch_size": null,
+     "eval_batch_size": null,
+     "mixed_precision": false,
+     "run_eval": true,
+     "test_delay_epochs": 0,
+     "print_eval": false,
+     "print_step": 50,
+     "tb_plot_step": 100,
+     "tb_model_param_stats": false,
+     "save_step": 1000,
+     "checkpoint": true,
+     "keep_all_best": false,
+     "keep_after": 10000,
+     "num_loader_workers": 8,
+     "num_val_loader_workers": 0,
+     "use_noise_augment": false,
+     "output_path": "../checkpoints/speaker_encoder/language_balanced/normalized/angleproto-4-samples-by-speakers/",
+     "distributed_backend": "nccl",
+     "distributed_url": "tcp://localhost:54321",
+     "audio": {
+         "fft_size": 512,
+         "win_length": 400,
+         "hop_length": 160,
+         "frame_shift_ms": null,
+         "frame_length_ms": null,
+         "stft_pad_mode": "reflect",
+         "sample_rate": 16000,
+         "resample": false,
+         "preemphasis": 0.97,
+         "ref_level_db": 20,
+         "do_sound_norm": false,
+         "do_trim_silence": false,
+         "trim_db": 60,
+         "power": 1.5,
+         "griffin_lim_iters": 60,
+         "num_mels": 64,
+         "mel_fmin": 0.0,
+         "mel_fmax": 8000.0,
+         "spec_gain": 20,
+         "signal_norm": false,
+         "min_level_db": -100,
+         "symmetric_norm": false,
+         "max_norm": 4.0,
+         "clip_norm": false,
+         "stats_path": null
+     },
+     "datasets": [
+         {
+             "name": "voxceleb2",
+             "path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
+             "meta_file_train": null,
+             "ununsed_speakers": null,
+             "meta_file_val": null,
+             "meta_file_attn_mask": "",
+             "language": "voxceleb"
+         }
+     ],
+     "model_params": {
+         "model_name": "resnet",
+         "input_dim": 64,
+         "use_torch_spec": true,
+         "log_input": true,
+         "proj_dim": 512
+     },
+     "audio_augmentation": {
+         "p": 0.5,
+         "rir": {
+             "rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
+             "conv_mode": "full"
+         },
+         "additive": {
+             "sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
+             "speech": {
+                 "min_snr_in_db": 13,
+                 "max_snr_in_db": 20,
+                 "min_num_noises": 1,
+                 "max_num_noises": 1
+             },
+             "noise": {
+                 "min_snr_in_db": 0,
+                 "max_snr_in_db": 15,
+                 "min_num_noises": 1,
+                 "max_num_noises": 1
+             },
+             "music": {
+                 "min_snr_in_db": 5,
+                 "max_snr_in_db": 15,
+                 "min_num_noises": 1,
+                 "max_num_noises": 1
+             }
+         },
+         "gaussian": {
+             "p": 0.0,
+             "min_amplitude": 0.0,
+             "max_amplitude": 1e-05
+         }
+     },
+     "storage": {
+         "sample_from_storage_p": 0.5,
+         "storage_size": 40
+     },
+     "max_train_step": 1000000,
+     "loss": "angleproto",
+     "grad_clip": 3.0,
+     "lr": 0.0001,
+     "lr_decay": false,
+     "warmup_steps": 4000,
+     "wd": 1e-06,
+     "steps_plot_stats": 100,
+     "num_speakers_in_batch": 100,
+     "num_utters_per_speaker": 4,
+     "skip_speakers": true,
+     "voice_len": 2.0
+ }
language_ids.json ADDED
@@ -0,0 +1,5 @@
+ {
+     "en": 0,
+     "fr-fr": 1,
+     "pt-br": 2
+ }
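
app.py feeds this map to `model.language_manager.set_language_ids_from_file`; since it is plain JSON, a quick sanity check might look like the following sketch (assumes the file is in the working directory):

# Sketch: the language-id map is ordinary JSON with three entries.
import json

with open("language_ids.json") as f:
    language_ids = json.load(f)

assert len(language_ids) == 3    # agrees with "num_languages": 3 in config.json
print(language_ids["pt-br"])     # -> 2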
ntr.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b75c56ba7545d0a96bf6a12c02ef38edc4beded66fd4d32d1b92543045e43617
+ size 1940444
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ git+https://github.com/Edresson/Coqui-TTS@multilingual-torchaudio-SE
+ torchaudio==0.9.0
+ pydub
+ ffmpeg-normalize==1.21.0
speakers.json ADDED
The diff for this file is too large to render.
timcast1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2fb4d35e5e20c59e6deb69694da0bd403f80704e2f3d9b8d4c4d1a5b558bc6c1
+ size 1764044