Spaces:
Build error
Build error
Commit
•
ec78133
0
Parent(s):
Duplicate from ramkamal2000/voice-conversion-yourtts
Browse filesCo-authored-by: Ramkamal T B <ramkamal2000@users.noreply.huggingface.co>
- .gitattributes +36 -0
- README.md +14 -0
- SE_checkpoint.pth.tar +3 -0
- app.py +180 -0
- best_model.pth.tar +3 -0
- config.json +373 -0
- config_se.json +119 -0
- language_ids.json +5 -0
- ntr.wav +3 -0
- requirements.txt +4 -0
- speakers.json +0 -0
- timcast1.wav +3 -0
.gitattributes
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Voice Conversion Yourtts
|
3 |
+
emoji: 😻
|
4 |
+
colorFrom: yellow
|
5 |
+
colorTo: green
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 3.17.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: unknown
|
11 |
+
duplicated_from: ramkamal2000/voice-conversion-yourtts
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
SE_checkpoint.pth.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f96efb20cbeeefd81fd8336d7f0155bf8902f82f9474e58ccb19d9e12345172
|
3 |
+
size 44610930
|
app.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# !git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS

import os
import shutil
import sys
import string
import time
import argparse
import json
import subprocess

import numpy as np
import torch
import librosa
import gradio as gr

from scipy.io.wavfile import write, read
from pydub import AudioSegment

# Make a local TTS checkout importable when the package is not installed
# globally.  NOTE(review): this must run before any `TTS.*` import — the
# original appended the path *after* importing TTS, which only worked
# because the pip-installed package shadowed the local checkout.
TTS_PATH = "TTS/"
sys.path.append(TTS_PATH)

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
# The original wrapped this import in a bare try/except whose fallback was
# the *same* import — collapsed to a single plain import.
from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
from TTS.tts.utils.speakers import SpeakerManager

# Output directory for generated audio.
OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

# Model artifacts (all shipped alongside this script in the Space repo).
MODEL_PATH = 'best_model.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
USE_CUDA = torch.cuda.is_available()

# Load the model config and build the audio processor from its audio section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Point the model at the local speaker d-vector file and disable the
# speaker-encoder loss (a training-only feature).
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# Load the checkpoint on CPU and drop speaker-encoder weights, which are
# not part of the inference graph.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" in key:
        del model_weights[key]

model.load_state_dict(model_weights)
model.eval()

if USE_CUDA:
    model = model.cuda()

# Griffin-Lim vocoding is unused; the VITS decoder produces the waveform.
use_griffin_lim = False

# Speaker encoder used to compute d-vectors from reference clips.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH, encoder_config_path=CONFIG_SE_PATH, use_cuda=USE_CUDA)
|
113 |
+
|
114 |
+
def compute_spec(ref_file):
    """Compute the linear spectrogram of an audio file.

    Loads *ref_file* resampled to the model's sample rate, runs it through
    the module-level ``AudioProcessor``, and returns the spectrogram as a
    ``FloatTensor`` with a leading batch dimension of 1.
    """
    waveform, _rate = librosa.load(ref_file, sr=ap.sample_rate)
    magnitude = ap.spectrogram(waveform)
    return torch.FloatTensor(magnitude).unsqueeze(0)
|
119 |
+
|
120 |
+
|
121 |
+
def voice_conversion(ta, ra, da):
    """Convert the driving clip so it sounds like the target speaker.

    Each argument is a Gradio audio value: a ``(sample_rate, data)`` tuple
    (this matches the ``write(path, rate, data)`` calls below).

    ta: reference clip of the *target* speaker (voice to imitate).
    ra: reference clip of the *input* speaker.
    da: clip of the input speaker to be converted.

    Returns a ``(sample_rate, waveform)`` tuple suitable as a Gradio
    audio output.
    """
    # Persist the three clips to fixed filenames so the external
    # normalizer and the speaker encoder can read them from disk.
    target_audio = 'target.wav'
    reference_audio = 'reference.wav'
    driving_audio = 'driving.wav'

    write(target_audio, ta[0], ta[1])
    write(reference_audio, ra[0], ra[1])
    write(driving_audio, da[0], da[1])

    # Loudness-normalize (RMS, -27 dB) and resample to 16 kHz in place
    # via the ffmpeg-normalize CLI; -f forces overwriting the input file.
    files = [target_audio, reference_audio, driving_audio]

    for file in files:
        subprocess.run(["ffmpeg-normalize", file, "-nt", "rms", "-t=-27", "-o", file, "-ar", "16000", "-f"])

    # Speaker embedding (d-vector) of the target speaker, batched to (1, D).
    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)

    # Source-speaker embedding.  NOTE(review): this is computed from the
    # *reference* clip (ra), not the driving clip — presumably intentional,
    # so the conversion is conditioned on a clean sample of the input voice.
    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)

    # Convert: feed the driving clip's spectrogram plus the two speaker
    # embeddings through the VITS voice-conversion path.
    driving_spec = compute_spec(driving_audio)
    y_lengths = torch.tensor([driving_spec.size(-1)])
    if USE_CUDA:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(), driving_emb.cuda(), target_emb.cuda())
        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
    else:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths, driving_emb, target_emb)
        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()

    # (rate, ndarray) — the format Gradio expects from an audio output.
    return (ap.sample_rate, ref_wav_voc)
|
163 |
+
|
164 |
+
# Two front-ends over voice_conversion: one taking uploaded clips, one
# recording the input speaker live from the microphone.
c3 = gr.Interface(
    fn=voice_conversion,
    inputs=[
        gr.Audio(label='Target Speaker - Reference Clip'),
        gr.Audio(label='Input Speaker - Reference Clip'),
        gr.Audio(label='Input Speaker - Clip To Convert'),
    ],
    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
    examples=[['ntr.wav', 'timcast1.wav', 'timcast1.wav']],
    # Typos fixed: "cool too" -> "cool tool", "file that of" -> "file of",
    # "who's" -> "whose".
    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require audio files from the person whose voice you want to convert."
)

c1_m2 = gr.Interface(
    fn=voice_conversion,
    inputs=[
        gr.Audio(label='Target Speaker - Reference Clip'),
        gr.Audio(label='Input Speaker - Reference Clip', source='microphone'),
        gr.Audio(label='Input Speaker - Clip To Convert', source='microphone'),
    ],
    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require live recordings from the person whose voice you want to convert."
)

demo = gr.TabbedInterface([c3, c1_m2], ["Pre-Recorded", "Microphone"], title="Voice Conversion")
# Fix: the original passed the *string* 'True'; launch's debug flag is a bool.
demo.launch(debug=True)
|
best_model.pth.tar
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:017bfd8907c80bb5857d65d0223f0e4e4b9d699ef52e2a853d9cc7eb7e308cf0
|
3 |
+
size 379957289
|
config.json
ADDED
@@ -0,0 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model": "vits",
|
3 |
+
"run_name": "vits_tts-portuguese",
|
4 |
+
"run_description": "",
|
5 |
+
"epochs": 1000,
|
6 |
+
"batch_size": 52,
|
7 |
+
"eval_batch_size": 52,
|
8 |
+
"mixed_precision": false,
|
9 |
+
"scheduler_after_epoch": true,
|
10 |
+
"run_eval": true,
|
11 |
+
"test_delay_epochs": -1,
|
12 |
+
"print_eval": true,
|
13 |
+
"dashboard_logger": "tensorboard",
|
14 |
+
"print_step": 25,
|
15 |
+
"plot_step": 100,
|
16 |
+
"model_param_stats": false,
|
17 |
+
"project_name": null,
|
18 |
+
"log_model_step": 10000,
|
19 |
+
"wandb_entity": null,
|
20 |
+
"save_step": 10000,
|
21 |
+
"checkpoint": true,
|
22 |
+
"keep_all_best": false,
|
23 |
+
"keep_after": 10000,
|
24 |
+
"num_loader_workers": 4,
|
25 |
+
"num_eval_loader_workers": 4,
|
26 |
+
"use_noise_augment": false,
|
27 |
+
"use_language_weighted_sampler": true,
|
28 |
+
"output_path": "../checkpoints/VITS-multilingual/VITS_fixes/new/new-SE/use_noise_aument_false/xlarge-ZS-PT-VCTK/pt-en+LibriTTS-fr/speaker_encoder_as_loss_9_alpha/mixed-p-false-bug-SDP-fixed/",
|
29 |
+
"distributed_backend": "nccl",
|
30 |
+
"distributed_url": "tcp://localhost:54321",
|
31 |
+
"audio": {
|
32 |
+
"fft_size": 1024,
|
33 |
+
"win_length": 1024,
|
34 |
+
"hop_length": 256,
|
35 |
+
"frame_shift_ms": null,
|
36 |
+
"frame_length_ms": null,
|
37 |
+
"stft_pad_mode": "reflect",
|
38 |
+
"sample_rate": 16000,
|
39 |
+
"resample": false,
|
40 |
+
"preemphasis": 0.0,
|
41 |
+
"ref_level_db": 20,
|
42 |
+
"do_sound_norm": false,
|
43 |
+
"log_func": "np.log",
|
44 |
+
"do_trim_silence": true,
|
45 |
+
"trim_db": 45,
|
46 |
+
"power": 1.5,
|
47 |
+
"griffin_lim_iters": 60,
|
48 |
+
"num_mels": 80,
|
49 |
+
"mel_fmin": 0.0,
|
50 |
+
"mel_fmax": null,
|
51 |
+
"spec_gain": 1,
|
52 |
+
"do_amp_to_db_linear": false,
|
53 |
+
"do_amp_to_db_mel": true,
|
54 |
+
"signal_norm": false,
|
55 |
+
"min_level_db": -100,
|
56 |
+
"symmetric_norm": true,
|
57 |
+
"max_norm": 4.0,
|
58 |
+
"clip_norm": true,
|
59 |
+
"stats_path": null
|
60 |
+
},
|
61 |
+
"use_phonemes": false,
|
62 |
+
"use_espeak_phonemes": false,
|
63 |
+
"phoneme_language": "pt-br",
|
64 |
+
"compute_input_seq_cache": false,
|
65 |
+
"text_cleaner": "multilingual_cleaners",
|
66 |
+
"enable_eos_bos_chars": false,
|
67 |
+
"test_sentences_file": "",
|
68 |
+
"phoneme_cache_path": null,
|
69 |
+
"characters": {
|
70 |
+
"pad": "_",
|
71 |
+
"eos": "&",
|
72 |
+
"bos": "*",
|
73 |
+
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
|
74 |
+
"punctuations": "!'(),-.:;? ",
|
75 |
+
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
76 |
+
"unique": true
|
77 |
+
},
|
78 |
+
"batch_group_size": 0,
|
79 |
+
"loss_masking": null,
|
80 |
+
"min_seq_len": 90,
|
81 |
+
"max_seq_len": 270,
|
82 |
+
"compute_f0": false,
|
83 |
+
"compute_linear_spec": true,
|
84 |
+
"add_blank": true,
|
85 |
+
"datasets": [
|
86 |
+
{
|
87 |
+
"name": "vctk",
|
88 |
+
"path": "../../datasets/VCTK-Corpus-removed-silence_16Khz/",
|
89 |
+
"meta_file_train": null,
|
90 |
+
"ununsed_speakers": [
|
91 |
+
"p225",
|
92 |
+
"p234",
|
93 |
+
"p238",
|
94 |
+
"p245",
|
95 |
+
"p248",
|
96 |
+
"p261",
|
97 |
+
"p294",
|
98 |
+
"p302",
|
99 |
+
"p326",
|
100 |
+
"p335",
|
101 |
+
"p347"
|
102 |
+
],
|
103 |
+
"language": "en",
|
104 |
+
"meta_file_val": null,
|
105 |
+
"meta_file_attn_mask": ""
|
106 |
+
},
|
107 |
+
{
|
108 |
+
"name": "libri_tts",
|
109 |
+
"path": "../../datasets/LibriTTS/LibriTTS/dataset-preprocessed-clean-100-and-360/dataset-22k/",
|
110 |
+
"meta_file_train": "metadata_all.csv",
|
111 |
+
"ununsed_speakers": null,
|
112 |
+
"language": "en",
|
113 |
+
"meta_file_val": "dev-clean_500.csv",
|
114 |
+
"meta_file_attn_mask": ""
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"name": "brspeech",
|
118 |
+
"path": "../../datasets/TTS-Portuguese-Corpus_16khz/",
|
119 |
+
"meta_file_train": "train_TTS-Portuguese_Corpus_metadata.csv",
|
120 |
+
"ununsed_speakers": null,
|
121 |
+
"language": "pt-br",
|
122 |
+
"meta_file_val": "eval_TTS-Portuguese_Corpus_metadata.csv",
|
123 |
+
"meta_file_attn_mask": ""
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"name": "mailabs",
|
127 |
+
"path": "../../datasets/M-AILABS/fr_FR",
|
128 |
+
"meta_file_train": "",
|
129 |
+
"ununsed_speakers": null,
|
130 |
+
"language": "fr-fr",
|
131 |
+
"meta_file_val": null,
|
132 |
+
"meta_file_attn_mask": null
|
133 |
+
}
|
134 |
+
],
|
135 |
+
"optimizer": "AdamW",
|
136 |
+
"optimizer_params": {
|
137 |
+
"betas": [
|
138 |
+
0.8,
|
139 |
+
0.99
|
140 |
+
],
|
141 |
+
"eps": 1e-09,
|
142 |
+
"weight_decay": 0.01
|
143 |
+
},
|
144 |
+
"lr_scheduler": "",
|
145 |
+
"lr_scheduler_params": null,
|
146 |
+
"test_sentences": [
|
147 |
+
[
|
148 |
+
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
149 |
+
"VCTK_p225",
|
150 |
+
null,
|
151 |
+
"en"
|
152 |
+
],
|
153 |
+
[
|
154 |
+
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
155 |
+
"ED",
|
156 |
+
null,
|
157 |
+
"en"
|
158 |
+
],
|
159 |
+
[
|
160 |
+
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
|
161 |
+
"bernard",
|
162 |
+
null,
|
163 |
+
"en"
|
164 |
+
],
|
165 |
+
[
|
166 |
+
"This cake is great. It's so delicious and moist.",
|
167 |
+
"VCTK_p234",
|
168 |
+
null,
|
169 |
+
"en"
|
170 |
+
],
|
171 |
+
[
|
172 |
+
"This cake is great. It's so delicious and moist.",
|
173 |
+
"ED",
|
174 |
+
null,
|
175 |
+
"en"
|
176 |
+
],
|
177 |
+
[
|
178 |
+
"This cake is great. It's so delicious and moist.",
|
179 |
+
"ezwa",
|
180 |
+
null,
|
181 |
+
"en"
|
182 |
+
],
|
183 |
+
[
|
184 |
+
"Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
|
185 |
+
"ED",
|
186 |
+
null,
|
187 |
+
"pt-br"
|
188 |
+
],
|
189 |
+
[
|
190 |
+
"Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
|
191 |
+
"VCTK_p238",
|
192 |
+
null,
|
193 |
+
"pt-br"
|
194 |
+
],
|
195 |
+
[
|
196 |
+
"Hoje \u00e9 fundamental encontrar a raz\u00e3o da exist\u00eancia humana.",
|
197 |
+
"gilles_g_le_blanc",
|
198 |
+
null,
|
199 |
+
"pt-br"
|
200 |
+
],
|
201 |
+
[
|
202 |
+
"Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
|
203 |
+
"ED",
|
204 |
+
null,
|
205 |
+
"pt-br"
|
206 |
+
],
|
207 |
+
[
|
208 |
+
"Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
|
209 |
+
"VCTK_p245",
|
210 |
+
null,
|
211 |
+
"pt-br"
|
212 |
+
],
|
213 |
+
[
|
214 |
+
"Em muitas cidades a popula\u00e7\u00e3o est\u00e1 diminuindo.",
|
215 |
+
"nadine_eckert_boulet",
|
216 |
+
null,
|
217 |
+
"pt-br"
|
218 |
+
],
|
219 |
+
[
|
220 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
221 |
+
"VCTK_p245",
|
222 |
+
null,
|
223 |
+
"fr-fr"
|
224 |
+
],
|
225 |
+
[
|
226 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
227 |
+
"ED",
|
228 |
+
null,
|
229 |
+
"fr-fr"
|
230 |
+
],
|
231 |
+
[
|
232 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
233 |
+
"ezwa",
|
234 |
+
null,
|
235 |
+
"fr-fr"
|
236 |
+
],
|
237 |
+
[
|
238 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
239 |
+
"bernard",
|
240 |
+
null,
|
241 |
+
"fr-fr"
|
242 |
+
],
|
243 |
+
[
|
244 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
245 |
+
"gilles_g_le_blanc",
|
246 |
+
null,
|
247 |
+
"fr-fr"
|
248 |
+
],
|
249 |
+
[
|
250 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
251 |
+
"nadine_eckert_boulet",
|
252 |
+
null,
|
253 |
+
"fr-fr"
|
254 |
+
],
|
255 |
+
[
|
256 |
+
"Il m'a fallu beaucoup de temps pour d\u00e9velopper une voix, et maintenant que je l'ai, je ne vais pas me taire.",
|
257 |
+
"zeckou",
|
258 |
+
null,
|
259 |
+
"fr-fr"
|
260 |
+
]
|
261 |
+
],
|
262 |
+
"use_speaker_embedding": true,
|
263 |
+
"use_d_vector_file": true,
|
264 |
+
"d_vector_dim": 512,
|
265 |
+
"model_args": {
|
266 |
+
"num_chars": 165,
|
267 |
+
"out_channels": 513,
|
268 |
+
"spec_segment_size": 62,
|
269 |
+
"hidden_channels": 192,
|
270 |
+
"hidden_channels_ffn_text_encoder": 768,
|
271 |
+
"num_heads_text_encoder": 2,
|
272 |
+
"num_layers_text_encoder": 10,
|
273 |
+
"kernel_size_text_encoder": 3,
|
274 |
+
"dropout_p_text_encoder": 0.1,
|
275 |
+
"dropout_p_duration_predictor": 0.5,
|
276 |
+
"kernel_size_posterior_encoder": 5,
|
277 |
+
"dilation_rate_posterior_encoder": 1,
|
278 |
+
"num_layers_posterior_encoder": 16,
|
279 |
+
"kernel_size_flow": 5,
|
280 |
+
"dilation_rate_flow": 1,
|
281 |
+
"num_layers_flow": 4,
|
282 |
+
"resblock_type_decoder": 1,
|
283 |
+
"resblock_kernel_sizes_decoder": [
|
284 |
+
3,
|
285 |
+
7,
|
286 |
+
11
|
287 |
+
],
|
288 |
+
"resblock_dilation_sizes_decoder": [
|
289 |
+
[
|
290 |
+
1,
|
291 |
+
3,
|
292 |
+
5
|
293 |
+
],
|
294 |
+
[
|
295 |
+
1,
|
296 |
+
3,
|
297 |
+
5
|
298 |
+
],
|
299 |
+
[
|
300 |
+
1,
|
301 |
+
3,
|
302 |
+
5
|
303 |
+
]
|
304 |
+
],
|
305 |
+
"upsample_rates_decoder": [
|
306 |
+
8,
|
307 |
+
8,
|
308 |
+
2,
|
309 |
+
2
|
310 |
+
],
|
311 |
+
"upsample_initial_channel_decoder": 512,
|
312 |
+
"upsample_kernel_sizes_decoder": [
|
313 |
+
16,
|
314 |
+
16,
|
315 |
+
4,
|
316 |
+
4
|
317 |
+
],
|
318 |
+
"use_sdp": true,
|
319 |
+
"noise_scale": 1.0,
|
320 |
+
"inference_noise_scale": 0.667,
|
321 |
+
"length_scale": 1,
|
322 |
+
"noise_scale_dp": 1.0,
|
323 |
+
"inference_noise_scale_dp": 0.8,
|
324 |
+
"max_inference_len": null,
|
325 |
+
"init_discriminator": true,
|
326 |
+
"use_spectral_norm_disriminator": false,
|
327 |
+
"use_speaker_embedding": true,
|
328 |
+
"num_speakers": 1244,
|
329 |
+
"speakers_file": null,
|
330 |
+
"d_vector_file": "../speaker_embeddings/new-SE/VCTK-LibriTTS+TTS-PT+MAILABS-FR/speakers.json",
|
331 |
+
"speaker_embedding_channels": 512,
|
332 |
+
"use_d_vector_file": true,
|
333 |
+
"d_vector_dim": 512,
|
334 |
+
"detach_dp_input": true,
|
335 |
+
"use_language_embedding": true,
|
336 |
+
"embedded_language_dim": 4,
|
337 |
+
"num_languages": 3,
|
338 |
+
"use_speaker_encoder_as_loss": true,
|
339 |
+
"speaker_encoder_config_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/config.json",
|
340 |
+
"speaker_encoder_model_path": "../checkpoints/Speaker_Encoder/Resnet-original-paper/converted_checkpoint.pth.tar",
|
341 |
+
"fine_tuning_mode": 0,
|
342 |
+
"freeze_encoder": false,
|
343 |
+
"freeze_DP": false,
|
344 |
+
"freeze_PE": false,
|
345 |
+
"freeze_flow_decoder": false,
|
346 |
+
"freeze_waveform_decoder": false
|
347 |
+
},
|
348 |
+
"grad_clip": [
|
349 |
+
5.0,
|
350 |
+
5.0
|
351 |
+
],
|
352 |
+
"lr_gen": 0.0002,
|
353 |
+
"lr_disc": 0.0002,
|
354 |
+
"lr_scheduler_gen": "ExponentialLR",
|
355 |
+
"lr_scheduler_gen_params": {
|
356 |
+
"gamma": 0.999875,
|
357 |
+
"last_epoch": -1
|
358 |
+
},
|
359 |
+
"lr_scheduler_disc": "ExponentialLR",
|
360 |
+
"lr_scheduler_disc_params": {
|
361 |
+
"gamma": 0.999875,
|
362 |
+
"last_epoch": -1
|
363 |
+
},
|
364 |
+
"kl_loss_alpha": 1.0,
|
365 |
+
"disc_loss_alpha": 1.0,
|
366 |
+
"gen_loss_alpha": 1.0,
|
367 |
+
"feat_loss_alpha": 1.0,
|
368 |
+
"mel_loss_alpha": 45.0,
|
369 |
+
"dur_loss_alpha": 1.0,
|
370 |
+
"speaker_encoder_loss_alpha": 9.0,
|
371 |
+
"return_wav": true,
|
372 |
+
"r": 1
|
373 |
+
}
|
config_se.json
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"model": "speaker_encoder",
|
3 |
+
"run_name": "speaker_encoder",
|
4 |
+
"run_description": "resnet speaker encoder trained with commonvoice all languages dev and train, Voxceleb 1 dev and Voxceleb 2 dev",
|
5 |
+
"epochs": 100000,
|
6 |
+
"batch_size": null,
|
7 |
+
"eval_batch_size": null,
|
8 |
+
"mixed_precision": false,
|
9 |
+
"run_eval": true,
|
10 |
+
"test_delay_epochs": 0,
|
11 |
+
"print_eval": false,
|
12 |
+
"print_step": 50,
|
13 |
+
"tb_plot_step": 100,
|
14 |
+
"tb_model_param_stats": false,
|
15 |
+
"save_step": 1000,
|
16 |
+
"checkpoint": true,
|
17 |
+
"keep_all_best": false,
|
18 |
+
"keep_after": 10000,
|
19 |
+
"num_loader_workers": 8,
|
20 |
+
"num_val_loader_workers": 0,
|
21 |
+
"use_noise_augment": false,
|
22 |
+
"output_path": "../checkpoints/speaker_encoder/language_balanced/normalized/angleproto-4-samples-by-speakers/",
|
23 |
+
"distributed_backend": "nccl",
|
24 |
+
"distributed_url": "tcp://localhost:54321",
|
25 |
+
"audio": {
|
26 |
+
"fft_size": 512,
|
27 |
+
"win_length": 400,
|
28 |
+
"hop_length": 160,
|
29 |
+
"frame_shift_ms": null,
|
30 |
+
"frame_length_ms": null,
|
31 |
+
"stft_pad_mode": "reflect",
|
32 |
+
"sample_rate": 16000,
|
33 |
+
"resample": false,
|
34 |
+
"preemphasis": 0.97,
|
35 |
+
"ref_level_db": 20,
|
36 |
+
"do_sound_norm": false,
|
37 |
+
"do_trim_silence": false,
|
38 |
+
"trim_db": 60,
|
39 |
+
"power": 1.5,
|
40 |
+
"griffin_lim_iters": 60,
|
41 |
+
"num_mels": 64,
|
42 |
+
"mel_fmin": 0.0,
|
43 |
+
"mel_fmax": 8000.0,
|
44 |
+
"spec_gain": 20,
|
45 |
+
"signal_norm": false,
|
46 |
+
"min_level_db": -100,
|
47 |
+
"symmetric_norm": false,
|
48 |
+
"max_norm": 4.0,
|
49 |
+
"clip_norm": false,
|
50 |
+
"stats_path": null
|
51 |
+
},
|
52 |
+
"datasets": [
|
53 |
+
{
|
54 |
+
"name": "voxceleb2",
|
55 |
+
"path": "/workspace/scratch/ecasanova/datasets/VoxCeleb/vox2_dev_aac/",
|
56 |
+
"meta_file_train": null,
|
57 |
+
"ununsed_speakers": null,
|
58 |
+
"meta_file_val": null,
|
59 |
+
"meta_file_attn_mask": "",
|
60 |
+
"language": "voxceleb"
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"model_params": {
|
64 |
+
"model_name": "resnet",
|
65 |
+
"input_dim": 64,
|
66 |
+
"use_torch_spec": true,
|
67 |
+
"log_input": true,
|
68 |
+
"proj_dim": 512
|
69 |
+
},
|
70 |
+
"audio_augmentation": {
|
71 |
+
"p": 0.5,
|
72 |
+
"rir": {
|
73 |
+
"rir_path": "/workspace/store/ecasanova/ComParE/RIRS_NOISES/simulated_rirs/",
|
74 |
+
"conv_mode": "full"
|
75 |
+
},
|
76 |
+
"additive": {
|
77 |
+
"sounds_path": "/workspace/store/ecasanova/ComParE/musan/",
|
78 |
+
"speech": {
|
79 |
+
"min_snr_in_db": 13,
|
80 |
+
"max_snr_in_db": 20,
|
81 |
+
"min_num_noises": 1,
|
82 |
+
"max_num_noises": 1
|
83 |
+
},
|
84 |
+
"noise": {
|
85 |
+
"min_snr_in_db": 0,
|
86 |
+
"max_snr_in_db": 15,
|
87 |
+
"min_num_noises": 1,
|
88 |
+
"max_num_noises": 1
|
89 |
+
},
|
90 |
+
"music": {
|
91 |
+
"min_snr_in_db": 5,
|
92 |
+
"max_snr_in_db": 15,
|
93 |
+
"min_num_noises": 1,
|
94 |
+
"max_num_noises": 1
|
95 |
+
}
|
96 |
+
},
|
97 |
+
"gaussian": {
|
98 |
+
"p": 0.0,
|
99 |
+
"min_amplitude": 0.0,
|
100 |
+
"max_amplitude": 1e-05
|
101 |
+
}
|
102 |
+
},
|
103 |
+
"storage": {
|
104 |
+
"sample_from_storage_p": 0.5,
|
105 |
+
"storage_size": 40
|
106 |
+
},
|
107 |
+
"max_train_step": 1000000,
|
108 |
+
"loss": "angleproto",
|
109 |
+
"grad_clip": 3.0,
|
110 |
+
"lr": 0.0001,
|
111 |
+
"lr_decay": false,
|
112 |
+
"warmup_steps": 4000,
|
113 |
+
"wd": 1e-06,
|
114 |
+
"steps_plot_stats": 100,
|
115 |
+
"num_speakers_in_batch": 100,
|
116 |
+
"num_utters_per_speaker": 4,
|
117 |
+
"skip_speakers": true,
|
118 |
+
"voice_len": 2.0
|
119 |
+
}
|
language_ids.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"en": 0,
|
3 |
+
"fr-fr": 1,
|
4 |
+
"pt-br": 2
|
5 |
+
}
|
ntr.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b75c56ba7545d0a96bf6a12c02ef38edc4beded66fd4d32d1b92543045e43617
|
3 |
+
size 1940444
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/Edresson/Coqui-TTS@multilingual-torchaudio-SE
|
2 |
+
torchaudio==0.9.0
|
3 |
+
pydub
|
4 |
+
ffmpeg-normalize==1.21.0
|
speakers.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
timcast1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2fb4d35e5e20c59e6deb69694da0bd403f80704e2f3d9b8d4c4d1a5b558bc6c1
|
3 |
+
size 1764044
|