|
import gradio as gr |
|
import torch |
|
import numpy as np |
|
|
|
|
|
from encoder import inference as encoder |
|
from synthesizer.inference import Synthesizer |
|
from vocoder import inference as vocoder |
|
|
|
|
|
# Paths to the pretrained SV2TTS model checkpoints (speaker encoder,
# text-to-spectrogram synthesizer, and neural vocoder).
encoder_model_path = "encoder/saved_models/pretrained.pt"

synthesizer_model_path = "synthesizer/saved_models/pretrained/pretrained.pt"

vocoder_model_path = "vocoder/saved_models/pretrained/pretrained.pt"


# Load all three models once at import time so every request reuses them.
# NOTE(review): these loaders are side-effectful (they mutate module state in
# `encoder` / `vocoder`); keep the load order and module-level placement.
encoder.load_model(encoder_model_path)

synthesizer = Synthesizer(synthesizer_model_path)

vocoder.load_model(vocoder_model_path)
|
|
|
def clone_voice(reference_audio, text):
    """Synthesize *text* in the voice of the speaker in *reference_audio*.

    Args:
        reference_audio: Gradio ``type="numpy"`` audio value — a
            ``(sample_rate, samples)`` tuple where ``samples`` is typically
            an int16 array, possibly stereo (2-D).
        text: The sentence to synthesize in the cloned voice.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for a ``gr.Audio`` output.

    Raises:
        ValueError: If no reference audio or no text was provided.
    """
    if reference_audio is None:
        raise ValueError("A reference audio clip is required.")
    if not text or not text.strip():
        raise ValueError("Text to synthesize must not be empty.")

    sample_rate, audio = reference_audio

    # Gradio delivers integer PCM (usually int16); the encoder expects a
    # float waveform in [-1, 1], so normalize by the dtype's full scale.
    if audio.dtype.kind in "iu":
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max

    # Collapse stereo/multi-channel uploads to mono before embedding.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Pass the true source sample rate so preprocess_wav can resample to the
    # rate the speaker encoder was trained on (previously the rate was
    # silently discarded, corrupting embeddings for non-matching inputs).
    preprocessed_wav = encoder.preprocess_wav(audio, source_sr=sample_rate)

    # Derive the fixed-size speaker embedding from the reference utterance.
    embed = encoder.embed_utterance(preprocessed_wav)

    # Text + embedding -> mel spectrogram (batched API; we use batch size 1).
    specs = synthesizer.synthesize_spectrograms([text], [embed])

    # Mel spectrogram -> raw waveform via the neural vocoder.
    generated_wav = vocoder.infer_waveform(specs[0])

    # Append one second of silence so playback is not clipped at the end.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    return (synthesizer.sample_rate, generated_wav)
|
|
|
|
|
# Build the Gradio UI: a reference clip plus target text in, cloned audio out.
reference_input = gr.Audio(source="upload", type="numpy", label="Voz de Referencia")
target_text_input = gr.Textbox(label="Texto a Clonar")
cloned_audio_output = gr.Audio(label="Voz Clonada")

demo = gr.Interface(
    fn=clone_voice,
    inputs=[reference_input, target_text_input],
    outputs=cloned_audio_output,
)
|
|
|
# Start the local Gradio server only when run as a script (not on import).
if __name__ == "__main__":

    demo.launch()
|
|