"""Gradio demo for voice conversion with FreeVC and YourTTS."""
import os
import sys
import subprocess

import torch
import librosa
import gradio as gr
from scipy.io.wavfile import write
from transformers import WavLMModel

TTS_PATH = "TTS/"
sys.path.append(TTS_PATH)  # set this if TTS is not installed globally

from TTS.config import load_config
from TTS.tts.models import setup_model
from TTS.tts.models.vits import *
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.audio import AudioProcessor

import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder

OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

# ---------------- YourTTS setup ----------------
TTS_SPEAKERS = "yourtts_config/speakers.json"
TTS_LANGUAGES = "yourtts_config/language_ids.json"
CONFIG_PATH = 'yourtts_config/config.json'
MODEL_PATH = 'yourtts_config/best_model.pth.tar'

C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
model_weights = cp['model'].copy()
# Drop the speaker-encoder weights; only the d-vectors are needed at inference time.
for key in list(model_weights.keys()):
    if "speaker_encoder" in key:
        del model_weights[key]
model.load_state_dict(model_weights)
model.eval()
if USE_CUDA:
    model = model.cuda()

# Speaker encoder used to compute d-vectors for YourTTS.
CONFIG_SE_PATH = "yourtts_config/config_se.json"
CHECKPOINT_SE_PATH = "yourtts_config/SE_checkpoint.pth.tar"
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH,
                                    encoder_config_path=CONFIG_SE_PATH,
                                    use_cuda=USE_CUDA)


def compute_spec(ref_file):
    """Compute a linear spectrogram for the given audio file."""
    y, sr = librosa.load(ref_file, sr=ap.sample_rate)
    spec = ap.spectrogram(y)
    spec = torch.FloatTensor(spec).unsqueeze(0)
    return spec


# ---------------- FreeVC setup ----------------
print("Loading FreeVC...")
hps = utils.get_hparams_from_file("configs/freevc.json")
freevc = SynthesizerTrn(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).to(device)
_ = freevc.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", freevc, None)
smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')

print("Loading WavLM for content...")
cmodel = utils.get_cmodel(device).to(device)
# cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)


def voice_conversion_yourtts(da, ta, normalize=False):
    """Convert the source (driving) audio `da` to the voice of the target audio `ta` with YourTTS."""
    # Loudness-normalize both clips and resample to 16 kHz.
    subprocess.run(["ffmpeg-normalize", da, "-nt", "rms", "-t=-27",
                    "-o", "source_yourtts.wav", "-ar", "16000", "-f"])
    subprocess.run(["ffmpeg-normalize", ta, "-nt", "rms", "-t=-27",
                    "-o", "target_yourtts.wav", "-ar", "16000", "-f"])

    # d-vectors for the target and driving speakers.
    target_emb = SE_speaker_manager.compute_d_vector_from_clip(["target_yourtts.wav"])
    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)
    driving_emb = SE_speaker_manager.compute_d_vector_from_clip(["source_yourtts.wav"])
    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)

    # Convert the voice.
    driving_spec = compute_spec("source_yourtts.wav")
    y_lengths = torch.tensor([driving_spec.size(-1)])
    if USE_CUDA:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(),
                                                   driving_emb.cuda(), target_emb.cuda())
        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
    else:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths,
                                                   driving_emb, target_emb)
        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()
    return (ap.sample_rate, ref_wav_voc)


def voice_conversion_freevc(src, tgt, normalize=False):
    """Convert the source audio `src` to the voice of the target audio `tgt` with FreeVC."""
    with torch.no_grad():
        # Target speaker embedding.
        subprocess.run(["ffmpeg-normalize", tgt, "-nt", "rms", "-t=-27",
                        "-o", "target_fvc.wav", "-ar", "16000", "-f"])
        wav_tgt, _ = librosa.load("target_fvc.wav", sr=hps.data.sampling_rate)
        wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
        g_tgt = smodel.embed_utterance(wav_tgt)
        g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)

        # WavLM content features of the source audio.
        subprocess.run(["ffmpeg-normalize", src, "-nt", "rms", "-t=-27",
                        "-o", "source_fvc.wav", "-ar", "16000", "-f"])
        wav_src, _ = librosa.load("source_fvc.wav", sr=hps.data.sampling_rate)
        wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
        # c = cmodel(wav_src).last_hidden_state.transpose(1, 2).to(device)
        c = utils.get_content(cmodel, wav_src)

        audio = freevc.infer(c, g=g_tgt)
        audio = audio[0][0].data.cpu().float().numpy()
        write("out.wav", hps.data.sampling_rate, audio)
        return "out.wav"


# ---------------- Gradio interface ----------------
model1 = gr.Dropdown(choices=["FreeVC", "YourTTS"], value="FreeVC", type="value", label="Model")
model2 = gr.Dropdown(choices=["FreeVC", "YourTTS"], value="FreeVC", type="value", label="Model")
audio1 = gr.inputs.Audio(label="Source Speaker - Input Audio", type='filepath')
audio2 = gr.inputs.Audio(label="Target Speaker - Input Audio", type='filepath')
microphone = gr.inputs.Audio(label="Source Speaker - Input Audio", source='microphone', type='filepath')
audio3 = gr.inputs.Audio(label="Target Speaker - Input Audio", type='filepath')
inputs_1 = [model1, audio1, audio2]
inputs_2 = [model2, microphone, audio3]
outputs_1 = gr.outputs.Audio(label="Target Speaker - Output Audio", type='filepath')
outputs_2 = gr.outputs.Audio(label="Target Speaker - Output Audio", type='filepath')


def voice_conversion(mod, sa, ta):
    """Dispatch to the selected voice-conversion model."""
    if mod == 'FreeVC':
        return voice_conversion_freevc(sa, ta)
    else:
        return voice_conversion_yourtts(sa, ta)


examples_1 = [['FreeVC', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav'],
              ['YourTTS', 'sample_inputs/ntr.wav', 'sample_inputs/timcast1.wav']]

vc_1 = gr.Interface(
    fn=voice_conversion,
    inputs=inputs_1,
    outputs=outputs_1,
    examples=examples_1,
    description="Use this cool tool to convert your voice to another person's!\n"
                "Upload wav files for the source speaker and the target speaker.\n\n"
                "This demonstration was made by T B Ramkamal as partial credit towards "
                "the completion of my Dual Degree Project."
)
vc_2 = gr.Interface(
    fn=voice_conversion,
    inputs=inputs_2,
    outputs=outputs_2,
    description="Use this cool tool to convert your voice to another person's!\n"
                "Upload a wav file for the target speaker and record the source speaker's "
                "voice with the microphone.\n\n"
                "This demonstration was made by T B Ramkamal as partial credit towards "
                "the completion of my Dual Degree Project."
)

demo = gr.TabbedInterface([vc_1, vc_2], ["wav Input", "Microphone Input"],
                          title="Voice Conversion Demo")
demo.launch(debug=True)