# !git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS
"""Gradio demo for zero-shot voice conversion with a multilingual VITS model.

At import time the script loads the TTS model and the speaker encoder from
files in the working directory, then builds and launches a two-tab Gradio UI
(pre-recorded clips / microphone) around :func:`voice_conversion`.
"""
import os
import shutil
import sys
import string
import time
import argparse
import json
import logging
import subprocess
import tempfile

import gradio as gr
import numpy as np
# import IPython
# from IPython.display import Audio
import torch

# The local checkout must be on sys.path BEFORE anything is imported from TTS,
# otherwise this has no effect when TTS is not installed globally.
TTS_PATH = "TTS/"
sys.path.append(TTS_PATH)

from TTS.tts.utils.synthesis import synthesis  # noqa: E402
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols  # noqa: E402
from TTS.utils.audio import AudioProcessor  # noqa: E402
from TTS.tts.models import setup_model  # noqa: E402
from TTS.config import load_config  # noqa: E402
from TTS.tts.models.vits import *  # noqa: E402,F401,F403
from TTS.tts.utils.speakers import SpeakerManager  # noqa: E402
from pydub import AudioSegment  # noqa: E402
# from google.colab import files
import librosa  # noqa: E402
from scipy.io.wavfile import write, read  # noqa: E402

logger = logging.getLogger(__name__)

# Colab-only checkpoint fetch, kept for reference:
# from google.colab import drive
# drive.mount('/content/drive')
# src_path = os.path.join(os.path.join(os.path.join(os.path.join(os.getcwd(), 'drive'), 'MyDrive'), 'Colab Notebooks'), 'best_model_latest.pth.tar')
# dst_path = os.path.join(os.getcwd(), 'best_model.pth.tar')
# shutil.copy(src_path, dst_path)

# Paths definition
OUT_PATH = 'out/'

# create output path
os.makedirs(OUT_PATH, exist_ok=True)

# model vars
MODEL_PATH = 'best_model.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
USE_CUDA = torch.cuda.is_available()
# Single device handle so inference code needs no CUDA/CPU branching.
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")

# load the config
C = load_config(CONFIG_PATH)

# load the audio processor
ap = AudioProcessor(**C.audio)

speaker_embedding = None

C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)

# NOTE: torch.load unpickles the file; only load checkpoints you trust.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# remove speaker encoder weights; the encoder is loaded separately below
model_weights = cp['model'].copy()
for key in list(model_weights):
    if "speaker_encoder" in key:
        del model_weights[key]

model.load_state_dict(model_weights)
model.eval()

if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

# Paths definition
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# Load the Speaker encoder
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)


def compute_spec(ref_file):
    """Return the spectrogram of an audio file as a float tensor.

    The file is loaded with librosa at ``ap.sample_rate``, passed through
    ``ap.spectrogram`` and given a leading dimension via ``unsqueeze(0)``.
    """
    y, sr = librosa.load(ref_file, sr=ap.sample_rate)
    spec = ap.spectrogram(y)
    return torch.FloatTensor(spec).unsqueeze(0)


def _normalize_clip(path):
    """Run ``ffmpeg-normalize`` on *path*, overwriting the file in place.

    Uses the same arguments the script always passed (RMS mode, ``-t=-27``,
    ``-ar 16000``, force overwrite). A non-zero exit status is logged instead
    of being silently ignored; processing continues with the file as it is.
    """
    result = subprocess.run(
        ["ffmpeg-normalize", path, "-nt", "rms", "-t=-27", "-o", path, "-ar", "16000", "-f"]
    )
    if result.returncode != 0:
        logger.warning(
            "ffmpeg-normalize exited with status %d for %s; using the clip as written",
            result.returncode,
            path,
        )


def _clip_embedding(path):
    """Return the speaker-encoder d-vector of the clip at *path* as a float tensor."""
    d_vector = SE_speaker_manager.compute_d_vector_from_clip([path])
    return torch.FloatTensor(d_vector).unsqueeze(0)


def voice_conversion(ta, ra, da):
    """Convert the speech in *da* to the voice of the speaker in *ta*.

    Args:
        ta: target-speaker reference clip.
        ra: input-speaker reference clip (used for the source embedding).
        da: input-speaker clip whose content is converted.

    Each clip is indexed as ``clip[0]`` / ``clip[1]`` and passed to
    ``scipy.io.wavfile.write`` as rate and data, i.e. the
    ``(sample_rate, samples)`` pair a ``gr.Audio`` input is presumed to
    deliver.

    Returns:
        ``(ap.sample_rate, waveform)`` where *waveform* is a NumPy array.

    Raises:
        ValueError: if any of the three clips is ``None``.
    """
    clips = {"target": ta, "reference": ra, "driving": da}
    missing = [name for name, clip in clips.items() if clip is None]
    if missing:
        raise ValueError(f"Missing audio input(s): {', '.join(missing)}")

    # Per-request scratch directory: concurrent requests no longer overwrite
    # each other's files, and nothing is left behind in the working directory.
    with tempfile.TemporaryDirectory() as workdir:
        paths = {}
        for name, clip in clips.items():
            path = os.path.join(workdir, f"{name}.wav")
            write(path, clip[0], clip[1])
            paths[name] = path

        # !ffmpeg-normalize $clip -nt rms -t=-27 -o $clip -ar 16000 -f
        for path in paths.values():
            _normalize_clip(path)

        target_emb = _clip_embedding(paths["target"])
        # The source-speaker embedding comes from the *reference* clip,
        # not from the clip that is being converted.
        source_emb = _clip_embedding(paths["reference"])

        # Convert the voice
        driving_spec = compute_spec(paths["driving"])

    y_lengths = torch.tensor([driving_spec.size(-1)])

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        ref_wav_voc, _, _ = model.voice_conversion(
            driving_spec.to(DEVICE),
            y_lengths.to(DEVICE),
            source_emb.to(DEVICE),
            target_emb.to(DEVICE),
        )
    ref_wav_voc = ref_wav_voc.squeeze().detach().cpu().numpy()

    # print("Reference Audio after decoder:")
    # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))

    return (ap.sample_rate, ref_wav_voc)


# Tab 1: all three clips are supplied as gr.Audio inputs with the default source.
c3 = gr.Interface(
    fn=voice_conversion,
    inputs=[
        gr.Audio(label='Target Speaker - Reference Clip'),
        gr.Audio(label='Input Speaker - Reference Clip'),
        gr.Audio(label='Input Speaker - Clip To Convert'),
    ],
    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
    examples=[['ntr.wav', 'timcast1.wav', 'timcast1.wav']],
    description=(
        "Use this cool too to convert your voice to another person's! \n"
        "The first audio input requires an audio file that of the target speaker. "
        "The second and third audio inputs require audio files from the person "
        "who's voice you want to convert."
    ),
)

# Tab 2: target clip as above, input-speaker clips recorded from the microphone.
c1_m2 = gr.Interface(
    fn=voice_conversion,
    inputs=[
        gr.Audio(label='Target Speaker - Reference Clip'),
        gr.Audio(label='Input Speaker - Reference Clip', source='microphone'),
        gr.Audio(label='Input Speaker - Clip To Convert', source='microphone'),
    ],
    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
    description=(
        "Use this cool too to convert your voice to another person's! \n"
        "The first audio input requires an audio file that of the target speaker. "
        "The second and third audio inputs require live recordings from the person "
        "who's voice you want to convert."
    ),
)

demo = gr.TabbedInterface([c3, c1_m2], ["Pre-Recorded", "Microphone"], title="Voice Conversion")
# `debug` is a boolean flag; the original passed the string 'True'.
demo.launch(debug=True)