# Clone the Coqui-TTS fork with the multilingual voice-conversion branch
!git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS

import os
import shutil
import gradio as gr
import sys
import string
import time
import argparse
import json
import numpy as np
import IPython
from IPython.display import Audio
import torch

TTS_PATH = "TTS/"
# add the cloned repo to the import path BEFORE importing from TTS
sys.path.append(TTS_PATH)  # set this if TTS is not installed globally

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
from TTS.utils.audio import AudioProcessor
from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *
from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
from google.colab import files
import librosa
from scipy.io.wavfile import write, read

'''
# Optional: copy a fine-tuned checkpoint in from Google Drive
from google.colab import drive
drive.mount('/content/drive')
src_path = os.path.join(os.getcwd(), 'drive', 'MyDrive', 'Colab Notebooks', 'best_model_latest.pth.tar')
dst_path = os.path.join(os.getcwd(), 'best_model.pth.tar')
shutil.copy(src_path, dst_path)
'''

# Paths definition
OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

# model vars
MODEL_PATH = 'best_model.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
USE_CUDA = torch.cuda.is_available()

# load the config
C = load_config(CONFIG_PATH)

# load the audio processor
ap = AudioProcessor(**C.audio)

speaker_embedding = None

C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
# print(model.language_manager.num_languages, model.embedded_language_dim)
# print(model.emb_l)

cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
# drop the speaker-encoder weights from the checkpoint; they are not needed at inference time
model_weights = cp['model'].copy()
for key in list(model_weights.keys()):
    if "speaker_encoder" in key:
        del model_weights[key]

model.load_state_dict(model_weights)
model.eval()

if USE_CUDA:
    model = model.cuda()

# synthesize voice
use_griffin_lim = False

# Speaker-encoder paths (these files must exist next to the notebook)
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# Load the speaker encoder
SE_speaker_manager = SpeakerManager(encoder_model_path=CHECKPOINT_SE_PATH,
                                    encoder_config_path=CONFIG_SE_PATH,
                                    use_cuda=USE_CUDA)

# Helper: load a wav and turn it into the spectrogram format the model expects
def compute_spec(ref_file):
    y, sr = librosa.load(ref_file, sr=ap.sample_rate)
    spec = ap.spectrogram(y)
    spec = torch.FloatTensor(spec).unsqueeze(0)
    return spec
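# Optional sanity check for the speaker encoder (a sketch, not part of the original
# notebook). 'spk_a.wav' and 'spk_b.wav' are placeholder file names, assumed to be
# two local clips; compute_d_vector_from_clip() is the same call used below.
# Two clips of the same speaker should score noticeably higher cosine similarity
# than clips of different speakers.
# emb_a = np.array(SE_speaker_manager.compute_d_vector_from_clip(['spk_a.wav']))
# emb_b = np.array(SE_speaker_manager.compute_d_vector_from_clip(['spk_b.wav']))
# cos_sim = np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b))
# print(f"speaker similarity: {cos_sim:.3f}")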
def voice_conversion(ta, ra, da):
    """Convert the driving clip so it sounds like the target speaker.

    ta, ra, da are (sample_rate, np.ndarray) tuples as delivered by Gradio
    audio inputs: target-speaker sample, reference sample, and driving
    (content) sample.
    """
    target_audio = 'target.wav'
    reference_audio = 'reference.wav'
    driving_audio = 'driving.wav'

    write(target_audio, ta[0], ta[1])
    write(reference_audio, ra[0], ra[1])
    write(driving_audio, da[0], da[1])

    # Normalize loudness and resample all three clips to 16 kHz in place
    !ffmpeg-normalize $target_audio -nt rms -t=-27 -o $target_audio -ar 16000 -f
    !ffmpeg-normalize $reference_audio -nt rms -t=-27 -o $reference_audio -ar 16000 -f
    !ffmpeg-normalize $driving_audio -nt rms -t=-27 -o $driving_audio -ar 16000 -f

    # d-vector of the voice we want the output to sound like
    target_emb = SE_speaker_manager.compute_d_vector_from_clip([target_audio])
    target_emb = torch.FloatTensor(target_emb).unsqueeze(0)

    # d-vector of the source voice, computed from the reference clip
    driving_emb = SE_speaker_manager.compute_d_vector_from_clip([reference_audio])
    driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)

    # Convert the voice: the driving clip supplies the content and prosody
    driving_spec = compute_spec(driving_audio)
    y_lengths = torch.tensor([driving_spec.size(-1)])
    if USE_CUDA:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec.cuda(), y_lengths.cuda(),
                                                   driving_emb.cuda(), target_emb.cuda())
        ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()
    else:
        ref_wav_voc, _, _ = model.voice_conversion(driving_spec, y_lengths,
                                                   driving_emb, target_emb)
        ref_wav_voc = ref_wav_voc.squeeze().detach().numpy()

    # print("Reference Audio after decoder:")
    # IPython.display.display(Audio(ref_wav_voc, rate=ap.sample_rate))

    return (ap.sample_rate, ref_wav_voc)


demo = gr.Interface(
    fn=voice_conversion,
    inputs=["audio", "audio", "audio"],
    outputs="audio",
)
demo.launch(debug=True, share=True)
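# Optional: a minimal sketch of calling voice_conversion() directly, without the
# Gradio UI. The file names below are placeholders, not files shipped with this
# notebook; any readable wavs will do, since ffmpeg-normalize resamples them to
# 16 kHz inside the function. scipy's read() returns the same (sample_rate, data)
# tuple shape that Gradio passes in.
# ta = read('target_speaker.wav')
# ra = read('reference_speaker.wav')
# da = read('driving_content.wav')
# sr_out, wav_out = voice_conversion(ta, ra, da)
# write(os.path.join(OUT_PATH, 'converted.wav'), sr_out, wav_out)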