voice-cloning-yourtts

Build error

App Files Files Community

voice-cloning-yourtts / app.py

ramkamal2000

title

c4baa74 almost 2 years ago

raw

history blame

5.69 kB

	# !git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS

	import os
	import shutil
	import gradio as gr

	import sys

	import string
	import time
	import argparse
	import json

	import numpy as np
	# import IPython
	# from IPython.display import Audio

	import torch

	from TTS.tts.utils.synthesis import synthesis
	from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
	# A single plain import: the previous try/except retried the exact same
	# import in its bare `except:` branch, so it could never recover from a
	# failure and only obscured the real ImportError.
	from TTS.utils.audio import AudioProcessor


	from TTS.tts.models import setup_model
	from TTS.config import load_config
	from TTS.tts.models.vits import *

	from TTS.tts.utils.speakers import SpeakerManager
	from pydub import AudioSegment

	# from google.colab import files
	import librosa

	from scipy.io.wavfile import write, read

	import subprocess

	'''
	from google.colab import drive
	drive.mount('/content/drive')

	src_path = os.path.join(os.path.join(os.path.join(os.path.join(os.getcwd(), 'drive'), 'MyDrive'), 'Colab Notebooks'), 'best_model_latest.pth.tar')
	dst_path = os.path.join(os.getcwd(), 'best_model.pth.tar')

	shutil.copy(src_path, dst_path)
	'''

	# ---------------------------------------------------------------------
	# Filesystem locations
	# ---------------------------------------------------------------------
	TTS_PATH = "TTS/"

	# Put the local TTS checkout on the module search path (only matters when
	# TTS is not installed globally).
	sys.path.append(TTS_PATH)

	OUT_PATH = 'out/'

	# Ensure the output directory is present; an existing one is not an error.
	os.makedirs(OUT_PATH, exist_ok=True)

	# ---------------------------------------------------------------------
	# Model files and runtime flags
	# ---------------------------------------------------------------------
	MODEL_PATH = 'best_model.pth.tar'
	CONFIG_PATH = 'config.json'
	TTS_LANGUAGES = "language_ids.json"
	TTS_SPEAKERS = "speakers.json"
	USE_CUDA = torch.cuda.is_available()

	# Configuration object, and the audio processor built from its `audio`
	# section.
	C = load_config(CONFIG_PATH)
	ap = AudioProcessor(**C.audio)

	speaker_embedding = None

	# Override two model arguments before the model is constructed.
	C.model_args['d_vector_file'] = TTS_SPEAKERS
	C.model_args['use_speaker_encoder_as_loss'] = False

	model = setup_model(C)
	model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

	# The checkpoint is always deserialised onto the CPU; the model is moved to
	# the GPU afterwards when one is available.
	cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

	# Work on a copy of the stored weights and drop every entry whose key
	# mentions the speaker encoder before loading them into the model.
	model_weights = cp['model'].copy()
	encoder_keys = [key for key in model_weights if "speaker_encoder" in key]
	for key in encoder_keys:
	    del model_weights[key]

	model.load_state_dict(model_weights)
	model.eval()

	if USE_CUDA:
	    model = model.cuda()

	# Synthesis flag; nothing in the visible code reads it.
	use_griffin_lim = False

	# ---------------------------------------------------------------------
	# Speaker encoder
	# ---------------------------------------------------------------------
	CONFIG_SE_PATH = "config_se.json"
	CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

	SE_speaker_manager = SpeakerManager(
	    encoder_model_path=CHECKPOINT_SE_PATH,
	    encoder_config_path=CONFIG_SE_PATH,
	    use_cuda=USE_CUDA,
	)

	# Define helper function

	def compute_spec(ref_file):
	    """Return the spectrogram of an audio file as a batched float tensor.

	    The file is loaded with ``librosa.load`` at ``ap.sample_rate``, passed
	    through ``ap.spectrogram``, converted to a ``torch.FloatTensor`` and
	    given an extra leading dimension of size 1.

	    Args:
	        ref_file: Path of the audio file to load.

	    Returns:
	        The spectrogram tensor with the added leading dimension.
	    """
	    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
	    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)


	def voice_conversion(ta, ra, da):
	    """Convert the driving clip so that it is spoken in the target voice.

	    Each clip is written to a WAV file, normalised in place with the
	    ``ffmpeg-normalize`` command line tool (RMS, -27, resampled to 16000),
	    and then fed to the speaker encoder / voice-conversion model.

	    Args:
	        ta: Target speaker reference clip; ``ta[0]`` is used as the sample
	            rate and ``ta[1]`` as the sample data when writing the WAV.
	        ra: Input speaker reference clip, same layout as ``ta``.
	        da: Input speaker clip to convert, same layout as ``ta``.

	    Returns:
	        A ``(ap.sample_rate, waveform)`` tuple where ``waveform`` is the
	        squeezed model output converted to a NumPy array.

	    Raises:
	        ValueError: If any of the three clips is ``None``.
	        subprocess.CalledProcessError: If ``ffmpeg-normalize`` exits with a
	            non-zero status.
	    """
	    # Local import so this function stays self-contained.
	    import tempfile

	    clips = {'target': ta, 'reference': ra, 'driving': da}

	    # Fail early with a readable message instead of an opaque
	    # "'NoneType' object is not subscriptable" further down.
	    missing = [name for name, clip in clips.items() if clip is None]
	    if missing:
	        raise ValueError("Missing audio input(s): " + ", ".join(missing))

	    # A private scratch directory per call: fixed file names in the working
	    # directory would be overwritten by any request running concurrently.
	    with tempfile.TemporaryDirectory() as workdir:
	        paths = {}
	        for name, clip in clips.items():
	            path = os.path.join(workdir, name + '.wav')
	            write(path, clip[0], clip[1])
	            # check=True: do not continue silently with un-normalised audio
	            # when the external tool fails.
	            subprocess.run(
	                ["ffmpeg-normalize", path, "-nt", "rms", "-t=-27", "-o", path, "-ar", "16000", "-f"],
	                check=True,
	            )
	            paths[name] = path

	        target_emb = SE_speaker_manager.compute_d_vector_from_clip([paths['target']])
	        target_emb = torch.FloatTensor(target_emb).unsqueeze(0)

	        # The embedding of the input (driving) speaker is taken from that
	        # speaker's reference clip, not from the clip being converted.
	        driving_emb = SE_speaker_manager.compute_d_vector_from_clip([paths['reference']])
	        driving_emb = torch.FloatTensor(driving_emb).unsqueeze(0)

	        driving_spec = compute_spec(paths['driving'])

	    # Convert the voice
	    y_lengths = torch.tensor([driving_spec.size(-1)])
	    model_inputs = (driving_spec, y_lengths, driving_emb, target_emb)
	    if USE_CUDA:
	        model_inputs = tuple(tensor.cuda() for tensor in model_inputs)

	    # Pure inference: no autograd graph is needed.
	    with torch.no_grad():
	        ref_wav_voc, _, _ = model.voice_conversion(*model_inputs)

	    # .cpu() is a no-op for tensors that are already on the CPU, so a single
	    # code path serves both devices.
	    ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()

	    return (ap.sample_rate, ref_wav_voc)

	# "Pre-Recorded" tab: three audio inputs with no explicit source, plus one
	# example row of bundled clips.
	c3 = gr.Interface(
	    fn=voice_conversion,
	    inputs=[
	        gr.Audio(label='Target Speaker - Reference Clip'),
	        gr.Audio(label='Input Speaker - Reference Clip'),
	        gr.Audio(label='Input Speaker - Clip To Convert'),
	    ],
	    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
	    examples=[['ntr.wav', 'timcast1.wav', 'timcast1.wav']],
	    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require audio files from the person whose voice you want to convert.",
	)

	# "Microphone" tab: same conversion function, but the two input-speaker
	# clips are taken from the microphone.
	c1_m2 = gr.Interface(
	    fn=voice_conversion,
	    inputs=[
	        gr.Audio(label='Target Speaker - Reference Clip'),
	        gr.Audio(label='Input Speaker - Reference Clip', source='microphone'),
	        gr.Audio(label='Input Speaker - Clip To Convert', source='microphone'),
	    ],
	    outputs=gr.Audio(label='Target Speaker - Converted Clip'),
	    description="Use this cool tool to convert your voice to another person's! \nThe first audio input requires an audio file of the target speaker. The second and third audio inputs require live recordings from the person whose voice you want to convert.",
	)

	demo = gr.TabbedInterface([c3, c1_m2], ["Pre-Recorded", "Microphone"], title="Voice Conversion")

	# `debug` is a boolean flag. The previous string 'True' only behaved as
	# intended because any non-empty string is truthy ('False' would have
	# enabled debug mode too).
	demo.launch(debug=True)