Tacotron-zero-short-voice-clone

Runtime error

App Files Files Community

Tacotron-zero-short-voice-clone / app.py

kira4424

Update app.py

72542b5 over 1 year ago

raw

history blame

3.93 kB


	import gradio as gr

	import re
	import random
	import string
	import librosa
	import numpy as np

	from pathlib import Path
	from scipy.io.wavfile import write

	from encoder import inference as encoder
	from vocoder.hifigan import inference as gan_vocoder
	from synthesizer.inference import Synthesizer

	class Mandarin:
	    """Voice-cloning pipeline built from a speaker encoder, a synthesizer and a HiFi-GAN vocoder.

	    Typical use: construct once, call ``setVoice`` with a reference
	    recording, then call ``say`` for every text to be spoken in that voice.
	    """

	    # Characters treated as sentence boundaries when cutting the input text.
	    _PUNCTUATION = "！，。、？!,.?：:"
	    # A run of boundary characters and/or newlines counts as one break.
	    _BREAKS = re.compile(r"[{}\n]+".format(_PUNCTUATION))
	    # Synthesizers keyed by checkpoint path. Kept on the class so that every
	    # instance created in this process reuses an already constructed one
	    # (a dict local to __init__ would be empty on each call and never hit).
	    _synthesizers_cache = {}

	    def __init__(self):
	        """Record the checkpoint paths and load encoder, synthesizer and vocoder."""
	        self.encoder_path = "encoder/saved_models/pretrained.pt"
	        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
	        self.config_fpath = "vocoder/hifigan/config_16k_.json"
	        self.accent = "synthesizer/saved_models/普通话.pt"

	        synthesizer = self._synthesizers_cache.get(self.accent)
	        if synthesizer is None:
	            synthesizer = Synthesizer(Path(self.accent))
	            self._synthesizers_cache[self.accent] = synthesizer
	        self.current_synt = synthesizer

	        encoder.load_model(Path(self.encoder_path))
	        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

	    def setVoice(self, timbre):
	        """Compute the speaker embedding for the reference recording.

	        Args:
	            timbre: Path of the audio file whose voice should be cloned.

	        Stores the path in ``self.timbre`` and the embedding in
	        ``self.embed``; ``say`` requires this to have been called first.
	        """
	        self.timbre = timbre
	        wav, sample_rate = librosa.load(self.timbre)

	        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
	        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

	    @staticmethod
	    def _split_sentences(text):
	        """Cut *text* at punctuation/newlines and return the non-blank, stripped pieces."""
	        if not text:
	            return []
	        # Strip before filtering so whitespace-only pieces (e.g. the " " in
	        # "，　。") are dropped instead of being synthesized as empty strings.
	        stripped = (piece.strip() for piece in Mandarin._BREAKS.split(text))
	        return [piece for piece in stripped if piece]

	    def say(self, text):
	        """Synthesize *text* in the voice previously selected with ``setVoice``.

	        Args:
	            text: Text to speak; it is split into segments at newlines and
	                at the punctuation listed in ``_PUNCTUATION``.

	        Returns:
	            ``(wav, sample_rate)`` exactly as returned by the vocoder.

	        Raises:
	            ValueError: If *text* contains nothing but punctuation and
	                whitespace, so there is nothing to synthesize.
	        """
	        segments = self._split_sentences(text)
	        if not segments:
	            # Fail early with a clear message; concatenating zero
	            # spectrograms below would raise a less helpful ValueError.
	            raise ValueError("text contains nothing to synthesize")

	        # Every segment is spoken with the same speaker embedding.
	        embeds = [self.embed] * len(segments)

	        specs = self.current_synt.synthesize_spectrograms(segments, embeds)
	        # Join the per-segment spectrograms along axis 1 into one utterance.
	        spec = np.concatenate(specs, axis=1)
	        wav, sample_rate = gan_vocoder.infer_waveform(spec)

	        return wav, sample_rate


	def greet(audio, text, voice=None):
	    """Gradio handler that keeps the voice model in session state.

	    Args:
	        audio: Uploaded reference audio; only its ``name`` attribute is used.
	        text: Text to synthesize.
	        voice: ``Mandarin`` instance carried over from an earlier call, or
	            ``None`` on the first call.

	    Returns:
	        ``(output_file, voice)``: name of the written wav file and the
	        ``Mandarin`` instance to store back into the state.
	    """
	    print(f"Log print: audio name=[{audio.name}], text=[{text}]")

	    # NOTE(review): the file's indentation was lost; the whole first-use
	    # setup is assumed to live inside this branch -- confirm.
	    if voice is None:
	        voice = Mandarin()
	        voice.setVoice(audio.name)
	        # The result of this first synthesis is discarded.
	        voice.say("加载成功")

	    waveform, rate = voice.say(text)

	    # Random 11-character stem drawn without repetition from [a-z0-9].
	    alphabet = string.ascii_lowercase + string.digits
	    stem = "".join(random.sample(alphabet, 11))
	    output_file = stem + ".wav"

	    write(output_file, rate, waveform.astype(np.float32))
	    return output_file, voice

	def new_greet(audio, text):
	    """Gradio click handler: clone the voice in *audio* and speak *text*.

	    Uses the module-level ``voice`` object, which must have been created
	    before the UI is launched.

	    Args:
	        audio: Uploaded reference audio; its ``name`` attribute is used as
	            the path of the recording.
	        text: Text to synthesize.

	    Returns:
	        Name of the wav file written to the current working directory.
	    """
	    print(f"Log print: audio name=[{audio.name}], text=[{text}]")

	    # The speaker embedding has to be computed from the submitted audio
	    # before synthesis; without this call ``voice.say`` reads an embedding
	    # that was never set and fails.
	    voice.setVoice(audio.name)
	    wav, sample_rate = voice.say(text)

	    # Random 11-character stem drawn without repetition from [a-z0-9].
	    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"

	    write(output_file, sample_rate, wav.astype(np.float32))

	    return output_file

	def main():
	    """Build and launch the ``gr.Interface`` front end around ``greet``.

	    The "state" input/output pair carries the voice object returned by
	    ``greet`` from one call to the next.
	    """
	    reference_audio = gr.inputs.Audio(type="file")
	    synthesized_audio = gr.outputs.Audio(type="file")

	    interface = gr.Interface(
	        fn=greet,
	        inputs=[reference_audio, "text", "state"],
	        outputs=[synthesized_audio, "state"],
	        title="Tacotron Zero-short Voice Clone (Chinese Version)",
	    )
	    interface.launch()

	def new_main():
	    """Build and launch the ``gr.Blocks`` front end around ``new_greet``.

	    Layout: a heading, then one row with an input column (reference audio,
	    text box, Clear/Submit buttons) and an output column (result audio).
	    """
	    # NOTE(review): nesting below is reconstructed from the statement
	    # order because the file's indentation was lost -- confirm.
	    with gr.Blocks() as app:
	        gr.Markdown("# Tacotron Zero-short Voice Clone (Chinese Version)")
	        with gr.Row():
	            with gr.Column():
	                source_audio = gr.Audio(type="file", label="Source Audio", value="exp/lihao_01.wav")
	                source_text = gr.Textbox(value="大家好，我是正在搬砖的李昊，这是一段合成音频。")
	                with gr.Row():
	                    # NOTE(review): created without any target components,
	                    # so it presumably clears nothing -- confirm.
	                    gr.ClearButton()
	                    submit_button = gr.Button(value="Submit", variant='primary')
	            with gr.Column():
	                result_audio = gr.Audio(type="file", label="Output Audio")

	        # Submit runs new_greet on (audio, text) and shows the returned file.
	        submit_button.click(
	            new_greet,
	            inputs=[source_audio, source_text],
	            outputs=[result_audio],
	        )

	    app.launch()

	if __name__ == "__main__":
	    # new_greet looks up this module-level ``voice``, so it has to exist
	    # before the UI is started.
	    voice = Mandarin()
	    # No reference voice is selected here: at start-up there is no uploaded
	    # audio to pass to setVoice, so that step is left to the handlers.
	    new_main()