import random
import re
import string
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
from scipy.io.wavfile import write

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder.hifigan import inference as gan_vocoder


class Mandarin:
    """Zero-shot Mandarin voice cloning pipeline.

    Wires together a speaker encoder, a Tacotron synthesizer and a HiFi-GAN
    vocoder. Call setVoice() with a reference recording, then say() to
    synthesize arbitrary text in that speaker's timbre.
    """

    # Class-level cache keyed by synthesizer checkpoint path, so repeated
    # Mandarin() instantiations reuse the already-loaded synthesizer.
    # NOTE: the original kept this dict local to __init__, so it was rebuilt
    # empty on every call and the cache could never hit — that was the bug.
    _synthesizers_cache = {}

    def __init__(self):
        # Checkpoint locations are fixed relative to the working directory.
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"

        synt = self._synthesizers_cache.get(self.accent)
        if synt is None:
            synt = Synthesizer(Path(self.accent))
            self._synthesizers_cache[self.accent] = synt
        self.current_synt = synt

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        """Compute and store the speaker embedding for the audio file *timbre*.

        *timbre* is a path to the reference recording; librosa resamples it to
        its default rate before the encoder preprocesses it.
        """
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    def say(self, text):
        """Synthesize *text* using the stored speaker embedding.

        The text is split on newlines and on common CJK/ASCII punctuation so
        the synthesizer receives short utterances; the resulting spectrograms
        are concatenated and vocoded in one pass.

        Returns a ``(wav, sample_rate)`` tuple. Requires setVoice() first
        (reads ``self.embed``).
        """
        punctuation = "!,。、?!,.?::"
        # Punctuate and split/clean the text into individual utterances.
        utterances = []
        for line in filter(None, text.split("\n")):
            for segment in re.sub(r"[{}]+".format(punctuation), "\n", line).split("\n"):
                if segment:
                    utterances.append(segment.strip())

        embeds = [self.embed] * len(utterances)
        specs = self.current_synt.synthesize_spectrograms(utterances, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate


def _random_wav_name():
    """Return a random 11-character lowercase/digit ``.wav`` filename.

    Shared by greet() and new_greet(); the original duplicated this
    expression in both handlers.
    """
    return "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"


def greet(audio, text, voice=None):
    """Legacy gr.Interface handler: clone the speaker in *audio*, speak *text*.

    *audio* is a gradio file object (``.name`` is its path). A Mandarin
    instance is created lazily on the first call and threaded back through
    gradio "state". Returns ``(output_wav_path, voice)``.
    """
    print(f"Log print: audio name=[{audio.name}], text=[{text}]")
    if voice is None:
        voice = Mandarin()
        voice.setVoice(audio.name)
        voice.say("加载成功")  # warm-up synthesis after loading the models
    wav, sample_rate = voice.say(text)
    output_file = _random_wav_name()
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file, voice


def new_greet(audio, text):
    """gr.Blocks handler: *audio* is a filepath string, *text* the TTS text.

    NOTE(review): relies on the module-level ``voice`` instance created under
    ``__main__`` — importing this module and calling new_greet() directly
    raises NameError. Returns the path of the written wav file.
    """
    audio_path = audio
    print(f"Log print: audio name=[{audio_path}], text=[{text}]")
    voice.setVoice(audio_path)
    wav, sample_rate = voice.say(text)
    output_file = _random_wav_name()
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file


def main():
    """Launch the legacy gr.Interface UI.

    NOTE(review): uses the deprecated ``gr.inputs``/``gr.outputs`` namespaces
    (removed in gradio 4.x); kept for backward compatibility, but new_main()
    is the entry point actually used.
    """
    demo = gr.Interface(
        fn=greet,
        inputs=[gr.inputs.Audio(type="file"), "text", "state"],
        outputs=[gr.outputs.Audio(type="file"), "state"],
        title="Tacotron Zero-short Voice Clone (Chinese Version)",
    )
    demo.launch()


def new_main():
    """Build and launch the gr.Blocks UI backed by new_greet()."""
    with gr.Blocks() as demo:
        title = gr.Markdown("# Tacotron Zero-short Voice Clone (Chinese Version)")
        gr.Markdown(
            """这是Zero-short语音克隆模型Tacotron的中文版本。
1. 上传想要克隆的人的一段语音,长度3~8秒即可。
2. 输入想要合成的文本
3. 点击Submit按钮,稍等30秒便可合成语音 """
        )
        with gr.Row():
            with gr.Column():
                input_audio = gr.Audio(type="filepath", label="Source Audio:")
                input_text = gr.Textbox(
                    value="大家好,我是正在搬砖的李昊,这是一段合成音频。",
                    label="TTS Text:",
                )
                with gr.Row():
                    submit = gr.Button(value="Submit", variant='primary')
            with gr.Column():
                output_audio = gr.Audio(type="filepath", label="Output Audio:")
        gr.Markdown("Audio Examples:")
        gr.Examples(
            examples=[["exp/lihao_01.wav", "大家好,我是正在搬砖的李昊,这是一段合成音频。"]],
            inputs=[input_audio, input_text],
            outputs=[output_audio],
            fn=new_greet,
            cache_examples=True,
        )
        _ = submit.click(new_greet, inputs=[input_audio, input_text], outputs=[output_audio])

    demo.queue(concurrency_count=20)
    demo.launch()


if __name__ == "__main__":
    # Load all models once at startup; new_greet() reuses this instance.
    voice = Mandarin()
    new_main()