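# NOTE: the three lines that stood here ("Spaces: / Runtime error / Runtime error")
# appear to be a status banner captured from the hosting web page, not program text.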
import random
import re
import string
import threading
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
from scipy.io.wavfile import write

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder.hifigan import inference as gan_vocoder
class Mandarin:
    """Mandarin voice-cloning pipeline: speaker encoder -> synthesizer -> HiFi-GAN vocoder.

    Typical use::

        voice = Mandarin()
        voice.setVoice("reference.wav")
        wav, sample_rate = voice.say("...")
    """

    # Synthesizer instances keyed by checkpoint path and shared by every
    # Mandarin instance, so a checkpoint is only loaded once per process.
    # (The previous per-call local dict was always empty and never hit.)
    _synthesizers_cache = {}

    # Clause-ending punctuation used to cut the input into short utterances.
    # Both the ASCII and the full-width (CJK) forms are listed because Chinese
    # text normally uses the latter.
    _PUNCTUATION = "!,.?:" + "。、!,?:"
    _SPLIT_RE = re.compile("[" + _PUNCTUATION + "]+")

    def __init__(self):
        """Load the encoder, synthesizer and vocoder checkpoints from their fixed paths."""
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"

        cache = type(self)._synthesizers_cache
        synthesizer = cache.get(self.accent)
        if synthesizer is None:
            synthesizer = Synthesizer(Path(self.accent))
            cache[self.accent] = synthesizer
        self.current_synt = synthesizer

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        """Select the voice to imitate.

        Args:
            timbre: Path of the reference recording; it is read with
                ``librosa.load``.

        Side effects:
            Stores the path in ``self.timbre`` and the speaker embedding
            computed by the encoder in ``self.embed``.
        """
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        # With return_partials=True the encoder returns three values; only the
        # first (the utterance embedding) is kept.
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    @classmethod
    def _split_sentences(cls, text):
        """Split *text* on newlines and punctuation into non-empty, stripped fragments."""
        sentences = []
        for line in text.split("\n"):
            for fragment in cls._SPLIT_RE.split(line):
                # Strip *before* testing for emptiness so whitespace-only
                # fragments are dropped instead of becoming "".
                fragment = fragment.strip()
                if fragment:
                    sentences.append(fragment)
        return sentences

    def say(self, text):
        """Synthesize *text* in the voice selected by :meth:`setVoice`.

        Args:
            text: Text to speak. It is cut at newlines and punctuation and
                each fragment is synthesized separately.

        Returns:
            The ``(wav, sample_rate)`` pair produced by
            ``gan_vocoder.infer_waveform``.

        Raises:
            ValueError: If *text* contains nothing but whitespace and
                punctuation.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            raise ValueError("text contains nothing to synthesize")

        # Every fragment is spoken with the same speaker embedding.
        embeds = [self.embed] * len(sentences)
        specs = self.current_synt.synthesize_spectrograms(sentences, embeds)
        # Join the per-fragment spectrograms along axis 1
        # (presumably the time/frame axis -- TODO confirm).
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
def greet(audio, text, voice=None):
    """Clone the voice in *audio* and speak *text* (handler for the ``gr.Interface`` demo).

    Args:
        audio: Uploaded reference recording; its ``.name`` attribute is used
            as the file path.
        text: Text to synthesize.
        voice: Session state holding a previously created ``Mandarin``
            instance, or ``None`` on the first call of a session.

    Returns:
        ``(output_file, voice)``: the path of the generated ``.wav`` file and
        the ``Mandarin`` instance to keep as session state.
    """
    print(f"Log print: audio name=[{audio.name}], text=[{text}]")

    first_use = voice is None
    if first_use:
        voice = Mandarin()
    # Always refresh the timbre: a cached session voice would otherwise keep
    # imitating the first recording even after the user uploads another one.
    voice.setVoice(audio.name)
    if first_use:
        # Warm-up utterance, run once right after the models are loaded.
        voice.say("加载成功")

    wav, sample_rate = voice.say(text)

    # Random 11-character base name (lowercase letters and digits, no repeats).
    stem = "".join(random.sample(string.ascii_lowercase + string.digits, 11))
    output_file = stem + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file, voice
# Guards the shared module-level `voice`. setVoice() stores the speaker
# embedding on that object and say() reads it back, so two overlapping
# requests could otherwise synthesize with each other's voice.
_VOICE_LOCK = threading.Lock()


def new_greet(audio, text):
    """Clone the voice in *audio* and speak *text* (handler for the ``gr.Blocks`` demo).

    Relies on the module-level ``voice`` object created in the
    ``if __name__ == "__main__"`` section before the UI is launched.

    Args:
        audio: File path of the reference recording.
        text: Text to synthesize.

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        gr.Error: If no reference recording was provided.
    """
    audio_path = audio
    print(f"Log print: audio name=[{audio_path}], text=[{text}]")
    if not audio_path:
        raise gr.Error("Please upload a reference audio clip first.")

    # Keep "select timbre" and "synthesize" atomic with respect to other requests.
    with _VOICE_LOCK:
        voice.setVoice(audio_path)
        wav, sample_rate = voice.say(text)

    # Random 11-character base name (lowercase letters and digits, no repeats).
    stem = "".join(random.sample(string.ascii_lowercase + string.digits, 11))
    output_file = stem + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file
def main():
    """Build and launch the ``gr.Interface`` demo that is driven by ``greet``.

    The third input/output, ``"state"``, carries the per-session ``Mandarin``
    instance between calls.
    """
    demo = gr.Interface(
        fn=greet,
        inputs=[gr.inputs.Audio(type="file"), "text", "state"],
        outputs=[gr.outputs.Audio(type="file"), "state"],
        # "Zero-shot" is the intended term; the title previously read "Zero-short".
        title="Tacotron Zero-shot Voice Clone (Chinese Version)",
    )
    demo.launch()
def new_main():
    """Build and launch the ``gr.Blocks`` demo that is driven by ``new_greet``.

    Layout: a heading and usage notes, then one row with the inputs
    (reference audio, text, Submit button) on the left and the synthesized
    audio on the right, followed by a cached example.
    """
    with gr.Blocks() as demo:
        # "Zero-shot" is the intended term; these texts previously read "Zero-short".
        gr.Markdown("# <center>Tacotron Zero-shot Voice Clone (Chinese Version)</center>")
        gr.Markdown(
            """这是Zero-shot语音克隆模型Tacotron的中文版本。<br>
            1. 上传想要克隆的人的一段语音,长度3~8秒即可。<br>
            2. 输入想要合成的文本<br>
            3. 点击Submit按钮,稍等30秒便可合成语音
            """
        )
        with gr.Row():
            with gr.Column():
                input_audio = gr.Audio(type="filepath", label="Source Audio:")
                input_text = gr.Textbox(
                    value="大家好,我是正在搬砖的李昊,这是一段合成音频。",
                    label="TTS Text:",
                )
                with gr.Row():
                    submit = gr.Button(value="Submit", variant='primary')
            with gr.Column():
                output_audio = gr.Audio(type="filepath", label="Output Audio:")

        gr.Markdown("Audio Examples:")
        gr.Examples(
            examples=[["exp/lihao_01.wav", "大家好,我是正在搬砖的李昊,这是一段合成音频。"]],
            inputs=[input_audio, input_text],
            outputs=[output_audio],
            fn=new_greet,
            # The example output is computed once up front and then served from cache.
            cache_examples=True,
        )
        submit.click(new_greet, inputs=[input_audio, input_text], outputs=[output_audio])

    demo.queue(concurrency_count=20)
    demo.launch()
if __name__ == "__main__":
    # `voice` is bound at module level on purpose: new_greet() looks it up as
    # a global, so it has to exist before the UI starts handling requests.
    voice = Mandarin()
    new_main()