"""Gradio demo: synthesize speech for typed text using the timbre of an uploaded recording."""

import random
import re
import string
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
from scipy.io.wavfile import write

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder.hifigan import inference as gan_vocoder

# Characters that end a segment; each run of them is treated as one separator.
_PUNCTUATION = "!,。、?!,.?::"
_SEGMENT_SPLIT_RE = re.compile(r'[{}]+'.format(_PUNCTUATION))


class Mandarin:
    """Text-to-speech voice built from the pretrained encoder, synthesizer and vocoder.

    Call :meth:`setVoice` with a reference recording before calling :meth:`say`;
    ``say`` reads the speaker embedding that ``setVoice`` stores on the instance.
    """

    # Shared by every instance so that a synthesizer checkpoint is wrapped only
    # once per process. (A dict created inside ``__init__`` would be empty on
    # every call and could never produce a cache hit.)
    _synthesizers_cache = {}

    def __init__(self):
        """Set the model paths and load the synthesizer, encoder and vocoder."""
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"

        synthesizer = self._synthesizers_cache.get(self.accent)
        if synthesizer is None:
            synthesizer = Synthesizer(Path(self.accent))
            self._synthesizers_cache[self.accent] = synthesizer
        self.current_synt = synthesizer

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        """Compute and store the speaker embedding of a reference recording.

        Args:
            timbre: Path of the audio file to imitate; it is read with
                ``librosa.load``.
        """
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    def say(self, text):
        """Synthesize ``text`` with the embedding stored by :meth:`setVoice`.

        The text is split on newlines and on runs of punctuation; every
        non-blank segment is synthesized and the spectrograms are joined
        before vocoding.

        Args:
            text: The text to speak.

        Returns:
            The ``(wav, sample_rate)`` pair returned by the vocoder.

        Raises:
            ValueError: If ``text`` contains nothing but whitespace and
                punctuation, so there is nothing to synthesize.
        """
        segments = []
        for line in text.split("\n"):
            for segment in _SEGMENT_SPLIT_RE.split(line):
                # Strip *before* testing for emptiness, otherwise a
                # whitespace-only segment would be passed on as "".
                segment = segment.strip()
                if segment:
                    segments.append(segment)

        if not segments:
            raise ValueError("No text to synthesize after removing punctuation and whitespace.")

        embeds = [self.embed] * len(segments)
        specs = self.current_synt.synthesize_spectrograms(segments, embeds)
        # Join the per-segment spectrograms along axis 1 (presumably time —
        # confirm against the synthesizer's output layout).
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate


def greet(audio, text, voice=None):
    """Gradio callback: speak ``text`` in the voice of the uploaded ``audio``.

    Args:
        audio: Uploaded reference recording; its ``name`` attribute is used as
            the file path.
        text: The text to synthesize.
        voice: The :class:`Mandarin` instance carried in the Gradio state, or
            ``None`` on the first call.

    Returns:
        A ``(output_file, voice)`` tuple: the path of the WAV file that was
        written and the ``Mandarin`` instance to keep as state.
    """
    newly_loaded = voice is None
    if newly_loaded:
        voice = Mandarin()
    voice.setVoice(audio.name)
    if newly_loaded:
        # Warm-up synthesis; the result is discarded, so it is only worth
        # paying for once, right after the models have been loaded.
        voice.say("加载成功")

    wav, sample_rate = voice.say(text)
    output_file = "".join(
        random.sample(string.ascii_lowercase + string.digits, 11)
    ) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file, voice


def main():
    """Build and launch the Gradio interface around :func:`greet`."""
    gr.Interface(
        fn=greet,
        inputs=[gr.inputs.Audio(type="file"), "text", "state"],
        outputs=[gr.outputs.Audio(type="file"), "state"],
    ).launch()


if __name__ == "__main__":
    main()