# Spaces status: Runtime error (page-header text captured with the source; not part of the program)
import gradio as gr | |
import re | |
import random | |
import string | |
import librosa | |
import numpy as np | |
from pathlib import Path | |
from scipy.io.wavfile import write | |
from encoder import inference as encoder | |
from vocoder.hifigan import inference as gan_vocoder | |
from synthesizer.inference import Synthesizer | |
class Mandarin:
    """Mandarin voice-cloning text-to-speech pipeline.

    Combines three pretrained components: an encoder that turns a reference
    recording into an embedding, a synthesizer that turns text plus that
    embedding into spectrograms, and a HiFi-GAN vocoder that turns the
    spectrograms into a waveform.

    Typical use: construct once, call ``setVoice`` with a reference audio
    file, then call ``say`` for each piece of text.
    """

    # Synthesizers keyed by checkpoint path. Kept on the class (not inside
    # ``__init__``) so that every instance in the process shares one loaded
    # synthesizer instead of reloading the checkpoint each time.
    _synthesizer_cache = {}

    # Characters that end a segment: newlines plus ASCII and CJK punctuation.
    # The full-width forms are listed explicitly next to the ASCII ones so
    # that Chinese-punctuated input is segmented as well.
    _SEGMENT_BREAK = re.compile(r"[\n!,。、?.:!,?:]+")

    def __init__(self):
        """Load the synthesizer, encoder and vocoder checkpoints."""
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"

        synthesizer = self._synthesizer_cache.get(self.accent)
        if synthesizer is None:
            synthesizer = Synthesizer(Path(self.accent))
            self._synthesizer_cache[self.accent] = synthesizer
        self.current_synt = synthesizer

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        """Select the voice to imitate.

        Args:
            timbre: Path of the reference audio file. It is remembered in
                ``self.timbre`` and its embedding is stored in ``self.embed``.
        """
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    @classmethod
    def _split_text(cls, text):
        """Split *text* at newlines/punctuation into non-empty, stripped segments."""
        stripped = (piece.strip() for piece in cls._SEGMENT_BREAK.split(text))
        # Filter AFTER stripping so whitespace-only pieces are dropped too.
        return [piece for piece in stripped if piece]

    def say(self, text):
        """Synthesize *text* in the voice chosen with ``setVoice``.

        The text is cut into segments at newlines and punctuation, each
        segment is synthesized with the same embedding, and the resulting
        spectrograms are joined along axis 1 before vocoding.

        Args:
            text: The text to speak.

        Returns:
            The ``(wav, sample_rate)`` pair produced by the vocoder.

        Raises:
            ValueError: If *text* contains nothing but whitespace and
                punctuation, i.e. there is nothing to synthesize.
        """
        segments = self._split_text(text)
        if not segments:
            # Fail early with a clear message rather than handing the
            # synthesizer/np.concatenate an empty batch.
            raise ValueError("text contains nothing to synthesize")

        embeds = [self.embed] * len(segments)
        specs = self.current_synt.synthesize_spectrograms(segments, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
def greet(audio, text, voice=None):
    """Synthesize *text* in the voice of *audio* and save it as a WAV file.

    Args:
        audio: Uploaded reference recording; an object whose ``name``
            attribute is the path of the audio file. May be ``None`` once a
            session voice exists, in which case the current voice is kept.
        text: The text to speak.
        voice: The session's voice object carried over from the previous
            call, or ``None`` on the first call.

    Returns:
        ``(output_file, voice)``: the path of the written WAV file and the
        voice object to carry into the next call.

    Raises:
        ValueError: If there is neither an existing voice nor a reference
            recording to create one from.
    """
    audio_path = getattr(audio, "name", None)

    if voice is None:
        if audio_path is None:
            # Check before loading any model: without a recording there is
            # nothing to build the voice from.
            raise ValueError("a reference audio file is required to create a voice")
        voice = Mandarin()
        voice.setVoice(audio_path)
        # First synthesis on a fresh voice; the result is discarded
        # (presumably a warm-up run -- the phrase means "loaded successfully").
        voice.say("加载成功")
    elif audio_path is not None and audio_path != getattr(voice, "timbre", None):
        # A different recording was uploaded in the same session: re-embed it
        # so the new recording is actually used instead of the stale one.
        voice.setVoice(audio_path)

    wav, sample_rate = voice.say(text)
    output_file = "".join(random.sample(string.ascii_lowercase + string.digits, 11)) + ".wav"
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file, voice
def main():
    """Build the Gradio interface around ``greet`` and launch it.

    Inputs are a reference audio file, the text to speak and the session
    state; outputs are the synthesized audio file and the updated state.
    """
    reference_audio = gr.inputs.Audio(type="file")
    synthesized_audio = gr.outputs.Audio(type="file")
    demo = gr.Interface(
        fn=greet,
        inputs=[reference_audio, "text", "state"],
        outputs=[synthesized_audio, "state"],
    )
    demo.launch()


# Start the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()