# MockingBird / app.py
# lewiswu1209's picture
# initial commit
# f4dac30
import random
import re
import string
import tempfile
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
from scipy.io.wavfile import write

from encoder import inference as encoder
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
class Mandarin:
    """Mandarin voice-cloning pipeline: speaker encoder, synthesizer and HiFi-GAN vocoder.

    Call ``setVoice`` with a reference recording before calling ``say``;
    ``say`` reads the speaker embedding that ``setVoice`` stores.
    """

    # Loaded synthesizers keyed by checkpoint path. Kept on the class so that
    # every instance shares it and a checkpoint is only loaded once per process.
    _synthesizers_cache = {}

    # Text is split into fragments on runs of these punctuation characters.
    _PUNCTUATION = "!,。、?!,.?::"
    _SPLIT_PATTERN = re.compile(r'[{}]+'.format(_PUNCTUATION))

    def __init__(self):
        """Record the model paths and load the synthesizer, encoder and vocoder."""
        self.encoder_path = "encoder/saved_models/pretrained.pt"
        self.vocoder_path = "vocoder/saved_models/pretrained/g_hifigan.pt"
        self.config_fpath = "vocoder/hifigan/config_16k_.json"
        self.accent = "synthesizer/saved_models/普通话.pt"

        cache = Mandarin._synthesizers_cache
        synthesizer = cache.get(self.accent)
        if synthesizer is None:
            synthesizer = Synthesizer(Path(self.accent))
            cache[self.accent] = synthesizer
        self.current_synt = synthesizer

        encoder.load_model(Path(self.encoder_path))
        gan_vocoder.load_model(Path(self.vocoder_path), self.config_fpath)

    def setVoice(self, timbre):
        """Compute and store the speaker embedding of a reference recording.

        Args:
            timbre: Path of the reference audio file, as accepted by
                ``librosa.load``.
        """
        self.timbre = timbre
        wav, sample_rate = librosa.load(self.timbre)
        encoder_wav = encoder.preprocess_wav(wav, sample_rate)
        self.embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    @staticmethod
    def _split_text(text):
        """Split *text* on newlines and punctuation into stripped, non-empty fragments."""
        fragments = []
        for line in text.split("\n"):
            for piece in Mandarin._SPLIT_PATTERN.split(line):
                # Strip before testing so whitespace-only pieces are dropped
                # instead of being passed on as empty strings.
                piece = piece.strip()
                if piece:
                    fragments.append(piece)
        return fragments

    def say(self, text):
        """Synthesize *text* with the voice set by ``setVoice``.

        Args:
            text: Text to speak; it is split into fragments on newlines and
                punctuation, and one spectrogram is synthesized per fragment.

        Returns:
            The ``(wav, sample_rate)`` pair produced by
            ``gan_vocoder.infer_waveform`` for the concatenated spectrograms.

        Raises:
            ValueError: If *text* contains nothing but whitespace and
                punctuation.
        """
        texts = self._split_text(text)
        if not texts:
            # Fail early with a clear message rather than inside np.concatenate.
            raise ValueError("text contains nothing to synthesize")

        embeds = [self.embed] * len(texts)
        specs = self.current_synt.synthesize_spectrograms(texts, embeds)
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = gan_vocoder.infer_waveform(spec)
        return wav, sample_rate
def greet(audio, text, voice=None):
    """Speak *text* in the voice of the reference recording and save it as WAV.

    Args:
        audio: Uploaded reference recording; its ``name`` attribute is used as
            the path of the audio file.
        text: Text to synthesize.
        voice: ``Mandarin`` instance returned by a previous call, or ``None``
            on the first call.

    Returns:
        ``(output_file, voice)``: the path of the written WAV file and the
        voice object to pass back in on the next call.

    Raises:
        ValueError: If no voice exists yet and no reference audio was given.
    """
    if voice is None:
        if audio is None:
            raise ValueError("a reference audio clip is required")
        voice = Mandarin()
        voice.setVoice(audio.name)
        # Result is discarded; presumably a warm-up run for a freshly loaded
        # model ("加载成功" means "loaded successfully").
        voice.say("加载成功")
    elif audio is not None and getattr(voice, "timbre", None) != audio.name:
        # A different reference clip was supplied for an existing voice:
        # re-embed it so the output does not keep the previous speaker.
        voice.setVoice(audio.name)

    wav, sample_rate = voice.say(text)

    # Reserve a uniquely named file instead of guessing a random name in the
    # working directory; delete=False keeps it on disk for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_file = tmp.name
    write(output_file, sample_rate, wav.astype(np.float32))
    return output_file, voice
def main():
    """Build the Gradio interface around ``greet`` and launch it.

    Inputs are an audio file, a text box and a "state" slot; outputs are an
    audio file and the "state" slot, matching ``greet``'s parameters and
    return values.
    """
    reference_audio = gr.inputs.Audio(type="file")
    synthesized_audio = gr.outputs.Audio(type="file")
    demo = gr.Interface(
        fn=greet,
        inputs=[reference_audio, "text", "state"],
        outputs=[synthesized_audio, "state"],
    )
    demo.launch()


# Only start the app when this file is run as a script.
if __name__ == "__main__":
    main()