import os

import torch
import scipy.io.wavfile
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Pretrained English model tag (LJSpeech VITS). VITS is end-to-end
# (text -> waveform), so no separate vocoder is used ("none").
tagen = 'kan-bayashi/ljspeech_vits'
vocoder_tagen = "none"

text2speechen = Text2Speech.from_pretrained(
    model_tag=str_or_none(tagen),
    vocoder_tag=str_or_none(vocoder_tagen),
    device="cpu",
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # Only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)


def inference(text, lang):
    """Synthesize ``text`` to ``./audio/out.wav`` and return that path.

    Parameters
    ----------
    text : str
        Sentence to synthesize.
    lang : str
        Language selector; only ``"english"`` is supported.

    Returns
    -------
    str
        Path of the written WAV file (``./audio/out.wav``).

    Raises
    ------
    ValueError
        If ``lang`` is not a supported language. (Previously an
        unsupported language silently returned a path to a file that was
        never written/refreshed; fail loudly instead.)
    """
    print("Converting to Audio")
    if lang != "english":
        raise ValueError(f"Unsupported language: {lang!r}")
    # Ensure the output directory exists before scipy tries to write into it;
    # scipy.io.wavfile.write does not create missing directories.
    os.makedirs("./audio", exist_ok=True)
    with torch.no_grad():  # pure inference; gradients are never needed
        wav = text2speechen(text)["wav"]
    scipy.io.wavfile.write(
        "./audio/out.wav",
        text2speechen.fs,
        # Flatten to 1-D and move to host memory for the WAV writer.
        wav.view(-1).cpu().numpy(),
    )
    return "./audio/out.wav"