File size: 993 Bytes
bbdb87d
 
 
 
04ea122
a1797f5
 
bbdb87d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04ea122
 
c7380eb
a1797f5
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import torch
import scipy.io.wavfile
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Identifiers of the pretrained English model and its vocoder.  Both values
# go through ``str_or_none`` before being handed to ``from_pretrained``
# (presumably so the literal "none" disables the separate vocoder -- confirm
# against espnet2.utils.types).
tagen = "kan-bayashi/ljspeech_vits"
vocoder_tagen = "none"

# Decoding options forwarded verbatim to ``Text2Speech.from_pretrained``.
# They are grouped by the model families they apply to.
_EN_SYNTHESIS_OPTIONS = {
    # Only for Tacotron 2 & Transformer
    "threshold": 0.5,
    # Only for Tacotron 2
    "minlenratio": 0.0,
    "maxlenratio": 10.0,
    "use_att_constraint": False,
    "backward_window": 1,
    "forward_window": 3,
    # Only for FastSpeech & FastSpeech2 & VITS
    "speed_control_alpha": 1.0,
    # Only for VITS
    "noise_scale": 0.333,
    "noise_scale_dur": 0.333,
}

# English text-to-speech model, created once at import time on the CPU.
text2speechen = Text2Speech.from_pretrained(
    model_tag=str_or_none(tagen),
    vocoder_tag=str_or_none(vocoder_tagen),
    device="cpu",
    **_EN_SYNTHESIS_OPTIONS,
)


def inference(text, lang):
    """Synthesize ``text`` to speech and save the result as a WAV file.

    Args:
        text: Text to synthesize; passed unchanged to the English model.
        lang: Language selector. Only ``"english"`` is supported.

    Returns:
        The path of the written file, ``"./audio/out.wav"``.

    Raises:
        ValueError: If ``lang`` is anything other than ``"english"``.
    """
    import os

    out_path = "./audio/out.wav"
    print("Converting to Audio")

    if lang != "english":
        # Without this guard the function would skip synthesis and still
        # return the output path, handing the caller a stale or missing file.
        raise ValueError(f"Unsupported language: {lang!r}")

    # wavfile.write does not create parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with torch.no_grad():
        wav = text2speechen(text)["wav"]
        # Flatten to 1-D and move to host memory before converting to NumPy.
        samples = wav.view(-1).cpu().numpy()
        scipy.io.wavfile.write(out_path, text2speechen.fs, samples)
    return out_path