import os
import sys
import json
from subprocess import call

import torch
import gradio as gr
from scipy.io.wavfile import write
from huggingface_hub import hf_hub_url, cached_download
import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt")  # tokenizer models required by word_tokenize

AUTH_TOKEN = os.environ["HF_TOKEN"]

# download the fine-tuned Grad-TTS checkpoint from the Hugging Face Hub (needs the HF_TOKEN read token)
url = hf_hub_url(repo_id="bookbot/grad-tts-en-ft-Weildan-v2", filename="grad_24000.pt")
grad_tts_model_path = cached_download(url, use_auth_token=AUTH_TOKEN)
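# fetch the HiFi-GAN vocoder checkpoint (a public release, so no token is needed)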
torch.hub.download_url_to_file(
    "https://github.com/AK391/Speech-Backbones/releases/download/v1/hifigan.pt",
    "hifigan.pt",
)

# compile the monotonic alignment search (MAS) Cython extension in place
current = os.getcwd()
os.chdir(current + "/Grad-TTS/model/monotonic_align")
call("python setup.py build_ext --inplace", shell=True)
os.chdir("../../../")

sys.path.append("Grad-TTS/")
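# Grad-TTS modules: hyperparameters, acoustic model, and text front end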
import params
from model import GradTTS
from text import text_to_sequence, cmudict
from text.symbols import symbols
from utils import intersperse

sys.path.append("Grad-TTS/hifi-gan/")
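# HiFi-GAN vocoder modules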
from env import AttrDict
from models import Generator as HiFiGAN

SPEAKERS = 247  # number of speaker embeddings in the multi-speaker checkpoint

# load models
generator = GradTTS(
    len(symbols) + 1,  # vocabulary size plus one for the blank token used by intersperse
    SPEAKERS,
    params.spk_emb_dim,
    params.n_enc_channels,
    params.filter_channels,
    params.filter_channels_dp,
    params.n_heads,
    params.n_enc_layers,
    params.enc_kernel,
    params.enc_dropout,
    params.window_size,
    params.n_feats,
    params.dec_dim,
    params.beta_min,
    params.beta_max,
    pe_scale=1000,
)

generator.load_state_dict(
    # load every tensor onto CPU regardless of the device it was saved from
    torch.load(grad_tts_model_path, map_location=lambda storage, loc: storage)
)
_ = generator.eval()

# pronunciation dictionary bundled with the Space (the _id suffix suggests an Indonesian variant)
cmu = cmudict.CMUDict("./Grad-TTS/resources/cmu_dictionary_id")

with open("./Grad-TTS/checkpts/hifigan-config.json") as f:
    h = AttrDict(json.load(f))

hifigan = HiFiGAN(h)
hifigan.load_state_dict(
    torch.load("./hifigan.pt", map_location=lambda storage, loc: storage)["generator"]
)
_ = hifigan.eval()
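# weight normalization is only needed during training; removing it speeds up inference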
hifigan.remove_weight_norm()


def inference(text, n_timesteps):
    """Synthesize text with n_timesteps reverse-diffusion steps and return a wav path."""
    # tokenize, convert to symbol IDs, and intersperse a blank token between symbols
    text = " ".join(word_tokenize(text))
    x = torch.LongTensor(
        intersperse(text_to_sequence(text, dictionary=cmu), len(symbols))
    )[None]
    x_lengths = torch.LongTensor([x.shape[-1]])

    with torch.no_grad():
        # run the Grad-TTS encoder and reverse-diffusion decoder
        y_enc, y_dec, attn = generator.forward(
            x,
            x_lengths,
            n_timesteps=n_timesteps,
            temperature=1.5,
            stoc=False,
            spk=torch.LongTensor([0]) if SPEAKERS > 1 else None,
            length_scale=1.0,
        )
        # vocode the predicted mel-spectrogram into a waveform with HiFi-GAN
        audio = hifigan.forward(y_dec).cpu().squeeze().clamp(-1, 1).detach().numpy()

    # 22,050 Hz output sample rate
    write("out.wav", 22050, audio)
    return "./out.wav"


inputs = [
    gr.inputs.Textbox(lines=5, label="Input Text"),
    # a minimum of 10 avoids running the reverse diffusion with zero steps
    gr.inputs.Slider(minimum=10, maximum=100, step=10, label="Timesteps"),
]

outputs = gr.outputs.Audio(type="file", label="Output Audio")
title = "Bookbot Grad-TTS Weildan Demo 🐨"
description = "Hi there! Type any input text ⌨, pick your preferred number of timesteps ⌚, and hit submit! 🚂 Please be patient - generation can take a while, and the more timesteps you choose, the longer it takes 😁"

# example utterances (Indonesian and English), paired with timestep values below
utterances = [
    "Selamat pagi! Selamat datang di Jakarta!",
    "Kak, harga nasi gorengnya berapa ya?",
    "Bapak bilang, Malik hebat. Bisa bersih bersih seperti Bapak.",
    "Here are the match lineups for the Colombia Haiti match.",
]

# pair each example utterance with a timestep value: 50, 60, 70, 80
timesteps = [(i * 10) + 50 for i in range(len(utterances))]
examples = [list(l) for l in zip(utterances, timesteps)]

gr.Interface(
    inference, inputs, outputs, title=title, description=description, examples=examples
).launch()