Spaces:
Runtime error
Runtime error
Serhiy Stetskovych
committed on
Commit
·
bd45f54
1
Parent(s):
e42c405
Last changes
Browse files
- app.py +5 -70
- prompt.wav +0 -0
app.py
CHANGED
@@ -14,10 +14,6 @@ from hifigan.env import AttrDict
|
|
14 |
from hifigan.models import Generator as HiFiGAN
|
15 |
|
16 |
|
17 |
-
#from BigVGAN.models import BigVGAN
|
18 |
-
#from BigVGAN.env import AttrDict as BigVGANAttrDict
|
19 |
-
|
20 |
-
|
21 |
from pflow.models.pflow_tts import pflowTTS
|
22 |
from pflow.text import text_to_sequence, sequence_to_text
|
23 |
from pflow.utils.utils import intersperse
|
@@ -26,60 +22,14 @@ from pflow.utils.model import normalize
|
|
26 |
|
27 |
|
28 |
|
29 |
-
BIGVGAN_CONFIG = {
|
30 |
-
"resblock": "1",
|
31 |
-
"num_gpus": 0,
|
32 |
-
"batch_size": 32,
|
33 |
-
"learning_rate": 0.0001,
|
34 |
-
"adam_b1": 0.8,
|
35 |
-
"adam_b2": 0.99,
|
36 |
-
"lr_decay": 0.999,
|
37 |
-
"seed": 1234,
|
38 |
-
|
39 |
-
"upsample_rates": [4,4,2,2,2,2],
|
40 |
-
"upsample_kernel_sizes": [8,8,4,4,4,4],
|
41 |
-
"upsample_initial_channel": 1536,
|
42 |
-
"resblock_kernel_sizes": [3,7,11],
|
43 |
-
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
|
44 |
-
|
45 |
-
"activation": "snakebeta",
|
46 |
-
"snake_logscale": True,
|
47 |
-
|
48 |
-
"resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
|
49 |
-
"mpd_reshapes": [2, 3, 5, 7, 11],
|
50 |
-
"use_spectral_norm": False,
|
51 |
-
"discriminator_channel_mult": 1,
|
52 |
-
|
53 |
-
"segment_size": 8192,
|
54 |
-
"num_mels": 80,
|
55 |
-
"num_freq": 1025,
|
56 |
-
"n_fft": 1024,
|
57 |
-
"hop_size": 256,
|
58 |
-
"win_size": 1024,
|
59 |
-
|
60 |
-
"sampling_rate": 22050,
|
61 |
-
|
62 |
-
"fmin": 0,
|
63 |
-
"fmax": 8000,
|
64 |
-
"fmax_for_loss": None,
|
65 |
-
|
66 |
-
"num_workers": 4,
|
67 |
-
|
68 |
-
"dist_config": {
|
69 |
-
"dist_backend": "nccl",
|
70 |
-
"dist_url": "tcp://localhost:54321",
|
71 |
-
"world_size": 1
|
72 |
-
}
|
73 |
-
}
|
74 |
-
|
75 |
PFLOW_MODEL_PATH = 'checkpoint_epoch=649.ckpt'
|
76 |
-
VOCODER_MODEL_PATH = '
|
77 |
-
VOCODER_BIGVGAN_MODEL_PATH = 'g_05000000'
|
78 |
|
|
|
79 |
wav, sr = torchaudio.load('prompt.wav')
|
80 |
|
81 |
prompt = mel_spectrogram(
|
82 |
-
wav,
|
83 |
1024,
|
84 |
80,
|
85 |
22050,
|
@@ -114,20 +64,6 @@ def load_hifigan(checkpoint_path, device):
|
|
114 |
return hifigan
|
115 |
|
116 |
|
117 |
-
def load_bigvgan(checkpoint_path, device):
|
118 |
-
print("Loading '{}'".format(checkpoint_path))
|
119 |
-
checkpoint_dict = torch.load(checkpoint_path, map_location=device)
|
120 |
-
|
121 |
-
|
122 |
-
h = BigVGANAttrDict(BIGVGAN_CONFIG)
|
123 |
-
torch.manual_seed(h.seed)
|
124 |
-
|
125 |
-
generator = BigVGAN(h).to(device)
|
126 |
-
generator.load_state_dict(checkpoint_dict['generator'])
|
127 |
-
generator.eval()
|
128 |
-
generator.remove_weight_norm()
|
129 |
-
return generator
|
130 |
-
|
131 |
|
132 |
def to_waveform(mel, vocoder, denoiser=None):
|
133 |
audio = vocoder(mel).clamp(-1, 1)
|
@@ -154,9 +90,8 @@ def get_device():
|
|
154 |
device = get_device()
|
155 |
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
|
156 |
_ = model.eval()
|
157 |
-
#vocoder = load_bigvgan(VOCODER_BIGVGAN_MODEL_PATH, device)
|
158 |
vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
|
159 |
-
denoiser =
|
160 |
|
161 |
@torch.inference_mode()
|
162 |
def synthesise(text, temperature, speed):
|
@@ -172,7 +107,7 @@ def synthesise(text, temperature, speed):
|
|
172 |
temperature=temperature,
|
173 |
length_scale=1/speed,
|
174 |
prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
|
175 |
-
|
176 |
)
|
177 |
waveform = to_waveform(output["mel"], vocoder, denoiser)
|
178 |
|
|
|
14 |
from hifigan.models import Generator as HiFiGAN
|
15 |
|
16 |
|
|
|
|
|
|
|
|
|
17 |
from pflow.models.pflow_tts import pflowTTS
|
18 |
from pflow.text import text_to_sequence, sequence_to_text
|
19 |
from pflow.utils.utils import intersperse
|
|
|
22 |
|
23 |
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
PFLOW_MODEL_PATH = 'checkpoint_epoch=649.ckpt'
|
26 |
+
VOCODER_MODEL_PATH = 'g_00140000_m'
|
|
|
27 |
|
28 |
+
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
|
29 |
wav, sr = torchaudio.load('prompt.wav')
|
30 |
|
31 |
prompt = mel_spectrogram(
|
32 |
+
transform(wav),
|
33 |
1024,
|
34 |
80,
|
35 |
22050,
|
|
|
64 |
return hifigan
|
65 |
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
def to_waveform(mel, vocoder, denoiser=None):
|
69 |
audio = vocoder(mel).clamp(-1, 1)
|
|
|
90 |
device = get_device()
|
91 |
model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
|
92 |
_ = model.eval()
|
|
|
93 |
vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
|
94 |
+
denoiser = Denoiser(vocoder, mode="zeros")
|
95 |
|
96 |
@torch.inference_mode()
|
97 |
def synthesise(text, temperature, speed):
|
|
|
107 |
temperature=temperature,
|
108 |
length_scale=1/speed,
|
109 |
prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
|
110 |
+
guidance_scale=1.0
|
111 |
)
|
112 |
waveform = to_waveform(output["mel"], vocoder, denoiser)
|
113 |
|
prompt.wav
CHANGED
Binary files a/prompt.wav and b/prompt.wav differ
|
|