Spaces:
Running
on
Zero
Running
on
Zero
Serhiy Stetskovych
committed on
Commit
•
37f9a5d
1
Parent(s):
39cc8c4
App with new vocoder
Browse files
- app.py +33 -28
- prompt.wav +0 -0
- prompt22050.wav +0 -0
app.py
CHANGED
@@ -28,15 +28,17 @@ from vocos import Vocos
|
|
28 |
|
29 |
PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
|
30 |
#PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
|
31 |
-
|
|
|
|
|
32 |
HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
|
33 |
|
34 |
|
35 |
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
|
36 |
-
wav, sr = torchaudio.load('
|
37 |
|
38 |
prompt = mel_spectrogram(
|
39 |
-
wav,
|
40 |
1024,
|
41 |
80,
|
42 |
22050,
|
@@ -85,7 +87,7 @@ def load_vocos(checkpoint_path, config_path, device):
|
|
85 |
|
86 |
|
87 |
def to_waveform(mel, vocoder, denoiser=None):
|
88 |
-
return vocoder.decode(mel).cpu().squeeze()
|
89 |
|
90 |
# audio = vocoder(mel).clamp(-1, 1)
|
91 |
# if denoiser is not None:
|
@@ -113,9 +115,10 @@ model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
|
|
113 |
_ = model.eval()
|
114 |
|
115 |
|
116 |
-
|
117 |
-
|
118 |
-
#vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=
|
|
|
119 |
denoiser = None#Denoiser(vocoder, mode="zeros")
|
120 |
|
121 |
|
@@ -134,23 +137,25 @@ def synthesise(text, speed):
|
|
134 |
length_scale=1/speed,
|
135 |
prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
|
136 |
|
137 |
-
guidance_scale=
|
138 |
|
139 |
)
|
140 |
-
waveform_vocos =
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
|
145 |
|
146 |
-
return text_processed['x_phones'][1::2], (22050, waveform_vocos.numpy())
|
147 |
|
148 |
|
149 |
description = f'''
|
150 |
# Експериментальна апка для генерації аудіо з тексту.
|
151 |
|
152 |
pflow checkpoint {PFLOW_MODEL_PATH}
|
153 |
-
|
|
|
|
|
154 |
'''
|
155 |
|
156 |
|
@@ -164,28 +169,28 @@ if __name__ == "__main__":
|
|
164 |
],
|
165 |
outputs=[
|
166 |
gr.Text(label='Фонемізований текст:', lines=5),
|
167 |
-
# gr.Audio(
|
168 |
-
# label="Vocos 44100 аудіо:",
|
169 |
-
# autoplay=False,
|
170 |
-
# streaming=False,
|
171 |
-
# type="numpy",
|
172 |
-
# ),
|
173 |
gr.Audio(
|
174 |
-
label="Vocos аудіо:",
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
autoplay=False,
|
176 |
streaming=False,
|
177 |
type="numpy",
|
178 |
),
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
|
186 |
],
|
187 |
allow_flagging ='manual',
|
188 |
-
flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
|
189 |
cache_examples=True,
|
190 |
title='',
|
191 |
# description=description,
|
|
|
28 |
|
29 |
PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
|
30 |
#PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
|
31 |
+
VOCODER22_MODEL_PATH = 'BSC-LT/vocos-mel-22khz'
|
32 |
+
VOCODER44_MODEL_PATH = 'patriotyk/vocos-mel-hifigan-compat-44100khz'
|
33 |
+
|
34 |
HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
|
35 |
|
36 |
|
37 |
transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
|
38 |
+
wav, sr = torchaudio.load('prompt22050.wav')
|
39 |
|
40 |
prompt = mel_spectrogram(
|
41 |
+
transform(wav),
|
42 |
1024,
|
43 |
80,
|
44 |
22050,
|
|
|
87 |
|
88 |
|
89 |
def to_waveform(mel, vocoder, denoiser=None):
|
90 |
+
return vocoder.decode(mel).clamp(-1, 1).cpu().squeeze()
|
91 |
|
92 |
# audio = vocoder(mel).clamp(-1, 1)
|
93 |
# if denoiser is not None:
|
|
|
115 |
_ = model.eval()
|
116 |
|
117 |
|
118 |
+
hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
|
119 |
+
vocos_22050 = Vocos.from_pretrained(VOCODER22_MODEL_PATH)
|
120 |
+
#vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=209_step=3924480_val_loss=3.7036_44100_11.ckpt', 'vocos.yaml', device)
|
121 |
+
vocos_44100 = Vocos.from_pretrained(VOCODER44_MODEL_PATH)
|
122 |
denoiser = None#Denoiser(vocoder, mode="zeros")
|
123 |
|
124 |
|
|
|
137 |
length_scale=1/speed,
|
138 |
prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
|
139 |
|
140 |
+
guidance_scale=2.0
|
141 |
|
142 |
)
|
143 |
+
waveform_vocos = vocos_22050.decode(output["mel"]).cpu().squeeze()
|
144 |
+
waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
|
145 |
+
waveform_hifigan = hifigan(output["mel"]).clamp(-1, 1).cpu().squeeze()
|
146 |
+
transform = torchaudio.transforms.Vol(gain=-18, gain_type="db")
|
147 |
|
148 |
|
149 |
+
return text_processed['x_phones'][1::2], (44100, waveform_vocos_44100.numpy()), (22050, waveform_vocos.numpy()), (22050, transform(waveform_hifigan).numpy())
|
150 |
|
151 |
|
152 |
description = f'''
|
153 |
# Експериментальна апка для генерації аудіо з тексту.
|
154 |
|
155 |
pflow checkpoint {PFLOW_MODEL_PATH}
|
156 |
+
Vocos 44100 аудіо - {VOCODER44_MODEL_PATH}
|
157 |
+
Vocos 22050 аудіо - {VOCODER22_MODEL_PATH}
|
158 |
+
HIFIGAN 22050 аудіо - {HIFIGAN_MODEL_PATH}
|
159 |
'''
|
160 |
|
161 |
|
|
|
169 |
],
|
170 |
outputs=[
|
171 |
gr.Text(label='Фонемізований текст:', lines=5),
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
gr.Audio(
|
173 |
+
label="Vocos 44100 аудіо:",
|
174 |
+
autoplay=False,
|
175 |
+
streaming=False,
|
176 |
+
type="numpy",
|
177 |
+
),
|
178 |
+
gr.Audio(
|
179 |
+
label="Vocos 22050 аудіо:",
|
180 |
autoplay=False,
|
181 |
streaming=False,
|
182 |
type="numpy",
|
183 |
),
|
184 |
+
gr.Audio(
|
185 |
+
label="HIFIGAN 22050 аудіо:",
|
186 |
+
autoplay=False,
|
187 |
+
streaming=False,
|
188 |
+
type="numpy",
|
189 |
+
)
|
190 |
|
191 |
],
|
192 |
allow_flagging ='manual',
|
193 |
+
#flagging_options=[("Якщо дуже погоне аудіо, тисни цю кнопку.", "negative")],
|
194 |
cache_examples=True,
|
195 |
title='',
|
196 |
# description=description,
|
prompt.wav
DELETED
Binary file (112 kB)
|
|
prompt22050.wav
ADDED
Binary file (655 kB). View file
|
|