Serhiy Stetskovych committed on
Commit
7eda31a
1 Parent(s): ba9220a

Add vocos.

Browse files
app.py CHANGED
@@ -6,6 +6,7 @@ import gradio as gr
6
  import numpy as np
7
 
8
  import torch
 
9
 
10
 
11
  from hifigan.config import v1
@@ -19,17 +20,23 @@ from pflow.text import text_to_sequence, sequence_to_text
19
  from pflow.utils.utils import intersperse
20
  from pflow.data.text_mel_datamodule import mel_spectrogram
21
  from pflow.utils.model import normalize
 
22
 
23
 
24
 
25
- PFLOW_MODEL_PATH = 'checkpoint_epoch=649.ckpt'
26
- VOCODER_MODEL_PATH = 'g_00140000_m'
 
 
 
 
 
27
 
28
  transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
29
  wav, sr = torchaudio.load('prompt.wav')
30
 
31
  prompt = mel_spectrogram(
32
- transform(wav),
33
  1024,
34
  80,
35
  22050,
@@ -42,6 +49,7 @@ prompt = mel_spectrogram(
42
 
43
 
44
 
 
45
  def process_text(text: str, device: torch.device):
46
  x = torch.tensor(
47
  intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0),
@@ -65,12 +73,25 @@ def load_hifigan(checkpoint_path, device):
65
 
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
68
  def to_waveform(mel, vocoder, denoiser=None):
69
- audio = vocoder(mel).clamp(-1, 1)
70
- if denoiser is not None:
71
- audio = denoiser(audio.squeeze(), strength=0.00025).cpu().squeeze()
72
 
73
- return audio.cpu().squeeze()
 
 
 
 
74
 
75
 
76
 
@@ -90,11 +111,16 @@ def get_device():
90
  device = get_device()
91
  model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
92
  _ = model.eval()
93
- vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
94
- denoiser = Denoiser(vocoder, mode="zeros")
 
 
 
 
 
95
 
96
  @torch.inference_mode()
97
- def synthesise(text, temperature, speed):
98
  if len(text) > 1000:
99
  raise gr.Error("Текст повинен бути коротшим за 1000 символів.")
100
 
@@ -104,21 +130,27 @@ def synthesise(text, temperature, speed):
104
  text_processed["x"].to(device),
105
  text_processed["x_lengths"].to(device),
106
  n_timesteps=40,
107
- temperature=temperature,
108
  length_scale=1/speed,
109
  prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
110
- guidance_scale=1.0
 
 
111
  )
112
- waveform = to_waveform(output["mel"], vocoder, denoiser)
 
 
 
 
113
 
114
- return text_processed['x_phones'][1::2], (22050, waveform.numpy())
115
 
116
 
117
  description = f'''
118
  # Експериментальна апка для генерації аудіо з тексту.
119
 
120
  pflow checkpoint {PFLOW_MODEL_PATH}
121
- vocoder: HIFIGAN(трейнутий на датасеті, з нуля) - {VOCODER_MODEL_PATH}
122
  '''
123
 
124
 
@@ -128,17 +160,28 @@ if __name__ == "__main__":
128
  description=description,
129
  inputs=[
130
  gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
131
- gr.Slider(minimum=0.0, maximum=1.0, label="Температура", value=0.4),
132
  gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0)
133
  ],
134
  outputs=[
135
  gr.Text(label='Фонемізований текст:', lines=5),
 
 
 
 
 
 
136
  gr.Audio(
137
- label="Згенероване аудіо:",
138
  autoplay=False,
139
  streaming=False,
140
  type="numpy",
141
- )
 
 
 
 
 
 
142
 
143
  ],
144
  allow_flagging ='manual',
 
6
  import numpy as np
7
 
8
  import torch
9
+ import json
10
 
11
 
12
  from hifigan.config import v1
 
20
  from pflow.utils.utils import intersperse
21
  from pflow.data.text_mel_datamodule import mel_spectrogram
22
  from pflow.utils.model import normalize
23
+ from vocos import Vocos
24
 
25
 
26
 
27
+
28
+
29
+ PFLOW_MODEL_PATH = 'checkpoints/checkpoint_epoch=649.ckpt'
30
+ #PFLOW_MODEL_PATH = 'checkpoint_m_epoch=054.ckpt'
31
+ VOCODER_MODEL_PATH = 'checkpoints/pytorch_model.bin'
32
+ HIFIGAN_MODEL_PATH = 'checkpoints/g_00120000'
33
+
34
 
35
  transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
36
  wav, sr = torchaudio.load('prompt.wav')
37
 
38
  prompt = mel_spectrogram(
39
+ wav,
40
  1024,
41
  80,
42
  22050,
 
49
 
50
 
51
 
52
+
53
  def process_text(text: str, device: torch.device):
54
  x = torch.tensor(
55
  intersperse(text_to_sequence(text, ["ukr_cleaners"]), 0),
 
73
 
74
 
75
 
76
+
77
def load_vocos(checkpoint_path, config_path, device):
    """Build a Vocos vocoder from a config file and load checkpoint weights.

    Args:
        checkpoint_path: Path to a file readable by ``torch.load``. It may be
            either a bare state dict or a dict that stores the weights under
            the ``'state_dict'`` key.
        config_path: Path to the hyper-parameter file handed to
            ``Vocos.from_hparams``.
        device: Device the returned model is moved to. ``None`` leaves the
            model where it was created.

    Returns:
        The Vocos model, switched to eval mode and placed on ``device``.
    """
    model = Vocos.from_hparams(config_path)

    # Deserialize on CPU first so loading does not depend on the device the
    # checkpoint was saved from; the model is moved to `device` afterwards.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    state_dict = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint

    # NOTE: strict=False means missing/unexpected keys are skipped instead of
    # raising, so a mismatched checkpoint loads without an error.
    model.load_state_dict(state_dict, strict=False)
    model.eval()

    # `device` used to be ignored, leaving the vocoder on CPU while the mel
    # spectrograms it decodes are produced on `device`.
    if device is not None:
        model = model.to(device)
    return model
85
+
86
+
87
def to_waveform(mel, vocoder, denoiser=None):
    """Decode a mel spectrogram into a squeezed waveform tensor on the CPU.

    Args:
        mel: Mel spectrogram passed unchanged to ``vocoder.decode``.
        vocoder: Object exposing a ``decode(mel)`` method that returns a tensor.
        denoiser: Accepted for call compatibility; not used by this
            implementation.

    Returns:
        The decoded audio moved to the CPU with singleton dimensions removed.
    """
    # The earlier HiFi-GAN path (calling the vocoder directly, clamping to
    # [-1, 1] and optionally denoising) is no longer used here.
    decoded = vocoder.decode(mel)
    on_cpu = decoded.cpu()
    return on_cpu.squeeze()
95
 
96
 
97
 
 
111
  device = get_device()
112
  model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
113
  _ = model.eval()
114
+
115
+
116
+ #hifigan = load_hifigan(HIFIGAN_MODEL_PATH, device)
117
+ vocos = load_vocos(VOCODER_MODEL_PATH, 'config.yaml', device)
118
+ #vocos_44100 = load_vocos('checkpoints/vocos_checkpoint_epoch=4_step=93440_val_loss=5.2596_44100_10.ckpt', 'vocos.yaml', device)
119
+ denoiser = None#Denoiser(vocoder, mode="zeros")
120
+
121
 
122
  @torch.inference_mode()
123
+ def synthesise(text, speed):
124
  if len(text) > 1000:
125
  raise gr.Error("Текст повинен бути коротшим за 1000 символів.")
126
 
 
130
  text_processed["x"].to(device),
131
  text_processed["x_lengths"].to(device),
132
  n_timesteps=40,
133
+ temperature=0.0,
134
  length_scale=1/speed,
135
  prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
136
+
137
+ guidance_scale=1.5
138
+
139
  )
140
+ waveform_vocos = vocos.decode(output["mel"]).cpu().squeeze()
141
+ #waveform_vocos_44100 = vocos_44100.decode(output["mel"]).cpu().squeeze()
142
+ #waveform_hifigan = hifigan(output["mel"]).clamp(-1, 1).cpu().squeeze()
143
+ #transform = torchaudio.transforms.Vol(gain=-18, gain_type="db")
144
+
145
 
146
+ return text_processed['x_phones'][1::2], (22050, waveform_vocos.numpy())
147
 
148
 
149
  description = f'''
150
  # Експериментальна апка для генерації аудіо з тексту.
151
 
152
  pflow checkpoint {PFLOW_MODEL_PATH}
153
+ vocoder: Vocos - {VOCODER_MODEL_PATH}
154
  '''
155
 
156
 
 
160
  description=description,
161
  inputs=[
162
  gr.Text(label='Текст для синтезу:', lines=5, max_lines=10),
 
163
  gr.Slider(minimum=0.6, maximum=2.0, label="Швидкість", value=1.0)
164
  ],
165
  outputs=[
166
  gr.Text(label='Фонемізований текст:', lines=5),
167
+ # gr.Audio(
168
+ # label="Vocos 44100 аудіо:",
169
+ # autoplay=False,
170
+ # streaming=False,
171
+ # type="numpy",
172
+ # ),
173
  gr.Audio(
174
+ label="Vocos аудіо:",
175
  autoplay=False,
176
  streaming=False,
177
  type="numpy",
178
+ ),
179
+ # gr.Audio(
180
+ # label="HIFIGAN аудіо:",
181
+ # autoplay=False,
182
+ # streaming=False,
183
+ # type="numpy",
184
+ # )
185
 
186
  ],
187
  allow_flagging ='manual',
checkpoint_epoch=499.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:39051170c6c0d9abce47d0073f796912d5ce3854ade8f707cb30333f50160d99
3
- size 279562867
 
 
 
 
checkpoint_epoch=649.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f86bc69330121d97d876f8cc38a8f7c36c443be40b2b0b4389b9684d4c351c6a
3
- size 279563122
 
 
 
 
g_00120000 → checkpoints/pytorch_model.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f25c6dbc515ed387edd5d2e5683a50510aa33986e8a79273efe1216084f0f078
3
- size 55824433
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0af7b6f4b153819ada44a917135acf33944cdbb70cde0701eda3d100153799c7
3
+ size 54051047
config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pytorch_lightning==1.8.6
2
+
3
+ feature_extractor:
4
+ class_path: vocos.feature_extractors.MelSpectrogramFeatures
5
+ init_args:
6
+ sample_rate: 22050
7
+ n_fft: 1024
8
+ hop_length: 256
9
+ n_mels: 80
10
+ padding: same
11
+ f_min: 0
12
+ f_max: 8000
13
+ norm: "slaney"
14
+ mel_scale: "slaney"
15
+
16
+
17
+ backbone:
18
+ class_path: vocos.models.VocosBackbone
19
+ init_args:
20
+ input_channels: 80
21
+ dim: 512
22
+ intermediate_dim: 1536
23
+ num_layers: 8
24
+
25
+ head:
26
+ class_path: vocos.heads.ISTFTHead
27
+ init_args:
28
+ dim: 512
29
+ n_fft: 1024
30
+ hop_length: 256
31
+ padding: same
32
+
33
+
g_00140000_m DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4edf30ac1bbb52cd250f0c38d615df23978d39ee8415c2a4c636344367adfd1
3
- size 55824433