Serhiy Stetskovych committed on
Commit
bd45f54
·
1 Parent(s): e42c405

Last changes

Browse files
Files changed (2) hide show
  1. app.py +5 -70
  2. prompt.wav +0 -0
app.py CHANGED
@@ -14,10 +14,6 @@ from hifigan.env import AttrDict
14
  from hifigan.models import Generator as HiFiGAN
15
 
16
 
17
- #from BigVGAN.models import BigVGAN
18
- #from BigVGAN.env import AttrDict as BigVGANAttrDict
19
-
20
-
21
  from pflow.models.pflow_tts import pflowTTS
22
  from pflow.text import text_to_sequence, sequence_to_text
23
  from pflow.utils.utils import intersperse
@@ -26,60 +22,14 @@ from pflow.utils.model import normalize
26
 
27
 
28
 
29
- BIGVGAN_CONFIG = {
30
- "resblock": "1",
31
- "num_gpus": 0,
32
- "batch_size": 32,
33
- "learning_rate": 0.0001,
34
- "adam_b1": 0.8,
35
- "adam_b2": 0.99,
36
- "lr_decay": 0.999,
37
- "seed": 1234,
38
-
39
- "upsample_rates": [4,4,2,2,2,2],
40
- "upsample_kernel_sizes": [8,8,4,4,4,4],
41
- "upsample_initial_channel": 1536,
42
- "resblock_kernel_sizes": [3,7,11],
43
- "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
44
-
45
- "activation": "snakebeta",
46
- "snake_logscale": True,
47
-
48
- "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
49
- "mpd_reshapes": [2, 3, 5, 7, 11],
50
- "use_spectral_norm": False,
51
- "discriminator_channel_mult": 1,
52
-
53
- "segment_size": 8192,
54
- "num_mels": 80,
55
- "num_freq": 1025,
56
- "n_fft": 1024,
57
- "hop_size": 256,
58
- "win_size": 1024,
59
-
60
- "sampling_rate": 22050,
61
-
62
- "fmin": 0,
63
- "fmax": 8000,
64
- "fmax_for_loss": None,
65
-
66
- "num_workers": 4,
67
-
68
- "dist_config": {
69
- "dist_backend": "nccl",
70
- "dist_url": "tcp://localhost:54321",
71
- "world_size": 1
72
- }
73
- }
74
-
75
  PFLOW_MODEL_PATH = 'checkpoint_epoch=649.ckpt'
76
- VOCODER_MODEL_PATH = 'g_00120000'
77
- VOCODER_BIGVGAN_MODEL_PATH = 'g_05000000'
78
 
 
79
  wav, sr = torchaudio.load('prompt.wav')
80
 
81
  prompt = mel_spectrogram(
82
- wav,
83
  1024,
84
  80,
85
  22050,
@@ -114,20 +64,6 @@ def load_hifigan(checkpoint_path, device):
114
  return hifigan
115
 
116
 
117
- def load_bigvgan(checkpoint_path, device):
118
- print("Loading '{}'".format(checkpoint_path))
119
- checkpoint_dict = torch.load(checkpoint_path, map_location=device)
120
-
121
-
122
- h = BigVGANAttrDict(BIGVGAN_CONFIG)
123
- torch.manual_seed(h.seed)
124
-
125
- generator = BigVGAN(h).to(device)
126
- generator.load_state_dict(checkpoint_dict['generator'])
127
- generator.eval()
128
- generator.remove_weight_norm()
129
- return generator
130
-
131
 
132
  def to_waveform(mel, vocoder, denoiser=None):
133
  audio = vocoder(mel).clamp(-1, 1)
@@ -154,9 +90,8 @@ def get_device():
154
  device = get_device()
155
  model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
156
  _ = model.eval()
157
- #vocoder = load_bigvgan(VOCODER_BIGVGAN_MODEL_PATH, device)
158
  vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
159
- denoiser = None #Denoiser(vocoder, mode="zeros")
160
 
161
  @torch.inference_mode()
162
  def synthesise(text, temperature, speed):
@@ -172,7 +107,7 @@ def synthesise(text, temperature, speed):
172
  temperature=temperature,
173
  length_scale=1/speed,
174
  prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
175
- guidance_scale=1.5
176
  )
177
  waveform = to_waveform(output["mel"], vocoder, denoiser)
178
 
 
14
  from hifigan.models import Generator as HiFiGAN
15
 
16
 
 
 
 
 
17
  from pflow.models.pflow_tts import pflowTTS
18
  from pflow.text import text_to_sequence, sequence_to_text
19
  from pflow.utils.utils import intersperse
 
22
 
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  PFLOW_MODEL_PATH = 'checkpoint_epoch=649.ckpt'
26
+ VOCODER_MODEL_PATH = 'g_00140000_m'
 
27
 
28
+ transform = torchaudio.transforms.Vol(gain=-32, gain_type="db")
29
  wav, sr = torchaudio.load('prompt.wav')
30
 
31
  prompt = mel_spectrogram(
32
+ transform(wav),
33
  1024,
34
  80,
35
  22050,
 
64
  return hifigan
65
 
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
  def to_waveform(mel, vocoder, denoiser=None):
69
  audio = vocoder(mel).clamp(-1, 1)
 
90
  device = get_device()
91
  model = pflowTTS.load_from_checkpoint(PFLOW_MODEL_PATH, map_location=device)
92
  _ = model.eval()
 
93
  vocoder = load_hifigan(VOCODER_MODEL_PATH, device)
94
+ denoiser = Denoiser(vocoder, mode="zeros")
95
 
96
  @torch.inference_mode()
97
  def synthesise(text, temperature, speed):
 
107
  temperature=temperature,
108
  length_scale=1/speed,
109
  prompt=normalize(prompt, model.mel_mean, model.mel_std).to(device),
110
+ guidance_scale=1.0
111
  )
112
  waveform = to_waveform(output["mel"], vocoder, denoiser)
113
 
prompt.wav CHANGED
Binary files a/prompt.wav and b/prompt.wav differ