Spaces:
Runtime error
Runtime error
Run on CPU
Browse files
app.py
CHANGED
@@ -53,7 +53,6 @@ def tts(text,
|
|
53 |
random_seed):
|
54 |
|
55 |
torch.manual_seed(random_seed)
|
56 |
-
torch.cuda.manual_seed(random_seed)
|
57 |
np.random.seed(random_seed)
|
58 |
|
59 |
text_len = len(text)
|
@@ -63,12 +62,12 @@ def tts(text,
|
|
63 |
else:
|
64 |
text = text_to_sequence(str(text), ["english_cleaners2"])
|
65 |
|
66 |
-
token = add_blank_token(text).unsqueeze(0)
|
67 |
-
token_length = torch.LongTensor([token.size(-1)])
|
68 |
|
69 |
# Prompt load
|
70 |
# sample_rate, audio = prompt
|
71 |
-
# audio = torch.FloatTensor([audio])
|
72 |
# if audio.shape[0] != 1:
|
73 |
# audio = audio[:1,:]
|
74 |
# audio = audio / 32768
|
@@ -89,28 +88,28 @@ def tts(text,
|
|
89 |
# If you have a memory issue during denosing the prompt, try to denoise the prompt with cpu before TTS
|
90 |
# We will have a plan to replace a memory-efficient denoiser
|
91 |
if denoise == 0:
|
92 |
-
audio = torch.cat([audio
|
93 |
else:
|
94 |
with torch.no_grad():
|
95 |
|
96 |
if ori_prompt_len > 80000:
|
97 |
denoised_audio = []
|
98 |
for i in range((ori_prompt_len//80000)):
|
99 |
-
denoised_audio.append(denoise(audio.squeeze(0)
|
100 |
|
101 |
-
denoised_audio.append(denoise(audio.squeeze(0)
|
102 |
denoised_audio = torch.cat(denoised_audio, dim=1)
|
103 |
else:
|
104 |
-
denoised_audio = denoise(audio.squeeze(0)
|
105 |
|
106 |
-
audio = torch.cat([audio
|
107 |
|
108 |
audio = audio[:,:ori_prompt_len] # 20231108 We found that large size of padding decreases a performance so we remove the paddings after denosing.
|
109 |
|
110 |
if audio.shape[-1]<48000:
|
111 |
audio = torch.cat([audio,audio,audio,audio,audio], dim=1)
|
112 |
|
113 |
-
src_mel = mel_fn(audio
|
114 |
|
115 |
src_length = torch.LongTensor([src_mel.size(2)]).to(device)
|
116 |
src_length2 = torch.cat([src_length,src_length], dim=0)
|
@@ -120,9 +119,9 @@ def tts(text,
|
|
120 |
w2v_x, pitch = text2w2v.infer_noise_control(token, token_length, src_mel, src_length2,
|
121 |
noise_scale=ttv_temperature, noise_scale_w=duratuion_temperature,
|
122 |
length_scale=duratuion_length, denoise_ratio=denoise_ratio)
|
123 |
-
src_length = torch.LongTensor([w2v_x.size(2)])
|
124 |
|
125 |
-
pitch[pitch<torch.log(torch.tensor([55])
|
126 |
|
127 |
## Hierarchical Speech Synthesizer (W2V, F0 --> 16k Audio)
|
128 |
converted_audio = \
|
@@ -165,7 +164,7 @@ def main():
|
|
165 |
a = parser.parse_args()
|
166 |
|
167 |
global device, hps, hps_t2w2v,h_sr,h_sr48, hps_denoiser
|
168 |
-
device =
|
169 |
|
170 |
hps = utils.get_hparams_from_file(os.path.join(os.path.split(a.ckpt)[0], 'config.json'))
|
171 |
hps_t2w2v = utils.get_hparams_from_file(os.path.join(os.path.split(a.ckpt_text2w2v)[0], 'config.json'))
|
@@ -184,27 +183,27 @@ def main():
|
|
184 |
f_max=hps.data.mel_fmax,
|
185 |
n_mels=hps.data.n_mel_channels,
|
186 |
window_fn=torch.hann_window
|
187 |
-
)
|
188 |
|
189 |
net_g = SynthesizerTrn(hps.data.filter_length // 2 + 1,
|
190 |
hps.train.segment_size // hps.data.hop_length,
|
191 |
-
**hps.model)
|
192 |
net_g.load_state_dict(torch.load(a.ckpt))
|
193 |
_ = net_g.eval()
|
194 |
|
195 |
text2w2v = Text2W2V(hps.data.filter_length // 2 + 1,
|
196 |
hps.train.segment_size // hps.data.hop_length,
|
197 |
-
**hps_t2w2v.model)
|
198 |
text2w2v.load_state_dict(torch.load(a.ckpt_text2w2v))
|
199 |
text2w2v.eval()
|
200 |
|
201 |
speechsr = SpeechSR48(h_sr48.data.n_mel_channels,
|
202 |
h_sr48.train.segment_size // h_sr48.data.hop_length,
|
203 |
-
**h_sr48.model)
|
204 |
utils.load_checkpoint(a.ckpt_sr48, speechsr, None)
|
205 |
speechsr.eval()
|
206 |
|
207 |
-
denoiser = MPNet(hps_denoiser)
|
208 |
state_dict = load_checkpoint(a.denoiser_ckpt, device)
|
209 |
denoiser.load_state_dict(state_dict['generator'])
|
210 |
denoiser.eval()
|
@@ -219,7 +218,7 @@ def main():
|
|
219 |
gr.Slider(0,1,0),
|
220 |
gr.Slider(0,9999,1111)],
|
221 |
outputs = 'audio',
|
222 |
-
title = 'HierSpeech++',
|
223 |
description = '''<div>
|
224 |
<p style="text-align: left"> HierSpeech++ is a zero-shot speech synthesis model.</p>
|
225 |
<p style="text-align: left"> Our model is trained with LibriTTS dataset so this model only supports english. We will release a multi-lingual HierSpeech++ soon.</p>
|
|
|
53 |
random_seed):
|
54 |
|
55 |
torch.manual_seed(random_seed)
|
|
|
56 |
np.random.seed(random_seed)
|
57 |
|
58 |
text_len = len(text)
|
|
|
62 |
else:
|
63 |
text = text_to_sequence(str(text), ["english_cleaners2"])
|
64 |
|
65 |
+
token = add_blank_token(text).unsqueeze(0)
|
66 |
+
token_length = torch.LongTensor([token.size(-1)])
|
67 |
|
68 |
# Prompt load
|
69 |
# sample_rate, audio = prompt
|
70 |
+
# audio = torch.FloatTensor([audio])
|
71 |
# if audio.shape[0] != 1:
|
72 |
# audio = audio[:1,:]
|
73 |
# audio = audio / 32768
|
|
|
88 |
# If you have a memory issue during denosing the prompt, try to denoise the prompt with cpu before TTS
|
89 |
# We will have a plan to replace a memory-efficient denoiser
|
90 |
if denoise == 0:
|
91 |
+
audio = torch.cat([audio, audio], dim=0)
|
92 |
else:
|
93 |
with torch.no_grad():
|
94 |
|
95 |
if ori_prompt_len > 80000:
|
96 |
denoised_audio = []
|
97 |
for i in range((ori_prompt_len//80000)):
|
98 |
+
denoised_audio.append(denoise(audio.squeeze(0)[i*80000:(i+1)*80000], denoiser, hps_denoiser))
|
99 |
|
100 |
+
denoised_audio.append(denoise(audio.squeeze(0)[(i+1)*80000:], denoiser, hps_denoiser))
|
101 |
denoised_audio = torch.cat(denoised_audio, dim=1)
|
102 |
else:
|
103 |
+
denoised_audio = denoise(audio.squeeze(0), denoiser, hps_denoiser)
|
104 |
|
105 |
+
audio = torch.cat([audio, denoised_audio[:,:audio.shape[-1]]], dim=0)
|
106 |
|
107 |
audio = audio[:,:ori_prompt_len] # 20231108 We found that large size of padding decreases a performance so we remove the paddings after denosing.
|
108 |
|
109 |
if audio.shape[-1]<48000:
|
110 |
audio = torch.cat([audio,audio,audio,audio,audio], dim=1)
|
111 |
|
112 |
+
src_mel = mel_fn(audio)
|
113 |
|
114 |
src_length = torch.LongTensor([src_mel.size(2)]).to(device)
|
115 |
src_length2 = torch.cat([src_length,src_length], dim=0)
|
|
|
119 |
w2v_x, pitch = text2w2v.infer_noise_control(token, token_length, src_mel, src_length2,
|
120 |
noise_scale=ttv_temperature, noise_scale_w=duratuion_temperature,
|
121 |
length_scale=duratuion_length, denoise_ratio=denoise_ratio)
|
122 |
+
src_length = torch.LongTensor([w2v_x.size(2)])
|
123 |
|
124 |
+
pitch[pitch<torch.log(torch.tensor([55]))] = 0
|
125 |
|
126 |
## Hierarchical Speech Synthesizer (W2V, F0 --> 16k Audio)
|
127 |
converted_audio = \
|
|
|
164 |
a = parser.parse_args()
|
165 |
|
166 |
global device, hps, hps_t2w2v,h_sr,h_sr48, hps_denoiser
|
167 |
+
device = 'cpu'
|
168 |
|
169 |
hps = utils.get_hparams_from_file(os.path.join(os.path.split(a.ckpt)[0], 'config.json'))
|
170 |
hps_t2w2v = utils.get_hparams_from_file(os.path.join(os.path.split(a.ckpt_text2w2v)[0], 'config.json'))
|
|
|
183 |
f_max=hps.data.mel_fmax,
|
184 |
n_mels=hps.data.n_mel_channels,
|
185 |
window_fn=torch.hann_window
|
186 |
+
)
|
187 |
|
188 |
net_g = SynthesizerTrn(hps.data.filter_length // 2 + 1,
|
189 |
hps.train.segment_size // hps.data.hop_length,
|
190 |
+
**hps.model)
|
191 |
net_g.load_state_dict(torch.load(a.ckpt))
|
192 |
_ = net_g.eval()
|
193 |
|
194 |
text2w2v = Text2W2V(hps.data.filter_length // 2 + 1,
|
195 |
hps.train.segment_size // hps.data.hop_length,
|
196 |
+
**hps_t2w2v.model)
|
197 |
text2w2v.load_state_dict(torch.load(a.ckpt_text2w2v))
|
198 |
text2w2v.eval()
|
199 |
|
200 |
speechsr = SpeechSR48(h_sr48.data.n_mel_channels,
|
201 |
h_sr48.train.segment_size // h_sr48.data.hop_length,
|
202 |
+
**h_sr48.model)
|
203 |
utils.load_checkpoint(a.ckpt_sr48, speechsr, None)
|
204 |
speechsr.eval()
|
205 |
|
206 |
+
denoiser = MPNet(hps_denoiser)
|
207 |
state_dict = load_checkpoint(a.denoiser_ckpt, device)
|
208 |
denoiser.load_state_dict(state_dict['generator'])
|
209 |
denoiser.eval()
|
|
|
218 |
gr.Slider(0,1,0),
|
219 |
gr.Slider(0,9999,1111)],
|
220 |
outputs = 'audio',
|
221 |
+
title = 'HierSpeech++ (CPU)',
|
222 |
description = '''<div>
|
223 |
<p style="text-align: left"> HierSpeech++ is a zero-shot speech synthesis model.</p>
|
224 |
<p style="text-align: left"> Our model is trained with LibriTTS dataset so this model only supports english. We will release a multi-lingual HierSpeech++ soon.</p>
|