Spaces:
Running
Running
support timbre fusion (blending two speaker embeddings)
Browse files
- models/tts/vits/vits.py +16 -6
models/tts/vits/vits.py
CHANGED
@@ -317,12 +317,15 @@ class SynthesizerTrn(nn.Module):
|
|
317 |
"logs_q": logs_q,
|
318 |
}
|
319 |
return outputs
|
320 |
-
|
|
|
321 |
def infer(
|
322 |
self,
|
323 |
x,
|
324 |
x_lengths,
|
325 |
-
|
|
|
|
|
326 |
noise_scale=1,
|
327 |
length_scale=1,
|
328 |
noise_scale_w=1.0,
|
@@ -330,13 +333,20 @@ class SynthesizerTrn(nn.Module):
|
|
330 |
):
|
331 |
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
332 |
if self.n_speakers > 0:
|
333 |
-
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
else:
|
336 |
g = None
|
337 |
-
|
338 |
-
print('g.shape: ', g.shape)
|
339 |
|
|
|
|
|
340 |
if self.use_sdp:
|
341 |
logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
342 |
else:
|
|
|
317 |
"logs_q": logs_q,
|
318 |
}
|
319 |
return outputs
|
320 |
+
|
321 |
+
|
322 |
def infer(
|
323 |
self,
|
324 |
x,
|
325 |
x_lengths,
|
326 |
+
sid_1=None,
|
327 |
+
sid_2=None,
|
328 |
+
alpha=0.5,
|
329 |
noise_scale=1,
|
330 |
length_scale=1,
|
331 |
noise_scale_w=1.0,
|
|
|
333 |
):
|
334 |
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
335 |
if self.n_speakers > 0:
|
336 |
+
if sid_2 is None:
|
337 |
+
sid = sid_1.squeeze(-1)
|
338 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
339 |
+
else:
|
340 |
+
sid_1= sid_1.squeeze(-1)
|
341 |
+
g_1 = self.emb_g(sid_1).unsqueeze(-1)
|
342 |
+
sid_2= sid_2.squeeze(-1)
|
343 |
+
g_2 = self.emb_g(sid_2).unsqueeze(-1)
|
344 |
+
g = interpolate_embeddings(g_1,g_2,alpha)
|
345 |
else:
|
346 |
g = None
|
|
|
|
|
347 |
|
348 |
+
print('g.shape: ', g.shape)
|
349 |
+
|
350 |
if self.use_sdp:
|
351 |
logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
352 |
else:
|