zyingt committed on
Commit
ade41ec
1 Parent(s): 73880c7

support timbre confusion

Browse files
Files changed (1) hide show
  1. models/tts/vits/vits.py +16 -6
models/tts/vits/vits.py CHANGED
@@ -317,12 +317,15 @@ class SynthesizerTrn(nn.Module):
317
  "logs_q": logs_q,
318
  }
319
  return outputs
320
-
 
321
  def infer(
322
  self,
323
  x,
324
  x_lengths,
325
- sid=None,
 
 
326
  noise_scale=1,
327
  length_scale=1,
328
  noise_scale_w=1.0,
@@ -330,13 +333,20 @@ class SynthesizerTrn(nn.Module):
330
  ):
331
  x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
332
  if self.n_speakers > 0:
333
- sid = sid.squeeze(-1)
334
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
 
 
 
 
 
 
 
335
  else:
336
  g = None
337
-
338
- print('g.shape: ', g.shape)
339
 
 
 
340
  if self.use_sdp:
341
  logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
342
  else:
 
317
  "logs_q": logs_q,
318
  }
319
  return outputs
320
+
321
+
322
  def infer(
323
  self,
324
  x,
325
  x_lengths,
326
+ sid_1=None,
327
+ sid_2=None,
328
+ alpha=0.5,
329
  noise_scale=1,
330
  length_scale=1,
331
  noise_scale_w=1.0,
 
333
  ):
334
  x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
335
  if self.n_speakers > 0:
336
+ if sid_2 is None:
337
+ sid = sid_1.squeeze(-1)
338
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
339
+ else:
340
+ sid_1= sid_1.squeeze(-1)
341
+ g_1 = self.emb_g(sid_1).unsqueeze(-1)
342
+ sid_2= sid_2.squeeze(-1)
343
+ g_2 = self.emb_g(sid_2).unsqueeze(-1)
344
+ g = interpolate_embeddings(g_1,g_2,alpha)
345
  else:
346
  g = None
 
 
347
 
348
+ print('g.shape: ', g.shape)
349
+
350
  if self.use_sdp:
351
  logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
352
  else: