Plachta committed on
Commit
4b86b18
1 Parent(s): 141e3fd

Update ONNXVITS_infer.py

Browse files
Files changed (1) hide show
  1. ONNXVITS_infer.py +6 -28
ONNXVITS_infer.py CHANGED
@@ -125,6 +125,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
125
  gin_channels=0,
126
  use_sdp=True,
127
  emotion_embedding=False,
 
128
  **kwargs):
129
 
130
  super().__init__(
@@ -149,6 +150,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
149
  use_sdp=use_sdp,
150
  **kwargs
151
  )
 
152
  self.enc_p = TextEncoder(n_vocab,
153
  inter_channels,
154
  hidden_channels,
@@ -172,7 +174,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
172
  g = None
173
 
174
  # logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
175
- logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
176
  logw = torch.from_numpy(logw[0])
177
 
178
  w = torch.exp(logw) * x_mask * length_scale
@@ -189,35 +191,11 @@ class SynthesizerTrn(models.SynthesizerTrn):
189
  z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
190
 
191
  # z = self.flow(z_p, y_mask, g=g, reverse=True)
192
- z = runonnx("ONNX_net/flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
193
  z = torch.from_numpy(z[0])
194
 
195
  # o = self.dec((z * y_mask)[:,:,:max_len], g=g)
196
- o = runonnx("ONNX_net/dec.onnx", z_in=(z * y_mask)[:, :, :max_len].numpy(), g=g.numpy())
197
  o = torch.from_numpy(o[0])
198
 
199
- return o, attn, y_mask, (z, z_p, m_p, logs_p)
200
-
201
- def predict_duration(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
202
- emotion_embedding=None):
203
- from ONNXVITS_utils import runonnx
204
-
205
- # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
206
- x, m_p, logs_p, x_mask = runonnx("ONNX_net/enc_p.onnx", x=x.numpy(), x_lengths=x_lengths.numpy())
207
- x = torch.from_numpy(x)
208
- m_p = torch.from_numpy(m_p)
209
- logs_p = torch.from_numpy(logs_p)
210
- x_mask = torch.from_numpy(x_mask)
211
-
212
- if self.n_speakers > 0:
213
- g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
214
- else:
215
- g = None
216
-
217
- # logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
218
- logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
219
- logw = torch.from_numpy(logw[0])
220
-
221
- w = torch.exp(logw) * x_mask * length_scale
222
- w_ceil = torch.ceil(w)
223
- return list(w_ceil.squeeze())
 
125
  gin_channels=0,
126
  use_sdp=True,
127
  emotion_embedding=False,
128
+ ONNX_dir="./ONNX_net/",
129
  **kwargs):
130
 
131
  super().__init__(
 
150
  use_sdp=use_sdp,
151
  **kwargs
152
  )
153
+ self.ONNX_dir = ONNX_dir
154
  self.enc_p = TextEncoder(n_vocab,
155
  inter_channels,
156
  hidden_channels,
 
174
  g = None
175
 
176
  # logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
177
+ logw = runonnx(f"{self.ONNX_dir}dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
178
  logw = torch.from_numpy(logw[0])
179
 
180
  w = torch.exp(logw) * x_mask * length_scale
 
191
  z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
192
 
193
  # z = self.flow(z_p, y_mask, g=g, reverse=True)
194
+ z = runonnx(f"{self.ONNX_dir}flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
195
  z = torch.from_numpy(z[0])
196
 
197
  # o = self.dec((z * y_mask)[:,:,:max_len], g=g)
198
+ o = runonnx(f"{self.ONNX_dir}dec.onnx", z_in=(z * y_mask)[:, :, :max_len].numpy(), g=g.numpy())
199
  o = torch.from_numpy(o[0])
200
 
201
+ return o, attn, y_mask, (z, z_p, m_p, logs_p)