Update ONNXVITS_infer.py
Browse files- ONNXVITS_infer.py +6 -28
ONNXVITS_infer.py
CHANGED
@@ -125,6 +125,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
125 |
gin_channels=0,
|
126 |
use_sdp=True,
|
127 |
emotion_embedding=False,
|
|
|
128 |
**kwargs):
|
129 |
|
130 |
super().__init__(
|
@@ -149,6 +150,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
149 |
use_sdp=use_sdp,
|
150 |
**kwargs
|
151 |
)
|
|
|
152 |
self.enc_p = TextEncoder(n_vocab,
|
153 |
inter_channels,
|
154 |
hidden_channels,
|
@@ -172,7 +174,7 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
172 |
g = None
|
173 |
|
174 |
# logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
175 |
-
logw = runonnx("
|
176 |
logw = torch.from_numpy(logw[0])
|
177 |
|
178 |
w = torch.exp(logw) * x_mask * length_scale
|
@@ -189,35 +191,11 @@ class SynthesizerTrn(models.SynthesizerTrn):
|
|
189 |
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
190 |
|
191 |
# z = self.flow(z_p, y_mask, g=g, reverse=True)
|
192 |
-
z = runonnx("
|
193 |
z = torch.from_numpy(z[0])
|
194 |
|
195 |
# o = self.dec((z * y_mask)[:,:,:max_len], g=g)
|
196 |
-
o = runonnx("
|
197 |
o = torch.from_numpy(o[0])
|
198 |
|
199 |
-
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
200 |
-
|
201 |
-
    def predict_duration(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None,
                         emotion_embedding=None):
        """Predict per-token durations for the input phoneme sequence using
        exported ONNX graphs instead of the in-process PyTorch modules.

        Runs the text encoder (``enc_p.onnx``) and the duration predictor
        (``dp.onnx``) via ``runonnx``, then converts log-durations to
        frame counts.

        Args:
            x: phoneme/token id tensor (converted with ``.numpy()`` before
                being fed to the ONNX session).
            x_lengths: lengths tensor for ``x``.
            sid: speaker id tensor; only used when ``self.n_speakers > 0``.
            noise_scale: unused in this method (kept for signature parity
                with the other inference entry points).
            length_scale: multiplier applied to the predicted durations.
            noise_scale_w: unused here — note the commented-out ``self.dp``
                call did consume it; the ONNX path does not.
            max_len: unused in this method.
            emotion_embedding: unused in this method.

        Returns:
            list: the ceil'd duration (frame count) per input token,
            obtained from ``w_ceil.squeeze()``.
        """
        # Local import keeps the ONNX runtime dependency out of module import time.
        from ONNXVITS_utils import runonnx

        # Text encoder: replaces `self.enc_p(x, x_lengths)` with the exported graph.
        # Outputs come back as numpy arrays and are converted to torch tensors.
        x, m_p, logs_p, x_mask = runonnx("ONNX_net/enc_p.onnx", x=x.numpy(), x_lengths=x_lengths.numpy())
        x = torch.from_numpy(x)
        m_p = torch.from_numpy(m_p)      # prior means (unused below; kept from the original flow)
        logs_p = torch.from_numpy(logs_p)  # prior log-variances (unused below)
        x_mask = torch.from_numpy(x_mask)

        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
        else:
            g = None

        # Duration predictor: replaces `self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)`.
        # NOTE(review): if self.n_speakers == 0 then g is None and g.numpy()
        # raises AttributeError — this path appears to assume a multi-speaker
        # model. Confirm dp.onnx's input signature before guarding/fixing.
        logw = runonnx("ONNX_net/dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
        logw = torch.from_numpy(logw[0])

        # log-durations -> durations, masked to real tokens and scaled by length_scale.
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        # One integral frame count per token (squeezed out of the [b, 1, t] layout).
        return list(w_ceil.squeeze())
|
|
|
125 |
gin_channels=0,
|
126 |
use_sdp=True,
|
127 |
emotion_embedding=False,
|
128 |
+
ONNX_dir="./ONNX_net/",
|
129 |
**kwargs):
|
130 |
|
131 |
super().__init__(
|
|
|
150 |
use_sdp=use_sdp,
|
151 |
**kwargs
|
152 |
)
|
153 |
+
self.ONNX_dir = ONNX_dir
|
154 |
self.enc_p = TextEncoder(n_vocab,
|
155 |
inter_channels,
|
156 |
hidden_channels,
|
|
|
174 |
g = None
|
175 |
|
176 |
# logw = self.dp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w)
|
177 |
+
logw = runonnx(f"{self.ONNX_dir}dp.onnx", x=x.numpy(), x_mask=x_mask.numpy(), g=g.numpy())
|
178 |
logw = torch.from_numpy(logw[0])
|
179 |
|
180 |
w = torch.exp(logw) * x_mask * length_scale
|
|
|
191 |
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
192 |
|
193 |
# z = self.flow(z_p, y_mask, g=g, reverse=True)
|
194 |
+
z = runonnx(f"{self.ONNX_dir}flow.onnx", z_p=z_p.numpy(), y_mask=y_mask.numpy(), g=g.numpy())
|
195 |
z = torch.from_numpy(z[0])
|
196 |
|
197 |
# o = self.dec((z * y_mask)[:,:,:max_len], g=g)
|
198 |
+
o = runonnx(f"{self.ONNX_dir}dec.onnx", z_in=(z * y_mask)[:, :, :max_len].numpy(), g=g.numpy())
|
199 |
o = torch.from_numpy(o[0])
|
200 |
|
201 |
+
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|