BlairLeng committed on
Commit
2e33d6a
•
1 Parent(s): 5f735a0
Files changed (42)
  1. .DS_Store +0 -0
  2. src/app.py → app.py +0 -0
  3. src/app_more_general.py → app_more_general.py +0 -0
  4. src/app_withAudio.py → app_withAudio.py +0 -0
  5. src/app_with_text_preload.py → app_with_text_preload.py +0 -0
  6. src/chenxi_update_log.txt → chenxi_update_log.txt +0 -0
  7. src/map_embeddings_with_colors.py → map_embeddings_with_colors.py +0 -0
  8. {src/pkl → pkl}/.DS_Store +0 -0
  9. {src/pkl → pkl}/dict_text.pkl +0 -0
  10. {src/pkl → pkl}/embeds.jsonl +0 -0
  11. {src/pkl → pkl}/embeds2.jsonl +0 -0
  12. {src/pkl → pkl}/maps.pkl +0 -0
  13. {src/pkl → pkl}/pkl文件解释.txt +0 -0
  14. {src/pkl → pkl}/text_image.pkl +0 -0
  15. {src/pkl → pkl}/texts.jsonl +0 -0
  16. {src/pkl → pkl}/title_to_text.pkl +0 -0
  17. src/.DS_Store +0 -0
  18. src/tts_vits/Readme.md +0 -22
  19. src/tts_vits/attentions.py +0 -303
  20. src/tts_vits/commons.py +0 -188
  21. src/tts_vits/configs/config.json +0 -90
  22. src/tts_vits/hubert/__init__.py +0 -0
  23. src/tts_vits/hubert/hubert_model.py +0 -222
  24. src/tts_vits/hubert/hubert_model_onnx.py +0 -217
  25. src/tts_vits/hubert/put_hubert_ckpt_here +0 -0
  26. src/tts_vits/inference/__init__.py +0 -0
  27. src/tts_vits/inference/chunks_temp.json +0 -1
  28. src/tts_vits/inference/infer_tool.py +0 -326
  29. src/tts_vits/inference/infer_tool_grad.py +0 -160
  30. src/tts_vits/inference/slicer.py +0 -145
  31. src/tts_vits/inference_main.py +0 -55
  32. src/tts_vits/models.py +0 -351
  33. src/tts_vits/modules.py +0 -342
  34. src/tts_vits/requirements.txt +0 -16
  35. src/tts_vits/utils.py +0 -338
  36. src/tts_vits/vdecoder/__init__.py +0 -0
  37. src/tts_vits/vdecoder/hifigan/env.py +0 -15
  38. src/tts_vits/vdecoder/hifigan/models.py +0 -503
  39. src/tts_vits/vdecoder/hifigan/nvSTFT.py +0 -111
  40. src/tts_vits/vdecoder/hifigan/utils.py +0 -68
  41. src/tts_vits/vits_haruhi.py +0 -51
  42. src/text.py → text.py +0 -0
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
src/app.py → app.py RENAMED
File without changes
src/app_more_general.py → app_more_general.py RENAMED
File without changes
src/app_withAudio.py → app_withAudio.py RENAMED
File without changes
src/app_with_text_preload.py → app_with_text_preload.py RENAMED
File without changes
src/chenxi_update_log.txt → chenxi_update_log.txt RENAMED
File without changes
src/map_embeddings_with_colors.py → map_embeddings_with_colors.py RENAMED
File without changes
{src/pkl → pkl}/.DS_Store RENAMED
File without changes
{src/pkl → pkl}/dict_text.pkl RENAMED
File without changes
{src/pkl → pkl}/embeds.jsonl RENAMED
File without changes
{src/pkl → pkl}/embeds2.jsonl RENAMED
File without changes
{src/pkl → pkl}/maps.pkl RENAMED
File without changes
{src/pkl → pkl}/pkl文件解释.txt RENAMED
File without changes
{src/pkl → pkl}/text_image.pkl RENAMED
File without changes
{src/pkl → pkl}/texts.jsonl RENAMED
File without changes
{src/pkl → pkl}/title_to_text.pkl RENAMED
File without changes
src/.DS_Store DELETED
Binary file (8.2 kB)
 
src/tts_vits/Readme.md DELETED
@@ -1,22 +0,0 @@
- ## VITS voice conversion for Haruhi Suzumiya
-
- ### Environment setup
-
- ```
- pip install -r requirements.txt
- ```
-
- ## Model download
-
- [Model](https://huggingface.co/scixing/Haruhi_Vits/blob/main/Haruhi_54000.pth): load this model in the program with set_model_path
-
- [hubert model](https://huggingface.co/scixing/Haruhi_Vits/blob/main/hubert-soft-0d54a1f4.pt): place this model in the `tts_vits\hubert` folder
-
- ## Usage
-
- ```python
- # Set the model path
- set_model_path("vits_models/Haruhi_54000.pth")
- # Generate speech: the first argument is the text, the second is the pitch
- vits_haruhi("真実はいつもひとつ", 8)
- ```
src/tts_vits/attentions.py DELETED
@@ -1,303 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import torch
5
- from torch import nn
6
- from torch.nn import functional as F
7
-
8
- import commons
9
- import modules
10
- from modules import LayerNorm
11
-
12
-
13
- class Encoder(nn.Module):
14
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
15
- super().__init__()
16
- self.hidden_channels = hidden_channels
17
- self.filter_channels = filter_channels
18
- self.n_heads = n_heads
19
- self.n_layers = n_layers
20
- self.kernel_size = kernel_size
21
- self.p_dropout = p_dropout
22
- self.window_size = window_size
23
-
24
- self.drop = nn.Dropout(p_dropout)
25
- self.attn_layers = nn.ModuleList()
26
- self.norm_layers_1 = nn.ModuleList()
27
- self.ffn_layers = nn.ModuleList()
28
- self.norm_layers_2 = nn.ModuleList()
29
- for i in range(self.n_layers):
30
- self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
31
- self.norm_layers_1.append(LayerNorm(hidden_channels))
32
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
33
- self.norm_layers_2.append(LayerNorm(hidden_channels))
34
-
35
- def forward(self, x, x_mask):
36
- attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
37
- x = x * x_mask
38
- for i in range(self.n_layers):
39
- y = self.attn_layers[i](x, x, attn_mask)
40
- y = self.drop(y)
41
- x = self.norm_layers_1[i](x + y)
42
-
43
- y = self.ffn_layers[i](x, x_mask)
44
- y = self.drop(y)
45
- x = self.norm_layers_2[i](x + y)
46
- x = x * x_mask
47
- return x
48
-
49
-
50
- class Decoder(nn.Module):
51
- def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
52
- super().__init__()
53
- self.hidden_channels = hidden_channels
54
- self.filter_channels = filter_channels
55
- self.n_heads = n_heads
56
- self.n_layers = n_layers
57
- self.kernel_size = kernel_size
58
- self.p_dropout = p_dropout
59
- self.proximal_bias = proximal_bias
60
- self.proximal_init = proximal_init
61
-
62
- self.drop = nn.Dropout(p_dropout)
63
- self.self_attn_layers = nn.ModuleList()
64
- self.norm_layers_0 = nn.ModuleList()
65
- self.encdec_attn_layers = nn.ModuleList()
66
- self.norm_layers_1 = nn.ModuleList()
67
- self.ffn_layers = nn.ModuleList()
68
- self.norm_layers_2 = nn.ModuleList()
69
- for i in range(self.n_layers):
70
- self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
71
- self.norm_layers_0.append(LayerNorm(hidden_channels))
72
- self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
73
- self.norm_layers_1.append(LayerNorm(hidden_channels))
74
- self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
75
- self.norm_layers_2.append(LayerNorm(hidden_channels))
76
-
77
- def forward(self, x, x_mask, h, h_mask):
78
- """
79
- x: decoder input
80
- h: encoder output
81
- """
82
- self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
83
- encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
84
- x = x * x_mask
85
- for i in range(self.n_layers):
86
- y = self.self_attn_layers[i](x, x, self_attn_mask)
87
- y = self.drop(y)
88
- x = self.norm_layers_0[i](x + y)
89
-
90
- y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
91
- y = self.drop(y)
92
- x = self.norm_layers_1[i](x + y)
93
-
94
- y = self.ffn_layers[i](x, x_mask)
95
- y = self.drop(y)
96
- x = self.norm_layers_2[i](x + y)
97
- x = x * x_mask
98
- return x
99
-
100
-
101
- class MultiHeadAttention(nn.Module):
102
- def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
103
- super().__init__()
104
- assert channels % n_heads == 0
105
-
106
- self.channels = channels
107
- self.out_channels = out_channels
108
- self.n_heads = n_heads
109
- self.p_dropout = p_dropout
110
- self.window_size = window_size
111
- self.heads_share = heads_share
112
- self.block_length = block_length
113
- self.proximal_bias = proximal_bias
114
- self.proximal_init = proximal_init
115
- self.attn = None
116
-
117
- self.k_channels = channels // n_heads
118
- self.conv_q = nn.Conv1d(channels, channels, 1)
119
- self.conv_k = nn.Conv1d(channels, channels, 1)
120
- self.conv_v = nn.Conv1d(channels, channels, 1)
121
- self.conv_o = nn.Conv1d(channels, out_channels, 1)
122
- self.drop = nn.Dropout(p_dropout)
123
-
124
- if window_size is not None:
125
- n_heads_rel = 1 if heads_share else n_heads
126
- rel_stddev = self.k_channels**-0.5
127
- self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
128
- self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
129
-
130
- nn.init.xavier_uniform_(self.conv_q.weight)
131
- nn.init.xavier_uniform_(self.conv_k.weight)
132
- nn.init.xavier_uniform_(self.conv_v.weight)
133
- if proximal_init:
134
- with torch.no_grad():
135
- self.conv_k.weight.copy_(self.conv_q.weight)
136
- self.conv_k.bias.copy_(self.conv_q.bias)
137
-
138
- def forward(self, x, c, attn_mask=None):
139
- q = self.conv_q(x)
140
- k = self.conv_k(c)
141
- v = self.conv_v(c)
142
-
143
- x, self.attn = self.attention(q, k, v, mask=attn_mask)
144
-
145
- x = self.conv_o(x)
146
- return x
147
-
148
- def attention(self, query, key, value, mask=None):
149
- # reshape [b, d, t] -> [b, n_h, t, d_k]
150
- b, d, t_s, t_t = (*key.size(), query.size(2))
151
- query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
152
- key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
153
- value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
154
-
155
- scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
156
- if self.window_size is not None:
157
- assert t_s == t_t, "Relative attention is only available for self-attention."
158
- key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
159
- rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
160
- scores_local = self._relative_position_to_absolute_position(rel_logits)
161
- scores = scores + scores_local
162
- if self.proximal_bias:
163
- assert t_s == t_t, "Proximal bias is only available for self-attention."
164
- scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
165
- if mask is not None:
166
- scores = scores.masked_fill(mask == 0, -1e4)
167
- if self.block_length is not None:
168
- assert t_s == t_t, "Local attention is only available for self-attention."
169
- block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
170
- scores = scores.masked_fill(block_mask == 0, -1e4)
171
- p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
172
- p_attn = self.drop(p_attn)
173
- output = torch.matmul(p_attn, value)
174
- if self.window_size is not None:
175
- relative_weights = self._absolute_position_to_relative_position(p_attn)
176
- value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
177
- output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
178
- output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
179
- return output, p_attn
180
-
181
- def _matmul_with_relative_values(self, x, y):
182
- """
183
- x: [b, h, l, m]
184
- y: [h or 1, m, d]
185
- ret: [b, h, l, d]
186
- """
187
- ret = torch.matmul(x, y.unsqueeze(0))
188
- return ret
189
-
190
- def _matmul_with_relative_keys(self, x, y):
191
- """
192
- x: [b, h, l, d]
193
- y: [h or 1, m, d]
194
- ret: [b, h, l, m]
195
- """
196
- ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
197
- return ret
198
-
199
- def _get_relative_embeddings(self, relative_embeddings, length):
200
- max_relative_position = 2 * self.window_size + 1
201
- # Pad first before slice to avoid using cond ops.
202
- pad_length = max(length - (self.window_size + 1), 0)
203
- slice_start_position = max((self.window_size + 1) - length, 0)
204
- slice_end_position = slice_start_position + 2 * length - 1
205
- if pad_length > 0:
206
- padded_relative_embeddings = F.pad(
207
- relative_embeddings,
208
- commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
209
- else:
210
- padded_relative_embeddings = relative_embeddings
211
- used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
212
- return used_relative_embeddings
213
-
214
- def _relative_position_to_absolute_position(self, x):
215
- """
216
- x: [b, h, l, 2*l-1]
217
- ret: [b, h, l, l]
218
- """
219
- batch, heads, length, _ = x.size()
220
- # Concat columns of pad to shift from relative to absolute indexing.
221
- x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
222
-
223
- # Concat extra elements so to add up to shape (len+1, 2*len-1).
224
- x_flat = x.view([batch, heads, length * 2 * length])
225
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
226
-
227
- # Reshape and slice out the padded elements.
228
- x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
229
- return x_final
230
-
231
- def _absolute_position_to_relative_position(self, x):
232
- """
233
- x: [b, h, l, l]
234
- ret: [b, h, l, 2*l-1]
235
- """
236
- batch, heads, length, _ = x.size()
237
- # padd along column
238
- x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
239
- x_flat = x.view([batch, heads, length**2 + length*(length -1)])
240
- # add 0's in the beginning that will skew the elements after reshape
241
- x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
242
- x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
243
- return x_final
244
-
245
- def _attention_bias_proximal(self, length):
246
- """Bias for self-attention to encourage attention to close positions.
247
- Args:
248
- length: an integer scalar.
249
- Returns:
250
- a Tensor with shape [1, 1, length, length]
251
- """
252
- r = torch.arange(length, dtype=torch.float32)
253
- diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
254
- return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
255
-
256
-
257
- class FFN(nn.Module):
258
- def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
259
- super().__init__()
260
- self.in_channels = in_channels
261
- self.out_channels = out_channels
262
- self.filter_channels = filter_channels
263
- self.kernel_size = kernel_size
264
- self.p_dropout = p_dropout
265
- self.activation = activation
266
- self.causal = causal
267
-
268
- if causal:
269
- self.padding = self._causal_padding
270
- else:
271
- self.padding = self._same_padding
272
-
273
- self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
274
- self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
275
- self.drop = nn.Dropout(p_dropout)
276
-
277
- def forward(self, x, x_mask):
278
- x = self.conv_1(self.padding(x * x_mask))
279
- if self.activation == "gelu":
280
- x = x * torch.sigmoid(1.702 * x)
281
- else:
282
- x = torch.relu(x)
283
- x = self.drop(x)
284
- x = self.conv_2(self.padding(x * x_mask))
285
- return x * x_mask
286
-
287
- def _causal_padding(self, x):
288
- if self.kernel_size == 1:
289
- return x
290
- pad_l = self.kernel_size - 1
291
- pad_r = 0
292
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
293
- x = F.pad(x, commons.convert_pad_shape(padding))
294
- return x
295
-
296
- def _same_padding(self, x):
297
- if self.kernel_size == 1:
298
- return x
299
- pad_l = (self.kernel_size - 1) // 2
300
- pad_r = self.kernel_size // 2
301
- padding = [[0, 0], [0, 0], [pad_l, pad_r]]
302
- x = F.pad(x, commons.convert_pad_shape(padding))
303
- return x
src/tts_vits/commons.py DELETED
@@ -1,188 +0,0 @@
1
- import math
2
- import numpy as np
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- def slice_pitch_segments(x, ids_str, segment_size=4):
8
- ret = torch.zeros_like(x[:, :segment_size])
9
- for i in range(x.size(0)):
10
- idx_str = ids_str[i]
11
- idx_end = idx_str + segment_size
12
- ret[i] = x[i, idx_str:idx_end]
13
- return ret
14
-
15
- def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
16
- b, d, t = x.size()
17
- if x_lengths is None:
18
- x_lengths = t
19
- ids_str_max = x_lengths - segment_size + 1
20
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
21
- ret = slice_segments(x, ids_str, segment_size)
22
- ret_pitch = slice_pitch_segments(pitch, ids_str, segment_size)
23
- return ret, ret_pitch, ids_str
24
-
25
- def init_weights(m, mean=0.0, std=0.01):
26
- classname = m.__class__.__name__
27
- if classname.find("Conv") != -1:
28
- m.weight.data.normal_(mean, std)
29
-
30
-
31
- def get_padding(kernel_size, dilation=1):
32
- return int((kernel_size*dilation - dilation)/2)
33
-
34
-
35
- def convert_pad_shape(pad_shape):
36
- l = pad_shape[::-1]
37
- pad_shape = [item for sublist in l for item in sublist]
38
- return pad_shape
39
-
40
-
41
- def intersperse(lst, item):
42
- result = [item] * (len(lst) * 2 + 1)
43
- result[1::2] = lst
44
- return result
45
-
46
-
47
- def kl_divergence(m_p, logs_p, m_q, logs_q):
48
- """KL(P||Q)"""
49
- kl = (logs_q - logs_p) - 0.5
50
- kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
51
- return kl
52
-
53
-
54
- def rand_gumbel(shape):
55
- """Sample from the Gumbel distribution, protect from overflows."""
56
- uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
57
- return -torch.log(-torch.log(uniform_samples))
58
-
59
-
60
- def rand_gumbel_like(x):
61
- g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
62
- return g
63
-
64
-
65
- def slice_segments(x, ids_str, segment_size=4):
66
- ret = torch.zeros_like(x[:, :, :segment_size])
67
- for i in range(x.size(0)):
68
- idx_str = ids_str[i]
69
- idx_end = idx_str + segment_size
70
- ret[i] = x[i, :, idx_str:idx_end]
71
- return ret
72
-
73
-
74
- def rand_slice_segments(x, x_lengths=None, segment_size=4):
75
- b, d, t = x.size()
76
- if x_lengths is None:
77
- x_lengths = t
78
- ids_str_max = x_lengths - segment_size + 1
79
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
80
- ret = slice_segments(x, ids_str, segment_size)
81
- return ret, ids_str
82
-
83
-
84
- def rand_spec_segments(x, x_lengths=None, segment_size=4):
85
- b, d, t = x.size()
86
- if x_lengths is None:
87
- x_lengths = t
88
- ids_str_max = x_lengths - segment_size
89
- ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
90
- ret = slice_segments(x, ids_str, segment_size)
91
- return ret, ids_str
92
-
93
-
94
- def get_timing_signal_1d(
95
- length, channels, min_timescale=1.0, max_timescale=1.0e4):
96
- position = torch.arange(length, dtype=torch.float)
97
- num_timescales = channels // 2
98
- log_timescale_increment = (
99
- math.log(float(max_timescale) / float(min_timescale)) /
100
- (num_timescales - 1))
101
- inv_timescales = min_timescale * torch.exp(
102
- torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
103
- scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
104
- signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
105
- signal = F.pad(signal, [0, 0, 0, channels % 2])
106
- signal = signal.view(1, channels, length)
107
- return signal
108
-
109
-
110
- def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
111
- b, channels, length = x.size()
112
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
113
- return x + signal.to(dtype=x.dtype, device=x.device)
114
-
115
-
116
- def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
117
- b, channels, length = x.size()
118
- signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
119
- return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
120
-
121
-
122
- def subsequent_mask(length):
123
- mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
124
- return mask
125
-
126
-
127
- @torch.jit.script
128
- def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
129
- n_channels_int = n_channels[0]
130
- in_act = input_a + input_b
131
- t_act = torch.tanh(in_act[:, :n_channels_int, :])
132
- s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
133
- acts = t_act * s_act
134
- return acts
135
-
136
-
137
- def convert_pad_shape(pad_shape):
138
- l = pad_shape[::-1]
139
- pad_shape = [item for sublist in l for item in sublist]
140
- return pad_shape
141
-
142
-
143
- def shift_1d(x):
144
- x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
145
- return x
146
-
147
-
148
- def sequence_mask(length, max_length=None):
149
- if max_length is None:
150
- max_length = length.max()
151
- x = torch.arange(max_length, dtype=length.dtype, device=length.device)
152
- return x.unsqueeze(0) < length.unsqueeze(1)
153
-
154
-
155
- def generate_path(duration, mask):
156
- """
157
- duration: [b, 1, t_x]
158
- mask: [b, 1, t_y, t_x]
159
- """
160
- device = duration.device
161
-
162
- b, _, t_y, t_x = mask.shape
163
- cum_duration = torch.cumsum(duration, -1)
164
-
165
- cum_duration_flat = cum_duration.view(b * t_x)
166
- path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
167
- path = path.view(b, t_x, t_y)
168
- path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
169
- path = path.unsqueeze(1).transpose(2,3) * mask
170
- return path
171
-
172
-
173
- def clip_grad_value_(parameters, clip_value, norm_type=2):
174
- if isinstance(parameters, torch.Tensor):
175
- parameters = [parameters]
176
- parameters = list(filter(lambda p: p.grad is not None, parameters))
177
- norm_type = float(norm_type)
178
- if clip_value is not None:
179
- clip_value = float(clip_value)
180
-
181
- total_norm = 0
182
- for p in parameters:
183
- param_norm = p.grad.data.norm(norm_type)
184
- total_norm += param_norm.item() ** norm_type
185
- if clip_value is not None:
186
- p.grad.data.clamp_(min=-clip_value, max=clip_value)
187
- total_norm = total_norm ** (1. / norm_type)
188
- return total_norm
src/tts_vits/configs/config.json DELETED
@@ -1,90 +0,0 @@
1
- {
2
- "train": {
3
- "log_interval": 200,
4
- "eval_interval": 1000,
5
- "seed": 1234,
6
- "epochs": 10000,
7
- "learning_rate": 0.0001,
8
- "betas": [
9
- 0.8,
10
- 0.99
11
- ],
12
- "eps": 1e-09,
13
- "batch_size": 12,
14
- "fp16_run": false,
15
- "lr_decay": 0.999875,
16
- "segment_size": 17920,
17
- "init_lr_ratio": 1,
18
- "warmup_epochs": 0,
19
- "c_mel": 45,
20
- "c_kl": 1.0,
21
- "use_sr": true,
22
- "max_speclen": 384,
23
- "port": "8011"
24
- },
25
- "data": {
26
- "training_files": "filelists/train.txt",
27
- "validation_files": "filelists/val.txt",
28
- "max_wav_value": 32768.0,
29
- "sampling_rate": 32000,
30
- "filter_length": 1280,
31
- "hop_length": 320,
32
- "win_length": 1280,
33
- "n_mel_channels": 80,
34
- "mel_fmin": 0.0,
35
- "mel_fmax": null
36
- },
37
- "model": {
38
- "inter_channels": 192,
39
- "hidden_channels": 192,
40
- "filter_channels": 768,
41
- "n_heads": 2,
42
- "n_layers": 6,
43
- "kernel_size": 3,
44
- "p_dropout": 0.1,
45
- "resblock": "1",
46
- "resblock_kernel_sizes": [
47
- 3,
48
- 7,
49
- 11
50
- ],
51
- "resblock_dilation_sizes": [
52
- [
53
- 1,
54
- 3,
55
- 5
56
- ],
57
- [
58
- 1,
59
- 3,
60
- 5
61
- ],
62
- [
63
- 1,
64
- 3,
65
- 5
66
- ]
67
- ],
68
- "upsample_rates": [
69
- 10,
70
- 8,
71
- 2,
72
- 2
73
- ],
74
- "upsample_initial_channel": 512,
75
- "upsample_kernel_sizes": [
76
- 16,
77
- 16,
78
- 4,
79
- 4
80
- ],
81
- "n_layers_q": 3,
82
- "use_spectral_norm": false,
83
- "gin_channels": 256,
84
- "ssl_dim": 256,
85
- "n_speakers": 2
86
- },
87
- "spk": {
88
- "haruhi": 0
89
- }
90
- }
src/tts_vits/hubert/__init__.py DELETED
File without changes
src/tts_vits/hubert/hubert_model.py DELETED
@@ -1,222 +0,0 @@
1
- import copy
2
- import random
3
- from typing import Optional, Tuple
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as t_func
8
- from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
-
10
-
11
- class Hubert(nn.Module):
12
- def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
- super().__init__()
14
- self._mask = mask
15
- self.feature_extractor = FeatureExtractor()
16
- self.feature_projection = FeatureProjection()
17
- self.positional_embedding = PositionalConvEmbedding()
18
- self.norm = nn.LayerNorm(768)
19
- self.dropout = nn.Dropout(0.1)
20
- self.encoder = TransformerEncoder(
21
- nn.TransformerEncoderLayer(
22
- 768, 12, 3072, activation="gelu", batch_first=True
23
- ),
24
- 12,
25
- )
26
- self.proj = nn.Linear(768, 256)
27
-
28
- self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
- self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
-
31
- def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
- mask = None
33
- if self.training and self._mask:
34
- mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
- x[mask] = self.masked_spec_embed.to(x.dtype)
36
- return x, mask
37
-
38
- def encode(
39
- self, x: torch.Tensor, layer: Optional[int] = None
40
- ) -> Tuple[torch.Tensor, torch.Tensor]:
41
- x = self.feature_extractor(x)
42
- x = self.feature_projection(x.transpose(1, 2))
43
- x, mask = self.mask(x)
44
- x = x + self.positional_embedding(x)
45
- x = self.dropout(self.norm(x))
46
- x = self.encoder(x, output_layer=layer)
47
- return x, mask
48
-
49
- def logits(self, x: torch.Tensor) -> torch.Tensor:
50
- logits = torch.cosine_similarity(
51
- x.unsqueeze(2),
52
- self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
- dim=-1,
54
- )
55
- return logits / 0.1
56
-
57
- def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
58
- x, mask = self.encode(x)
59
- x = self.proj(x)
60
- logits = self.logits(x)
61
- return logits, mask
62
-
63
-
64
- class HubertSoft(Hubert):
65
- def __init__(self):
66
- super().__init__()
67
-
68
- @torch.inference_mode()
69
- def units(self, wav: torch.Tensor) -> torch.Tensor:
70
- wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
71
- x, _ = self.encode(wav)
72
- return self.proj(x)
73
-
74
-
75
- class FeatureExtractor(nn.Module):
76
- def __init__(self):
77
- super().__init__()
78
- self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
79
- self.norm0 = nn.GroupNorm(512, 512)
80
- self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
81
- self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
82
- self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
83
- self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
84
- self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
85
- self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
86
-
87
- def forward(self, x: torch.Tensor) -> torch.Tensor:
88
- x = t_func.gelu(self.norm0(self.conv0(x)))
89
- x = t_func.gelu(self.conv1(x))
90
- x = t_func.gelu(self.conv2(x))
91
- x = t_func.gelu(self.conv3(x))
92
- x = t_func.gelu(self.conv4(x))
93
- x = t_func.gelu(self.conv5(x))
94
- x = t_func.gelu(self.conv6(x))
95
- return x
96
-
97
-
98
- class FeatureProjection(nn.Module):
99
- def __init__(self):
100
- super().__init__()
101
- self.norm = nn.LayerNorm(512)
102
- self.projection = nn.Linear(512, 768)
103
- self.dropout = nn.Dropout(0.1)
104
-
105
- def forward(self, x: torch.Tensor) -> torch.Tensor:
106
- x = self.norm(x)
107
- x = self.projection(x)
108
- x = self.dropout(x)
109
- return x
110
-
111
-
112
- class PositionalConvEmbedding(nn.Module):
113
- def __init__(self):
114
- super().__init__()
115
- self.conv = nn.Conv1d(
116
- 768,
117
- 768,
118
- kernel_size=128,
119
- padding=128 // 2,
120
- groups=16,
121
- )
122
- self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
123
-
124
- def forward(self, x: torch.Tensor) -> torch.Tensor:
125
- x = self.conv(x.transpose(1, 2))
126
- x = t_func.gelu(x[:, :, :-1])
127
- return x.transpose(1, 2)
128
-
129
-
130
- class TransformerEncoder(nn.Module):
131
- def __init__(
132
- self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
133
- ) -> None:
134
- super(TransformerEncoder, self).__init__()
135
- self.layers = nn.ModuleList(
136
- [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
137
- )
138
- self.num_layers = num_layers
139
-
140
- def forward(
141
- self,
142
- src: torch.Tensor,
143
- mask: torch.Tensor = None,
144
- src_key_padding_mask: torch.Tensor = None,
145
- output_layer: Optional[int] = None,
146
- ) -> torch.Tensor:
147
- output = src
148
- for layer in self.layers[:output_layer]:
149
- output = layer(
150
- output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
151
- )
152
- return output
153
-
154
-
155
- def _compute_mask(
156
- shape: Tuple[int, int],
157
- mask_prob: float,
158
- mask_length: int,
159
- device: torch.device,
160
- min_masks: int = 0,
161
- ) -> torch.Tensor:
162
- batch_size, sequence_length = shape
163
-
164
- if mask_length < 1:
165
- raise ValueError("`mask_length` has to be bigger than 0.")
166
-
167
- if mask_length > sequence_length:
168
- raise ValueError(
169
- f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
170
- )
171
-
172
- # compute number of masked spans in batch
173
- num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
174
- num_masked_spans = max(num_masked_spans, min_masks)
175
-
176
- # make sure num masked indices <= sequence_length
177
- if num_masked_spans * mask_length > sequence_length:
178
- num_masked_spans = sequence_length // mask_length
179
-
180
- # SpecAugment mask to fill
181
- mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
182
-
183
- # uniform distribution to sample from, make sure that offset samples are < sequence_length
184
- uniform_dist = torch.ones(
185
- (batch_size, sequence_length - (mask_length - 1)), device=device
186
- )
187
-
188
- # get random indices to mask
189
- mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
190
-
191
- # expand masked indices to masked spans
192
- mask_indices = (
193
- mask_indices.unsqueeze(dim=-1)
194
- .expand((batch_size, num_masked_spans, mask_length))
195
- .reshape(batch_size, num_masked_spans * mask_length)
196
- )
197
- offsets = (
198
- torch.arange(mask_length, device=device)[None, None, :]
199
- .expand((batch_size, num_masked_spans, mask_length))
200
- .reshape(batch_size, num_masked_spans * mask_length)
201
- )
202
- mask_idxs = mask_indices + offsets
203
-
204
- # scatter indices to mask
205
- mask = mask.scatter(1, mask_idxs, True)
206
-
207
- return mask
208
-
209
-
210
- def hubert_soft(
211
- path: str,
212
- ) -> HubertSoft:
213
- r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
214
- Args:
215
- path (str): path of a pretrained model
216
- """
217
- hubert = HubertSoft()
218
- checkpoint = torch.load(path)
219
- consume_prefix_in_state_dict_if_present(checkpoint, "module.")
220
- hubert.load_state_dict(checkpoint)
221
- hubert.eval()
222
- return hubert
src/tts_vits/hubert/hubert_model_onnx.py DELETED
@@ -1,217 +0,0 @@
1
- import copy
2
- import random
3
- from typing import Optional, Tuple
4
-
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as t_func
8
- from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
-
10
-
11
- class Hubert(nn.Module):
12
- def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
- super().__init__()
14
- self._mask = mask
15
- self.feature_extractor = FeatureExtractor()
16
- self.feature_projection = FeatureProjection()
17
- self.positional_embedding = PositionalConvEmbedding()
18
- self.norm = nn.LayerNorm(768)
19
- self.dropout = nn.Dropout(0.1)
20
- self.encoder = TransformerEncoder(
21
- nn.TransformerEncoderLayer(
22
- 768, 12, 3072, activation="gelu", batch_first=True
23
- ),
24
- 12,
25
- )
26
- self.proj = nn.Linear(768, 256)
27
-
28
- self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
- self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
-
31
- def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
- mask = None
33
- if self.training and self._mask:
34
- mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
- x[mask] = self.masked_spec_embed.to(x.dtype)
36
- return x, mask
37
-
38
- def encode(
39
- self, x: torch.Tensor, layer: Optional[int] = None
40
- ) -> Tuple[torch.Tensor, torch.Tensor]:
41
- x = self.feature_extractor(x)
42
- x = self.feature_projection(x.transpose(1, 2))
43
- x, mask = self.mask(x)
44
- x = x + self.positional_embedding(x)
45
- x = self.dropout(self.norm(x))
46
- x = self.encoder(x, output_layer=layer)
47
- return x, mask
48
-
49
- def logits(self, x: torch.Tensor) -> torch.Tensor:
50
- logits = torch.cosine_similarity(
51
- x.unsqueeze(2),
52
- self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
- dim=-1,
54
- )
55
- return logits / 0.1
56
-
57
-
58
- class HubertSoft(Hubert):
59
- def __init__(self):
60
- super().__init__()
61
-
62
- def units(self, wav: torch.Tensor) -> torch.Tensor:
63
- wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
64
- x, _ = self.encode(wav)
65
- return self.proj(x)
66
-
67
- def forward(self, x):
68
- return self.units(x)
69
-
70
- class FeatureExtractor(nn.Module):
71
- def __init__(self):
72
- super().__init__()
73
- self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
74
- self.norm0 = nn.GroupNorm(512, 512)
75
- self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
76
- self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
77
- self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
78
- self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
79
- self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
80
- self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
81
-
82
- def forward(self, x: torch.Tensor) -> torch.Tensor:
83
- x = t_func.gelu(self.norm0(self.conv0(x)))
84
- x = t_func.gelu(self.conv1(x))
85
- x = t_func.gelu(self.conv2(x))
86
- x = t_func.gelu(self.conv3(x))
87
- x = t_func.gelu(self.conv4(x))
88
- x = t_func.gelu(self.conv5(x))
89
- x = t_func.gelu(self.conv6(x))
90
- return x
91
-
92
-
93
- class FeatureProjection(nn.Module):
94
- def __init__(self):
95
- super().__init__()
96
- self.norm = nn.LayerNorm(512)
97
- self.projection = nn.Linear(512, 768)
98
- self.dropout = nn.Dropout(0.1)
99
-
100
- def forward(self, x: torch.Tensor) -> torch.Tensor:
101
- x = self.norm(x)
102
- x = self.projection(x)
103
- x = self.dropout(x)
104
- return x
105
-
106
-
107
- class PositionalConvEmbedding(nn.Module):
108
- def __init__(self):
109
- super().__init__()
110
- self.conv = nn.Conv1d(
111
- 768,
112
- 768,
113
- kernel_size=128,
114
- padding=128 // 2,
115
- groups=16,
116
- )
117
- self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
118
-
119
- def forward(self, x: torch.Tensor) -> torch.Tensor:
120
- x = self.conv(x.transpose(1, 2))
121
- x = t_func.gelu(x[:, :, :-1])
122
- return x.transpose(1, 2)
123
-
124
-
125
- class TransformerEncoder(nn.Module):
126
- def __init__(
127
- self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
128
- ) -> None:
129
- super(TransformerEncoder, self).__init__()
130
- self.layers = nn.ModuleList(
131
- [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
132
- )
133
- self.num_layers = num_layers
134
-
135
- def forward(
136
- self,
137
- src: torch.Tensor,
138
- mask: torch.Tensor = None,
139
- src_key_padding_mask: torch.Tensor = None,
140
- output_layer: Optional[int] = None,
141
- ) -> torch.Tensor:
142
- output = src
143
- for layer in self.layers[:output_layer]:
144
- output = layer(
145
- output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
146
- )
147
- return output
148
-
149
-
150
- def _compute_mask(
151
- shape: Tuple[int, int],
152
- mask_prob: float,
153
- mask_length: int,
154
- device: torch.device,
155
- min_masks: int = 0,
156
- ) -> torch.Tensor:
157
- batch_size, sequence_length = shape
158
-
159
- if mask_length < 1:
160
- raise ValueError("`mask_length` has to be bigger than 0.")
161
-
162
- if mask_length > sequence_length:
163
- raise ValueError(
164
- f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
165
- )
166
-
167
- # compute number of masked spans in batch
168
- num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
169
- num_masked_spans = max(num_masked_spans, min_masks)
170
-
171
- # make sure num masked indices <= sequence_length
172
- if num_masked_spans * mask_length > sequence_length:
173
- num_masked_spans = sequence_length // mask_length
174
-
175
- # SpecAugment mask to fill
176
- mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
177
-
178
- # uniform distribution to sample from, make sure that offset samples are < sequence_length
179
- uniform_dist = torch.ones(
180
- (batch_size, sequence_length - (mask_length - 1)), device=device
181
- )
182
-
183
- # get random indices to mask
184
- mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
185
-
186
- # expand masked indices to masked spans
187
- mask_indices = (
188
- mask_indices.unsqueeze(dim=-1)
189
- .expand((batch_size, num_masked_spans, mask_length))
190
- .reshape(batch_size, num_masked_spans * mask_length)
191
- )
192
- offsets = (
193
- torch.arange(mask_length, device=device)[None, None, :]
194
- .expand((batch_size, num_masked_spans, mask_length))
195
- .reshape(batch_size, num_masked_spans * mask_length)
196
- )
197
- mask_idxs = mask_indices + offsets
198
-
199
- # scatter indices to mask
200
- mask = mask.scatter(1, mask_idxs, True)
201
-
202
- return mask
203
-
204
-
205
- def hubert_soft(
206
- path: str,
207
- ) -> HubertSoft:
208
- r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
209
- Args:
210
- path (str): path of a pretrained model
211
- """
212
- hubert = HubertSoft()
213
- checkpoint = torch.load(path)
214
- consume_prefix_in_state_dict_if_present(checkpoint, "module.")
215
- hubert.load_state_dict(checkpoint)
216
- hubert.eval()
217
- return hubert
src/tts_vits/hubert/put_hubert_ckpt_here DELETED
File without changes
src/tts_vits/inference/__init__.py DELETED
File without changes
src/tts_vits/inference/chunks_temp.json DELETED
@@ -1 +0,0 @@
1
- {"info": "temp_dict"}
src/tts_vits/inference/infer_tool.py DELETED
@@ -1,326 +0,0 @@
1
- import hashlib
2
- import json
3
- import logging
4
- import os
5
- import time
6
- from pathlib import Path
7
-
8
- import librosa
9
- import maad
10
- import numpy as np
11
- # import onnxruntime
12
- import parselmouth
13
- import soundfile
14
- import torch
15
- import torchaudio
16
-
17
- from hubert import hubert_model
18
- import utils
19
- from models import SynthesizerTrn
20
-
21
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
-
23
-
24
- def read_temp(file_name):
25
- if not os.path.exists(file_name):
26
- with open(file_name, "w") as f:
27
- f.write(json.dumps({"info": "temp_dict"}))
28
- return {}
29
- else:
30
- try:
31
- with open(file_name, "r") as f:
32
- data = f.read()
33
- data_dict = json.loads(data)
34
- if os.path.getsize(file_name) > 50 * 1024 * 1024:
35
- f_name = file_name.split("/")[-1]
36
- print(f"clean {f_name}")
37
- for wav_hash in list(data_dict.keys()):
38
- if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
39
- del data_dict[wav_hash]
40
- except Exception as e:
41
- print(e)
42
- print(f"{file_name} error,auto rebuild file")
43
- data_dict = {"info": "temp_dict"}
44
- return data_dict
45
-
46
-
47
- def write_temp(file_name, data):
48
- with open(file_name, "w") as f:
49
- f.write(json.dumps(data))
50
-
51
-
52
- def timeit(func):
53
- def run(*args, **kwargs):
54
- t = time.time()
55
- res = func(*args, **kwargs)
56
- print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
57
- return res
58
-
59
- return run
60
-
61
-
62
- def format_wav(audio_path):
63
- if Path(audio_path).suffix == '.wav':
64
- return
65
- raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
66
- soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
67
-
68
-
69
- def get_end_file(dir_path, end):
70
- file_lists = []
71
- for root, dirs, files in os.walk(dir_path):
72
- files = [f for f in files if f[0] != '.']
73
- dirs[:] = [d for d in dirs if d[0] != '.']
74
- for f_file in files:
75
- if f_file.endswith(end):
76
- file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
77
- return file_lists
78
-
79
-
80
- def get_md5(content):
81
- return hashlib.new("md5", content).hexdigest()
82
-
83
-
84
- def resize2d_f0(x, target_len):
85
- source = np.array(x)
86
- source[source < 0.001] = np.nan
87
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
88
- source)
89
- res = np.nan_to_num(target)
90
- return res
91
-
92
- def get_f0(x, p_len,f0_up_key=0):
93
-
94
- time_step = 160 / 16000 * 1000
95
- f0_min = 50
96
- f0_max = 1100
97
- f0_mel_min = 1127 * np.log(1 + f0_min / 700)
98
- f0_mel_max = 1127 * np.log(1 + f0_max / 700)
99
-
100
- f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
101
- time_step=time_step / 1000, voicing_threshold=0.6,
102
- pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
103
-
104
- pad_size=(p_len - len(f0) + 1) // 2
105
- if(pad_size>0 or p_len - len(f0) - pad_size>0):
106
- f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
107
-
108
- f0 *= pow(2, f0_up_key / 12)
109
- f0_mel = 1127 * np.log(1 + f0 / 700)
110
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
111
- f0_mel[f0_mel <= 1] = 1
112
- f0_mel[f0_mel > 255] = 255
113
- f0_coarse = np.rint(f0_mel).astype(np.int)
114
- return f0_coarse, f0
115
-
116
- def clean_pitch(input_pitch):
117
- num_nan = np.sum(input_pitch == 1)
118
- if num_nan / len(input_pitch) > 0.9:
119
- input_pitch[input_pitch != 1] = 1
120
- return input_pitch
121
-
122
-
123
- def plt_pitch(input_pitch):
124
- input_pitch = input_pitch.astype(float)
125
- input_pitch[input_pitch == 1] = np.nan
126
- return input_pitch
127
-
128
-
129
- def f0_to_pitch(ff):
130
- f0_pitch = 69 + 12 * np.log2(ff / 440)
131
- return f0_pitch
132
-
133
-
134
- def fill_a_to_b(a, b):
135
- if len(a) < len(b):
136
- for _ in range(0, len(b) - len(a)):
137
- a.append(a[0])
138
-
139
-
140
- def mkdir(paths: list):
141
- for path in paths:
142
- if not os.path.exists(path):
143
- os.mkdir(path)
144
-
145
-
146
- class Svc(object):
147
- def __init__(self, net_g_path, config_path, hubert_path="hubert/hubert-soft-0d54a1f4.pt",
148
- onnx=False):
149
- self.onnx = onnx
150
- self.net_g_path = net_g_path
151
- self.hubert_path = hubert_path
152
- self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
153
- self.net_g_ms = None
154
- self.hps_ms = utils.get_hparams_from_file(config_path)
155
- self.target_sample = self.hps_ms.data.sampling_rate
156
- self.hop_size = self.hps_ms.data.hop_length
157
- self.speakers = {}
158
- for spk, sid in self.hps_ms.spk.items():
159
- self.speakers[sid] = spk
160
- self.spk2id = self.hps_ms.spk
161
- # load hubert
162
- self.hubert_soft = hubert_model.hubert_soft(hubert_path)
163
- if torch.cuda.is_available():
164
- self.hubert_soft = self.hubert_soft.cuda()
165
- self.load_model()
166
-
167
- def load_model(self):
168
- # get the model configuration
169
- if self.onnx:
170
- raise NotImplementedError
171
- # self.net_g_ms = SynthesizerTrnForONNX(
172
- # 178,
173
- # self.hps_ms.data.filter_length // 2 + 1,
174
- # self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
175
- # n_speakers=self.hps_ms.data.n_speakers,
176
- # **self.hps_ms.model)
177
- # _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
178
- else:
179
- self.net_g_ms = SynthesizerTrn(
180
- self.hps_ms.data.filter_length // 2 + 1,
181
- self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
182
- **self.hps_ms.model)
183
- _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
184
- if "half" in self.net_g_path and torch.cuda.is_available():
185
- _ = self.net_g_ms.half().eval().to(self.dev)
186
- else:
187
- _ = self.net_g_ms.eval().to(self.dev)
188
-
189
- def get_units(self, source, sr):
190
-
191
- source = source.unsqueeze(0).to(self.dev)
192
- with torch.inference_mode():
193
- start = time.time()
194
- units = self.hubert_soft.units(source)
195
- use_time = time.time() - start
196
- print("hubert use time:{}".format(use_time))
197
- return units
198
-
199
-
200
- def get_unit_pitch(self, in_path, tran):
201
- source, sr = torchaudio.load(in_path)
202
- source = torchaudio.functional.resample(source, sr, 16000)
203
- if len(source.shape) == 2 and source.shape[1] >= 2:
204
- source = torch.mean(source, dim=0).unsqueeze(0)
205
- soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
206
- f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
207
- return soft, f0
208
-
209
- def infer(self, speaker_id, tran, raw_path):
210
- if type(speaker_id) == str:
211
- speaker_id = self.spk2id[speaker_id]
212
- sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
213
- soft, pitch = self.get_unit_pitch(raw_path, tran)
214
- f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.dev)
215
- if "half" in self.net_g_path and torch.cuda.is_available():
216
- stn_tst = torch.HalfTensor(soft)
217
- else:
218
- stn_tst = torch.FloatTensor(soft)
219
- with torch.no_grad():
220
- x_tst = stn_tst.unsqueeze(0).to(self.dev)
221
- start = time.time()
222
- x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
223
- audio = self.net_g_ms.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
224
- use_time = time.time() - start
225
- print("vits use time:{}".format(use_time))
226
- return audio, audio.shape[-1]
227
-
228
-
229
- # class SvcONNXInferModel(object):
230
- # def __init__(self, hubert_onnx, vits_onnx, config_path):
231
- # self.config_path = config_path
232
- # self.vits_onnx = vits_onnx
233
- # self.hubert_onnx = hubert_onnx
234
- # self.hubert_onnx_session = onnxruntime.InferenceSession(hubert_onnx, providers=['CUDAExecutionProvider', ])
235
- # self.inspect_onnx(self.hubert_onnx_session)
236
- # self.vits_onnx_session = onnxruntime.InferenceSession(vits_onnx, providers=['CUDAExecutionProvider', ])
237
- # self.inspect_onnx(self.vits_onnx_session)
238
- # self.hps_ms = utils.get_hparams_from_file(self.config_path)
239
- # self.target_sample = self.hps_ms.data.sampling_rate
240
- # self.feature_input = FeatureInput(self.hps_ms.data.sampling_rate, self.hps_ms.data.hop_length)
241
- #
242
- # @staticmethod
243
- # def inspect_onnx(session):
244
- # for i in session.get_inputs():
245
- # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
246
- # for i in session.get_outputs():
247
- # print("name:{}\tshape:{}\tdtype:{}".format(i.name, i.shape, i.type))
248
- #
249
- # def infer(self, speaker_id, tran, raw_path):
250
- # sid = np.array([int(speaker_id)], dtype=np.int64)
251
- # soft, pitch = self.get_unit_pitch(raw_path, tran)
252
- # pitch = np.expand_dims(pitch, axis=0).astype(np.int64)
253
- # stn_tst = soft
254
- # x_tst = np.expand_dims(stn_tst, axis=0)
255
- # x_tst_lengths = np.array([stn_tst.shape[0]], dtype=np.int64)
256
- # # run inference with ONNX Runtime
257
- # start = time.time()
258
- # audio = self.vits_onnx_session.run(output_names=["audio"],
259
- # input_feed={
260
- # "hidden_unit": x_tst,
261
- # "lengths": x_tst_lengths,
262
- # "pitch": pitch,
263
- # "sid": sid,
264
- # })[0][0, 0]
265
- # use_time = time.time() - start
266
- # print("vits_onnx_session.run time:{}".format(use_time))
267
- # audio = torch.from_numpy(audio)
268
- # return audio, audio.shape[-1]
269
- #
270
- # def get_units(self, source, sr):
271
- # source = torchaudio.functional.resample(source, sr, 16000)
272
- # if len(source.shape) == 2 and source.shape[1] >= 2:
273
- # source = torch.mean(source, dim=0).unsqueeze(0)
274
- # source = source.unsqueeze(0)
275
- # # ไฝฟ็”จONNX Runtime่ฟ›่กŒๆŽจ็†
276
- # start = time.time()
277
- # units = self.hubert_onnx_session.run(output_names=["embed"],
278
- # input_feed={"source": source.numpy()})[0]
279
- # use_time = time.time() - start
280
- # print("hubert_onnx_session.run time:{}".format(use_time))
281
- # return units
282
- #
283
- # def transcribe(self, source, sr, length, transform):
284
- # feature_pit = self.feature_input.compute_f0(source, sr)
285
- # feature_pit = feature_pit * 2 ** (transform / 12)
286
- # feature_pit = resize2d_f0(feature_pit, length)
287
- # coarse_pit = self.feature_input.coarse_f0(feature_pit)
288
- # return coarse_pit
289
- #
290
- # def get_unit_pitch(self, in_path, tran):
291
- # source, sr = torchaudio.load(in_path)
292
- # soft = self.get_units(source, sr).squeeze(0)
293
- # input_pitch = self.transcribe(source.numpy()[0], sr, soft.shape[0], tran)
294
- # return soft, input_pitch
295
-
296
-
297
- class RealTimeVC:
298
- def __init__(self):
299
- self.last_chunk = None
300
- self.last_o = None
301
- self.chunk_len = 16000 # chunk length
302
- self.pre_len = 3840 # crossfade length, a multiple of 640
303
-
304
- """Both input and output are 1-D numpy audio waveform arrays"""
305
-
306
- def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path):
307
- audio, sr = torchaudio.load(input_wav_path)
308
- audio = audio.cpu().numpy()[0]
309
- temp_wav = io.BytesIO()
310
- if self.last_chunk is None:
311
- input_wav_path.seek(0)
312
- audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
313
- audio = audio.cpu().numpy()
314
- self.last_chunk = audio[-self.pre_len:]
315
- self.last_o = audio
316
- return audio[-self.chunk_len:]
317
- else:
318
- audio = np.concatenate([self.last_chunk, audio])
319
- soundfile.write(temp_wav, audio, sr, format="wav")
320
- temp_wav.seek(0)
321
- audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav)
322
- audio = audio.cpu().numpy()
323
- ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
324
- self.last_chunk = audio[-self.pre_len:]
325
- self.last_o = audio
326
- return ret[self.chunk_len:2 * self.chunk_len]
src/tts_vits/inference/infer_tool_grad.py DELETED
@@ -1,160 +0,0 @@
1
- import hashlib
2
- import json
3
- import logging
4
- import os
5
- import time
6
- from pathlib import Path
7
- import io
8
- import librosa
9
- import maad
10
- import numpy as np
11
- from inference import slicer
12
- import parselmouth
13
- import soundfile
14
- import torch
15
- import torchaudio
16
-
17
- from hubert import hubert_model
18
- import utils
19
- from models import SynthesizerTrn
20
- logging.getLogger('numba').setLevel(logging.WARNING)
21
- logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
-
23
- def resize2d_f0(x, target_len):
24
- source = np.array(x)
25
- source[source < 0.001] = np.nan
26
- target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
27
- source)
28
- res = np.nan_to_num(target)
29
- return res
30
-
31
- def get_f0(x, p_len,f0_up_key=0):
32
-
33
- time_step = 160 / 16000 * 1000
34
- f0_min = 50
35
- f0_max = 1100
36
- f0_mel_min = 1127 * np.log(1 + f0_min / 700)
37
- f0_mel_max = 1127 * np.log(1 + f0_max / 700)
38
-
39
- f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
40
- time_step=time_step / 1000, voicing_threshold=0.6,
41
- pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
42
-
43
- pad_size=(p_len - len(f0) + 1) // 2
44
- if(pad_size>0 or p_len - len(f0) - pad_size>0):
45
- f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
46
-
47
- f0 *= pow(2, f0_up_key / 12)
48
- f0_mel = 1127 * np.log(1 + f0 / 700)
49
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
50
- f0_mel[f0_mel <= 1] = 1
51
- f0_mel[f0_mel > 255] = 255
52
- f0_coarse = np.rint(f0_mel).astype(np.int)
53
- return f0_coarse, f0
54
-
55
- def clean_pitch(input_pitch):
56
- num_nan = np.sum(input_pitch == 1)
57
- if num_nan / len(input_pitch) > 0.9:
58
- input_pitch[input_pitch != 1] = 1
59
- return input_pitch
60
-
61
-
62
- def plt_pitch(input_pitch):
63
- input_pitch = input_pitch.astype(float)
64
- input_pitch[input_pitch == 1] = np.nan
65
- return input_pitch
66
-
67
-
68
- def f0_to_pitch(ff):
69
- f0_pitch = 69 + 12 * np.log2(ff / 440)
70
- return f0_pitch
71
-
72
-
73
- def fill_a_to_b(a, b):
74
- if len(a) < len(b):
75
- for _ in range(0, len(b) - len(a)):
76
- a.append(a[0])
77
-
78
-
79
- def mkdir(paths: list):
80
- for path in paths:
81
- if not os.path.exists(path):
82
- os.mkdir(path)
83
-
84
-
85
- class VitsSvc(object):
86
- def __init__(self):
87
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
- self.SVCVITS = None
89
- self.hps = None
90
- self.speakers = None
91
- self.hubert_soft = hubert_model.hubert_soft("hubert/model.pt")
92
-
93
- def set_device(self, device):
94
- self.device = torch.device(device)
95
- self.hubert_soft.to(self.device)
96
- if self.SVCVITS != None:
97
- self.SVCVITS.to(self.device)
98
-
99
- def loadCheckpoint(self, path):
100
- self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
101
- self.SVCVITS = SynthesizerTrn(
102
- self.hps.data.filter_length // 2 + 1,
103
- self.hps.train.segment_size // self.hps.data.hop_length,
104
- **self.hps.model)
105
- _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
106
- _ = self.SVCVITS.eval().to(self.device)
107
- self.speakers = self.hps.spk
108
-
109
- def get_units(self, source, sr):
110
- source = source.unsqueeze(0).to(self.device)
111
- with torch.inference_mode():
112
- units = self.hubert_soft.units(source)
113
- return units
114
-
115
-
116
- def get_unit_pitch(self, in_path, tran):
117
- source, sr = torchaudio.load(in_path)
118
- source = torchaudio.functional.resample(source, sr, 16000)
119
- if len(source.shape) == 2 and source.shape[1] >= 2:
120
- source = torch.mean(source, dim=0).unsqueeze(0)
121
- soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
122
- f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
123
- return soft, f0
124
-
125
- def infer(self, speaker_id, tran, raw_path):
126
- speaker_id = self.speakers[speaker_id]
127
- sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
128
- soft, pitch = self.get_unit_pitch(raw_path, tran)
129
- f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
130
- stn_tst = torch.FloatTensor(soft)
131
- with torch.no_grad():
132
- x_tst = stn_tst.unsqueeze(0).to(self.device)
133
- x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
134
- audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
135
- return audio, audio.shape[-1]
136
-
137
- def inference(self,srcaudio,chara,tran,slice_db):
138
- sampling_rate, audio = srcaudio
139
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
140
- if len(audio.shape) > 1:
141
- audio = librosa.to_mono(audio.transpose(1, 0))
142
- if sampling_rate != 16000:
143
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
144
- soundfile.write("tmpwav.wav", audio, 16000, format="wav")
145
- chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
146
- audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
147
- audio = []
148
- for (slice_tag, data) in audio_data:
149
- length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
150
- raw_path = io.BytesIO()
151
- soundfile.write(raw_path, data, audio_sr, format="wav")
152
- raw_path.seek(0)
153
- if slice_tag:
154
- _audio = np.zeros(length)
155
- else:
156
- out_audio, out_sr = self.infer(chara, tran, raw_path)
157
- _audio = out_audio.cpu().numpy()
158
- audio.extend(list(_audio))
159
- audio = (np.array(audio) * 32768.0).astype('int16')
160
- return (self.hps.data.sampling_rate,audio)
src/tts_vits/inference/slicer.py DELETED
@@ -1,145 +0,0 @@
1
- import librosa
2
- import torch
3
- import torchaudio
4
-
5
-
6
- class Slicer:
7
- def __init__(self,
8
- sr: int,
9
- threshold: float = -40.,
10
- min_length: int = 5000,
11
- min_interval: int = 300,
12
- hop_size: int = 20,
13
- max_sil_kept: int = 5000):
14
- if not min_length >= min_interval >= hop_size:
15
- raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
16
- if not max_sil_kept >= hop_size:
17
- raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
18
- min_interval = sr * min_interval / 1000
19
- self.threshold = 10 ** (threshold / 20.)
20
- self.hop_size = round(sr * hop_size / 1000)
21
- self.win_size = min(round(min_interval), 4 * self.hop_size)
22
- self.min_length = round(sr * min_length / 1000 / self.hop_size)
23
- self.min_interval = round(min_interval / self.hop_size)
24
- self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
25
-
26
- def _apply_slice(self, waveform, begin, end):
27
- if len(waveform.shape) > 1:
28
- return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
29
- else:
30
- return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
31
-
32
- # @timeit
33
- def slice(self, waveform):
34
- if len(waveform.shape) > 1:
35
- samples = librosa.to_mono(waveform)
36
- else:
37
- samples = waveform
38
- if samples.shape[0] <= self.min_length:
39
- return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
40
- rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
41
- sil_tags = []
42
- silence_start = None
43
- clip_start = 0
44
- for i, rms in enumerate(rms_list):
45
- # Keep looping while frame is silent.
46
- if rms < self.threshold:
47
- # Record start of silent frames.
48
- if silence_start is None:
49
- silence_start = i
50
- continue
51
- # Keep looping while frame is not silent and silence start has not been recorded.
52
- if silence_start is None:
53
- continue
54
- # Clear recorded silence start if interval is not enough or clip is too short
55
- is_leading_silence = silence_start == 0 and i > self.max_sil_kept
56
- need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
57
- if not is_leading_silence and not need_slice_middle:
58
- silence_start = None
59
- continue
60
- # Need slicing. Record the range of silent frames to be removed.
61
- if i - silence_start <= self.max_sil_kept:
62
- pos = rms_list[silence_start: i + 1].argmin() + silence_start
63
- if silence_start == 0:
64
- sil_tags.append((0, pos))
65
- else:
66
- sil_tags.append((pos, pos))
67
- clip_start = pos
68
- elif i - silence_start <= self.max_sil_kept * 2:
69
- pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
70
- pos += i - self.max_sil_kept
71
- pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
72
- pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
73
- if silence_start == 0:
74
- sil_tags.append((0, pos_r))
75
- clip_start = pos_r
76
- else:
77
- sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
78
- clip_start = max(pos_r, pos)
79
- else:
80
- pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
81
- pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
82
- if silence_start == 0:
83
- sil_tags.append((0, pos_r))
84
- else:
85
- sil_tags.append((pos_l, pos_r))
86
- clip_start = pos_r
87
- silence_start = None
88
- # Deal with trailing silence.
89
- total_frames = rms_list.shape[0]
90
- if silence_start is not None and total_frames - silence_start >= self.min_interval:
91
- silence_end = min(total_frames, silence_start + self.max_sil_kept)
92
- pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
93
- sil_tags.append((pos, total_frames + 1))
94
- # Apply and return slices.
95
- if len(sil_tags) == 0:
96
- return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
97
- else:
98
- chunks = []
99
-            # The first silent segment does not start at the beginning, so prepend the leading voiced chunk
100
- if sil_tags[0][0]:
101
- chunks.append(
102
- {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
103
- for i in range(0, len(sil_tags)):
104
-                # Mark the voiced chunk (skipping the first segment)
105
- if i:
106
- chunks.append({"slice": False,
107
- "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
108
-                # Mark every silent chunk
109
- chunks.append({"slice": True,
110
- "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
111
-            # The last silent segment does not reach the end, so append the trailing voiced chunk
112
- if sil_tags[-1][1] * self.hop_size < len(waveform):
113
- chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
114
- chunk_dict = {}
115
- for i in range(len(chunks)):
116
- chunk_dict[str(i)] = chunks[i]
117
- return chunk_dict
118
-
119
-
120
- def cut(audio_path, db_thresh=-30, min_len=5000):
121
- audio, sr = librosa.load(audio_path, sr=None)
122
- slicer = Slicer(
123
- sr=sr,
124
- threshold=db_thresh,
125
- min_length=min_len
126
- )
127
- chunks = slicer.slice(audio)
128
- return chunks
129
-
130
-
131
- def chunks2audio(audio_path, chunks):
132
- chunks = dict(chunks)
133
- audio, sr = torchaudio.load(audio_path)
134
- # audio, sr = librosa.load(audio_path, sr=None)
135
-
136
- if len(audio.shape) == 2 and audio.shape[1] >= 2:
137
- audio = torch.mean(audio, dim=0).unsqueeze(0)
138
- # audio = audio[0]
139
- audio = audio.cpu().numpy()[0]
140
- result = []
141
- for k, v in chunks.items():
142
- tag = v["split_time"].split(",")
143
- if tag[0] != tag[1]:
144
- result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
145
- return result, sr
src/tts_vits/inference_main.py DELETED
@@ -1,55 +0,0 @@
1
- import io
2
- import logging
3
- import time
4
- from pathlib import Path
5
-
6
- import librosa
7
- import numpy as np
8
- import soundfile
9
-
10
- from inference import infer_tool
11
- from inference import slicer
12
- from inference.infer_tool import Svc
13
- import uuid
14
-
15
- logging.getLogger('numba').setLevel(logging.WARNING)
16
- # chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
17
- infer_tool.mkdir(["./results"])
18
- model_path = "vits_models/Haruhi_54000.pth"
19
- config_path = "configs/config.json"
20
- svc_model = Svc(model_path, config_path)
21
-
22
-
23
- def set_model_path(path):
24
- global model_path
25
- model_path = path
26
-
27
-
28
- def infer_to(spk, tran, voice):
29
- slice_db = -40
30
-
31
- wav_format = 'wav'
32
- # audio_file = io.BytesIO(voice)
33
- audio_file = voice
34
- chunks = slicer.cut(audio_file, db_thresh=slice_db)
35
- # audio_file = io.BytesIO(voice)
36
- audio_data, audio_sr = slicer.chunks2audio(audio_file, chunks)
37
- audio = []
38
- for (slice_tag, data) in audio_data:
39
- print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
40
- length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
41
- raw_path = io.BytesIO()
42
- soundfile.write(raw_path, data, audio_sr, format="wav")
43
- raw_path.seek(0)
44
- if slice_tag:
45
- print('jump empty segment')
46
- _audio = np.zeros(length)
47
- else:
48
- out_audio, out_sr = svc_model.infer(spk, tran, raw_path)
49
- _audio = out_audio.cpu().numpy()
50
- audio.extend(list(_audio))
51
- infer_tool.mkdir(["./vits_results"])
52
- res_path = f'./vits_results/{tran}key_{spk}_{str(uuid.uuid4())}.{wav_format}'
53
- soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
54
-
55
- return res_path
src/tts_vits/models.py DELETED
@@ -1,351 +0,0 @@
1
- import copy
2
- import math
3
- import torch
4
- from torch import nn
5
- from torch.nn import functional as F
6
-
7
- import attentions
8
- import commons
9
- import modules
10
-
11
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
12
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
- from commons import init_weights, get_padding
14
- from vdecoder.hifigan.models import Generator
15
- from utils import f0_to_coarse
16
-
17
- class ResidualCouplingBlock(nn.Module):
18
- def __init__(self,
19
- channels,
20
- hidden_channels,
21
- kernel_size,
22
- dilation_rate,
23
- n_layers,
24
- n_flows=4,
25
- gin_channels=0):
26
- super().__init__()
27
- self.channels = channels
28
- self.hidden_channels = hidden_channels
29
- self.kernel_size = kernel_size
30
- self.dilation_rate = dilation_rate
31
- self.n_layers = n_layers
32
- self.n_flows = n_flows
33
- self.gin_channels = gin_channels
34
-
35
- self.flows = nn.ModuleList()
36
- for i in range(n_flows):
37
- self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
38
- self.flows.append(modules.Flip())
39
-
40
- def forward(self, x, x_mask, g=None, reverse=False):
41
- if not reverse:
42
- for flow in self.flows:
43
- x, _ = flow(x, x_mask, g=g, reverse=reverse)
44
- else:
45
- for flow in reversed(self.flows):
46
- x = flow(x, x_mask, g=g, reverse=reverse)
47
- return x
48
-
49
-
50
- class Encoder(nn.Module):
51
- def __init__(self,
52
- in_channels,
53
- out_channels,
54
- hidden_channels,
55
- kernel_size,
56
- dilation_rate,
57
- n_layers,
58
- gin_channels=0):
59
- super().__init__()
60
- self.in_channels = in_channels
61
- self.out_channels = out_channels
62
- self.hidden_channels = hidden_channels
63
- self.kernel_size = kernel_size
64
- self.dilation_rate = dilation_rate
65
- self.n_layers = n_layers
66
- self.gin_channels = gin_channels
67
-
68
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
69
- self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
70
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
71
-
72
- def forward(self, x, x_lengths, g=None):
73
- # print(x.shape,x_lengths.shape)
74
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
75
- x = self.pre(x) * x_mask
76
- x = self.enc(x, x_mask, g=g)
77
- stats = self.proj(x) * x_mask
78
- m, logs = torch.split(stats, self.out_channels, dim=1)
79
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
80
- return z, m, logs, x_mask
81
-
82
-
83
- class TextEncoder(nn.Module):
84
- def __init__(self,
85
- in_channels,
86
- out_channels,
87
- hidden_channels,
88
- kernel_size,
89
- dilation_rate,
90
- n_layers,
91
- gin_channels=0,
92
- filter_channels=None,
93
- n_heads=None,
94
- p_dropout=None):
95
- super().__init__()
96
- self.in_channels = in_channels
97
- self.out_channels = out_channels
98
- self.hidden_channels = hidden_channels
99
- self.kernel_size = kernel_size
100
- self.dilation_rate = dilation_rate
101
- self.n_layers = n_layers
102
- self.gin_channels = gin_channels
103
- self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
104
- self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
105
- self.f0_emb = nn.Embedding(256, hidden_channels)
106
-
107
- self.enc_ = attentions.Encoder(
108
- hidden_channels,
109
- filter_channels,
110
- n_heads,
111
- n_layers,
112
- kernel_size,
113
- p_dropout)
114
-
115
- def forward(self, x, x_lengths, f0=None):
116
- x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
117
- x = self.pre(x) * x_mask
118
- x = x + self.f0_emb(f0).transpose(1,2)
119
- x = self.enc_(x * x_mask, x_mask)
120
- stats = self.proj(x) * x_mask
121
- m, logs = torch.split(stats, self.out_channels, dim=1)
122
- z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
123
-
124
- return z, m, logs, x_mask
125
-
126
-
127
-
128
- class DiscriminatorP(torch.nn.Module):
129
- def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
130
- super(DiscriminatorP, self).__init__()
131
- self.period = period
132
- self.use_spectral_norm = use_spectral_norm
133
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
134
- self.convs = nn.ModuleList([
135
- norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
136
- norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
137
- norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
138
- norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
139
- norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
140
- ])
141
- self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
142
-
143
- def forward(self, x):
144
- fmap = []
145
-
146
- # 1d to 2d
147
- b, c, t = x.shape
148
- if t % self.period != 0: # pad first
149
- n_pad = self.period - (t % self.period)
150
- x = F.pad(x, (0, n_pad), "reflect")
151
- t = t + n_pad
152
- x = x.view(b, c, t // self.period, self.period)
153
-
154
- for l in self.convs:
155
- x = l(x)
156
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
157
- fmap.append(x)
158
- x = self.conv_post(x)
159
- fmap.append(x)
160
- x = torch.flatten(x, 1, -1)
161
-
162
- return x, fmap
163
-
164
-
165
- class DiscriminatorS(torch.nn.Module):
166
- def __init__(self, use_spectral_norm=False):
167
- super(DiscriminatorS, self).__init__()
168
- norm_f = weight_norm if use_spectral_norm == False else spectral_norm
169
- self.convs = nn.ModuleList([
170
- norm_f(Conv1d(1, 16, 15, 1, padding=7)),
171
- norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
172
- norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
173
- norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
174
- norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
175
- norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
176
- ])
177
- self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
178
-
179
- def forward(self, x):
180
- fmap = []
181
-
182
- for l in self.convs:
183
- x = l(x)
184
- x = F.leaky_relu(x, modules.LRELU_SLOPE)
185
- fmap.append(x)
186
- x = self.conv_post(x)
187
- fmap.append(x)
188
- x = torch.flatten(x, 1, -1)
189
-
190
- return x, fmap
191
-
192
-
193
- class MultiPeriodDiscriminator(torch.nn.Module):
194
- def __init__(self, use_spectral_norm=False):
195
- super(MultiPeriodDiscriminator, self).__init__()
196
- periods = [2,3,5,7,11]
197
-
198
- discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
199
- discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
200
- self.discriminators = nn.ModuleList(discs)
201
-
202
- def forward(self, y, y_hat):
203
- y_d_rs = []
204
- y_d_gs = []
205
- fmap_rs = []
206
- fmap_gs = []
207
- for i, d in enumerate(self.discriminators):
208
- y_d_r, fmap_r = d(y)
209
- y_d_g, fmap_g = d(y_hat)
210
- y_d_rs.append(y_d_r)
211
- y_d_gs.append(y_d_g)
212
- fmap_rs.append(fmap_r)
213
- fmap_gs.append(fmap_g)
214
-
215
- return y_d_rs, y_d_gs, fmap_rs, fmap_gs
216
-
217
-
218
- class SpeakerEncoder(torch.nn.Module):
219
- def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
220
- super(SpeakerEncoder, self).__init__()
221
- self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
222
- self.linear = nn.Linear(model_hidden_size, model_embedding_size)
223
- self.relu = nn.ReLU()
224
-
225
- def forward(self, mels):
226
- self.lstm.flatten_parameters()
227
- _, (hidden, _) = self.lstm(mels)
228
- embeds_raw = self.relu(self.linear(hidden[-1]))
229
- return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
230
-
231
- def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
232
- mel_slices = []
233
- for i in range(0, total_frames-partial_frames, partial_hop):
234
- mel_range = torch.arange(i, i+partial_frames)
235
- mel_slices.append(mel_range)
236
-
237
- return mel_slices
238
-
239
- def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
240
- mel_len = mel.size(1)
241
- last_mel = mel[:,-partial_frames:]
242
-
243
- if mel_len > partial_frames:
244
- mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
245
- mels = list(mel[:,s] for s in mel_slices)
246
- mels.append(last_mel)
247
- mels = torch.stack(tuple(mels), 0).squeeze(1)
248
-
249
- with torch.no_grad():
250
- partial_embeds = self(mels)
251
- embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
252
- #embed = embed / torch.linalg.norm(embed, 2)
253
- else:
254
- with torch.no_grad():
255
- embed = self(last_mel)
256
-
257
- return embed
258
-
259
-
260
- class SynthesizerTrn(nn.Module):
261
- """
262
- Synthesizer for Training
263
- """
264
-
265
- def __init__(self,
266
- spec_channels,
267
- segment_size,
268
- inter_channels,
269
- hidden_channels,
270
- filter_channels,
271
- n_heads,
272
- n_layers,
273
- kernel_size,
274
- p_dropout,
275
- resblock,
276
- resblock_kernel_sizes,
277
- resblock_dilation_sizes,
278
- upsample_rates,
279
- upsample_initial_channel,
280
- upsample_kernel_sizes,
281
- gin_channels,
282
- ssl_dim,
283
- n_speakers,
284
- **kwargs):
285
-
286
- super().__init__()
287
- self.spec_channels = spec_channels
288
- self.inter_channels = inter_channels
289
- self.hidden_channels = hidden_channels
290
- self.filter_channels = filter_channels
291
- self.n_heads = n_heads
292
- self.n_layers = n_layers
293
- self.kernel_size = kernel_size
294
- self.p_dropout = p_dropout
295
- self.resblock = resblock
296
- self.resblock_kernel_sizes = resblock_kernel_sizes
297
- self.resblock_dilation_sizes = resblock_dilation_sizes
298
- self.upsample_rates = upsample_rates
299
- self.upsample_initial_channel = upsample_initial_channel
300
- self.upsample_kernel_sizes = upsample_kernel_sizes
301
- self.segment_size = segment_size
302
- self.gin_channels = gin_channels
303
- self.ssl_dim = ssl_dim
304
- self.emb_g = nn.Embedding(n_speakers, gin_channels)
305
-
306
- self.enc_p_ = TextEncoder(ssl_dim, inter_channels, hidden_channels, 5, 1, 16,0, filter_channels, n_heads, p_dropout)
307
- hps = {
308
- "sampling_rate": 32000,
309
- "inter_channels": 192,
310
- "resblock": "1",
311
- "resblock_kernel_sizes": [3, 7, 11],
312
- "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
313
- "upsample_rates": [10, 8, 2, 2],
314
- "upsample_initial_channel": 512,
315
- "upsample_kernel_sizes": [16, 16, 4, 4],
316
- "gin_channels": 256,
317
- }
318
- self.dec = Generator(h=hps)
319
- self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
320
- self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
321
-
322
- def forward(self, c, f0, spec, g=None, mel=None, c_lengths=None, spec_lengths=None):
323
- if c_lengths == None:
324
- c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
325
- if spec_lengths == None:
326
- spec_lengths = (torch.ones(spec.size(0)) * spec.size(-1)).to(spec.device)
327
-
328
- g = self.emb_g(g).transpose(1,2)
329
-
330
- z_ptemp, m_p, logs_p, _ = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
331
- z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
332
-
333
- z_p = self.flow(z, spec_mask, g=g)
334
- z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)
335
-
336
- # o = self.dec(z_slice, g=g)
337
- o = self.dec(z_slice, g=g, f0=pitch_slice)
338
-
339
- return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
340
-
341
- def infer(self, c, f0, g=None, mel=None, c_lengths=None):
342
- if c_lengths == None:
343
- c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
344
- g = self.emb_g(g).transpose(1,2)
345
-
346
- z_p, m_p, logs_p, c_mask = self.enc_p_(c, c_lengths, f0=f0_to_coarse(f0))
347
- z = self.flow(z_p, c_mask, g=g, reverse=True)
348
-
349
- o = self.dec(z * c_mask, g=g, f0=f0)
350
-
351
- return o
src/tts_vits/modules.py DELETED
@@ -1,342 +0,0 @@
1
- import copy
2
- import math
3
- import numpy as np
4
- import scipy
5
- import torch
6
- from torch import nn
7
- from torch.nn import functional as F
8
-
9
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
10
- from torch.nn.utils import weight_norm, remove_weight_norm
11
-
12
- import commons
13
- from commons import init_weights, get_padding
14
-
15
-
16
- LRELU_SLOPE = 0.1
17
-
18
-
19
- class LayerNorm(nn.Module):
20
- def __init__(self, channels, eps=1e-5):
21
- super().__init__()
22
- self.channels = channels
23
- self.eps = eps
24
-
25
- self.gamma = nn.Parameter(torch.ones(channels))
26
- self.beta = nn.Parameter(torch.zeros(channels))
27
-
28
- def forward(self, x):
29
- x = x.transpose(1, -1)
30
- x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
31
- return x.transpose(1, -1)
32
-
33
-
34
- class ConvReluNorm(nn.Module):
35
- def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
36
- super().__init__()
37
- self.in_channels = in_channels
38
- self.hidden_channels = hidden_channels
39
- self.out_channels = out_channels
40
- self.kernel_size = kernel_size
41
- self.n_layers = n_layers
42
- self.p_dropout = p_dropout
43
- assert n_layers > 1, "Number of layers should be larger than 0."
44
-
45
- self.conv_layers = nn.ModuleList()
46
- self.norm_layers = nn.ModuleList()
47
- self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
48
- self.norm_layers.append(LayerNorm(hidden_channels))
49
- self.relu_drop = nn.Sequential(
50
- nn.ReLU(),
51
- nn.Dropout(p_dropout))
52
- for _ in range(n_layers-1):
53
- self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
54
- self.norm_layers.append(LayerNorm(hidden_channels))
55
- self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
56
- self.proj.weight.data.zero_()
57
- self.proj.bias.data.zero_()
58
-
59
- def forward(self, x, x_mask):
60
- x_org = x
61
- for i in range(self.n_layers):
62
- x = self.conv_layers[i](x * x_mask)
63
- x = self.norm_layers[i](x)
64
- x = self.relu_drop(x)
65
- x = x_org + self.proj(x)
66
- return x * x_mask
67
-
68
-
69
- class DDSConv(nn.Module):
70
- """
71
-    Dilated and Depth-Separable Convolution
72
- """
73
- def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
74
- super().__init__()
75
- self.channels = channels
76
- self.kernel_size = kernel_size
77
- self.n_layers = n_layers
78
- self.p_dropout = p_dropout
79
-
80
- self.drop = nn.Dropout(p_dropout)
81
- self.convs_sep = nn.ModuleList()
82
- self.convs_1x1 = nn.ModuleList()
83
- self.norms_1 = nn.ModuleList()
84
- self.norms_2 = nn.ModuleList()
85
- for i in range(n_layers):
86
- dilation = kernel_size ** i
87
- padding = (kernel_size * dilation - dilation) // 2
88
- self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
89
- groups=channels, dilation=dilation, padding=padding
90
- ))
91
- self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
92
- self.norms_1.append(LayerNorm(channels))
93
- self.norms_2.append(LayerNorm(channels))
94
-
95
- def forward(self, x, x_mask, g=None):
96
- if g is not None:
97
- x = x + g
98
- for i in range(self.n_layers):
99
- y = self.convs_sep[i](x * x_mask)
100
- y = self.norms_1[i](y)
101
- y = F.gelu(y)
102
- y = self.convs_1x1[i](y)
103
- y = self.norms_2[i](y)
104
- y = F.gelu(y)
105
- y = self.drop(y)
106
- x = x + y
107
- return x * x_mask
108
-
109
-
110
- class WN(torch.nn.Module):
111
- def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
112
- super(WN, self).__init__()
113
- assert(kernel_size % 2 == 1)
114
- self.hidden_channels =hidden_channels
115
- self.kernel_size = kernel_size,
116
- self.dilation_rate = dilation_rate
117
- self.n_layers = n_layers
118
- self.gin_channels = gin_channels
119
- self.p_dropout = p_dropout
120
-
121
- self.in_layers = torch.nn.ModuleList()
122
- self.res_skip_layers = torch.nn.ModuleList()
123
- self.drop = nn.Dropout(p_dropout)
124
-
125
- if gin_channels != 0:
126
- cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
127
- self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
128
-
129
- for i in range(n_layers):
130
- dilation = dilation_rate ** i
131
- padding = int((kernel_size * dilation - dilation) / 2)
132
- in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
133
- dilation=dilation, padding=padding)
134
- in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
135
- self.in_layers.append(in_layer)
136
-
137
- # last one is not necessary
138
- if i < n_layers - 1:
139
- res_skip_channels = 2 * hidden_channels
140
- else:
141
- res_skip_channels = hidden_channels
142
-
143
- res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
144
- res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
145
- self.res_skip_layers.append(res_skip_layer)
146
-
147
- def forward(self, x, x_mask, g=None, **kwargs):
148
- output = torch.zeros_like(x)
149
- n_channels_tensor = torch.IntTensor([self.hidden_channels])
150
-
151
- if g is not None:
152
- g = self.cond_layer(g)
153
-
154
- for i in range(self.n_layers):
155
- x_in = self.in_layers[i](x)
156
- if g is not None:
157
- cond_offset = i * 2 * self.hidden_channels
158
- g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
159
- else:
160
- g_l = torch.zeros_like(x_in)
161
-
162
- acts = commons.fused_add_tanh_sigmoid_multiply(
163
- x_in,
164
- g_l,
165
- n_channels_tensor)
166
- acts = self.drop(acts)
167
-
168
- res_skip_acts = self.res_skip_layers[i](acts)
169
- if i < self.n_layers - 1:
170
- res_acts = res_skip_acts[:,:self.hidden_channels,:]
171
- x = (x + res_acts) * x_mask
172
- output = output + res_skip_acts[:,self.hidden_channels:,:]
173
- else:
174
- output = output + res_skip_acts
175
- return output * x_mask
176
-
177
- def remove_weight_norm(self):
178
- if self.gin_channels != 0:
179
- torch.nn.utils.remove_weight_norm(self.cond_layer)
180
- for l in self.in_layers:
181
- torch.nn.utils.remove_weight_norm(l)
182
- for l in self.res_skip_layers:
183
- torch.nn.utils.remove_weight_norm(l)
184
-
185
-
186
- class ResBlock1(torch.nn.Module):
187
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
188
- super(ResBlock1, self).__init__()
189
- self.convs1 = nn.ModuleList([
190
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
191
- padding=get_padding(kernel_size, dilation[0]))),
192
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
193
- padding=get_padding(kernel_size, dilation[1]))),
194
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
195
- padding=get_padding(kernel_size, dilation[2])))
196
- ])
197
- self.convs1.apply(init_weights)
198
-
199
- self.convs2 = nn.ModuleList([
200
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
201
- padding=get_padding(kernel_size, 1))),
202
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
203
- padding=get_padding(kernel_size, 1))),
204
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
205
- padding=get_padding(kernel_size, 1)))
206
- ])
207
- self.convs2.apply(init_weights)
208
-
209
- def forward(self, x, x_mask=None):
210
- for c1, c2 in zip(self.convs1, self.convs2):
211
- xt = F.leaky_relu(x, LRELU_SLOPE)
212
- if x_mask is not None:
213
- xt = xt * x_mask
214
- xt = c1(xt)
215
- xt = F.leaky_relu(xt, LRELU_SLOPE)
216
- if x_mask is not None:
217
- xt = xt * x_mask
218
- xt = c2(xt)
219
- x = xt + x
220
- if x_mask is not None:
221
- x = x * x_mask
222
- return x
223
-
224
- def remove_weight_norm(self):
225
- for l in self.convs1:
226
- remove_weight_norm(l)
227
- for l in self.convs2:
228
- remove_weight_norm(l)
229
-
230
-
231
- class ResBlock2(torch.nn.Module):
232
- def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
233
- super(ResBlock2, self).__init__()
234
- self.convs = nn.ModuleList([
235
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
236
- padding=get_padding(kernel_size, dilation[0]))),
237
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
238
- padding=get_padding(kernel_size, dilation[1])))
239
- ])
240
- self.convs.apply(init_weights)
241
-
242
- def forward(self, x, x_mask=None):
243
- for c in self.convs:
244
- xt = F.leaky_relu(x, LRELU_SLOPE)
245
- if x_mask is not None:
246
- xt = xt * x_mask
247
- xt = c(xt)
248
- x = xt + x
249
- if x_mask is not None:
250
- x = x * x_mask
251
- return x
252
-
253
- def remove_weight_norm(self):
254
- for l in self.convs:
255
- remove_weight_norm(l)
256
-
257
-
258
- class Log(nn.Module):
259
- def forward(self, x, x_mask, reverse=False, **kwargs):
260
- if not reverse:
261
- y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
262
- logdet = torch.sum(-y, [1, 2])
263
- return y, logdet
264
- else:
265
- x = torch.exp(x) * x_mask
266
- return x
267
-
268
-
269
- class Flip(nn.Module):
270
- def forward(self, x, *args, reverse=False, **kwargs):
271
- x = torch.flip(x, [1])
272
- if not reverse:
273
- logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
274
- return x, logdet
275
- else:
276
- return x
277
-
278
-
279
- class ElementwiseAffine(nn.Module):
280
- def __init__(self, channels):
281
- super().__init__()
282
- self.channels = channels
283
- self.m = nn.Parameter(torch.zeros(channels,1))
284
- self.logs = nn.Parameter(torch.zeros(channels,1))
285
-
286
- def forward(self, x, x_mask, reverse=False, **kwargs):
287
- if not reverse:
288
- y = self.m + torch.exp(self.logs) * x
289
- y = y * x_mask
290
- logdet = torch.sum(self.logs * x_mask, [1,2])
291
- return y, logdet
292
- else:
293
- x = (x - self.m) * torch.exp(-self.logs) * x_mask
294
- return x
295
-
296
-
297
- class ResidualCouplingLayer(nn.Module):
298
- def __init__(self,
299
- channels,
300
- hidden_channels,
301
- kernel_size,
302
- dilation_rate,
303
- n_layers,
304
- p_dropout=0,
305
- gin_channels=0,
306
- mean_only=False):
307
- assert channels % 2 == 0, "channels should be divisible by 2"
308
- super().__init__()
309
- self.channels = channels
310
- self.hidden_channels = hidden_channels
311
- self.kernel_size = kernel_size
312
- self.dilation_rate = dilation_rate
313
- self.n_layers = n_layers
314
- self.half_channels = channels // 2
315
- self.mean_only = mean_only
316
-
317
- self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
318
- self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
319
- self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
320
- self.post.weight.data.zero_()
321
- self.post.bias.data.zero_()
322
-
323
- def forward(self, x, x_mask, g=None, reverse=False):
324
- x0, x1 = torch.split(x, [self.half_channels]*2, 1)
325
- h = self.pre(x0) * x_mask
326
- h = self.enc(h, x_mask, g=g)
327
- stats = self.post(h) * x_mask
328
- if not self.mean_only:
329
- m, logs = torch.split(stats, [self.half_channels]*2, 1)
330
- else:
331
- m = stats
332
- logs = torch.zeros_like(m)
333
-
334
- if not reverse:
335
- x1 = m + x1 * torch.exp(logs) * x_mask
336
- x = torch.cat([x0, x1], 1)
337
- logdet = torch.sum(logs, [1,2])
338
- return x, logdet
339
- else:
340
- x1 = (x1 - m) * torch.exp(-logs) * x_mask
341
- x = torch.cat([x0, x1], 1)
342
- return x
src/tts_vits/requirements.txt DELETED
@@ -1,16 +0,0 @@
1
- Flask==2.1.2
2
- Flask_Cors==3.0.10
3
- gradio==3.4.1
4
- playsound==1.3.0
5
- PyAudio==0.2.12
6
- pydub==0.25.1
7
- pyworld==0.3.3
8
- requests==2.28.1
9
- scipy==1.7.3
10
- sounddevice==0.4.5
11
- SoundFile==0.10.3.post1
12
- starlette==0.19.1
13
- torchaudio==0.10.0
14
- tqdm==4.63.0
15
- scikit-maad
16
- praat-parselmouth
src/tts_vits/utils.py DELETED
@@ -1,338 +0,0 @@
1
- import os
2
- import glob
3
- import sys
4
- import argparse
5
- import logging
6
- import json
7
- import subprocess
8
-
9
- import librosa
10
- import numpy as np
11
- import torchaudio
12
- from scipy.io.wavfile import read
13
- import torch
14
- import torchvision
15
- from torch.nn import functional as F
16
- from commons import sequence_mask
17
- from hubert import hubert_model
18
- MATPLOTLIB_FLAG = False
19
-
20
- logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
21
- logger = logging
22
-
23
- f0_bin = 256
24
- f0_max = 1100.0
25
- f0_min = 50.0
26
- f0_mel_min = 1127 * np.log(1 + f0_min / 700)
27
- f0_mel_max = 1127 * np.log(1 + f0_max / 700)
28
-
29
- def f0_to_coarse(f0):
30
- is_torch = isinstance(f0, torch.Tensor)
31
- f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
32
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1
33
-
34
- f0_mel[f0_mel <= 1] = 1
35
- f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
36
- f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int)
37
- assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
38
- return f0_coarse
39
-
40
-
41
- def get_hubert_model(rank=None):
42
-
43
- hubert_soft = hubert_model.hubert_soft("hubert/hubert-soft-0d54a1f4.pt")
44
- if rank is not None:
45
- hubert_soft = hubert_soft.cuda(rank)
46
- return hubert_soft
47
-
48
- def get_hubert_content(hmodel, y=None, path=None):
49
- if path is not None:
50
- source, sr = torchaudio.load(path)
51
- source = torchaudio.functional.resample(source, sr, 16000)
52
- if len(source.shape) == 2 and source.shape[1] >= 2:
53
- source = torch.mean(source, dim=0).unsqueeze(0)
54
- else:
55
- source = y
56
- source = source.unsqueeze(0)
57
- with torch.inference_mode():
58
- units = hmodel.units(source)
59
- return units.transpose(1,2)
60
-
61
-
62
- def get_content(cmodel, y):
63
- with torch.no_grad():
64
- c = cmodel.extract_features(y.squeeze(1))[0]
65
- c = c.transpose(1, 2)
66
- return c
67
-
68
-
69
-
70
- def transform(mel, height): # 68-92
71
- #r = np.random.random()
72
- #rate = r * 0.3 + 0.85 # 0.85-1.15
73
- #height = int(mel.size(-2) * rate)
74
- tgt = torchvision.transforms.functional.resize(mel, (height, mel.size(-1)))
75
- if height >= mel.size(-2):
76
- return tgt[:, :mel.size(-2), :]
77
- else:
78
- silence = tgt[:,-1:,:].repeat(1,mel.size(-2)-height,1)
79
- silence += torch.randn_like(silence) / 10
80
- return torch.cat((tgt, silence), 1)
81
-
82
-
83
- def stretch(mel, width): # 0.5-2
84
- return torchvision.transforms.functional.resize(mel, (mel.size(-2), width))
85
-
86
-
87
- def load_checkpoint(checkpoint_path, model, optimizer=None):
88
- assert os.path.isfile(checkpoint_path)
89
- checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
90
- iteration = checkpoint_dict['iteration']
91
- learning_rate = checkpoint_dict['learning_rate']
92
- if iteration is None:
93
- iteration = 1
94
- if learning_rate is None:
95
- learning_rate = 0.0002
96
- if optimizer is not None and checkpoint_dict['optimizer'] is not None:
97
- optimizer.load_state_dict(checkpoint_dict['optimizer'])
98
- saved_state_dict = checkpoint_dict['model']
99
- if hasattr(model, 'module'):
100
- state_dict = model.module.state_dict()
101
- else:
102
- state_dict = model.state_dict()
103
- new_state_dict= {}
104
- for k, v in state_dict.items():
105
- try:
106
- new_state_dict[k] = saved_state_dict[k]
107
- except:
108
- logger.info("%s is not in the checkpoint" % k)
109
- new_state_dict[k] = v
110
- if hasattr(model, 'module'):
111
- model.module.load_state_dict(new_state_dict)
112
- else:
113
- model.load_state_dict(new_state_dict)
114
- logger.info("Loaded checkpoint '{}' (iteration {})" .format(
115
- checkpoint_path, iteration))
116
- return model, optimizer, learning_rate, iteration
117
-
118
-
119
- def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
120
- # ckptname = checkpoint_path.split(os.sep)[-1]
121
- # newest_step = int(ckptname.split(".")[0].split("_")[1])
122
- # val_steps = 2000
123
- # last_ckptname = checkpoint_path.replace(str(newest_step), str(newest_step - val_steps*3))
124
- # if newest_step >= val_steps*3:
125
- # os.system(f"rm {last_ckptname}")
126
- logger.info("Saving model and optimizer state at iteration {} to {}".format(
127
- iteration, checkpoint_path))
128
- if hasattr(model, 'module'):
129
- state_dict = model.module.state_dict()
130
- else:
131
- state_dict = model.state_dict()
132
- torch.save({'model': state_dict,
133
- 'iteration': iteration,
134
- 'optimizer': optimizer.state_dict(),
135
- 'learning_rate': learning_rate}, checkpoint_path)
136
-
137
-
138
- def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
139
- for k, v in scalars.items():
140
- writer.add_scalar(k, v, global_step)
141
- for k, v in histograms.items():
142
- writer.add_histogram(k, v, global_step)
143
- for k, v in images.items():
144
- writer.add_image(k, v, global_step, dataformats='HWC')
145
- for k, v in audios.items():
146
- writer.add_audio(k, v, global_step, audio_sampling_rate)
147
-
148
-
149
- def latest_checkpoint_path(dir_path, regex="G_*.pth"):
150
- f_list = glob.glob(os.path.join(dir_path, regex))
151
- f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
152
- x = f_list[-1]
153
- print(x)
154
- return x
155
-
156
-
157
- def plot_spectrogram_to_numpy(spectrogram):
158
- global MATPLOTLIB_FLAG
159
- if not MATPLOTLIB_FLAG:
160
- import matplotlib
161
- matplotlib.use("Agg")
162
- MATPLOTLIB_FLAG = True
163
- mpl_logger = logging.getLogger('matplotlib')
164
- mpl_logger.setLevel(logging.WARNING)
165
- import matplotlib.pylab as plt
166
- import numpy as np
167
-
168
- fig, ax = plt.subplots(figsize=(10,2))
169
- im = ax.imshow(spectrogram, aspect="auto", origin="lower",
170
- interpolation='none')
171
- plt.colorbar(im, ax=ax)
172
- plt.xlabel("Frames")
173
- plt.ylabel("Channels")
174
- plt.tight_layout()
175
-
176
- fig.canvas.draw()
177
- data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
178
- data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
179
- plt.close()
180
- return data
181
-
182
-
183
- def plot_alignment_to_numpy(alignment, info=None):
184
- global MATPLOTLIB_FLAG
185
- if not MATPLOTLIB_FLAG:
186
- import matplotlib
187
- matplotlib.use("Agg")
188
- MATPLOTLIB_FLAG = True
189
- mpl_logger = logging.getLogger('matplotlib')
190
- mpl_logger.setLevel(logging.WARNING)
191
- import matplotlib.pylab as plt
192
- import numpy as np
193
-
194
- fig, ax = plt.subplots(figsize=(6, 4))
195
- im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
196
- interpolation='none')
197
- fig.colorbar(im, ax=ax)
198
- xlabel = 'Decoder timestep'
199
- if info is not None:
200
- xlabel += '\n\n' + info
201
- plt.xlabel(xlabel)
202
- plt.ylabel('Encoder timestep')
203
- plt.tight_layout()
204
-
205
- fig.canvas.draw()
206
- data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
207
- data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
208
- plt.close()
209
- return data
210
-
211
-
212
- def load_wav_to_torch(full_path):
213
- sampling_rate, data = read(full_path)
214
- return torch.FloatTensor(data.astype(np.float32)), sampling_rate
215
-
216
-
217
- def load_filepaths_and_text(filename, split="|"):
218
- with open(filename, encoding='utf-8') as f:
219
- filepaths_and_text = [line.strip().split(split) for line in f]
220
- return filepaths_and_text
221
-
222
-
223
- def get_hparams(init=True):
224
- parser = argparse.ArgumentParser()
225
- parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
226
- help='JSON file for configuration')
227
- parser.add_argument('-m', '--model', type=str, required=True,
228
- help='Model name')
229
-
230
- args = parser.parse_args()
231
- model_dir = os.path.join("./logs", args.model)
232
-
233
- if not os.path.exists(model_dir):
234
- os.makedirs(model_dir)
235
-
236
- config_path = args.config
237
- config_save_path = os.path.join(model_dir, "config.json")
238
- if init:
239
- with open(config_path, "r") as f:
240
- data = f.read()
241
- with open(config_save_path, "w") as f:
242
- f.write(data)
243
- else:
244
- with open(config_save_path, "r") as f:
245
- data = f.read()
246
- config = json.loads(data)
247
-
248
- hparams = HParams(**config)
249
- hparams.model_dir = model_dir
250
- return hparams
251
-
252
-
253
- def get_hparams_from_dir(model_dir):
254
- config_save_path = os.path.join(model_dir, "config.json")
255
- with open(config_save_path, "r") as f:
256
- data = f.read()
257
- config = json.loads(data)
258
-
259
- hparams =HParams(**config)
260
- hparams.model_dir = model_dir
261
- return hparams
262
-
263
-
264
- def get_hparams_from_file(config_path):
265
- with open(config_path, "r") as f:
266
- data = f.read()
267
- config = json.loads(data)
268
-
269
- hparams =HParams(**config)
270
- return hparams
271
-
272
-
273
- def check_git_hash(model_dir):
274
- source_dir = os.path.dirname(os.path.realpath(__file__))
275
- if not os.path.exists(os.path.join(source_dir, ".git")):
276
- logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
277
- source_dir
278
- ))
279
- return
280
-
281
- cur_hash = subprocess.getoutput("git rev-parse HEAD")
282
-
283
- path = os.path.join(model_dir, "githash")
284
- if os.path.exists(path):
285
- saved_hash = open(path).read()
286
- if saved_hash != cur_hash:
287
- logger.warn("git hash values are different. {}(saved) != {}(current)".format(
288
- saved_hash[:8], cur_hash[:8]))
289
- else:
290
- open(path, "w").write(cur_hash)
291
-
292
-
293
- def get_logger(model_dir, filename="train.log"):
294
- global logger
295
- logger = logging.getLogger(os.path.basename(model_dir))
296
- logger.setLevel(logging.DEBUG)
297
-
298
- formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
299
- if not os.path.exists(model_dir):
300
- os.makedirs(model_dir)
301
- h = logging.FileHandler(os.path.join(model_dir, filename))
302
- h.setLevel(logging.DEBUG)
303
- h.setFormatter(formatter)
304
- logger.addHandler(h)
305
- return logger
306
-
307
-
308
- class HParams():
309
- def __init__(self, **kwargs):
310
- for k, v in kwargs.items():
311
- if type(v) == dict:
312
- v = HParams(**v)
313
- self[k] = v
314
-
315
- def keys(self):
316
- return self.__dict__.keys()
317
-
318
- def items(self):
319
- return self.__dict__.items()
320
-
321
- def values(self):
322
- return self.__dict__.values()
323
-
324
- def __len__(self):
325
- return len(self.__dict__)
326
-
327
- def __getitem__(self, key):
328
- return getattr(self, key)
329
-
330
- def __setitem__(self, key, value):
331
- return setattr(self, key, value)
332
-
333
- def __contains__(self, key):
334
- return key in self.__dict__
335
-
336
- def __repr__(self):
337
- return self.__dict__.__repr__()
338
-
src/tts_vits/vdecoder/__init__.py DELETED
File without changes
src/tts_vits/vdecoder/hifigan/env.py DELETED
@@ -1,15 +0,0 @@
1
- import os
2
- import shutil
3
-
4
-
5
- class AttrDict(dict):
6
- def __init__(self, *args, **kwargs):
7
- super(AttrDict, self).__init__(*args, **kwargs)
8
- self.__dict__ = self
9
-
10
-
11
- def build_env(config, config_name, path):
12
- t_path = os.path.join(path, config_name)
13
- if config != t_path:
14
- os.makedirs(path, exist_ok=True)
15
- shutil.copyfile(config, os.path.join(path, config_name))
src/tts_vits/vdecoder/hifigan/models.py DELETED
@@ -1,503 +0,0 @@
1
- import os
2
- import json
3
- from .env import AttrDict
4
- import numpy as np
5
- import torch
6
- import torch.nn.functional as F
7
- import torch.nn as nn
8
- from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
9
- from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
10
- from .utils import init_weights, get_padding
11
-
12
- LRELU_SLOPE = 0.1
13
-
14
-
15
- def load_model(model_path, device='cuda'):
16
- config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
17
- with open(config_file) as f:
18
- data = f.read()
19
-
20
- global h
21
- json_config = json.loads(data)
22
- h = AttrDict(json_config)
23
-
24
- generator = Generator(h).to(device)
25
-
26
- cp_dict = torch.load(model_path)
27
- generator.load_state_dict(cp_dict['generator'])
28
- generator.eval()
29
- generator.remove_weight_norm()
30
- del cp_dict
31
- return generator, h
32
-
33
-
34
- class ResBlock1(torch.nn.Module):
35
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
36
- super(ResBlock1, self).__init__()
37
- self.h = h
38
- self.convs1 = nn.ModuleList([
39
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
40
- padding=get_padding(kernel_size, dilation[0]))),
41
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
42
- padding=get_padding(kernel_size, dilation[1]))),
43
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
44
- padding=get_padding(kernel_size, dilation[2])))
45
- ])
46
- self.convs1.apply(init_weights)
47
-
48
- self.convs2 = nn.ModuleList([
49
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
50
- padding=get_padding(kernel_size, 1))),
51
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
52
- padding=get_padding(kernel_size, 1))),
53
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
54
- padding=get_padding(kernel_size, 1)))
55
- ])
56
- self.convs2.apply(init_weights)
57
-
58
- def forward(self, x):
59
- for c1, c2 in zip(self.convs1, self.convs2):
60
- xt = F.leaky_relu(x, LRELU_SLOPE)
61
- xt = c1(xt)
62
- xt = F.leaky_relu(xt, LRELU_SLOPE)
63
- xt = c2(xt)
64
- x = xt + x
65
- return x
66
-
67
- def remove_weight_norm(self):
68
- for l in self.convs1:
69
- remove_weight_norm(l)
70
- for l in self.convs2:
71
- remove_weight_norm(l)
72
-
73
-
74
- class ResBlock2(torch.nn.Module):
75
- def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
76
- super(ResBlock2, self).__init__()
77
- self.h = h
78
- self.convs = nn.ModuleList([
79
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
80
- padding=get_padding(kernel_size, dilation[0]))),
81
- weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
82
- padding=get_padding(kernel_size, dilation[1])))
83
- ])
84
- self.convs.apply(init_weights)
85
-
86
- def forward(self, x):
87
- for c in self.convs:
88
- xt = F.leaky_relu(x, LRELU_SLOPE)
89
- xt = c(xt)
90
- x = xt + x
91
- return x
92
-
93
- def remove_weight_norm(self):
94
- for l in self.convs:
95
- remove_weight_norm(l)
96
-
97
-
98
- def padDiff(x):
99
- return F.pad(F.pad(x, (0,0,-1,1), 'constant', 0) - x, (0,0,0,-1), 'constant', 0)
100
-
101
- class SineGen(torch.nn.Module):
102
- """ Definition of sine generator
103
- SineGen(samp_rate, harmonic_num = 0,
104
- sine_amp = 0.1, noise_std = 0.003,
105
- voiced_threshold = 0,
106
- flag_for_pulse=False)
107
- samp_rate: sampling rate in Hz
108
- harmonic_num: number of harmonic overtones (default 0)
109
-    sine_amp: amplitude of sine waveform (default 0.1)
110
- noise_std: std of Gaussian noise (default 0.003)
111
-    voiced_threshold: F0 threshold for U/V classification (default 0)
112
- flag_for_pulse: this SinGen is used inside PulseGen (default False)
113
- Note: when flag_for_pulse is True, the first time step of a voiced
114
- segment is always sin(np.pi) or cos(0)
115
- """
116
-
117
- def __init__(self, samp_rate, harmonic_num=0,
118
- sine_amp=0.1, noise_std=0.003,
119
- voiced_threshold=0,
120
- flag_for_pulse=False):
121
- super(SineGen, self).__init__()
122
- self.sine_amp = sine_amp
123
- self.noise_std = noise_std
124
- self.harmonic_num = harmonic_num
125
- self.dim = self.harmonic_num + 1
126
- self.sampling_rate = samp_rate
127
- self.voiced_threshold = voiced_threshold
128
- self.flag_for_pulse = flag_for_pulse
129
-
130
- def _f02uv(self, f0):
131
- # generate uv signal
132
- uv = (f0 > self.voiced_threshold).type(torch.float32)
133
- return uv
134
-
135
- def _f02sine(self, f0_values):
136
- """ f0_values: (batchsize, length, dim)
137
- where dim indicates fundamental tone and overtones
138
- """
139
-        # convert to F0 in rad. The integer part n can be ignored
140
- # because 2 * np.pi * n doesn't affect phase
141
- rad_values = (f0_values / self.sampling_rate) % 1
142
-
143
- # initial phase noise (no noise for fundamental component)
144
- rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], \
145
- device=f0_values.device)
146
- rand_ini[:, 0] = 0
147
- rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
148
-
149
-        # instantaneous phase sine[t] = sin(2*pi \sum_{i=1}^{t} rad)
150
- if not self.flag_for_pulse:
151
- # for normal case
152
-
153
- # To prevent torch.cumsum numerical overflow,
154
- # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
155
- # Buffer tmp_over_one_idx indicates the time step to add -1.
156
- # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
157
- tmp_over_one = torch.cumsum(rad_values, 1) % 1
158
- tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
159
- cumsum_shift = torch.zeros_like(rad_values)
160
- cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
161
-
162
- sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1)
163
- * 2 * np.pi)
164
- else:
165
- # If necessary, make sure that the first time step of every
166
- # voiced segments is sin(pi) or cos(0)
167
- # This is used for pulse-train generation
168
-
169
- # identify the last time step in unvoiced segments
170
- uv = self._f02uv(f0_values)
171
- uv_1 = torch.roll(uv, shifts=-1, dims=1)
172
- uv_1[:, -1, :] = 1
173
- u_loc = (uv < 1) * (uv_1 > 0)
174
-
175
- # get the instantanouse phase
176
- tmp_cumsum = torch.cumsum(rad_values, dim=1)
177
- # different batch needs to be processed differently
178
- for idx in range(f0_values.shape[0]):
179
- temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
180
- temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
181
- # stores the accumulation of i.phase within
182
- # each voiced segments
183
- tmp_cumsum[idx, :, :] = 0
184
- tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
185
-
186
- # rad_values - tmp_cumsum: remove the accumulation of i.phase
187
- # within the previous voiced segment.
188
- i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
189
-
190
- # get the sines
191
- sines = torch.cos(i_phase * 2 * np.pi)
192
- return sines
193
-
194
- def forward(self, f0):
195
- """ sine_tensor, uv = forward(f0)
196
- input F0: tensor(batchsize=1, length, dim=1)
197
- f0 for unvoiced steps should be 0
198
- output sine_tensor: tensor(batchsize=1, length, dim)
199
- output uv: tensor(batchsize=1, length, 1)
200
- """
201
- with torch.no_grad():
202
- f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim,
203
- device=f0.device)
204
- # fundamental component
205
- fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))
206
-
207
- # generate sine waveforms
208
- sine_waves = self._f02sine(fn) * self.sine_amp
209
-
210
- # generate uv signal
211
- # uv = torch.ones(f0.shape)
212
- # uv = uv * (f0 > self.voiced_threshold)
213
- uv = self._f02uv(f0)
214
-
215
- # noise: for unvoiced should be similar to sine_amp
216
- # std = self.sine_amp/3 -> max value ~ self.sine_amp
217
- # . for voiced regions is self.noise_std
218
- noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
219
- noise = noise_amp * torch.randn_like(sine_waves)
220
-
221
- # first: set the unvoiced part to 0 by uv
222
- # then: additive noise
223
- sine_waves = sine_waves * uv + noise
224
- return sine_waves, uv, noise
225
-
226
-
227
- class SourceModuleHnNSF(torch.nn.Module):
228
- """ SourceModule for hn-nsf
229
- SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
230
- add_noise_std=0.003, voiced_threshod=0)
231
- sampling_rate: sampling_rate in Hz
232
- harmonic_num: number of harmonic above F0 (default: 0)
233
- sine_amp: amplitude of sine source signal (default: 0.1)
234
- add_noise_std: std of additive Gaussian noise (default: 0.003)
235
- note that amplitude of noise in unvoiced is decided
236
- by sine_amp
237
- voiced_threshold: threhold to set U/V given F0 (default: 0)
238
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
239
- F0_sampled (batchsize, length, 1)
240
- Sine_source (batchsize, length, 1)
241
- noise_source (batchsize, length 1)
242
- uv (batchsize, length, 1)
243
- """
244
-
245
- def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
246
- add_noise_std=0.003, voiced_threshod=0):
247
- super(SourceModuleHnNSF, self).__init__()
248
-
249
- self.sine_amp = sine_amp
250
- self.noise_std = add_noise_std
251
-
252
- # to produce sine waveforms
253
- self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
254
- sine_amp, add_noise_std, voiced_threshod)
255
-
256
- # to merge source harmonics into a single excitation
257
- self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
258
- self.l_tanh = torch.nn.Tanh()
259
-
260
- def forward(self, x):
261
- """
262
- Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
263
- F0_sampled (batchsize, length, 1)
264
- Sine_source (batchsize, length, 1)
265
- noise_source (batchsize, length 1)
266
- """
267
- # source for harmonic branch
268
- sine_wavs, uv, _ = self.l_sin_gen(x)
269
- sine_merge = self.l_tanh(self.l_linear(sine_wavs))
270
-
271
- # source for noise branch, in the same shape as uv
272
- noise = torch.randn_like(uv) * self.sine_amp / 3
273
- return sine_merge, noise, uv
274
-
275
-
276
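Editorial note: the deleted `SourceModuleHnNSF` above is the harmonic-plus-noise excitation of the NSF-HiFiGAN decoder — `SineGen` turns a sample-level F0 track into stacked harmonics, and a linear layer plus tanh merges them into one excitation. A minimal usage sketch follows; the import path, shapes, and values are assumptions for illustration only and are not part of this commit.

```python
import torch
# assumed import path (pre-commit layout, src/tts_vits on PYTHONPATH)
from tts_vits.vdecoder.hifigan.models import SourceModuleHnNSF

# 8 harmonics above F0, matching the Generator defined below in this file
source = SourceModuleHnNSF(sampling_rate=32000, harmonic_num=8)

f0 = torch.full((1, 200, 1), 220.0)   # 200 sample-level F0 values at 220 Hz
f0[:, 150:, :] = 0.0                  # zeros mark unvoiced samples

sine_merge, noise, uv = source(f0)    # each of shape (1, 200, 1)
```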
- class Generator(torch.nn.Module):
-     def __init__(self, h):
-         super(Generator, self).__init__()
-         self.h = h
-
-         self.num_kernels = len(h["resblock_kernel_sizes"])
-         self.num_upsamples = len(h["upsample_rates"])
-         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(h["upsample_rates"]))
-         self.m_source = SourceModuleHnNSF(
-             sampling_rate=h["sampling_rate"],
-             harmonic_num=8)
-         self.noise_convs = nn.ModuleList()
-         self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
-         resblock = ResBlock1 if h["resblock"] == '1' else ResBlock2
-         self.ups = nn.ModuleList()
-         for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
-             c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
-             self.ups.append(weight_norm(
-                 ConvTranspose1d(h["upsample_initial_channel"] // (2 ** i), h["upsample_initial_channel"] // (2 ** (i + 1)),
-                                 k, u, padding=(k - u) // 2)))
-             if i + 1 < len(h["upsample_rates"]):  #
-                 stride_f0 = np.prod(h["upsample_rates"][i + 1:])
-                 self.noise_convs.append(Conv1d(
-                     1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
-             else:
-                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-         self.resblocks = nn.ModuleList()
-         for i in range(len(self.ups)):
-             ch = h["upsample_initial_channel"] // (2 ** (i + 1))
-             for j, (k, d) in enumerate(zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"])):
-                 self.resblocks.append(resblock(h, ch, k, d))
-
-         self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
-         self.ups.apply(init_weights)
-         self.conv_post.apply(init_weights)
-         self.cond = nn.Conv1d(h['gin_channels'], h['upsample_initial_channel'], 1)
-
-     def forward(self, x, f0, g=None):
-         # print(1,x.shape,f0.shape,f0[:, None].shape)
-         f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
-         # print(2,f0.shape)
-         har_source, noi_source, uv = self.m_source(f0)
-         har_source = har_source.transpose(1, 2)
-         x = self.conv_pre(x)
-         x = x + self.cond(g)
-         # print(124,x.shape,har_source.shape)
-         for i in range(self.num_upsamples):
-             x = F.leaky_relu(x, LRELU_SLOPE)
-             # print(3,x.shape)
-             x = self.ups[i](x)
-             x_source = self.noise_convs[i](har_source)
-             # print(4,x_source.shape,har_source.shape,x.shape)
-             x = x + x_source
-             xs = None
-             for j in range(self.num_kernels):
-                 if xs is None:
-                     xs = self.resblocks[i * self.num_kernels + j](x)
-                 else:
-                     xs += self.resblocks[i * self.num_kernels + j](x)
-             x = xs / self.num_kernels
-         x = F.leaky_relu(x)
-         x = self.conv_post(x)
-         x = torch.tanh(x)
-
-         return x
-
-     def remove_weight_norm(self):
-         print('Removing weight norm...')
-         for l in self.ups:
-             remove_weight_norm(l)
-         for l in self.resblocks:
-             l.remove_weight_norm()
-         remove_weight_norm(self.conv_pre)
-         remove_weight_norm(self.conv_post)
-
-
- class DiscriminatorP(torch.nn.Module):
-     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-         super(DiscriminatorP, self).__init__()
-         self.period = period
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList([
-             norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
-             norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
-             norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
-             norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
-             norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
-         ])
-         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-     def forward(self, x):
-         fmap = []
-
-         # 1d to 2d
-         b, c, t = x.shape
-         if t % self.period != 0:  # pad first
-             n_pad = self.period - (t % self.period)
-             x = F.pad(x, (0, n_pad), "reflect")
-             t = t + n_pad
-         x = x.view(b, c, t // self.period, self.period)
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class MultiPeriodDiscriminator(torch.nn.Module):
-     def __init__(self, periods=None):
-         super(MultiPeriodDiscriminator, self).__init__()
-         self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
-         self.discriminators = nn.ModuleList()
-         for period in self.periods:
-             self.discriminators.append(DiscriminatorP(period))
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             y_d_rs.append(y_d_r)
-             fmap_rs.append(fmap_r)
-             y_d_gs.append(y_d_g)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class DiscriminatorS(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(DiscriminatorS, self).__init__()
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList([
-             norm_f(Conv1d(1, 128, 15, 1, padding=7)),
-             norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
-             norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
-             norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
-             norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
-             norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
-             norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-         ])
-         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-     def forward(self, x):
-         fmap = []
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class MultiScaleDiscriminator(torch.nn.Module):
-     def __init__(self):
-         super(MultiScaleDiscriminator, self).__init__()
-         self.discriminators = nn.ModuleList([
-             DiscriminatorS(use_spectral_norm=True),
-             DiscriminatorS(),
-             DiscriminatorS(),
-         ])
-         self.meanpools = nn.ModuleList([
-             AvgPool1d(4, 2, padding=2),
-             AvgPool1d(4, 2, padding=2)
-         ])
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             if i != 0:
-                 y = self.meanpools[i - 1](y)
-                 y_hat = self.meanpools[i - 1](y_hat)
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             y_d_rs.append(y_d_r)
-             fmap_rs.append(fmap_r)
-             y_d_gs.append(y_d_g)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- def feature_loss(fmap_r, fmap_g):
-     loss = 0
-     for dr, dg in zip(fmap_r, fmap_g):
-         for rl, gl in zip(dr, dg):
-             loss += torch.mean(torch.abs(rl - gl))
-
-     return loss * 2
-
-
- def discriminator_loss(disc_real_outputs, disc_generated_outputs):
-     loss = 0
-     r_losses = []
-     g_losses = []
-     for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
-         r_loss = torch.mean((1 - dr) ** 2)
-         g_loss = torch.mean(dg ** 2)
-         loss += (r_loss + g_loss)
-         r_losses.append(r_loss.item())
-         g_losses.append(g_loss.item())
-
-     return loss, r_losses, g_losses
-
-
- def generator_loss(disc_outputs):
-     loss = 0
-     gen_losses = []
-     for dg in disc_outputs:
-         l = torch.mean((1 - dg) ** 2)
-         gen_losses.append(l)
-         loss += l
-
-     return loss, gen_losses
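Editorial note: the deleted loss functions are the least-squares GAN objectives from HiFi-GAN. A hedged sketch of how they combine in a single training step is below; `mpd`, the tensors, and the import path are assumptions for illustration and are not part of this commit (the multi-scale discriminator would be used the same way).

```python
import torch
# assumed import path (pre-commit layout, src/tts_vits on PYTHONPATH)
from tts_vits.vdecoder.hifigan.models import (MultiPeriodDiscriminator,
                                              feature_loss, generator_loss,
                                              discriminator_loss)

mpd = MultiPeriodDiscriminator()
y = torch.randn(1, 1, 8192)        # real waveform (batch, channel, samples)
y_hat = torch.randn(1, 1, 8192)    # stand-in for Generator output

# discriminator step: score real audio against the detached fake
y_d_r, y_d_g, _, _ = mpd(y, y_hat.detach())
loss_disc, _, _ = discriminator_loss(y_d_r, y_d_g)

# generator step: adversarial term plus feature-matching term
y_d_r, y_d_g, fmap_r, fmap_g = mpd(y, y_hat)
loss_gen, _ = generator_loss(y_d_g)
loss_fm = feature_loss(fmap_r, fmap_g)
```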
 
src/tts_vits/vdecoder/hifigan/nvSTFT.py DELETED
@@ -1,111 +0,0 @@
- import math
- import os
- os.environ["LRU_CACHE_CAPACITY"] = "3"
- import random
- import torch
- import torch.utils.data
- import numpy as np
- import librosa
- from librosa.util import normalize
- from librosa.filters import mel as librosa_mel_fn
- from scipy.io.wavfile import read
- import soundfile as sf
-
- def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
-     sampling_rate = None
-     try:
-         data, sampling_rate = sf.read(full_path, always_2d=True)  # than soundfile.
-     except Exception as ex:
-         print(f"'{full_path}' failed to load.\nException:")
-         print(ex)
-         if return_empty_on_exception:
-             return [], sampling_rate or target_sr or 32000
-         else:
-             raise Exception(ex)
-
-     if len(data.shape) > 1:
-         data = data[:, 0]
-         assert len(data) > 2  # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
-
-     if np.issubdtype(data.dtype, np.integer):  # if audio data is type int
-         max_mag = -np.iinfo(data.dtype).min  # maximum magnitude = min possible value of intXX
-     else:  # if audio data is type fp32
-         max_mag = max(np.amax(data), -np.amin(data))
-         max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0)  # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
-
-     data = torch.FloatTensor(data.astype(np.float32))/max_mag
-
-     if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:  # resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
-         return [], sampling_rate or target_sr or 32000
-     if target_sr is not None and sampling_rate != target_sr:
-         data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
-         sampling_rate = target_sr
-
-     return data, sampling_rate
-
- def dynamic_range_compression(x, C=1, clip_val=1e-5):
-     return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
-
- def dynamic_range_decompression(x, C=1):
-     return np.exp(x) / C
-
- def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
-     return torch.log(torch.clamp(x, min=clip_val) * C)
-
- def dynamic_range_decompression_torch(x, C=1):
-     return torch.exp(x) / C
-
- class STFT():
-     def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
-         self.target_sr = sr
-
-         self.n_mels = n_mels
-         self.n_fft = n_fft
-         self.win_size = win_size
-         self.hop_length = hop_length
-         self.fmin = fmin
-         self.fmax = fmax
-         self.clip_val = clip_val
-         self.mel_basis = {}
-         self.hann_window = {}
-
-     def get_mel(self, y, center=False):
-         sampling_rate = self.target_sr
-         n_mels = self.n_mels
-         n_fft = self.n_fft
-         win_size = self.win_size
-         hop_length = self.hop_length
-         fmin = self.fmin
-         fmax = self.fmax
-         clip_val = self.clip_val
-
-         if torch.min(y) < -1.:
-             print('min value is ', torch.min(y))
-         if torch.max(y) > 1.:
-             print('max value is ', torch.max(y))
-
-         if fmax not in self.mel_basis:
-             mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
-             self.mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
-             self.hann_window[str(y.device)] = torch.hann_window(self.win_size).to(y.device)
-
-         y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_length)/2), int((n_fft-hop_length)/2)), mode='reflect')
-         y = y.squeeze(1)
-
-         spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[str(y.device)],
-                           center=center, pad_mode='reflect', normalized=False, onesided=True)
-         # print(111,spec)
-         spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
-         # print(222,spec)
-         spec = torch.matmul(self.mel_basis[str(fmax)+'_'+str(y.device)], spec)
-         # print(333,spec)
-         spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
-         # print(444,spec)
-         return spec
-
-     def __call__(self, audiopath):
-         audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
-         spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
-         return spect
-
- stft = STFT()
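Editorial note: the deleted `STFT` helper loads a file, resamples it to `target_sr`, and returns a log-compressed mel spectrogram. A minimal usage sketch, with the wav path and parameter values as placeholders rather than values taken from this repo:

```python
# assumed import path (pre-commit layout, src/tts_vits on PYTHONPATH)
from tts_vits.vdecoder.hifigan.nvSTFT import STFT

stft = STFT(sr=32000, n_mels=80, n_fft=1024, win_size=1024, hop_length=256)
mel = stft("example.wav")   # tensor of shape (n_mels, n_frames)
```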
 
src/tts_vits/vdecoder/hifigan/utils.py DELETED
@@ -1,68 +0,0 @@
- import glob
- import os
- import matplotlib
- import torch
- from torch.nn.utils import weight_norm
- matplotlib.use("Agg")
- import matplotlib.pylab as plt
-
-
- def plot_spectrogram(spectrogram):
-     fig, ax = plt.subplots(figsize=(10, 2))
-     im = ax.imshow(spectrogram, aspect="auto", origin="lower",
-                    interpolation='none')
-     plt.colorbar(im, ax=ax)
-
-     fig.canvas.draw()
-     plt.close()
-
-     return fig
-
-
- def init_weights(m, mean=0.0, std=0.01):
-     classname = m.__class__.__name__
-     if classname.find("Conv") != -1:
-         m.weight.data.normal_(mean, std)
-
-
- def apply_weight_norm(m):
-     classname = m.__class__.__name__
-     if classname.find("Conv") != -1:
-         weight_norm(m)
-
-
- def get_padding(kernel_size, dilation=1):
-     return int((kernel_size*dilation - dilation)/2)
-
-
- def load_checkpoint(filepath, device):
-     assert os.path.isfile(filepath)
-     print("Loading '{}'".format(filepath))
-     checkpoint_dict = torch.load(filepath, map_location=device)
-     print("Complete.")
-     return checkpoint_dict
-
-
- def save_checkpoint(filepath, obj):
-     print("Saving checkpoint to {}".format(filepath))
-     torch.save(obj, filepath)
-     print("Complete.")
-
-
- def del_old_checkpoints(cp_dir, prefix, n_models=2):
-     pattern = os.path.join(cp_dir, prefix + '????????')
-     cp_list = glob.glob(pattern)  # get checkpoint paths
-     cp_list = sorted(cp_list)  # sort by iter
-     if len(cp_list) > n_models:  # if more than n_models models are found
-         for cp in cp_list[:-n_models]:  # delete the oldest models other than lastest n_models
-             open(cp, 'w').close()  # empty file contents
-             os.unlink(cp)  # delete file (move to trash when using Colab)
-
-
- def scan_checkpoint(cp_dir, prefix):
-     pattern = os.path.join(cp_dir, prefix + '????????')
-     cp_list = glob.glob(pattern)
-     if len(cp_list) == 0:
-         return None
-     return sorted(cp_list)[-1]
-
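Editorial note: the checkpoint helpers above follow the usual HiFi-GAN naming scheme of an eight-digit step suffix. A short sketch of resuming from the newest generator checkpoint; the directory name and the `g_` prefix are assumptions, not values defined in this repo.

```python
import torch
# assumed import path (pre-commit layout, src/tts_vits on PYTHONPATH)
from tts_vits.vdecoder.hifigan.utils import scan_checkpoint, load_checkpoint

cp_g = scan_checkpoint("checkpoints", "g_")   # newest file matching g_????????, or None
if cp_g is not None:
    state_dict = load_checkpoint(cp_g, torch.device("cpu"))
```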
 
src/tts_vits/vits_haruhi.py DELETED
@@ -1,51 +0,0 @@
- import requests
- import inference_main
- import time
- import uuid
-
- def set_model_path(path):
-     inference_main.set_model_path(path)
-
- def tts(text, spd):
-     url = f"https://fanyi.baidu.com/gettts?lan=jp&text={text}&spd={spd}&source=web"
-
-     payload = {}
-     headers = {
-         'Cookie': 'BAIDUID=543CBD0E4FB46C2FD5F44F7D81911F15:FG=1'
-     }
-
-     res = requests.request("GET", url, headers=headers, data=payload)
-     while res.content == b'':
-         res = requests.request("GET", url, headers=headers, data=payload)
-         time.sleep(0.1)
-
-
-     if res.status_code == 200:
-         return res.content
-     else:
-         return None
-
- def vits_haruhi(text, tran, spd=3):
-     voice = tts(text, spd)
-
-     if voice is None:
-         print("TTS failed")
-         return None
-     filename = f"tts_results/{str(uuid.uuid4())}.mp3"
-     with open(filename, "wb") as f:
-         f.write(voice)
-     return inference_main.infer_to("haruhi", tran, filename)
-
-
- if __name__ == "__main__":
-     inference_main.infer_tool.mkdir(["./tts_results"])
-     # set the model path
-     set_model_path("vits_models/Haruhi_54000.pth")
-     # generate speech
-     print(vits_haruhi("真実はいつもひとつ", 8))
-     print(vits_haruhi("私の青春は後悔していない", 8))
-     # vits_haruhi("またみんなで笑いたいのに君が死んだら意味が無いじゃないか!", 8)
-     # vits_haruhi("あくらあたらそこで試合終了だよ", 8)
-     # vits_haruhi("別れの味は分かりません。さようならという言葉がこんなに強いとは知りませんでした", 8)
-     # vits_haruhi("命には限りがあるからこそ、もっと大切に見える。命に限りがあるからこそ、たゆまぬ努力が必要だ", 8)
-     # vits_haruhi("なんとかなるよ!絶対大丈夫だよ", 8)
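Editorial note: the deleted `vits_haruhi` fetches a Japanese rendering from Baidu's public TTS endpoint and then re-voices it through the so-vits-svc model loaded via `set_model_path`. A minimal caller, closely mirroring the `__main__` block above; the model path and pitch value are examples only:

```python
import os
import vits_haruhi  # assumes the pre-commit src/tts_vits directory is on PYTHONPATH

os.makedirs("tts_results", exist_ok=True)                     # output dir must exist
vits_haruhi.set_model_path("vits_models/Haruhi_54000.pth")
out_path = vits_haruhi.vits_haruhi("ただいま", tran=8)         # tran is forwarded to infer_to
print(out_path)
```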
 
src/text.py โ†’ text.py RENAMED
File without changes