XiaoHei Studio committed on
Commit
8907ed4
1 Parent(s): abc4e5e

Upload 13 files

filelists/test.txt ADDED
@@ -0,0 +1,4 @@
+ ./dataset/44k/taffy/000562.wav
+ ./dataset/44k/nyaru/000011.wav
+ ./dataset/44k/nyaru/000008.wav
+ ./dataset/44k/taffy/000563.wav
filelists/train.txt ADDED
@@ -0,0 +1,566 @@
1
+ ./dataset/44k/chino_v7/chino_diff_aug_251.wav
2
+ ./dataset/44k/chino_v7/chino_reprocess_744.wav
3
+ ./dataset/44k/chino_v7/chino_diff_aug_2.wav
4
+ ./dataset/44k/chino_v7/chino_reprocess_872.wav
5
+ ./dataset/44k/chino_v7/chino_diff_aug_382_0.wav
6
+ ./dataset/44k/chino_v7/chino_diff_aug_262.wav
7
+ ./dataset/44k/chino_v7/chino_diff_aug_264.wav
8
+ ./dataset/44k/chino_v7/chino_diff_aug_268.wav
9
+ ./dataset/44k/chino_v7/chino_diff_aug_64.wav
10
+ ./dataset/44k/chino_v7/chino_diff_aug_323.wav
11
+ ./dataset/44k/chino_v7/chino_diff_aug_316.wav
12
+ ./dataset/44k/chino_v7/chino_reprocess_801.wav
13
+ ./dataset/44k/chino_v7/chino_diff_aug_29.wav
14
+ ./dataset/44k/chino_v7/chino_diff_aug_87_3.wav
15
+ ./dataset/44k/chino_v7/chino_reprocess_766.wav
16
+ ./dataset/44k/chino_v7/chino_diff_aug_91.wav
17
+ ./dataset/44k/chino_v7/chino_diff_aug_217_2.wav
18
+ ./dataset/44k/chino_v7/chino_diff_aug_88.wav
19
+ ./dataset/44k/chino_v7/chino_diff_aug_61.wav
20
+ ./dataset/44k/chino_v7/chino_reprocess_828.wav
21
+ ./dataset/44k/chino_v7/chino_diff_aug_109.wav
22
+ ./dataset/44k/chino_v7/chino_diff_aug_306.wav
23
+ ./dataset/44k/chino_v7/chino_diff_aug_40.wav
24
+ ./dataset/44k/chino_v7/chino_diff_aug_220_2.wav
25
+ ./dataset/44k/chino_v7/chino_diff_aug_310_2.wav
26
+ ./dataset/44k/chino_v7/chino_diff_aug_3.wav
27
+ ./dataset/44k/chino_v7/chino_diff_aug_293.wav
28
+ ./dataset/44k/chino_v7/chino_diff_aug_176.wav
29
+ ./dataset/44k/chino_v7/chino_diff_aug_93.wav
30
+ ./dataset/44k/chino_v7/chino_diff_aug_140_4.wav
31
+ ./dataset/44k/chino_v7/chino_diff_aug_174_0.wav
32
+ ./dataset/44k/chino_v7/chino_diff_aug_212.wav
33
+ ./dataset/44k/chino_v7/chino_diff_aug_296.wav
34
+ ./dataset/44k/chino_v7/chino_diff_aug_300.wav
35
+ ./dataset/44k/chino_v7/chino_diff_aug_193_6.wav
36
+ ./dataset/44k/chino_v7/chino_reprocess_819.wav
37
+ ./dataset/44k/chino_v7/chino_diff_aug_382_1.wav
38
+ ./dataset/44k/chino_v7/chino_diff_aug_225.wav
39
+ ./dataset/44k/chino_v7/chino_diff_aug_139_1.wav
40
+ ./dataset/44k/chino_v7/chino_diff_aug_149_2.wav
41
+ ./dataset/44k/chino_v7/chino_diff_aug_320.wav
42
+ ./dataset/44k/chino_v7/chino_reprocess_844.wav
43
+ ./dataset/44k/chino_v7/chino_diff_aug_398.wav
44
+ ./dataset/44k/chino_v7/chino_reprocess_747.wav
45
+ ./dataset/44k/chino_v7/chino_diff_aug_213_1.wav
46
+ ./dataset/44k/chino_v7/chino_diff_aug_194.wav
47
+ ./dataset/44k/chino_v7/chino_diff_aug_167_0.wav
48
+ ./dataset/44k/chino_v7/chino_diff_aug_393_0.wav
49
+ ./dataset/44k/chino_v7/chino_diff_aug_62.wav
50
+ ./dataset/44k/chino_v7/chino_diff_aug_116_4.wav
51
+ ./dataset/44k/chino_v7/chino_reprocess_875.wav
52
+ ./dataset/44k/chino_v7/chino_diff_aug_111.wav
53
+ ./dataset/44k/chino_v7/chino_reprocess_817.wav
54
+ ./dataset/44k/chino_v7/chino_diff_aug_39.wav
55
+ ./dataset/44k/chino_v7/chino_diff_aug_353_3.wav
56
+ ./dataset/44k/chino_v7/chino_diff_aug_329.wav
57
+ ./dataset/44k/chino_v7/chino_diff_aug_265.wav
58
+ ./dataset/44k/chino_v7/chino_reprocess_831.wav
59
+ ./dataset/44k/chino_v7/chino_diff_aug_42_1.wav
60
+ ./dataset/44k/chino_v7/chino_diff_aug_330.wav
61
+ ./dataset/44k/chino_v7/chino_diff_aug_207.wav
62
+ ./dataset/44k/chino_v7/chino_diff_aug_334.wav
63
+ ./dataset/44k/chino_v7/chino_diff_aug_6_0.wav
64
+ ./dataset/44k/chino_v7/chino_diff_aug_18.wav
65
+ ./dataset/44k/chino_v7/chino_diff_aug_342.wav
66
+ ./dataset/44k/chino_v7/chino_diff_aug_397.wav
67
+ ./dataset/44k/chino_v7/chino_diff_aug_164.wav
68
+ ./dataset/44k/chino_v7/chino_diff_aug_313.wav
69
+ ./dataset/44k/chino_v7/chino_diff_aug_211.wav
70
+ ./dataset/44k/chino_v7/chino_reprocess_836.wav
71
+ ./dataset/44k/chino_v7/chino_diff_aug_339.wav
72
+ ./dataset/44k/chino_v7/chino_diff_aug_157_3.wav
73
+ ./dataset/44k/chino_v7/chino_reprocess_853.wav
74
+ ./dataset/44k/chino_v7/chino_reprocess_803.wav
75
+ ./dataset/44k/chino_v7/chino_diff_aug_189.wav
76
+ ./dataset/44k/chino_v7/chino_diff_aug_248.wav
77
+ ./dataset/44k/chino_v7/chino_diff_aug_277.wav
78
+ ./dataset/44k/chino_v7/chino_reprocess_772.wav
79
+ ./dataset/44k/chino_v7/chino_reprocess_778.wav
80
+ ./dataset/44k/chino_v7/chino_diff_aug_285.wav
81
+ ./dataset/44k/chino_v7/chino_diff_aug_301.wav
82
+ ./dataset/44k/chino_v7/chino_diff_aug_113_3.wav
83
+ ./dataset/44k/chino_v7/chino_diff_aug_228.wav
84
+ ./dataset/44k/chino_v7/chino_diff_aug_68.wav
85
+ ./dataset/44k/chino_v7/chino_reprocess_770.wav
86
+ ./dataset/44k/chino_v7/chino_reprocess_730.wav
87
+ ./dataset/44k/chino_v7/chino_diff_aug_324.wav
88
+ ./dataset/44k/chino_v7/chino_diff_aug_340.wav
89
+ ./dataset/44k/chino_v7/chino_diff_aug_45.wav
90
+ ./dataset/44k/chino_v7/chino_reprocess_716.wav
91
+ ./dataset/44k/chino_v7/chino_diff_aug_19.wav
92
+ ./dataset/44k/chino_v7/chino_reprocess_724.wav
93
+ ./dataset/44k/chino_v7/chino_reprocess_882.wav
94
+ ./dataset/44k/chino_v7/chino_reprocess_809.wav
95
+ ./dataset/44k/chino_v7/chino_diff_aug_354.wav
96
+ ./dataset/44k/chino_v7/chino_diff_aug_49.wav
97
+ ./dataset/44k/chino_v7/chino_diff_aug_242.wav
98
+ ./dataset/44k/chino_v7/chino_reprocess_810.wav
99
+ ./dataset/44k/chino_v7/chino_diff_aug_96_0.wav
100
+ ./dataset/44k/chino_v7/chino_reprocess_798.wav
101
+ ./dataset/44k/chino_v7/chino_diff_aug_14.wav
102
+ ./dataset/44k/chino_v7/chino_diff_aug_372.wav
103
+ ./dataset/44k/chino_v7/chino_diff_aug_84.wav
104
+ ./dataset/44k/chino_v7/chino_diff_aug_302.wav
105
+ ./dataset/44k/chino_v7/chino_diff_aug_256.wav
106
+ ./dataset/44k/chino_v7/chino_reprocess_751.wav
107
+ ./dataset/44k/chino_v7/chino_diff_aug_104.wav
108
+ ./dataset/44k/chino_v7/chino_diff_aug_125_1.wav
109
+ ./dataset/44k/chino_v7/chino_diff_aug_315_0.wav
110
+ ./dataset/44k/chino_v7/chino_diff_aug_186_0.wav
111
+ ./dataset/44k/chino_v7/chino_diff_aug_275.wav
112
+ ./dataset/44k/chino_v7/chino_diff_aug_59.wav
113
+ ./dataset/44k/chino_v7/chino_diff_aug_382_2.wav
114
+ ./dataset/44k/chino_v7/chino_diff_aug_297.wav
115
+ ./dataset/44k/chino_v7/chino_diff_aug_254.wav
116
+ ./dataset/44k/chino_v7/chino_diff_aug_193_2.wav
117
+ ./dataset/44k/chino_v7/chino_diff_aug_233.wav
118
+ ./dataset/44k/chino_v7/chino_diff_aug_258.wav
119
+ ./dataset/44k/chino_v7/chino_diff_aug_360_0.wav
120
+ ./dataset/44k/chino_v7/chino_reprocess_779.wav
121
+ ./dataset/44k/chino_v7/chino_diff_aug_170_0.wav
122
+ ./dataset/44k/chino_v7/chino_diff_aug_193_1.wav
123
+ ./dataset/44k/chino_v7/chino_diff_aug_195.wav
124
+ ./dataset/44k/chino_v7/chino_diff_aug_244.wav
125
+ ./dataset/44k/chino_v7/chino_diff_aug_353_1.wav
126
+ ./dataset/44k/chino_v7/chino_reprocess_799.wav
127
+ ./dataset/44k/chino_v7/chino_diff_aug_393_2.wav
128
+ ./dataset/44k/chino_v7/chino_diff_aug_193_5.wav
129
+ ./dataset/44k/chino_v7/chino_diff_aug_74.wav
130
+ ./dataset/44k/chino_v7/chino_diff_aug_373.wav
131
+ ./dataset/44k/chino_v7/chino_diff_aug_116_2.wav
132
+ ./dataset/44k/chino_v7/chino_diff_aug_304.wav
133
+ ./dataset/44k/chino_v7/chino_diff_aug_114.wav
134
+ ./dataset/44k/chino_v7/chino_diff_aug_31.wav
135
+ ./dataset/44k/chino_v7/chino_diff_aug_217_1.wav
136
+ ./dataset/44k/chino_v7/chino_diff_aug_200.wav
137
+ ./dataset/44k/chino_v7/chino_diff_aug_367_2.wav
138
+ ./dataset/44k/chino_v7/chino_diff_aug_157_1.wav
139
+ ./dataset/44k/chino_v7/chino_reprocess_852.wav
140
+ ./dataset/44k/chino_v7/chino_diff_aug_379.wav
141
+ ./dataset/44k/chino_v7/chino_reprocess_726.wav
142
+ ./dataset/44k/chino_v7/chino_diff_aug_37.wav
143
+ ./dataset/44k/chino_v7/chino_diff_aug_107_0.wav
144
+ ./dataset/44k/chino_v7/chino_diff_aug_198.wav
145
+ ./dataset/44k/chino_v7/chino_reprocess_859.wav
146
+ ./dataset/44k/chino_v7/chino_reprocess_841.wav
147
+ ./dataset/44k/chino_v7/chino_reprocess_742.wav
148
+ ./dataset/44k/chino_v7/chino_diff_aug_43.wav
149
+ ./dataset/44k/chino_v7/chino_diff_aug_107_4.wav
150
+ ./dataset/44k/chino_v7/chino_reprocess_717.wav
151
+ ./dataset/44k/chino_v7/chino_diff_aug_222.wav
152
+ ./dataset/44k/chino_v7/chino_reprocess_722.wav
153
+ ./dataset/44k/chino_v7/chino_diff_aug_294.wav
154
+ ./dataset/44k/chino_v7/chino_diff_aug_351.wav
155
+ ./dataset/44k/chino_v7/chino_diff_aug_333.wav
156
+ ./dataset/44k/chino_v7/chino_diff_aug_137_0.wav
157
+ ./dataset/44k/chino_v7/chino_diff_aug_113_4.wav
158
+ ./dataset/44k/chino_v7/chino_diff_aug_184.wav
159
+ ./dataset/44k/chino_v7/chino_diff_aug_328.wav
160
+ ./dataset/44k/chino_v7/chino_reprocess_874.wav
161
+ ./dataset/44k/chino_v7/chino_diff_aug_193_0.wav
162
+ ./dataset/44k/chino_v7/chino_diff_aug_170_3.wav
163
+ ./dataset/44k/chino_v7/chino_diff_aug_249.wav
164
+ ./dataset/44k/chino_v7/chino_reprocess_752.wav
165
+ ./dataset/44k/chino_v7/chino_diff_aug_75.wav
166
+ ./dataset/44k/chino_v7/chino_diff_aug_353_0.wav
167
+ ./dataset/44k/chino_v7/chino_diff_aug_107_2.wav
168
+ ./dataset/44k/chino_v7/chino_diff_aug_377.wav
169
+ ./dataset/44k/chino_v7/chino_diff_aug_106.wav
170
+ ./dataset/44k/chino_v7/chino_diff_aug_229_2.wav
171
+ ./dataset/44k/chino_v7/chino_diff_aug_388.wav
172
+ ./dataset/44k/chino_v7/chino_diff_aug_234_0.wav
173
+ ./dataset/44k/chino_v7/chino_diff_aug_125_0.wav
174
+ ./dataset/44k/chino_v7/chino_diff_aug_282.wav
175
+ ./dataset/44k/chino_v7/chino_diff_aug_292.wav
176
+ ./dataset/44k/chino_v7/chino_diff_aug_107_1.wav
177
+ ./dataset/44k/chino_v7/chino_diff_aug_42_0.wav
178
+ ./dataset/44k/chino_v7/chino_diff_aug_202_2.wav
179
+ ./dataset/44k/chino_v7/chino_reprocess_862.wav
180
+ ./dataset/44k/chino_v7/chino_diff_aug_308.wav
181
+ ./dataset/44k/chino_v7/chino_diff_aug_400_0.wav
182
+ ./dataset/44k/chino_v7/chino_diff_aug_41.wav
183
+ ./dataset/44k/chino_v7/chino_diff_aug_325.wav
184
+ ./dataset/44k/chino_v7/chino_reprocess_788.wav
185
+ ./dataset/44k/chino_v7/chino_diff_aug_310_0.wav
186
+ ./dataset/44k/chino_v7/chino_diff_aug_226.wav
187
+ ./dataset/44k/chino_v7/chino_reprocess_795.wav
188
+ ./dataset/44k/chino_v7/chino_diff_aug_270.wav
189
+ ./dataset/44k/chino_v7/chino_diff_aug_153.wav
190
+ ./dataset/44k/chino_v7/chino_diff_aug_403_1.wav
191
+ ./dataset/44k/chino_v7/chino_reprocess_858.wav
192
+ ./dataset/44k/chino_v7/chino_diff_aug_12_0.wav
193
+ ./dataset/44k/chino_v7/chino_diff_aug_403_3.wav
194
+ ./dataset/44k/chino_v7/chino_diff_aug_130.wav
195
+ ./dataset/44k/chino_v7/chino_diff_aug_63.wav
196
+ ./dataset/44k/chino_v7/chino_diff_aug_108.wav
197
+ ./dataset/44k/chino_v7/chino_diff_aug_134.wav
198
+ ./dataset/44k/chino_v7/chino_diff_aug_276.wav
199
+ ./dataset/44k/chino_v7/chino_diff_aug_183.wav
200
+ ./dataset/44k/chino_v7/chino_diff_aug_100_0.wav
201
+ ./dataset/44k/chino_v7/chino_diff_aug_159.wav
202
+ ./dataset/44k/chino_v7/chino_diff_aug_400_2.wav
203
+ ./dataset/44k/chino_v7/chino_diff_aug_386.wav
204
+ ./dataset/44k/chino_v7/chino_reprocess_781.wav
205
+ ./dataset/44k/chino_v7/chino_reprocess_750.wav
206
+ ./dataset/44k/chino_v7/chino_diff_aug_229_1.wav
207
+ ./dataset/44k/chino_v7/chino_diff_aug_7.wav
208
+ ./dataset/44k/chino_v7/chino_reprocess_805.wav
209
+ ./dataset/44k/chino_v7/chino_diff_aug_384.wav
210
+ ./dataset/44k/chino_v7/chino_diff_aug_171.wav
211
+ ./dataset/44k/chino_v7/chino_diff_aug_220_0.wav
212
+ ./dataset/44k/chino_v7/chino_reprocess_870.wav
213
+ ./dataset/44k/chino_v7/chino_diff_aug_165.wav
214
+ ./dataset/44k/chino_v7/chino_diff_aug_115.wav
215
+ ./dataset/44k/chino_v7/chino_diff_aug_191.wav
216
+ ./dataset/44k/chino_v7/chino_reprocess_849.wav
217
+ ./dataset/44k/chino_v7/chino_diff_aug_185.wav
218
+ ./dataset/44k/chino_v7/chino_diff_aug_400_1.wav
219
+ ./dataset/44k/chino_v7/chino_diff_aug_341.wav
220
+ ./dataset/44k/chino_v7/chino_diff_aug_116_3.wav
221
+ ./dataset/44k/chino_v7/chino_diff_aug_234_2.wav
222
+ ./dataset/44k/chino_v7/chino_diff_aug_112.wav
223
+ ./dataset/44k/chino_v7/chino_diff_aug_252.wav
224
+ ./dataset/44k/chino_v7/chino_diff_aug_396.wav
225
+ ./dataset/44k/chino_v7/chino_reprocess_787.wav
226
+ ./dataset/44k/chino_v7/chino_diff_aug_202_0.wav
227
+ ./dataset/44k/chino_v7/chino_diff_aug_378.wav
228
+ ./dataset/44k/chino_v7/chino_diff_aug_219.wav
229
+ ./dataset/44k/chino_v7/chino_diff_aug_55.wav
230
+ ./dataset/44k/chino_v7/chino_diff_aug_8_2.wav
231
+ ./dataset/44k/chino_v7/chino_diff_aug_148_1.wav
232
+ ./dataset/44k/chino_v7/chino_diff_aug_11.wav
233
+ ./dataset/44k/chino_v7/chino_diff_aug_288.wav
234
+ ./dataset/44k/chino_v7/chino_reprocess_823.wav
235
+ ./dataset/44k/chino_v7/chino_diff_aug_170_2.wav
236
+ ./dataset/44k/chino_v7/chino_diff_aug_283.wav
237
+ ./dataset/44k/chino_v7/chino_reprocess_741.wav
238
+ ./dataset/44k/chino_v7/chino_diff_aug_32.wav
239
+ ./dataset/44k/chino_v7/chino_diff_aug_299.wav
240
+ ./dataset/44k/chino_v7/chino_diff_aug_161.wav
241
+ ./dataset/44k/chino_v7/chino_reprocess_736.wav
242
+ ./dataset/44k/chino_v7/chino_diff_aug_73.wav
243
+ ./dataset/44k/chino_v7/chino_diff_aug_326.wav
244
+ ./dataset/44k/chino_v7/chino_diff_aug_12_1.wav
245
+ ./dataset/44k/chino_v7/chino_reprocess_867.wav
246
+ ./dataset/44k/chino_v7/chino_diff_aug_124.wav
247
+ ./dataset/44k/chino_v7/chino_reprocess_791.wav
248
+ ./dataset/44k/chino_v7/chino_diff_aug_128.wav
249
+ ./dataset/44k/chino_v7/chino_diff_aug_214.wav
250
+ ./dataset/44k/chino_v7/chino_diff_aug_403_2.wav
251
+ ./dataset/44k/chino_v7/chino_reprocess_725.wav
252
+ ./dataset/44k/chino_v7/chino_diff_aug_118.wav
253
+ ./dataset/44k/chino_v7/chino_reprocess_790.wav
254
+ ./dataset/44k/chino_v7/chino_reprocess_825.wav
255
+ ./dataset/44k/chino_v7/chino_diff_aug_217_0.wav
256
+ ./dataset/44k/chino_v7/chino_diff_aug_54.wav
257
+ ./dataset/44k/chino_v7/chino_reprocess_780.wav
258
+ ./dataset/44k/chino_v7/chino_diff_aug_12_3.wav
259
+ ./dataset/44k/chino_v7/chino_diff_aug_266.wav
260
+ ./dataset/44k/chino_v7/chino_reprocess_762.wav
261
+ ./dataset/44k/chino_v7/chino_diff_aug_65.wav
262
+ ./dataset/44k/chino_v7/chino_diff_aug_0.wav
263
+ ./dataset/44k/chino_v7/chino_diff_aug_392.wav
264
+ ./dataset/44k/chino_v7/chino_diff_aug_139_0.wav
265
+ ./dataset/44k/chino_v7/chino_diff_aug_400_3.wav
266
+ ./dataset/44k/chino_v7/chino_diff_aug_193_3.wav
267
+ ./dataset/44k/chino_v7/chino_reprocess_814.wav
268
+ ./dataset/44k/chino_v7/chino_diff_aug_348.wav
269
+ ./dataset/44k/chino_v7/chino_diff_aug_13.wav
270
+ ./dataset/44k/chino_v7/chino_diff_aug_48.wav
271
+ ./dataset/44k/chino_v7/chino_diff_aug_52.wav
272
+ ./dataset/44k/chino_v7/chino_reprocess_784.wav
273
+ ./dataset/44k/chino_v7/chino_diff_aug_314.wav
274
+ ./dataset/44k/chino_v7/chino_reprocess_804.wav
275
+ ./dataset/44k/chino_v7/chino_diff_aug_399.wav
276
+ ./dataset/44k/chino_v7/chino_reprocess_827.wav
277
+ ./dataset/44k/chino_v7/chino_diff_aug_369_3.wav
278
+ ./dataset/44k/chino_v7/chino_diff_aug_281.wav
279
+ ./dataset/44k/chino_v7/chino_reprocess_835.wav
280
+ ./dataset/44k/chino_v7/chino_diff_aug_46.wav
281
+ ./dataset/44k/chino_v7/chino_diff_aug_144.wav
282
+ ./dataset/44k/chino_v7/chino_diff_aug_188.wav
283
+ ./dataset/44k/chino_v7/chino_diff_aug_121.wav
284
+ ./dataset/44k/chino_v7/chino_diff_aug_160.wav
285
+ ./dataset/44k/chino_v7/chino_diff_aug_205.wav
286
+ ./dataset/44k/chino_v7/chino_diff_aug_4.wav
287
+ ./dataset/44k/chino_v7/chino_diff_aug_103.wav
288
+ ./dataset/44k/chino_v7/chino_diff_aug_71.wav
289
+ ./dataset/44k/chino_v7/chino_reprocess_785.wav
290
+ ./dataset/44k/chino_v7/chino_diff_aug_113_2.wav
291
+ ./dataset/44k/chino_v7/chino_diff_aug_261.wav
292
+ ./dataset/44k/chino_v7/chino_reprocess_829.wav
293
+ ./dataset/44k/chino_v7/chino_diff_aug_289.wav
294
+ ./dataset/44k/chino_v7/chino_diff_aug_208.wav
295
+ ./dataset/44k/chino_v7/chino_diff_aug_332.wav
296
+ ./dataset/44k/chino_v7/chino_diff_aug_162.wav
297
+ ./dataset/44k/chino_v7/chino_diff_aug_143_0.wav
298
+ ./dataset/44k/chino_v7/chino_diff_aug_201.wav
299
+ ./dataset/44k/chino_v7/chino_reprocess_885.wav
300
+ ./dataset/44k/chino_v7/chino_diff_aug_369_0.wav
301
+ ./dataset/44k/chino_v7/chino_reprocess_794.wav
302
+ ./dataset/44k/chino_v7/chino_diff_aug_202_3.wav
303
+ ./dataset/44k/chino_v7/chino_reprocess_857.wav
304
+ ./dataset/44k/chino_v7/chino_diff_aug_257.wav
305
+ ./dataset/44k/chino_v7/chino_diff_aug_163_1.wav
306
+ ./dataset/44k/chino_v7/chino_diff_aug_267.wav
307
+ ./dataset/44k/chino_v7/chino_diff_aug_36_1.wav
308
+ ./dataset/44k/chino_v7/chino_diff_aug_387.wav
309
+ ./dataset/44k/chino_v7/chino_diff_aug_327.wav
310
+ ./dataset/44k/chino_v7/chino_reprocess_806.wav
311
+ ./dataset/44k/chino_v7/chino_reprocess_822.wav
312
+ ./dataset/44k/chino_v7/chino_diff_aug_238_2.wav
313
+ ./dataset/44k/chino_v7/chino_diff_aug_168.wav
314
+ ./dataset/44k/chino_v7/chino_reprocess_721.wav
315
+ ./dataset/44k/chino_v7/chino_diff_aug_138.wav
316
+ ./dataset/44k/chino_v7/chino_diff_aug_15.wav
317
+ ./dataset/44k/chino_v7/chino_diff_aug_352.wav
318
+ ./dataset/44k/chino_v7/chino_reprocess_854.wav
319
+ ./dataset/44k/chino_v7/chino_diff_aug_336.wav
320
+ ./dataset/44k/chino_v7/chino_diff_aug_38.wav
321
+ ./dataset/44k/chino_v7/chino_diff_aug_116_0.wav
322
+ ./dataset/44k/chino_v7/chino_diff_aug_53.wav
323
+ ./dataset/44k/chino_v7/chino_reprocess_887.wav
324
+ ./dataset/44k/chino_v7/chino_reprocess_846.wav
325
+ ./dataset/44k/chino_v7/chino_reprocess_776.wav
326
+ ./dataset/44k/chino_v7/chino_reprocess_848.wav
327
+ ./dataset/44k/chino_v7/chino_diff_aug_303.wav
328
+ ./dataset/44k/chino_v7/chino_diff_aug_364.wav
329
+ ./dataset/44k/chino_v7/chino_reprocess_843.wav
330
+ ./dataset/44k/chino_v7/chino_diff_aug_231_2.wav
331
+ ./dataset/44k/chino_v7/chino_diff_aug_393_1.wav
332
+ ./dataset/44k/chino_v7/chino_diff_aug_113_1.wav
333
+ ./dataset/44k/chino_v7/chino_diff_aug_382_3.wav
334
+ ./dataset/44k/chino_v7/chino_diff_aug_147.wav
335
+ ./dataset/44k/chino_v7/chino_diff_aug_140_0.wav
336
+ ./dataset/44k/chino_v7/chino_diff_aug_243.wav
337
+ ./dataset/44k/chino_v7/chino_reprocess_889.wav
338
+ ./dataset/44k/chino_v7/chino_reprocess_877.wav
339
+ ./dataset/44k/chino_v7/chino_diff_aug_158.wav
340
+ ./dataset/44k/chino_v7/chino_diff_aug_356.wav
341
+ ./dataset/44k/chino_v7/chino_diff_aug_286.wav
342
+ ./dataset/44k/chino_v7/chino_diff_aug_10.wav
343
+ ./dataset/44k/chino_v7/chino_diff_aug_360_2.wav
344
+ ./dataset/44k/chino_v7/chino_reprocess_796.wav
345
+ ./dataset/44k/chino_v7/chino_diff_aug_23.wav
346
+ ./dataset/44k/chino_v7/chino_reprocess_861.wav
347
+ ./dataset/44k/chino_v7/chino_reprocess_869.wav
348
+ ./dataset/44k/chino_v7/chino_diff_aug_202_1.wav
349
+ ./dataset/44k/chino_v7/chino_diff_aug_319.wav
350
+ ./dataset/44k/chino_v7/chino_reprocess_864.wav
351
+ ./dataset/44k/chino_v7/chino_reprocess_826.wav
352
+ ./dataset/44k/chino_v7/chino_diff_aug_175.wav
353
+ ./dataset/44k/chino_v7/chino_diff_aug_9.wav
354
+ ./dataset/44k/chino_v7/chino_diff_aug_36_0.wav
355
+ ./dataset/44k/chino_v7/chino_diff_aug_206.wav
356
+ ./dataset/44k/chino_v7/chino_reprocess_833.wav
357
+ ./dataset/44k/chino_v7/chino_diff_aug_355.wav
358
+ ./dataset/44k/chino_v7/chino_diff_aug_120.wav
359
+ ./dataset/44k/chino_v7/chino_reprocess_881.wav
360
+ ./dataset/44k/chino_v7/chino_diff_aug_215.wav
361
+ ./dataset/44k/chino_v7/chino_diff_aug_234_1.wav
362
+ ./dataset/44k/chino_v7/chino_diff_aug_129.wav
363
+ ./dataset/44k/chino_v7/chino_diff_aug_116_6.wav
364
+ ./dataset/44k/chino_v7/chino_reprocess_868.wav
365
+ ./dataset/44k/chino_v7/chino_diff_aug_167_1.wav
366
+ ./dataset/44k/chino_v7/chino_diff_aug_156.wav
367
+ ./dataset/44k/chino_v7/chino_diff_aug_298.wav
368
+ ./dataset/44k/chino_v7/chino_diff_aug_51.wav
369
+ ./dataset/44k/chino_v7/chino_reprocess_775.wav
370
+ ./dataset/44k/chino_v7/chino_reprocess_839.wav
371
+ ./dataset/44k/chino_v7/chino_diff_aug_117.wav
372
+ ./dataset/44k/chino_v7/chino_diff_aug_247.wav
373
+ ./dataset/44k/chino_v7/chino_diff_aug_123.wav
374
+ ./dataset/44k/chino_v7/chino_reprocess_731.wav
375
+ ./dataset/44k/chino_v7/chino_diff_aug_231_1.wav
376
+ ./dataset/44k/chino_v7/chino_diff_aug_170_4.wav
377
+ ./dataset/44k/chino_v7/chino_diff_aug_6_2.wav
378
+ ./dataset/44k/chino_v7/chino_diff_aug_237_2.wav
379
+ ./dataset/44k/chino_v7/chino_diff_aug_116_7.wav
380
+ ./dataset/44k/chino_v7/chino_diff_aug_146.wav
381
+ ./dataset/44k/chino_v7/chino_diff_aug_271.wav
382
+ ./dataset/44k/chino_v7/chino_diff_aug_403_0.wav
383
+ ./dataset/44k/chino_v7/chino_reprocess_842.wav
384
+ ./dataset/44k/chino_v7/chino_diff_aug_6_1.wav
385
+ ./dataset/44k/chino_v7/chino_reprocess_734.wav
386
+ ./dataset/44k/chino_v7/chino_diff_aug_273.wav
387
+ ./dataset/44k/chino_v7/chino_diff_aug_338.wav
388
+ ./dataset/44k/chino_v7/chino_diff_aug_116_1.wav
389
+ ./dataset/44k/chino_v7/chino_diff_aug_140_3.wav
390
+ ./dataset/44k/chino_v7/chino_diff_aug_337.wav
391
+ ./dataset/44k/chino_v7/chino_reprocess_837.wav
392
+ ./dataset/44k/chino_v7/chino_diff_aug_229_0.wav
393
+ ./dataset/44k/chino_v7/chino_diff_aug_139_2.wav
394
+ ./dataset/44k/chino_v7/chino_reprocess_789.wav
395
+ ./dataset/44k/chino_v7/chino_diff_aug_172.wav
396
+ ./dataset/44k/chino_v7/chino_diff_aug_148_2.wav
397
+ ./dataset/44k/chino_v7/chino_reprocess_818.wav
398
+ ./dataset/44k/chino_v7/chino_diff_aug_116_5.wav
399
+ ./dataset/44k/chino_v7/chino_diff_aug_69.wav
400
+ ./dataset/44k/chino_v7/chino_reprocess_886.wav
401
+ ./dataset/44k/chino_v7/chino_diff_aug_307.wav
402
+ ./dataset/44k/chino_v7/chino_diff_aug_284.wav
403
+ ./dataset/44k/chino_v7/chino_diff_aug_167_2.wav
404
+ ./dataset/44k/chino_v7/chino_diff_aug_371.wav
405
+ ./dataset/44k/chino_v7/chino_diff_aug_5.wav
406
+ ./dataset/44k/chino_v7/chino_reprocess_813.wav
407
+ ./dataset/44k/chino_v7/chino_diff_aug_131.wav
408
+ ./dataset/44k/chino_v7/chino_diff_aug_166.wav
409
+ ./dataset/44k/chino_v7/chino_diff_aug_154.wav
410
+ ./dataset/44k/chino_v7/chino_diff_aug_279.wav
411
+ ./dataset/44k/chino_v7/chino_diff_aug_380.wav
412
+ ./dataset/44k/chino_v7/chino_diff_aug_204.wav
413
+ ./dataset/44k/chino_v7/chino_diff_aug_174_1.wav
414
+ ./dataset/44k/chino_v7/chino_diff_aug_360_1.wav
415
+ ./dataset/44k/chino_v7/chino_reprocess_807.wav
416
+ ./dataset/44k/chino_v7/chino_diff_aug_272.wav
417
+ ./dataset/44k/chino_v7/chino_diff_aug_1_0.wav
418
+ ./dataset/44k/chino_v7/chino_diff_aug_210.wav
419
+ ./dataset/44k/chino_v7/chino_reprocess_749.wav
420
+ ./dataset/44k/chino_v7/chino_diff_aug_347.wav
421
+ ./dataset/44k/chino_v7/chino_diff_aug_349.wav
422
+ ./dataset/44k/chino_v7/chino_diff_aug_344.wav
423
+ ./dataset/44k/chino_v7/chino_reprocess_800.wav
424
+ ./dataset/44k/chino_v7/chino_reprocess_769.wav
425
+ ./dataset/44k/chino_v7/chino_reprocess_856.wav
426
+ ./dataset/44k/chino_v7/chino_reprocess_808.wav
427
+ ./dataset/44k/chino_v7/chino_diff_aug_197_2.wav
428
+ ./dataset/44k/chino_v7/chino_diff_aug_295.wav
429
+ ./dataset/44k/chino_v7/chino_diff_aug_169.wav
430
+ ./dataset/44k/chino_v7/chino_diff_aug_174_2.wav
431
+ ./dataset/44k/chino_v7/chino_reprocess_821.wav
432
+ ./dataset/44k/chino_v7/chino_diff_aug_148_0.wav
433
+ ./dataset/44k/chino_v7/chino_diff_aug_35.wav
434
+ ./dataset/44k/chino_v7/chino_reprocess_820.wav
435
+ ./dataset/44k/chino_v7/chino_diff_aug_107_6.wav
436
+ ./dataset/44k/chino_v7/chino_diff_aug_305.wav
437
+ ./dataset/44k/chino_v7/chino_reprocess_811.wav
438
+ ./dataset/44k/chino_v7/chino_diff_aug_113_6.wav
439
+ ./dataset/44k/chino_v7/chino_diff_aug_223.wav
440
+ ./dataset/44k/chino_v7/chino_diff_aug_202_4.wav
441
+ ./dataset/44k/chino_v7/chino_diff_aug_278.wav
442
+ ./dataset/44k/chino_v7/chino_diff_aug_36_2.wav
443
+ ./dataset/44k/chino_v7/chino_reprocess_834.wav
444
+ ./dataset/44k/chino_v7/chino_diff_aug_196.wav
445
+ ./dataset/44k/chino_v7/chino_reprocess_773.wav
446
+ ./dataset/44k/chino_v7/chino_reprocess_719.wav
447
+ ./dataset/44k/chino_v7/chino_diff_aug_157_0.wav
448
+ ./dataset/44k/chino_v7/chino_diff_aug_1_1.wav
449
+ ./dataset/44k/chino_v7/chino_reprocess_740.wav
450
+ ./dataset/44k/chino_v7/chino_reprocess_815.wav
451
+ ./dataset/44k/chino_v7/chino_diff_aug_180.wav
452
+ ./dataset/44k/chino_v7/chino_reprocess_865.wav
453
+ ./dataset/44k/chino_v7/chino_diff_aug_113_0.wav
454
+ ./dataset/44k/chino_v7/chino_diff_aug_179.wav
455
+ ./dataset/44k/chino_v7/chino_diff_aug_140_2.wav
456
+ ./dataset/44k/chino_v7/chino_reprocess_793.wav
457
+ ./dataset/44k/chino_v7/chino_diff_aug_383.wav
458
+ ./dataset/44k/chino_v7/chino_diff_aug_291.wav
459
+ ./dataset/44k/chino_v7/chino_diff_aug_192.wav
460
+ ./dataset/44k/chino_v7/chino_diff_aug_362.wav
461
+ ./dataset/44k/chino_v7/chino_diff_aug_30.wav
462
+ ./dataset/44k/chino_v7/chino_diff_aug_385.wav
463
+ ./dataset/44k/chino_v7/chino_diff_aug_290.wav
464
+ ./dataset/44k/chino_v7/chino_diff_aug_141.wav
465
+ ./dataset/44k/chino_v7/chino_diff_aug_107_5.wav
466
+ ./dataset/44k/chino_v7/chino_diff_aug_287.wav
467
+ ./dataset/44k/chino_v7/chino_diff_aug_33.wav
468
+ ./dataset/44k/chino_v7/chino_diff_aug_186_2.wav
469
+ ./dataset/44k/chino_v7/chino_diff_aug_366.wav
470
+ ./dataset/44k/chino_v7/chino_diff_aug_213_0.wav
471
+ ./dataset/44k/chino_v7/chino_diff_aug_259.wav
472
+ ./dataset/44k/chino_v7/chino_diff_aug_170_1.wav
473
+ ./dataset/44k/chino_v7/chino_diff_aug_47.wav
474
+ ./dataset/44k/chino_v7/chino_reprocess_855.wav
475
+ ./dataset/44k/chino_v7/chino_reprocess_863.wav
476
+ ./dataset/44k/chino_v7/chino_diff_aug_391.wav
477
+ ./dataset/44k/chino_v7/chino_diff_aug_110.wav
478
+ ./dataset/44k/chino_v7/chino_reprocess_880.wav
479
+ ./dataset/44k/chino_v7/chino_diff_aug_245.wav
480
+ ./dataset/44k/chino_v7/chino_reprocess_754.wav
481
+ ./dataset/44k/chino_v7/chino_diff_aug_209.wav
482
+ ./dataset/44k/chino_v7/chino_diff_aug_227.wav
483
+ ./dataset/44k/chino_v7/chino_diff_aug_401.wav
484
+ ./dataset/44k/chino_v7/chino_diff_aug_353_2.wav
485
+ ./dataset/44k/chino_v7/chino_diff_aug_167_3.wav
486
+ ./dataset/44k/chino_v7/chino_reprocess_850.wav
487
+ ./dataset/44k/chino_v7/chino_diff_aug_60.wav
488
+ ./dataset/44k/chino_v7/chino_diff_aug_345.wav
489
+ ./dataset/44k/chino_v7/chino_diff_aug_76.wav
490
+ ./dataset/44k/chino_v7/chino_reprocess_832.wav
491
+ ./dataset/44k/chino_v7/chino_diff_aug_155.wav
492
+ ./dataset/44k/chino_v7/chino_reprocess_735.wav
493
+ ./dataset/44k/chino_v7/chino_diff_aug_395.wav
494
+ ./dataset/44k/chino_v7/chino_diff_aug_260.wav
495
+ ./dataset/44k/chino_v7/chino_diff_aug_24.wav
496
+ ./dataset/44k/chino_v7/chino_reprocess_763.wav
497
+ ./dataset/44k/chino_v7/chino_diff_aug_16.wav
498
+ ./dataset/44k/chino_v7/chino_diff_aug_107_3.wav
499
+ ./dataset/44k/chino_v7/chino_diff_aug_232.wav
500
+ ./dataset/44k/chino_v7/chino_diff_aug_405.wav
501
+ ./dataset/44k/chino_v7/chino_reprocess_739.wav
502
+ ./dataset/44k/chino_v7/chino_diff_aug_315_2.wav
503
+ ./dataset/44k/chino_v7/chino_diff_aug_113_5.wav
504
+ ./dataset/44k/chino_v7/chino_reprocess_876.wav
505
+ ./dataset/44k/chino_v7/chino_diff_aug_343.wav
506
+ ./dataset/44k/chino_v7/chino_reprocess_764.wav
507
+ ./dataset/44k/chino_v7/chino_diff_aug_34.wav
508
+ ./dataset/44k/chino_v7/chino_diff_aug_86.wav
509
+ ./dataset/44k/chino_v7/chino_diff_aug_181.wav
510
+ ./dataset/44k/chino_v7/chino_reprocess_760.wav
511
+ ./dataset/44k/chino_v7/chino_diff_aug_197_0.wav
512
+ ./dataset/44k/chino_v7/chino_diff_aug_163_2.wav
513
+ ./dataset/44k/chino_v7/chino_diff_aug_390.wav
514
+ ./dataset/44k/chino_v7/chino_reprocess_838.wav
515
+ ./dataset/44k/chino_v7/chino_diff_aug_20.wav
516
+ ./dataset/44k/chino_v7/chino_reprocess_748.wav
517
+ ./dataset/44k/chino_v7/chino_reprocess_797.wav
518
+ ./dataset/44k/chino_v7/chino_reprocess_768.wav
519
+ ./dataset/44k/chino_v7/chino_diff_aug_216.wav
520
+ ./dataset/44k/chino_v7/chino_diff_aug_331.wav
521
+ ./dataset/44k/chino_v7/chino_reprocess_771.wav
522
+ ./dataset/44k/chino_v7/chino_reprocess_745.wav
523
+ ./dataset/44k/chino_v7/chino_diff_aug_83.wav
524
+ ./dataset/44k/chino_v7/chino_diff_aug_178.wav
525
+ ./dataset/44k/chino_v7/chino_reprocess_777.wav
526
+ ./dataset/44k/chino_v7/chino_diff_aug_310_3.wav
527
+ ./dataset/44k/chino_v7/chino_diff_aug_231_0.wav
528
+ ./dataset/44k/chino_v7/chino_diff_aug_203.wav
529
+ ./dataset/44k/chino_v7/chino_diff_aug_102.wav
530
+ ./dataset/44k/chino_v7/chino_diff_aug_368.wav
531
+ ./dataset/44k/chino_v7/chino_reprocess_783.wav
532
+ ./dataset/44k/chino_v7/chino_diff_aug_346.wav
533
+ ./dataset/44k/chino_v7/chino_diff_aug_70.wav
534
+ ./dataset/44k/chino_v7/chino_diff_aug_25.wav
535
+ ./dataset/44k/chino_v7/chino_diff_aug_389.wav
536
+ ./dataset/44k/chino_v7/chino_diff_aug_190.wav
537
+ ./dataset/44k/chino_v7/chino_reprocess_765.wav
538
+ ./dataset/44k/chino_v7/chino_diff_aug_151_0.wav
539
+ ./dataset/44k/chino_v7/chino_diff_aug_224.wav
540
+ ./dataset/44k/chino_v7/chino_diff_aug_246.wav
541
+ ./dataset/44k/chino_v7/chino_diff_aug_50.wav
542
+ ./dataset/44k/chino_v7/chino_reprocess_753.wav
543
+ ./dataset/44k/chino_v7/chino_diff_aug_56.wav
544
+ ./dataset/44k/chino_v7/chino_reprocess_873.wav
545
+ ./dataset/44k/chino_v7/chino_diff_aug_119.wav
546
+ ./dataset/44k/chino_v7/chino_reprocess_884.wav
547
+ ./dataset/44k/chino_v7/chino_diff_aug_21.wav
548
+ ./dataset/44k/chino_v7/chino_diff_aug_280.wav
549
+ ./dataset/44k/chino_v7/chino_diff_aug_218.wav
550
+ ./dataset/44k/chino_v7/chino_reprocess_883.wav
551
+ ./dataset/44k/chino_v7/chino_reprocess_782.wav
552
+ ./dataset/44k/chino_v7/chino_diff_aug_350.wav
553
+ ./dataset/44k/chino_v7/chino_reprocess_847.wav
554
+ ./dataset/44k/chino_v7/chino_diff_aug_193_4.wav
555
+ ./dataset/44k/chino_v7/chino_diff_aug_173.wav
556
+ ./dataset/44k/chino_v7/chino_diff_aug_44.wav
557
+ ./dataset/44k/chino_v7/chino_diff_aug_255.wav
558
+ ./dataset/44k/chino_v7/chino_diff_aug_376.wav
559
+ ./dataset/44k/chino_v7/chino_diff_aug_199.wav
560
+ ./dataset/44k/chino_v7/chino_diff_aug_105.wav
561
+ ./dataset/44k/chino_v7/chino_diff_aug_149_0.wav
562
+ ./dataset/44k/chino_v7/chino_diff_aug_136.wav
563
+ ./dataset/44k/chino_v7/chino_diff_aug_335.wav
564
+ ./dataset/44k/chino_v7/chino_reprocess_824.wav
565
+ ./dataset/44k/chino_v7/chino_diff_aug_1_2.wav
566
+ ./dataset/44k/chino_v7/chino_diff_aug_8_0.wav
filelists/val.txt ADDED
@@ -0,0 +1,2 @@
+ ./dataset/44k/chino_v7/chino_diff_aug_197_1.wav
+ ./dataset/44k/chino_v7/chino_diff_aug_237_3.wav
inference/__init__.py ADDED
File without changes
inference/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (159 Bytes).
inference/__pycache__/infer_tool.cpython-38.pyc ADDED
Binary file (14.8 kB).
inference/__pycache__/infer_tool_webui.cpython-38.pyc ADDED
Binary file (15.3 kB).
inference/__pycache__/slicer.cpython-38.pyc ADDED
Binary file (3.86 kB).
inference/chunks_temp.json ADDED
@@ -0,0 +1 @@
+ {"info": "temp_dict"}
inference/infer_tool.py ADDED
@@ -0,0 +1,546 @@
1
+ import gc
2
+ import hashlib
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import pickle
8
+ import time
9
+ from pathlib import Path
10
+
11
+ import librosa
12
+ import numpy as np
13
+
14
+ # import onnxruntime
15
+ import soundfile
16
+ import torch
17
+ import torchaudio
18
+
19
+ import cluster
20
+ import utils
21
+ from diffusion.unit2mel import load_model_vocoder
22
+ from inference import slicer
23
+ from models import SynthesizerTrn
24
+
25
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
26
+
27
+
28
+ def read_temp(file_name):
29
+ if not os.path.exists(file_name):
30
+ with open(file_name, "w") as f:
31
+ f.write(json.dumps({"info": "temp_dict"}))
32
+ return {}
33
+ else:
34
+ try:
35
+ with open(file_name, "r") as f:
36
+ data = f.read()
37
+ data_dict = json.loads(data)
38
+ if os.path.getsize(file_name) > 50 * 1024 * 1024:
39
+ f_name = file_name.replace("\\", "/").split("/")[-1]
40
+ print(f"clean {f_name}")
41
+ for wav_hash in list(data_dict.keys()):
42
+ if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
43
+ del data_dict[wav_hash]
44
+ except Exception as e:
45
+ print(e)
46
+ print(f"{file_name} error,auto rebuild file")
47
+ data_dict = {"info": "temp_dict"}
48
+ return data_dict
49
+
50
+
51
+ def write_temp(file_name, data):
52
+ with open(file_name, "w") as f:
53
+ f.write(json.dumps(data))
54
+
55
+
56
+ def timeit(func):
57
+ def run(*args, **kwargs):
58
+ t = time.time()
59
+ res = func(*args, **kwargs)
60
+ print('executing \'%s\' took %.3fs' % (func.__name__, time.time() - t))
61
+ return res
62
+
63
+ return run
64
+
65
+
66
+ def format_wav(audio_path):
67
+ if Path(audio_path).suffix == '.wav':
68
+ return
69
+ raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
70
+ soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
71
+
72
+
73
+ def get_end_file(dir_path, end):
74
+ file_lists = []
75
+ for root, dirs, files in os.walk(dir_path):
76
+ files = [f for f in files if f[0] != '.']
77
+ dirs[:] = [d for d in dirs if d[0] != '.']
78
+ for f_file in files:
79
+ if f_file.endswith(end):
80
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
81
+ return file_lists
82
+
83
+
84
+ def get_md5(content):
85
+ return hashlib.new("md5", content).hexdigest()
86
+
87
+ def fill_a_to_b(a, b):
88
+ if len(a) < len(b):
89
+ for _ in range(0, len(b) - len(a)):
90
+ a.append(a[0])
91
+
92
+ def mkdir(paths: list):
93
+ for path in paths:
94
+ if not os.path.exists(path):
95
+ os.mkdir(path)
96
+
97
+ def pad_array(arr, target_length):
98
+ current_length = arr.shape[0]
99
+ if current_length >= target_length:
100
+ return arr
101
+ else:
102
+ pad_width = target_length - current_length
103
+ pad_left = pad_width // 2
104
+ pad_right = pad_width - pad_left
105
+ padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
106
+ return padded_arr
107
+
108
+ def split_list_by_n(list_collection, n, pre=0):
109
+ for i in range(0, len(list_collection), n):
110
+ yield list_collection[i-pre if i-pre>=0 else i: i + n]
111
+
112
+
113
+ class F0FilterException(Exception):
114
+ pass
115
+
116
+ class Svc(object):
117
+ def __init__(self, net_g_path, config_path,
118
+ device=None,
119
+ cluster_model_path="logs/44k/kmeans_10000.pt",
120
+ nsf_hifigan_enhance = False,
121
+ diffusion_model_path="logs/44k/diffusion/model_0.pt",
122
+ diffusion_config_path="configs/diffusion.yaml",
123
+ shallow_diffusion = False,
124
+ only_diffusion = False,
125
+ spk_mix_enable = False,
126
+ feature_retrieval = False
127
+ ):
128
+ self.net_g_path = net_g_path
129
+ self.only_diffusion = only_diffusion
130
+ self.shallow_diffusion = shallow_diffusion
131
+ self.feature_retrieval = feature_retrieval
132
+ if device is None:
133
+ self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
134
+ else:
135
+ self.dev = torch.device(device)
136
+ self.net_g_ms = None
137
+ if not self.only_diffusion:
138
+ self.hps_ms = utils.get_hparams_from_file(config_path,True)
139
+ self.target_sample = self.hps_ms.data.sampling_rate
140
+ self.hop_size = self.hps_ms.data.hop_length
141
+ self.spk2id = self.hps_ms.spk
142
+ self.unit_interpolate_mode = self.hps_ms.data.unit_interpolate_mode if self.hps_ms.data.unit_interpolate_mode is not None else 'left'
143
+ self.vol_embedding = self.hps_ms.model.vol_embedding if self.hps_ms.model.vol_embedding is not None else False
144
+ self.speech_encoder = self.hps_ms.model.speech_encoder if self.hps_ms.model.speech_encoder is not None else 'vec768l12'
145
+
146
+ self.nsf_hifigan_enhance = nsf_hifigan_enhance
147
+ if self.shallow_diffusion or self.only_diffusion:
148
+ if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_config_path):
149
+ self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
150
+ if self.only_diffusion:
151
+ self.target_sample = self.diffusion_args.data.sampling_rate
152
+ self.hop_size = self.diffusion_args.data.block_size
153
+ self.spk2id = self.diffusion_args.spk
154
+ self.dtype = torch.float32
155
+ self.speech_encoder = self.diffusion_args.data.encoder
156
+ self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
157
+ if spk_mix_enable:
158
+ self.diffusion_model.init_spkmix(len(self.spk2id))
159
+ else:
160
+ print("No diffusion model or config found. Shallow diffusion mode will False")
161
+ self.shallow_diffusion = self.only_diffusion = False
162
+
163
+ # load hubert and model
164
+ if not self.only_diffusion:
165
+ self.load_model(spk_mix_enable)
166
+ self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
167
+ self.volume_extractor = utils.Volume_Extractor(self.hop_size)
168
+ else:
169
+ self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
170
+ self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
171
+
172
+ if os.path.exists(cluster_model_path):
173
+ if self.feature_retrieval:
174
+ with open(cluster_model_path,"rb") as f:
175
+ self.cluster_model = pickle.load(f)
176
+ self.big_npy = None
177
+ self.now_spk_id = -1
178
+ else:
179
+ self.cluster_model = cluster.get_cluster_model(cluster_model_path)
180
+ else:
181
+ self.feature_retrieval=False
182
+
183
+ if self.shallow_diffusion :
184
+ self.nsf_hifigan_enhance = False
185
+ if self.nsf_hifigan_enhance:
186
+ from modules.enhancer import Enhancer
187
+ self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
188
+
189
+ def load_model(self, spk_mix_enable=False):
190
+ # get model configuration
191
+ self.net_g_ms = SynthesizerTrn(
192
+ self.hps_ms.data.filter_length // 2 + 1,
193
+ self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
194
+ **self.hps_ms.model)
195
+ _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
196
+ self.dtype = list(self.net_g_ms.parameters())[0].dtype
197
+ if "half" in self.net_g_path and torch.cuda.is_available():
198
+ _ = self.net_g_ms.half().eval().to(self.dev)
199
+ else:
200
+ _ = self.net_g_ms.eval().to(self.dev)
201
+ if spk_mix_enable:
202
+ self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
203
+
204
+ def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
205
+
206
+ if not hasattr(self,"f0_predictor_object") or self.f0_predictor_object is None or f0_predictor != self.f0_predictor_object.name:
207
+ self.f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
208
+ f0, uv = self.f0_predictor_object.compute_f0_uv(wav)
209
+
210
+ if f0_filter and sum(f0) == 0:
211
+ raise F0FilterException("No voice detected")
212
+ f0 = torch.FloatTensor(f0).to(self.dev)
213
+ uv = torch.FloatTensor(uv).to(self.dev)
214
+
215
+ f0 = f0 * 2 ** (tran / 12)
216
+ f0 = f0.unsqueeze(0)
217
+ uv = uv.unsqueeze(0)
218
+
219
+ wav = torch.from_numpy(wav).to(self.dev)
220
+ if not hasattr(self,"audio16k_resample_transform"):
221
+ self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
222
+ wav16k = self.audio16k_resample_transform(wav[None,:])[0]
223
+
224
+ c = self.hubert_model.encoder(wav16k)
225
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
226
+
227
+ if cluster_infer_ratio !=0:
228
+ if self.feature_retrieval:
229
+ speaker_id = self.spk2id.get(speaker)
230
+ if not speaker_id and type(speaker) is int:
231
+ if len(self.spk2id.__dict__) >= speaker:
232
+ speaker_id = speaker
233
+ if speaker_id is None:
234
+ raise RuntimeError("The name you entered is not in the speaker list!")
235
+ feature_index = self.cluster_model[speaker_id]
236
+ feat_np = np.ascontiguousarray(c.transpose(0,1).cpu().numpy())
237
+ if self.big_npy is None or self.now_spk_id != speaker_id:
238
+ self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
239
+ self.now_spk_id = speaker_id
240
+ print("starting feature retrieval...")
241
+ score, ix = feature_index.search(feat_np, k=8)
242
+ weight = np.square(1 / score)
243
+ weight /= weight.sum(axis=1, keepdims=True)
244
+ npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
245
+ c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
246
+ c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
247
+ print("end feature retrieval...")
248
+ else:
249
+ cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
250
+ cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
251
+ c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
252
+
253
+ c = c.unsqueeze(0)
254
+ return c, f0, uv
255
+
256
+ def infer(self, speaker, tran, raw_path,
257
+ cluster_infer_ratio=0,
258
+ auto_predict_f0=False,
259
+ noice_scale=0.4,
260
+ f0_filter=False,
261
+ f0_predictor='pm',
262
+ enhancer_adaptive_key = 0,
263
+ cr_threshold = 0.05,
264
+ k_step = 100,
265
+ frame = 0,
266
+ spk_mix = False,
267
+ second_encoding = False,
268
+ loudness_envelope_adjustment = 1
269
+ ):
270
+ torchaudio.set_audio_backend("soundfile")
271
+ wav, sr = torchaudio.load(raw_path)
272
+ if not hasattr(self,"audio_resample_transform") or self.audio16k_resample_transform.orig_freq != sr:
273
+ self.audio_resample_transform = torchaudio.transforms.Resample(sr,self.target_sample)
274
+ wav = self.audio_resample_transform(wav).numpy()[0]
275
+ if spk_mix:
276
+ c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
277
+ n_frames = f0.size(1)
278
+ sid = speaker[:, frame:frame+n_frames].transpose(0,1)
279
+ else:
280
+ speaker_id = self.spk2id.get(speaker)
281
+ if not speaker_id and type(speaker) is int:
282
+ if len(self.spk2id.__dict__) >= speaker:
283
+ speaker_id = speaker
284
+ if speaker_id is None:
285
+ raise RuntimeError("The name you entered is not in the speaker list!")
286
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
287
+ c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
288
+ n_frames = f0.size(1)
289
+ c = c.to(self.dtype)
290
+ f0 = f0.to(self.dtype)
291
+ uv = uv.to(self.dtype)
292
+ with torch.no_grad():
293
+ start = time.time()
294
+ vol = None
295
+ if not self.only_diffusion:
296
+ vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
297
+ audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
298
+ audio = audio[0,0].data.float()
299
+ audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
300
+ else:
301
+ audio = torch.FloatTensor(wav).to(self.dev)
302
+ audio_mel = None
303
+ if self.dtype != torch.float32:
304
+ c = c.to(torch.float32)
305
+ f0 = f0.to(torch.float32)
306
+ uv = uv.to(torch.float32)
307
+ if self.only_diffusion or self.shallow_diffusion:
308
+ vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
309
+ if self.shallow_diffusion and second_encoding:
310
+ if not hasattr(self,"audio16k_resample_transform"):
311
+ self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
312
+ audio16k = self.audio16k_resample_transform(audio[None,:])[0]
313
+ c = self.hubert_model.encoder(audio16k)
314
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
315
+ f0 = f0[:,:,None]
316
+ c = c.transpose(-1,-2)
317
+ audio_mel = self.diffusion_model(
318
+ c,
319
+ f0,
320
+ vol,
321
+ spk_id = sid,
322
+ spk_mix_dict = None,
323
+ gt_spec=audio_mel,
324
+ infer=True,
325
+ infer_speedup=self.diffusion_args.infer.speedup,
326
+ method=self.diffusion_args.infer.method,
327
+ k_step=k_step)
328
+ audio = self.vocoder.infer(audio_mel, f0).squeeze()
329
+ if self.nsf_hifigan_enhance:
330
+ audio, _ = self.enhancer.enhance(
331
+ audio[None,:],
332
+ self.target_sample,
333
+ f0[:,:,None],
334
+ self.hps_ms.data.hop_length,
335
+ adaptive_key = enhancer_adaptive_key)
336
+ if loudness_envelope_adjustment != 1:
337
+ audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
338
+ use_time = time.time() - start
339
+ print("vits use time:{}".format(use_time))
340
+ return audio, audio.shape[-1], n_frames
341
+
342
+ def clear_empty(self):
343
+ # clean up vram
344
+ torch.cuda.empty_cache()
345
+
346
+ def unload_model(self):
347
+ # unload model
348
+ self.net_g_ms = self.net_g_ms.to("cpu")
349
+ del self.net_g_ms
350
+ if hasattr(self,"enhancer"):
351
+ self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
352
+ del self.enhancer.enhancer
353
+ del self.enhancer
354
+ gc.collect()
355
+
356
+ def slice_inference(self,
357
+ raw_audio_path,
358
+ spk,
359
+ tran,
360
+ slice_db,
361
+ cluster_infer_ratio,
362
+ auto_predict_f0,
363
+ noice_scale,
364
+ pad_seconds=0.5,
365
+ clip_seconds=0,
366
+ lg_num=0,
367
+ lgr_num =0.75,
368
+ f0_predictor='pm',
369
+ enhancer_adaptive_key = 0,
370
+ cr_threshold = 0.05,
371
+ k_step = 100,
372
+ use_spk_mix = False,
373
+ second_encoding = False,
374
+ loudness_envelope_adjustment = 1
375
+ ):
376
+ if use_spk_mix:
377
+ if len(self.spk2id) == 1:
378
+ spk = list(self.spk2id.keys())[0]
379
+ use_spk_mix = False
380
+ wav_path = Path(raw_audio_path).with_suffix('.wav')
381
+ chunks = slicer.cut(wav_path, db_thresh=slice_db)
382
+ audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
383
+ per_size = int(clip_seconds*audio_sr)
384
+ lg_size = int(lg_num*audio_sr)
385
+ lg_size_r = int(lg_size*lgr_num)
386
+ lg_size_c_l = (lg_size-lg_size_r)//2
387
+ lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
388
+ lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
389
+
390
+ if use_spk_mix:
391
+ assert len(self.spk2id) == len(spk)
392
+ audio_length = 0
393
+ for (slice_tag, data) in audio_data:
394
+ aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
395
+ if slice_tag:
396
+ audio_length += aud_length // self.hop_size
397
+ continue
398
+ if per_size != 0:
399
+ datas = split_list_by_n(data, per_size,lg_size)
400
+ else:
401
+ datas = [data]
402
+ for k,dat in enumerate(datas):
403
+ pad_len = int(audio_sr * pad_seconds)
404
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
405
+ a_length = per_length + 2 * pad_len
406
+ audio_length += a_length // self.hop_size
407
+ audio_length += len(audio_data)
408
+ spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
409
+ for i in range(len(spk)):
410
+ last_end = None
411
+ for mix in spk[i]:
412
+ if mix[3]<0. or mix[2]<0.:
413
+ raise RuntimeError("mix value must higer Than zero!")
414
+ begin = int(audio_length * mix[0])
415
+ end = int(audio_length * mix[1])
416
+ length = end - begin
417
+ if length<=0:
418
+ raise RuntimeError("begin Must lower Than end!")
419
+ step = (mix[3] - mix[2])/length
420
+ if last_end is not None:
421
+ if last_end != begin:
422
+ raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
423
+ last_end = end
424
+ if step == 0.:
425
+ spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
426
+ else:
427
+ spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
428
+ if(len(spk_mix_data)<length):
429
+ num_pad = length - len(spk_mix_data)
430
+ spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
431
+ spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
432
+
433
+ spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
434
+ # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
435
+ for i, x in enumerate(spk_mix_ten[0]):
436
+ if x == 0.0:
437
+ spk_mix_ten[0][i] = 1.0
438
+ spk_mix_tensor[:,i] = 1.0 / len(spk)
439
+ spk_mix_tensor = spk_mix_tensor / spk_mix_ten
440
+ if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
441
+ raise RuntimeError("sum(spk_mix_tensor) not equal 1")
442
+ spk = spk_mix_tensor
443
+
444
+ global_frame = 0
445
+ audio = []
446
+ for (slice_tag, data) in audio_data:
447
+ print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
448
+ # pad
449
+ length = int(np.ceil(len(data) / audio_sr * self.target_sample))
450
+ if slice_tag:
451
+ print('skipping empty segment')
452
+ _audio = np.zeros(length)
453
+ audio.extend(list(pad_array(_audio, length)))
454
+ global_frame += length // self.hop_size
455
+ continue
456
+ if per_size != 0:
457
+ datas = split_list_by_n(data, per_size,lg_size)
458
+ else:
459
+ datas = [data]
460
+ for k,dat in enumerate(datas):
461
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
462
+ if clip_seconds!=0:
463
+ print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
464
+ # pad
465
+ pad_len = int(audio_sr * pad_seconds)
466
+ dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
467
+ raw_path = io.BytesIO()
468
+ soundfile.write(raw_path, dat, audio_sr, format="wav")
469
+ raw_path.seek(0)
470
+ out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
471
+ cluster_infer_ratio=cluster_infer_ratio,
472
+ auto_predict_f0=auto_predict_f0,
473
+ noice_scale=noice_scale,
474
+ f0_predictor = f0_predictor,
475
+ enhancer_adaptive_key = enhancer_adaptive_key,
476
+ cr_threshold = cr_threshold,
477
+ k_step = k_step,
478
+ frame = global_frame,
479
+ spk_mix = use_spk_mix,
480
+ second_encoding = second_encoding,
481
+ loudness_envelope_adjustment = loudness_envelope_adjustment
482
+ )
483
+ global_frame += out_frame
484
+ _audio = out_audio.cpu().numpy()
485
+ pad_len = int(self.target_sample * pad_seconds)
486
+ _audio = _audio[pad_len:-pad_len]
487
+ _audio = pad_array(_audio, per_length)
488
+ if lg_size!=0 and k!=0:
489
+ lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
490
+ lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
491
+ lg_pre = lg1*(1-lg)+lg2*lg
492
+ audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
493
+ audio.extend(lg_pre)
494
+ _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
495
+ audio.extend(list(_audio))
496
+ return np.array(audio)
497
+
498
+ class RealTimeVC:
499
+ def __init__(self):
500
+ self.last_chunk = None
501
+ self.last_o = None
502
+ self.chunk_len = 16000 # chunk length
503
+ self.pre_len = 3840 # cross fade length, multiples of 640
504
+
505
+ # Input and output are 1-dimensional numpy waveform arrays
506
+
507
+ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
508
+ cluster_infer_ratio=0,
509
+ auto_predict_f0=False,
510
+ noice_scale=0.4,
511
+ f0_filter=False):
512
+
513
+ import maad
514
+ audio, sr = torchaudio.load(input_wav_path)
515
+ audio = audio.cpu().numpy()[0]
516
+ temp_wav = io.BytesIO()
517
+ if self.last_chunk is None:
518
+ input_wav_path.seek(0)
519
+
520
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
521
+ cluster_infer_ratio=cluster_infer_ratio,
522
+ auto_predict_f0=auto_predict_f0,
523
+ noice_scale=noice_scale,
524
+ f0_filter=f0_filter)
525
+
526
+ audio = audio.cpu().numpy()
527
+ self.last_chunk = audio[-self.pre_len:]
528
+ self.last_o = audio
529
+ return audio[-self.chunk_len:]
530
+ else:
531
+ audio = np.concatenate([self.last_chunk, audio])
532
+ soundfile.write(temp_wav, audio, sr, format="wav")
533
+ temp_wav.seek(0)
534
+
535
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
536
+ cluster_infer_ratio=cluster_infer_ratio,
537
+ auto_predict_f0=auto_predict_f0,
538
+ noice_scale=noice_scale,
539
+ f0_filter=f0_filter)
540
+
541
+ audio = audio.cpu().numpy()
542
+ ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
543
+ self.last_chunk = audio[-self.pre_len:]
544
+ self.last_o = audio
545
+ return ret[self.chunk_len:2 * self.chunk_len]
546
+
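For reference, the Svc class added above can be driven end to end roughly as follows. This is only a sketch: the checkpoint, config, and audio paths are placeholders rather than files from this commit, and "chino_v7" is assumed to be a valid speaker key because it matches the speaker folder in the uploaded filelists.

    import soundfile
    from inference.infer_tool import Svc

    # Placeholder paths; point these at a trained generator checkpoint and its config.
    model = Svc("logs/44k/G_10000.pth", "configs/config.json")

    # Convert one clip with the sliced, padded and cross-faded pipeline defined above.
    audio = model.slice_inference(
        "raw/example.wav",        # input clip (placeholder path)
        spk="chino_v7",           # speaker key, assumed to exist in the config
        tran=0,                   # pitch shift in semitones
        slice_db=-40,             # silence threshold handed to the slicer
        cluster_infer_ratio=0,
        auto_predict_f0=False,
        noice_scale=0.4,
    )
    soundfile.write("results/example_out.wav", audio, model.target_sample)

slice_inference returns a float waveform at model.target_sample, so it can be written out directly with soundfile.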
inference/infer_tool_grad.py ADDED
@@ -0,0 +1,156 @@
1
+ import io
2
+ import logging
3
+ import os
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import parselmouth
8
+ import soundfile
9
+ import torch
10
+ import torchaudio
11
+
12
+ import utils
13
+ from inference import slicer
14
+ from models import SynthesizerTrn
15
+
16
+ logging.getLogger('numba').setLevel(logging.WARNING)
17
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
18
+
19
+ def resize2d_f0(x, target_len):
20
+ source = np.array(x)
21
+ source[source < 0.001] = np.nan
22
+ target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
23
+ source)
24
+ res = np.nan_to_num(target)
25
+ return res
26
+
27
+ def get_f0(x, p_len,f0_up_key=0):
28
+
29
+ time_step = 160 / 16000 * 1000
30
+ f0_min = 50
31
+ f0_max = 1100
32
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
33
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
34
+
35
+ f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
36
+ time_step=time_step / 1000, voicing_threshold=0.6,
37
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
38
+
39
+ pad_size=(p_len - len(f0) + 1) // 2
40
+ if(pad_size>0 or p_len - len(f0) - pad_size>0):
41
+ f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
42
+
43
+ f0 *= pow(2, f0_up_key / 12)
44
+ f0_mel = 1127 * np.log(1 + f0 / 700)
45
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
46
+ f0_mel[f0_mel <= 1] = 1
47
+ f0_mel[f0_mel > 255] = 255
48
+ f0_coarse = np.rint(f0_mel).astype(int)
49
+ return f0_coarse, f0
50
+
51
+ def clean_pitch(input_pitch):
52
+ num_nan = np.sum(input_pitch == 1)
53
+ if num_nan / len(input_pitch) > 0.9:
54
+ input_pitch[input_pitch != 1] = 1
55
+ return input_pitch
56
+
57
+
58
+ def plt_pitch(input_pitch):
59
+ input_pitch = input_pitch.astype(float)
60
+ input_pitch[input_pitch == 1] = np.nan
61
+ return input_pitch
62
+
63
+
64
+ def f0_to_pitch(ff):
65
+ f0_pitch = 69 + 12 * np.log2(ff / 440)
66
+ return f0_pitch
67
+
68
+
69
+ def fill_a_to_b(a, b):
70
+ if len(a) < len(b):
71
+ for _ in range(0, len(b) - len(a)):
72
+ a.append(a[0])
73
+
74
+
75
+ def mkdir(paths: list):
76
+ for path in paths:
77
+ if not os.path.exists(path):
78
+ os.mkdir(path)
79
+
80
+
81
+ class VitsSvc(object):
82
+ def __init__(self):
83
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
84
+ self.SVCVITS = None
85
+ self.hps = None
86
+ self.speakers = None
87
+ self.hubert_soft = utils.get_hubert_model()
88
+
89
+ def set_device(self, device):
90
+ self.device = torch.device(device)
91
+ self.hubert_soft.to(self.device)
92
+ if self.SVCVITS is not None:
93
+ self.SVCVITS.to(self.device)
94
+
95
+ def loadCheckpoint(self, path):
96
+ self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
97
+ self.SVCVITS = SynthesizerTrn(
98
+ self.hps.data.filter_length // 2 + 1,
99
+ self.hps.train.segment_size // self.hps.data.hop_length,
100
+ **self.hps.model)
101
+ _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
102
+ _ = self.SVCVITS.eval().to(self.device)
103
+ self.speakers = self.hps.spk
104
+
105
+ def get_units(self, source, sr):
106
+ source = source.unsqueeze(0).to(self.device)
107
+ with torch.inference_mode():
108
+ units = self.hubert_soft.units(source)
109
+ return units
110
+
111
+
112
+ def get_unit_pitch(self, in_path, tran):
113
+ source, sr = torchaudio.load(in_path)
114
+ source = torchaudio.functional.resample(source, sr, 16000)
115
+ if len(source.shape) == 2 and source.shape[0] > 1:  # downmix multi-channel audio (channels are dim 0 after torchaudio.load)
116
+ source = torch.mean(source, dim=0).unsqueeze(0)
117
+ soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
118
+ f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
119
+ return soft, f0
120
+
121
+ def infer(self, speaker_id, tran, raw_path):
122
+ speaker_id = self.speakers[speaker_id]
123
+ sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
124
+ soft, pitch = self.get_unit_pitch(raw_path, tran)
125
+ f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
126
+ stn_tst = torch.FloatTensor(soft)
127
+ with torch.no_grad():
128
+ x_tst = stn_tst.unsqueeze(0).to(self.device)
129
+ x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
130
+ audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0][0, 0].data.float()  # infer() returns (audio, f0); take the waveform tensor
131
+ return audio, audio.shape[-1]
132
+
133
+ def inference(self,srcaudio,chara,tran,slice_db):
134
+ sampling_rate, audio = srcaudio
135
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
136
+ if len(audio.shape) > 1:
137
+ audio = librosa.to_mono(audio.transpose(1, 0))
138
+ if sampling_rate != 16000:
139
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
140
+ soundfile.write("tmpwav.wav", audio, 16000, format="wav")
141
+ chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
142
+ audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
143
+ audio = []
144
+ for (slice_tag, data) in audio_data:
145
+ length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
146
+ raw_path = io.BytesIO()
147
+ soundfile.write(raw_path, data, audio_sr, format="wav")
148
+ raw_path.seek(0)
149
+ if slice_tag:
150
+ _audio = np.zeros(length)
151
+ else:
152
+ out_audio, out_sr = self.infer(chara, tran, raw_path)
153
+ _audio = out_audio.cpu().numpy()
154
+ audio.extend(list(_audio))
155
+ audio = (np.array(audio) * 32768.0).astype('int16')
156
+ return (self.hps.data.sampling_rate,audio)
inference/infer_tool_webui.py ADDED
@@ -0,0 +1,547 @@
1
+ import gc
2
+ import hashlib
3
+ import io
4
+ import json
5
+ import logging
6
+ import os
7
+ import pickle
8
+ import time
9
+ from pathlib import Path
10
+
11
+ import librosa
12
+ import numpy as np
13
+
14
+ # import onnxruntime
15
+ import soundfile
16
+ import torch
17
+ import torchaudio
18
+ from tqdm import tqdm
19
+
20
+ import cluster
21
+ import utils
22
+ from diffusion.unit2mel import load_model_vocoder
23
+ from inference import slicer
24
+ from models import SynthesizerTrn
25
+
26
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
27
+
28
+
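+ # Read a JSON cache file, creating it if missing; once the file exceeds 50 MB, entries older than 14 days are dropped.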
29
+ def read_temp(file_name):
30
+ if not os.path.exists(file_name):
31
+ with open(file_name, "w") as f:
32
+ f.write(json.dumps({"info": "temp_dict"}))
33
+ return {}
34
+ else:
35
+ try:
36
+ with open(file_name, "r") as f:
37
+ data = f.read()
38
+ data_dict = json.loads(data)
39
+ if os.path.getsize(file_name) > 50 * 1024 * 1024:
40
+ f_name = file_name.replace("\\", "/").split("/")[-1]
41
+ print(f"clean {f_name}")
42
+ for wav_hash in list(data_dict.keys()):
43
+ if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
44
+ del data_dict[wav_hash]
45
+ except Exception as e:
46
+ print(e)
47
+ print(f"{file_name} error,auto rebuild file")
48
+ data_dict = {"info": "temp_dict"}
49
+ return data_dict
50
+
51
+
52
+ def write_temp(file_name, data):
53
+ with open(file_name, "w") as f:
54
+ f.write(json.dumps(data))
55
+
56
+
57
+ def timeit(func):
58
+ def run(*args, **kwargs):
59
+ t = time.time()
60
+ res = func(*args, **kwargs)
61
+ print('executing \'%s\' took %.3fs' % (func.__name__, time.time() - t))
62
+ return res
63
+
64
+ return run
65
+
66
+
67
+ def format_wav(audio_path):
68
+ if Path(audio_path).suffix == '.wav':
69
+ return
70
+ raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
71
+ soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
72
+
73
+
74
+ def get_end_file(dir_path, end):
75
+ file_lists = []
76
+ for root, dirs, files in os.walk(dir_path):
77
+ files = [f for f in files if f[0] != '.']
78
+ dirs[:] = [d for d in dirs if d[0] != '.']
79
+ for f_file in files:
80
+ if f_file.endswith(end):
81
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
82
+ return file_lists
83
+
84
+
85
+ def get_md5(content):
86
+ return hashlib.new("md5", content).hexdigest()
87
+
88
+ def fill_a_to_b(a, b):
89
+ if len(a) < len(b):
90
+ for _ in range(0, len(b) - len(a)):
91
+ a.append(a[0])
92
+
93
+ def mkdir(paths: list):
94
+ for path in paths:
95
+ if not os.path.exists(path):
96
+ os.mkdir(path)
97
+
98
+ def pad_array(arr, target_length):
99
+ current_length = arr.shape[0]
100
+ if current_length >= target_length:
101
+ return arr
102
+ else:
103
+ pad_width = target_length - current_length
104
+ pad_left = pad_width // 2
105
+ pad_right = pad_width - pad_left
106
+ padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
107
+ return padded_arr
108
+
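+ # Yield n-item chunks; every chunk after the first starts pre items early so neighbouring chunks overlap for crossfading.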
109
+ def split_list_by_n(list_collection, n, pre=0):
110
+ for i in range(0, len(list_collection), n):
111
+ yield list_collection[i-pre if i-pre>=0 else i: i + n]
112
+
113
+
114
+ class F0FilterException(Exception):
115
+ pass
116
+
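+ # Main inference wrapper: holds the VITS synthesizer, optional (shallow/only) diffusion model, speech encoder, f0 predictor, cluster / feature-retrieval models, and the optional NSF-HiFiGAN enhancer.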
117
+ class Svc(object):
118
+ def __init__(self, net_g_path, config_path,
119
+ device=None,
120
+ cluster_model_path="logs/44k/kmeans_10000.pt",
121
+ nsf_hifigan_enhance = False,
122
+ diffusion_model_path="logs/44k/diffusion/model_0.pt",
123
+ diffusion_config_path="configs/diffusion.yaml",
124
+ shallow_diffusion = False,
125
+ only_diffusion = False,
126
+ spk_mix_enable = False,
127
+ feature_retrieval = False
128
+ ):
129
+ self.net_g_path = net_g_path
130
+ self.only_diffusion = only_diffusion
131
+ self.shallow_diffusion = shallow_diffusion
132
+ self.feature_retrieval = feature_retrieval
133
+ if device is None:
134
+ self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
135
+ else:
136
+ self.dev = torch.device(device)
137
+ self.net_g_ms = None
138
+ if not self.only_diffusion:
139
+ self.hps_ms = utils.get_hparams_from_file(config_path,True)
140
+ self.target_sample = self.hps_ms.data.sampling_rate
141
+ self.hop_size = self.hps_ms.data.hop_length
142
+ self.spk2id = self.hps_ms.spk
143
+ self.unit_interpolate_mode = self.hps_ms.data.unit_interpolate_mode if self.hps_ms.data.unit_interpolate_mode is not None else 'left'
144
+ self.vol_embedding = self.hps_ms.model.vol_embedding if self.hps_ms.model.vol_embedding is not None else False
145
+ self.speech_encoder = self.hps_ms.model.speech_encoder if self.hps_ms.model.speech_encoder is not None else 'vec768l12'
146
+
147
+ self.nsf_hifigan_enhance = nsf_hifigan_enhance
148
+ if self.shallow_diffusion or self.only_diffusion:
149
+ if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_config_path):
150
+ self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
151
+ if self.only_diffusion:
152
+ self.target_sample = self.diffusion_args.data.sampling_rate
153
+ self.hop_size = self.diffusion_args.data.block_size
154
+ self.spk2id = self.diffusion_args.spk
155
+ self.dtype = torch.float32
156
+ self.speech_encoder = self.diffusion_args.data.encoder
157
+ self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
158
+ if spk_mix_enable:
159
+ self.diffusion_model.init_spkmix(len(self.spk2id))
160
+ else:
161
+ print("No diffusion model or config found. Shallow diffusion mode will False")
162
+ self.shallow_diffusion = self.only_diffusion = False
163
+
164
+ # load hubert and model
165
+ if not self.only_diffusion:
166
+ self.load_model(spk_mix_enable)
167
+ self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
168
+ self.volume_extractor = utils.Volume_Extractor(self.hop_size)
169
+ else:
170
+ self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
171
+ self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
172
+
173
+ if os.path.exists(cluster_model_path):
174
+ if self.feature_retrieval:
175
+ with open(cluster_model_path,"rb") as f:
176
+ self.cluster_model = pickle.load(f)
177
+ self.big_npy = None
178
+ self.now_spk_id = -1
179
+ else:
180
+ self.cluster_model = cluster.get_cluster_model(cluster_model_path)
181
+ else:
182
+ self.feature_retrieval=False
183
+
184
+ if self.shallow_diffusion :
185
+ self.nsf_hifigan_enhance = False
186
+ if self.nsf_hifigan_enhance:
187
+ from modules.enhancer import Enhancer
188
+ self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
189
+
190
+ def load_model(self, spk_mix_enable=False):
191
+ # get model configuration
192
+ self.net_g_ms = SynthesizerTrn(
193
+ self.hps_ms.data.filter_length // 2 + 1,
194
+ self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
195
+ **self.hps_ms.model)
196
+ _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
197
+ self.dtype = list(self.net_g_ms.parameters())[0].dtype
198
+ if "half" in self.net_g_path and torch.cuda.is_available():
199
+ _ = self.net_g_ms.half().eval().to(self.dev)
200
+ else:
201
+ _ = self.net_g_ms.eval().to(self.dev)
202
+ if spk_mix_enable:
203
+ self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
204
+
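+ # Compute per-frame inputs for synthesis: predicted f0/uv (with the semitone shift tran), speech-encoder features from 16 kHz audio, and optional cluster or feature-retrieval mixing controlled by cluster_infer_ratio.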
205
+ def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
206
+
207
+ if not hasattr(self,"f0_predictor_object") or self.f0_predictor_object is None or f0_predictor != self.f0_predictor_object.name:
208
+ self.f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)
209
+ f0, uv = self.f0_predictor_object.compute_f0_uv(wav)
210
+
211
+ if f0_filter and sum(f0) == 0:
212
+ raise F0FilterException("No voice detected")
213
+ f0 = torch.FloatTensor(f0).to(self.dev)
214
+ uv = torch.FloatTensor(uv).to(self.dev)
215
+
216
+ f0 = f0 * 2 ** (tran / 12)
217
+ f0 = f0.unsqueeze(0)
218
+ uv = uv.unsqueeze(0)
219
+
220
+ wav = torch.from_numpy(wav).to(self.dev)
221
+ if not hasattr(self,"audio16k_resample_transform"):
222
+ self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
223
+ wav16k = self.audio16k_resample_transform(wav[None,:])[0]
224
+
225
+ c = self.hubert_model.encoder(wav16k)
226
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
227
+
228
+ if cluster_infer_ratio !=0:
229
+ if self.feature_retrieval:
230
+ speaker_id = self.spk2id.get(speaker)
231
+ if not speaker_id and type(speaker) is int:
232
+ if len(self.spk2id.__dict__) >= speaker:
233
+ speaker_id = speaker
234
+ if speaker_id is None:
235
+ raise RuntimeError("The name you entered is not in the speaker list!")
236
+ feature_index = self.cluster_model[speaker_id]
237
+ feat_np = np.ascontiguousarray(c.transpose(0,1).cpu().numpy())
238
+ if self.big_npy is None or self.now_spk_id != speaker_id:
239
+ self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
240
+ self.now_spk_id = speaker_id
241
+ print("starting feature retrieval...")
242
+ score, ix = feature_index.search(feat_np, k=8)
243
+ weight = np.square(1 / score)
244
+ weight /= weight.sum(axis=1, keepdims=True)
245
+ npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
246
+ c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
247
+ c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
248
+ print("end feature retrieval...")
249
+ else:
250
+ cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
251
+ cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
252
+ c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
253
+
254
+ c = c.unsqueeze(0)
255
+ return c, f0, uv
256
+
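+ # Single inference pass: resample the input, compute content/f0/uv features, run the VITS decoder and/or shallow diffusion, then optional NSF-HiFiGAN enhancement and loudness-envelope matching.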
257
+ def infer(self, speaker, tran, raw_path,
258
+ cluster_infer_ratio=0,
259
+ auto_predict_f0=False,
260
+ noice_scale=0.4,
261
+ f0_filter=False,
262
+ f0_predictor='pm',
263
+ enhancer_adaptive_key = 0,
264
+ cr_threshold = 0.05,
265
+ k_step = 100,
266
+ frame = 0,
267
+ spk_mix = False,
268
+ second_encoding = False,
269
+ loudness_envelope_adjustment = 1
270
+ ):
271
+ torchaudio.set_audio_backend("soundfile")
272
+ wav, sr = torchaudio.load(raw_path)
273
+ if not hasattr(self,"audio_resample_transform") or self.audio16k_resample_transform.orig_freq != sr:
274
+ self.audio_resample_transform = torchaudio.transforms.Resample(sr,self.target_sample)
275
+ wav = self.audio_resample_transform(wav).numpy()[0]
276
+ if spk_mix:
277
+ c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
278
+ n_frames = f0.size(1)
279
+ sid = speaker[:, frame:frame+n_frames].transpose(0,1)
280
+ else:
281
+ speaker_id = self.spk2id.get(speaker)
282
+ if not speaker_id and type(speaker) is int:
283
+ if len(self.spk2id.__dict__) >= speaker:
284
+ speaker_id = speaker
285
+ if speaker_id is None:
286
+ raise RuntimeError("The name you entered is not in the speaker list!")
287
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
288
+ c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
289
+ n_frames = f0.size(1)
290
+ c = c.to(self.dtype)
291
+ f0 = f0.to(self.dtype)
292
+ uv = uv.to(self.dtype)
293
+ with torch.no_grad():
294
+ start = time.time()
295
+ vol = None
296
+ if not self.only_diffusion:
297
+ vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
298
+ audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
299
+ audio = audio[0,0].data.float()
300
+ audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
301
+ else:
302
+ audio = torch.FloatTensor(wav).to(self.dev)
303
+ audio_mel = None
304
+ if self.dtype != torch.float32:
305
+ c = c.to(torch.float32)
306
+ f0 = f0.to(torch.float32)
307
+ uv = uv.to(torch.float32)
308
+ if self.only_diffusion or self.shallow_diffusion:
309
+ vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
310
+ if self.shallow_diffusion and second_encoding:
311
+ if not hasattr(self,"audio16k_resample_transform"):
312
+ self.audio16k_resample_transform = torchaudio.transforms.Resample(self.target_sample, 16000).to(self.dev)
313
+ audio16k = self.audio16k_resample_transform(audio[None,:])[0]
314
+ c = self.hubert_model.encoder(audio16k)
315
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
316
+ f0 = f0[:,:,None]
317
+ c = c.transpose(-1,-2)
318
+ audio_mel = self.diffusion_model(
319
+ c,
320
+ f0,
321
+ vol,
322
+ spk_id = sid,
323
+ spk_mix_dict = None,
324
+ gt_spec=audio_mel,
325
+ infer=True,
326
+ infer_speedup=self.diffusion_args.infer.speedup,
327
+ method=self.diffusion_args.infer.method,
328
+ k_step=k_step)
329
+ audio = self.vocoder.infer(audio_mel, f0).squeeze()
330
+ if self.nsf_hifigan_enhance:
331
+ audio, _ = self.enhancer.enhance(
332
+ audio[None,:],
333
+ self.target_sample,
334
+ f0[:,:,None],
335
+ self.hps_ms.data.hop_length,
336
+ adaptive_key = enhancer_adaptive_key)
337
+ if loudness_envelope_adjustment != 1:
338
+ audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
339
+ use_time = time.time() - start
340
+ print("vits use time:{}".format(use_time))
341
+ return audio, audio.shape[-1], n_frames
342
+
343
+ def clear_empty(self):
344
+ # clean up vram
345
+ torch.cuda.empty_cache()
346
+
347
+ def unload_model(self):
348
+ # unload model
349
+ self.net_g_ms = self.net_g_ms.to("cpu")
350
+ del self.net_g_ms
351
+ if hasattr(self,"enhancer"):
352
+ self.enhancer.enhancer = self.enhancer.enhancer.to("cpu")
353
+ del self.enhancer.enhancer
354
+ del self.enhancer
355
+ gc.collect()
356
+
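+ # Slice the input on silence, optionally sub-split long segments (clip_seconds) with linear crossfade overlap (lg_num/lgr_num), run infer() per chunk, and concatenate the results into one waveform.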
357
+ def slice_inference(self,
358
+ raw_audio_path,
359
+ spk,
360
+ tran,
361
+ slice_db,
362
+ cluster_infer_ratio,
363
+ auto_predict_f0,
364
+ noice_scale,
365
+ pad_seconds=0.5,
366
+ clip_seconds=0,
367
+ lg_num=0,
368
+ lgr_num =0.75,
369
+ f0_predictor='pm',
370
+ enhancer_adaptive_key = 0,
371
+ cr_threshold = 0.05,
372
+ k_step = 100,
373
+ use_spk_mix = False,
374
+ second_encoding = False,
375
+ loudness_envelope_adjustment = 1
376
+ ):
377
+ if use_spk_mix:
378
+ if len(self.spk2id) == 1:
379
+ spk = list(self.spk2id.keys())[0]
380
+ use_spk_mix = False
381
+ wav_path = Path(raw_audio_path).with_suffix('.wav')
382
+ chunks = slicer.cut(wav_path, db_thresh=slice_db)
383
+ audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
384
+ per_size = int(clip_seconds*audio_sr)
385
+ lg_size = int(lg_num*audio_sr)
386
+ lg_size_r = int(lg_size*lgr_num)
387
+ lg_size_c_l = (lg_size-lg_size_r)//2
388
+ lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
389
+ lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
390
+
391
+ if use_spk_mix:
392
+ assert len(self.spk2id) == len(spk)
393
+ audio_length = 0
394
+ for (slice_tag, data) in audio_data:
395
+ aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
396
+ if slice_tag:
397
+ audio_length += aud_length // self.hop_size
398
+ continue
399
+ if per_size != 0:
400
+ datas = split_list_by_n(data, per_size,lg_size)
401
+ else:
402
+ datas = [data]
403
+ for k,dat in enumerate(datas):
404
+ pad_len = int(audio_sr * pad_seconds)
405
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
406
+ a_length = per_length + 2 * pad_len
407
+ audio_length += a_length // self.hop_size
408
+ audio_length += len(audio_data)
409
+ spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
410
+ for i in range(len(spk)):
411
+ last_end = None
412
+ for mix in spk[i]:
413
+ if mix[3]<0. or mix[2]<0.:
414
+ raise RuntimeError("mix value must higer Than zero!")
415
+ begin = int(audio_length * mix[0])
416
+ end = int(audio_length * mix[1])
417
+ length = end - begin
418
+ if length<=0:
419
+ raise RuntimeError("begin Must lower Than end!")
420
+ step = (mix[3] - mix[2])/length
421
+ if last_end is not None:
422
+ if last_end != begin:
423
+ raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
424
+ last_end = end
425
+ if step == 0.:
426
+ spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
427
+ else:
428
+ spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
429
+ if(len(spk_mix_data)<length):
430
+ num_pad = length - len(spk_mix_data)
431
+ spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
432
+ spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
433
+
434
+ spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
435
+ # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
436
+ for i, x in enumerate(spk_mix_ten[0]):
437
+ if x == 0.0:
438
+ spk_mix_ten[0][i] = 1.0
439
+ spk_mix_tensor[:,i] = 1.0 / len(spk)
440
+ spk_mix_tensor = spk_mix_tensor / spk_mix_ten
441
+ if not ((torch.sum(spk_mix_tensor,dim=0) - 1.)<0.0001).all():
442
+ raise RuntimeError("sum(spk_mix_tensor) not equal 1")
443
+ spk = spk_mix_tensor
444
+
445
+ global_frame = 0
446
+ audio = []
447
+ for (slice_tag, data) in tqdm(audio_data):
448
+ print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
449
+ # pad
450
+ length = int(np.ceil(len(data) / audio_sr * self.target_sample))
451
+ if slice_tag:
452
+ print('skipping empty segment')
453
+ _audio = np.zeros(length)
454
+ audio.extend(list(pad_array(_audio, length)))
455
+ global_frame += length // self.hop_size
456
+ continue
457
+ if per_size != 0:
458
+ datas = split_list_by_n(data, per_size,lg_size)
459
+ else:
460
+ datas = [data]
461
+ for k,dat in enumerate(datas):
462
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
463
+ if clip_seconds!=0:
464
+ print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
465
+ # pad
466
+ pad_len = int(audio_sr * pad_seconds)
467
+ dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
468
+ raw_path = io.BytesIO()
469
+ soundfile.write(raw_path, dat, audio_sr, format="wav")
470
+ raw_path.seek(0)
471
+ out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
472
+ cluster_infer_ratio=cluster_infer_ratio,
473
+ auto_predict_f0=auto_predict_f0,
474
+ noice_scale=noice_scale,
475
+ f0_predictor = f0_predictor,
476
+ enhancer_adaptive_key = enhancer_adaptive_key,
477
+ cr_threshold = cr_threshold,
478
+ k_step = k_step,
479
+ frame = global_frame,
480
+ spk_mix = use_spk_mix,
481
+ second_encoding = second_encoding,
482
+ loudness_envelope_adjustment = loudness_envelope_adjustment
483
+ )
484
+ global_frame += out_frame
485
+ _audio = out_audio.cpu().numpy()
486
+ pad_len = int(self.target_sample * pad_seconds)
487
+ _audio = _audio[pad_len:-pad_len]
488
+ _audio = pad_array(_audio, per_length)
489
+ if lg_size!=0 and k!=0:
490
+ lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
491
+ lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
492
+ lg_pre = lg1*(1-lg)+lg2*lg
493
+ audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
494
+ audio.extend(lg_pre)
495
+ _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
496
+ audio.extend(list(_audio))
497
+ return np.array(audio)
498
+
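+ # Streaming voice-conversion helper: keeps the previous input chunk and output so consecutive 16000-sample chunks can be crossfaded over pre_len samples via maad.util.crossfade.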
499
+ class RealTimeVC:
500
+ def __init__(self):
501
+ self.last_chunk = None
502
+ self.last_o = None
503
+ self.chunk_len = 16000 # chunk length
504
+ self.pre_len = 3840 # cross fade length, multiples of 640
505
+
506
+ # Input and output are 1-dimensional numpy waveform arrays
507
+
508
+ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
509
+ cluster_infer_ratio=0,
510
+ auto_predict_f0=False,
511
+ noice_scale=0.4,
512
+ f0_filter=False):
513
+
514
+ import maad
515
+ audio, sr = torchaudio.load(input_wav_path)
516
+ audio = audio.cpu().numpy()[0]
517
+ temp_wav = io.BytesIO()
518
+ if self.last_chunk is None:
519
+ input_wav_path.seek(0)
520
+
521
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
522
+ cluster_infer_ratio=cluster_infer_ratio,
523
+ auto_predict_f0=auto_predict_f0,
524
+ noice_scale=noice_scale,
525
+ f0_filter=f0_filter)
526
+
527
+ audio = audio.cpu().numpy()
528
+ self.last_chunk = audio[-self.pre_len:]
529
+ self.last_o = audio
530
+ return audio[-self.chunk_len:]
531
+ else:
532
+ audio = np.concatenate([self.last_chunk, audio])
533
+ soundfile.write(temp_wav, audio, sr, format="wav")
534
+ temp_wav.seek(0)
535
+
536
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
537
+ cluster_infer_ratio=cluster_infer_ratio,
538
+ auto_predict_f0=auto_predict_f0,
539
+ noice_scale=noice_scale,
540
+ f0_filter=f0_filter)
541
+
542
+ audio = audio.cpu().numpy()
543
+ ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
544
+ self.last_chunk = audio[-self.pre_len:]
545
+ self.last_o = audio
546
+ return ret[self.chunk_len:2 * self.chunk_len]
547
+
inference/slicer.py ADDED
@@ -0,0 +1,142 @@
1
+ import librosa
2
+ import torch
3
+ import torchaudio
4
+
5
+
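+ # RMS-based silence slicer: compares per-frame RMS against a linearized dB threshold and records split points (frame index * hop_size samples) for silent and voiced regions.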
6
+ class Slicer:
7
+ def __init__(self,
8
+ sr: int,
9
+ threshold: float = -40.,
10
+ min_length: int = 5000,
11
+ min_interval: int = 300,
12
+ hop_size: int = 20,
13
+ max_sil_kept: int = 5000):
14
+ if not min_length >= min_interval >= hop_size:
15
+ raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
16
+ if not max_sil_kept >= hop_size:
17
+ raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
18
+ min_interval = sr * min_interval / 1000
19
+ self.threshold = 10 ** (threshold / 20.)
20
+ self.hop_size = round(sr * hop_size / 1000)
21
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
22
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
23
+ self.min_interval = round(min_interval / self.hop_size)
24
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
25
+
26
+ def _apply_slice(self, waveform, begin, end):
27
+ if len(waveform.shape) > 1:
28
+ return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
29
+ else:
30
+ return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
31
+
32
+ # @timeit
33
+ def slice(self, waveform):
34
+ if len(waveform.shape) > 1:
35
+ samples = librosa.to_mono(waveform)
36
+ else:
37
+ samples = waveform
38
+ if samples.shape[0] <= self.min_length:
39
+ return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
40
+ rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
41
+ sil_tags = []
42
+ silence_start = None
43
+ clip_start = 0
44
+ for i, rms in enumerate(rms_list):
45
+ # Keep looping while frame is silent.
46
+ if rms < self.threshold:
47
+ # Record start of silent frames.
48
+ if silence_start is None:
49
+ silence_start = i
50
+ continue
51
+ # Keep looping while frame is not silent and silence start has not been recorded.
52
+ if silence_start is None:
53
+ continue
54
+ # Clear recorded silence start if interval is not enough or clip is too short
55
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
56
+ need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
57
+ if not is_leading_silence and not need_slice_middle:
58
+ silence_start = None
59
+ continue
60
+ # Need slicing. Record the range of silent frames to be removed.
61
+ if i - silence_start <= self.max_sil_kept:
62
+ pos = rms_list[silence_start: i + 1].argmin() + silence_start
63
+ if silence_start == 0:
64
+ sil_tags.append((0, pos))
65
+ else:
66
+ sil_tags.append((pos, pos))
67
+ clip_start = pos
68
+ elif i - silence_start <= self.max_sil_kept * 2:
69
+ pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
70
+ pos += i - self.max_sil_kept
71
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
72
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
73
+ if silence_start == 0:
74
+ sil_tags.append((0, pos_r))
75
+ clip_start = pos_r
76
+ else:
77
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
78
+ clip_start = max(pos_r, pos)
79
+ else:
80
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
81
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
82
+ if silence_start == 0:
83
+ sil_tags.append((0, pos_r))
84
+ else:
85
+ sil_tags.append((pos_l, pos_r))
86
+ clip_start = pos_r
87
+ silence_start = None
88
+ # Deal with trailing silence.
89
+ total_frames = rms_list.shape[0]
90
+ if silence_start is not None and total_frames - silence_start >= self.min_interval:
91
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
92
+ pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
93
+ sil_tags.append((pos, total_frames + 1))
94
+ # Apply and return slices.
95
+ if len(sil_tags) == 0:
96
+ return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
97
+ else:
98
+ chunks = []
99
+ # The first silent section does not start at the beginning, so prepend the leading voiced segment
100
+ if sil_tags[0][0]:
101
+ chunks.append(
102
+ {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
103
+ for i in range(0, len(sil_tags)):
104
+ # Mark the voiced segment between silences (skipped on the first iteration)
105
+ if i:
106
+ chunks.append({"slice": False,
107
+ "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
108
+ # Mark every silent segment
109
+ chunks.append({"slice": True,
110
+ "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
111
+ # The last silent section does not reach the end, so append the trailing voiced segment
112
+ if sil_tags[-1][1] * self.hop_size < len(waveform):
113
+ chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
114
+ chunk_dict = {}
115
+ for i in range(len(chunks)):
116
+ chunk_dict[str(i)] = chunks[i]
117
+ return chunk_dict
118
+
119
+
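+ # Convenience entry point: load a file with librosa and return its silence/voiced chunk dictionary.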
120
+ def cut(audio_path, db_thresh=-30, min_len=5000):
121
+ audio, sr = librosa.load(audio_path, sr=None)
122
+ slicer = Slicer(
123
+ sr=sr,
124
+ threshold=db_thresh,
125
+ min_length=min_len
126
+ )
127
+ chunks = slicer.slice(audio)
128
+ return chunks
129
+
130
+
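+ # Load the audio, downmix to mono, and return (is_silence, samples) pairs for each non-empty split_time range, along with the sample rate.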
131
+ def chunks2audio(audio_path, chunks):
132
+ chunks = dict(chunks)
133
+ audio, sr = torchaudio.load(audio_path)
134
+ if len(audio.shape) == 2 and audio.shape[1] >= 2:
135
+ audio = torch.mean(audio, dim=0).unsqueeze(0)
136
+ audio = audio.cpu().numpy()[0]
137
+ result = []
138
+ for k, v in chunks.items():
139
+ tag = v["split_time"].split(",")
140
+ if tag[0] != tag[1]:
141
+ result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
142
+ return result, sr