Katock committed
Commit be3fda3
1 parent: d35b7f5
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +0 -1
  2. .gitignore +382 -0
  3. LICENSE +21 -0
  4. README.md +11 -8
  5. app-slice.py +135 -0
  6. app.py +141 -0
  7. cluster/__init__.py +29 -0
  8. cluster/train_cluster.py +89 -0
  9. configs/config.json +0 -0
  10. data_utils.py +155 -0
  11. hubert/__init__.py +0 -0
  12. hubert/checkpoint_best_legacy_500.pt +3 -0
  13. hubert/hubert_model.py +222 -0
  14. hubert/hubert_model_onnx.py +217 -0
  15. inference/__init__.py +0 -0
  16. inference/chunks_temp.json +1 -0
  17. inference/infer_tool.py +324 -0
  18. inference/infer_tool_grad.py +160 -0
  19. inference/slicer.py +142 -0
  20. inference_main.py +130 -0
  21. models.py +420 -0
  22. models/arthur/arthur.pth +3 -0
  23. models/arthur/config_arthur.json +93 -0
  24. models/carl/carl.pth +3 -0
  25. models/carl/config_carl.json +93 -0
  26. models/cesar/cesar.pth +3 -0
  27. models/cesar/config_cesar.json +99 -0
  28. models/katalina/config_katalina.json +99 -0
  29. models/katalina/katalina.pth +3 -0
  30. models/kendl/config_kendl.json +99 -0
  31. models/kendl/kendl.pth +3 -0
  32. models/ogloc/config_ogloc.json +99 -0
  33. models/ogloc/kmeans_ogloc.pt +3 -0
  34. models/ogloc/ogloc.pth +3 -0
  35. models/pulaski/config_pulaski.json +99 -0
  36. models/pulaski/pulaski.pth +3 -0
  37. models/ryder/config_ryder.json +99 -0
  38. models/ryder/ryder.pth +3 -0
  39. models/smoke/config_smoke.json +99 -0
  40. models/smoke/smoke.pth +3 -0
  41. models/sweet/config_sweet.json +99 -0
  42. models/sweet/sweet.pth +3 -0
  43. models/tenpenny/config_tenpenny.json +99 -0
  44. models/tenpenny/tenpenny.pth +3 -0
  45. models/tommy/config_tommy.json +99 -0
  46. models/tommy/tommy.pth +3 -0
  47. models/tomori/config_tomori.json +99 -0
  48. models/tomori/tomori.pth +3 -0
  49. models/tomori/tomori_index.pkl +3 -0
  50. models/torino/config_torino.json +99 -0
.gitattributes CHANGED
@@ -25,7 +25,6 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,382 @@
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.rsuser
8
+ *.suo
9
+ *.user
10
+ *.userosscache
11
+ *.sln.docstates
12
+
13
+ # User-specific files (MonoDevelop/Xamarin Studio)
14
+ *.userprefs
15
+
16
+ # Mono auto generated files
17
+ mono_crash.*
18
+
19
+ # Build results
20
+ [Dd]ebug/
21
+ [Dd]ebugPublic/
22
+ [Rr]elease/
23
+ [Rr]eleases/
24
+ x64/
25
+ x86/
26
+ [Ww][Ii][Nn]32/
27
+ [Aa][Rr][Mm]/
28
+ [Aa][Rr][Mm]64/
29
+ bld/
30
+ [Bb]in/
31
+ [Oo]bj/
32
+ [Oo]ut/
33
+ [Ll]og/
34
+ [Ll]ogs/
35
+
36
+ # Visual Studio 2015/2017 cache/options directory
37
+ .vs/
38
+ # Uncomment if you have tasks that create the project's static files in wwwroot
39
+ #wwwroot/
40
+
41
+ # Visual Studio 2017 auto generated files
42
+ Generated\ Files/
43
+
44
+ # MSTest test Results
45
+ [Tt]est[Rr]esult*/
46
+ [Bb]uild[Ll]og.*
47
+
48
+ # NUnit
49
+ *.VisualState.xml
50
+ TestResult.xml
51
+ nunit-*.xml
52
+
53
+ # Build Results of an ATL Project
54
+ [Dd]ebugPS/
55
+ [Rr]eleasePS/
56
+ dlldata.c
57
+
58
+ # Benchmark Results
59
+ BenchmarkDotNet.Artifacts/
60
+
61
+ # .NET Core
62
+ project.lock.json
63
+ project.fragment.lock.json
64
+ artifacts/
65
+
66
+ # ASP.NET Scaffolding
67
+ ScaffoldingReadMe.txt
68
+
69
+ # StyleCop
70
+ StyleCopReport.xml
71
+
72
+ # Files built by Visual Studio
73
+ *_i.c
74
+ *_p.c
75
+ *_h.h
76
+ *.ilk
77
+ *.meta
78
+ *.obj
79
+ *.iobj
80
+ *.pch
81
+ *.pdb
82
+ *.ipdb
83
+ *.pgc
84
+ *.pgd
85
+ *.rsp
86
+ *.sbr
87
+ *.tlb
88
+ *.tli
89
+ *.tlh
90
+ *.tmp
91
+ *.tmp_proj
92
+ *_wpftmp.csproj
93
+ *.log
94
+ *.vspscc
95
+ *.vssscc
96
+ .builds
97
+ *.pidb
98
+ *.svclog
99
+ *.scc
100
+
101
+ # Chutzpah Test files
102
+ _Chutzpah*
103
+
104
+ # Visual C++ cache files
105
+ ipch/
106
+ *.aps
107
+ *.ncb
108
+ *.opendb
109
+ *.opensdf
110
+ *.sdf
111
+ *.cachefile
112
+ *.VC.db
113
+ *.VC.VC.opendb
114
+
115
+ # Visual Studio profiler
116
+ *.psess
117
+ *.vsp
118
+ *.vspx
119
+ *.sap
120
+
121
+ # Visual Studio Trace Files
122
+ *.e2e
123
+
124
+ # TFS 2012 Local Workspace
125
+ $tf/
126
+
127
+ # Guidance Automation Toolkit
128
+ *.gpState
129
+
130
+ # ReSharper is a .NET coding add-in
131
+ _ReSharper*/
132
+ *.[Rr]e[Ss]harper
133
+ *.DotSettings.user
134
+
135
+ # TeamCity is a build add-in
136
+ _TeamCity*
137
+
138
+ # DotCover is a Code Coverage Tool
139
+ *.dotCover
140
+
141
+ # AxoCover is a Code Coverage Tool
142
+ .axoCover/*
143
+ !.axoCover/settings.json
144
+
145
+ # Coverlet is a free, cross platform Code Coverage Tool
146
+ coverage*.json
147
+ coverage*.xml
148
+ coverage*.info
149
+
150
+ # Visual Studio code coverage results
151
+ *.coverage
152
+ *.coveragexml
153
+
154
+ # NCrunch
155
+ _NCrunch_*
156
+ .*crunch*.local.xml
157
+ nCrunchTemp_*
158
+
159
+ # MightyMoose
160
+ *.mm.*
161
+ AutoTest.Net/
162
+
163
+ # Web workbench (sass)
164
+ .sass-cache/
165
+
166
+ # Installshield output folder
167
+ [Ee]xpress/
168
+
169
+ # DocProject is a documentation generator add-in
170
+ DocProject/buildhelp/
171
+ DocProject/Help/*.HxT
172
+ DocProject/Help/*.HxC
173
+ DocProject/Help/*.hhc
174
+ DocProject/Help/*.hhk
175
+ DocProject/Help/*.hhp
176
+ DocProject/Help/Html2
177
+ DocProject/Help/html
178
+
179
+ # Click-Once directory
180
+ publish/
181
+
182
+ # Publish Web Output
183
+ *.[Pp]ublish.xml
184
+ *.azurePubxml
185
+ # Note: Comment the next line if you want to checkin your web deploy settings,
186
+ # but database connection strings (with potential passwords) will be unencrypted
187
+ *.pubxml
188
+ *.publishproj
189
+
190
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
191
+ # checkin your Azure Web App publish settings, but sensitive information contained
192
+ # in these scripts will be unencrypted
193
+ PublishScripts/
194
+
195
+ # NuGet Packages
196
+ *.nupkg
197
+ # NuGet Symbol Packages
198
+ *.snupkg
199
+ # The packages folder can be ignored because of Package Restore
200
+ **/[Pp]ackages/*
201
+ # except build/, which is used as an MSBuild target.
202
+ !**/[Pp]ackages/build/
203
+ # Uncomment if necessary however generally it will be regenerated when needed
204
+ #!**/[Pp]ackages/repositories.config
205
+ # NuGet v3's project.json files produces more ignorable files
206
+ *.nuget.props
207
+ *.nuget.targets
208
+
209
+ # Microsoft Azure Build Output
210
+ csx/
211
+ *.build.csdef
212
+
213
+ # Microsoft Azure Emulator
214
+ ecf/
215
+ rcf/
216
+
217
+ # Windows Store app package directories and files
218
+ AppPackages/
219
+ BundleArtifacts/
220
+ Package.StoreAssociation.xml
221
+ _pkginfo.txt
222
+ *.appx
223
+ *.appxbundle
224
+ *.appxupload
225
+
226
+ # Visual Studio cache files
227
+ # files ending in .cache can be ignored
228
+ *.[Cc]ache
229
+ # but keep track of directories ending in .cache
230
+ !?*.[Cc]ache/
231
+
232
+ # Others
233
+ ClientBin/
234
+ ~$*
235
+ *~
236
+ *.dbmdl
237
+ *.dbproj.schemaview
238
+ *.jfm
239
+ *.pfx
240
+ *.publishsettings
241
+ orleans.codegen.cs
242
+
243
+ # Including strong name files can present a security risk
244
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245
+ #*.snk
246
+
247
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
248
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249
+ #bower_components/
250
+
251
+ # RIA/Silverlight projects
252
+ Generated_Code/
253
+
254
+ # Backup & report files from converting an old project file
255
+ # to a newer Visual Studio version. Backup files are not needed,
256
+ # because we have git ;-)
257
+ _UpgradeReport_Files/
258
+ Backup*/
259
+ UpgradeLog*.XML
260
+ UpgradeLog*.htm
261
+ ServiceFabricBackup/
262
+ *.rptproj.bak
263
+
264
+ # SQL Server files
265
+ *.mdf
266
+ *.ldf
267
+ *.ndf
268
+
269
+ # Business Intelligence projects
270
+ *.rdl.data
271
+ *.bim.layout
272
+ *.bim_*.settings
273
+ *.rptproj.rsuser
274
+ *- [Bb]ackup.rdl
275
+ *- [Bb]ackup ([0-9]).rdl
276
+ *- [Bb]ackup ([0-9][0-9]).rdl
277
+
278
+ # Microsoft Fakes
279
+ FakesAssemblies/
280
+
281
+ # GhostDoc plugin setting file
282
+ *.GhostDoc.xml
283
+
284
+ # Node.js Tools for Visual Studio
285
+ .ntvs_analysis.dat
286
+ node_modules/
287
+
288
+ # Visual Studio 6 build log
289
+ *.plg
290
+
291
+ # Visual Studio 6 workspace options file
292
+ *.opt
293
+
294
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295
+ *.vbw
296
+
297
+ # Visual Studio LightSwitch build output
298
+ **/*.HTMLClient/GeneratedArtifacts
299
+ **/*.DesktopClient/GeneratedArtifacts
300
+ **/*.DesktopClient/ModelManifest.xml
301
+ **/*.Server/GeneratedArtifacts
302
+ **/*.Server/ModelManifest.xml
303
+ _Pvt_Extensions
304
+
305
+ # Paket dependency manager
306
+ .paket/paket.exe
307
+ paket-files/
308
+
309
+ # FAKE - F# Make
310
+ .fake/
311
+
312
+ # CodeRush personal settings
313
+ .cr/personal
314
+
315
+ # Python Tools for Visual Studio (PTVS)
316
+ __pycache__/
317
+
318
+
319
+ # Cake - Uncomment if you are using it
320
+ # tools/**
321
+ # !tools/packages.config
322
+
323
+ # Tabs Studio
324
+ *.tss
325
+
326
+ # Telerik's JustMock configuration file
327
+ *.jmconfig
328
+
329
+ # BizTalk build output
330
+ *.btp.cs
331
+ *.btm.cs
332
+ *.odx.cs
333
+ *.xsd.cs
334
+
335
+ # OpenCover UI analysis results
336
+ OpenCover/
337
+
338
+ # Azure Stream Analytics local run output
339
+ ASALocalRun/
340
+
341
+ # MSBuild Binary and Structured Log
342
+ *.binlog
343
+
344
+ # NVidia Nsight GPU debugger configuration file
345
+ *.nvuser
346
+
347
+ # MFractors (Xamarin productivity tool) working folder
348
+ .mfractor/
349
+
350
+ # Local History for Visual Studio
351
+ .localhistory/
352
+
353
+ # BeatPulse healthcheck temp database
354
+ healthchecksdb
355
+
356
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
357
+ MigrationBackup/
358
+
359
+ # Ionide (cross platform F# VS Code tools) working folder
360
+ .ionide/
361
+
362
+ # Fody - auto-generated XML schema
363
+ FodyWeavers.xsd
364
+
365
+ # build
366
+ build
367
+ monotonic_align/core.c
368
+ *.o
369
+ *.so
370
+ *.dll
371
+
372
+ # data
373
+ /config.json
374
+ /*.pth
375
+ *.wav
376
+ /monotonic_align/monotonic_align
377
+ /resources
378
+ /MoeGoe.spec
379
+ /dist/MoeGoe
380
+ /dist
381
+
382
+ .idea
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Jingyi Li
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,16 @@
 ---
-title: GTA SOVITS
-emoji: 🐨
-colorFrom: purple
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.25.0
+title: Sovits Models
+emoji: 🎙️
+colorFrom: gray
+colorTo: pink
+sdk: gradio
+sdk_version: 3.18.0
 app_file: app.py
 pinned: false
-license: openrail
+license: mit
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+I am just an amateur; the inference UI design is based on an expert's work.
+
+**But if you use the models hosted here, please be sure to credit the source (that is, me: B站Cyber蝈蝈总); it is my only request.**
+
app-slice.py ADDED
@@ -0,0 +1,135 @@
1
+ import os
2
+ import gradio as gr
3
+ import edge_tts
4
+ from pathlib import Path
5
+ import inference.infer_tool as infer_tool
6
+ import utils
7
+ from inference.infer_tool import Svc
8
+ import logging
9
+ import webbrowser
10
+ import argparse
11
+ import asyncio
12
+ import librosa
13
+ import soundfile
14
+ import gradio.processing_utils as gr_processing_utils
15
+ logging.getLogger('numba').setLevel(logging.WARNING)
16
+ logging.getLogger('markdown_it').setLevel(logging.WARNING)
17
+ logging.getLogger('urllib3').setLevel(logging.WARNING)
18
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
19
+
20
+ limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
21
+
22
+ audio_postprocess_ori = gr.Audio.postprocess
23
+
24
+ def audio_postprocess(self, y):
25
+ data = audio_postprocess_ori(self, y)
26
+ if data is None:
27
+ return None
28
+ return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
29
+
30
+
31
+ gr.Audio.postprocess = audio_postprocess
32
+ def create_vc_fn(model, sid):
33
+ def vc_fn(input_audio, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode):
34
+ if tts_mode:
35
+ if len(tts_text) > 100 and limitation:
36
+ return "Text is too long", None
37
+ if tts_text is None or tts_voice is None:
38
+ return "You need to enter text and select a voice", None
39
+ asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
40
+ audio, sr = librosa.load("tts.mp3")
41
+ soundfile.write("tts.wav", audio, 24000, format="wav")
42
+ wav_path = "tts.wav"
43
+ else:
44
+ if input_audio is None:
45
+ return "You need to select an audio", None
46
+ raw_audio_path = f"raw/{input_audio}"
47
+ if "." not in raw_audio_path:
48
+ raw_audio_path += ".wav"
49
+ infer_tool.format_wav(raw_audio_path)
50
+ wav_path = Path(raw_audio_path).with_suffix('.wav')
51
+ _audio = model.slice_inference(
52
+ wav_path, sid, vc_transform, slice_db,
53
+ cluster_infer_ratio=0,
54
+ auto_predict_f0=auto_f0,
55
+ noice_scale=noise_scale,
56
+ pad_seconds=pad_seconds)
57
+ model.clear_empty()
58
+ return "Success", (44100, _audio)
59
+ return vc_fn
60
+
61
+ def refresh_raw_wav():
62
+ return gr.Dropdown.update(choices=os.listdir("raw"))
63
+
64
+ def change_to_tts_mode(tts_mode):
65
+ if tts_mode:
66
+ return gr.Audio.update(visible=False), gr.Button.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True)
67
+ else:
68
+ return gr.Audio.update(visible=True), gr.Button.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False)
69
+
70
+ if __name__ == '__main__':
71
+ parser = argparse.ArgumentParser()
72
+ parser.add_argument('--device', type=str, default='cpu')
73
+ parser.add_argument('--api', action="store_true", default=False)
74
+ parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
75
+ parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
76
+ args = parser.parse_args()
77
+ hubert_model = utils.get_hubert_model().to(args.device)
78
+ models = []
79
+ voices = []
80
+ tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
81
+ for r in tts_voice_list:
82
+ voices.append(f"{r['ShortName']}-{r['Gender']}")
83
+ raw = os.listdir("raw")
84
+ for f in os.listdir("models"):
85
+ name = f
86
+ model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config.json", device=args.device)
87
+ cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
88
+ models.append((name, cover, create_vc_fn(model, name)))
89
+ with gr.Blocks() as app:
90
+ gr.Markdown(
91
+ "# <center> Sovits Models\n"
92
+ "## <center> The input audio should be clean and pure voice without background music.\n"
93
+ "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
94
+ "[Open In Colab](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)"
95
+ " without queue and length limitation.\n\n"
96
+ "[Original Repo](https://github.com/svc-develop-team/so-vits-svc)\n\n"
97
+ "Other models:\n"
98
+ "[rudolf](https://huggingface.co/spaces/sayashi/sovits-rudolf)\n"
99
+ "[teio](https://huggingface.co/spaces/sayashi/sovits-teio)\n"
100
+ "[goldship](https://huggingface.co/spaces/sayashi/sovits-goldship)\n"
101
+ "[tannhauser](https://huggingface.co/spaces/sayashi/sovits-tannhauser)\n"
102
+
103
+ )
104
+ with gr.Tabs():
105
+ for (name, cover, vc_fn) in models:
106
+ with gr.TabItem(name):
107
+ with gr.Row():
108
+ gr.Markdown(
109
+ '<div align="center">'
110
+ f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else ""
111
+ '</div>'
112
+ )
113
+ with gr.Row():
114
+ with gr.Column():
115
+ with gr.Row():
116
+ vc_input = gr.Dropdown(label="Input audio", choices=raw)
117
+ vc_refresh = gr.Button("🔁", variant="primary")
118
+ vc_transform = gr.Number(label="vc_transform", value=0)
119
+ slice_db = gr.Number(label="slice_db", value=-40)
120
+ noise_scale = gr.Number(label="noise_scale", value=0.4)
121
+ pad_seconds = gr.Number(label="pad_seconds", value=0.5)
122
+ auto_f0 = gr.Checkbox(label="auto_f0", value=False)
123
+ tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
124
+ tts_text = gr.Textbox(visible=False,label="TTS text (100 words limitation)" if limitation else "TTS text")
125
+ tts_voice = gr.Dropdown(choices=voices, visible=False)
126
+ vc_submit = gr.Button("Generate", variant="primary")
127
+ with gr.Column():
128
+ vc_output1 = gr.Textbox(label="Output Message")
129
+ vc_output2 = gr.Audio(label="Output Audio")
130
+ vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, slice_db, noise_scale, pad_seconds, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
131
+ vc_refresh.click(refresh_raw_wav, [], [vc_input])
132
+ tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, vc_refresh, tts_text, tts_voice])
133
+ if args.colab:
134
+ webbrowser.open("http://127.0.0.1:7860")
135
+ app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
app.py ADDED
@@ -0,0 +1,141 @@
1
+ import os
2
+ import io
3
+ import gradio as gr
4
+ import librosa
5
+ import numpy as np
6
+ import utils
7
+ from inference.infer_tool import Svc
8
+ import logging
9
+ import soundfile
10
+ import asyncio
11
+ import argparse
12
+ import edge_tts
13
+ import gradio.processing_utils as gr_processing_utils
14
+ logging.getLogger('numba').setLevel(logging.WARNING)
15
+ logging.getLogger('markdown_it').setLevel(logging.WARNING)
16
+ logging.getLogger('urllib3').setLevel(logging.WARNING)
17
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
18
+
19
+ limitation = os.getenv("SYSTEM") == "spaces" # limit audio length in huggingface spaces
20
+
21
+ audio_postprocess_ori = gr.Audio.postprocess
22
+
23
+ def audio_postprocess(self, y):
24
+ data = audio_postprocess_ori(self, y)
25
+ if data is None:
26
+ return None
27
+ return gr_processing_utils.encode_url_or_file_to_base64(data["name"])
28
+
29
+
30
+ gr.Audio.postprocess = audio_postprocess
31
+ def create_vc_fn(model, sid):
32
+ def vc_fn(input_audio, vc_transform, auto_f0, tts_text, tts_voice, tts_mode):
33
+ if tts_mode:
34
+ if len(tts_text) > 100 and limitation:
35
+ return "Text is too long", None
36
+ if tts_text is None or tts_voice is None:
37
+ return "You need to enter text and select a voice", None
38
+ asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save("tts.mp3"))
39
+ audio, sr = librosa.load("tts.mp3", sr=16000, mono=True)
40
+ raw_path = io.BytesIO()
41
+ soundfile.write(raw_path, audio, 16000, format="wav")
42
+ raw_path.seek(0)
43
+ out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
44
+ auto_predict_f0=auto_f0,
45
+ )
46
+ return "Success", (44100, out_audio.cpu().numpy())
47
+ if input_audio is None:
48
+ return "You need to upload an audio", None
49
+ sampling_rate, audio = input_audio
50
+ duration = audio.shape[0] / sampling_rate
51
+ if duration > 20 and limitation:
52
+ return "Please upload an audio file that is less than 20 seconds. If you need to generate a longer audio file, please use Colab.", None
53
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
54
+ if len(audio.shape) > 1:
55
+ audio = librosa.to_mono(audio.transpose(1, 0))
56
+ if sampling_rate != 16000:
57
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
58
+ raw_path = io.BytesIO()
59
+ soundfile.write(raw_path, audio, 16000, format="wav")
60
+ raw_path.seek(0)
61
+ out_audio, out_sr = model.infer(sid, vc_transform, raw_path,
62
+ auto_predict_f0=auto_f0,
63
+ )
64
+ return "Success", (44100, out_audio.cpu().numpy())
65
+ return vc_fn
66
+
67
+ def change_to_tts_mode(tts_mode):
68
+ if tts_mode:
69
+ return gr.Audio.update(visible=False), gr.Textbox.update(visible=True), gr.Dropdown.update(visible=True), gr.Checkbox.update(value=True)
70
+ else:
71
+ return gr.Audio.update(visible=True), gr.Textbox.update(visible=False), gr.Dropdown.update(visible=False), gr.Checkbox.update(value=False)
72
+
73
+ if __name__ == '__main__':
74
+ parser = argparse.ArgumentParser()
75
+ parser.add_argument('--device', type=str, default='cpu')
76
+ parser.add_argument('--api', action="store_true", default=False)
77
+ parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
78
+ args = parser.parse_args()
79
+ hubert_model = utils.get_hubert_model().to(args.device)
80
+ models = []
81
+ others = {  # external model links disabled; kept as an empty dict so the "others" tab loop below is a no-op
82
+ # "rudolf": "https://huggingface.co/spaces/sayashi/sovits-rudolf",
83
+ # "teio": "https://huggingface.co/spaces/sayashi/sovits-teio",
84
+ # "goldship": "https://huggingface.co/spaces/sayashi/sovits-goldship",
85
+ # "tannhauser": "https://huggingface.co/spaces/sayashi/sovits-tannhauser"
86
+ }
87
+ voices = []
88
+ tts_voice_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())
89
+ for r in tts_voice_list:
90
+ voices.append(f"{r['ShortName']}-{r['Gender']}")
91
+ for f in os.listdir("models"):
92
+ name = f
93
+ model = Svc(fr"models/{f}/{f}.pth", f"models/{f}/config_{f}.json", device=args.device)
94
+ cover = f"models/{f}/cover.png" if os.path.exists(f"models/{f}/cover.png") else None
95
+ models.append((name, cover, create_vc_fn(model, name)))
96
+ with gr.Blocks() as app:
97
+ gr.Markdown(
98
+ "# <center> Sovits Models\n"
99
+ "## <center> The input audio should be clean and pure voice without background music.\n"
100
+ "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=sayashi.Sovits-Umamusume)\n\n"
101
+ "[![image](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wfsBbMzmtLflOJeqc5ZnJiLY7L239hJW?usp=share_link)\n\n"
102
+ "[![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm-dark.svg)](https://huggingface.co/spaces/sayashi/sovits-models?duplicate=true)\n\n"
103
+ "[![Original Repo](https://badgen.net/badge/icon/github?icon=github&label=Original%20Repo)](https://github.com/svc-develop-team/so-vits-svc)"
104
+
105
+ )
106
+ with gr.Tabs():
107
+ for (name, cover, vc_fn) in models:
108
+ with gr.TabItem(name):
109
+ with gr.Row():
110
+ gr.Markdown(
111
+ '<div align="center">'
112
+ f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else ""
113
+ '</div>'
114
+ )
115
+ with gr.Row():
116
+ with gr.Column():
117
+ vc_input = gr.Audio(label="Input audio"+' (less than 20 seconds)' if limitation else '')
118
+ vc_transform = gr.Number(label="vc_transform", value=0)
119
+ auto_f0 = gr.Checkbox(label="auto_f0", value=False)
120
+ tts_mode = gr.Checkbox(label="tts (use edge-tts as input)", value=False)
121
+ tts_text = gr.Textbox(visible=False, label="TTS text (100 words limitation)" if limitation else "TTS text")
122
+ tts_voice = gr.Dropdown(choices=voices, visible=False)
123
+ vc_submit = gr.Button("Generate", variant="primary")
124
+ with gr.Column():
125
+ vc_output1 = gr.Textbox(label="Output Message")
126
+ vc_output2 = gr.Audio(label="Output Audio")
127
+ vc_submit.click(vc_fn, [vc_input, vc_transform, auto_f0, tts_text, tts_voice, tts_mode], [vc_output1, vc_output2])
128
+ tts_mode.change(change_to_tts_mode, [tts_mode], [vc_input, tts_text, tts_voice, auto_f0])
129
+ for category, link in others.items():
130
+ with gr.TabItem(category):
131
+ gr.Markdown(
132
+ f'''
133
+ <center>
134
+ <h2>Click to Go</h2>
135
+ <a href="{link}">
136
+ <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-xl-dark.svg"
137
+ </a>
138
+ </center>
139
+ '''
140
+ )
141
+ app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
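For orientation, app.py builds one Gradio tab per directory it finds under models/, and expects each directory to contain <name>.pth plus config_<name>.json (with an optional cover.png). The snippet below is a hypothetical helper, not part of this commit, that checks a checkout against that layout before launching the app.

    # Hypothetical sanity check (not part of this commit) for the models/ layout that
    # app.py assumes: models/<name>/<name>.pth and models/<name>/config_<name>.json.
    import os

    for name in sorted(os.listdir("models")):
        pth = f"models/{name}/{name}.pth"
        cfg = f"models/{name}/config_{name}.json"
        missing = [p for p in (pth, cfg) if not os.path.exists(p)]
        print(name, "OK" if not missing else f"missing: {missing}")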
cluster/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ import numpy as np
2
+ import torch
3
+ from sklearn.cluster import KMeans
4
+
5
+ def get_cluster_model(ckpt_path):
6
+ checkpoint = torch.load(ckpt_path)
7
+ kmeans_dict = {}
8
+ for spk, ckpt in checkpoint.items():
9
+ km = KMeans(ckpt["n_features_in_"])
10
+ km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
11
+ km.__dict__["_n_threads"] = ckpt["_n_threads"]
12
+ km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"]
13
+ kmeans_dict[spk] = km
14
+ return kmeans_dict
15
+
16
+ def get_cluster_result(model, x, speaker):
17
+ """
18
+ x: np.array [t, 256]
19
+ return cluster class result
20
+ """
21
+ return model[speaker].predict(x)
22
+
23
+ def get_cluster_center_result(model, x,speaker):
24
+ """x: np.array [t, 256]"""
25
+ predict = model[speaker].predict(x)
26
+ return model[speaker].cluster_centers_[predict]
27
+
28
+ def get_center(model, x,speaker):
29
+ return model[speaker].cluster_centers_[x]
cluster/train_cluster.py ADDED
@@ -0,0 +1,89 @@
1
+ import os
2
+ from glob import glob
3
+ from pathlib import Path
4
+ import torch
5
+ import logging
6
+ import argparse
7
+ import torch
8
+ import numpy as np
9
+ from sklearn.cluster import KMeans, MiniBatchKMeans
10
+ import tqdm
11
+ logging.basicConfig(level=logging.INFO)
12
+ logger = logging.getLogger(__name__)
13
+ import time
14
+ import random
15
+
16
+ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False):
17
+
18
+ logger.info(f"Loading features from {in_dir}")
19
+ features = []
20
+ nums = 0
21
+ for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
22
+ features.append(torch.load(path).squeeze(0).numpy().T)
23
+ # print(features[-1].shape)
24
+ features = np.concatenate(features, axis=0)
25
+ print(nums, features.nbytes/ 1024**2, "MB , shape:",features.shape, features.dtype)
26
+ features = features.astype(np.float32)
27
+ logger.info(f"Clustering features of shape: {features.shape}")
28
+ t = time.time()
29
+ if use_minibatch:
30
+ kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
31
+ else:
32
+ kmeans = KMeans(n_clusters=n_clusters,verbose=verbose).fit(features)
33
+ print(time.time()-t, "s")
34
+
35
+ x = {
36
+ "n_features_in_": kmeans.n_features_in_,
37
+ "_n_threads": kmeans._n_threads,
38
+ "cluster_centers_": kmeans.cluster_centers_,
39
+ }
40
+ print("end")
41
+
42
+ return x
43
+
44
+
45
+ if __name__ == "__main__":
46
+
47
+ parser = argparse.ArgumentParser()
48
+ parser.add_argument('--dataset', type=Path, default="./dataset/44k",
49
+ help='path of training data directory')
50
+ parser.add_argument('--output', type=Path, default="logs/44k",
51
+ help='path of model output directory')
52
+
53
+ args = parser.parse_args()
54
+
55
+ checkpoint_dir = args.output
56
+ dataset = args.dataset
57
+ n_clusters = 10000
58
+
59
+ ckpt = {}
60
+ for spk in os.listdir(dataset):
61
+ if os.path.isdir(dataset/spk):
62
+ print(f"train kmeans for {spk}...")
63
+ in_dir = dataset/spk
64
+ x = train_cluster(in_dir, n_clusters, verbose=False)
65
+ ckpt[spk] = x
66
+
67
+ checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
68
+ checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
69
+ torch.save(
70
+ ckpt,
71
+ checkpoint_path,
72
+ )
73
+
74
+
75
+ # import cluster
76
+ # for spk in tqdm.tqdm(os.listdir("dataset")):
77
+ # if os.path.isdir(f"dataset/{spk}"):
78
+ # print(f"start kmeans inference for {spk}...")
79
+ # for feature_path in tqdm.tqdm(glob(f"dataset/{spk}/*.discrete.npy", recursive=True)):
80
+ # mel_path = feature_path.replace(".discrete.npy",".mel.npy")
81
+ # mel_spectrogram = np.load(mel_path)
82
+ # feature_len = mel_spectrogram.shape[-1]
83
+ # c = np.load(feature_path)
84
+ # c = utils.tools.repeat_expand_2d(torch.FloatTensor(c), feature_len).numpy()
85
+ # feature = c.T
86
+ # feature_class = cluster.get_cluster_result(feature, spk)
87
+ # np.save(feature_path.replace(".discrete.npy", ".discrete_class.npy"), feature_class)
88
+
89
+
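For context, the checkpoint this script writes (logs/44k/kmeans_10000.pt by default) is what cluster/__init__.py loads at inference time. A minimal consumption sketch follows; it is not part of the commit, and the speaker key and random features are placeholders standing in for real HuBERT content vectors.

    import numpy as np
    import cluster

    kmeans_dict = cluster.get_cluster_model("logs/44k/kmeans_10000.pt")  # the script's default output path
    feats = np.random.randn(128, 256).astype(np.float32)                 # stand-in for [t, 256] features
    centers = cluster.get_cluster_center_result(kmeans_dict, feats, "some_speaker")  # placeholder speaker key
    print(centers.shape)  # (128, 256)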
configs/config.json ADDED
File without changes
data_utils.py ADDED
@@ -0,0 +1,155 @@
1
+ import time
2
+ import os
3
+ import random
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.data
7
+
8
+ import modules.commons as commons
9
+ import utils
10
+ from modules.mel_processing import spectrogram_torch, spec_to_mel_torch
11
+ from utils import load_wav_to_torch, load_filepaths_and_text
12
+
13
+ # import h5py
14
+
15
+
16
+ """Multi speaker version"""
17
+
18
+
19
+ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
20
+ """
21
+ 1) loads audio, speaker_id, text pairs
22
+ 2) normalizes text and converts them to sequences of integers
23
+ 3) computes spectrograms from audio files.
24
+ """
25
+
26
+ def __init__(self, audiopaths, hparams, all_in_mem: bool = False):
27
+ self.audiopaths = load_filepaths_and_text(audiopaths)
28
+ self.max_wav_value = hparams.data.max_wav_value
29
+ self.sampling_rate = hparams.data.sampling_rate
30
+ self.filter_length = hparams.data.filter_length
31
+ self.hop_length = hparams.data.hop_length
32
+ self.win_length = hparams.data.win_length
33
+ self.sampling_rate = hparams.data.sampling_rate
34
+ self.use_sr = hparams.train.use_sr
35
+ self.spec_len = hparams.train.max_speclen
36
+ self.spk_map = hparams.spk
37
+
38
+ random.seed(1234)
39
+ random.shuffle(self.audiopaths)
40
+
41
+ self.all_in_mem = all_in_mem
42
+ if self.all_in_mem:
43
+ self.cache = [self.get_audio(p[0]) for p in self.audiopaths]
44
+
45
+ def get_audio(self, filename):
46
+ filename = filename.replace("\\", "/")
47
+ audio, sampling_rate = load_wav_to_torch(filename)
48
+ if sampling_rate != self.sampling_rate:
49
+ raise ValueError("{} SR doesn't match target {} SR".format(
50
+ sampling_rate, self.sampling_rate))
51
+ audio_norm = audio / self.max_wav_value
52
+ audio_norm = audio_norm.unsqueeze(0)
53
+ spec_filename = filename.replace(".wav", ".spec.pt")
54
+
55
+ # Ideally, all data generated after Mar 25 should have .spec.pt
56
+ if os.path.exists(spec_filename):
57
+ spec = torch.load(spec_filename)
58
+ else:
59
+ spec = spectrogram_torch(audio_norm, self.filter_length,
60
+ self.sampling_rate, self.hop_length, self.win_length,
61
+ center=False)
62
+ spec = torch.squeeze(spec, 0)
63
+ torch.save(spec, spec_filename)
64
+
65
+ spk = filename.split("/")[-2]
66
+ spk = torch.LongTensor([self.spk_map[spk]])
67
+
68
+ f0 = np.load(filename + ".f0.npy")
69
+ f0, uv = utils.interpolate_f0(f0)
70
+ f0 = torch.FloatTensor(f0)
71
+ uv = torch.FloatTensor(uv)
72
+
73
+ c = torch.load(filename+ ".soft.pt")
74
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0])
75
+
76
+
77
+ lmin = min(c.size(-1), spec.size(-1))
78
+ assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
79
+ assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
80
+ spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
81
+ audio_norm = audio_norm[:, :lmin * self.hop_length]
82
+
83
+ return c, f0, spec, audio_norm, spk, uv
84
+
85
+ def random_slice(self, c, f0, spec, audio_norm, spk, uv):
86
+ # if spec.shape[1] < 30:
87
+ # print("skip too short audio:", filename)
88
+ # return None
89
+ if spec.shape[1] > 800:
90
+ start = random.randint(0, spec.shape[1]-800)
91
+ end = start + 790
92
+ spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
93
+ audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
94
+
95
+ return c, f0, spec, audio_norm, spk, uv
96
+
97
+ def __getitem__(self, index):
98
+ if self.all_in_mem:
99
+ return self.random_slice(*self.cache[index])
100
+ else:
101
+ return self.random_slice(*self.get_audio(self.audiopaths[index][0]))
102
+
103
+ def __len__(self):
104
+ return len(self.audiopaths)
105
+
106
+
107
+ class TextAudioCollate:
108
+
109
+ def __call__(self, batch):
110
+ batch = [b for b in batch if b is not None]
111
+
112
+ input_lengths, ids_sorted_decreasing = torch.sort(
113
+ torch.LongTensor([x[0].shape[1] for x in batch]),
114
+ dim=0, descending=True)
115
+
116
+ max_c_len = max([x[0].size(1) for x in batch])
117
+ max_wav_len = max([x[3].size(1) for x in batch])
118
+
119
+ lengths = torch.LongTensor(len(batch))
120
+
121
+ c_padded = torch.FloatTensor(len(batch), batch[0][0].shape[0], max_c_len)
122
+ f0_padded = torch.FloatTensor(len(batch), max_c_len)
123
+ spec_padded = torch.FloatTensor(len(batch), batch[0][2].shape[0], max_c_len)
124
+ wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
125
+ spkids = torch.LongTensor(len(batch), 1)
126
+ uv_padded = torch.FloatTensor(len(batch), max_c_len)
127
+
128
+ c_padded.zero_()
129
+ spec_padded.zero_()
130
+ f0_padded.zero_()
131
+ wav_padded.zero_()
132
+ uv_padded.zero_()
133
+
134
+ for i in range(len(ids_sorted_decreasing)):
135
+ row = batch[ids_sorted_decreasing[i]]
136
+
137
+ c = row[0]
138
+ c_padded[i, :, :c.size(1)] = c
139
+ lengths[i] = c.size(1)
140
+
141
+ f0 = row[1]
142
+ f0_padded[i, :f0.size(0)] = f0
143
+
144
+ spec = row[2]
145
+ spec_padded[i, :, :spec.size(1)] = spec
146
+
147
+ wav = row[3]
148
+ wav_padded[i, :, :wav.size(1)] = wav
149
+
150
+ spkids[i, 0] = row[4]
151
+
152
+ uv = row[5]
153
+ uv_padded[i, :uv.size(0)] = uv
154
+
155
+ return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded
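As a usage note, TextAudioSpeakerLoader and TextAudioCollate are meant to be combined through a standard DataLoader. The sketch below is not part of the commit and leans on assumptions: the config path and the hps.data.training_files field follow the usual so-vits-svc config layout, which is not shown in this diff.

    # Hypothetical wiring (not part of this commit); config path and training_files are assumptions.
    import torch.utils.data
    import utils
    from data_utils import TextAudioSpeakerLoader, TextAudioCollate

    hps = utils.get_hparams_from_file("configs/config.json")
    dataset = TextAudioSpeakerLoader(hps.data.training_files, hps)
    loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True,
                                         collate_fn=TextAudioCollate())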
hubert/__init__.py ADDED
File without changes
hubert/checkpoint_best_legacy_500.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60d936ec5a566776fc392e69ad8b630d14eb588111233fe313436e200a7b187b
3
+ size 1330114945
hubert/hubert_model.py ADDED
@@ -0,0 +1,222 @@
1
+ import copy
2
+ import random
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as t_func
8
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
+
10
+
11
+ class Hubert(nn.Module):
12
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
+ super().__init__()
14
+ self._mask = mask
15
+ self.feature_extractor = FeatureExtractor()
16
+ self.feature_projection = FeatureProjection()
17
+ self.positional_embedding = PositionalConvEmbedding()
18
+ self.norm = nn.LayerNorm(768)
19
+ self.dropout = nn.Dropout(0.1)
20
+ self.encoder = TransformerEncoder(
21
+ nn.TransformerEncoderLayer(
22
+ 768, 12, 3072, activation="gelu", batch_first=True
23
+ ),
24
+ 12,
25
+ )
26
+ self.proj = nn.Linear(768, 256)
27
+
28
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
+
31
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
+ mask = None
33
+ if self.training and self._mask:
34
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
+ x[mask] = self.masked_spec_embed.to(x.dtype)
36
+ return x, mask
37
+
38
+ def encode(
39
+ self, x: torch.Tensor, layer: Optional[int] = None
40
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
41
+ x = self.feature_extractor(x)
42
+ x = self.feature_projection(x.transpose(1, 2))
43
+ x, mask = self.mask(x)
44
+ x = x + self.positional_embedding(x)
45
+ x = self.dropout(self.norm(x))
46
+ x = self.encoder(x, output_layer=layer)
47
+ return x, mask
48
+
49
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
50
+ logits = torch.cosine_similarity(
51
+ x.unsqueeze(2),
52
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
+ dim=-1,
54
+ )
55
+ return logits / 0.1
56
+
57
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
58
+ x, mask = self.encode(x)
59
+ x = self.proj(x)
60
+ logits = self.logits(x)
61
+ return logits, mask
62
+
63
+
64
+ class HubertSoft(Hubert):
65
+ def __init__(self):
66
+ super().__init__()
67
+
68
+ @torch.inference_mode()
69
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
70
+ wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
71
+ x, _ = self.encode(wav)
72
+ return self.proj(x)
73
+
74
+
75
+ class FeatureExtractor(nn.Module):
76
+ def __init__(self):
77
+ super().__init__()
78
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
79
+ self.norm0 = nn.GroupNorm(512, 512)
80
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
81
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
82
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
83
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
84
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
85
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
86
+
87
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
88
+ x = t_func.gelu(self.norm0(self.conv0(x)))
89
+ x = t_func.gelu(self.conv1(x))
90
+ x = t_func.gelu(self.conv2(x))
91
+ x = t_func.gelu(self.conv3(x))
92
+ x = t_func.gelu(self.conv4(x))
93
+ x = t_func.gelu(self.conv5(x))
94
+ x = t_func.gelu(self.conv6(x))
95
+ return x
96
+
97
+
98
+ class FeatureProjection(nn.Module):
99
+ def __init__(self):
100
+ super().__init__()
101
+ self.norm = nn.LayerNorm(512)
102
+ self.projection = nn.Linear(512, 768)
103
+ self.dropout = nn.Dropout(0.1)
104
+
105
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
106
+ x = self.norm(x)
107
+ x = self.projection(x)
108
+ x = self.dropout(x)
109
+ return x
110
+
111
+
112
+ class PositionalConvEmbedding(nn.Module):
113
+ def __init__(self):
114
+ super().__init__()
115
+ self.conv = nn.Conv1d(
116
+ 768,
117
+ 768,
118
+ kernel_size=128,
119
+ padding=128 // 2,
120
+ groups=16,
121
+ )
122
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
123
+
124
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
125
+ x = self.conv(x.transpose(1, 2))
126
+ x = t_func.gelu(x[:, :, :-1])
127
+ return x.transpose(1, 2)
128
+
129
+
130
+ class TransformerEncoder(nn.Module):
131
+ def __init__(
132
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
133
+ ) -> None:
134
+ super(TransformerEncoder, self).__init__()
135
+ self.layers = nn.ModuleList(
136
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
137
+ )
138
+ self.num_layers = num_layers
139
+
140
+ def forward(
141
+ self,
142
+ src: torch.Tensor,
143
+ mask: torch.Tensor = None,
144
+ src_key_padding_mask: torch.Tensor = None,
145
+ output_layer: Optional[int] = None,
146
+ ) -> torch.Tensor:
147
+ output = src
148
+ for layer in self.layers[:output_layer]:
149
+ output = layer(
150
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
151
+ )
152
+ return output
153
+
154
+
155
+ def _compute_mask(
156
+ shape: Tuple[int, int],
157
+ mask_prob: float,
158
+ mask_length: int,
159
+ device: torch.device,
160
+ min_masks: int = 0,
161
+ ) -> torch.Tensor:
162
+ batch_size, sequence_length = shape
163
+
164
+ if mask_length < 1:
165
+ raise ValueError("`mask_length` has to be bigger than 0.")
166
+
167
+ if mask_length > sequence_length:
168
+ raise ValueError(
169
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
170
+ )
171
+
172
+ # compute number of masked spans in batch
173
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
174
+ num_masked_spans = max(num_masked_spans, min_masks)
175
+
176
+ # make sure num masked indices <= sequence_length
177
+ if num_masked_spans * mask_length > sequence_length:
178
+ num_masked_spans = sequence_length // mask_length
179
+
180
+ # SpecAugment mask to fill
181
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
182
+
183
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
184
+ uniform_dist = torch.ones(
185
+ (batch_size, sequence_length - (mask_length - 1)), device=device
186
+ )
187
+
188
+ # get random indices to mask
189
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
190
+
191
+ # expand masked indices to masked spans
192
+ mask_indices = (
193
+ mask_indices.unsqueeze(dim=-1)
194
+ .expand((batch_size, num_masked_spans, mask_length))
195
+ .reshape(batch_size, num_masked_spans * mask_length)
196
+ )
197
+ offsets = (
198
+ torch.arange(mask_length, device=device)[None, None, :]
199
+ .expand((batch_size, num_masked_spans, mask_length))
200
+ .reshape(batch_size, num_masked_spans * mask_length)
201
+ )
202
+ mask_idxs = mask_indices + offsets
203
+
204
+ # scatter indices to mask
205
+ mask = mask.scatter(1, mask_idxs, True)
206
+
207
+ return mask
208
+
209
+
210
+ def hubert_soft(
211
+ path: str,
212
+ ) -> HubertSoft:
213
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
214
+ Args:
215
+ path (str): path of a pretrained model
216
+ """
217
+ hubert = HubertSoft()
218
+ checkpoint = torch.load(path)
219
+ consume_prefix_in_state_dict_if_present(checkpoint, "module.")
220
+ hubert.load_state_dict(checkpoint)
221
+ hubert.eval()
222
+ return hubert
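As a usage sketch (not part of the commit): the hubert_soft loader above expects a state_dict that matches HubertSoft, while the apps in this commit obtain their content encoder through utils.get_hubert_model(), so the checkpoint path below is purely hypothetical.

    import torch
    from hubert.hubert_model import hubert_soft

    model = hubert_soft("path/to/hubert-soft.pt")  # hypothetical checkpoint compatible with HubertSoft
    wav = torch.zeros(1, 1, 16000)                 # (batch, channels=1, samples) of 16 kHz audio
    units = model.units(wav)                       # runs under torch.inference_mode(); -> (1, frames, 256)
    print(units.shape)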
hubert/hubert_model_onnx.py ADDED
@@ -0,0 +1,217 @@
1
+ import copy
2
+ import random
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as t_func
8
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
9
+
10
+
11
+ class Hubert(nn.Module):
12
+ def __init__(self, num_label_embeddings: int = 100, mask: bool = True):
13
+ super().__init__()
14
+ self._mask = mask
15
+ self.feature_extractor = FeatureExtractor()
16
+ self.feature_projection = FeatureProjection()
17
+ self.positional_embedding = PositionalConvEmbedding()
18
+ self.norm = nn.LayerNorm(768)
19
+ self.dropout = nn.Dropout(0.1)
20
+ self.encoder = TransformerEncoder(
21
+ nn.TransformerEncoderLayer(
22
+ 768, 12, 3072, activation="gelu", batch_first=True
23
+ ),
24
+ 12,
25
+ )
26
+ self.proj = nn.Linear(768, 256)
27
+
28
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(768).uniform_())
29
+ self.label_embedding = nn.Embedding(num_label_embeddings, 256)
30
+
31
+ def mask(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
32
+ mask = None
33
+ if self.training and self._mask:
34
+ mask = _compute_mask((x.size(0), x.size(1)), 0.8, 10, x.device, 2)
35
+ x[mask] = self.masked_spec_embed.to(x.dtype)
36
+ return x, mask
37
+
38
+ def encode(
39
+ self, x: torch.Tensor, layer: Optional[int] = None
40
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
41
+ x = self.feature_extractor(x)
42
+ x = self.feature_projection(x.transpose(1, 2))
43
+ x, mask = self.mask(x)
44
+ x = x + self.positional_embedding(x)
45
+ x = self.dropout(self.norm(x))
46
+ x = self.encoder(x, output_layer=layer)
47
+ return x, mask
48
+
49
+ def logits(self, x: torch.Tensor) -> torch.Tensor:
50
+ logits = torch.cosine_similarity(
51
+ x.unsqueeze(2),
52
+ self.label_embedding.weight.unsqueeze(0).unsqueeze(0),
53
+ dim=-1,
54
+ )
55
+ return logits / 0.1
56
+
57
+
58
+ class HubertSoft(Hubert):
59
+ def __init__(self):
60
+ super().__init__()
61
+
62
+ def units(self, wav: torch.Tensor) -> torch.Tensor:
63
+ wav = t_func.pad(wav, ((400 - 320) // 2, (400 - 320) // 2))
64
+ x, _ = self.encode(wav)
65
+ return self.proj(x)
66
+
67
+ def forward(self, x):
68
+ return self.units(x)
69
+
70
+ class FeatureExtractor(nn.Module):
71
+ def __init__(self):
72
+ super().__init__()
73
+ self.conv0 = nn.Conv1d(1, 512, 10, 5, bias=False)
74
+ self.norm0 = nn.GroupNorm(512, 512)
75
+ self.conv1 = nn.Conv1d(512, 512, 3, 2, bias=False)
76
+ self.conv2 = nn.Conv1d(512, 512, 3, 2, bias=False)
77
+ self.conv3 = nn.Conv1d(512, 512, 3, 2, bias=False)
78
+ self.conv4 = nn.Conv1d(512, 512, 3, 2, bias=False)
79
+ self.conv5 = nn.Conv1d(512, 512, 2, 2, bias=False)
80
+ self.conv6 = nn.Conv1d(512, 512, 2, 2, bias=False)
81
+
82
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
83
+ x = t_func.gelu(self.norm0(self.conv0(x)))
84
+ x = t_func.gelu(self.conv1(x))
85
+ x = t_func.gelu(self.conv2(x))
86
+ x = t_func.gelu(self.conv3(x))
87
+ x = t_func.gelu(self.conv4(x))
88
+ x = t_func.gelu(self.conv5(x))
89
+ x = t_func.gelu(self.conv6(x))
90
+ return x
91
+
92
+
93
+ class FeatureProjection(nn.Module):
94
+ def __init__(self):
95
+ super().__init__()
96
+ self.norm = nn.LayerNorm(512)
97
+ self.projection = nn.Linear(512, 768)
98
+ self.dropout = nn.Dropout(0.1)
99
+
100
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
101
+ x = self.norm(x)
102
+ x = self.projection(x)
103
+ x = self.dropout(x)
104
+ return x
105
+
106
+
107
+ class PositionalConvEmbedding(nn.Module):
108
+ def __init__(self):
109
+ super().__init__()
110
+ self.conv = nn.Conv1d(
111
+ 768,
112
+ 768,
113
+ kernel_size=128,
114
+ padding=128 // 2,
115
+ groups=16,
116
+ )
117
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
118
+
119
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
120
+ x = self.conv(x.transpose(1, 2))
121
+ x = t_func.gelu(x[:, :, :-1])
122
+ return x.transpose(1, 2)
123
+
124
+
125
+ class TransformerEncoder(nn.Module):
126
+ def __init__(
127
+ self, encoder_layer: nn.TransformerEncoderLayer, num_layers: int
128
+ ) -> None:
129
+ super(TransformerEncoder, self).__init__()
130
+ self.layers = nn.ModuleList(
131
+ [copy.deepcopy(encoder_layer) for _ in range(num_layers)]
132
+ )
133
+ self.num_layers = num_layers
134
+
135
+ def forward(
136
+ self,
137
+ src: torch.Tensor,
138
+ mask: torch.Tensor = None,
139
+ src_key_padding_mask: torch.Tensor = None,
140
+ output_layer: Optional[int] = None,
141
+ ) -> torch.Tensor:
142
+ output = src
143
+ for layer in self.layers[:output_layer]:
144
+ output = layer(
145
+ output, src_mask=mask, src_key_padding_mask=src_key_padding_mask
146
+ )
147
+ return output
148
+
149
+
150
+ def _compute_mask(
151
+ shape: Tuple[int, int],
152
+ mask_prob: float,
153
+ mask_length: int,
154
+ device: torch.device,
155
+ min_masks: int = 0,
156
+ ) -> torch.Tensor:
157
+ batch_size, sequence_length = shape
158
+
159
+ if mask_length < 1:
160
+ raise ValueError("`mask_length` has to be bigger than 0.")
161
+
162
+ if mask_length > sequence_length:
163
+ raise ValueError(
164
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
165
+ )
166
+
167
+ # compute number of masked spans in batch
168
+ num_masked_spans = int(mask_prob * sequence_length / mask_length + random.random())
169
+ num_masked_spans = max(num_masked_spans, min_masks)
170
+
171
+ # make sure num masked indices <= sequence_length
172
+ if num_masked_spans * mask_length > sequence_length:
173
+ num_masked_spans = sequence_length // mask_length
174
+
175
+ # SpecAugment mask to fill
176
+ mask = torch.zeros((batch_size, sequence_length), device=device, dtype=torch.bool)
177
+
178
+ # uniform distribution to sample from, make sure that offset samples are < sequence_length
179
+ uniform_dist = torch.ones(
180
+ (batch_size, sequence_length - (mask_length - 1)), device=device
181
+ )
182
+
183
+ # get random indices to mask
184
+ mask_indices = torch.multinomial(uniform_dist, num_masked_spans)
185
+
186
+ # expand masked indices to masked spans
187
+ mask_indices = (
188
+ mask_indices.unsqueeze(dim=-1)
189
+ .expand((batch_size, num_masked_spans, mask_length))
190
+ .reshape(batch_size, num_masked_spans * mask_length)
191
+ )
192
+ offsets = (
193
+ torch.arange(mask_length, device=device)[None, None, :]
194
+ .expand((batch_size, num_masked_spans, mask_length))
195
+ .reshape(batch_size, num_masked_spans * mask_length)
196
+ )
197
+ mask_idxs = mask_indices + offsets
198
+
199
+ # scatter indices to mask
200
+ mask = mask.scatter(1, mask_idxs, True)
201
+
202
+ return mask
203
+
204
+
205
+ def hubert_soft(
206
+ path: str,
207
+ ) -> HubertSoft:
208
+ r"""HuBERT-Soft from `"A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion"`.
209
+ Args:
210
+ path (str): path of a pretrained model
211
+ """
212
+ hubert = HubertSoft()
213
+ checkpoint = torch.load(path)
214
+ consume_prefix_in_state_dict_if_present(checkpoint, "module.")
215
+ hubert.load_state_dict(checkpoint)
216
+ hubert.eval()
217
+ return hubert
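For context, this _onnx variant differs from hubert_model.py mainly in that forward() delegates to units() and units() is not wrapped in inference_mode, which is what an ONNX trace needs. Below is a hedged export sketch, not part of the commit; the opset and export settings are assumptions and may need adjusting.

    import torch
    from hubert.hubert_model_onnx import HubertSoft

    model = HubertSoft().eval()        # real use would first load a compatible checkpoint
    dummy = torch.zeros(1, 1, 16000)   # (batch, channels=1, samples)
    torch.onnx.export(model, dummy, "hubert_soft.onnx", opset_version=16,
                      input_names=["audio"], output_names=["units"])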
inference/__init__.py ADDED
File without changes
inference/chunks_temp.json ADDED
@@ -0,0 +1 @@
1
+ {"info": "temp_dict"}
inference/infer_tool.py ADDED
@@ -0,0 +1,324 @@
1
+ import hashlib
2
+ import io
3
+ import json
4
+ import logging
5
+ import os
6
+ import time
7
+ from pathlib import Path
8
+ from inference import slicer
9
+
10
+ import librosa
11
+ import numpy as np
12
+ # import onnxruntime
13
+ import parselmouth
14
+ import soundfile
15
+ import torch
16
+ import torchaudio
17
+
18
+ import cluster
19
+ from hubert import hubert_model
20
+ import utils
21
+ from models import SynthesizerTrn
22
+
23
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
24
+
25
+
26
+ def read_temp(file_name):
27
+ if not os.path.exists(file_name):
28
+ with open(file_name, "w") as f:
29
+ f.write(json.dumps({"info": "temp_dict"}))
30
+ return {}
31
+ else:
32
+ try:
33
+ with open(file_name, "r") as f:
34
+ data = f.read()
35
+ data_dict = json.loads(data)
36
+ if os.path.getsize(file_name) > 50 * 1024 * 1024:
37
+ f_name = file_name.replace("\\", "/").split("/")[-1]
38
+ print(f"clean {f_name}")
39
+ for wav_hash in list(data_dict.keys()):
40
+ if int(time.time()) - int(data_dict[wav_hash]["time"]) > 14 * 24 * 3600:
41
+ del data_dict[wav_hash]
42
+ except Exception as e:
43
+ print(e)
44
+ print(f"{file_name} error,auto rebuild file")
45
+ data_dict = {"info": "temp_dict"}
46
+ return data_dict
47
+
48
+
49
+ def write_temp(file_name, data):
50
+ with open(file_name, "w") as f:
51
+ f.write(json.dumps(data))
52
+
53
+
54
+ def timeit(func):
55
+ def run(*args, **kwargs):
56
+ t = time.time()
57
+ res = func(*args, **kwargs)
58
+ print('executing \'%s\' costed %.3fs' % (func.__name__, time.time() - t))
59
+ return res
60
+
61
+ return run
62
+
63
+
64
+ def format_wav(audio_path):
65
+ if Path(audio_path).suffix == '.wav':
66
+ return
67
+ raw_audio, raw_sample_rate = librosa.load(audio_path, mono=True, sr=None)
68
+ soundfile.write(Path(audio_path).with_suffix(".wav"), raw_audio, raw_sample_rate)
69
+
70
+
71
+ def get_end_file(dir_path, end):
72
+ file_lists = []
73
+ for root, dirs, files in os.walk(dir_path):
74
+ files = [f for f in files if f[0] != '.']
75
+ dirs[:] = [d for d in dirs if d[0] != '.']
76
+ for f_file in files:
77
+ if f_file.endswith(end):
78
+ file_lists.append(os.path.join(root, f_file).replace("\\", "/"))
79
+ return file_lists
80
+
81
+
82
+ def get_md5(content):
83
+ return hashlib.new("md5", content).hexdigest()
84
+
85
+ def fill_a_to_b(a, b):
86
+ if len(a) < len(b):
87
+ for _ in range(0, len(b) - len(a)):
88
+ a.append(a[0])
89
+
90
+ def mkdir(paths: list):
91
+ for path in paths:
92
+ if not os.path.exists(path):
93
+ os.mkdir(path)
94
+
95
+ def pad_array(arr, target_length):
96
+ current_length = arr.shape[0]
97
+ if current_length >= target_length:
98
+ return arr
99
+ else:
100
+ pad_width = target_length - current_length
101
+ pad_left = pad_width // 2
102
+ pad_right = pad_width - pad_left
103
+ padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
104
+ return padded_arr
105
+
106
+ def split_list_by_n(list_collection, n, pre=0):
107
+ for i in range(0, len(list_collection), n):
108
+ yield list_collection[i-pre if i-pre>=0 else i: i + n]
109
+
110
+
111
+ class F0FilterException(Exception):
112
+ pass
113
+
114
+ class Svc(object):
115
+ def __init__(self, net_g_path, config_path,
116
+ device=None,
117
+ cluster_model_path="logs/44k/kmeans_10000.pt"):
118
+ self.net_g_path = net_g_path
119
+ if device is None:
120
+ self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
121
+ else:
122
+ self.dev = torch.device(device)
123
+ self.net_g_ms = None
124
+ self.hps_ms = utils.get_hparams_from_file(config_path)
125
+ self.target_sample = self.hps_ms.data.sampling_rate
126
+ self.hop_size = self.hps_ms.data.hop_length
127
+ self.spk2id = self.hps_ms.spk
128
+ # load hubert
129
+ self.hubert_model = utils.get_hubert_model().to(self.dev)
130
+ self.load_model()
131
+ if os.path.exists(cluster_model_path):
132
+ self.cluster_model = cluster.get_cluster_model(cluster_model_path)
133
+
134
+ def load_model(self):
135
+ # get model configuration
136
+ self.net_g_ms = SynthesizerTrn(
137
+ self.hps_ms.data.filter_length // 2 + 1,
138
+ self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
139
+ **self.hps_ms.model)
140
+ _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
141
+ if "half" in self.net_g_path and torch.cuda.is_available():
142
+ _ = self.net_g_ms.half().eval().to(self.dev)
143
+ else:
144
+ _ = self.net_g_ms.eval().to(self.dev)
145
+
146
+
147
+
148
+ def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling):
149
+
150
+ wav, sr = librosa.load(in_path, sr=self.target_sample)
151
+
152
+ if F0_mean_pooling == True:
153
+ f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev)
154
+ if f0_filter and sum(f0) == 0:
155
+ raise F0FilterException("No voice detected")
156
+ f0 = torch.FloatTensor(list(f0))
157
+ uv = torch.FloatTensor(list(uv))
158
+ if F0_mean_pooling == False:
159
+ f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
160
+ if f0_filter and sum(f0) == 0:
161
+ raise F0FilterException("No voice detected")
162
+ f0, uv = utils.interpolate_f0(f0)
163
+ f0 = torch.FloatTensor(f0)
164
+ uv = torch.FloatTensor(uv)
165
+
166
+ f0 = f0 * 2 ** (tran / 12)
167
+ f0 = f0.unsqueeze(0).to(self.dev)
168
+ uv = uv.unsqueeze(0).to(self.dev)
169
+
170
+ wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
171
+ wav16k = torch.from_numpy(wav16k).to(self.dev)
172
+ c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
173
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
174
+
175
+ if cluster_infer_ratio !=0:
176
+ cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
177
+ cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
178
+ c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
179
+
180
+ c = c.unsqueeze(0)
181
+ return c, f0, uv
182
+
183
+ def infer(self, speaker, tran, raw_path,
184
+ cluster_infer_ratio=0,
185
+ auto_predict_f0=False,
186
+ noice_scale=0.4,
187
+ f0_filter=False,
188
+ F0_mean_pooling=False
189
+ ):
190
+
191
+ speaker_id = self.spk2id.__dict__.get(speaker)
192
+ if not speaker_id and type(speaker) is int:
193
+ if len(self.spk2id.__dict__) >= speaker:
194
+ speaker_id = speaker
195
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
196
+ c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling)
197
+ if "half" in self.net_g_path and torch.cuda.is_available():
198
+ c = c.half()
199
+ with torch.no_grad():
200
+ start = time.time()
201
+ audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
202
+ use_time = time.time() - start
203
+ print("vits use time:{}".format(use_time))
204
+ return audio, audio.shape[-1]
205
+
206
+ def clear_empty(self):
207
+ # free GPU memory
208
+ torch.cuda.empty_cache()
209
+
210
+ def slice_inference(self,
211
+ raw_audio_path,
212
+ spk,
213
+ tran,
214
+ slice_db,
215
+ cluster_infer_ratio,
216
+ auto_predict_f0,
217
+ noice_scale,
218
+ pad_seconds=0.5,
219
+ clip_seconds=0,
220
+ lg_num=0,
221
+ lgr_num =0.75,
222
+ F0_mean_pooling = False
223
+ ):
224
+ wav_path = raw_audio_path
225
+ chunks = slicer.cut(wav_path, db_thresh=slice_db)
226
+ audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
227
+ per_size = int(clip_seconds*audio_sr)
228
+ lg_size = int(lg_num*audio_sr)
229
+ lg_size_r = int(lg_size*lgr_num)
230
+ lg_size_c_l = (lg_size-lg_size_r)//2
231
+ lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
232
+ lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
233
+
234
+ audio = []
235
+ for (slice_tag, data) in audio_data:
236
+ print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
237
+ # pad
238
+ length = int(np.ceil(len(data) / audio_sr * self.target_sample))
239
+ if slice_tag:
240
+ print('jump empty segment')
241
+ _audio = np.zeros(length)
242
+ audio.extend(list(pad_array(_audio, length)))
243
+ continue
244
+ if per_size != 0:
245
+ datas = split_list_by_n(data, per_size,lg_size)
246
+ else:
247
+ datas = [data]
248
+ for k,dat in enumerate(datas):
249
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
250
+ if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
251
+ # pad
252
+ pad_len = int(audio_sr * pad_seconds)
253
+ dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
254
+ raw_path = io.BytesIO()
255
+ soundfile.write(raw_path, dat, audio_sr, format="wav")
256
+ raw_path.seek(0)
257
+ out_audio, out_sr = self.infer(spk, tran, raw_path,
258
+ cluster_infer_ratio=cluster_infer_ratio,
259
+ auto_predict_f0=auto_predict_f0,
260
+ noice_scale=noice_scale,
261
+ F0_mean_pooling = F0_mean_pooling
262
+ )
263
+ _audio = out_audio.cpu().numpy()
264
+ pad_len = int(self.target_sample * pad_seconds)
265
+ _audio = _audio[pad_len:-pad_len]
266
+ _audio = pad_array(_audio, per_length)
267
+ if lg_size!=0 and k!=0:
268
+ lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr_num != 1 else audio[-lg_size:]
269
+ lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr_num != 1 else _audio[0:lg_size]
270
+ lg_pre = lg1*(1-lg)+lg2*lg
271
+ audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr_num != 1 else audio[0:-lg_size]
272
+ audio.extend(lg_pre)
273
+ _audio = _audio[lg_size_c_l+lg_size_r:] if lgr_num != 1 else _audio[lg_size:]
274
+ audio.extend(list(_audio))
275
+ return np.array(audio)
276
+
277
+ class RealTimeVC:
278
+ def __init__(self):
279
+ self.last_chunk = None
280
+ self.last_o = None
281
+ self.chunk_len = 16000 # chunk length
282
+ self.pre_len = 3840 # crossfade length, a multiple of 640
283
+
284
+ """输入输出都是1维numpy 音频波形数组"""
285
+
286
+ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
287
+ cluster_infer_ratio=0,
288
+ auto_predict_f0=False,
289
+ noice_scale=0.4,
290
+ f0_filter=False):
291
+
292
+ import maad
293
+ audio, sr = torchaudio.load(input_wav_path)
294
+ audio = audio.cpu().numpy()[0]
295
+ temp_wav = io.BytesIO()
296
+ if self.last_chunk is None:
297
+ input_wav_path.seek(0)
298
+
299
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path,
300
+ cluster_infer_ratio=cluster_infer_ratio,
301
+ auto_predict_f0=auto_predict_f0,
302
+ noice_scale=noice_scale,
303
+ f0_filter=f0_filter)
304
+
305
+ audio = audio.cpu().numpy()
306
+ self.last_chunk = audio[-self.pre_len:]
307
+ self.last_o = audio
308
+ return audio[-self.chunk_len:]
309
+ else:
310
+ audio = np.concatenate([self.last_chunk, audio])
311
+ soundfile.write(temp_wav, audio, sr, format="wav")
312
+ temp_wav.seek(0)
313
+
314
+ audio, sr = svc_model.infer(speaker_id, f_pitch_change, temp_wav,
315
+ cluster_infer_ratio=cluster_infer_ratio,
316
+ auto_predict_f0=auto_predict_f0,
317
+ noice_scale=noice_scale,
318
+ f0_filter=f0_filter)
319
+
320
+ audio = audio.cpu().numpy()
321
+ ret = maad.util.crossfade(self.last_o, audio, self.pre_len)
322
+ self.last_chunk = audio[-self.pre_len:]
323
+ self.last_o = audio
324
+ return ret[self.chunk_len:2 * self.chunk_len]
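
For orientation, a minimal sketch of driving the Svc wrapper above end to end. The input and output wav paths are placeholders (not files added by this commit); the checkpoint, config, and speaker name reuse the arthur model shipped under models/:

    import soundfile
    from inference.infer_tool import Svc

    # load a voice model; pass cluster_model_path=... only if a trained k-means model exists
    model = Svc("models/arthur/arthur.pth", "models/arthur/config_arthur.json")
    # slice on silence, convert each voiced chunk, and stitch the result back together
    audio = model.slice_inference("raw/input.wav", spk="arthur", tran=0, slice_db=-40,
                                  cluster_infer_ratio=0, auto_predict_f0=False,
                                  noice_scale=0.4)
    soundfile.write("results/output.wav", audio, model.target_sample)
    model.clear_empty()  # release cached GPU memory between runs
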
inference/infer_tool_grad.py ADDED
@@ -0,0 +1,160 @@
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import os
5
+ import time
6
+ from pathlib import Path
7
+ import io
8
+ import librosa
9
+ import maad
10
+ import numpy as np
11
+ from inference import slicer
12
+ import parselmouth
13
+ import soundfile
14
+ import torch
15
+ import torchaudio
16
+
17
+ from hubert import hubert_model
18
+ import utils
19
+ from models import SynthesizerTrn
20
+ logging.getLogger('numba').setLevel(logging.WARNING)
21
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
+
23
+ def resize2d_f0(x, target_len):
24
+ source = np.array(x)
25
+ source[source < 0.001] = np.nan
26
+ target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
27
+ source)
28
+ res = np.nan_to_num(target)
29
+ return res
30
+
31
+ def get_f0(x, p_len,f0_up_key=0):
32
+
33
+ time_step = 160 / 16000 * 1000
34
+ f0_min = 50
35
+ f0_max = 1100
36
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
37
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
38
+
39
+ f0 = parselmouth.Sound(x, 16000).to_pitch_ac(
40
+ time_step=time_step / 1000, voicing_threshold=0.6,
41
+ pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
42
+
43
+ pad_size=(p_len - len(f0) + 1) // 2
44
+ if(pad_size>0 or p_len - len(f0) - pad_size>0):
45
+ f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
46
+
47
+ f0 *= pow(2, f0_up_key / 12)
48
+ f0_mel = 1127 * np.log(1 + f0 / 700)
49
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
50
+ f0_mel[f0_mel <= 1] = 1
51
+ f0_mel[f0_mel > 255] = 255
52
+ f0_coarse = np.rint(f0_mel).astype(int)
53
+ return f0_coarse, f0
54
+
55
+ def clean_pitch(input_pitch):
56
+ num_nan = np.sum(input_pitch == 1)
57
+ if num_nan / len(input_pitch) > 0.9:
58
+ input_pitch[input_pitch != 1] = 1
59
+ return input_pitch
60
+
61
+
62
+ def plt_pitch(input_pitch):
63
+ input_pitch = input_pitch.astype(float)
64
+ input_pitch[input_pitch == 1] = np.nan
65
+ return input_pitch
66
+
67
+
68
+ def f0_to_pitch(ff):
69
+ f0_pitch = 69 + 12 * np.log2(ff / 440)
70
+ return f0_pitch
71
+
72
+
73
+ def fill_a_to_b(a, b):
74
+ if len(a) < len(b):
75
+ for _ in range(0, len(b) - len(a)):
76
+ a.append(a[0])
77
+
78
+
79
+ def mkdir(paths: list):
80
+ for path in paths:
81
+ if not os.path.exists(path):
82
+ os.mkdir(path)
83
+
84
+
85
+ class VitsSvc(object):
86
+ def __init__(self):
87
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
88
+ self.SVCVITS = None
89
+ self.hps = None
90
+ self.speakers = None
91
+ self.hubert_soft = utils.get_hubert_model()
92
+
93
+ def set_device(self, device):
94
+ self.device = torch.device(device)
95
+ self.hubert_soft.to(self.device)
96
+ if self.SVCVITS != None:
97
+ self.SVCVITS.to(self.device)
98
+
99
+ def loadCheckpoint(self, path):
100
+ self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
101
+ self.SVCVITS = SynthesizerTrn(
102
+ self.hps.data.filter_length // 2 + 1,
103
+ self.hps.train.segment_size // self.hps.data.hop_length,
104
+ **self.hps.model)
105
+ _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
106
+ _ = self.SVCVITS.eval().to(self.device)
107
+ self.speakers = self.hps.spk
108
+
109
+ def get_units(self, source, sr):
110
+ source = source.unsqueeze(0).to(self.device)
111
+ with torch.inference_mode():
112
+ units = self.hubert_soft.units(source)
113
+ return units
114
+
115
+
116
+ def get_unit_pitch(self, in_path, tran):
117
+ source, sr = torchaudio.load(in_path)
118
+ source = torchaudio.functional.resample(source, sr, 16000)
119
+ if len(source.shape) == 2 and source.shape[1] >= 2:
120
+ source = torch.mean(source, dim=0).unsqueeze(0)
121
+ soft = self.get_units(source, sr).squeeze(0).cpu().numpy()
122
+ f0_coarse, f0 = get_f0(source.cpu().numpy()[0], soft.shape[0]*2, tran)
123
+ return soft, f0
124
+
125
+ def infer(self, speaker_id, tran, raw_path):
126
+ speaker_id = self.speakers[speaker_id]
127
+ sid = torch.LongTensor([int(speaker_id)]).to(self.device).unsqueeze(0)
128
+ soft, pitch = self.get_unit_pitch(raw_path, tran)
129
+ f0 = torch.FloatTensor(clean_pitch(pitch)).unsqueeze(0).to(self.device)
130
+ stn_tst = torch.FloatTensor(soft)
131
+ with torch.no_grad():
132
+ x_tst = stn_tst.unsqueeze(0).to(self.device)
133
+ x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
134
+ audio = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
135
+ return audio, audio.shape[-1]
136
+
137
+ def inference(self,srcaudio,chara,tran,slice_db):
138
+ sampling_rate, audio = srcaudio
139
+ audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
140
+ if len(audio.shape) > 1:
141
+ audio = librosa.to_mono(audio.transpose(1, 0))
142
+ if sampling_rate != 16000:
143
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
144
+ soundfile.write("tmpwav.wav", audio, 16000, format="wav")
145
+ chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
146
+ audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
147
+ audio = []
148
+ for (slice_tag, data) in audio_data:
149
+ length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
150
+ raw_path = io.BytesIO()
151
+ soundfile.write(raw_path, data, audio_sr, format="wav")
152
+ raw_path.seek(0)
153
+ if slice_tag:
154
+ _audio = np.zeros(length)
155
+ else:
156
+ out_audio, out_sr = self.infer(chara, tran, raw_path)
157
+ _audio = out_audio.cpu().numpy()
158
+ audio.extend(list(_audio))
159
+ audio = (np.array(audio) * 32768.0).astype('int16')
160
+ return (self.hps.data.sampling_rate,audio)
inference/slicer.py ADDED
@@ -0,0 +1,142 @@
1
+ import librosa
2
+ import torch
3
+ import torchaudio
4
+
5
+
6
+ class Slicer:
7
+ def __init__(self,
8
+ sr: int,
9
+ threshold: float = -40.,
10
+ min_length: int = 5000,
11
+ min_interval: int = 300,
12
+ hop_size: int = 20,
13
+ max_sil_kept: int = 5000):
14
+ if not min_length >= min_interval >= hop_size:
15
+ raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
16
+ if not max_sil_kept >= hop_size:
17
+ raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
18
+ min_interval = sr * min_interval / 1000
19
+ self.threshold = 10 ** (threshold / 20.)
20
+ self.hop_size = round(sr * hop_size / 1000)
21
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
22
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
23
+ self.min_interval = round(min_interval / self.hop_size)
24
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
25
+
26
+ def _apply_slice(self, waveform, begin, end):
27
+ if len(waveform.shape) > 1:
28
+ return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
29
+ else:
30
+ return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
31
+
32
+ # @timeit
33
+ def slice(self, waveform):
34
+ if len(waveform.shape) > 1:
35
+ samples = librosa.to_mono(waveform)
36
+ else:
37
+ samples = waveform
38
+ if samples.shape[0] <= self.min_length:
39
+ return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
40
+ rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
41
+ sil_tags = []
42
+ silence_start = None
43
+ clip_start = 0
44
+ for i, rms in enumerate(rms_list):
45
+ # Keep looping while frame is silent.
46
+ if rms < self.threshold:
47
+ # Record start of silent frames.
48
+ if silence_start is None:
49
+ silence_start = i
50
+ continue
51
+ # Keep looping while frame is not silent and silence start has not been recorded.
52
+ if silence_start is None:
53
+ continue
54
+ # Clear recorded silence start if interval is not enough or clip is too short
55
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
56
+ need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
57
+ if not is_leading_silence and not need_slice_middle:
58
+ silence_start = None
59
+ continue
60
+ # Need slicing. Record the range of silent frames to be removed.
61
+ if i - silence_start <= self.max_sil_kept:
62
+ pos = rms_list[silence_start: i + 1].argmin() + silence_start
63
+ if silence_start == 0:
64
+ sil_tags.append((0, pos))
65
+ else:
66
+ sil_tags.append((pos, pos))
67
+ clip_start = pos
68
+ elif i - silence_start <= self.max_sil_kept * 2:
69
+ pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
70
+ pos += i - self.max_sil_kept
71
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
72
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
73
+ if silence_start == 0:
74
+ sil_tags.append((0, pos_r))
75
+ clip_start = pos_r
76
+ else:
77
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
78
+ clip_start = max(pos_r, pos)
79
+ else:
80
+ pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
81
+ pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
82
+ if silence_start == 0:
83
+ sil_tags.append((0, pos_r))
84
+ else:
85
+ sil_tags.append((pos_l, pos_r))
86
+ clip_start = pos_r
87
+ silence_start = None
88
+ # Deal with trailing silence.
89
+ total_frames = rms_list.shape[0]
90
+ if silence_start is not None and total_frames - silence_start >= self.min_interval:
91
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
92
+ pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
93
+ sil_tags.append((pos, total_frames + 1))
94
+ # Apply and return slices.
95
+ if len(sil_tags) == 0:
96
+ return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
97
+ else:
98
+ chunks = []
99
+ # the first silence does not start at the beginning, so prepend the leading voiced segment
100
+ if sil_tags[0][0]:
101
+ chunks.append(
102
+ {"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
103
+ for i in range(0, len(sil_tags)):
104
+ # mark voiced segments (skipped for the first tag)
105
+ if i:
106
+ chunks.append({"slice": False,
107
+ "split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
108
+ # mark every silent segment
109
+ chunks.append({"slice": True,
110
+ "split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
111
+ # the last silence does not reach the end, so append the trailing voiced segment
112
+ if sil_tags[-1][1] * self.hop_size < len(waveform):
113
+ chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
114
+ chunk_dict = {}
115
+ for i in range(len(chunks)):
116
+ chunk_dict[str(i)] = chunks[i]
117
+ return chunk_dict
118
+
119
+
120
+ def cut(audio_path, db_thresh=-30, min_len=5000):
121
+ audio, sr = librosa.load(audio_path, sr=None)
122
+ slicer = Slicer(
123
+ sr=sr,
124
+ threshold=db_thresh,
125
+ min_length=min_len
126
+ )
127
+ chunks = slicer.slice(audio)
128
+ return chunks
129
+
130
+
131
+ def chunks2audio(audio_path, chunks):
132
+ chunks = dict(chunks)
133
+ audio, sr = torchaudio.load(audio_path)
134
+ if len(audio.shape) == 2 and audio.shape[1] >= 2:
135
+ audio = torch.mean(audio, dim=0).unsqueeze(0)
136
+ audio = audio.cpu().numpy()[0]
137
+ result = []
138
+ for k, v in chunks.items():
139
+ tag = v["split_time"].split(",")
140
+ if tag[0] != tag[1]:
141
+ result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
142
+ return result, sr
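
As a quick illustration, the two module-level helpers above can also be used on their own to split a file into voiced and silent chunks (the wav path is a placeholder):

    from inference import slicer

    chunks = slicer.cut("raw/input.wav", db_thresh=-40)          # {"0": {"slice": bool, "split_time": "start,end"}, ...}
    segments, sr = slicer.chunks2audio("raw/input.wav", chunks)  # list of (is_silence, 1-D numpy waveform)
    for is_silence, segment in segments:
        print("silent" if is_silence else "voiced", round(len(segment) / sr, 3), "s")
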
inference_main.py ADDED
@@ -0,0 +1,130 @@
1
+ import io
2
+ import logging
3
+ import time
4
+ from pathlib import Path
5
+
6
+ import librosa
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import soundfile
10
+
11
+ from inference import infer_tool
12
+ from inference import slicer
13
+ from inference.infer_tool import Svc
14
+
15
+ logging.getLogger('numba').setLevel(logging.WARNING)
16
+ chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
17
+
18
+
19
+
20
+ def main():
21
+ import argparse
22
+
23
+ parser = argparse.ArgumentParser(description='sovits4 inference')
24
+
25
+ # required arguments
26
+ parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='model path')
27
+ parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='config file path')
28
+ parser.add_argument('-cl', '--clip', type=float, default=0, help='forced slicing length in seconds; 0 (default) means automatic slicing')
29
+ parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='list of wav file names placed under the raw folder')
30
+ parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='pitch shift in semitones, positive or negative')
31
+ parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='target speaker name(s) for synthesis')
32
+
33
+ # optional arguments
34
+ parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='automatically predict pitch for speech conversion; do not enable when converting singing, or it will be badly out of tune')
35
+ parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='cluster model path; any value works if no cluster model was trained')
36
+ parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='cluster scheme ratio, range 0-1; leave at 0 if no cluster model was trained')
37
+ parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='crossfade length in seconds between two sliced segments; raise it if forced slicing makes the vocals discontinuous, otherwise keep the default 0')
38
+ parser.add_argument('-fmp', '--f0_mean_pooling', type=bool, default=False, help='apply a mean filter (pooling) to F0, which improves some muted notes; note that enabling it slows down inference. Off by default')
39
+
40
+ # arguments that rarely need changing
41
+ parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='default -40; use -30 for noisy audio, -50 to keep breaths in dry vocals')
42
+ parser.add_argument('-d', '--device', type=str, default=None, help='inference device; None selects cpu or gpu automatically')
43
+ parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='noise scale; affects articulation and audio quality, somewhat arcane')
44
+ parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='seconds of silence to pad the inference audio; for unknown reasons artifacts appear at the head and tail, and padding a short silent segment removes them')
45
+ parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='output audio format')
46
+ parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='after automatic slicing the head and tail of each slice are discarded; this sets the proportion of the crossfade length that is kept, range (0, 1]')
47
+
48
+ args = parser.parse_args()
49
+
50
+ svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path)
51
+ infer_tool.mkdir(["raw", "results"])
52
+ clean_names = args.clean_names
53
+ trans = args.trans
54
+ spk_list = args.spk_list
55
+ slice_db = args.slice_db
56
+ wav_format = args.wav_format
57
+ auto_predict_f0 = args.auto_predict_f0
58
+ cluster_infer_ratio = args.cluster_infer_ratio
59
+ noice_scale = args.noice_scale
60
+ pad_seconds = args.pad_seconds
61
+ clip = args.clip
62
+ lg = args.linear_gradient
63
+ lgr = args.linear_gradient_retain
64
+ F0_mean_pooling = args.f0_mean_pooling
65
+
66
+ infer_tool.fill_a_to_b(trans, clean_names)
67
+ for clean_name, tran in zip(clean_names, trans):
68
+ raw_audio_path = f"raw/{clean_name}"
69
+ if "." not in raw_audio_path:
70
+ raw_audio_path += ".wav"
71
+ infer_tool.format_wav(raw_audio_path)
72
+ wav_path = Path(raw_audio_path).with_suffix('.wav')
73
+ chunks = slicer.cut(wav_path, db_thresh=slice_db)
74
+ audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
75
+ per_size = int(clip*audio_sr)
76
+ lg_size = int(lg*audio_sr)
77
+ lg_size_r = int(lg_size*lgr)
78
+ lg_size_c_l = (lg_size-lg_size_r)//2
79
+ lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
80
+ lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
81
+
82
+ for spk in spk_list:
83
+ audio = []
84
+ for (slice_tag, data) in audio_data:
85
+ print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
86
+
87
+ length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
88
+ if slice_tag:
89
+ print('jump empty segment')
90
+ _audio = np.zeros(length)
91
+ audio.extend(list(infer_tool.pad_array(_audio, length)))
92
+ continue
93
+ if per_size != 0:
94
+ datas = infer_tool.split_list_by_n(data, per_size,lg_size)
95
+ else:
96
+ datas = [data]
97
+ for k,dat in enumerate(datas):
98
+ per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
99
+ if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
100
+ # padd
101
+ pad_len = int(audio_sr * pad_seconds)
102
+ dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
103
+ raw_path = io.BytesIO()
104
+ soundfile.write(raw_path, dat, audio_sr, format="wav")
105
+ raw_path.seek(0)
106
+ out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
107
+ cluster_infer_ratio=cluster_infer_ratio,
108
+ auto_predict_f0=auto_predict_f0,
109
+ noice_scale=noice_scale,
110
+ F0_mean_pooling = F0_mean_pooling
111
+ )
112
+ _audio = out_audio.cpu().numpy()
113
+ pad_len = int(svc_model.target_sample * pad_seconds)
114
+ _audio = _audio[pad_len:-pad_len]
115
+ _audio = infer_tool.pad_array(_audio, per_length)
116
+ if lg_size!=0 and k!=0:
117
+ lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
118
+ lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
119
+ lg_pre = lg1*(1-lg)+lg2*lg
120
+ audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
121
+ audio.extend(lg_pre)
122
+ _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
123
+ audio.extend(list(_audio))
124
+ key = "auto" if auto_predict_f0 else f"{tran}key"
125
+ cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
126
+ res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
127
+ soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
128
+
129
+ if __name__ == '__main__':
130
+ main()
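
A typical invocation of the script above might look like the following; the model, config, and speaker reuse the arthur model in this repo, and input.wav is a placeholder that must sit under the raw/ folder:

    python inference_main.py -m "models/arthur/arthur.pth" -c "models/arthur/config_arthur.json" -n "input.wav" -t 0 -s "arthur" -wf wav
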
models.py ADDED
@@ -0,0 +1,420 @@
1
+ import copy
2
+ import math
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ import modules.attentions as attentions
8
+ import modules.commons as commons
9
+ import modules.modules as modules
10
+
11
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
12
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
13
+
14
+ import utils
15
+ from modules.commons import init_weights, get_padding
16
+ from vdecoder.hifigan.models import Generator
17
+ from utils import f0_to_coarse
18
+
19
+ class ResidualCouplingBlock(nn.Module):
20
+ def __init__(self,
21
+ channels,
22
+ hidden_channels,
23
+ kernel_size,
24
+ dilation_rate,
25
+ n_layers,
26
+ n_flows=4,
27
+ gin_channels=0):
28
+ super().__init__()
29
+ self.channels = channels
30
+ self.hidden_channels = hidden_channels
31
+ self.kernel_size = kernel_size
32
+ self.dilation_rate = dilation_rate
33
+ self.n_layers = n_layers
34
+ self.n_flows = n_flows
35
+ self.gin_channels = gin_channels
36
+
37
+ self.flows = nn.ModuleList()
38
+ for i in range(n_flows):
39
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
40
+ self.flows.append(modules.Flip())
41
+
42
+ def forward(self, x, x_mask, g=None, reverse=False):
43
+ if not reverse:
44
+ for flow in self.flows:
45
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
46
+ else:
47
+ for flow in reversed(self.flows):
48
+ x = flow(x, x_mask, g=g, reverse=reverse)
49
+ return x
50
+
51
+
52
+ class Encoder(nn.Module):
53
+ def __init__(self,
54
+ in_channels,
55
+ out_channels,
56
+ hidden_channels,
57
+ kernel_size,
58
+ dilation_rate,
59
+ n_layers,
60
+ gin_channels=0):
61
+ super().__init__()
62
+ self.in_channels = in_channels
63
+ self.out_channels = out_channels
64
+ self.hidden_channels = hidden_channels
65
+ self.kernel_size = kernel_size
66
+ self.dilation_rate = dilation_rate
67
+ self.n_layers = n_layers
68
+ self.gin_channels = gin_channels
69
+
70
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
71
+ self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
72
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
73
+
74
+ def forward(self, x, x_lengths, g=None):
75
+ # print(x.shape,x_lengths.shape)
76
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
77
+ x = self.pre(x) * x_mask
78
+ x = self.enc(x, x_mask, g=g)
79
+ stats = self.proj(x) * x_mask
80
+ m, logs = torch.split(stats, self.out_channels, dim=1)
81
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
82
+ return z, m, logs, x_mask
83
+
84
+
85
+ class TextEncoder(nn.Module):
86
+ def __init__(self,
87
+ out_channels,
88
+ hidden_channels,
89
+ kernel_size,
90
+ n_layers,
91
+ gin_channels=0,
92
+ filter_channels=None,
93
+ n_heads=None,
94
+ p_dropout=None):
95
+ super().__init__()
96
+ self.out_channels = out_channels
97
+ self.hidden_channels = hidden_channels
98
+ self.kernel_size = kernel_size
99
+ self.n_layers = n_layers
100
+ self.gin_channels = gin_channels
101
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
102
+ self.f0_emb = nn.Embedding(256, hidden_channels)
103
+
104
+ self.enc_ = attentions.Encoder(
105
+ hidden_channels,
106
+ filter_channels,
107
+ n_heads,
108
+ n_layers,
109
+ kernel_size,
110
+ p_dropout)
111
+
112
+ def forward(self, x, x_mask, f0=None, noice_scale=1):
113
+ x = x + self.f0_emb(f0).transpose(1,2)
114
+ x = self.enc_(x * x_mask, x_mask)
115
+ stats = self.proj(x) * x_mask
116
+ m, logs = torch.split(stats, self.out_channels, dim=1)
117
+ z = (m + torch.randn_like(m) * torch.exp(logs) * noice_scale) * x_mask
118
+
119
+ return z, m, logs, x_mask
120
+
121
+
122
+
123
+ class DiscriminatorP(torch.nn.Module):
124
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
125
+ super(DiscriminatorP, self).__init__()
126
+ self.period = period
127
+ self.use_spectral_norm = use_spectral_norm
128
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
129
+ self.convs = nn.ModuleList([
130
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
131
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
132
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
133
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
134
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
135
+ ])
136
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
137
+
138
+ def forward(self, x):
139
+ fmap = []
140
+
141
+ # 1d to 2d
142
+ b, c, t = x.shape
143
+ if t % self.period != 0: # pad first
144
+ n_pad = self.period - (t % self.period)
145
+ x = F.pad(x, (0, n_pad), "reflect")
146
+ t = t + n_pad
147
+ x = x.view(b, c, t // self.period, self.period)
148
+
149
+ for l in self.convs:
150
+ x = l(x)
151
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
152
+ fmap.append(x)
153
+ x = self.conv_post(x)
154
+ fmap.append(x)
155
+ x = torch.flatten(x, 1, -1)
156
+
157
+ return x, fmap
158
+
159
+
160
+ class DiscriminatorS(torch.nn.Module):
161
+ def __init__(self, use_spectral_norm=False):
162
+ super(DiscriminatorS, self).__init__()
163
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
164
+ self.convs = nn.ModuleList([
165
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
166
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
167
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
168
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
169
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
170
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
171
+ ])
172
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
173
+
174
+ def forward(self, x):
175
+ fmap = []
176
+
177
+ for l in self.convs:
178
+ x = l(x)
179
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
180
+ fmap.append(x)
181
+ x = self.conv_post(x)
182
+ fmap.append(x)
183
+ x = torch.flatten(x, 1, -1)
184
+
185
+ return x, fmap
186
+
187
+
188
+ class MultiPeriodDiscriminator(torch.nn.Module):
189
+ def __init__(self, use_spectral_norm=False):
190
+ super(MultiPeriodDiscriminator, self).__init__()
191
+ periods = [2,3,5,7,11]
192
+
193
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
194
+ discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
195
+ self.discriminators = nn.ModuleList(discs)
196
+
197
+ def forward(self, y, y_hat):
198
+ y_d_rs = []
199
+ y_d_gs = []
200
+ fmap_rs = []
201
+ fmap_gs = []
202
+ for i, d in enumerate(self.discriminators):
203
+ y_d_r, fmap_r = d(y)
204
+ y_d_g, fmap_g = d(y_hat)
205
+ y_d_rs.append(y_d_r)
206
+ y_d_gs.append(y_d_g)
207
+ fmap_rs.append(fmap_r)
208
+ fmap_gs.append(fmap_g)
209
+
210
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
211
+
212
+
213
+ class SpeakerEncoder(torch.nn.Module):
214
+ def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
215
+ super(SpeakerEncoder, self).__init__()
216
+ self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
217
+ self.linear = nn.Linear(model_hidden_size, model_embedding_size)
218
+ self.relu = nn.ReLU()
219
+
220
+ def forward(self, mels):
221
+ self.lstm.flatten_parameters()
222
+ _, (hidden, _) = self.lstm(mels)
223
+ embeds_raw = self.relu(self.linear(hidden[-1]))
224
+ return embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
225
+
226
+ def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
227
+ mel_slices = []
228
+ for i in range(0, total_frames-partial_frames, partial_hop):
229
+ mel_range = torch.arange(i, i+partial_frames)
230
+ mel_slices.append(mel_range)
231
+
232
+ return mel_slices
233
+
234
+ def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
235
+ mel_len = mel.size(1)
236
+ last_mel = mel[:,-partial_frames:]
237
+
238
+ if mel_len > partial_frames:
239
+ mel_slices = self.compute_partial_slices(mel_len, partial_frames, partial_hop)
240
+ mels = list(mel[:,s] for s in mel_slices)
241
+ mels.append(last_mel)
242
+ mels = torch.stack(tuple(mels), 0).squeeze(1)
243
+
244
+ with torch.no_grad():
245
+ partial_embeds = self(mels)
246
+ embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
247
+ #embed = embed / torch.linalg.norm(embed, 2)
248
+ else:
249
+ with torch.no_grad():
250
+ embed = self(last_mel)
251
+
252
+ return embed
253
+
254
+ class F0Decoder(nn.Module):
255
+ def __init__(self,
256
+ out_channels,
257
+ hidden_channels,
258
+ filter_channels,
259
+ n_heads,
260
+ n_layers,
261
+ kernel_size,
262
+ p_dropout,
263
+ spk_channels=0):
264
+ super().__init__()
265
+ self.out_channels = out_channels
266
+ self.hidden_channels = hidden_channels
267
+ self.filter_channels = filter_channels
268
+ self.n_heads = n_heads
269
+ self.n_layers = n_layers
270
+ self.kernel_size = kernel_size
271
+ self.p_dropout = p_dropout
272
+ self.spk_channels = spk_channels
273
+
274
+ self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
275
+ self.decoder = attentions.FFT(
276
+ hidden_channels,
277
+ filter_channels,
278
+ n_heads,
279
+ n_layers,
280
+ kernel_size,
281
+ p_dropout)
282
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
283
+ self.f0_prenet = nn.Conv1d(1, hidden_channels , 3, padding=1)
284
+ self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
285
+
286
+ def forward(self, x, norm_f0, x_mask, spk_emb=None):
287
+ x = torch.detach(x)
288
+ if (spk_emb is not None):
289
+ x = x + self.cond(spk_emb)
290
+ x += self.f0_prenet(norm_f0)
291
+ x = self.prenet(x) * x_mask
292
+ x = self.decoder(x * x_mask, x_mask)
293
+ x = self.proj(x) * x_mask
294
+ return x
295
+
296
+
297
+ class SynthesizerTrn(nn.Module):
298
+ """
299
+ Synthesizer for Training
300
+ """
301
+
302
+ def __init__(self,
303
+ spec_channels,
304
+ segment_size,
305
+ inter_channels,
306
+ hidden_channels,
307
+ filter_channels,
308
+ n_heads,
309
+ n_layers,
310
+ kernel_size,
311
+ p_dropout,
312
+ resblock,
313
+ resblock_kernel_sizes,
314
+ resblock_dilation_sizes,
315
+ upsample_rates,
316
+ upsample_initial_channel,
317
+ upsample_kernel_sizes,
318
+ gin_channels,
319
+ ssl_dim,
320
+ n_speakers,
321
+ sampling_rate=44100,
322
+ **kwargs):
323
+
324
+ super().__init__()
325
+ self.spec_channels = spec_channels
326
+ self.inter_channels = inter_channels
327
+ self.hidden_channels = hidden_channels
328
+ self.filter_channels = filter_channels
329
+ self.n_heads = n_heads
330
+ self.n_layers = n_layers
331
+ self.kernel_size = kernel_size
332
+ self.p_dropout = p_dropout
333
+ self.resblock = resblock
334
+ self.resblock_kernel_sizes = resblock_kernel_sizes
335
+ self.resblock_dilation_sizes = resblock_dilation_sizes
336
+ self.upsample_rates = upsample_rates
337
+ self.upsample_initial_channel = upsample_initial_channel
338
+ self.upsample_kernel_sizes = upsample_kernel_sizes
339
+ self.segment_size = segment_size
340
+ self.gin_channels = gin_channels
341
+ self.ssl_dim = ssl_dim
342
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
343
+
344
+ self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
345
+
346
+ self.enc_p = TextEncoder(
347
+ inter_channels,
348
+ hidden_channels,
349
+ filter_channels=filter_channels,
350
+ n_heads=n_heads,
351
+ n_layers=n_layers,
352
+ kernel_size=kernel_size,
353
+ p_dropout=p_dropout
354
+ )
355
+ hps = {
356
+ "sampling_rate": sampling_rate,
357
+ "inter_channels": inter_channels,
358
+ "resblock": resblock,
359
+ "resblock_kernel_sizes": resblock_kernel_sizes,
360
+ "resblock_dilation_sizes": resblock_dilation_sizes,
361
+ "upsample_rates": upsample_rates,
362
+ "upsample_initial_channel": upsample_initial_channel,
363
+ "upsample_kernel_sizes": upsample_kernel_sizes,
364
+ "gin_channels": gin_channels,
365
+ }
366
+ self.dec = Generator(h=hps)
367
+ self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
368
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
369
+ self.f0_decoder = F0Decoder(
370
+ 1,
371
+ hidden_channels,
372
+ filter_channels,
373
+ n_heads,
374
+ n_layers,
375
+ kernel_size,
376
+ p_dropout,
377
+ spk_channels=gin_channels
378
+ )
379
+ self.emb_uv = nn.Embedding(2, hidden_channels)
380
+
381
+ def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
382
+ g = self.emb_g(g).transpose(1,2)
383
+ # ssl prenet
384
+ x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
385
+ x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2)
386
+
387
+ # f0 predict
388
+ lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
389
+ norm_lf0 = utils.normalize_f0(lf0, x_mask, uv)
390
+ pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
391
+
392
+ # encoder
393
+ z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=f0_to_coarse(f0))
394
+ z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)
395
+
396
+ # flow
397
+ z_p = self.flow(z, spec_mask, g=g)
398
+ z_slice, pitch_slice, ids_slice = commons.rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)
399
+
400
+ # nsf decoder
401
+ o = self.dec(z_slice, g=g, f0=pitch_slice)
402
+
403
+ return o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q), pred_lf0, norm_lf0, lf0
404
+
405
+ def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
406
+ c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
407
+ g = self.emb_g(g).transpose(1,2)
408
+ x_mask = torch.unsqueeze(commons.sequence_mask(c_lengths, c.size(2)), 1).to(c.dtype)
409
+ x = self.pre(c) * x_mask + self.emb_uv(uv.long()).transpose(1,2)
410
+
411
+ if predict_f0:
412
+ lf0 = 2595. * torch.log10(1. + f0.unsqueeze(1) / 700.) / 500
413
+ norm_lf0 = utils.normalize_f0(lf0, x_mask, uv, random_scale=False)
414
+ pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
415
+ f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)
416
+
417
+ z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
418
+ z = self.flow(z_p, c_mask, g=g, reverse=True)
419
+ o = self.dec(z * c_mask, g=g, f0=f0)
420
+ return o
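
As a rough guide to what SynthesizerTrn.infer expects at inference time (mirroring how Svc.infer calls it above; the shapes assume the ssl_dim=256 configs in this repo, while the vec768l12 configs use 768):

    import torch

    frames = 200                             # number of content/F0 frames
    c = torch.randn(1, 256, frames)          # HuBERT content features: (batch, ssl_dim, frames)
    f0 = torch.full((1, frames), 220.0)      # frame-level F0 in Hz
    uv = torch.ones(1, frames)               # voiced/unvoiced flags per frame
    sid = torch.LongTensor([[0]])            # speaker id from the config's spk map
    # with net_g = SynthesizerTrn(...) restored via utils.load_checkpoint, as in Svc.load_model:
    # audio = net_g.infer(c, f0=f0, uv=uv, g=sid, noice_scale=0.4)[0, 0]
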
models/arthur/arthur.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70fc73a4bf772cbdabb3703d45a466f54a45e446e869dd655038bbb41784e8ca
3
+ size 180653938
models/arthur/config_arthur.json ADDED
@@ -0,0 +1,93 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 0
25
+ },
26
+ "data": {
27
+ "training_files": "filelists/train.txt",
28
+ "validation_files": "filelists/val.txt",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 80,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": 22050
37
+ },
38
+ "model": {
39
+ "inter_channels": 192,
40
+ "hidden_channels": 192,
41
+ "filter_channels": 768,
42
+ "n_heads": 2,
43
+ "n_layers": 6,
44
+ "kernel_size": 3,
45
+ "p_dropout": 0.1,
46
+ "resblock": "1",
47
+ "resblock_kernel_sizes": [
48
+ 3,
49
+ 7,
50
+ 11
51
+ ],
52
+ "resblock_dilation_sizes": [
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ],
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ]
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_initial_channel": 512,
77
+ "upsample_kernel_sizes": [
78
+ 16,
79
+ 16,
80
+ 4,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256,
87
+ "ssl_dim": 256,
88
+ "n_speakers": 1
89
+ },
90
+ "spk": {
91
+ "arthur": 0
92
+ }
93
+ }
models/carl/carl.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa229fa4e8de8f14b3b5cfd4b21552e7e8139656cb1ac617ff83d79aff2f241f
3
+ size 180665609
models/carl/config_carl.json ADDED
@@ -0,0 +1,93 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 20
25
+ },
26
+ "data": {
27
+ "training_files": "filelists/train.txt",
28
+ "validation_files": "filelists/val.txt",
29
+ "max_wav_value": 32768.0,
30
+ "sampling_rate": 44100,
31
+ "filter_length": 2048,
32
+ "hop_length": 512,
33
+ "win_length": 2048,
34
+ "n_mel_channels": 80,
35
+ "mel_fmin": 0.0,
36
+ "mel_fmax": 22050
37
+ },
38
+ "model": {
39
+ "inter_channels": 192,
40
+ "hidden_channels": 192,
41
+ "filter_channels": 768,
42
+ "n_heads": 2,
43
+ "n_layers": 6,
44
+ "kernel_size": 3,
45
+ "p_dropout": 0.1,
46
+ "resblock": "1",
47
+ "resblock_kernel_sizes": [
48
+ 3,
49
+ 7,
50
+ 11
51
+ ],
52
+ "resblock_dilation_sizes": [
53
+ [
54
+ 1,
55
+ 3,
56
+ 5
57
+ ],
58
+ [
59
+ 1,
60
+ 3,
61
+ 5
62
+ ],
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ]
68
+ ],
69
+ "upsample_rates": [
70
+ 8,
71
+ 8,
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_initial_channel": 512,
77
+ "upsample_kernel_sizes": [
78
+ 16,
79
+ 16,
80
+ 4,
81
+ 4,
82
+ 4
83
+ ],
84
+ "n_layers_q": 3,
85
+ "use_spectral_norm": false,
86
+ "gin_channels": 256,
87
+ "ssl_dim": 256,
88
+ "n_speakers": 1
89
+ },
90
+ "spk": {
91
+ "carl": 0
92
+ }
93
+ }
models/cesar/cesar.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:371b46e33961609ce9ff7a3d22e11bb7e839e5c1ad8c0105a0ffb7e31c7832d6
3
+ size 209238367
models/cesar/config_cesar.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 2000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 10,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "cesar": 0
98
+ }
99
+ }
models/katalina/config_katalina.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 12,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "katalina": 0
98
+ }
99
+ }
models/katalina/katalina.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf2d89ab9a7862128c62a4abc6c55fc7c6ef7b1c9c92cffde0d1fa43ba0bcadc
3
+ size 209238367
models/kendl/config_kendl.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "kendl": 0
98
+ }
99
+ }
models/kendl/kendl.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d122bdbda24bebeb25b88ad9f0c6ccdf454995bd06435ddc0bcce924505336ae
3
+ size 209189561
models/ogloc/config_ogloc.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 20000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 10,
25
+ "all_in_mem": false,
26
+ "vol_aug": false
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": false
95
+ },
96
+ "spk": {
97
+ "ogloc": 0
98
+ }
99
+ }
models/ogloc/kmeans_ogloc.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dc2d3c53272a0f1caeee0863c47484ad83592a2780246b98bda77284440c95b
3
+ size 31339961
models/ogloc/ogloc.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb14244a02966c812cea2462fb6ce60bf7792199ce1d80e1307285f4ae36c60a
3
+ size 209187585
models/pulaski/config_pulaski.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "pulaski": 0
98
+ }
99
+ }
models/pulaski/pulaski.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:720f78d3a854892e6fc2b4e1f9869cf8cff0bb331cf4f6e55eaf60c3a34a7ae9
3
+ size 209238367
models/ryder/config_ryder.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 30000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 10,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "ryder": 0
98
+ }
99
+ }
models/ryder/ryder.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19aaac8e141faf1b4309f20e78705c6e6c47f540c78a1ebf8afbf89b36b5de5f
3
+ size 209189561
models/smoke/config_smoke.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "smoke": 0
98
+ }
99
+ }
models/smoke/smoke.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c302ae6ca088cedda29c8a8e10cec888bd15be504f6ae0e8b94cc81aca98bab7
3
+ size 209189561
models/sweet/config_sweet.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "sweet": 0
98
+ }
99
+ }
models/sweet/sweet.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d79642f4d1ee8b7381c2067178c8332e25195e18ad37ca0447fdc9dc1c5eb1a
3
+ size 209189561
models/tenpenny/config_tenpenny.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 12,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "tenpenny": 0
98
+ }
99
+ }
models/tenpenny/tenpenny.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc76ce97c91e042abce0d454d4e140237287e170440b54109534f4404f32a73
3
+ size 209238367
models/tommy/config_tommy.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 100000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 30,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "tommy": 0
98
+ }
99
+ }
models/tommy/tommy.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43045cbe7f62f212f93afb1ed9b1c7a05b778440e1ae227234ec984e4719a527
3
+ size 209189561
models/tomori/config_tomori.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 3200,
5
+ "seed": 1234,
6
+ "epochs": 30000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 12,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "tomori": 0
98
+ }
99
+ }
models/tomori/tomori.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e915470eed2322969ae9dfadb51393f2f5744092690c11149de0ed65f04413f
3
+ size 209238367
models/tomori/tomori_index.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a069b9dfacaec64cbfa500576134535cac59b9fc08087f52c2cbeae90db9076
3
+ size 189669700
models/torino/config_torino.json ADDED
@@ -0,0 +1,99 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 800,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 6,
14
+ "fp16_run": false,
15
+ "lr_decay": 0.999875,
16
+ "segment_size": 10240,
17
+ "init_lr_ratio": 1,
18
+ "warmup_epochs": 0,
19
+ "c_mel": 45,
20
+ "c_kl": 1.0,
21
+ "use_sr": true,
22
+ "max_speclen": 512,
23
+ "port": "8001",
24
+ "keep_ckpts": 3,
25
+ "all_in_mem": false,
26
+ "vol_aug": true
27
+ },
28
+ "data": {
29
+ "training_files": "filelists/train.txt",
30
+ "validation_files": "filelists/val.txt",
31
+ "max_wav_value": 32768.0,
32
+ "sampling_rate": 44100,
33
+ "filter_length": 2048,
34
+ "hop_length": 512,
35
+ "win_length": 2048,
36
+ "n_mel_channels": 80,
37
+ "mel_fmin": 0.0,
38
+ "mel_fmax": 22050
39
+ },
40
+ "model": {
41
+ "inter_channels": 192,
42
+ "hidden_channels": 192,
43
+ "filter_channels": 768,
44
+ "n_heads": 2,
45
+ "n_layers": 6,
46
+ "kernel_size": 3,
47
+ "p_dropout": 0.1,
48
+ "resblock": "1",
49
+ "resblock_kernel_sizes": [
50
+ 3,
51
+ 7,
52
+ 11
53
+ ],
54
+ "resblock_dilation_sizes": [
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ],
65
+ [
66
+ 1,
67
+ 3,
68
+ 5
69
+ ]
70
+ ],
71
+ "upsample_rates": [
72
+ 8,
73
+ 8,
74
+ 2,
75
+ 2,
76
+ 2
77
+ ],
78
+ "upsample_initial_channel": 512,
79
+ "upsample_kernel_sizes": [
80
+ 16,
81
+ 16,
82
+ 4,
83
+ 4,
84
+ 4
85
+ ],
86
+ "n_layers_q": 3,
87
+ "use_spectral_norm": false,
88
+ "gin_channels": 768,
89
+ "ssl_dim": 768,
90
+ "n_speakers": 1,
91
+ "vocoder_name": "nsf-hifigan",
92
+ "speech_encoder": "vec768l12",
93
+ "speaker_embedding": false,
94
+ "vol_embedding": true
95
+ },
96
+ "spk": {
97
+ "torino": 0
98
+ }
99
+ }