NeoPy committed · Commit e75ebf5 · verified · 1 parent: e815924

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .github/FUNDING.yml +3 -0
  2. .github/PULL_REQUEST_TEMPLATE.md +37 -0
  3. .github/workflows/code_formatter.yml +53 -0
  4. .github/workflows/python-app.yml +55 -0
  5. .gitignore +388 -0
  6. Advanced-RVC.ipynb +494 -0
  7. LICENSE +22 -0
  8. README.md +64 -12
  9. assets/config.json +5 -0
  10. assets/themes/loadThemes.py +119 -0
  11. assets/themes/themes_list.json +24 -0
  12. install.bat +87 -0
  13. models.py +5 -0
  14. requirements.txt +33 -0
  15. rvc/configs/config.py +179 -0
  16. rvc/configs/v1/32000.json +47 -0
  17. rvc/configs/v1/40000.json +47 -0
  18. rvc/configs/v1/48000.json +47 -0
  19. rvc/configs/v2/32000.json +43 -0
  20. rvc/configs/v2/40000.json +43 -0
  21. rvc/configs/v2/48000.json +43 -0
  22. rvc/infer/infer.py +495 -0
  23. rvc/infer/pipeline.py +708 -0
  24. rvc/lib/algorithm/__init__.py +0 -0
  25. rvc/lib/algorithm/attentions.py +243 -0
  26. rvc/lib/algorithm/commons.py +207 -0
  27. rvc/lib/algorithm/discriminators.py +160 -0
  28. rvc/lib/algorithm/encoders.py +218 -0
  29. rvc/lib/algorithm/generators.py +231 -0
  30. rvc/lib/algorithm/modules.py +124 -0
  31. rvc/lib/algorithm/normalization.py +31 -0
  32. rvc/lib/algorithm/nsf.py +196 -0
  33. rvc/lib/algorithm/residuals.py +250 -0
  34. rvc/lib/algorithm/synthesizers.py +237 -0
  35. rvc/lib/predictors/F0Extractor.py +100 -0
  36. rvc/lib/predictors/FCPE.py +920 -0
  37. rvc/lib/predictors/RMVPE.py +560 -0
  38. rvc/lib/tools/analyzer.py +76 -0
  39. rvc/lib/tools/gdown.py +354 -0
  40. rvc/lib/tools/launch_tensorboard.py +21 -0
  41. rvc/lib/tools/model_download.py +385 -0
  42. rvc/lib/tools/prerequisites_download.py +104 -0
  43. rvc/lib/tools/pretrained_selector.py +63 -0
  44. rvc/lib/tools/split_audio.py +56 -0
  45. rvc/lib/tools/tts.py +29 -0
  46. rvc/lib/tools/tts_voices.json +0 -0
  47. rvc/lib/utils.py +137 -0
  48. rvc/lib/zluda.py +43 -0
  49. scrpt.py +1897 -0
  50. tabs/download/download.py +111 -0
.github/FUNDING.yml ADDED
@@ -0,0 +1,3 @@
+
+ github: TheNeodev, ArkanDash
+ ko_fi: arkandash
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,37 @@
+ <!--- Provide a general summary of your changes in the Title above -->
+
+ ## Description
+
+ <!--- Describe your changes in detail -->
+
+ ## Motivation and Context
+
+ <!--- Why is this change required? What problem does it solve? -->
+ <!--- If it fixes an open issue, please link to the issue here. -->
+
+ ## How has this been tested?
+
+ <!--- Please describe in detail how you tested your changes. -->
+ <!--- Include details of your testing environment, tests ran to see how -->
+ <!--- your change affects other areas of the code, etc. -->
+
+ ## Screenshots (if appropriate):
+
+ ## Types of changes
+
+ <!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->
+
+ - [ ] Bug fix (non-breaking change which fixes an issue)
+ - [ ] New feature (non-breaking change which adds functionality)
+ - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
+
+ ## Checklist:
+
+ <!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
+ <!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->
+
+ - [ ] My code follows the code style of this project.
+ - [ ] My change requires a change to the documentation.
+ - [ ] I have updated the documentation accordingly.
+ - [ ] I have added tests to cover my changes.
+ - [ ] All new and existing tests passed.
.github/workflows/code_formatter.yml ADDED
@@ -0,0 +1,53 @@
+ name: Code Formatter
+
+ on:
+ push:
+ branches:
+ - main
+
+ jobs:
+ push_format:
+ runs-on: ubuntu-latest
+
+ permissions:
+ contents: write
+ pull-requests: write
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ ref: ${{github.ref_name}}
+
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install Black and autoflake
+ run: pip install "black[jupyter]" autoflake
+
+ - name: Run autoflake
+ run: autoflake --in-place --recursive .
+
+ - name: Run Black
+ run: black . --exclude=".*\.ipynb$"
+
+ - name: Commit Back
+ continue-on-error: true
+ id: commitback
+ run: |
+ git config --local user.email "github-actions[bot]@users.noreply.github.com"
+ git config --local user.name "github-actions[bot]"
+ git add --all
+ git commit -m "chore(format): run black on ${{github.ref_name}}"
+
+ - name: Create Pull Request
+ if: steps.commitback.outcome == 'success'
+ continue-on-error: true
+ uses: peter-evans/create-pull-request@v5
+ with:
+ delete-branch: true
+ body: "Automatically apply code formatter change"
+ title: "chore(format): run black on ${{github.ref_name}}"
+ commit-message: "chore(format): run black on ${{github.ref_name}}"
+ branch: formatter/${{github.ref_name}}
.github/workflows/python-app.yml ADDED
@@ -0,0 +1,55 @@
+ # This workflow will install Python dependencies, run tests and lint with a single version of Python
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+
+ name: Python application
+
+ on:
+ push:
+ branches: [ "master" ]
+ pull_request:
+ branches: [ "master" ]
+
+ permissions:
+ contents: read
+
+ jobs:
+ build:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v3
+
+ - name: Set up Python
+ uses: actions/setup-python@v4
+ with:
+ python-version: "3.10"
+
+ - name: Install torch, torchvision, torchaudio
+ run: |
+ pip install torch torchvision torchaudio
+
+ - name: Install FFmpeg
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y ffmpeg
+
+ - name: Install dependencies from requirements.txt
+ run: |
+ python -m pip install pip==24.0
+ pip install -r requirements.txt
+
+ - name: Download Hubert & RMVPE
+ run: |
+ sudo apt-get install -qq -y aria2
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /home/runner/work/Advanced-RVC-Inference/Advanced-RVC-Inference -o hubert_base.pt
+ aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d /home/runner/work/Advanced-RVC-Inference/Advanced-RVC-Inference -o rmvpe.pt
+
+ - name: Test application
+ run: |
+ python infer.py &
+ sleep 180
+
+ - name: Exit application
+ run: |
+ pkill -f infer.py || true
.gitignore ADDED
@@ -0,0 +1,388 @@
+ ## Ignore Visual Studio temporary files, build results, and
+ ## files generated by popular Visual Studio add-ons.
+ ##
+ ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+ # User-specific files
+ *.rsuser
+ *.suo
+ *.user
+ *.userosscache
+ *.sln.docstates
+
+ # User-specific files (MonoDevelop/Xamarin Studio)
+ *.userprefs
+
+ # Mono auto generated files
+ mono_crash.*
+
+ # Build results
+ [Dd]ebug/
+ [Dd]ebugPublic/
+ [Rr]elease/
+ [Rr]eleases/
+ x64/
+ x86/
+ [Ww][Ii][Nn]32/
+ [Aa][Rr][Mm]/
+ [Aa][Rr][Mm]64/
+ bld/
+ [Bb]in/
+ [Oo]bj/
+ [Oo]ut/
+ [Ll]og/
+ [Ll]ogs/
+ infer_pack\__pycache__
+ # Visual Studio 2015/2017 cache/options directory
+ .vs/
+ # Uncomment if you have tasks that create the project's static files in wwwroot
+ #wwwroot/
+
+ # Visual Studio 2017 auto generated files
+ Generated\ Files/
+
+ # MSTest test Results
+ [Tt]est[Rr]esult*/
+ [Bb]uild[Ll]og.*
+
+ # NUnit
+ *.VisualState.xml
+ TestResult.xml
+ nunit-*.xml
+
+ # Build Results of an ATL Project
+ [Dd]ebugPS/
+ [Rr]eleasePS/
+ dlldata.c
+
+ # Benchmark Results
+ BenchmarkDotNet.Artifacts/
+
+ # .NET Core
+ project.lock.json
+ project.fragment.lock.json
+ artifacts/
+
+ # ASP.NET Scaffolding
+ ScaffoldingReadMe.txt
+
+ # StyleCop
+ StyleCopReport.xml
+
+ # Files built by Visual Studio
+ *_i.c
+ *_p.c
+ *_h.h
+ *.ilk
+ *.meta
+ *.obj
+ *.iobj
+ *.pch
+ *.pdb
+ *.ipdb
+ *.pgc
+ *.pgd
+ *.rsp
+ *.sbr
+ *.tlb
+ *.tli
+ *.tlh
+ *.tmp
+ *.tmp_proj
+ *_wpftmp.csproj
+ *.log
+ *.vspscc
+ *.vssscc
+ .builds
+ *.pidb
+ *.svclog
+ *.scc
+
+ # Chutzpah Test files
+ _Chutzpah*
+
+ # Visual C++ cache files
+ ipch/
+ *.aps
+ *.ncb
+ *.opendb
+ *.opensdf
+ *.sdf
+ *.cachefile
+ *.VC.db
+ *.VC.VC.opendb
+
+ # Visual Studio profiler
+ *.psess
+ *.vsp
+ *.vspx
+ *.sap
+
+ # Visual Studio Trace Files
+ *.e2e
+
+ # TFS 2012 Local Workspace
+ $tf/
+
+ # Guidance Automation Toolkit
+ *.gpState
+
+ # ReSharper is a .NET coding add-in
+ _ReSharper*/
+ *.[Rr]e[Ss]harper
+ *.DotSettings.user
+
+ # TeamCity is a build add-in
+ _TeamCity*
+
+ # DotCover is a Code Coverage Tool
+ *.dotCover
+
+ # AxoCover is a Code Coverage Tool
+ .axoCover/*
+ !.axoCover/settings.json
+
+ # Coverlet is a free, cross platform Code Coverage Tool
+ coverage*.json
+ coverage*.xml
+ coverage*.info
+
+ # Visual Studio code coverage results
+ *.coverage
+ *.coveragexml
+
+ # NCrunch
+ _NCrunch_*
+ .*crunch*.local.xml
+ nCrunchTemp_*
+
+ # MightyMoose
+ *.mm.*
+ AutoTest.Net/
+
+ # Web workbench (sass)
+ .sass-cache/
+
+ # Installshield output folder
+ [Ee]xpress/
+
+ # DocProject is a documentation generator add-in
+ DocProject/buildhelp/
+ DocProject/Help/*.HxT
+ DocProject/Help/*.HxC
+ DocProject/Help/*.hhc
+ DocProject/Help/*.hhk
+ DocProject/Help/*.hhp
+ DocProject/Help/Html2
+ DocProject/Help/html
+
+ # Click-Once directory
+ publish/
+
+ # Publish Web Output
+ *.[Pp]ublish.xml
+ *.azurePubxml
+ # Note: Comment the next line if you want to checkin your web deploy settings,
+ # but database connection strings (with potential passwords) will be unencrypted
+ *.pubxml
+ *.publishproj
+
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
+ # checkin your Azure Web App publish settings, but sensitive information contained
+ # in these scripts will be unencrypted
+ PublishScripts/
+
+ # NuGet Packages
+ *.nupkg
+ # NuGet Symbol Packages
+ *.snupkg
+ # The packages folder can be ignored because of Package Restore
+ **/[Pp]ackages/*
+ # except build/, which is used as an MSBuild target.
+ !**/[Pp]ackages/build/
+ # Uncomment if necessary however generally it will be regenerated when needed
+ #!**/[Pp]ackages/repositories.config
+ # NuGet v3's project.json files produces more ignorable files
+ *.nuget.props
+ *.nuget.targets
+
+ # Microsoft Azure Build Output
+ csx/
+ *.build.csdef
+
+ # Microsoft Azure Emulator
+ ecf/
+ rcf/
+
+ # Windows Store app package directories and files
+ AppPackages/
+ BundleArtifacts/
+ Package.StoreAssociation.xml
+ _pkginfo.txt
+ *.appx
+ *.appxbundle
+ *.appxupload
+
+ # Visual Studio cache files
+ # files ending in .cache can be ignored
+ *.[Cc]ache
+ # but keep track of directories ending in .cache
+ !?*.[Cc]ache/
+
+ # Others
+ ClientBin/
+ ~$*
+ *~
+ *.dbmdl
+ *.dbproj.schemaview
+ *.jfm
+ *.pfx
+ *.publishsettings
+ orleans.codegen.cs
+
+ # Including strong name files can present a security risk
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
+ #*.snk
+
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+ #bower_components/
+
+ # RIA/Silverlight projects
+ Generated_Code/
+
+ # Backup & report files from converting an old project file
+ # to a newer Visual Studio version. Backup files are not needed,
+ # because we have git ;-)
+ _UpgradeReport_Files/
+ Backup*/
+ UpgradeLog*.XML
+ UpgradeLog*.htm
+ ServiceFabricBackup/
+ *.rptproj.bak
+
+ # SQL Server files
+ *.mdf
+ *.ldf
+ *.ndf
+
+ # Business Intelligence projects
+ *.rdl.data
+ *.bim.layout
+ *.bim_*.settings
+ *.rptproj.rsuser
+ *- [Bb]ackup.rdl
+ *- [Bb]ackup ([0-9]).rdl
+ *- [Bb]ackup ([0-9][0-9]).rdl
+
+ # Microsoft Fakes
+ FakesAssemblies/
+
+ # GhostDoc plugin setting file
+ *.GhostDoc.xml
+
+ # Node.js Tools for Visual Studio
+ .ntvs_analysis.dat
+ node_modules/
+
+ # Visual Studio 6 build log
+ *.plg
+
+ # Visual Studio 6 workspace options file
+ *.opt
+
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+ *.vbw
+
+ # Visual Studio LightSwitch build output
+ **/*.HTMLClient/GeneratedArtifacts
+ **/*.DesktopClient/GeneratedArtifacts
+ **/*.DesktopClient/ModelManifest.xml
+ **/*.Server/GeneratedArtifacts
+ **/*.Server/ModelManifest.xml
+ _Pvt_Extensions
+
+ # Paket dependency manager
+ .paket/paket.exe
+ paket-files/
+
+ # FAKE - F# Make
+ .fake/
+
+ # CodeRush personal settings
+ .cr/personal
+
+ # Python Tools for Visual Studio (PTVS)
+ __pycache__/
+
+
+ # Cake - Uncomment if you are using it
+ # tools/**
+ # !tools/packages.config
+
+ # Tabs Studio
+ *.tss
+
+ # Telerik's JustMock configuration file
+ *.jmconfig
+
+ # BizTalk build output
+ *.btp.cs
+ *.btm.cs
+ *.odx.cs
+ *.xsd.cs
+
+ # OpenCover UI analysis results
+ OpenCover/
+
+ # Azure Stream Analytics local run output
+ ASALocalRun/
+
+ # MSBuild Binary and Structured Log
+ *.binlog
+
+ # NVidia Nsight GPU debugger configuration file
+ *.nvuser
+
+ # MFractors (Xamarin productivity tool) working folder
+ .mfractor/
+
+ # Local History for Visual Studio
+ .localhistory/
+
+ # BeatPulse healthcheck temp database
+ healthchecksdb
+
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
+ MigrationBackup/
+
+ # Ionide (cross platform F# VS Code tools) working folder
+ .ionide/
+
+ # Fody - auto-generated XML schema
+ FodyWeavers.xsd
+
+ # build
+ build
+ monotonic_align/core.c
+ *.o
+ *.so
+ *.dll
+
+ # data
+ /config.json
+ /*.pth
+ *.wav
+ /monotonic_align/monotonic_align
+ /resources
+ /MoeGoe.spec
+ /dist/MoeGoe
+ /dist
+
+ .venv
+ .idea
+ app.py
+ infer-web.py
+ app-old.py
+ rmvpe.pt
+ hubert_base.pt
Advanced-RVC.ipynb ADDED
@@ -0,0 +1,494 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "view-in-github",
7
+ "colab_type": "text"
8
+ },
9
+ "source": [
10
+ "<a href=\"https://colab.research.google.com/github/ArkanDash/Advanced-RVC-Inference/blob/master/Advanced-RVC.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "source": [
16
+ "<h1><div align=\"center\"> Advanced RVC Inference:\n",
17
+ "\n",
18
+ "<big> for quicker and effortless model downloads\n",
19
+ "\n",
20
+ "---\n",
21
+ "\n",
22
+ "[Support](https://discord.gg/hvmsukmBHE) — [GitHub](https://github.com/ArkanDash/Advanced-RVC-Inference.git)"
23
+ ],
24
+ "metadata": {
25
+ "id": "FZUxBujkr91c"
26
+ }
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {
32
+ "cellView": "form",
33
+ "id": "fl7Y_WjdrEO2"
34
+ },
35
+ "outputs": [],
36
+ "source": [
37
+ "#@title Check GPU\n",
38
+ "!nvidia-smi"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {
45
+ "cellView": "form",
46
+ "id": "sfqNqmS-rEPK"
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "# @title Installation\n",
51
+ "\n",
52
+ "\n",
53
+ "from IPython.display import clear_output\n",
54
+ "\n",
55
+ "\n",
56
+ "\n",
57
+ "url = \"https://github.com/ArkanDash/Advanced-RVC-Inference.git\"\n",
58
+ "\n",
59
+ "!git clone $url /content/program_infer\n",
60
+ "clear_output()\n",
61
+ "\n",
62
+ "%cd /content/program_infer\n",
63
+ "\n",
64
+ "\n",
65
+ "!pip install -r requirements.txt\n",
66
+ "!pip uninstall torch torchvision torchaudio -y\n",
67
+ "!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cu121\n",
68
+ "clear_output()\n",
69
+ "print(\"Finished installing requirements!\")"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "source": [
75
+ "#@title Run WebUI\n",
76
+ "\n",
77
+ "\n",
78
+ "iyalah = \"app.py\"\n",
79
+ "print(\"running WebUI\")\n",
80
+ "!python $iyalah --share"
81
+ ],
82
+ "metadata": {
83
+ "cellView": "form",
84
+ "id": "AJZH4XDOKnK3"
85
+ },
86
+ "execution_count": null,
87
+ "outputs": []
88
+ },
89
+ {
90
+ "cell_type": "markdown",
91
+ "source": [
92
+ "## Run NoUI\n",
93
+ "<div align=\"center\">\n",
94
+ "\n",
95
+ "•created by [NeoDev](https://github.com/TheNeodev)•"
96
+ ],
97
+ "metadata": {
98
+ "id": "MO_UV5ZhKOTF"
99
+ }
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "source": [
104
+ "# @title Download model\n",
105
+ "# @markdown Hugging Face or Google Drive\n",
106
+ "model_link = \"https://huggingface.co/Bredvige/Sonic2/resolve/main/Sonic.zip\" # @param {type:\"string\"}\n",
107
+ "\n",
108
+ "!python scrpt.py download --model_link \"{model_link}\""
109
+ ],
110
+ "metadata": {
111
+ "cellView": "form",
112
+ "id": "qk74gqJqEB_A"
113
+ },
114
+ "execution_count": null,
115
+ "outputs": []
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "source": [
120
+ "\n",
121
+ "#@title run Advanced-RVC\n",
122
+ "\n",
123
+ "import os\n",
124
+ "import sys\n",
125
+ "import yt_dlp\n",
126
+ "import subprocess\n",
127
+ "import logging\n",
128
+ "import json\n",
129
+ "from logging.handlers import RotatingFileHandler\n",
130
+ "from contextlib import suppress\n",
131
+ "import gradio as gr\n",
132
+ "import librosa\n",
133
+ "import numpy as np\n",
134
+ "import soundfile as sf\n",
135
+ "from pydub import AudioSegment\n",
136
+ "# Import the UVR separator. Ensure the module is available.\n",
137
+ "try:\n",
138
+ " from audio_separator.separator import Separator\n",
139
+ "except ImportError:\n",
140
+ " raise ImportError(\"Make sure the 'audio_separator' module is installed or in your working directory.\")\n",
141
+ "\n",
142
+ "from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline\n",
143
+ "\n",
144
+ "if __name__ == \"__main__\":\n",
145
+ " prerequisites_download_pipeline(models=True, exe=True)\n",
146
+ "\n",
147
+ "\n",
148
+ "# =============================================================================\n",
149
+ "# Logging Setup\n",
150
+ "# =============================================================================\n",
151
+ "\n",
152
+ "def setup_logging(log_level=logging.DEBUG, log_file=\"kuro_rvc.log\"):\n",
153
+ " \"\"\"\n",
154
+ " Set up advanced logging with both console and rotating file handlers.\n",
155
+ " \"\"\"\n",
156
+ " logger = logging.getLogger()\n",
157
+ " logger.setLevel(log_level)\n",
158
+ "\n",
159
+ " # Formatter for both handlers\n",
160
+ " formatter = logging.Formatter(\n",
161
+ " fmt=\"%(asctime)s [%(levelname)s] %(name)s: %(message)s\",\n",
162
+ " datefmt=\"%Y-%m-%d %H:%M:%S\"\n",
163
+ " )\n",
164
+ "\n",
165
+ " # Console handler (INFO level and above)\n",
166
+ " console_handler = logging.StreamHandler(sys.stdout)\n",
167
+ " console_handler.setLevel(logging.INFO)\n",
168
+ " console_handler.setFormatter(formatter)\n",
169
+ "\n",
170
+ " # Rotating file handler (DEBUG level and above)\n",
171
+ " file_handler = RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=2)\n",
172
+ " file_handler.setLevel(log_level)\n",
173
+ " file_handler.setFormatter(formatter)\n",
174
+ "\n",
175
+ " # Clear existing handlers, then add ours\n",
176
+ " if logger.hasHandlers():\n",
177
+ " logger.handlers.clear()\n",
178
+ " logger.addHandler(console_handler)\n",
179
+ " logger.addHandler(file_handler)\n",
180
+ " logger.debug(\"...logging has been configured.\")\n",
181
+ "\n",
182
+ "# Initialize logging as early as possible\n",
183
+ "setup_logging()\n",
184
+ "\n",
185
+ "# =============================================================================\n",
186
+ "# Directories and File Paths\n",
187
+ "# =============================================================================\n",
188
+ "\n",
189
+ "current_dir = os.getcwd()\n",
190
+ "rvc_models_dir = os.path.join(current_dir, 'logs')\n",
191
+ "rvc_output_dir = os.path.join(current_dir, 'song_output')\n",
192
+ "download_dir = os.path.join(current_dir, \"downloads\")\n",
193
+ "uvr_output_dir = os.path.join(current_dir, \"output_uvr\")\n",
194
+ "\n",
195
+ "# File paths for separated stems (using uvr_output_dir)\n",
196
+ "vocals_path = os.path.join(uvr_output_dir, 'Vocals.wav')\n",
197
+ "instrumental_path = os.path.join(uvr_output_dir, 'Instrumental.wav')\n",
198
+ "lead_vocals_path = os.path.join(uvr_output_dir, 'Lead_Vocals.wav')\n",
199
+ "backing_vocals_path = os.path.join(uvr_output_dir, 'Backing_Vocals.wav')\n",
200
+ "\n",
201
+ "# File paths for RVC inference outputs\n",
202
+ "rvc_lead_output = os.path.join(rvc_output_dir, \"rvc_result_lead.wav\")\n",
203
+ "rvc_backing_output = os.path.join(rvc_output_dir, \"rvc_result_backing.wav\")\n",
204
+ "\n",
205
+ "# Path to the RVC script (ensure it exists in the current directory)\n",
206
+ "rvc_cli_file = os.path.join(current_dir, \"scrpt.py\")\n",
207
+ "if not os.path.exists(rvc_cli_file):\n",
208
+ " logging.error(\"scrpt.py not found in the current directory: %s\", current_dir)\n",
209
+ " raise FileNotFoundError(\"scrpt.py not found in the current directory.\")\n",
210
+ "\n",
211
+ "# =============================================================================\n",
212
+ "# Inference and Pipeline Parameters (Colab UI parameters below)\n",
213
+ "# =============================================================================\n",
214
+ "\n",
215
+ "model_name = \"Sonic\" # @param {type:\"string\"}\n",
216
+ "youtube_url = \"https://youtu.be/eCkWlRL3_N0?si=y6xHAs1m8fYVLTUV\" # @param {type:\"string\"}\n",
217
+ "export_format = \"WAV\" # @param ['WAV', 'MP3', 'FLAC', 'OGG', 'M4A']\n",
218
+ "f0_method = \"hybrid[rmvpe+fcpe]\" # @param [\"crepe\", \"crepe-tiny\", \"rmvpe\", \"fcpe\", \"hybrid[rmvpe+fcpe]\"]\n",
219
+ "f0_up_key = 0 # @param {type:\"slider\", min:-24, max:24, step:0}\n",
220
+ "filter_radius = 3 # @param {type:\"slider\", min:0, max:10, step:0}\n",
221
+ "rms_mix_rate = 0.8 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n",
222
+ "protect = 0.5 # @param {type:\"slider\", min:0.0, max:0.5, step:0.1}\n",
223
+ "index_rate = 0.6 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n",
224
+ "hop_length = 128 # @param {type:\"slider\", min:1, max:512, step:0}\n",
225
+ "clean_strength = 0.7 # @param {type:\"slider\", min:0.0, max:1.0, step:0.1}\n",
226
+ "split_audio = False # @param {type:\"boolean\"}\n",
227
+ "clean_audio = False # @param {type:\"boolean\"}\n",
228
+ "f0_autotune = False # @param {type:\"boolean\"}\n",
229
+ "backing_vocal_infer = False # @param {type:\"boolean\"}\n",
230
+ "embedder_model = \"contentvec\" # @param [\"contentvec\", \"chinese-hubert-base\", \"japanese-hubert-base\", \"korean-hubert-base\", \"custom\"]\n",
231
+ "embedder_model_custom = \"\" # @param {type:\"string\"}\n",
232
+ "output_filename = f\"aicover_{model_name}_opt\"\n",
233
+ "logging.info(\"This code was written by [NeoDev](https://github.com/TheNeodev). Please credit if you copy or modify the code.\")\n",
234
+ "\n",
235
+ "# =============================================================================\n",
236
+ "# Function Definitions\n",
237
+ "# =============================================================================\n",
238
+ "\n",
239
+ "def download_youtube_audio(url, download_dir):\n",
240
+ " \"\"\"\n",
241
+ " Download audio from a YouTube URL and return the path(s) to the downloaded WAV file(s).\n",
242
+ " \"\"\"\n",
243
+ " logging.debug(\"Starting YouTube audio download. URL: %s\", url)\n",
244
+ " os.makedirs(download_dir, exist_ok=True)\n",
245
+ " outtmpl = os.path.join(download_dir, \"%(title)s.%(ext)s\")\n",
246
+ " ydl_opts = {\n",
247
+ " \"format\": \"bestaudio/best\",\n",
248
+ " \"outtmpl\": outtmpl,\n",
249
+ " \"postprocessors\": [{\n",
250
+ " \"key\": \"FFmpegExtractAudio\",\n",
251
+ " \"preferredcodec\": \"wav\",\n",
252
+ " \"preferredquality\": \"192\"\n",
253
+ " }],\n",
254
+ " }\n",
255
+ " with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n",
256
+ " info_dict = ydl.extract_info(url, download=True)\n",
257
+ " if \"entries\" in info_dict: # Playlist support\n",
258
+ " downloaded_files = [os.path.join(download_dir, f\"{entry['title']}.wav\") for entry in info_dict[\"entries\"] if entry]\n",
259
+ " else:\n",
260
+ " downloaded_files = os.path.join(download_dir, f\"{info_dict['title']}.wav\")\n",
261
+ " logging.debug(\"Downloaded audio file(s): %s\", downloaded_files)\n",
262
+ " return downloaded_files\n",
263
+ "\n",
264
+ "def separator_uvr(input_audio, output_dir):\n",
265
+ " \"\"\"\n",
266
+ " Separate the input audio into instrumental and vocal stems,\n",
267
+ " then further separate vocals into lead and backing vocals.\n",
268
+ " Returns the paths to the lead and backing vocal files.\n",
269
+ " \"\"\"\n",
270
+ " logging.debug(\"Starting UVR separation for file: %s\", input_audio)\n",
271
+ " os.makedirs(output_dir, exist_ok=True)\n",
272
+ "\n",
273
+ " # First separation: get instrumental and vocals\n",
274
+ " uvr_separator = Separator(output_dir=output_dir)\n",
275
+ " logging.debug(\"Loading first UVR model for instrumental/vocals separation.\")\n",
276
+ " uvr_separator.load_model('model_bs_roformer_ep_317_sdr_12.9755.ckpt')\n",
277
+ " separated_files = uvr_separator.separate(input_audio)\n",
278
+ " if len(separated_files) < 2:\n",
279
+ " error_msg = \"UVR separation did not produce expected files for instrumental/vocals.\"\n",
280
+ " logging.error(error_msg)\n",
281
+ " raise RuntimeError(error_msg)\n",
282
+ "\n",
283
+ " # Rename the separated files to our designated paths\n",
284
+ " os.rename(os.path.join(output_dir, separated_files[0]), instrumental_path)\n",
285
+ " os.rename(os.path.join(output_dir, separated_files[1]), vocals_path)\n",
286
+ " logging.debug(\"Separated instrumental saved to: %s\", instrumental_path)\n",
287
+ " logging.debug(\"Separated vocals saved to: %s\", vocals_path)\n",
288
+ "\n",
289
+ " # Second separation: split vocals into lead and backing\n",
290
+ " logging.debug(\"Loading second UVR model for vocal splitting.\")\n",
291
+ " uvr_separator.load_model('mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt')\n",
292
+ " separated_vocals = uvr_separator.separate(vocals_path)\n",
293
+ " if len(separated_vocals) < 2:\n",
294
+ " error_msg = \"UVR separation did not produce expected files for vocal split.\"\n",
295
+ " logging.error(error_msg)\n",
296
+ " raise RuntimeError(error_msg)\n",
297
+ "\n",
298
+ " os.rename(os.path.join(output_dir, separated_vocals[0]), backing_vocals_path)\n",
299
+ " os.rename(os.path.join(output_dir, separated_vocals[1]), lead_vocals_path)\n",
300
+ " logging.debug(\"Separated backing vocals saved to: %s\", backing_vocals_path)\n",
301
+ " logging.debug(\"Separated lead vocals saved to: %s\", lead_vocals_path)\n",
302
+ "\n",
303
+ " return lead_vocals_path, backing_vocals_path\n",
304
+ "\n",
305
+ "def run_rvc(f0_up_key, filter_radius, rms_mix_rate, index_rate, hop_length, protect,\n",
306
+ " f0_method, input_path, output_path, pth_file, index_file, split_audio,\n",
307
+ " clean_audio, clean_strength, export_format, f0_autotune,\n",
308
+ " embedder_model, embedder_model_custom):\n",
309
+ " \"\"\"\n",
310
+ " Run the RVC inference pipeline via the rvc_cli.py script.\n",
311
+ " \"\"\"\n",
312
+ " logging.debug(\"Preparing RVC inference command for input file: %s\", input_path)\n",
313
+ " command = [\n",
314
+ " sys.executable, rvc_cli_file, \"infer\",\n",
315
+ " \"--pitch\", str(f0_up_key),\n",
316
+ " \"--filter_radius\", str(filter_radius),\n",
317
+ " \"--volume_envelope\", str(rms_mix_rate),\n",
318
+ " \"--index_rate\", str(index_rate),\n",
319
+ " \"--hop_length\", str(hop_length),\n",
320
+ " \"--protect\", str(protect),\n",
321
+ " \"--f0_method\", f0_method,\n",
322
+ " \"--f0_autotune\", str(f0_autotune),\n",
323
+ " \"--input_path\", input_path,\n",
324
+ " \"--output_path\", output_path,\n",
325
+ " \"--pth_path\", pth_file,\n",
326
+ " \"--index_path\", index_file,\n",
327
+ " \"--split_audio\", str(split_audio),\n",
328
+ " \"--clean_audio\", str(clean_audio),\n",
329
+ " \"--clean_strength\", str(clean_strength),\n",
330
+ " \"--export_format\", export_format,\n",
331
+ " \"--embedder_model\", embedder_model,\n",
332
+ " \"--embedder_model_custom\", embedder_model_custom\n",
333
+ " ]\n",
334
+ " logging.info(\"Running RVC inference. Command: %s\", \" \".join(command))\n",
335
+ " try:\n",
336
+ " result = subprocess.run(command, check=True, capture_output=True, text=True)\n",
337
+ " logging.debug(\"RVC inference stdout: %s\", result.stdout)\n",
338
+ " if result.stderr:\n",
339
+ " logging.debug(\"RVC inference stderr: %s\", result.stderr)\n",
340
+ " logging.info(\"RVC inference completed for input: %s\", input_path)\n",
341
+ " except subprocess.CalledProcessError as e:\n",
342
+ " logging.error(\"RVC inference failed for input: %s\", input_path)\n",
343
+ " logging.error(\"Error output: %s\", e.stderr)\n",
344
+ " raise e\n",
345
+ "\n",
346
+ "def load_audio(file_path):\n",
347
+ " \"\"\"Load an audio file using pydub if it exists.\"\"\"\n",
348
+ " if file_path and os.path.exists(file_path):\n",
349
+ " logging.debug(\"Loading audio file: %s\", file_path)\n",
350
+ " return AudioSegment.from_file(file_path)\n",
351
+ " else:\n",
352
+ " logging.warning(\"Audio file not found: %s\", file_path)\n",
353
+ " return None\n",
354
+ "\n",
355
+ "# =============================================================================\n",
356
+ "# Main Execution Function\n",
357
+ "# =============================================================================\n",
358
+ "\n",
359
+ "def main():\n",
360
+ " logging.info(\"Starting Advanced-RVC pipeline.\")\n",
361
+ "\n",
362
+ " # Check model folder and required model files\n",
363
+ " model_folder = os.path.join(rvc_models_dir, model_name)\n",
364
+ " if not os.path.exists(model_folder):\n",
365
+ " error_msg = f\"Model directory not found: {model_folder}\"\n",
366
+ " logging.error(error_msg)\n",
367
+ " raise FileNotFoundError(error_msg)\n",
368
+ " files_in_folder = os.listdir(model_folder)\n",
369
+ " pth_filename = next((f for f in files_in_folder if f.endswith(\".pth\")), None)\n",
370
+ " index_filename = next((f for f in files_in_folder if f.endswith(\".index\")), None)\n",
371
+ " if not pth_filename or not index_filename:\n",
372
+ " error_msg = \"Required model files (.pth or .index) were not found in the model folder.\"\n",
373
+ " logging.error(error_msg)\n",
374
+ " raise FileNotFoundError(error_msg)\n",
375
+ " pth_file = os.path.join(model_folder, pth_filename)\n",
376
+ " index_file = os.path.join(model_folder, index_filename)\n",
377
+ " logging.debug(\"Model files located. PTH: %s, Index: %s\", pth_file, index_file)\n",
378
+ "\n",
379
+ " # Download audio from YouTube\n",
380
+ " logging.info(\"Downloading audio from YouTube...\")\n",
381
+ " downloaded_audio = download_youtube_audio(youtube_url, download_dir)\n",
382
+ " input_audio = downloaded_audio[0] if isinstance(downloaded_audio, list) else downloaded_audio\n",
383
+ " if not os.path.exists(input_audio):\n",
384
+ " error_msg = f\"Downloaded audio file not found: {input_audio}\"\n",
385
+ " logging.error(error_msg)\n",
386
+ " raise FileNotFoundError(error_msg)\n",
387
+ " logging.info(\"Audio downloaded successfully: %s\", input_audio)\n",
388
+ "\n",
389
+ " # Run UVR separation\n",
390
+ " logging.info(\"Running UVR separation...\")\n",
391
+ " lead_vocals_file, backing_vocals_file = separator_uvr(input_audio, uvr_output_dir)\n",
392
+ " logging.info(\"UVR separation completed. Lead vocals: %s, Backing vocals: %s\", lead_vocals_file, backing_vocals_file)\n",
393
+ "\n",
394
+ " # Ensure the output directory for RVC exists\n",
395
+ " os.makedirs(rvc_output_dir, exist_ok=True)\n",
396
+ "\n",
397
+ " # Run RVC inference for lead vocals\n",
398
+ " logging.info(\"Running RVC inference for lead vocals...\")\n",
399
+ " run_rvc(f0_up_key, filter_radius, rms_mix_rate, index_rate, hop_length, protect,\n",
400
+ " f0_method, lead_vocals_path, rvc_lead_output, pth_file, index_file,\n",
401
+ " split_audio, clean_audio, clean_strength, export_format, f0_autotune,\n",
402
+ " embedder_model, embedder_model_custom)\n",
403
+ "\n",
404
+ " # Optionally run RVC inference for backing vocals\n",
405
+ " if backing_vocal_infer:\n",
406
+ " logging.info(\"Running RVC inference for backing vocals...\")\n",
407
+ " run_rvc(f0_up_key, filter_radius, rms_mix_rate, index_rate, hop_length, protect,\n",
408
+ " f0_method, backing_vocals_path, rvc_backing_output, pth_file, index_file,\n",
409
+ " split_audio, clean_audio, clean_strength, export_format, f0_autotune,\n",
410
+ " embedder_model, embedder_model_custom)\n",
411
+ "\n",
412
+ " logging.info(\"RVC pipeline complete.\")\n",
413
+ "\n",
414
+ " # Load the separated/inferred tracks for final mix\n",
415
+ " logging.info(\"Loading audio tracks for final mix.\")\n",
416
+ " lead_vocals_audio = load_audio(rvc_lead_output)\n",
417
+ " instrumental_audio = load_audio(instrumental_path)\n",
418
+ " # If backing inference was run, load its result; otherwise use separated backing vocals.\n",
419
+ " backing_vocals_audio = load_audio(rvc_backing_output) if backing_vocal_infer else load_audio(backing_vocals_path)\n",
420
+ "\n",
421
+ " if not instrumental_audio:\n",
422
+ " error_msg = \"Instrumental track is required for mixing!\"\n",
423
+ " logging.error(error_msg)\n",
424
+ " raise ValueError(error_msg)\n",
425
+ "\n",
426
+ " # Mix the audio tracks: overlay lead vocals and backing vocals onto the instrumental\n",
427
+ " final_mix = instrumental_audio\n",
428
+ " if lead_vocals_audio:\n",
429
+ " logging.debug(\"Overlaying lead vocals onto instrumental.\")\n",
430
+ " final_mix = final_mix.overlay(lead_vocals_audio)\n",
431
+ " if backing_vocals_audio:\n",
432
+ " logging.debug(\"Overlaying backing vocals onto instrumental.\")\n",
433
+ " final_mix = final_mix.overlay(backing_vocals_audio)\n",
434
+ "\n",
435
+ " # Export final mix to file\n",
436
+ " output_file = f\"{output_filename}.{export_format.lower()}\"\n",
437
+ " final_mix.export(output_file, format=export_format.lower())\n",
438
+ " logging.info(\"✅ Mixed file saved as: %s\", output_file)\n",
439
+ " print(f\"✅ Mixed file saved as: {output_file}\")\n",
440
+ "\n",
441
+ "# =============================================================================\n",
442
+ "# Run the Pipeline if Executed as a Script\n",
443
+ "# =============================================================================\n",
444
+ "\n",
445
+ "if __name__ == \"__main__\":\n",
446
+ " try:\n",
447
+ " main()\n",
448
+ " except Exception as e:\n",
449
+ " logging.exception(\"An error occurred during execution: %s\", e)\n",
450
+ " raise"
451
+ ],
452
+ "metadata": {
453
+ "cellView": "form",
454
+ "id": "9-KMNp7tFrEk"
455
+ },
456
+ "execution_count": null,
457
+ "outputs": []
458
+ },
459
+ {
460
+ "cell_type": "code",
461
+ "source": [
462
+ "#@title play ur audio output\n",
463
+ "\n",
464
+ "output_file = f\"{output_filename}.{export_format.lower()}\"\n",
465
+ "\n",
466
+ "AudioSegment.from_file(output_file)"
467
+ ],
468
+ "metadata": {
469
+ "cellView": "form",
470
+ "id": "NvxvDUUOrYd-"
471
+ },
472
+ "execution_count": null,
473
+ "outputs": []
474
+ }
475
+ ],
476
+ "metadata": {
477
+ "language_info": {
478
+ "name": "python"
479
+ },
480
+ "orig_nbformat": 4,
481
+ "colab": {
482
+ "provenance": [],
483
+ "gpuType": "T4",
484
+ "include_colab_link": true
485
+ },
486
+ "kernelspec": {
487
+ "name": "python3",
488
+ "display_name": "Python 3"
489
+ },
490
+ "accelerator": "GPU"
491
+ },
492
+ "nbformat": 4,
493
+ "nbformat_minor": 0
494
+ }
LICENSE ADDED
@@ -0,0 +1,22 @@
+ MIT License
+
+ Copyright (c) 2023 arkandash
+ Copyright (c) 2025 NeoDev
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,64 @@
- ---
- title: Advanced Rvc Inference
- emoji: 🔥
- colorFrom: blue
- colorTo: pink
- sdk: gradio
- sdk_version: 5.15.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div align="center">
+
+ # Advanced RVC Inference
+
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ArkanDash/Advanced-RVC-Inference/blob/master/Advanced-RVC.ipynb)
+
+
+ </div>
+
+ ## Information
+ <div align="center">
+ Advanced RVC Inference presents itself as a state-of-the-art web UI crafted to streamline rapid and effortless inference. This comprehensive toolset encompasses a model downloader, a voice splitter.
+
+ Please support the Applio. This inference won't be possible to make without it.<br />
+ [![Original Applio](https://img.shields.io/badge/Github-Original%20Applio%20Repository-blue?style=for-the-badge&logo=github)](https://github.com/IAHispano/Applio)
+ </div>
+
+ ## Features
+ - Support V1 & V2 Model ✅
+ - Youtube Audio Downloader ✅
+ - Audio-Separator (Voice Splitter) [Internet required for downloading model] ✅
+ - Model Downloader ✅
+ - Gradio WebUI ✅
+ ## Installation
+
+ 1. Install Dependencies <br />
+ ```bash
+ pip install torch torchvision torchaudio
+
+ python -m pip install -r requirements.txt
+ ```
+ 2. Install [ffmpeg](https://ffmpeg.org/)
+
+ 3. Download models use:
+
+ ```bash
+ python models.py
+ ```
+
+ ## Run WebUI <br />
+ ```bash
+ python app.py
+ ```
+
+
+ ## Terms of Use
+
+ The use of the converted voice for the following purposes is prohibited.
+
+ * Criticizing or attacking individuals.
+
+ * Advocating for or opposing specific political positions, religions, or ideologies.
+
+ * Publicly displaying strongly stimulating expressions without proper zoning.
+
+ * Selling of voice models and generated voice clips.
+
+ * Impersonation of the original owner of the voice with malicious intentions to harm/hurt others.
+
+ * Fraudulent purposes that lead to identity theft or fraudulent phone calls.
+
+ ## Disclaimer
+
+ I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.
assets/config.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "theme": {
+ "file": null,
+ "class": "NoCrypt/miku"
+ }
assets/themes/loadThemes.py ADDED
@@ -0,0 +1,119 @@
+ import json
+ import os
+ import importlib
+ import gradio as gr
+
+ now_dir = os.getcwd()
+
+ folder = os.path.join(now_dir, "assets", "themes")
+ config_file = os.path.join(now_dir, "assets", "config.json")
+
+ import sys
+
+ sys.path.append(folder)
+
+
+ def get_class(filename):
+ with open(filename, "r", encoding="utf8") as file:
+ for line_number, line in enumerate(file, start=1):
+ if "class " in line:
+ found = line.split("class ")[1].split(":")[0].split("(")[0].strip()
+ return found
+ break
+ return None
+
+
+ def get_list():
+
+ themes_from_files = [
+ os.path.splitext(name)[0]
+ for root, _, files in os.walk(folder, topdown=False)
+ for name in files
+ if name.endswith(".py") and root == folder and name != "loadThemes.py"
+ ]
+
+ json_file_path = os.path.join(folder, "themes_list.json")
+
+ try:
+ with open(json_file_path, "r", encoding="utf8") as json_file:
+ themes_from_url = [item["id"] for item in json.load(json_file)]
+ except FileNotFoundError:
+ themes_from_url = []
+
+ combined_themes = set(themes_from_files + themes_from_url)
+
+ return list(combined_themes)
+
+
+ def select_theme(name):
+ selected_file = name + ".py"
+ full_path = os.path.join(folder, selected_file)
+
+ if not os.path.exists(full_path):
+ with open(config_file, "r", encoding="utf8") as json_file:
+ config_data = json.load(json_file)
+
+ config_data["theme"]["file"] = None
+ config_data["theme"]["class"] = name
+
+ with open(config_file, "w", encoding="utf8") as json_file:
+ json.dump(config_data, json_file, indent=2)
+ print(f"Theme {name} successfully selected, restart the App.")
+ gr.Info(f"Theme {name} successfully selected, restart the App.")
+ return
+
+ class_found = get_class(full_path)
+ if class_found:
+ with open(config_file, "r", encoding="utf8") as json_file:
+ config_data = json.load(json_file)
+
+ config_data["theme"]["file"] = selected_file
+ config_data["theme"]["class"] = class_found
+
+ with open(config_file, "w", encoding="utf8") as json_file:
+ json.dump(config_data, json_file, indent=2)
+ print(f"Theme {name} successfully selected, restart the App.")
+ gr.Info(f"Theme {name} successfully selected, restart the App.")
+ else:
+ print(f"Theme {name} was not found.")
+
+
+ def read_json():
+ try:
+ with open(config_file, "r", encoding="utf8") as json_file:
+ data = json.load(json_file)
+ selected_file = data["theme"]["file"]
+ class_name = data["theme"]["class"]
+
+ if selected_file is not None and class_name:
+ return class_name
+ elif selected_file == None and class_name:
+ return class_name
+ else:
+ return "NoCrypt/miku"
+ except Exception as error:
+ print(f"An error occurred loading the theme: {error}")
+ return "NoCrypt/miku"
+
+
+ def load_json():
+ try:
+ with open(config_file, "r", encoding="utf8") as json_file:
+ data = json.load(json_file)
+ selected_file = data["theme"]["file"]
+ class_name = data["theme"]["class"]
+
+ if selected_file is not None and class_name:
+ module = importlib.import_module(selected_file[:-3])
+ obtained_class = getattr(module, class_name)
+ instance = obtained_class()
+ print(f"Theme {class_name} successfully loaded.")
+ return instance
+ elif selected_file == None and class_name:
+ return class_name
+ else:
+ print("The theme is incorrect.")
+ return None
+ except Exception as error:
+ print(f"An error occurred loading the theme: {error}")
+ return None
assets/themes/themes_list.json ADDED
@@ -0,0 +1,24 @@
+ [
+ {"id": "freddyaboulton/dracula_revamped"},
+ {"id": "freddyaboulton/bad-theme-space"},
+ {"id": "gradio/dracula_revamped"},
+ {"id": "abidlabs/dracula_revamped"},
+ {"id": "gradio/seafoam"},
+ {"id": "gradio/monochrome"},
+ {"id": "gradio/soft"},
+ {"id": "gradio/default"},
+ {"id": "dawood/microsoft_windows"},
+ {"id": "ysharma/steampunk"},
+ {"id": "ysharma/huggingface"},
+ {"id": "gstaff/xkcd"},
+ {"id": "JohnSmith9982/small_and_pretty"},
+ {"id": "abidlabs/Lime"},
+ {"id": "bethecloud/storj_theme"},
+ {"id": "sudeepshouche/minimalist"},
+ {"id": "knotdgaf/gradiotest"},
+ {"id": "ParityError/Interstellar"},
+ {"id": "ParityError/Anime"},
+ {"id": "Ajaxon6255/Emerald_Isle"},
+ {"id": "NoCrypt/miku"},
+ {"id": "Hev832/Applio"}
+ ]
install.bat ADDED
@@ -0,0 +1,87 @@
+ @echo off
+ setlocal enabledelayedexpansion
+ title RVC CLI Installer
+
+ echo Welcome to the RVC CLI Installer!
+ echo.
+
+ set "INSTALL_DIR=%cd%"
+ set "MINICONDA_DIR=%UserProfile%\Miniconda3"
+ set "ENV_DIR=%INSTALL_DIR%\env"
+ set "MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Windows-x86_64.exe"
+ set "CONDA_EXE=%MINICONDA_DIR%\Scripts\conda.exe"
+
+ call :cleanup
+ call :install_miniconda
+ call :create_conda_env
+ call :install_dependencies
+
+ echo RVC CLI has been installed successfully!
+ echo.
+ pause
+ exit /b 0
+
+ :cleanup
+ echo Cleaning up unnecessary files...
+ for %%F in (Makefile Dockerfile docker-compose.yaml *.sh) do if exist "%%F" del "%%F"
+ echo Cleanup complete.
+ echo.
+ exit /b 0
+
+ :install_miniconda
+ if exist "%CONDA_EXE%" (
+ echo Miniconda already installed. Skipping installation.
+ exit /b 0
+ )
+
+ echo Miniconda not found. Starting download and installation...
+ powershell -Command "& {Invoke-WebRequest -Uri '%MINICONDA_URL%' -OutFile 'miniconda.exe'}"
+ if not exist "miniconda.exe" goto :download_error
+
+ start /wait "" miniconda.exe /InstallationType=JustMe /RegisterPython=0 /S /D=%MINICONDA_DIR%
+ if errorlevel 1 goto :install_error
+
+ del miniconda.exe
+ echo Miniconda installation complete.
+ echo.
+ exit /b 0
+
+ :create_conda_env
+ echo Creating Conda environment...
+ call "%MINICONDA_DIR%\_conda.exe" create --no-shortcuts -y -k --prefix "%ENV_DIR%" python=3.9
+ if errorlevel 1 goto :error
+ echo Conda environment created successfully.
+ echo.
+
+ if exist "%ENV_DIR%\python.exe" (
+ echo Installing specific pip version...
+ "%ENV_DIR%\python.exe" -m pip install "pip<24.1"
+ if errorlevel 1 goto :error
+ echo Pip installation complete.
+ echo.
+ )
+ exit /b 0
+
+ :install_dependencies
+ echo Installing dependencies...
+ call "%MINICONDA_DIR%\condabin\conda.bat" activate "%ENV_DIR%" || goto :error
+ pip install --upgrade setuptools || goto :error
+ pip install --no-cache-dir -r "%INSTALL_DIR%\requirements.txt" || goto :error
+ pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cu121 || goto :error
+ call "%MINICONDA_DIR%\condabin\conda.bat" deactivate
+ echo Dependencies installation complete.
+ echo.
+ exit /b 0
+
+ :download_error
+ echo Download failed. Please check your internet connection and try again.
+ goto :error
+
+ :install_error
+ echo Miniconda installation failed.
+ goto :error
+
+ :error
+ echo An error occurred during installation. Please check the output above for details.
+ pause
+ exit /b 1
models.py ADDED
@@ -0,0 +1,5 @@
+ from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline
+
+
+ print("downloading models...")
+ prerequisites_download_pipeline(models=True, exe=True)
requirements.txt ADDED
@@ -0,0 +1,33 @@
+ pip>=23.3; sys_platform == 'darwin'
+ wheel; sys_platform == 'darwin'
+ PyYAML; sys_platform == 'darwin'
+ tqdm
+ wget
+ ffmpeg-python>=0.2.0
+ faiss-cpu==1.7.3
+ soundfile==0.12.1
+ noisereduce
+ pedalboard
+ stftpitchshift
+ yt-dlp
+ audio-separator[gpu]==0.28.5
+ omegaconf>=2.0.6; sys_platform == 'darwin'
+ numba; sys_platform == 'linux'
+ numba==0.57.0; sys_platform == 'darwin' or sys_platform == 'win32'
+ torchaudio==2.3.1
+ torchvision==0.18.1
+ torchcrepe==0.0.23
+ torchfcpe
+ libf0
+ transformers==4.44.2
+ matplotlib==3.7.2
+ tensorboard
+ gradio==4.44.0
+ certifi>=2023.07.22; sys_platform == 'darwin'
+ antlr4-python3-runtime==4.8; sys_platform == 'darwin'
+ tensorboardX
+ edge-tts==6.1.9
+ pypresence
+ beautifulsoup4
+ flask
+ typing
rvc/configs/config.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import json
3
+ import os
4
+
5
+
6
+ version_config_paths = [
7
+ os.path.join("v1", "32000.json"),
8
+ os.path.join("v1", "40000.json"),
9
+ os.path.join("v1", "48000.json"),
10
+ os.path.join("v2", "48000.json"),
11
+ os.path.join("v2", "40000.json"),
12
+ os.path.join("v2", "32000.json"),
13
+ ]
14
+
15
+
16
+ def singleton(cls):
17
+ instances = {}
18
+
19
+ def get_instance(*args, **kwargs):
20
+ if cls not in instances:
21
+ instances[cls] = cls(*args, **kwargs)
22
+ return instances[cls]
23
+
24
+ return get_instance
25
+
26
+
27
+ @singleton
28
+ class Config:
29
+ def __init__(self):
30
+ self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
31
+ self.is_half = self.device != "cpu"
32
+ self.gpu_name = (
33
+ torch.cuda.get_device_name(int(self.device.split(":")[-1]))
34
+ if self.device.startswith("cuda")
35
+ else None
36
+ )
37
+ self.json_config = self.load_config_json()
38
+ self.gpu_mem = None
39
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
40
+
41
+ def load_config_json(self) -> dict:
42
+ configs = {}
43
+ for config_file in version_config_paths:
44
+ config_path = os.path.join("rvc", "configs", config_file)
45
+ with open(config_path, "r") as f:
46
+ configs[config_file] = json.load(f)
47
+ return configs
48
+
49
+ def has_mps(self) -> bool:
50
+ # Check if Metal Performance Shaders are available - for macOS 12.3+.
51
+ return torch.backends.mps.is_available()
52
+
53
+ def has_xpu(self) -> bool:
54
+ # Check if XPU is available.
55
+ return hasattr(torch, "xpu") and torch.xpu.is_available()
56
+
57
+ def set_precision(self, precision):
58
+ if precision not in ["fp32", "fp16"]:
59
+ raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.")
60
+
61
+ fp16_run_value = precision == "fp16"
62
+ preprocess_target_version = "3.7" if precision == "fp16" else "3.0"
63
+ preprocess_path = os.path.join(
64
+ os.path.dirname(__file__),
65
+ os.pardir,
66
+ "rvc",
67
+ "train",
68
+ "preprocess",
69
+ "preprocess.py",
70
+ )
71
+
72
+ for config_path in version_config_paths:
73
+ full_config_path = os.path.join("rvc", "configs", config_path)
74
+ try:
75
+ with open(full_config_path, "r") as f:
76
+ config = json.load(f)
77
+ config["train"]["fp16_run"] = fp16_run_value
78
+ with open(full_config_path, "w") as f:
79
+ json.dump(config, f, indent=4)
80
+ except FileNotFoundError:
81
+ print(f"File not found: {full_config_path}")
82
+
83
+ if os.path.exists(preprocess_path):
84
+ with open(preprocess_path, "r") as f:
85
+ preprocess_content = f.read()
86
+ preprocess_content = preprocess_content.replace(
87
+ "3.0" if precision == "fp16" else "3.7", preprocess_target_version
88
+ )
89
+ with open(preprocess_path, "w") as f:
90
+ f.write(preprocess_content)
91
+
92
+ return f"Overwritten preprocess and config.json to use {precision}."
93
+
94
+ def get_precision(self):
95
+ if not version_config_paths:
96
+ raise FileNotFoundError("No configuration paths provided.")
97
+
98
+ full_config_path = os.path.join("rvc", "configs", version_config_paths[0])
99
+ try:
100
+ with open(full_config_path, "r") as f:
101
+ config = json.load(f)
102
+ fp16_run_value = config["train"].get("fp16_run", False)
103
+ precision = "fp16" if fp16_run_value else "fp32"
104
+ return precision
105
+ except FileNotFoundError:
106
+ print(f"File not found: {full_config_path}")
107
+ return None
108
+
109
+ def device_config(self) -> tuple:
110
+ if self.device.startswith("cuda"):
111
+ self.set_cuda_config()
112
+ elif self.has_mps():
113
+ self.device = "mps"
114
+ self.is_half = False
115
+ self.set_precision("fp32")
116
+ else:
117
+ self.device = "cpu"
118
+ self.is_half = False
119
+ self.set_precision("fp32")
120
+
121
+ # Configuration for 6GB GPU memory
122
+ x_pad, x_query, x_center, x_max = (
123
+ (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
124
+ )
125
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
126
+ # Configuration for 5GB GPU memory
127
+ x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
128
+
129
+ return x_pad, x_query, x_center, x_max
130
+
131
+ def set_cuda_config(self):
132
+ i_device = int(self.device.split(":")[-1])
133
+ self.gpu_name = torch.cuda.get_device_name(i_device)
134
+ low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
135
+ if (
136
+ any(gpu in self.gpu_name for gpu in low_end_gpus)
137
+ and "V100" not in self.gpu_name.upper()
138
+ ):
139
+ self.is_half = False
140
+ self.set_precision("fp32")
141
+
142
+ self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
143
+ 1024**3
144
+ )
145
+
146
+
147
+ def max_vram_gpu(gpu):
148
+ if torch.cuda.is_available():
149
+ gpu_properties = torch.cuda.get_device_properties(gpu)
150
+ total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024)
151
+ return total_memory_gb
152
+ else:
153
+ return 8  # no CUDA device detected; fall back to 8 (GB), keeping the return type consistent with the GPU branch
154
+
155
+
156
+ def get_gpu_info():
157
+ ngpu = torch.cuda.device_count()
158
+ gpu_infos = []
159
+ if torch.cuda.is_available() or ngpu != 0:
160
+ for i in range(ngpu):
161
+ gpu_name = torch.cuda.get_device_name(i)
162
+ mem = int(
163
+ torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024
164
+ + 0.4
165
+ )
166
+ gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
167
+ if len(gpu_infos) > 0:
168
+ gpu_info = "\n".join(gpu_infos)
169
+ else:
170
+ gpu_info = "Unfortunately, there is no compatible GPU available to support your training."
171
+ return gpu_info
172
+
173
+
174
+ def get_number_of_gpus():
175
+ if torch.cuda.is_available():
176
+ num_gpus = torch.cuda.device_count()
177
+ return "-".join(map(str, range(num_gpus)))
178
+ else:
179
+ return "-"
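
The configuration module above is shared process-wide: `@singleton` caches the first `Config()` instance, so precision or device changes made anywhere are visible everywhere. A minimal usage sketch (assuming it is run from the repository root with the config JSONs in place and `torch` installed):

```python
# Every call to Config() returns the same cached instance.
from rvc.configs.config import Config, get_gpu_info

cfg_a = Config()
cfg_b = Config()
assert cfg_a is cfg_b  # singleton: shared device/precision state

# x_pad, x_query, x_center, x_max are in seconds of 16 kHz audio; the inference
# pipeline later multiplies them by 16000 to get padding/chunking sizes in samples.
print(cfg_a.device, cfg_a.is_half)
print(cfg_a.x_pad, cfg_a.x_query, cfg_a.x_center, cfg_a.x_max)
print(get_gpu_info())
```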
rvc/configs/v1/32000.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sample_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "text_enc_hidden_dim": 256,
33
+ "n_heads": 2,
34
+ "n_layers": 6,
35
+ "kernel_size": 3,
36
+ "p_dropout": 0,
37
+ "resblock": "1",
38
+ "resblock_kernel_sizes": [3,7,11],
39
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
40
+ "upsample_rates": [10,4,2,2,2],
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [16,16,4,4,4],
43
+ "use_spectral_norm": false,
44
+ "gin_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
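
The numbers in these training configs are easiest to read as times: `hop_length` is the analysis hop in samples and `segment_size` is the training crop. A small sketch reading the file above (assuming it is run from the repository root):

```python
import json

# 32 kHz v1 config: 320-sample hop -> 10 ms frames, 12800-sample segments -> 0.4 s crops.
with open("rvc/configs/v1/32000.json") as f:
    cfg = json.load(f)

sr = cfg["data"]["sample_rate"]
print(cfg["data"]["hop_length"] / sr * 1000, "ms per frame")
print(cfg["train"]["segment_size"] / sr, "s per training segment")
```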
rvc/configs/v1/40000.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sample_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "text_enc_hidden_dim": 256,
33
+ "n_heads": 2,
34
+ "n_layers": 6,
35
+ "kernel_size": 3,
36
+ "p_dropout": 0,
37
+ "resblock": "1",
38
+ "resblock_kernel_sizes": [3,7,11],
39
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
40
+ "upsample_rates": [10,10,2,2],
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [16,16,4,4],
43
+ "use_spectral_norm": false,
44
+ "gin_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
rvc/configs/v1/48000.json ADDED
@@ -0,0 +1,47 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sample_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "text_enc_hidden_dim": 256,
33
+ "n_heads": 2,
34
+ "n_layers": 6,
35
+ "kernel_size": 3,
36
+ "p_dropout": 0,
37
+ "resblock": "1",
38
+ "resblock_kernel_sizes": [3,7,11],
39
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
40
+ "upsample_rates": [10,6,2,2,2],
41
+ "upsample_initial_channel": 512,
42
+ "upsample_kernel_sizes": [16,16,4,4,4],
43
+ "use_spectral_norm": false,
44
+ "gin_channels": 256,
45
+ "spk_embed_dim": 109
46
+ }
47
+ }
rvc/configs/v2/32000.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [0.8, 0.99],
7
+ "eps": 1e-9,
8
+ "fp16_run": true,
9
+ "lr_decay": 0.999875,
10
+ "segment_size": 12800,
11
+ "c_mel": 45,
12
+ "c_kl": 1.0
13
+ },
14
+ "data": {
15
+ "max_wav_value": 32768.0,
16
+ "sample_rate": 32000,
17
+ "filter_length": 1024,
18
+ "hop_length": 320,
19
+ "win_length": 1024,
20
+ "n_mel_channels": 80,
21
+ "mel_fmin": 0.0,
22
+ "mel_fmax": null
23
+ },
24
+ "model": {
25
+ "inter_channels": 192,
26
+ "hidden_channels": 192,
27
+ "filter_channels": 768,
28
+ "text_enc_hidden_dim": 768,
29
+ "n_heads": 2,
30
+ "n_layers": 6,
31
+ "kernel_size": 3,
32
+ "p_dropout": 0,
33
+ "resblock": "1",
34
+ "resblock_kernel_sizes": [3,7,11],
35
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
36
+ "upsample_rates": [10,8,2,2],
37
+ "upsample_initial_channel": 512,
38
+ "upsample_kernel_sizes": [20,16,4,4],
39
+ "use_spectral_norm": false,
40
+ "gin_channels": 256,
41
+ "spk_embed_dim": 109
42
+ }
43
+ }
rvc/configs/v2/40000.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [0.8, 0.99],
7
+ "eps": 1e-9,
8
+ "fp16_run": true,
9
+ "lr_decay": 0.999875,
10
+ "segment_size": 12800,
11
+ "c_mel": 45,
12
+ "c_kl": 1.0
13
+ },
14
+ "data": {
15
+ "max_wav_value": 32768.0,
16
+ "sample_rate": 40000,
17
+ "filter_length": 2048,
18
+ "hop_length": 400,
19
+ "win_length": 2048,
20
+ "n_mel_channels": 125,
21
+ "mel_fmin": 0.0,
22
+ "mel_fmax": null
23
+ },
24
+ "model": {
25
+ "inter_channels": 192,
26
+ "hidden_channels": 192,
27
+ "filter_channels": 768,
28
+ "text_enc_hidden_dim": 768,
29
+ "n_heads": 2,
30
+ "n_layers": 6,
31
+ "kernel_size": 3,
32
+ "p_dropout": 0,
33
+ "resblock": "1",
34
+ "resblock_kernel_sizes": [3,7,11],
35
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
36
+ "upsample_rates": [10,10,2,2],
37
+ "upsample_initial_channel": 512,
38
+ "upsample_kernel_sizes": [16,16,4,4],
39
+ "use_spectral_norm": false,
40
+ "gin_channels": 256,
41
+ "spk_embed_dim": 109
42
+ }
43
+ }
rvc/configs/v2/48000.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "learning_rate": 1e-4,
6
+ "betas": [0.8, 0.99],
7
+ "eps": 1e-9,
8
+ "fp16_run": true,
9
+ "lr_decay": 0.999875,
10
+ "segment_size": 17280,
11
+ "c_mel": 45,
12
+ "c_kl": 1.0
13
+ },
14
+ "data": {
15
+ "max_wav_value": 32768.0,
16
+ "sample_rate": 48000,
17
+ "filter_length": 2048,
18
+ "hop_length": 480,
19
+ "win_length": 2048,
20
+ "n_mel_channels": 128,
21
+ "mel_fmin": 0.0,
22
+ "mel_fmax": null
23
+ },
24
+ "model": {
25
+ "inter_channels": 192,
26
+ "hidden_channels": 192,
27
+ "filter_channels": 768,
28
+ "text_enc_hidden_dim": 768,
29
+ "n_heads": 2,
30
+ "n_layers": 6,
31
+ "kernel_size": 3,
32
+ "p_dropout": 0,
33
+ "resblock": "1",
34
+ "resblock_kernel_sizes": [3,7,11],
35
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
36
+ "upsample_rates": [12,10,2,2],
37
+ "upsample_initial_channel": 512,
38
+ "upsample_kernel_sizes": [24,20,4,4],
39
+ "use_spectral_norm": false,
40
+ "gin_channels": 256,
41
+ "spk_embed_dim": 109
42
+ }
43
+ }
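
Across all six configs the generator's `upsample_rates` multiply out to exactly `hop_length` (12*10*2*2 = 480 here), so one feature frame is upsampled back to one hop of audio samples. A quick consistency check over the files above (run from the repository root):

```python
import json
import math
import os

# The product of the upsample factors must equal the vocoder hop size.
for version in ("v1", "v2"):
    for sr in (32000, 40000, 48000):
        path = os.path.join("rvc", "configs", version, f"{sr}.json")
        with open(path) as f:
            cfg = json.load(f)
        assert math.prod(cfg["model"]["upsample_rates"]) == cfg["data"]["hop_length"], path
```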
rvc/infer/infer.py ADDED
@@ -0,0 +1,495 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ import torch
5
+ import librosa
6
+ import logging
7
+ import traceback
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import noisereduce as nr
11
+ from pedalboard import (
12
+ Pedalboard,
13
+ Chorus,
14
+ Distortion,
15
+ Reverb,
16
+ PitchShift,
17
+ Limiter,
18
+ Gain,
19
+ Bitcrush,
20
+ Clipping,
21
+ Compressor,
22
+ Delay,
23
+ )
24
+
25
+ now_dir = os.getcwd()
26
+ sys.path.append(now_dir)
27
+
28
+ from rvc.infer.pipeline import Pipeline as VC
29
+ from rvc.lib.utils import load_audio_infer, load_embedding
30
+ from rvc.lib.tools.split_audio import process_audio, merge_audio
31
+ from rvc.lib.algorithm.synthesizers import Synthesizer
32
+ from rvc.configs.config import Config
33
+
34
+ logging.getLogger("httpx").setLevel(logging.WARNING)
35
+ logging.getLogger("httpcore").setLevel(logging.WARNING)
36
+ logging.getLogger("faiss").setLevel(logging.WARNING)
37
+ logging.getLogger("faiss.loader").setLevel(logging.WARNING)
38
+
39
+
40
+ class VoiceConverter:
41
+ """
42
+ A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
43
+ """
44
+
45
+ def __init__(self):
46
+ """
47
+ Initializes the VoiceConverter with default configuration, and sets up models and parameters.
48
+ """
49
+ self.config = Config() # Load RVC configuration
50
+ self.hubert_model = (
51
+ None # Initialize the Hubert model (for embedding extraction)
52
+ )
53
+ self.last_embedder_model = None # Last used embedder model
54
+ self.tgt_sr = None # Target sampling rate for the output audio
55
+ self.net_g = None # Generator network for voice conversion
56
+ self.vc = None # Voice conversion pipeline instance
57
+ self.cpt = None # Checkpoint for loading model weights
58
+ self.version = None # Model version
59
+ self.n_spk = None # Number of speakers in the model
60
+ self.use_f0 = None # Whether the model uses F0
61
+ self.loaded_model = None
62
+
63
+ def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
64
+ """
65
+ Loads the HuBERT model for speaker embedding extraction.
66
+
67
+ Args:
68
+ embedder_model (str): Path to the pre-trained HuBERT model.
69
+ embedder_model_custom (str): Path to the custom HuBERT model.
70
+ """
71
+ self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
72
+ self.hubert_model.to(self.config.device)
73
+ self.hubert_model = (
74
+ self.hubert_model.half()
75
+ if self.config.is_half
76
+ else self.hubert_model.float()
77
+ )
78
+ self.hubert_model.eval()
79
+
80
+ @staticmethod
81
+ def remove_audio_noise(data, sr, reduction_strength=0.7):
82
+ """
83
+ Removes noise from an audio file using the NoiseReduce library.
84
+
85
+ Args:
86
+ data (numpy.ndarray): The audio data as a NumPy array.
87
+ sr (int): The sample rate of the audio data.
88
+ reduction_strength (float): Strength of the noise reduction. Default is 0.7.
89
+ """
90
+ try:
91
+ reduced_noise = nr.reduce_noise(
92
+ y=data, sr=sr, prop_decrease=reduction_strength
93
+ )
94
+ return reduced_noise
95
+ except Exception as error:
96
+ print(f"An error occurred removing audio noise: {error}")
97
+ return None
98
+
99
+ @staticmethod
100
+ def convert_audio_format(input_path, output_path, output_format):
101
+ """
102
+ Converts an audio file to a specified output format.
103
+
104
+ Args:
105
+ input_path (str): Path to the input audio file.
106
+ output_path (str): Path to the output audio file.
107
+ output_format (str): Desired audio format (e.g., "WAV", "MP3").
108
+ """
109
+ try:
110
+ if output_format != "WAV":
111
+ print(f"Saving audio as {output_format}...")
112
+ audio, sample_rate = librosa.load(input_path, sr=None)
113
+ common_sample_rates = [
114
+ 8000,
115
+ 11025,
116
+ 12000,
117
+ 16000,
118
+ 22050,
119
+ 24000,
120
+ 32000,
121
+ 44100,
122
+ 48000,
123
+ ]
124
+ target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
125
+ audio = librosa.resample(
126
+ audio, orig_sr=sample_rate, target_sr=target_sr
127
+ )
128
+ sf.write(output_path, audio, target_sr, format=output_format.lower())
129
+ return output_path
130
+ except Exception as error:
131
+ print(f"An error occurred converting the audio format: {error}")
132
+
133
+ @staticmethod
134
+ def post_process_audio(
135
+ audio_input,
136
+ sample_rate,
137
+ **kwargs,
138
+ ):
139
+ board = Pedalboard()
140
+ if kwargs.get("reverb", False):
141
+ reverb = Reverb(
142
+ room_size=kwargs.get("reverb_room_size", 0.5),
143
+ damping=kwargs.get("reverb_damping", 0.5),
144
+ wet_level=kwargs.get("reverb_wet_level", 0.33),
145
+ dry_level=kwargs.get("reverb_dry_level", 0.4),
146
+ width=kwargs.get("reverb_width", 1.0),
147
+ freeze_mode=kwargs.get("reverb_freeze_mode", 0),
148
+ )
149
+ board.append(reverb)
150
+ if kwargs.get("pitch_shift", False):
151
+ pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0))
152
+ board.append(pitch_shift)
153
+ if kwargs.get("limiter", False):
154
+ limiter = Limiter(
155
+ threshold_db=kwargs.get("limiter_threshold", -6),
156
+ release_ms=kwargs.get("limiter_release", 0.05),
157
+ )
158
+ board.append(limiter)
159
+ if kwargs.get("gain", False):
160
+ gain = Gain(gain_db=kwargs.get("gain_db", 0))
161
+ board.append(gain)
162
+ if kwargs.get("distortion", False):
163
+ distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25))
164
+ board.append(distortion)
165
+ if kwargs.get("chorus", False):
166
+ chorus = Chorus(
167
+ rate_hz=kwargs.get("chorus_rate", 1.0),
168
+ depth=kwargs.get("chorus_depth", 0.25),
169
+ centre_delay_ms=kwargs.get("chorus_delay", 7),
170
+ feedback=kwargs.get("chorus_feedback", 0.0),
171
+ mix=kwargs.get("chorus_mix", 0.5),
172
+ )
173
+ board.append(chorus)
174
+ if kwargs.get("bitcrush", False):
175
+ bitcrush = Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8))
176
+ board.append(bitcrush)
177
+ if kwargs.get("clipping", False):
178
+ clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0))
179
+ board.append(clipping)
180
+ if kwargs.get("compressor", False):
181
+ compressor = Compressor(
182
+ threshold_db=kwargs.get("compressor_threshold", 0),
183
+ ratio=kwargs.get("compressor_ratio", 1),
184
+ attack_ms=kwargs.get("compressor_attack", 1.0),
185
+ release_ms=kwargs.get("compressor_release", 100),
186
+ )
187
+ board.append(compressor)
188
+ if kwargs.get("delay", False):
189
+ delay = Delay(
190
+ delay_seconds=kwargs.get("delay_seconds", 0.5),
191
+ feedback=kwargs.get("delay_feedback", 0.0),
192
+ mix=kwargs.get("delay_mix", 0.5),
193
+ )
194
+ board.append(delay)
195
+ return board(audio_input, sample_rate)
196
+
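
`post_process_audio` only appends the effects whose flag is truthy, so the chain can be driven entirely by keyword arguments. An illustrative call (the values are arbitrary, not recommended settings; `pedalboard` must be installed):

```python
import numpy as np
from rvc.infer.infer import VoiceConverter

audio = np.zeros(48000, dtype=np.float32)  # one second of silence at 48 kHz
processed = VoiceConverter.post_process_audio(
    audio_input=audio,
    sample_rate=48000,
    reverb=True, reverb_room_size=0.3, reverb_wet_level=0.2,
    compressor=True, compressor_threshold=-18, compressor_ratio=4,
    gain=True, gain_db=3,
)
```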
197
+ def convert_audio(
198
+ self,
199
+ audio_input_path: str,
200
+ audio_output_path: str,
201
+ model_path: str,
202
+ index_path: str,
203
+ pitch: int = 0,
204
+ f0_file: str = None,
205
+ f0_method: str = "rmvpe",
206
+ index_rate: float = 0.75,
207
+ volume_envelope: float = 1,
208
+ protect: float = 0.5,
209
+ hop_length: int = 128,
210
+ split_audio: bool = False,
211
+ f0_autotune: bool = False,
212
+ f0_autotune_strength: float = 1,
213
+ filter_radius: int = 3,
214
+ embedder_model: str = "contentvec",
215
+ embedder_model_custom: str = None,
216
+ clean_audio: bool = False,
217
+ clean_strength: float = 0.5,
218
+ export_format: str = "WAV",
219
+ upscale_audio: bool = False,
220
+ post_process: bool = False,
221
+ resample_sr: int = 0,
222
+ sid: int = 0,
223
+ **kwargs,
224
+ ):
225
+ """
226
+ Performs voice conversion on the input audio.
227
+
228
+ Args:
229
+ pitch (int): Pitch shift applied to the F0 contour, in semitones.
230
+ filter_radius (int): Radius for filtering.
231
+ index_rate (float): Rate for index matching.
232
+ volume_envelope (int): RMS mix rate.
233
+ protect (float): Protection rate for certain audio segments.
234
+ hop_length (int): Hop length for audio processing.
235
+ f0_method (str): Method for F0 extraction.
236
+ audio_input_path (str): Path to the input audio file.
237
+ audio_output_path (str): Path to the output audio file.
238
+ model_path (str): Path to the voice conversion model.
239
+ index_path (str): Path to the index file.
240
+ split_audio (bool): Whether to split the audio for processing.
241
+ f0_autotune (bool): Whether to use F0 autotune.
242
+ clean_audio (bool): Whether to clean the audio.
243
+ clean_strength (float): Strength of the audio cleaning.
244
+ export_format (str): Format for exporting the audio.
245
+ upscale_audio (bool): Whether to upscale the audio.
246
+ f0_file (str): Path to the F0 file.
247
+ embedder_model (str): Path to the embedder model.
248
+ embedder_model_custom (str): Path to the custom embedder model.
249
+ resample_sr (int, optional): Resample sampling rate. Default is 0.
250
+ sid (int, optional): Speaker ID. Default is 0.
251
+ **kwargs: Additional keyword arguments.
252
+ """
253
+ self.get_vc(model_path, sid)
254
+ try:
255
+ start_time = time.time()
256
+ print(f"Converting audio '{audio_input_path}'...")
257
+
258
+ audio = load_audio_infer(
259
+ audio_input_path,
260
+ 16000,
261
+ **kwargs,
262
+ )
263
+ audio_max = np.abs(audio).max() / 0.95
264
+
265
+ if audio_max > 1:
266
+ audio /= audio_max
267
+
268
+ if not self.hubert_model or embedder_model != self.last_embedder_model:
269
+ self.load_hubert(embedder_model, embedder_model_custom)
270
+ self.last_embedder_model = embedder_model
271
+
272
+ file_index = (
273
+ index_path.strip()
274
+ .strip('"')
275
+ .strip("\n")
276
+ .strip('"')
277
+ .strip()
278
+ .replace("trained", "added")
279
+ )
280
+
281
+ if resample_sr >= 16000 and self.tgt_sr != resample_sr:
282
+ self.tgt_sr = resample_sr
283
+
284
+ if split_audio:
285
+ chunks, intervals = process_audio(audio, 16000)
286
+ print(f"Audio split into {len(chunks)} chunks for processing.")
287
+ else:
288
+ chunks = []
289
+ chunks.append(audio)
290
+
291
+ converted_chunks = []
292
+ for c in chunks:
293
+ audio_opt = self.vc.pipeline(
294
+ model=self.hubert_model,
295
+ net_g=self.net_g,
296
+ sid=sid,
297
+ audio=c,
298
+ pitch=pitch,
299
+ f0_method=f0_method,
300
+ file_index=file_index,
301
+ index_rate=index_rate,
302
+ pitch_guidance=self.use_f0,
303
+ filter_radius=filter_radius,
304
+ volume_envelope=volume_envelope,
305
+ version=self.version,
306
+ protect=protect,
307
+ hop_length=hop_length,
308
+ f0_autotune=f0_autotune,
309
+ f0_autotune_strength=f0_autotune_strength,
310
+ f0_file=f0_file,
311
+ )
312
+ converted_chunks.append(audio_opt)
313
+ if split_audio:
314
+ print(f"Converted audio chunk {len(converted_chunks)}")
315
+
316
+ if split_audio:
317
+ audio_opt = merge_audio(converted_chunks, intervals, 16000, self.tgt_sr)
318
+ else:
319
+ audio_opt = converted_chunks[0]
320
+
321
+ if clean_audio:
322
+ cleaned_audio = self.remove_audio_noise(
323
+ audio_opt, self.tgt_sr, clean_strength
324
+ )
325
+ if cleaned_audio is not None:
326
+ audio_opt = cleaned_audio
327
+
328
+ if post_process:
329
+ audio_opt = self.post_process_audio(
330
+ audio_input=audio_opt,
331
+ sample_rate=self.tgt_sr,
332
+ **kwargs,
333
+ )
334
+
335
+ sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
336
+ output_path_format = audio_output_path.replace(
337
+ ".wav", f".{export_format.lower()}"
338
+ )
339
+ audio_output_path = self.convert_audio_format(
340
+ audio_output_path, output_path_format, export_format
341
+ )
342
+
343
+ elapsed_time = time.time() - start_time
344
+ print(
345
+ f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
346
+ )
347
+ except Exception as error:
348
+ print(f"An error occurred during audio conversion: {error}")
349
+ print(traceback.format_exc())
350
+
351
+ def convert_audio_batch(
352
+ self,
353
+ audio_input_paths: str,
354
+ audio_output_path: str,
355
+ **kwargs,
356
+ ):
357
+ """
358
+ Performs voice conversion on a batch of input audio files.
359
+
360
+ Args:
361
+ audio_input_paths (str): List of paths to the input audio files.
362
+ audio_output_path (str): Path to the output audio file.
363
+ resample_sr (int, optional): Resample sampling rate. Default is 0.
364
+ sid (int, optional): Speaker ID. Default is 0.
365
+ **kwargs: Additional keyword arguments.
366
+ """
367
+ pid = os.getpid()
368
+ try:
369
+ with open(
370
+ os.path.join(now_dir, "assets", "infer_pid.txt"), "w"
371
+ ) as pid_file:
372
+ pid_file.write(str(pid))
373
+ start_time = time.time()
374
+ print(f"Converting audio batch '{audio_input_paths}'...")
375
+ audio_files = [
376
+ f
377
+ for f in os.listdir(audio_input_paths)
378
+ if f.endswith(
379
+ (
380
+ "wav",
381
+ "mp3",
382
+ "flac",
383
+ "ogg",
384
+ "opus",
385
+ "m4a",
386
+ "mp4",
387
+ "aac",
388
+ "alac",
389
+ "wma",
390
+ "aiff",
391
+ "webm",
392
+ "ac3",
393
+ )
394
+ )
395
+ ]
396
+ print(f"Detected {len(audio_files)} audio files for inference.")
397
+ for a in audio_files:
398
+ new_input = os.path.join(audio_input_paths, a)
399
+ new_output = os.path.splitext(a)[0] + "_output.wav"
400
+ new_output = os.path.join(audio_output_path, new_output)
401
+ if os.path.exists(new_output):
402
+ continue
403
+ self.convert_audio(
404
+ audio_input_path=new_input,
405
+ audio_output_path=new_output,
406
+ **kwargs,
407
+ )
408
+ print(f"Conversion completed at '{audio_input_paths}'.")
409
+ elapsed_time = time.time() - start_time
410
+ print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
411
+ except Exception as error:
412
+ print(f"An error occurred during audio batch conversion: {error}")
413
+ print(traceback.format_exc())
414
+ finally:
415
+ os.remove(os.path.join(now_dir, "assets", "infer_pid.txt"))
416
+
417
+ def get_vc(self, weight_root, sid):
418
+ """
419
+ Loads the voice conversion model and sets up the pipeline.
420
+
421
+ Args:
422
+ weight_root (str): Path to the model weights.
423
+ sid (int): Speaker ID.
424
+ """
425
+ if sid == "" or sid == []:
426
+ self.cleanup_model()
427
+ if torch.cuda.is_available():
428
+ torch.cuda.empty_cache()
429
+
430
+ if not self.loaded_model or self.loaded_model != weight_root:
431
+ self.load_model(weight_root)
432
+ if self.cpt is not None:
433
+ self.setup_network()
434
+ self.setup_vc_instance()
435
+ self.loaded_model = weight_root
436
+
437
+ def cleanup_model(self):
438
+ """
439
+ Cleans up the model and releases resources.
440
+ """
441
+ if self.hubert_model is not None:
442
+ del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
443
+ self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
444
+ if torch.cuda.is_available():
445
+ torch.cuda.empty_cache()
446
+
447
+ del self.net_g, self.cpt
448
+ if torch.cuda.is_available():
449
+ torch.cuda.empty_cache()
450
+ self.cpt = None
451
+
452
+ def load_model(self, weight_root):
453
+ """
454
+ Loads the model weights from the specified path.
455
+
456
+ Args:
457
+ weight_root (str): Path to the model weights.
458
+ """
459
+ self.cpt = (
460
+ torch.load(weight_root, map_location="cpu")
461
+ if os.path.isfile(weight_root)
462
+ else None
463
+ )
464
+
465
+ def setup_network(self):
466
+ """
467
+ Sets up the network configuration based on the loaded checkpoint.
468
+ """
469
+ if self.cpt is not None:
470
+ self.tgt_sr = self.cpt["config"][-1]
471
+ self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
472
+ self.use_f0 = self.cpt.get("f0", 1)
473
+
474
+ self.version = self.cpt.get("version", "v1")
475
+ self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
476
+ self.net_g = Synthesizer(
477
+ *self.cpt["config"],
478
+ use_f0=self.use_f0,
479
+ text_enc_hidden_dim=self.text_enc_hidden_dim,
480
+ is_half=self.config.is_half,
481
+ )
482
+ del self.net_g.enc_q
483
+ self.net_g.load_state_dict(self.cpt["weight"], strict=False)
484
+ self.net_g.eval().to(self.config.device)
485
+ self.net_g = (
486
+ self.net_g.half() if self.config.is_half else self.net_g.float()
487
+ )
488
+
489
+ def setup_vc_instance(self):
490
+ """
491
+ Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
492
+ """
493
+ if self.cpt is not None:
494
+ self.vc = VC(self.tgt_sr, self.config)
495
+ self.n_spk = self.cpt["config"][-3]
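
Typical use of the class boils down to constructing it once and calling `convert_audio`; model loading, HuBERT loading and the pipeline setup happen lazily inside. A sketch with placeholder paths (the .pth/.index files are hypothetical, and the pretrained predictors are assumed to be downloaded already):

```python
from rvc.infer.infer import VoiceConverter

converter = VoiceConverter()
converter.convert_audio(
    audio_input_path="assets/audios/input.wav",        # placeholder input file
    audio_output_path="assets/audios/output.wav",
    model_path="logs/my_model/my_model.pth",            # placeholder trained model
    index_path="logs/my_model/added_my_model.index",    # placeholder FAISS index
    pitch=0,
    f0_method="rmvpe",
    export_format="WAV",
)
```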
rvc/infer/pipeline.py ADDED
@@ -0,0 +1,708 @@
1
+ import os
2
+ import gc
3
+ import re
4
+ import sys
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import torchcrepe
8
+ import faiss
9
+ import librosa
10
+ import numpy as np
11
+ from scipy import signal
12
+ from torch import Tensor
13
+
14
+ now_dir = os.getcwd()
15
+ sys.path.append(now_dir)
16
+
17
+ from rvc.lib.predictors.RMVPE import RMVPE0Predictor
18
+ from rvc.lib.predictors.FCPE import FCPEF0Predictor
19
+
20
+ import logging
21
+
22
+ logging.getLogger("faiss").setLevel(logging.WARNING)
23
+
24
+ # Constants for high-pass filter
25
+ FILTER_ORDER = 5
26
+ CUTOFF_FREQUENCY = 48 # Hz
27
+ SAMPLE_RATE = 16000 # Hz
28
+ bh, ah = signal.butter(
29
+ N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
30
+ )
31
+
32
+ input_audio_path2wav = {}
33
+
34
+
35
+ class AudioProcessor:
36
+ """
37
+ A class for processing audio signals, specifically for adjusting RMS levels.
38
+ """
39
+
40
+ def change_rms(
41
+ source_audio: np.ndarray,
42
+ source_rate: int,
43
+ target_audio: np.ndarray,
44
+ target_rate: int,
45
+ rate: float,
46
+ ) -> np.ndarray:
47
+ """
48
+ Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.
49
+
50
+ Args:
51
+ source_audio: The source audio signal as a NumPy array.
52
+ source_rate: The sampling rate of the source audio.
53
+ target_audio: The target audio signal to adjust.
54
+ target_rate: The sampling rate of the target audio.
55
+ rate: The blending rate between the source and target RMS levels.
56
+ """
57
+ # Calculate RMS of both audio data
58
+ rms1 = librosa.feature.rms(
59
+ y=source_audio,
60
+ frame_length=source_rate // 2 * 2,
61
+ hop_length=source_rate // 2,
62
+ )
63
+ rms2 = librosa.feature.rms(
64
+ y=target_audio,
65
+ frame_length=target_rate // 2 * 2,
66
+ hop_length=target_rate // 2,
67
+ )
68
+
69
+ # Interpolate RMS to match target audio length
70
+ rms1 = F.interpolate(
71
+ torch.from_numpy(rms1).float().unsqueeze(0),
72
+ size=target_audio.shape[0],
73
+ mode="linear",
74
+ ).squeeze()
75
+ rms2 = F.interpolate(
76
+ torch.from_numpy(rms2).float().unsqueeze(0),
77
+ size=target_audio.shape[0],
78
+ mode="linear",
79
+ ).squeeze()
80
+ rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)
81
+
82
+ # Adjust target audio RMS based on the source audio RMS
83
+ adjusted_audio = (
84
+ target_audio
85
+ * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
86
+ )
87
+ return adjusted_audio
88
+
89
+
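
The blend in `change_rms` is multiplicative: the converted audio is scaled by `rms_source**(1 - rate) * rms_target**(rate - 1)` frame by frame, so `rate = 1` leaves it untouched and `rate = 0` matches it fully to the source envelope. A scalar sketch of the gain it produces:

```python
# Gain applied to the converted audio for one frame, as a function of the blend rate.
rms_source, rms_target = 0.20, 0.05
for rate in (1.0, 0.5, 0.0):
    gain = rms_source ** (1 - rate) * rms_target ** (rate - 1)
    print(rate, round(gain, 3))
# 1.0 -> 1.0 (unchanged), 0.5 -> 2.0 (geometric mean), 0.0 -> 4.0 (= rms_source / rms_target)
```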
90
+ class Autotune:
91
+ """
92
+ A class for applying autotune to a given fundamental frequency (F0) contour.
93
+ """
94
+
95
+ def __init__(self, ref_freqs):
96
+ """
97
+ Initializes the Autotune class with a set of reference frequencies.
98
+
99
+ Args:
100
+ ref_freqs: A list of reference frequencies representing musical notes.
101
+ """
102
+ self.ref_freqs = ref_freqs
103
+ self.note_dict = self.ref_freqs # No interpolation needed
104
+
105
+ def autotune_f0(self, f0, f0_autotune_strength):
106
+ """
107
+ Autotunes a given F0 contour by snapping each frequency to the closest reference frequency.
108
+
109
+ Args:
110
+ f0: The input F0 contour as a NumPy array.
111
+ """
112
+ autotuned_f0 = np.zeros_like(f0)
113
+ for i, freq in enumerate(f0):
114
+ closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
115
+ autotuned_f0[i] = freq + (closest_note - freq) * f0_autotune_strength
116
+ return autotuned_f0
117
+
118
+
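
Each F0 value is pulled toward the nearest reference note by `f0_autotune_strength`: 1.0 snaps it exactly onto the note, 0.5 moves it halfway. A small numeric sketch (assuming the module's dependencies are installed so the class can be imported):

```python
import numpy as np
from rvc.infer.pipeline import Autotune

notes = [220.0, 246.94]            # A3 and B3, a subset of the reference table above
at = Autotune(notes)
f0 = np.array([230.0])
print(at.autotune_f0(f0, 1.0))     # [220.] : hard snap to the closest note
print(at.autotune_f0(f0, 0.5))     # [225.] : halfway between 230 Hz and 220 Hz
```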
119
+ class Pipeline:
120
+ """
121
+ The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
122
+ voice conversion using a model, and post-processing.
123
+ """
124
+
125
+ def __init__(self, tgt_sr, config):
126
+ """
127
+ Initializes the Pipeline class with target sampling rate and configuration parameters.
128
+
129
+ Args:
130
+ tgt_sr: The target sampling rate for the output audio.
131
+ config: A configuration object containing various parameters for the pipeline.
132
+ """
133
+ self.x_pad = config.x_pad
134
+ self.x_query = config.x_query
135
+ self.x_center = config.x_center
136
+ self.x_max = config.x_max
137
+ self.is_half = config.is_half
138
+ self.sample_rate = 16000
139
+ self.window = 160
140
+ self.t_pad = self.sample_rate * self.x_pad
141
+ self.t_pad_tgt = tgt_sr * self.x_pad
142
+ self.t_pad2 = self.t_pad * 2
143
+ self.t_query = self.sample_rate * self.x_query
144
+ self.t_center = self.sample_rate * self.x_center
145
+ self.t_max = self.sample_rate * self.x_max
146
+ self.time_step = self.window / self.sample_rate * 1000
147
+ self.f0_min = 50
148
+ self.f0_max = 1100
149
+ self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
150
+ self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
151
+ self.device = config.device
152
+ self.ref_freqs = [
153
+ 49.00, # G1
154
+ 51.91, # G#1 / Ab1
155
+ 55.00, # A1
156
+ 58.27, # A#1 / Bb1
157
+ 61.74, # B1
158
+ 65.41, # C2
159
+ 69.30, # C#2 / Db2
160
+ 73.42, # D2
161
+ 77.78, # D#2 / Eb2
162
+ 82.41, # E2
163
+ 87.31, # F2
164
+ 92.50, # F#2 / Gb2
165
+ 98.00, # G2
166
+ 103.83, # G#2 / Ab2
167
+ 110.00, # A2
168
+ 116.54, # A#2 / Bb2
169
+ 123.47, # B2
170
+ 130.81, # C3
171
+ 138.59, # C#3 / Db3
172
+ 146.83, # D3
173
+ 155.56, # D#3 / Eb3
174
+ 164.81, # E3
175
+ 174.61, # F3
176
+ 185.00, # F#3 / Gb3
177
+ 196.00, # G3
178
+ 207.65, # G#3 / Ab3
179
+ 220.00, # A3
180
+ 233.08, # A#3 / Bb3
181
+ 246.94, # B3
182
+ 261.63, # C4
183
+ 277.18, # C#4 / Db4
184
+ 293.66, # D4
185
+ 311.13, # D#4 / Eb4
186
+ 329.63, # E4
187
+ 349.23, # F4
188
+ 369.99, # F#4 / Gb4
189
+ 392.00, # G4
190
+ 415.30, # G#4 / Ab4
191
+ 440.00, # A4
192
+ 466.16, # A#4 / Bb4
193
+ 493.88, # B4
194
+ 523.25, # C5
195
+ 554.37, # C#5 / Db5
196
+ 587.33, # D5
197
+ 622.25, # D#5 / Eb5
198
+ 659.25, # E5
199
+ 698.46, # F5
200
+ 739.99, # F#5 / Gb5
201
+ 783.99, # G5
202
+ 830.61, # G#5 / Ab5
203
+ 880.00, # A5
204
+ 932.33, # A#5 / Bb5
205
+ 987.77, # B5
206
+ 1046.50, # C6
207
+ ]
208
+ self.autotune = Autotune(self.ref_freqs)
209
+ self.note_dict = self.autotune.note_dict
210
+ self.model_rmvpe = RMVPE0Predictor(
211
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
212
+ is_half=self.is_half,
213
+ device=self.device,
214
+ )
215
+
216
+ def get_f0_crepe(
217
+ self,
218
+ x,
219
+ f0_min,
220
+ f0_max,
221
+ p_len,
222
+ hop_length,
223
+ model="full",
224
+ ):
225
+ """
226
+ Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.
227
+
228
+ Args:
229
+ x: The input audio signal as a NumPy array.
230
+ f0_min: Minimum F0 value to consider.
231
+ f0_max: Maximum F0 value to consider.
232
+ p_len: Desired length of the F0 output.
233
+ hop_length: Hop length for the Crepe model.
234
+ model: Crepe model size to use ("full" or "tiny").
235
+ """
236
+ x = x.astype(np.float32)
237
+ x /= np.quantile(np.abs(x), 0.999)
238
+ audio = torch.from_numpy(x).to(self.device, copy=True)
239
+ audio = torch.unsqueeze(audio, dim=0)
240
+ if audio.ndim == 2 and audio.shape[0] > 1:
241
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
242
+ audio = audio.detach()
243
+ pitch: Tensor = torchcrepe.predict(
244
+ audio,
245
+ self.sample_rate,
246
+ hop_length,
247
+ f0_min,
248
+ f0_max,
249
+ model,
250
+ batch_size=hop_length * 2,
251
+ device=self.device,
252
+ pad=True,
253
+ )
254
+ p_len = p_len or x.shape[0] // hop_length
255
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
256
+ source[source < 0.001] = np.nan
257
+ target = np.interp(
258
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
259
+ np.arange(0, len(source)),
260
+ source,
261
+ )
262
+ f0 = np.nan_to_num(target)
263
+ return f0
264
+
265
+ def get_f0_hybrid(
266
+ self,
267
+ methods_str,
268
+ x,
269
+ f0_min,
270
+ f0_max,
271
+ p_len,
272
+ hop_length,
273
+ ):
274
+ """
275
+ Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.
276
+
277
+ Args:
278
+ methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
279
+ x: The input audio signal as a NumPy array.
280
+ f0_min: Minimum F0 value to consider.
281
+ f0_max: Maximum F0 value to consider.
282
+ p_len: Desired length of the F0 output.
283
+ hop_length: Hop length for F0 estimation methods.
284
+ """
285
+ methods_str = re.search(r"hybrid\[(.+)\]", methods_str)
286
+ if methods_str:
287
+ methods = [method.strip() for method in methods_str.group(1).split("+")]
288
+ f0_computation_stack = []
289
+ print(f"Calculating f0 pitch estimations for methods: {', '.join(methods)}")
290
+ x = x.astype(np.float32)
291
+ x /= np.quantile(np.abs(x), 0.999)
292
+ for method in methods:
293
+ f0 = None
294
+ if method == "crepe":
295
+ f0 = self.get_f0_crepe(
296
+ x, f0_min, f0_max, p_len, int(hop_length)
297
+ )
298
+ elif method == "rmvpe":
299
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
300
+ f0 = f0[1:]
301
+ elif method == "fcpe":
302
+ self.model_fcpe = FCPEF0Predictor(
303
+ os.path.join("rvc", "models", "predictors", "fcpe.pt"),
304
+ f0_min=int(f0_min),
305
+ f0_max=int(f0_max),
306
+ dtype=torch.float32,
307
+ device=self.device,
308
+ sample_rate=self.sample_rate,
309
+ threshold=0.03,
310
+ )
311
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
312
+ del self.model_fcpe
313
+ gc.collect()
314
+ f0_computation_stack.append(f0)
315
+
316
+ f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
317
+ f0_median_hybrid = None
318
+ if len(f0_computation_stack) == 1:
319
+ f0_median_hybrid = f0_computation_stack[0]
320
+ else:
321
+ f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
322
+ return f0_median_hybrid
323
+
324
+ def get_f0(
325
+ self,
326
+ input_audio_path,
327
+ x,
328
+ p_len,
329
+ pitch,
330
+ f0_method,
331
+ filter_radius,
332
+ hop_length,
333
+ f0_autotune,
334
+ f0_autotune_strength,
335
+ inp_f0=None,
336
+ ):
337
+ """
338
+ Estimates the fundamental frequency (F0) of a given audio signal using various methods.
339
+
340
+ Args:
341
+ input_audio_path: Path to the input audio file.
342
+ x: The input audio signal as a NumPy array.
343
+ p_len: Desired length of the F0 output.
344
+ pitch: Key to adjust the pitch of the F0 contour.
345
+ f0_method: Method to use for F0 estimation (e.g., "crepe").
346
+ filter_radius: Radius for median filtering the F0 contour.
347
+ hop_length: Hop length for F0 estimation methods.
348
+ f0_autotune: Whether to apply autotune to the F0 contour.
349
+ inp_f0: Optional input F0 contour to use instead of estimating.
350
+ """
351
+ global input_audio_path2wav
352
+ if f0_method == "crepe":
353
+ f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
354
+ elif f0_method == "crepe-tiny":
355
+ f0 = self.get_f0_crepe(
356
+ x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
357
+ )
358
+ elif f0_method == "rmvpe":
359
+ f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
360
+ elif f0_method == "fcpe":
361
+ self.model_fcpe = FCPEF0Predictor(
362
+ os.path.join("rvc", "models", "predictors", "fcpe.pt"),
363
+ f0_min=int(self.f0_min),
364
+ f0_max=int(self.f0_max),
365
+ dtype=torch.float32,
366
+ device=self.device,
367
+ sample_rate=self.sample_rate,
368
+ threshold=0.03,
369
+ )
370
+ f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
371
+ del self.model_fcpe
372
+ gc.collect()
373
+ elif "hybrid" in f0_method:
374
+ input_audio_path2wav[input_audio_path] = x.astype(np.double)
375
+ f0 = self.get_f0_hybrid(
376
+ f0_method,
377
+ x,
378
+ self.f0_min,
379
+ self.f0_max,
380
+ p_len,
381
+ hop_length,
382
+ )
383
+
384
+ if f0_autotune is True:
385
+ f0 = self.autotune.autotune_f0(f0, f0_autotune_strength)
386
+
387
+ f0 *= pow(2, pitch / 12)
388
+ tf0 = self.sample_rate // self.window
389
+ if inp_f0 is not None:
390
+ delta_t = np.round(
391
+ (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
392
+ ).astype("int16")
393
+ replace_f0 = np.interp(
394
+ list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
395
+ )
396
+ shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
397
+ f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
398
+ :shape
399
+ ]
400
+ f0bak = f0.copy()
401
+ f0_mel = 1127 * np.log(1 + f0 / 700)
402
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
403
+ self.f0_mel_max - self.f0_mel_min
404
+ ) + 1
405
+ f0_mel[f0_mel <= 1] = 1
406
+ f0_mel[f0_mel > 255] = 255
407
+ f0_coarse = np.rint(f0_mel).astype(int)
408
+
409
+ return f0_coarse, f0bak
410
+
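
The returned `f0_coarse` is the pitch contour quantized onto 255 mel-spaced bins between `f0_min` = 50 Hz and `f0_max` = 1100 Hz (the quantized contour later passed as `pitch` to the synthesizer), while `f0bak` keeps the raw Hz values. A worked example of the mapping:

```python
import numpy as np

f0_min, f0_max = 50.0, 1100.0
mel_min = 1127 * np.log(1 + f0_min / 700)
mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = 440.0                                             # A4
mel = 1127 * np.log(1 + f0 / 700)
coarse = int(np.rint((mel - mel_min) * 254 / (mel_max - mel_min) + 1))
print(coarse)                                          # ~122 out of 255
```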
411
+ def voice_conversion(
412
+ self,
413
+ model,
414
+ net_g,
415
+ sid,
416
+ audio0,
417
+ pitch,
418
+ pitchf,
419
+ index,
420
+ big_npy,
421
+ index_rate,
422
+ version,
423
+ protect,
424
+ ):
425
+ """
426
+ Performs voice conversion on a given audio segment.
427
+
428
+ Args:
429
+ model: The feature extractor model.
430
+ net_g: The generative model for synthesizing speech.
431
+ sid: Speaker ID for the target voice.
432
+ audio0: The input audio segment.
433
+ pitch: Quantized F0 contour for pitch guidance.
434
+ pitchf: Original F0 contour for pitch guidance.
435
+ index: FAISS index for speaker embedding retrieval.
436
+ big_npy: Speaker embeddings stored in a NumPy array.
437
+ index_rate: Blending rate for speaker embedding retrieval.
438
+ version: Model version ("v1" or "v2").
439
+ protect: Protection level for preserving the original pitch.
440
+ """
441
+ with torch.no_grad():
442
+ pitch_guidance = pitch is not None and pitchf is not None
443
+ # prepare source audio
444
+ feats = (
445
+ torch.from_numpy(audio0).half()
446
+ if self.is_half
447
+ else torch.from_numpy(audio0).float()
448
+ )
449
+ feats = feats.mean(-1) if feats.dim() == 2 else feats
450
+ assert feats.dim() == 1, feats.dim()
451
+ feats = feats.view(1, -1).to(self.device)
452
+ # extract features
453
+ feats = model(feats)["last_hidden_state"]
454
+ feats = (
455
+ model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
456
+ )
457
+ # make a copy for pitch guidance and protection
458
+ feats0 = feats.clone() if pitch_guidance else None
459
+ if (
460
+ index
461
+ ): # set by parent function, only true if index is available, loaded, and index rate > 0
462
+ feats = self._retrieve_speaker_embeddings(
463
+ feats, index, big_npy, index_rate
464
+ )
465
+ # feature upsampling
466
+ feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
467
+ 0, 2, 1
468
+ )
469
+ # adjust the length if the audio is short
470
+ p_len = min(audio0.shape[0] // self.window, feats.shape[1])
471
+ if pitch_guidance:
472
+ feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
473
+ 0, 2, 1
474
+ )
475
+ pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
476
+ # Pitch protection blending
477
+ if protect < 0.5:
478
+ pitchff = pitchf.clone()
479
+ pitchff[pitchf > 0] = 1
480
+ pitchff[pitchf < 1] = protect
481
+ feats = feats * pitchff.unsqueeze(-1) + feats0 * (
482
+ 1 - pitchff.unsqueeze(-1)
483
+ )
484
+ feats = feats.to(feats0.dtype)
485
+ else:
486
+ pitch, pitchf = None, None
487
+ p_len = torch.tensor([p_len], device=self.device).long()
488
+ audio1 = (
489
+ (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
490
+ .data.cpu()
491
+ .float()
492
+ .numpy()
493
+ )
494
+ # clean up
495
+ del feats, feats0, p_len
496
+ if torch.cuda.is_available():
497
+ torch.cuda.empty_cache()
498
+ return audio1
499
+
500
+ def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
501
+ npy = feats[0].cpu().numpy()
502
+ npy = npy.astype("float32") if self.is_half else npy
503
+ score, ix = index.search(npy, k=8)
504
+ weight = np.square(1 / score)
505
+ weight /= weight.sum(axis=1, keepdims=True)
506
+ npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
507
+ npy = npy.astype("float16") if self.is_half else npy
508
+ feats = (
509
+ torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
510
+ + (1 - index_rate) * feats
511
+ )
512
+ return feats
513
+
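
The retrieval step replaces each frame's features with an inverse-squared-distance weighted average of its 8 nearest neighbours from the training-set index, then mixes the result back in by `index_rate`. A standalone sketch with random data standing in for a real `.index` file (768-dim features, as in v2 models):

```python
import faiss
import numpy as np

dim, n_train, n_frames, index_rate = 768, 1000, 25, 0.75
big_npy = np.random.rand(n_train, dim).astype("float32")
index = faiss.IndexFlatL2(dim)
index.add(big_npy)

feats = np.random.rand(n_frames, dim).astype("float32")
score, ix = index.search(feats, k=8)           # squared L2 distances and neighbour ids
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
retrieved = np.sum(big_npy[ix] * weight[..., None], axis=1)

blended = index_rate * retrieved + (1 - index_rate) * feats
```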
514
+ def pipeline(
515
+ self,
516
+ model,
517
+ net_g,
518
+ sid,
519
+ audio,
520
+ pitch,
521
+ f0_method,
522
+ file_index,
523
+ index_rate,
524
+ pitch_guidance,
525
+ filter_radius,
526
+ volume_envelope,
527
+ version,
528
+ protect,
529
+ hop_length,
530
+ f0_autotune,
531
+ f0_autotune_strength,
532
+ f0_file,
533
+ ):
534
+ """
535
+ The main pipeline function for performing voice conversion.
536
+
537
+ Args:
538
+ model: The feature extractor model.
539
+ net_g: The generative model for synthesizing speech.
540
+ sid: Speaker ID for the target voice.
541
+ audio: The input audio signal.
542
+ input_audio_path: Path to the input audio file.
543
+ pitch: Key to adjust the pitch of the F0 contour.
544
+ f0_method: Method to use for F0 estimation.
545
+ file_index: Path to the FAISS index file for speaker embedding retrieval.
546
+ index_rate: Blending rate for speaker embedding retrieval.
547
+ pitch_guidance: Whether to use pitch guidance during voice conversion.
548
+ filter_radius: Radius for median filtering the F0 contour.
549
+ tgt_sr: Target sampling rate for the output audio.
550
+ resample_sr: Resampling rate for the output audio.
551
+ volume_envelope: Blending rate for adjusting the RMS level of the output audio.
552
+ version: Model version.
553
+ protect: Protection level for preserving the original pitch.
554
+ hop_length: Hop length for F0 estimation methods.
555
+ f0_autotune: Whether to apply autotune to the F0 contour.
556
+ f0_file: Path to a file containing an F0 contour to use.
557
+ """
558
+ if file_index != "" and os.path.exists(file_index) and index_rate > 0:
559
+ try:
560
+ index = faiss.read_index(file_index)
561
+ big_npy = index.reconstruct_n(0, index.ntotal)
562
+ except Exception as error:
563
+ print(f"An error occurred reading the FAISS index: {error}")
564
+ index = big_npy = None
565
+ else:
566
+ index = big_npy = None
567
+ audio = signal.filtfilt(bh, ah, audio)
568
+ audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
569
+ opt_ts = []
570
+ if audio_pad.shape[0] > self.t_max:
571
+ audio_sum = np.zeros_like(audio)
572
+ for i in range(self.window):
573
+ audio_sum += audio_pad[i : i - self.window]
574
+ for t in range(self.t_center, audio.shape[0], self.t_center):
575
+ opt_ts.append(
576
+ t
577
+ - self.t_query
578
+ + np.where(
579
+ np.abs(audio_sum[t - self.t_query : t + self.t_query])
580
+ == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
581
+ )[0][0]
582
+ )
583
+ s = 0
584
+ audio_opt = []
585
+ t = None
586
+ audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
587
+ p_len = audio_pad.shape[0] // self.window
588
+ inp_f0 = None
589
+ if hasattr(f0_file, "name"):
590
+ try:
591
+ with open(f0_file.name, "r") as f:
592
+ lines = f.read().strip("\n").split("\n")
593
+ inp_f0 = []
594
+ for line in lines:
595
+ inp_f0.append([float(i) for i in line.split(",")])
596
+ inp_f0 = np.array(inp_f0, dtype="float32")
597
+ except Exception as error:
598
+ print(f"An error occurred reading the F0 file: {error}")
599
+ sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
600
+ if pitch_guidance:
601
+ pitch, pitchf = self.get_f0(
602
+ "input_audio_path", # questionable purpose of making a key for an array
603
+ audio_pad,
604
+ p_len,
605
+ pitch,
606
+ f0_method,
607
+ filter_radius,
608
+ hop_length,
609
+ f0_autotune,
610
+ f0_autotune_strength,
611
+ inp_f0,
612
+ )
613
+ pitch = pitch[:p_len]
614
+ pitchf = pitchf[:p_len]
615
+ if self.device == "mps":
616
+ pitchf = pitchf.astype(np.float32)
617
+ pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
618
+ pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
619
+ for t in opt_ts:
620
+ t = t // self.window * self.window
621
+ if pitch_guidance:
622
+ audio_opt.append(
623
+ self.voice_conversion(
624
+ model,
625
+ net_g,
626
+ sid,
627
+ audio_pad[s : t + self.t_pad2 + self.window],
628
+ pitch[:, s // self.window : (t + self.t_pad2) // self.window],
629
+ pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
630
+ index,
631
+ big_npy,
632
+ index_rate,
633
+ version,
634
+ protect,
635
+ )[self.t_pad_tgt : -self.t_pad_tgt]
636
+ )
637
+ else:
638
+ audio_opt.append(
639
+ self.voice_conversion(
640
+ model,
641
+ net_g,
642
+ sid,
643
+ audio_pad[s : t + self.t_pad2 + self.window],
644
+ None,
645
+ None,
646
+ index,
647
+ big_npy,
648
+ index_rate,
649
+ version,
650
+ protect,
651
+ )[self.t_pad_tgt : -self.t_pad_tgt]
652
+ )
653
+ s = t
654
+ if pitch_guidance:
655
+ audio_opt.append(
656
+ self.voice_conversion(
657
+ model,
658
+ net_g,
659
+ sid,
660
+ audio_pad[t:],
661
+ pitch[:, t // self.window :] if t is not None else pitch,
662
+ pitchf[:, t // self.window :] if t is not None else pitchf,
663
+ index,
664
+ big_npy,
665
+ index_rate,
666
+ version,
667
+ protect,
668
+ )[self.t_pad_tgt : -self.t_pad_tgt]
669
+ )
670
+ else:
671
+ audio_opt.append(
672
+ self.voice_conversion(
673
+ model,
674
+ net_g,
675
+ sid,
676
+ audio_pad[t:],
677
+ None,
678
+ None,
679
+ index,
680
+ big_npy,
681
+ index_rate,
682
+ version,
683
+ protect,
684
+ )[self.t_pad_tgt : -self.t_pad_tgt]
685
+ )
686
+ audio_opt = np.concatenate(audio_opt)
687
+ if volume_envelope != 1:
688
+ audio_opt = AudioProcessor.change_rms(
689
+ audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope
690
+ )
691
+ # if resample_sr >= self.sample_rate and tgt_sr != resample_sr:
692
+ # audio_opt = librosa.resample(
693
+ # audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
694
+ # )
695
+ # audio_max = np.abs(audio_opt).max() / 0.99
696
+ # max_int16 = 32768
697
+ # if audio_max > 1:
698
+ # max_int16 /= audio_max
699
+ # audio_opt = (audio_opt * 32768).astype(np.int16)
700
+ audio_max = np.abs(audio_opt).max() / 0.99
701
+ if audio_max > 1:
702
+ audio_opt /= audio_max
703
+ if pitch_guidance:
704
+ del pitch, pitchf
705
+ del sid
706
+ if torch.cuda.is_available():
707
+ torch.cuda.empty_cache()
708
+ return audio_opt
rvc/lib/algorithm/__init__.py ADDED
File without changes
rvc/lib/algorithm/attentions.py ADDED
@@ -0,0 +1,243 @@
1
+ import math
2
+ import torch
3
+ from rvc.lib.algorithm.commons import convert_pad_shape
4
+
5
+
6
+ class MultiHeadAttention(torch.nn.Module):
7
+ """
8
+ Multi-head attention module with optional relative positional encoding and proximal bias.
9
+
10
+ Args:
11
+ channels (int): Number of input channels.
12
+ out_channels (int): Number of output channels.
13
+ n_heads (int): Number of attention heads.
14
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
15
+ window_size (int, optional): Window size for relative positional encoding. Defaults to None.
16
+ heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
17
+ block_length (int, optional): Block length for local attention. Defaults to None.
18
+ proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
19
+ proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ channels,
25
+ out_channels,
26
+ n_heads,
27
+ p_dropout=0.0,
28
+ window_size=None,
29
+ heads_share=True,
30
+ block_length=None,
31
+ proximal_bias=False,
32
+ proximal_init=False,
33
+ ):
34
+ super().__init__()
35
+ assert (
36
+ channels % n_heads == 0
37
+ ), "Channels must be divisible by the number of heads."
38
+
39
+ self.channels = channels
40
+ self.out_channels = out_channels
41
+ self.n_heads = n_heads
42
+ self.k_channels = channels // n_heads
43
+ self.window_size = window_size
44
+ self.block_length = block_length
45
+ self.proximal_bias = proximal_bias
46
+
47
+ # Define projections
48
+ self.conv_q = torch.nn.Conv1d(channels, channels, 1)
49
+ self.conv_k = torch.nn.Conv1d(channels, channels, 1)
50
+ self.conv_v = torch.nn.Conv1d(channels, channels, 1)
51
+ self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)
52
+
53
+ self.drop = torch.nn.Dropout(p_dropout)
54
+
55
+ # Relative positional encodings
56
+ if window_size:
57
+ n_heads_rel = 1 if heads_share else n_heads
58
+ rel_stddev = self.k_channels**-0.5
59
+ self.emb_rel_k = torch.nn.Parameter(
60
+ torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
61
+ * rel_stddev
62
+ )
63
+ self.emb_rel_v = torch.nn.Parameter(
64
+ torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
65
+ * rel_stddev
66
+ )
67
+
68
+ # Initialize weights
69
+ torch.nn.init.xavier_uniform_(self.conv_q.weight)
70
+ torch.nn.init.xavier_uniform_(self.conv_k.weight)
71
+ torch.nn.init.xavier_uniform_(self.conv_v.weight)
72
+ torch.nn.init.xavier_uniform_(self.conv_o.weight)
73
+
74
+ if proximal_init:
75
+ with torch.no_grad():
76
+ self.conv_k.weight.copy_(self.conv_q.weight)
77
+ self.conv_k.bias.copy_(self.conv_q.bias)
78
+
79
+ def forward(self, x, c, attn_mask=None):
80
+ # Compute query, key, value projections
81
+ q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)
82
+
83
+ # Compute attention
84
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
85
+
86
+ # Final output projection
87
+ return self.conv_o(x)
88
+
89
+ def attention(self, query, key, value, mask=None):
90
+ # Reshape and compute scaled dot-product attention
91
+ b, d, t_s, t_t = (*key.size(), query.size(2))
92
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
93
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
94
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
95
+
96
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
97
+
98
+ if self.window_size:
99
+ assert t_s == t_t, "Relative attention only supports self-attention."
100
+ scores += self._compute_relative_scores(query, t_s)
101
+
102
+ if self.proximal_bias:
103
+ assert t_s == t_t, "Proximal bias only supports self-attention."
104
+ scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype)
105
+
106
+ if mask is not None:
107
+ scores = scores.masked_fill(mask == 0, -1e4)
108
+ if self.block_length:
109
+ block_mask = (
110
+ torch.ones_like(scores)
111
+ .triu(-self.block_length)
112
+ .tril(self.block_length)
113
+ )
114
+ scores = scores.masked_fill(block_mask == 0, -1e4)
115
+
116
+ # Apply softmax and dropout
117
+ p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1))
118
+
119
+ # Compute attention output
120
+ output = torch.matmul(p_attn, value)
121
+
122
+ if self.window_size:
123
+ output += self._apply_relative_values(p_attn, t_s)
124
+
125
+ return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn
126
+
127
+ def _compute_relative_scores(self, query, length):
128
+ rel_emb = self._get_relative_embeddings(self.emb_rel_k, length)
129
+ rel_logits = self._matmul_with_relative_keys(
130
+ query / math.sqrt(self.k_channels), rel_emb
131
+ )
132
+ return self._relative_position_to_absolute_position(rel_logits)
133
+
134
+ def _apply_relative_values(self, p_attn, length):
135
+ rel_weights = self._absolute_position_to_relative_position(p_attn)
136
+ rel_emb = self._get_relative_embeddings(self.emb_rel_v, length)
137
+ return self._matmul_with_relative_values(rel_weights, rel_emb)
138
+
139
+ # Helper methods
140
+ def _matmul_with_relative_values(self, x, y):
141
+ return torch.matmul(x, y.unsqueeze(0))
142
+
143
+ def _matmul_with_relative_keys(self, x, y):
144
+ return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
145
+
146
+ def _get_relative_embeddings(self, embeddings, length):
147
+ pad_length = max(length - (self.window_size + 1), 0)
148
+ start = max((self.window_size + 1) - length, 0)
149
+ end = start + 2 * length - 1
150
+
151
+ if pad_length > 0:
152
+ embeddings = torch.nn.functional.pad(
153
+ embeddings,
154
+ convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
155
+ )
156
+ return embeddings[:, start:end]
157
+
158
+ def _relative_position_to_absolute_position(self, x):
159
+ batch, heads, length, _ = x.size()
160
+ x = torch.nn.functional.pad(
161
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
162
+ )
163
+ x_flat = x.view(batch, heads, length * 2 * length)
164
+ x_flat = torch.nn.functional.pad(
165
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
166
+ )
167
+ return x_flat.view(batch, heads, length + 1, 2 * length - 1)[
168
+ :, :, :length, length - 1 :
169
+ ]
170
+
171
+ def _absolute_position_to_relative_position(self, x):
172
+ batch, heads, length, _ = x.size()
173
+ x = torch.nn.functional.pad(
174
+ x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
175
+ )
176
+ x_flat = x.view(batch, heads, length**2 + length * (length - 1))
177
+ x_flat = torch.nn.functional.pad(
178
+ x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
179
+ )
180
+ return x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
181
+
182
+ def _attention_bias_proximal(self, length):
183
+ r = torch.arange(length, dtype=torch.float32)
184
+ diff = r.unsqueeze(0) - r.unsqueeze(1)
185
+ return -torch.log1p(torch.abs(diff)).unsqueeze(0).unsqueeze(0)
186
+
187
+
188
+ class FFN(torch.nn.Module):
189
+ """
190
+ Feed-forward network module.
191
+
192
+ Args:
193
+ in_channels (int): Number of input channels.
194
+ out_channels (int): Number of output channels.
195
+ filter_channels (int): Number of filter channels in the convolution layers.
196
+ kernel_size (int): Kernel size of the convolution layers.
197
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
198
+ activation (str, optional): Activation function to use. Defaults to None.
199
+ causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
200
+ """
201
+
202
+ def __init__(
203
+ self,
204
+ in_channels,
205
+ out_channels,
206
+ filter_channels,
207
+ kernel_size,
208
+ p_dropout=0.0,
209
+ activation=None,
210
+ causal=False,
211
+ ):
212
+ super().__init__()
213
+ self.padding_fn = self._causal_padding if causal else self._same_padding
214
+
215
+ self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
216
+ self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
217
+ self.drop = torch.nn.Dropout(p_dropout)
218
+
219
+ self.activation = activation
220
+
221
+ def forward(self, x, x_mask):
222
+ x = self.conv_1(self.padding_fn(x * x_mask))
223
+ x = self._apply_activation(x)
224
+ x = self.drop(x)
225
+ x = self.conv_2(self.padding_fn(x * x_mask))
226
+ return x * x_mask
227
+
228
+ def _apply_activation(self, x):
229
+ if self.activation == "gelu":
230
+ return x * torch.sigmoid(1.702 * x)
231
+ return torch.relu(x)
232
+
233
+ def _causal_padding(self, x):
234
+ pad_l, pad_r = self.conv_1.kernel_size[0] - 1, 0
235
+ return torch.nn.functional.pad(
236
+ x, convert_pad_shape([[0, 0], [0, 0], [pad_l, pad_r]])
237
+ )
238
+
239
+ def _same_padding(self, x):
240
+ pad = (self.conv_1.kernel_size[0] - 1) // 2
241
+ return torch.nn.functional.pad(
242
+ x, convert_pad_shape([[0, 0], [0, 0], [pad, pad]])
243
+ )
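A minimal usage sketch for the `FFN` block defined above (not part of the commit; channel counts, sequence length, and dropout value are illustrative assumptions):

```python
import torch
from rvc.lib.algorithm.attentions import FFN

# Feed-forward block over a (batch, channels, time) sequence.
ffn = FFN(in_channels=192, out_channels=192, filter_channels=768,
          kernel_size=3, p_dropout=0.1)

x = torch.randn(2, 192, 100)       # hidden features
x_mask = torch.ones(2, 1, 100)     # 1 = valid frame, 0 = padding
y = ffn(x, x_mask)                 # same shape as x: (2, 192, 100)
```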
rvc/lib/algorithm/commons.py ADDED
@@ -0,0 +1,207 @@
1
+ import math
2
+ import torch
3
+ from typing import List, Optional
4
+
5
+
6
+ def init_weights(m, mean=0.0, std=0.01):
7
+ """
8
+ Initialize the weights of a module.
9
+
10
+ Args:
11
+ m: The module to initialize.
12
+ mean: The mean of the normal distribution.
13
+ std: The standard deviation of the normal distribution.
14
+ """
15
+ classname = m.__class__.__name__
16
+ if classname.find("Conv") != -1:
17
+ m.weight.data.normal_(mean, std)
18
+
19
+
20
+ def get_padding(kernel_size, dilation=1):
21
+ """
22
+ Calculate the padding needed for a convolution.
23
+
24
+ Args:
25
+ kernel_size: The size of the kernel.
26
+ dilation: The dilation of the convolution.
27
+ """
28
+ return int((kernel_size * dilation - dilation) / 2)
29
+
30
+
31
+ def convert_pad_shape(pad_shape):
32
+ """
33
+ Convert the pad shape to a list of integers.
34
+
35
+ Args:
36
+ pad_shape: The pad shape.
37
+ """
38
+ l = pad_shape[::-1]
39
+ pad_shape = [item for sublist in l for item in sublist]
40
+ return pad_shape
41
+
42
+
43
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
44
+ """
45
+ Calculate the KL divergence between two distributions.
46
+
47
+ Args:
48
+ m_p: The mean of the first distribution.
49
+ logs_p: The log of the standard deviation of the first distribution.
50
+ m_q: The mean of the second distribution.
51
+ logs_q: The log of the standard deviation of the second distribution.
52
+ """
53
+ kl = (logs_q - logs_p) - 0.5
54
+ kl += (
55
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
56
+ )
57
+ return kl
58
+
59
+
60
+ def slice_segments(
61
+ x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
62
+ ):
63
+ """
64
+ Slice segments from a tensor, handling tensors with different numbers of dimensions.
65
+
66
+ Args:
67
+ x (torch.Tensor): The tensor to slice.
68
+ ids_str (torch.Tensor): The starting indices of the segments.
69
+ segment_size (int, optional): The size of each segment. Defaults to 4.
70
+ dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2.
71
+ """
72
+ if dim == 2:
73
+ ret = torch.zeros_like(x[:, :segment_size])
74
+ elif dim == 3:
75
+ ret = torch.zeros_like(x[:, :, :segment_size])
76
+
77
+ for i in range(x.size(0)):
78
+ idx_str = ids_str[i].item()
79
+ idx_end = idx_str + segment_size
80
+ if dim == 2:
81
+ ret[i] = x[i, idx_str:idx_end]
82
+ else:
83
+ ret[i] = x[i, :, idx_str:idx_end]
84
+
85
+ return ret
86
+
87
+
88
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
89
+ """
90
+ Randomly slice segments from a tensor.
91
+
92
+ Args:
93
+ x: The tensor to slice.
94
+ x_lengths: The lengths of the sequences.
95
+ segment_size: The size of each segment.
96
+ """
97
+ b, d, t = x.size()
98
+ if x_lengths is None:
99
+ x_lengths = t
100
+ ids_str_max = x_lengths - segment_size + 1
101
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
102
+ ret = slice_segments(x, ids_str, segment_size, dim=3)
103
+ return ret, ids_str
104
+
105
+
106
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
107
+ """
108
+ Generate a 1D timing signal.
109
+
110
+ Args:
111
+ length: The length of the signal.
112
+ channels: The number of channels of the signal.
113
+ min_timescale: The minimum timescale.
114
+ max_timescale: The maximum timescale.
115
+ """
116
+ position = torch.arange(length, dtype=torch.float)
117
+ num_timescales = channels // 2
118
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
119
+ num_timescales - 1
120
+ )
121
+ inv_timescales = min_timescale * torch.exp(
122
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
123
+ )
124
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
125
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
126
+ signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2])
127
+ signal = signal.view(1, channels, length)
128
+ return signal
129
+
130
+
131
+ def subsequent_mask(length):
132
+ """
133
+ Generate a subsequent mask.
134
+
135
+ Args:
136
+ length: The length of the sequence.
137
+ """
138
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
139
+ return mask
140
+
141
+
142
+ @torch.jit.script
143
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
144
+ """
145
+ Fused add tanh sigmoid multiply operation.
146
+
147
+ Args:
148
+ input_a: The first input tensor.
149
+ input_b: The second input tensor.
150
+ n_channels: The number of channels.
151
+ """
152
+ n_channels_int = n_channels[0]
153
+ in_act = input_a + input_b
154
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
155
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
156
+ acts = t_act * s_act
157
+ return acts
158
+
159
+
160
+ def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
161
+ """
162
+ Convert the pad shape to a list of integers.
163
+
164
+ Args:
165
+ pad_shape: The pad shape.
166
+ """
167
+ return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
168
+
169
+
170
+ def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
171
+ """
172
+ Generate a sequence mask.
173
+
174
+ Args:
175
+ length: The lengths of the sequences.
176
+ max_length: The maximum length of the sequences.
177
+ """
178
+ if max_length is None:
179
+ max_length = length.max()
180
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
181
+ return x.unsqueeze(0) < length.unsqueeze(1)
182
+
183
+
184
+ def clip_grad_value(parameters, clip_value, norm_type=2):
185
+ """
186
+ Clip the gradients of a list of parameters.
187
+
188
+ Args:
189
+ parameters: The list of parameters to clip.
190
+ clip_value: The maximum value of the gradients.
191
+ norm_type: The type of norm to use for clipping.
192
+ """
193
+ if isinstance(parameters, torch.Tensor):
194
+ parameters = [parameters]
195
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
196
+ norm_type = float(norm_type)
197
+ if clip_value is not None:
198
+ clip_value = float(clip_value)
199
+
200
+ total_norm = 0
201
+ for p in parameters:
202
+ param_norm = p.grad.data.norm(norm_type)
203
+ total_norm += param_norm.item() ** norm_type
204
+ if clip_value is not None:
205
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
206
+ total_norm = total_norm ** (1.0 / norm_type)
207
+ return total_norm
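The helpers above are mostly shape utilities used by the encoders and the training loop. A small sketch of `sequence_mask` and `rand_slice_segments` (tensor sizes are illustrative assumptions, not taken from the commit):

```python
import torch
from rvc.lib.algorithm.commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([100, 80])                 # valid frames per batch item
mask = sequence_mask(lengths, max_length=100)     # (2, 100) boolean mask

x = torch.randn(2, 192, 100)                      # (batch, channels, time)
segments, ids = rand_slice_segments(x, lengths, segment_size=32)
# segments: (2, 192, 32) random crops, ids: start index of each crop
```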
rvc/lib/algorithm/discriminators.py ADDED
@@ -0,0 +1,160 @@
1
+ import torch
2
+ from torch.nn.utils.parametrizations import spectral_norm, weight_norm
3
+
4
+ from rvc.lib.algorithm.commons import get_padding
5
+ from rvc.lib.algorithm.residuals import LRELU_SLOPE
6
+
7
+
8
+ class MultiPeriodDiscriminator(torch.nn.Module):
9
+ """
10
+ Multi-period discriminator.
11
+
12
+ This class implements a multi-period discriminator, which is used to
13
+ discriminate between real and fake audio signals. The discriminator
14
+ is composed of a series of convolutional layers that are applied to
15
+ the input signal at different periods.
16
+
17
+ Args:
18
+ version (str): Model version ("v1" or "v2"); v1 uses periods [2, 3, 5, 7, 11, 17], v2 uses [2, 3, 5, 7, 11, 17, 23, 37].
19
+ use_spectral_norm (bool): Whether to use spectral normalization.
20
+ Defaults to False.
21
+ """
22
+
23
+ def __init__(self, version, use_spectral_norm=False):
24
+ super(MultiPeriodDiscriminator, self).__init__()
25
+ periods = (
26
+ [2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37]
27
+ )
28
+ self.discriminators = torch.nn.ModuleList(
29
+ [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
30
+ + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
31
+ )
32
+
33
+ def forward(self, y, y_hat):
34
+ """
35
+ Forward pass of the multi-period discriminator.
36
+
37
+ Args:
38
+ y (torch.Tensor): Real audio signal.
39
+ y_hat (torch.Tensor): Fake audio signal.
40
+ """
41
+ y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
42
+ for d in self.discriminators:
43
+ y_d_r, fmap_r = d(y)
44
+ y_d_g, fmap_g = d(y_hat)
45
+ y_d_rs.append(y_d_r)
46
+ y_d_gs.append(y_d_g)
47
+ fmap_rs.append(fmap_r)
48
+ fmap_gs.append(fmap_g)
49
+
50
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
51
+
52
+
53
+ class DiscriminatorS(torch.nn.Module):
54
+ """
55
+ Discriminator for the short-term component.
56
+
57
+ This class implements a discriminator for the short-term component
58
+ of the audio signal. The discriminator is composed of a series of
59
+ convolutional layers that are applied to the input signal.
60
+ """
61
+
62
+ def __init__(self, use_spectral_norm=False):
63
+ super(DiscriminatorS, self).__init__()
64
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
65
+ self.convs = torch.nn.ModuleList(
66
+ [
67
+ norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
68
+ norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
69
+ norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
70
+ norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
71
+ norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
72
+ norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
73
+ ]
74
+ )
75
+ self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
76
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)
77
+
78
+ def forward(self, x):
79
+ """
80
+ Forward pass of the discriminator.
81
+
82
+ Args:
83
+ x (torch.Tensor): Input audio signal.
84
+ """
85
+ fmap = []
86
+ for conv in self.convs:
87
+ x = self.lrelu(conv(x))
88
+ fmap.append(x)
89
+ x = self.conv_post(x)
90
+ fmap.append(x)
91
+ x = torch.flatten(x, 1, -1)
92
+ return x, fmap
93
+
94
+
95
+ class DiscriminatorP(torch.nn.Module):
96
+ """
97
+ Discriminator for the long-term component.
98
+
99
+ This class implements a discriminator for the long-term component
100
+ of the audio signal. The discriminator is composed of a series of
101
+ convolutional layers that are applied to the input signal at a given
102
+ period.
103
+
104
+ Args:
105
+ period (int): Period of the discriminator.
106
+ kernel_size (int): Kernel size of the convolutional layers.
107
+ Defaults to 5.
108
+ stride (int): Stride of the convolutional layers. Defaults to 3.
109
+ use_spectral_norm (bool): Whether to use spectral normalization.
110
+ Defaults to False.
111
+ """
112
+
113
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
114
+ super(DiscriminatorP, self).__init__()
115
+ self.period = period
116
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
117
+
118
+ in_channels = [1, 32, 128, 512, 1024]
119
+ out_channels = [32, 128, 512, 1024, 1024]
120
+
121
+ self.convs = torch.nn.ModuleList(
122
+ [
123
+ norm_f(
124
+ torch.nn.Conv2d(
125
+ in_ch,
126
+ out_ch,
127
+ (kernel_size, 1),
128
+ (stride, 1),
129
+ padding=(get_padding(kernel_size, 1), 0),
130
+ )
131
+ )
132
+ for in_ch, out_ch in zip(in_channels, out_channels)
133
+ ]
134
+ )
135
+
136
+ self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
137
+ self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)
138
+
139
+ def forward(self, x):
140
+ """
141
+ Forward pass of the discriminator.
142
+
143
+ Args:
144
+ x (torch.Tensor): Input audio signal.
145
+ """
146
+ fmap = []
147
+ b, c, t = x.shape
148
+ if t % self.period != 0:
149
+ n_pad = self.period - (t % self.period)
150
+ x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
151
+ x = x.view(b, c, -1, self.period)
152
+
153
+ for conv in self.convs:
154
+ x = self.lrelu(conv(x))
155
+ fmap.append(x)
156
+
157
+ x = self.conv_post(x)
158
+ fmap.append(x)
159
+ x = torch.flatten(x, 1, -1)
160
+ return x, fmap
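A hedged sketch of driving the multi-period discriminator (tensor sizes are assumptions; "v2" yields one scale discriminator plus eight period discriminators):

```python
import torch
from rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator

mpd = MultiPeriodDiscriminator("v2")
real = torch.randn(1, 1, 8192)     # (batch, 1, samples)
fake = torch.randn(1, 1, 8192)

y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(real, fake)
print(len(y_d_rs))                 # 9 = DiscriminatorS + 8 DiscriminatorP
```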
rvc/lib/algorithm/encoders.py ADDED
@@ -0,0 +1,218 @@
1
+ import math
2
+ import torch
3
+ from typing import Optional
4
+
5
+ from rvc.lib.algorithm.commons import sequence_mask
6
+ from rvc.lib.algorithm.modules import WaveNet
7
+ from rvc.lib.algorithm.normalization import LayerNorm
8
+ from rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
9
+
10
+
11
+ class Encoder(torch.nn.Module):
12
+ """
13
+ Encoder module for the Transformer model.
14
+
15
+ Args:
16
+ hidden_channels (int): Number of hidden channels in the encoder.
17
+ filter_channels (int): Number of filter channels in the feed-forward network.
18
+ n_heads (int): Number of attention heads.
19
+ n_layers (int): Number of encoder layers.
20
+ kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1.
21
+ p_dropout (float, optional): Dropout probability. Defaults to 0.0.
22
+ window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ hidden_channels,
28
+ filter_channels,
29
+ n_heads,
30
+ n_layers,
31
+ kernel_size=1,
32
+ p_dropout=0.0,
33
+ window_size=10,
34
+ ):
35
+ super().__init__()
36
+ self.hidden_channels = hidden_channels
37
+ self.filter_channels = filter_channels
38
+ self.n_heads = n_heads
39
+ self.n_layers = n_layers
40
+ self.kernel_size = kernel_size
41
+ self.p_dropout = p_dropout
42
+ self.window_size = window_size
43
+
44
+ self.drop = torch.nn.Dropout(p_dropout)
45
+ self.attn_layers = torch.nn.ModuleList()
46
+ self.norm_layers_1 = torch.nn.ModuleList()
47
+ self.ffn_layers = torch.nn.ModuleList()
48
+ self.norm_layers_2 = torch.nn.ModuleList()
49
+ for i in range(self.n_layers):
50
+ self.attn_layers.append(
51
+ MultiHeadAttention(
52
+ hidden_channels,
53
+ hidden_channels,
54
+ n_heads,
55
+ p_dropout=p_dropout,
56
+ window_size=window_size,
57
+ )
58
+ )
59
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
60
+ self.ffn_layers.append(
61
+ FFN(
62
+ hidden_channels,
63
+ hidden_channels,
64
+ filter_channels,
65
+ kernel_size,
66
+ p_dropout=p_dropout,
67
+ )
68
+ )
69
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
70
+
71
+ def forward(self, x, x_mask):
72
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
73
+ x = x * x_mask
74
+ for i in range(self.n_layers):
75
+ y = self.attn_layers[i](x, x, attn_mask)
76
+ y = self.drop(y)
77
+ x = self.norm_layers_1[i](x + y)
78
+
79
+ y = self.ffn_layers[i](x, x_mask)
80
+ y = self.drop(y)
81
+ x = self.norm_layers_2[i](x + y)
82
+ x = x * x_mask
83
+ return x
84
+
85
+
86
+ class TextEncoder(torch.nn.Module):
87
+ """Text Encoder with configurable embedding dimension.
88
+
89
+ Args:
90
+ out_channels (int): Output channels of the encoder.
91
+ hidden_channels (int): Hidden channels of the encoder.
92
+ filter_channels (int): Filter channels of the encoder.
93
+ n_heads (int): Number of attention heads.
94
+ n_layers (int): Number of encoder layers.
95
+ kernel_size (int): Kernel size of the convolutional layers.
96
+ p_dropout (float): Dropout probability.
97
+ embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
98
+ f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
99
+ """
100
+
101
+ def __init__(
102
+ self,
103
+ out_channels,
104
+ hidden_channels,
105
+ filter_channels,
106
+ n_heads,
107
+ n_layers,
108
+ kernel_size,
109
+ p_dropout,
110
+ embedding_dim,
111
+ f0=True,
112
+ ):
113
+ super(TextEncoder, self).__init__()
114
+ self.out_channels = out_channels
115
+ self.hidden_channels = hidden_channels
116
+ self.filter_channels = filter_channels
117
+ self.n_heads = n_heads
118
+ self.n_layers = n_layers
119
+ self.kernel_size = kernel_size
120
+ self.p_dropout = float(p_dropout)
121
+ self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
122
+ self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
123
+ if f0:
124
+ self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
125
+ self.encoder = Encoder(
126
+ hidden_channels,
127
+ filter_channels,
128
+ n_heads,
129
+ n_layers,
130
+ kernel_size,
131
+ float(p_dropout),
132
+ )
133
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
134
+
135
+ def forward(
136
+ self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
137
+ ):
138
+ if pitch is None:
139
+ x = self.emb_phone(phone)
140
+ else:
141
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
142
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
143
+ x = self.lrelu(x)
144
+ x = torch.transpose(x, 1, -1) # [b, h, t]
145
+ x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
146
+ x = self.encoder(x * x_mask, x_mask)
147
+ stats = self.proj(x) * x_mask
148
+
149
+ m, logs = torch.split(stats, self.out_channels, dim=1)
150
+ return m, logs, x_mask
151
+
152
+
153
+ class PosteriorEncoder(torch.nn.Module):
154
+ """Posterior Encoder for inferring latent representation.
155
+
156
+ Args:
157
+ in_channels (int): Number of channels in the input.
158
+ out_channels (int): Number of channels in the output.
159
+ hidden_channels (int): Number of hidden channels in the encoder.
160
+ kernel_size (int): Kernel size of the convolutional layers.
161
+ dilation_rate (int): Dilation rate of the convolutional layers.
162
+ n_layers (int): Number of layers in the encoder.
163
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
164
+ """
165
+
166
+ def __init__(
167
+ self,
168
+ in_channels,
169
+ out_channels,
170
+ hidden_channels,
171
+ kernel_size,
172
+ dilation_rate,
173
+ n_layers,
174
+ gin_channels=0,
175
+ ):
176
+ super(PosteriorEncoder, self).__init__()
177
+ self.in_channels = in_channels
178
+ self.out_channels = out_channels
179
+ self.hidden_channels = hidden_channels
180
+ self.kernel_size = kernel_size
181
+ self.dilation_rate = dilation_rate
182
+ self.n_layers = n_layers
183
+ self.gin_channels = gin_channels
184
+
185
+ self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
186
+ self.enc = WaveNet(
187
+ hidden_channels,
188
+ kernel_size,
189
+ dilation_rate,
190
+ n_layers,
191
+ gin_channels=gin_channels,
192
+ )
193
+ self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)
194
+
195
+ def forward(
196
+ self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
197
+ ):
198
+ x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
199
+ x = self.pre(x) * x_mask
200
+ x = self.enc(x, x_mask, g=g)
201
+ stats = self.proj(x) * x_mask
202
+ m, logs = torch.split(stats, self.out_channels, dim=1)
203
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
204
+ return z, m, logs, x_mask
205
+
206
+ def remove_weight_norm(self):
207
+ """Removes weight normalization from the encoder."""
208
+ self.enc.remove_weight_norm()
209
+
210
+ def __prepare_scriptable__(self):
211
+ """Prepares the module for scripting."""
212
+ for hook in self.enc._forward_pre_hooks.values():
213
+ if (
214
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
215
+ and hook.__class__.__name__ == "WeightNorm"
216
+ ):
217
+ torch.nn.utils.remove_weight_norm(self.enc)
218
+ return self
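A usage sketch for `TextEncoder`; the hyperparameters below mirror common RVC v2 settings but are assumptions for illustration only:

```python
import torch
from rvc.lib.algorithm.encoders import TextEncoder

enc = TextEncoder(
    out_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0,
    embedding_dim=768, f0=True,
)

phone = torch.randn(2, 100, 768)            # content features, e.g. HuBERT frames
pitch = torch.randint(0, 256, (2, 100))     # coarse pitch bins
lengths = torch.tensor([100, 80])

m, logs, x_mask = enc(phone, pitch, lengths)
# m, logs: (2, 192, 100) prior mean / log-std, x_mask: (2, 1, 100)
```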
rvc/lib/algorithm/generators.py ADDED
@@ -0,0 +1,231 @@
1
+ import torch
2
+ import numpy as np
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+ from typing import Optional
6
+
7
+ from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
8
+ from rvc.lib.algorithm.commons import init_weights
9
+
10
+
11
+ class Generator(torch.nn.Module):
12
+ """Generator for synthesizing audio.
13
+
14
+ Args:
15
+ initial_channel (int): Number of channels in the initial convolutional layer.
16
+ resblock (str): Type of residual block to use (1 or 2).
17
+ resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
18
+ resblock_dilation_sizes (list): Dilation rates of the residual blocks.
19
+ upsample_rates (list): Upsampling rates.
20
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
21
+ upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
22
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ initial_channel,
28
+ resblock,
29
+ resblock_kernel_sizes,
30
+ resblock_dilation_sizes,
31
+ upsample_rates,
32
+ upsample_initial_channel,
33
+ upsample_kernel_sizes,
34
+ gin_channels=0,
35
+ ):
36
+ super(Generator, self).__init__()
37
+ self.num_kernels = len(resblock_kernel_sizes)
38
+ self.num_upsamples = len(upsample_rates)
39
+ self.conv_pre = torch.nn.Conv1d(
40
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
41
+ )
42
+ resblock = ResBlock1 if resblock == "1" else ResBlock2
43
+
44
+ self.ups = torch.nn.ModuleList()
45
+ self.resblocks = torch.nn.ModuleList()
46
+
47
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
48
+ self.ups.append(
49
+ weight_norm(
50
+ torch.nn.ConvTranspose1d(
51
+ upsample_initial_channel // (2**i),
52
+ upsample_initial_channel // (2 ** (i + 1)),
53
+ k,
54
+ u,
55
+ padding=(k - u) // 2,
56
+ )
57
+ )
58
+ )
59
+ ch = upsample_initial_channel // (2 ** (i + 1))
60
+ for j, (k, d) in enumerate(
61
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
62
+ ):
63
+ self.resblocks.append(resblock(ch, k, d))
64
+
65
+ self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
66
+ self.ups.apply(init_weights)
67
+
68
+ if gin_channels != 0:
69
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
70
+
71
+ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
72
+ x = self.conv_pre(x)
73
+ if g is not None:
74
+ x = x + self.cond(g)
75
+
76
+ for i in range(self.num_upsamples):
77
+ x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
78
+ x = self.ups[i](x)
79
+ xs = None
80
+ for j in range(self.num_kernels):
81
+ if xs is None:
82
+ xs = self.resblocks[i * self.num_kernels + j](x)
83
+ else:
84
+ xs += self.resblocks[i * self.num_kernels + j](x)
85
+ x = xs / self.num_kernels
86
+
87
+ x = torch.nn.functional.leaky_relu(x)
88
+ x = self.conv_post(x)
89
+ x = torch.tanh(x)
90
+
91
+ return x
92
+
93
+ def __prepare_scriptable__(self):
94
+ """Prepares the module for scripting."""
95
+ for l in [*self.ups, *self.resblocks]:
96
+ for hook in l._forward_pre_hooks.values():
97
+ if (
98
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
99
+ and hook.__class__.__name__ == "WeightNorm"
100
+ ):
101
+ torch.nn.utils.remove_weight_norm(l)
102
+ return self
103
+
104
+ def remove_weight_norm(self):
105
+ """Removes weight normalization from the upsampling and residual blocks."""
106
+ for l in self.ups:
107
+ remove_weight_norm(l)
108
+ for l in self.resblocks:
109
+ l.remove_weight_norm()
110
+
111
+
112
+ class SineGenerator(torch.nn.Module):
113
+ """
114
+ A sine wave generator that synthesizes waveforms with optional harmonic overtones and noise.
115
+
116
+ Args:
117
+ sampling_rate (int): The sampling rate in Hz.
118
+ num_harmonics (int, optional): The number of harmonic overtones to include. Defaults to 0.
119
+ sine_amplitude (float, optional): The amplitude of the sine waveform. Defaults to 0.1.
120
+ noise_stddev (float, optional): The standard deviation of Gaussian noise. Defaults to 0.003.
121
+ voiced_threshold (float, optional): F0 threshold for distinguishing voiced/unvoiced frames. Defaults to 0.
122
+ """
123
+
124
+ def __init__(
125
+ self,
126
+ sampling_rate: int,
127
+ num_harmonics: int = 0,
128
+ sine_amplitude: float = 0.1,
129
+ noise_stddev: float = 0.003,
130
+ voiced_threshold: float = 0.0,
131
+ ):
132
+ super(SineGenerator, self).__init__()
133
+ self.sampling_rate = sampling_rate
134
+ self.num_harmonics = num_harmonics
135
+ self.sine_amplitude = sine_amplitude
136
+ self.noise_stddev = noise_stddev
137
+ self.voiced_threshold = voiced_threshold
138
+ self.waveform_dim = self.num_harmonics + 1 # fundamental + harmonics
139
+
140
+ def _compute_voiced_unvoiced(self, f0: torch.Tensor) -> torch.Tensor:
141
+ """
142
+ Generate a binary mask to indicate voiced/unvoiced frames.
143
+
144
+ Args:
145
+ f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length).
146
+ """
147
+ uv_mask = (f0 > self.voiced_threshold).float()
148
+ return uv_mask
149
+
150
+ def _generate_sine_wave(
151
+ self, f0: torch.Tensor, upsampling_factor: int
152
+ ) -> torch.Tensor:
153
+ """
154
+ Generate sine waves for the fundamental frequency and its harmonics.
155
+
156
+ Args:
157
+ f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1).
158
+ upsampling_factor (int): Upsampling factor.
159
+ """
160
+ batch_size, length, _ = f0.shape
161
+
162
+ # Create an upsampling grid
163
+ upsampling_grid = torch.arange(
164
+ 1, upsampling_factor + 1, dtype=f0.dtype, device=f0.device
165
+ )
166
+
167
+ # Calculate phase increments
168
+ phase_increments = (f0 / self.sampling_rate) * upsampling_grid
169
+ phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5
170
+ cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype)
171
+ phase_increments += torch.nn.functional.pad(
172
+ cumulative_phase, (0, 0, 1, 0), mode="constant"
173
+ )
174
+
175
+ # Reshape to match the sine wave shape
176
+ phase_increments = phase_increments.reshape(batch_size, -1, 1)
177
+
178
+ # Scale for harmonics
179
+ harmonic_scale = torch.arange(
180
+ 1, self.waveform_dim + 1, dtype=f0.dtype, device=f0.device
181
+ ).reshape(1, 1, -1)
182
+ phase_increments *= harmonic_scale
183
+
184
+ # Add random phase offset (except for the fundamental)
185
+ random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device)
186
+ random_phase[..., 0] = 0 # Fundamental frequency has no random offset
187
+ phase_increments += random_phase
188
+
189
+ # Generate sine waves
190
+ sine_waves = torch.sin(2 * np.pi * phase_increments)
191
+ return sine_waves
192
+
193
+ def forward(self, f0: torch.Tensor, upsampling_factor: int):
194
+ """
195
+ Forward pass to generate sine waveforms with noise and voiced/unvoiced masking.
196
+
197
+ Args:
198
+ f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1).
199
+ upsampling_factor (int): Upsampling factor.
200
+ """
201
+ with torch.no_grad():
202
+ # Expand `f0` to include waveform dimensions
203
+ f0 = f0.unsqueeze(-1)
204
+
205
+ # Generate sine waves
206
+ sine_waves = (
207
+ self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude
208
+ )
209
+
210
+ # Compute voiced/unvoiced mask
211
+ voiced_mask = self._compute_voiced_unvoiced(f0)
212
+
213
+ # Upsample voiced/unvoiced mask
214
+ voiced_mask = torch.nn.functional.interpolate(
215
+ voiced_mask.transpose(2, 1),
216
+ scale_factor=float(upsampling_factor),
217
+ mode="nearest",
218
+ ).transpose(2, 1)
219
+
220
+ # Compute noise amplitude
221
+ noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * (
222
+ self.sine_amplitude / 3
223
+ )
224
+
225
+ # Add Gaussian noise
226
+ noise = noise_amplitude * torch.randn_like(sine_waves)
227
+
228
+ # Combine sine waves and noise
229
+ sine_waveforms = sine_waves * voiced_mask + noise
230
+
231
+ return sine_waveforms, voiced_mask, noise
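A small sketch of `SineGenerator` on its own (sampling rate, F0 value, and upsampling factor are illustrative assumptions):

```python
import torch
from rvc.lib.algorithm.generators import SineGenerator

sine_gen = SineGenerator(sampling_rate=40000)
f0 = torch.full((1, 50), 220.0)                  # 50 frames of F0 at 220 Hz
sine, voiced_mask, noise = sine_gen(f0, 400)     # 400 samples per frame
print(sine.shape)                                # (1, 20000, 1) sample-level excitation
```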
rvc/lib/algorithm/modules.py ADDED
@@ -0,0 +1,124 @@
1
+ import torch
2
+ from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply
3
+
4
+
5
+ class WaveNet(torch.nn.Module):
6
+ """WaveNet residual blocks as used in WaveGlow.
7
+
8
+ Args:
9
+ hidden_channels (int): Number of hidden channels.
10
+ kernel_size (int): Size of the convolutional kernel.
11
+ dilation_rate (int): Dilation rate of the convolution.
12
+ n_layers (int): Number of convolutional layers.
13
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
14
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
15
+ """
16
+
17
+ def __init__(
18
+ self,
19
+ hidden_channels,
20
+ kernel_size,
21
+ dilation_rate,
22
+ n_layers,
23
+ gin_channels=0,
24
+ p_dropout=0,
25
+ ):
26
+ super().__init__()
27
+ assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding."
28
+
29
+ self.hidden_channels = hidden_channels
30
+ self.kernel_size = (kernel_size,)
31
+ self.dilation_rate = dilation_rate
32
+ self.n_layers = n_layers
33
+ self.gin_channels = gin_channels
34
+ self.p_dropout = p_dropout
35
+ self.n_channels_tensor = torch.IntTensor([hidden_channels]) # Static tensor
36
+
37
+ self.in_layers = torch.nn.ModuleList()
38
+ self.res_skip_layers = torch.nn.ModuleList()
39
+ self.drop = torch.nn.Dropout(p_dropout)
40
+
41
+ # Conditional layer for global conditioning
42
+ if gin_channels:
43
+ self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
44
+ torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
45
+ name="weight",
46
+ )
47
+
48
+ # Precompute dilations and paddings
49
+ dilations = [dilation_rate**i for i in range(n_layers)]
50
+ paddings = [(kernel_size * d - d) // 2 for d in dilations]
51
+
52
+ # Initialize layers
53
+ for i in range(n_layers):
54
+ self.in_layers.append(
55
+ torch.nn.utils.parametrizations.weight_norm(
56
+ torch.nn.Conv1d(
57
+ hidden_channels,
58
+ 2 * hidden_channels,
59
+ kernel_size,
60
+ dilation=dilations[i],
61
+ padding=paddings[i],
62
+ ),
63
+ name="weight",
64
+ )
65
+ )
66
+
67
+ res_skip_channels = (
68
+ hidden_channels if i == n_layers - 1 else 2 * hidden_channels
69
+ )
70
+ self.res_skip_layers.append(
71
+ torch.nn.utils.parametrizations.weight_norm(
72
+ torch.nn.Conv1d(hidden_channels, res_skip_channels, 1),
73
+ name="weight",
74
+ )
75
+ )
76
+
77
+ def forward(self, x, x_mask, g=None):
78
+ """Forward pass.
79
+
80
+ Args:
81
+ x (torch.Tensor): Input tensor (batch_size, hidden_channels, time_steps).
82
+ x_mask (torch.Tensor): Mask tensor (batch_size, 1, time_steps).
83
+ g (torch.Tensor, optional): Conditioning tensor (batch_size, gin_channels, time_steps).
84
+ """
85
+ output = x.clone().zero_()
86
+
87
+ # Apply conditional layer if global conditioning is provided
88
+ g = self.cond_layer(g) if g is not None else None
89
+
90
+ for i in range(self.n_layers):
91
+ x_in = self.in_layers[i](x)
92
+ g_l = (
93
+ g[
94
+ :,
95
+ i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels,
96
+ :,
97
+ ]
98
+ if g is not None
99
+ else 0
100
+ )
101
+
102
+ # Activation with fused Tanh-Sigmoid
103
+ acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor)
104
+ acts = self.drop(acts)
105
+
106
+ # Residual and skip connections
107
+ res_skip_acts = self.res_skip_layers[i](acts)
108
+ if i < self.n_layers - 1:
109
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
110
+ x = (x + res_acts) * x_mask
111
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
112
+ else:
113
+ output = output + res_skip_acts
114
+
115
+ return output * x_mask
116
+
117
+ def remove_weight_norm(self):
118
+ """Remove weight normalization from the module."""
119
+ if self.gin_channels:
120
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
121
+ for layer in self.in_layers:
122
+ torch.nn.utils.remove_weight_norm(layer)
123
+ for layer in self.res_skip_layers:
124
+ torch.nn.utils.remove_weight_norm(layer)
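A sketch of the `WaveNet` block with global conditioning (channel counts and layer count are assumptions):

```python
import torch
from rvc.lib.algorithm.modules import WaveNet

net = WaveNet(hidden_channels=192, kernel_size=5, dilation_rate=1,
              n_layers=16, gin_channels=256)

x = torch.randn(2, 192, 100)       # hidden features
x_mask = torch.ones(2, 1, 100)     # frame validity mask
g = torch.randn(2, 256, 1)         # global speaker embedding
y = net(x, x_mask, g=g)            # (2, 192, 100)
```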
rvc/lib/algorithm/normalization.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+
3
+
4
+ class LayerNorm(torch.nn.Module):
5
+ """Layer normalization module.
6
+
7
+ Args:
8
+ channels (int): Number of channels.
9
+ eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5.
10
+ """
11
+
12
+ def __init__(self, channels, eps=1e-5):
13
+ super().__init__()
14
+ self.eps = eps
15
+ self.gamma = torch.nn.Parameter(torch.ones(channels))
16
+ self.beta = torch.nn.Parameter(torch.zeros(channels))
17
+
18
+ def forward(self, x):
19
+ """Forward pass.
20
+
21
+ Args:
22
+ x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
23
+
24
+ """
25
+ # Transpose to (batch_size, time_steps, channels) for layer_norm
26
+ x = x.transpose(1, -1)
27
+ x = torch.nn.functional.layer_norm(
28
+ x, (x.size(-1),), self.gamma, self.beta, self.eps
29
+ )
30
+ # Transpose back to (batch_size, channels, time_steps)
31
+ return x.transpose(1, -1)
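A one-line sketch of the channel-wise `LayerNorm` (shapes are assumptions):

```python
import torch
from rvc.lib.algorithm.normalization import LayerNorm

norm = LayerNorm(channels=192)
x = torch.randn(2, 192, 100)       # (batch, channels, time)
y = norm(x)                        # normalized over channels, same shape
```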
rvc/lib/algorithm/nsf.py ADDED
@@ -0,0 +1,196 @@
1
+ import math
2
+ import torch
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+ from typing import Optional
6
+
7
+ from rvc.lib.algorithm.generators import SineGenerator
8
+ from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
9
+ from rvc.lib.algorithm.commons import init_weights
10
+
11
+
12
+ class SourceModuleHnNSF(torch.nn.Module):
13
+ """
14
+ Source Module for harmonic-plus-noise excitation.
15
+
16
+ Args:
17
+ sample_rate (int): Sampling rate in Hz.
18
+ harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
19
+ sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
20
+ add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
21
+ voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0.
22
+ is_half (bool, optional): Whether to use half precision. Defaults to True.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ sample_rate,
28
+ harmonic_num=0,
29
+ sine_amp=0.1,
30
+ add_noise_std=0.003,
31
+ voiced_threshod=0,
32
+ is_half=True,
33
+ ):
34
+ super(SourceModuleHnNSF, self).__init__()
35
+
36
+ self.sine_amp = sine_amp
37
+ self.noise_std = add_noise_std
38
+ self.is_half = is_half
39
+
40
+ self.l_sin_gen = SineGenerator(
41
+ sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
42
+ )
43
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
44
+ self.l_tanh = torch.nn.Tanh()
45
+
46
+ def forward(self, x: torch.Tensor, upsample_factor: int = 1):
47
+ sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor)
48
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
49
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
50
+ return sine_merge, None, None
51
+
52
+
53
+ class GeneratorNSF(torch.nn.Module):
54
+ """
55
+ Generator for synthesizing audio using the NSF (Neural Source Filter) approach.
56
+
57
+ Args:
58
+ initial_channel (int): Number of channels in the initial convolutional layer.
59
+ resblock (str): Type of residual block to use (1 or 2).
60
+ resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
61
+ resblock_dilation_sizes (list): Dilation rates of the residual blocks.
62
+ upsample_rates (list): Upsampling rates.
63
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
64
+ upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
65
+ gin_channels (int): Number of channels for the global conditioning input.
66
+ sr (int): Sampling rate.
67
+ is_half (bool, optional): Whether to use half precision. Defaults to False.
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ initial_channel,
73
+ resblock,
74
+ resblock_kernel_sizes,
75
+ resblock_dilation_sizes,
76
+ upsample_rates,
77
+ upsample_initial_channel,
78
+ upsample_kernel_sizes,
79
+ gin_channels,
80
+ sr,
81
+ is_half=False,
82
+ ):
83
+ super(GeneratorNSF, self).__init__()
84
+
85
+ self.num_kernels = len(resblock_kernel_sizes)
86
+ self.num_upsamples = len(upsample_rates)
87
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
88
+ self.m_source = SourceModuleHnNSF(
89
+ sample_rate=sr, harmonic_num=0, is_half=is_half
90
+ )
91
+
92
+ self.conv_pre = torch.nn.Conv1d(
93
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
94
+ )
95
+ resblock_cls = ResBlock1 if resblock == "1" else ResBlock2
96
+
97
+ self.ups = torch.nn.ModuleList()
98
+ self.noise_convs = torch.nn.ModuleList()
99
+
100
+ channels = [
101
+ upsample_initial_channel // (2 ** (i + 1))
102
+ for i in range(len(upsample_rates))
103
+ ]
104
+ stride_f0s = [
105
+ math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
106
+ for i in range(len(upsample_rates))
107
+ ]
108
+
109
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
110
+ self.ups.append(
111
+ weight_norm(
112
+ torch.nn.ConvTranspose1d(
113
+ upsample_initial_channel // (2**i),
114
+ channels[i],
115
+ k,
116
+ u,
117
+ padding=(k - u) // 2,
118
+ )
119
+ )
120
+ )
121
+
122
+ self.noise_convs.append(
123
+ torch.nn.Conv1d(
124
+ 1,
125
+ channels[i],
126
+ kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1),
127
+ stride=stride_f0s[i],
128
+ padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0),
129
+ )
130
+ )
131
+
132
+ self.resblocks = torch.nn.ModuleList(
133
+ [
134
+ resblock_cls(channels[i], k, d)
135
+ for i in range(len(self.ups))
136
+ for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)
137
+ ]
138
+ )
139
+
140
+ self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
141
+ self.ups.apply(init_weights)
142
+
143
+ if gin_channels != 0:
144
+ self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
145
+
146
+ self.upp = math.prod(upsample_rates)
147
+ self.lrelu_slope = LRELU_SLOPE
148
+
149
+ def forward(self, x, f0, g: Optional[torch.Tensor] = None):
150
+ har_source, _, _ = self.m_source(f0, self.upp)
151
+ har_source = har_source.transpose(1, 2)
152
+ x = self.conv_pre(x)
153
+
154
+ if g is not None:
155
+ x = x + self.cond(g)
156
+
157
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
158
+ x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
159
+ x = ups(x)
160
+ x = x + noise_convs(har_source)
161
+
162
+ xs = sum(
163
+ [
164
+ resblock(x)
165
+ for j, resblock in enumerate(self.resblocks)
166
+ if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)
167
+ ]
168
+ )
169
+ x = xs / self.num_kernels
170
+
171
+ x = torch.nn.functional.leaky_relu(x)
172
+ x = torch.tanh(self.conv_post(x))
173
+ return x
174
+
175
+ def remove_weight_norm(self):
176
+ for l in self.ups:
177
+ remove_weight_norm(l)
178
+ for l in self.resblocks:
179
+ l.remove_weight_norm()
180
+
181
+ def __prepare_scriptable__(self):
182
+ for l in self.ups:
183
+ for hook in l._forward_pre_hooks.values():
184
+ if (
185
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
186
+ and hook.__class__.__name__ == "WeightNorm"
187
+ ):
188
+ remove_weight_norm(l)
189
+ for l in self.resblocks:
190
+ for hook in l._forward_pre_hooks.values():
191
+ if (
192
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
193
+ and hook.__class__.__name__ == "WeightNorm"
194
+ ):
195
+ remove_weight_norm(l)
196
+ return self
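A sketch of the harmonic source module used by the NSF generator (sampling rate and F0 values are assumptions):

```python
import torch
from rvc.lib.algorithm.nsf import SourceModuleHnNSF

source = SourceModuleHnNSF(sample_rate=40000, harmonic_num=0, is_half=False)
f0 = torch.full((1, 50), 220.0)            # frame-level F0 in Hz
har_source, _, _ = source(f0, 400)         # upsample 400x to sample rate
print(har_source.shape)                    # (1, 20000, 1) excitation signal
```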
rvc/lib/algorithm/residuals.py ADDED
@@ -0,0 +1,250 @@
1
+ from typing import Optional
2
+ import torch
3
+ from torch.nn.utils import remove_weight_norm
4
+ from torch.nn.utils.parametrizations import weight_norm
5
+
6
+ from rvc.lib.algorithm.modules import WaveNet
7
+ from rvc.lib.algorithm.commons import get_padding, init_weights
8
+
9
+ LRELU_SLOPE = 0.1
10
+
11
+
12
+ def create_conv1d_layer(channels, kernel_size, dilation):
13
+ return weight_norm(
14
+ torch.nn.Conv1d(
15
+ channels,
16
+ channels,
17
+ kernel_size,
18
+ 1,
19
+ dilation=dilation,
20
+ padding=get_padding(kernel_size, dilation),
21
+ )
22
+ )
23
+
24
+
25
+ def apply_mask(tensor, mask):
26
+ return tensor * mask if mask is not None else tensor
27
+
28
+
29
+ class ResBlockBase(torch.nn.Module):
30
+ def __init__(self, channels, kernel_size, dilations):
31
+ super(ResBlockBase, self).__init__()
32
+ self.convs1 = torch.nn.ModuleList(
33
+ [create_conv1d_layer(channels, kernel_size, d) for d in dilations]
34
+ )
35
+ self.convs1.apply(init_weights)
36
+
37
+ self.convs2 = torch.nn.ModuleList(
38
+ [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]
39
+ )
40
+ self.convs2.apply(init_weights)
41
+
42
+ def forward(self, x, x_mask=None):
43
+ for c1, c2 in zip(self.convs1, self.convs2):
44
+ xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
45
+ xt = apply_mask(xt, x_mask)
46
+ xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE)
47
+ xt = apply_mask(xt, x_mask)
48
+ xt = c2(xt)
49
+ x = xt + x
50
+ return apply_mask(x, x_mask)
51
+
52
+ def remove_weight_norm(self):
53
+ for conv in self.convs1 + self.convs2:
54
+ remove_weight_norm(conv)
55
+
56
+
57
+ class ResBlock1(ResBlockBase):
58
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
59
+ super(ResBlock1, self).__init__(channels, kernel_size, dilation)
60
+
61
+
62
+ class ResBlock2(ResBlockBase):
63
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
64
+ super(ResBlock2, self).__init__(channels, kernel_size, dilation)
65
+
66
+
67
+ class Flip(torch.nn.Module):
68
+ """Flip module for flow-based models.
69
+
70
+ This module flips the input along the time dimension.
71
+ """
72
+
73
+ def forward(self, x, *args, reverse=False, **kwargs):
74
+ """Forward pass.
75
+
76
+ Args:
77
+ x (torch.Tensor): Input tensor.
78
+ reverse (bool, optional): Whether to reverse the operation. Defaults to False.
79
+ """
80
+ x = torch.flip(x, [1])
81
+ if not reverse:
82
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
83
+ return x, logdet
84
+ else:
85
+ return x
86
+
87
+
88
+ class ResidualCouplingBlock(torch.nn.Module):
89
+ """Residual Coupling Block for normalizing flow.
90
+
91
+ Args:
92
+ channels (int): Number of channels in the input.
93
+ hidden_channels (int): Number of hidden channels in the coupling layer.
94
+ kernel_size (int): Kernel size of the convolutional layers.
95
+ dilation_rate (int): Dilation rate of the convolutional layers.
96
+ n_layers (int): Number of layers in the coupling layer.
97
+ n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
98
+ gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
99
+ """
100
+
101
+ def __init__(
102
+ self,
103
+ channels,
104
+ hidden_channels,
105
+ kernel_size,
106
+ dilation_rate,
107
+ n_layers,
108
+ n_flows=4,
109
+ gin_channels=0,
110
+ ):
111
+ super(ResidualCouplingBlock, self).__init__()
112
+ self.channels = channels
113
+ self.hidden_channels = hidden_channels
114
+ self.kernel_size = kernel_size
115
+ self.dilation_rate = dilation_rate
116
+ self.n_layers = n_layers
117
+ self.n_flows = n_flows
118
+ self.gin_channels = gin_channels
119
+
120
+ self.flows = torch.nn.ModuleList()
121
+ for i in range(n_flows):
122
+ self.flows.append(
123
+ ResidualCouplingLayer(
124
+ channels,
125
+ hidden_channels,
126
+ kernel_size,
127
+ dilation_rate,
128
+ n_layers,
129
+ gin_channels=gin_channels,
130
+ mean_only=True,
131
+ )
132
+ )
133
+ self.flows.append(Flip())
134
+
135
+ def forward(
136
+ self,
137
+ x: torch.Tensor,
138
+ x_mask: torch.Tensor,
139
+ g: Optional[torch.Tensor] = None,
140
+ reverse: bool = False,
141
+ ):
142
+ if not reverse:
143
+ for flow in self.flows:
144
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
145
+ else:
146
+ for flow in reversed(self.flows):
147
+ x = flow.forward(x, x_mask, g=g, reverse=reverse)
148
+ return x
149
+
150
+ def remove_weight_norm(self):
151
+ """Removes weight normalization from the coupling layers."""
152
+ for i in range(self.n_flows):
153
+ self.flows[i * 2].remove_weight_norm()
154
+
155
+ def __prepare_scriptable__(self):
156
+ """Prepares the module for scripting."""
157
+ for i in range(self.n_flows):
158
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
159
+ if (
160
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
161
+ and hook.__class__.__name__ == "WeightNorm"
162
+ ):
163
+ torch.nn.utils.remove_weight_norm(self.flows[i * 2])
164
+
165
+ return self
166
+
167
+
168
+ class ResidualCouplingLayer(torch.nn.Module):
169
+ """Residual coupling layer for flow-based models.
170
+
171
+ Args:
172
+ channels (int): Number of channels.
173
+ hidden_channels (int): Number of hidden channels.
174
+ kernel_size (int): Size of the convolutional kernel.
175
+ dilation_rate (int): Dilation rate of the convolution.
176
+ n_layers (int): Number of convolutional layers.
177
+ p_dropout (float, optional): Dropout probability. Defaults to 0.
178
+ gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
179
+ mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
180
+ """
181
+
182
+ def __init__(
183
+ self,
184
+ channels,
185
+ hidden_channels,
186
+ kernel_size,
187
+ dilation_rate,
188
+ n_layers,
189
+ p_dropout=0,
190
+ gin_channels=0,
191
+ mean_only=False,
192
+ ):
193
+ assert channels % 2 == 0, "channels should be divisible by 2"
194
+ super().__init__()
195
+ self.channels = channels
196
+ self.hidden_channels = hidden_channels
197
+ self.kernel_size = kernel_size
198
+ self.dilation_rate = dilation_rate
199
+ self.n_layers = n_layers
200
+ self.half_channels = channels // 2
201
+ self.mean_only = mean_only
202
+
203
+ self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
204
+ self.enc = WaveNet(
205
+ hidden_channels,
206
+ kernel_size,
207
+ dilation_rate,
208
+ n_layers,
209
+ p_dropout=p_dropout,
210
+ gin_channels=gin_channels,
211
+ )
212
+ self.post = torch.nn.Conv1d(
213
+ hidden_channels, self.half_channels * (2 - mean_only), 1
214
+ )
215
+ self.post.weight.data.zero_()
216
+ self.post.bias.data.zero_()
217
+
218
+ def forward(self, x, x_mask, g=None, reverse=False):
219
+ """Forward pass.
220
+
221
+ Args:
222
+ x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
223
+ x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
224
+ g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
225
+ Defaults to None.
226
+ reverse (bool, optional): Whether to reverse the operation. Defaults to False.
227
+ """
228
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
229
+ h = self.pre(x0) * x_mask
230
+ h = self.enc(h, x_mask, g=g)
231
+ stats = self.post(h) * x_mask
232
+ if not self.mean_only:
233
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
234
+ else:
235
+ m = stats
236
+ logs = torch.zeros_like(m)
237
+
238
+ if not reverse:
239
+ x1 = m + x1 * torch.exp(logs) * x_mask
240
+ x = torch.cat([x0, x1], 1)
241
+ logdet = torch.sum(logs, [1, 2])
242
+ return x, logdet
243
+ else:
244
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
245
+ x = torch.cat([x0, x1], 1)
246
+ return x
247
+
248
+ def remove_weight_norm(self):
249
+ """Remove weight normalization from the module."""
250
+ self.enc.remove_weight_norm()
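A sketch of running the coupling flow in both directions (channel counts and conditioning size are assumptions); the reverse call inverts the forward transformation:

```python
import torch
from rvc.lib.algorithm.residuals import ResidualCouplingBlock

flow = ResidualCouplingBlock(channels=192, hidden_channels=192, kernel_size=5,
                             dilation_rate=1, n_layers=3, gin_channels=256)

z = torch.randn(2, 192, 100)
x_mask = torch.ones(2, 1, 100)
g = torch.randn(2, 256, 1)         # speaker conditioning

z_p = flow(z, x_mask, g=g)                       # forward direction
z_back = flow(z_p, x_mask, g=g, reverse=True)    # inverse direction
```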
rvc/lib/algorithm/synthesizers.py ADDED
@@ -0,0 +1,237 @@
1
+ import torch
2
+ from typing import Optional
3
+
4
+ from rvc.lib.algorithm.nsf import GeneratorNSF
5
+ from rvc.lib.algorithm.generators import Generator
6
+ from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments
7
+ from rvc.lib.algorithm.residuals import ResidualCouplingBlock
8
+ from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder
9
+
10
+
11
+ class Synthesizer(torch.nn.Module):
12
+ """
13
+ Base Synthesizer model.
14
+
15
+ Args:
16
+ spec_channels (int): Number of channels in the spectrogram.
17
+ segment_size (int): Size of the audio segment.
18
+ inter_channels (int): Number of channels in the intermediate layers.
19
+ hidden_channels (int): Number of channels in the hidden layers.
20
+ filter_channels (int): Number of channels in the filter layers.
21
+ n_heads (int): Number of attention heads.
22
+ n_layers (int): Number of layers in the encoder.
23
+ kernel_size (int): Size of the convolution kernel.
24
+ p_dropout (float): Dropout probability.
25
+ resblock (str): Type of residual block.
26
+ resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
27
+ resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
28
+ upsample_rates (list): Upsampling rates for the decoder.
29
+ upsample_initial_channel (int): Number of channels in the initial upsampling layer.
30
+ upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
31
+ spk_embed_dim (int): Dimension of the speaker embedding.
32
+ gin_channels (int): Number of channels in the global conditioning vector.
33
+ sr (int): Sampling rate of the audio.
34
+ use_f0 (bool): Whether to use F0 information.
35
+ text_enc_hidden_dim (int): Hidden dimension for the text encoder.
36
+ kwargs: Additional keyword arguments.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ spec_channels,
42
+ segment_size,
43
+ inter_channels,
44
+ hidden_channels,
45
+ filter_channels,
46
+ n_heads,
47
+ n_layers,
48
+ kernel_size,
49
+ p_dropout,
50
+ resblock,
51
+ resblock_kernel_sizes,
52
+ resblock_dilation_sizes,
53
+ upsample_rates,
54
+ upsample_initial_channel,
55
+ upsample_kernel_sizes,
56
+ spk_embed_dim,
57
+ gin_channels,
58
+ sr,
59
+ use_f0,
60
+ text_enc_hidden_dim=768,
61
+ **kwargs
62
+ ):
63
+ super(Synthesizer, self).__init__()
64
+ self.spec_channels = spec_channels
65
+ self.inter_channels = inter_channels
66
+ self.hidden_channels = hidden_channels
67
+ self.filter_channels = filter_channels
68
+ self.n_heads = n_heads
69
+ self.n_layers = n_layers
70
+ self.kernel_size = kernel_size
71
+ self.p_dropout = float(p_dropout)
72
+ self.resblock = resblock
73
+ self.resblock_kernel_sizes = resblock_kernel_sizes
74
+ self.resblock_dilation_sizes = resblock_dilation_sizes
75
+ self.upsample_rates = upsample_rates
76
+ self.upsample_initial_channel = upsample_initial_channel
77
+ self.upsample_kernel_sizes = upsample_kernel_sizes
78
+ self.segment_size = segment_size
79
+ self.gin_channels = gin_channels
80
+ self.spk_embed_dim = spk_embed_dim
81
+ self.use_f0 = use_f0
82
+
83
+ self.enc_p = TextEncoder(
84
+ inter_channels,
85
+ hidden_channels,
86
+ filter_channels,
87
+ n_heads,
88
+ n_layers,
89
+ kernel_size,
90
+ float(p_dropout),
91
+ text_enc_hidden_dim,
92
+ f0=use_f0,
93
+ )
94
+
95
+ if use_f0:
96
+ self.dec = GeneratorNSF(
97
+ inter_channels,
98
+ resblock,
99
+ resblock_kernel_sizes,
100
+ resblock_dilation_sizes,
101
+ upsample_rates,
102
+ upsample_initial_channel,
103
+ upsample_kernel_sizes,
104
+ gin_channels=gin_channels,
105
+ sr=sr,
106
+ is_half=kwargs["is_half"],
107
+ )
108
+ else:
109
+ self.dec = Generator(
110
+ inter_channels,
111
+ resblock,
112
+ resblock_kernel_sizes,
113
+ resblock_dilation_sizes,
114
+ upsample_rates,
115
+ upsample_initial_channel,
116
+ upsample_kernel_sizes,
117
+ gin_channels=gin_channels,
118
+ )
119
+
120
+ self.enc_q = PosteriorEncoder(
121
+ spec_channels,
122
+ inter_channels,
123
+ hidden_channels,
124
+ 5,
125
+ 1,
126
+ 16,
127
+ gin_channels=gin_channels,
128
+ )
129
+ self.flow = ResidualCouplingBlock(
130
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
131
+ )
132
+ self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels)
133
+
134
+ def remove_weight_norm(self):
135
+ """Removes weight normalization from the model."""
136
+ self.dec.remove_weight_norm()
137
+ self.flow.remove_weight_norm()
138
+ self.enc_q.remove_weight_norm()
139
+
140
+ def __prepare_scriptable__(self):
141
+ for hook in self.dec._forward_pre_hooks.values():
142
+ if (
143
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
144
+ and hook.__class__.__name__ == "WeightNorm"
145
+ ):
146
+ torch.nn.utils.remove_weight_norm(self.dec)
147
+ for hook in self.flow._forward_pre_hooks.values():
148
+ if (
149
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
150
+ and hook.__class__.__name__ == "WeightNorm"
151
+ ):
152
+ torch.nn.utils.remove_weight_norm(self.flow)
153
+ if hasattr(self, "enc_q"):
154
+ for hook in self.enc_q._forward_pre_hooks.values():
155
+ if (
156
+ hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
157
+ and hook.__class__.__name__ == "WeightNorm"
158
+ ):
159
+ torch.nn.utils.remove_weight_norm(self.enc_q)
160
+ return self
161
+
162
+ @torch.jit.ignore
163
+ def forward(
164
+ self,
165
+ phone: torch.Tensor,
166
+ phone_lengths: torch.Tensor,
167
+ pitch: Optional[torch.Tensor] = None,
168
+ pitchf: Optional[torch.Tensor] = None,
169
+ y: Optional[torch.Tensor] = None,
170
+ y_lengths: Optional[torch.Tensor] = None,
171
+ ds: Optional[torch.Tensor] = None,
172
+ ):
173
+ """
174
+ Forward pass of the model.
175
+
176
+ Args:
177
+ phone (torch.Tensor): Phoneme sequence.
178
+ phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
179
+ pitch (torch.Tensor, optional): Pitch sequence.
180
+ pitchf (torch.Tensor, optional): Fine-grained pitch sequence.
181
+ y (torch.Tensor, optional): Target spectrogram.
182
+ y_lengths (torch.Tensor, optional): Lengths of the target spectrograms.
183
+ ds (torch.Tensor, optional): Speaker ID used to look up the speaker embedding. Defaults to None.
184
+ """
185
+ g = self.emb_g(ds).unsqueeze(-1)
186
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
187
+ if y is not None:
188
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
189
+ z_p = self.flow(z, y_mask, g=g)
190
+ z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
191
+ if self.use_f0:
192
+ pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
193
+ o = self.dec(z_slice, pitchf, g=g)
194
+ else:
195
+ o = self.dec(z_slice, g=g)
196
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
197
+ else:
198
+ return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)
199
+
200
+ @torch.jit.export
201
+ def infer(
202
+ self,
203
+ phone: torch.Tensor,
204
+ phone_lengths: torch.Tensor,
205
+ pitch: Optional[torch.Tensor] = None,
206
+ nsff0: Optional[torch.Tensor] = None,
207
+ sid: torch.Tensor = None,
208
+ rate: Optional[torch.Tensor] = None,
209
+ ):
210
+ """
211
+ Inference of the model.
212
+
213
+ Args:
214
+ phone (torch.Tensor): Phoneme sequence.
215
+ phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
216
+ pitch (torch.Tensor, optional): Pitch sequence.
217
+ nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
218
+ sid (torch.Tensor): Speaker ID used to look up the speaker embedding.
219
+ rate (torch.Tensor, optional): Rate for time-stretching. Defaults to None.
220
+ """
221
+ g = self.emb_g(sid).unsqueeze(-1)
222
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
223
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
224
+ if rate is not None:
225
+ assert isinstance(rate, torch.Tensor)
226
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
227
+ z_p = z_p[:, :, head:]
228
+ x_mask = x_mask[:, :, head:]
229
+ if self.use_f0:
230
+ nsff0 = nsff0[:, head:]
231
+ if self.use_f0:
232
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
233
+ o = self.dec(z * x_mask, nsff0, g=g)
234
+ else:
235
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
236
+ o = self.dec(z * x_mask, g=g)
237
+ return o, x_mask, (z, z_p, m_p, logs_p)
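
A minimal inference sketch for the Synthesizer class above; the hyper-parameter values, tensor shapes, and file names are illustrative assumptions rather than settings taken from this repository's configs:

import torch
from rvc.lib.algorithm.synthesizers import Synthesizer

# Hypothetical v2-style configuration; adjust to the checkpoint actually loaded.
net_g = Synthesizer(
    spec_channels=1025, segment_size=32, inter_channels=192, hidden_channels=192,
    filter_channels=768, n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0,
    resblock="1", resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[10, 10, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4], spk_embed_dim=109, gin_channels=256,
    sr=40000, use_f0=True, is_half=False,
).eval()

phone = torch.randn(1, 200, 768)               # content features (batch, frames, text_enc_hidden_dim)
phone_lengths = torch.tensor([200])
pitch = torch.zeros(1, 200, dtype=torch.long)  # coarse pitch indices
nsff0 = torch.zeros(1, 200)                    # fine-grained F0 curve
sid = torch.tensor([0])                        # speaker ID looked up by emb_g
with torch.no_grad():
    audio, _, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)
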
rvc/lib/predictors/F0Extractor.py ADDED
@@ -0,0 +1,100 @@
1
+ import dataclasses
2
+ import pathlib
3
+ import libf0
4
+ import librosa
5
+ import numpy as np
6
+ import resampy
7
+ import torch
8
+ import torchcrepe
9
+ import torchfcpe
10
+ import os
11
+
12
+ # from tools.anyf0.rmvpe import RMVPE
13
+ from rvc.lib.predictors.RMVPE import RMVPE0Predictor
14
+ from rvc.configs.config import Config
15
+
16
+ config = Config()
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class F0Extractor:
21
+ wav_path: pathlib.Path
22
+ sample_rate: int = 44100
23
+ hop_length: int = 512
24
+ f0_min: int = 50
25
+ f0_max: int = 1600
26
+ method: str = "rmvpe"
27
+ x: np.ndarray = dataclasses.field(init=False)
28
+
29
+ def __post_init__(self):
30
+ self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)
31
+
32
+ @property
33
+ def hop_size(self) -> float:
34
+ return self.hop_length / self.sample_rate
35
+
36
+ @property
37
+ def wav16k(self) -> np.ndarray:
38
+ return resampy.resample(self.x, self.sample_rate, 16000)
39
+
40
+ def extract_f0(self) -> np.ndarray:
41
+ f0 = None
42
+ method = self.method
43
+ if method == "crepe":
44
+ wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device)
45
+ f0 = torchcrepe.predict(
46
+ wav16k_torch,
47
+ sample_rate=16000,
48
+ hop_length=160,
49
+ batch_size=512,
50
+ fmin=self.f0_min,
51
+ fmax=self.f0_max,
52
+ device=config.device,
53
+ )
54
+ f0 = f0[0].cpu().numpy()
55
+ elif method == "fcpe":
56
+ audio = librosa.to_mono(self.x)
57
+ audio_length = len(audio)
58
+ f0_target_length = (audio_length // self.hop_length) + 1
59
+ audio = (
60
+ torch.from_numpy(audio)
61
+ .float()
62
+ .unsqueeze(0)
63
+ .unsqueeze(-1)
64
+ .to(config.device)
65
+ )
66
+ model = torchfcpe.spawn_bundled_infer_model(device=config.device)
67
+
68
+ f0 = model.infer(
69
+ audio,
70
+ sr=self.sample_rate,
71
+ decoder_mode="local_argmax",
72
+ threshold=0.006,
73
+ f0_min=self.f0_min,
74
+ f0_max=self.f0_max,
75
+ interp_uv=False,
76
+ output_interp_target_length=f0_target_length,
77
+ )
78
+ f0 = f0.squeeze().cpu().numpy()
79
+ elif method == "rmvpe":
80
+ model_rmvpe = RMVPE0Predictor(
81
+ os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
82
+ is_half=config.is_half,
83
+ device=config.device,
84
+ # hop_length=80
85
+ )
86
+ f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
87
+
88
+ else:
89
+ raise ValueError(f"Unknown method: {self.method}")
90
+ return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))
91
+
92
+ def plot_f0(self, f0):
93
+ from matplotlib import pyplot as plt
94
+
95
+ plt.figure(figsize=(10, 4))
96
+ plt.plot(f0)
97
+ plt.title(self.method)
98
+ plt.xlabel("Time (frames)")
99
+ plt.ylabel("F0 (cents)")
100
+ plt.show()
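
A brief usage sketch for the F0Extractor dataclass above; the input file name is a placeholder, and the "rmvpe" method assumes the rmvpe.pt checkpoint is present under rvc/models/predictors as referenced in extract_f0:

from rvc.lib.predictors.F0Extractor import F0Extractor

extractor = F0Extractor("voice.wav", sample_rate=44100, hop_length=512, method="rmvpe")
f0_cents = extractor.extract_f0()   # F0 contour converted to cents via libf0.hz_to_cents
extractor.plot_f0(f0_cents)         # optional matplotlib plot of the contour
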
rvc/lib/predictors/FCPE.py ADDED
@@ -0,0 +1,920 @@
1
+ from typing import Union
2
+
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.nn.utils.parametrizations import weight_norm
8
+ from torchaudio.transforms import Resample
9
+ import os
10
+ import librosa
11
+ import soundfile as sf
12
+ import torch.utils.data
13
+ from librosa.filters import mel as librosa_mel_fn
14
+ import math
15
+ from functools import partial
16
+
17
+ from einops import rearrange, repeat
18
+ from local_attention import LocalAttention
19
+ from torch import nn
20
+
21
+ os.environ["LRU_CACHE_CAPACITY"] = "3"
22
+
23
+
24
+ def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
25
+ """Loads wav file to torch tensor."""
26
+ try:
27
+ data, sample_rate = sf.read(full_path, always_2d=True)
28
+ except Exception as error:
29
+ print(f"An error occurred loading {full_path}: {error}")
30
+ if return_empty_on_exception:
31
+ return [], target_sr or 48000  # sample_rate is undefined when sf.read fails
32
+ else:
33
+ raise
34
+
35
+ data = data[:, 0] if len(data.shape) > 1 else data
36
+ assert len(data) > 2
37
+
38
+ # Normalize data
39
+ max_mag = (
40
+ -np.iinfo(data.dtype).min
41
+ if np.issubdtype(data.dtype, np.integer)
42
+ else max(np.amax(data), -np.amin(data))
43
+ )
44
+ max_mag = (
45
+ (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
46
+ )
47
+ data = torch.FloatTensor(data.astype(np.float32)) / max_mag
48
+
49
+ # Handle exceptions and resample
50
+ if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
51
+ return [], sample_rate or target_sr or 48000
52
+ if target_sr is not None and sample_rate != target_sr:
53
+ data = torch.from_numpy(
54
+ librosa.core.resample(
55
+ data.numpy(), orig_sr=sample_rate, target_sr=target_sr
56
+ )
57
+ )
58
+ sample_rate = target_sr
59
+
60
+ return data, sample_rate
61
+
62
+
63
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
64
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
65
+
66
+
67
+ def dynamic_range_decompression(x, C=1):
68
+ return np.exp(x) / C
69
+
70
+
71
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
72
+ return torch.log(torch.clamp(x, min=clip_val) * C)
73
+
74
+
75
+ def dynamic_range_decompression_torch(x, C=1):
76
+ return torch.exp(x) / C
77
+
78
+
79
+ class STFT:
80
+ def __init__(
81
+ self,
82
+ sr=22050,
83
+ n_mels=80,
84
+ n_fft=1024,
85
+ win_size=1024,
86
+ hop_length=256,
87
+ fmin=20,
88
+ fmax=11025,
89
+ clip_val=1e-5,
90
+ ):
91
+ self.target_sr = sr
92
+ self.n_mels = n_mels
93
+ self.n_fft = n_fft
94
+ self.win_size = win_size
95
+ self.hop_length = hop_length
96
+ self.fmin = fmin
97
+ self.fmax = fmax
98
+ self.clip_val = clip_val
99
+ self.mel_basis = {}
100
+ self.hann_window = {}
101
+
102
+ def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
103
+ sample_rate = self.target_sr
104
+ n_mels = self.n_mels
105
+ n_fft = self.n_fft
106
+ win_size = self.win_size
107
+ hop_length = self.hop_length
108
+ fmin = self.fmin
109
+ fmax = self.fmax
110
+ clip_val = self.clip_val
111
+
112
+ factor = 2 ** (keyshift / 12)
113
+ n_fft_new = int(np.round(n_fft * factor))
114
+ win_size_new = int(np.round(win_size * factor))
115
+ hop_length_new = int(np.round(hop_length * speed))
116
+
117
+ # Optimize mel_basis and hann_window caching
118
+ mel_basis = self.mel_basis if not train else {}
119
+ hann_window = self.hann_window if not train else {}
120
+
121
+ mel_basis_key = str(fmax) + "_" + str(y.device)
122
+ if mel_basis_key not in mel_basis:
123
+ mel = librosa_mel_fn(
124
+ sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
125
+ )
126
+ mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
127
+
128
+ keyshift_key = str(keyshift) + "_" + str(y.device)
129
+ if keyshift_key not in hann_window:
130
+ hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
131
+
132
+ # Padding and STFT
133
+ pad_left = (win_size_new - hop_length_new) // 2
134
+ pad_right = max(
135
+ (win_size_new - hop_length_new + 1) // 2,
136
+ win_size_new - y.size(-1) - pad_left,
137
+ )
138
+ mode = "reflect" if pad_right < y.size(-1) else "constant"
139
+ y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
140
+ y = y.squeeze(1)
141
+
142
+ spec = torch.stft(
143
+ y,
144
+ n_fft_new,
145
+ hop_length=hop_length_new,
146
+ win_length=win_size_new,
147
+ window=hann_window[keyshift_key],
148
+ center=center,
149
+ pad_mode="reflect",
150
+ normalized=False,
151
+ onesided=True,
152
+ return_complex=True,
153
+ )
154
+ spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
155
+
156
+ # Handle keyshift and mel conversion
157
+ if keyshift != 0:
158
+ size = n_fft // 2 + 1
159
+ resize = spec.size(1)
160
+ spec = (
161
+ F.pad(spec, (0, 0, 0, size - resize))
162
+ if resize < size
163
+ else spec[:, :size, :]
164
+ )
165
+ spec = spec * win_size / win_size_new
166
+ spec = torch.matmul(mel_basis[mel_basis_key], spec)
167
+ spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
168
+ return spec
169
+
170
+ def __call__(self, audiopath):
171
+ audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
172
+ spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
173
+ return spect
174
+
175
+
176
+ stft = STFT()
177
+
178
+
179
+ def softmax_kernel(
180
+ data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
181
+ ):
182
+ b, h, *_ = data.shape
183
+
184
+ # Normalize data
185
+ data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0
186
+
187
+ # Project data
188
+ ratio = projection_matrix.shape[0] ** -0.5
189
+ projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
190
+ projection = projection.type_as(data)
191
+ data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)
192
+
193
+ # Calculate diagonal data
194
+ diag_data = data**2
195
+ diag_data = torch.sum(diag_data, dim=-1)
196
+ diag_data = (diag_data / 2.0) * (data_normalizer**2)
197
+ diag_data = diag_data.unsqueeze(dim=-1)
198
+
199
+ # Apply softmax
200
+ if is_query:
201
+ data_dash = ratio * (
202
+ torch.exp(
203
+ data_dash
204
+ - diag_data
205
+ - torch.max(data_dash, dim=-1, keepdim=True).values
206
+ )
207
+ + eps
208
+ )
209
+ else:
210
+ data_dash = ratio * (torch.exp(data_dash - diag_data + eps))
211
+
212
+ return data_dash.type_as(data)
213
+
214
+
215
+ def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
216
+ unstructured_block = torch.randn((cols, cols), device=device)
217
+ q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
218
+ q, r = map(lambda t: t.to(device), (q, r))
219
+
220
+ if qr_uniform_q:
221
+ d = torch.diag(r, 0)
222
+ q *= d.sign()
223
+ return q.t()
224
+
225
+
226
+ def exists(val):
227
+ return val is not None
228
+
229
+
230
+ def empty(tensor):
231
+ return tensor.numel() == 0
232
+
233
+
234
+ def default(val, d):
235
+ return val if exists(val) else d
236
+
237
+
238
+ def cast_tuple(val):
239
+ return (val,) if not isinstance(val, tuple) else val
240
+
241
+
242
+ class PCmer(nn.Module):
243
+ def __init__(
244
+ self,
245
+ num_layers,
246
+ num_heads,
247
+ dim_model,
248
+ dim_keys,
249
+ dim_values,
250
+ residual_dropout,
251
+ attention_dropout,
252
+ ):
253
+ super().__init__()
254
+ self.num_layers = num_layers
255
+ self.num_heads = num_heads
256
+ self.dim_model = dim_model
257
+ self.dim_values = dim_values
258
+ self.dim_keys = dim_keys
259
+ self.residual_dropout = residual_dropout
260
+ self.attention_dropout = attention_dropout
261
+
262
+ self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
263
+
264
+ def forward(self, phone, mask=None):
265
+ for layer in self._layers:
266
+ phone = layer(phone, mask)
267
+ return phone
268
+
269
+
270
+ class _EncoderLayer(nn.Module):
271
+ def __init__(self, parent: PCmer):
272
+ super().__init__()
273
+ self.conformer = ConformerConvModule(parent.dim_model)
274
+ self.norm = nn.LayerNorm(parent.dim_model)
275
+ self.dropout = nn.Dropout(parent.residual_dropout)
276
+ self.attn = SelfAttention(
277
+ dim=parent.dim_model, heads=parent.num_heads, causal=False
278
+ )
279
+
280
+ def forward(self, phone, mask=None):
281
+ phone = phone + (self.attn(self.norm(phone), mask=mask))
282
+ phone = phone + (self.conformer(phone))
283
+ return phone
284
+
285
+
286
+ def calc_same_padding(kernel_size):
287
+ pad = kernel_size // 2
288
+ return (pad, pad - (kernel_size + 1) % 2)
289
+
290
+
291
+ class Swish(nn.Module):
292
+ def forward(self, x):
293
+ return x * x.sigmoid()
294
+
295
+
296
+ class Transpose(nn.Module):
297
+ def __init__(self, dims):
298
+ super().__init__()
299
+ assert len(dims) == 2, "dims must be a tuple of two dimensions"
300
+ self.dims = dims
301
+
302
+ def forward(self, x):
303
+ return x.transpose(*self.dims)
304
+
305
+
306
+ class GLU(nn.Module):
307
+ def __init__(self, dim):
308
+ super().__init__()
309
+ self.dim = dim
310
+
311
+ def forward(self, x):
312
+ out, gate = x.chunk(2, dim=self.dim)
313
+ return out * gate.sigmoid()
314
+
315
+
316
+ class DepthWiseConv1d(nn.Module):
317
+ def __init__(self, chan_in, chan_out, kernel_size, padding):
318
+ super().__init__()
319
+ self.padding = padding
320
+ self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)
321
+
322
+ def forward(self, x):
323
+ x = F.pad(x, self.padding)
324
+ return self.conv(x)
325
+
326
+
327
+ class ConformerConvModule(nn.Module):
328
+ def __init__(
329
+ self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
330
+ ):
331
+ super().__init__()
332
+
333
+ inner_dim = dim * expansion_factor
334
+ padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
335
+
336
+ self.net = nn.Sequential(
337
+ nn.LayerNorm(dim),
338
+ Transpose((1, 2)),
339
+ nn.Conv1d(dim, inner_dim * 2, 1),
340
+ GLU(dim=1),
341
+ DepthWiseConv1d(
342
+ inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
343
+ ),
344
+ Swish(),
345
+ nn.Conv1d(inner_dim, dim, 1),
346
+ Transpose((1, 2)),
347
+ nn.Dropout(dropout),
348
+ )
349
+
350
+ def forward(self, x):
351
+ return self.net(x)
352
+
353
+
354
+ def linear_attention(q, k, v):
355
+ if v is None:
356
+ out = torch.einsum("...ed,...nd->...ne", k, q)
357
+ return out
358
+ else:
359
+ k_cumsum = k.sum(dim=-2)
360
+ D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
361
+ context = torch.einsum("...nd,...ne->...de", k, v)
362
+ out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
363
+ return out
364
+
365
+
366
+ def gaussian_orthogonal_random_matrix(
367
+ nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
368
+ ):
369
+ nb_full_blocks = int(nb_rows / nb_columns)
370
+ block_list = []
371
+
372
+ for _ in range(nb_full_blocks):
373
+ q = orthogonal_matrix_chunk(
374
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
375
+ )
376
+ block_list.append(q)
377
+
378
+ remaining_rows = nb_rows - nb_full_blocks * nb_columns
379
+ if remaining_rows > 0:
380
+ q = orthogonal_matrix_chunk(
381
+ nb_columns, qr_uniform_q=qr_uniform_q, device=device
382
+ )
383
+ block_list.append(q[:remaining_rows])
384
+
385
+ final_matrix = torch.cat(block_list)
386
+
387
+ if scaling == 0:
388
+ multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
389
+ elif scaling == 1:
390
+ multiplier = math.sqrt((float(nb_columns))) * torch.ones(
391
+ (nb_rows,), device=device
392
+ )
393
+ else:
394
+ raise ValueError(f"Invalid scaling {scaling}")
395
+
396
+ return torch.diag(multiplier) @ final_matrix
397
+
398
+
399
+ class FastAttention(nn.Module):
400
+ def __init__(
401
+ self,
402
+ dim_heads,
403
+ nb_features=None,
404
+ ortho_scaling=0,
405
+ causal=False,
406
+ generalized_attention=False,
407
+ kernel_fn=nn.ReLU(),
408
+ qr_uniform_q=False,
409
+ no_projection=False,
410
+ ):
411
+ super().__init__()
412
+ nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
413
+
414
+ self.dim_heads = dim_heads
415
+ self.nb_features = nb_features
416
+ self.ortho_scaling = ortho_scaling
417
+
418
+ self.create_projection = partial(
419
+ gaussian_orthogonal_random_matrix,
420
+ nb_rows=self.nb_features,
421
+ nb_columns=dim_heads,
422
+ scaling=ortho_scaling,
423
+ qr_uniform_q=qr_uniform_q,
424
+ )
425
+ projection_matrix = self.create_projection()
426
+ self.register_buffer("projection_matrix", projection_matrix)
427
+
428
+ self.generalized_attention = generalized_attention
429
+ self.kernel_fn = kernel_fn
430
+ self.no_projection = no_projection
431
+ self.causal = causal
432
+
433
+ @torch.no_grad()
434
+ def redraw_projection_matrix(self):
435
+ projections = self.create_projection()
436
+ self.projection_matrix.copy_(projections)
437
+ del projections
438
+
439
+ def forward(self, q, k, v):
440
+ device = q.device
441
+
442
+ if self.no_projection:
443
+ q = q.softmax(dim=-1)
444
+ k = torch.exp(k) if self.causal else k.softmax(dim=-2)
445
+ else:
446
+ create_kernel = partial(
447
+ softmax_kernel, projection_matrix=self.projection_matrix, device=device
448
+ )
449
+ q = create_kernel(q, is_query=True)
450
+ k = create_kernel(k, is_query=False)
451
+
452
+ attn_fn = linear_attention if not self.causal else self.causal_linear_fn  # causal_linear_fn is not defined in this file; causal is always False where this class is used here
453
+
454
+ if v is None:
455
+ out = attn_fn(q, k, None)
456
+ return out
457
+ else:
458
+ out = attn_fn(q, k, v)
459
+ return out
460
+
461
+
462
+ class SelfAttention(nn.Module):
463
+ def __init__(
464
+ self,
465
+ dim,
466
+ causal=False,
467
+ heads=8,
468
+ dim_head=64,
469
+ local_heads=0,
470
+ local_window_size=256,
471
+ nb_features=None,
472
+ feature_redraw_interval=1000,
473
+ generalized_attention=False,
474
+ kernel_fn=nn.ReLU(),
475
+ qr_uniform_q=False,
476
+ dropout=0.0,
477
+ no_projection=False,
478
+ ):
479
+ super().__init__()
480
+ assert dim % heads == 0, "dimension must be divisible by number of heads"
481
+ dim_head = default(dim_head, dim // heads)
482
+ inner_dim = dim_head * heads
483
+ self.fast_attention = FastAttention(
484
+ dim_head,
485
+ nb_features,
486
+ causal=causal,
487
+ generalized_attention=generalized_attention,
488
+ kernel_fn=kernel_fn,
489
+ qr_uniform_q=qr_uniform_q,
490
+ no_projection=no_projection,
491
+ )
492
+
493
+ self.heads = heads
494
+ self.global_heads = heads - local_heads
495
+ self.local_attn = (
496
+ LocalAttention(
497
+ window_size=local_window_size,
498
+ causal=causal,
499
+ autopad=True,
500
+ dropout=dropout,
501
+ look_forward=int(not causal),
502
+ rel_pos_emb_config=(dim_head, local_heads),
503
+ )
504
+ if local_heads > 0
505
+ else None
506
+ )
507
+
508
+ self.to_q = nn.Linear(dim, inner_dim)
509
+ self.to_k = nn.Linear(dim, inner_dim)
510
+ self.to_v = nn.Linear(dim, inner_dim)
511
+ self.to_out = nn.Linear(inner_dim, dim)
512
+ self.dropout = nn.Dropout(dropout)
513
+
514
+ @torch.no_grad()
515
+ def redraw_projection_matrix(self):
516
+ self.fast_attention.redraw_projection_matrix()
517
+
518
+ def forward(
519
+ self,
520
+ x,
521
+ context=None,
522
+ mask=None,
523
+ context_mask=None,
524
+ name=None,
525
+ inference=False,
526
+ **kwargs,
527
+ ):
528
+ _, _, _, h, gh = *x.shape, self.heads, self.global_heads
529
+
530
+ cross_attend = exists(context)
531
+ context = default(context, x)
532
+ context_mask = default(context_mask, mask) if not cross_attend else context_mask
533
+ q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
534
+
535
+ q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
536
+ (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
537
+
538
+ attn_outs = []
539
+ if not empty(q):
540
+ if exists(context_mask):
541
+ global_mask = context_mask[:, None, :, None]
542
+ v.masked_fill_(~global_mask, 0.0)
543
+ if cross_attend:
544
+ pass # TODO: Implement cross-attention
545
+ else:
546
+ out = self.fast_attention(q, k, v)
547
+ attn_outs.append(out)
548
+
549
+ if not empty(lq):
550
+ assert (
551
+ not cross_attend
552
+ ), "local attention is not compatible with cross attention"
553
+ out = self.local_attn(lq, lk, lv, input_mask=mask)
554
+ attn_outs.append(out)
555
+
556
+ out = torch.cat(attn_outs, dim=1)
557
+ out = rearrange(out, "b h n d -> b n (h d)")
558
+ out = self.to_out(out)
559
+ return self.dropout(out)
560
+
561
+
562
+ def l2_regularization(model, l2_alpha):
563
+ l2_loss = []
564
+ for module in model.modules():
565
+ if type(module) is nn.Conv2d:
566
+ l2_loss.append((module.weight**2).sum() / 2.0)
567
+ return l2_alpha * sum(l2_loss)
568
+
569
+
570
+ class FCPE(nn.Module):
571
+ def __init__(
572
+ self,
573
+ input_channel=128,
574
+ out_dims=360,
575
+ n_layers=12,
576
+ n_chans=512,
577
+ use_siren=False,
578
+ use_full=False,
579
+ loss_mse_scale=10,
580
+ loss_l2_regularization=False,
581
+ loss_l2_regularization_scale=1,
582
+ loss_grad1_mse=False,
583
+ loss_grad1_mse_scale=1,
584
+ f0_max=1975.5,
585
+ f0_min=32.70,
586
+ confidence=False,
587
+ threshold=0.05,
588
+ use_input_conv=True,
589
+ ):
590
+ super().__init__()
591
+ if use_siren is True:
592
+ raise ValueError("Siren is not supported yet.")
593
+ if use_full is True:
594
+ raise ValueError("Full model is not supported yet.")
595
+
596
+ self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
597
+ self.loss_l2_regularization = (
598
+ loss_l2_regularization if (loss_l2_regularization is not None) else False
599
+ )
600
+ self.loss_l2_regularization_scale = (
601
+ loss_l2_regularization_scale
602
+ if (loss_l2_regularization_scale is not None)
603
+ else 1
604
+ )
605
+ self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
606
+ self.loss_grad1_mse_scale = (
607
+ loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
608
+ )
609
+ self.f0_max = f0_max if (f0_max is not None) else 1975.5
610
+ self.f0_min = f0_min if (f0_min is not None) else 32.70
611
+ self.confidence = confidence if (confidence is not None) else False
612
+ self.threshold = threshold if (threshold is not None) else 0.05
613
+ self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
614
+
615
+ self.cent_table_b = torch.Tensor(
616
+ np.linspace(
617
+ self.f0_to_cent(torch.Tensor([f0_min]))[0],
618
+ self.f0_to_cent(torch.Tensor([f0_max]))[0],
619
+ out_dims,
620
+ )
621
+ )
622
+ self.register_buffer("cent_table", self.cent_table_b)
623
+
624
+ # conv in stack
625
+ _leaky = nn.LeakyReLU()
626
+ self.stack = nn.Sequential(
627
+ nn.Conv1d(input_channel, n_chans, 3, 1, 1),
628
+ nn.GroupNorm(4, n_chans),
629
+ _leaky,
630
+ nn.Conv1d(n_chans, n_chans, 3, 1, 1),
631
+ )
632
+
633
+ # transformer
634
+ self.decoder = PCmer(
635
+ num_layers=n_layers,
636
+ num_heads=8,
637
+ dim_model=n_chans,
638
+ dim_keys=n_chans,
639
+ dim_values=n_chans,
640
+ residual_dropout=0.1,
641
+ attention_dropout=0.1,
642
+ )
643
+ self.norm = nn.LayerNorm(n_chans)
644
+
645
+ # out
646
+ self.n_out = out_dims
647
+ self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
648
+
649
+ def forward(
650
+ self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
651
+ ):
652
+ if cdecoder == "argmax":
653
+ self.cdecoder = self.cents_decoder
654
+ elif cdecoder == "local_argmax":
655
+ self.cdecoder = self.cents_local_decoder
656
+
657
+ x = (
658
+ self.stack(mel.transpose(1, 2)).transpose(1, 2)
659
+ if self.use_input_conv
660
+ else mel
661
+ )
662
+ x = self.decoder(x)
663
+ x = self.norm(x)
664
+ x = self.dense_out(x)
665
+ x = torch.sigmoid(x)
666
+
667
+ if not infer:
668
+ gt_cent_f0 = self.f0_to_cent(gt_f0)
669
+ gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
670
+ loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
671
+ if self.loss_l2_regularization:
672
+ loss_all = loss_all + l2_regularization(
673
+ model=self, l2_alpha=self.loss_l2_regularization_scale
674
+ )
675
+ x = loss_all
676
+ if infer:
677
+ x = self.cdecoder(x)
678
+ x = self.cent_to_f0(x)
679
+ x = (1 + x / 700).log() if not return_hz_f0 else x
680
+
681
+ return x
682
+
683
+ def cents_decoder(self, y, mask=True):
684
+ B, N, _ = y.size()
685
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
686
+ rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
687
+ y, dim=-1, keepdim=True
688
+ )
689
+ if mask:
690
+ confident = torch.max(y, dim=-1, keepdim=True)[0]
691
+ confident_mask = torch.ones_like(confident)
692
+ confident_mask[confident <= self.threshold] = float("-INF")
693
+ rtn = rtn * confident_mask
694
+ return (rtn, confident) if self.confidence else rtn
695
+
696
+ def cents_local_decoder(self, y, mask=True):
697
+ B, N, _ = y.size()
698
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
699
+ confident, max_index = torch.max(y, dim=-1, keepdim=True)
700
+ local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
701
+ local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
702
+ ci_l = torch.gather(ci, -1, local_argmax_index)
703
+ y_l = torch.gather(y, -1, local_argmax_index)
704
+ rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
705
+ y_l, dim=-1, keepdim=True
706
+ )
707
+ if mask:
708
+ confident_mask = torch.ones_like(confident)
709
+ confident_mask[confident <= self.threshold] = float("-INF")
710
+ rtn = rtn * confident_mask
711
+ return (rtn, confident) if self.confidence else rtn
712
+
713
+ def cent_to_f0(self, cent):
714
+ return 10.0 * 2 ** (cent / 1200.0)
715
+
716
+ def f0_to_cent(self, f0):
717
+ return 1200.0 * torch.log2(f0 / 10.0)
718
+
719
+ def gaussian_blurred_cent(self, cents):
720
+ mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
721
+ B, N, _ = cents.size()
722
+ ci = self.cent_table[None, None, :].expand(B, N, -1)
723
+ return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
724
+
725
+
726
+ class FCPEInfer:
727
+ def __init__(self, model_path, device=None, dtype=torch.float32):
728
+ if device is None:
729
+ device = "cuda" if torch.cuda.is_available() else "cpu"
730
+ self.device = device
731
+ ckpt = torch.load(model_path, map_location=torch.device(self.device))
732
+ self.args = DotDict(ckpt["config"])
733
+ self.dtype = dtype
734
+ model = FCPE(
735
+ input_channel=self.args.model.input_channel,
736
+ out_dims=self.args.model.out_dims,
737
+ n_layers=self.args.model.n_layers,
738
+ n_chans=self.args.model.n_chans,
739
+ use_siren=self.args.model.use_siren,
740
+ use_full=self.args.model.use_full,
741
+ loss_mse_scale=self.args.loss.loss_mse_scale,
742
+ loss_l2_regularization=self.args.loss.loss_l2_regularization,
743
+ loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
744
+ loss_grad1_mse=self.args.loss.loss_grad1_mse,
745
+ loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
746
+ f0_max=self.args.model.f0_max,
747
+ f0_min=self.args.model.f0_min,
748
+ confidence=self.args.model.confidence,
749
+ )
750
+ model.to(self.device).to(self.dtype)
751
+ model.load_state_dict(ckpt["model"])
752
+ model.eval()
753
+ self.model = model
754
+ self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
755
+
756
+ @torch.no_grad()
757
+ def __call__(self, audio, sr, threshold=0.05):
758
+ self.model.threshold = threshold
759
+ audio = audio[None, :]
760
+ mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
761
+ f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
762
+ return f0
763
+
764
+
765
+ class Wav2Mel:
766
+ def __init__(self, args, device=None, dtype=torch.float32):
767
+ self.sample_rate = args.mel.sampling_rate
768
+ self.hop_size = args.mel.hop_size
769
+ if device is None:
770
+ device = "cuda" if torch.cuda.is_available() else "cpu"
771
+ self.device = device
772
+ self.dtype = dtype
773
+ self.stft = STFT(
774
+ args.mel.sampling_rate,
775
+ args.mel.num_mels,
776
+ args.mel.n_fft,
777
+ args.mel.win_size,
778
+ args.mel.hop_size,
779
+ args.mel.fmin,
780
+ args.mel.fmax,
781
+ )
782
+ self.resample_kernel = {}
783
+
784
+ def extract_nvstft(self, audio, keyshift=0, train=False):
785
+ mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
786
+ return mel
787
+
788
+ def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
789
+ audio = audio.to(self.dtype).to(self.device)
790
+ if sample_rate == self.sample_rate:
791
+ audio_res = audio
792
+ else:
793
+ key_str = str(sample_rate)
794
+ if key_str not in self.resample_kernel:
795
+ self.resample_kernel[key_str] = Resample(
796
+ sample_rate, self.sample_rate, lowpass_filter_width=128
797
+ )
798
+ self.resample_kernel[key_str] = (
799
+ self.resample_kernel[key_str].to(self.dtype).to(self.device)
800
+ )
801
+ audio_res = self.resample_kernel[key_str](audio)
802
+
803
+ mel = self.extract_nvstft(
804
+ audio_res, keyshift=keyshift, train=train
805
+ ) # B, n_frames, bins
806
+ n_frames = int(audio.shape[1] // self.hop_size) + 1
807
+ mel = (
808
+ torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel
809
+ )
810
+ mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
811
+ return mel
812
+
813
+ def __call__(self, audio, sample_rate, keyshift=0, train=False):
814
+ return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
815
+
816
+
817
+ class DotDict(dict):
818
+ def __getattr__(*args):
819
+ val = dict.get(*args)
820
+ return DotDict(val) if type(val) is dict else val
821
+
822
+ __setattr__ = dict.__setitem__
823
+ __delattr__ = dict.__delitem__
824
+
825
+
826
+ class F0Predictor(object):
827
+ def compute_f0(self, wav, p_len):
828
+ pass
829
+
830
+ def compute_f0_uv(self, wav, p_len):
831
+ pass
832
+
833
+
834
+ class FCPEF0Predictor(F0Predictor):
835
+ def __init__(
836
+ self,
837
+ model_path,
838
+ hop_length=512,
839
+ f0_min=50,
840
+ f0_max=1100,
841
+ dtype=torch.float32,
842
+ device=None,
843
+ sample_rate=44100,
844
+ threshold=0.05,
845
+ ):
846
+ self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
847
+ self.hop_length = hop_length
848
+ self.f0_min = f0_min
849
+ self.f0_max = f0_max
850
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
851
+ self.threshold = threshold
852
+ self.sample_rate = sample_rate
853
+ self.dtype = dtype
854
+ self.name = "fcpe"
855
+
856
+ def repeat_expand(
857
+ self,
858
+ content: Union[torch.Tensor, np.ndarray],
859
+ target_len: int,
860
+ mode: str = "nearest",
861
+ ):
862
+ ndim = content.ndim
863
+ content = (
864
+ content[None, None]
865
+ if ndim == 1
866
+ else content[None] if ndim == 2 else content
867
+ )
868
+ assert content.ndim == 3
869
+ is_np = isinstance(content, np.ndarray)
870
+ content = torch.from_numpy(content) if is_np else content
871
+ results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
872
+ results = results.numpy() if is_np else results
873
+ return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results
874
+
875
+ def post_process(self, x, sample_rate, f0, pad_to):
876
+ f0 = (
877
+ torch.from_numpy(f0).float().to(x.device)
878
+ if isinstance(f0, np.ndarray)
879
+ else f0
880
+ )
881
+ f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0
882
+
883
+ vuv_vector = torch.zeros_like(f0)
884
+ vuv_vector[f0 > 0.0] = 1.0
885
+ vuv_vector[f0 <= 0.0] = 0.0
886
+
887
+ nzindex = torch.nonzero(f0).squeeze()
888
+ f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
889
+ time_org = self.hop_length / sample_rate * nzindex.cpu().numpy()
890
+ time_frame = np.arange(pad_to) * self.hop_length / sample_rate
891
+
892
+ vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
893
+
894
+ if f0.shape[0] <= 0:
895
+ return np.zeros(pad_to), vuv_vector.cpu().numpy()
896
+ if f0.shape[0] == 1:
897
+ return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy()
898
+
899
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
900
+ return f0, vuv_vector.cpu().numpy()
901
+
902
+ def compute_f0(self, wav, p_len=None):
903
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
904
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
905
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
906
+ if torch.all(f0 == 0):
907
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
908
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
909
+ )
910
+ return self.post_process(x, self.sample_rate, f0, p_len)[0]
911
+
912
+ def compute_f0_uv(self, wav, p_len=None):
913
+ x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
914
+ p_len = x.shape[0] // self.hop_length if p_len is None else p_len
915
+ f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
916
+ if torch.all(f0 == 0):
917
+ return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
918
+ f0.cpu().numpy() if p_len is None else np.zeros(p_len)
919
+ )
920
+ return self.post_process(x, self.sample_rate, f0, p_len)
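
A small usage sketch for the FCPEF0Predictor wrapper above; the checkpoint path and audio file are placeholders, and running on CPU is only an illustrative choice:

import librosa
from rvc.lib.predictors.FCPE import FCPEF0Predictor

wav, sr = librosa.load("voice.wav", sr=44100)
predictor = FCPEF0Predictor("fcpe.pt", hop_length=512, sample_rate=sr, device="cpu")
f0, uv = predictor.compute_f0_uv(wav)   # per-frame F0 in Hz plus a voiced/unvoiced vector
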
rvc/lib/predictors/RMVPE.py ADDED
@@ -0,0 +1,560 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+
6
+ from librosa.filters import mel
7
+ from typing import List
8
+
9
+ # Constants for readability
10
+ N_MELS = 128
11
+ N_CLASS = 360
12
+
13
+
14
+ # Define a helper function for creating convolutional blocks
15
+ class ConvBlockRes(nn.Module):
16
+ """
17
+ A convolutional block with residual connection.
18
+
19
+ Args:
20
+ in_channels (int): Number of input channels.
21
+ out_channels (int): Number of output channels.
22
+ momentum (float): Momentum for batch normalization.
23
+ """
24
+
25
+ def __init__(self, in_channels, out_channels, momentum=0.01):
26
+ super(ConvBlockRes, self).__init__()
27
+ self.conv = nn.Sequential(
28
+ nn.Conv2d(
29
+ in_channels=in_channels,
30
+ out_channels=out_channels,
31
+ kernel_size=(3, 3),
32
+ stride=(1, 1),
33
+ padding=(1, 1),
34
+ bias=False,
35
+ ),
36
+ nn.BatchNorm2d(out_channels, momentum=momentum),
37
+ nn.ReLU(),
38
+ nn.Conv2d(
39
+ in_channels=out_channels,
40
+ out_channels=out_channels,
41
+ kernel_size=(3, 3),
42
+ stride=(1, 1),
43
+ padding=(1, 1),
44
+ bias=False,
45
+ ),
46
+ nn.BatchNorm2d(out_channels, momentum=momentum),
47
+ nn.ReLU(),
48
+ )
49
+ if in_channels != out_channels:
50
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
51
+ self.is_shortcut = True
52
+ else:
53
+ self.is_shortcut = False
54
+
55
+ def forward(self, x):
56
+ if self.is_shortcut:
57
+ return self.conv(x) + self.shortcut(x)
58
+ else:
59
+ return self.conv(x) + x
60
+
61
+
62
+ # Define a class for residual encoder blocks
63
+ class ResEncoderBlock(nn.Module):
64
+ """
65
+ A residual encoder block.
66
+
67
+ Args:
68
+ in_channels (int): Number of input channels.
69
+ out_channels (int): Number of output channels.
70
+ kernel_size (tuple): Size of the average pooling kernel.
71
+ n_blocks (int): Number of convolutional blocks in the block.
72
+ momentum (float): Momentum for batch normalization.
73
+ """
74
+
75
+ def __init__(
76
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
77
+ ):
78
+ super(ResEncoderBlock, self).__init__()
79
+ self.n_blocks = n_blocks
80
+ self.conv = nn.ModuleList()
81
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
82
+ for _ in range(n_blocks - 1):
83
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
84
+ self.kernel_size = kernel_size
85
+ if self.kernel_size is not None:
86
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
87
+
88
+ def forward(self, x):
89
+ for i in range(self.n_blocks):
90
+ x = self.conv[i](x)
91
+ if self.kernel_size is not None:
92
+ return x, self.pool(x)
93
+ else:
94
+ return x
95
+
96
+
97
+ # Define a class for the encoder
98
+ class Encoder(nn.Module):
99
+ """
100
+ The encoder part of the DeepUnet.
101
+
102
+ Args:
103
+ in_channels (int): Number of input channels.
104
+ in_size (int): Size of the input tensor.
105
+ n_encoders (int): Number of encoder blocks.
106
+ kernel_size (tuple): Size of the average pooling kernel.
107
+ n_blocks (int): Number of convolutional blocks in each encoder block.
108
+ out_channels (int): Number of output channels for the first encoder block.
109
+ momentum (float): Momentum for batch normalization.
110
+ """
111
+
112
+ def __init__(
113
+ self,
114
+ in_channels,
115
+ in_size,
116
+ n_encoders,
117
+ kernel_size,
118
+ n_blocks,
119
+ out_channels=16,
120
+ momentum=0.01,
121
+ ):
122
+ super(Encoder, self).__init__()
123
+ self.n_encoders = n_encoders
124
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
125
+ self.layers = nn.ModuleList()
126
+ self.latent_channels = []
127
+ for i in range(self.n_encoders):
128
+ self.layers.append(
129
+ ResEncoderBlock(
130
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
131
+ )
132
+ )
133
+ self.latent_channels.append([out_channels, in_size])
134
+ in_channels = out_channels
135
+ out_channels *= 2
136
+ in_size //= 2
137
+ self.out_size = in_size
138
+ self.out_channel = out_channels
139
+
140
+ def forward(self, x: torch.Tensor):
141
+ concat_tensors: List[torch.Tensor] = []
142
+ x = self.bn(x)
143
+ for i in range(self.n_encoders):
144
+ t, x = self.layers[i](x)
145
+ concat_tensors.append(t)
146
+ return x, concat_tensors
147
+
148
+
149
+ # Define a class for the intermediate layer
150
+ class Intermediate(nn.Module):
151
+ """
152
+ The intermediate layer of the DeepUnet.
153
+
154
+ Args:
155
+ in_channels (int): Number of input channels.
156
+ out_channels (int): Number of output channels.
157
+ n_inters (int): Number of convolutional blocks in the intermediate layer.
158
+ n_blocks (int): Number of convolutional blocks in each intermediate block.
159
+ momentum (float): Momentum for batch normalization.
160
+ """
161
+
162
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
163
+ super(Intermediate, self).__init__()
164
+ self.n_inters = n_inters
165
+ self.layers = nn.ModuleList()
166
+ self.layers.append(
167
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
168
+ )
169
+ for _ in range(self.n_inters - 1):
170
+ self.layers.append(
171
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
172
+ )
173
+
174
+ def forward(self, x):
175
+ for i in range(self.n_inters):
176
+ x = self.layers[i](x)
177
+ return x
178
+
179
+
180
+ # Define a class for residual decoder blocks
181
+ class ResDecoderBlock(nn.Module):
182
+ """
183
+ A residual decoder block.
184
+
185
+ Args:
186
+ in_channels (int): Number of input channels.
187
+ out_channels (int): Number of output channels.
188
+ stride (tuple): Stride for transposed convolution.
189
+ n_blocks (int): Number of convolutional blocks in the block.
190
+ momentum (float): Momentum for batch normalization.
191
+ """
192
+
193
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
194
+ super(ResDecoderBlock, self).__init__()
195
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
196
+ self.n_blocks = n_blocks
197
+ self.conv1 = nn.Sequential(
198
+ nn.ConvTranspose2d(
199
+ in_channels=in_channels,
200
+ out_channels=out_channels,
201
+ kernel_size=(3, 3),
202
+ stride=stride,
203
+ padding=(1, 1),
204
+ output_padding=out_padding,
205
+ bias=False,
206
+ ),
207
+ nn.BatchNorm2d(out_channels, momentum=momentum),
208
+ nn.ReLU(),
209
+ )
210
+ self.conv2 = nn.ModuleList()
211
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
212
+ for _ in range(n_blocks - 1):
213
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
214
+
215
+ def forward(self, x, concat_tensor):
216
+ x = self.conv1(x)
217
+ x = torch.cat((x, concat_tensor), dim=1)
218
+ for i in range(self.n_blocks):
219
+ x = self.conv2[i](x)
220
+ return x
221
+
222
+
223
+ # Define a class for the decoder
224
+ class Decoder(nn.Module):
225
+ """
226
+ The decoder part of the DeepUnet.
227
+
228
+ Args:
229
+ in_channels (int): Number of input channels.
230
+ n_decoders (int): Number of decoder blocks.
231
+ stride (tuple): Stride for transposed convolution.
232
+ n_blocks (int): Number of convolutional blocks in each decoder block.
233
+ momentum (float): Momentum for batch normalization.
234
+ """
235
+
236
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
237
+ super(Decoder, self).__init__()
238
+ self.layers = nn.ModuleList()
239
+ self.n_decoders = n_decoders
240
+ for _ in range(self.n_decoders):
241
+ out_channels = in_channels // 2
242
+ self.layers.append(
243
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
244
+ )
245
+ in_channels = out_channels
246
+
247
+ def forward(self, x, concat_tensors):
248
+ for i in range(self.n_decoders):
249
+ x = self.layers[i](x, concat_tensors[-1 - i])
250
+ return x
251
+
252
+
253
+ # Define a class for the DeepUnet architecture
254
+ class DeepUnet(nn.Module):
255
+ """
256
+ The DeepUnet architecture.
257
+
258
+ Args:
259
+ kernel_size (tuple): Size of the average pooling kernel.
260
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
261
+ en_de_layers (int): Number of encoder/decoder layers.
262
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
263
+ in_channels (int): Number of input channels.
264
+ en_out_channels (int): Number of output channels for the first encoder block.
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ kernel_size,
270
+ n_blocks,
271
+ en_de_layers=5,
272
+ inter_layers=4,
273
+ in_channels=1,
274
+ en_out_channels=16,
275
+ ):
276
+ super(DeepUnet, self).__init__()
277
+ self.encoder = Encoder(
278
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
279
+ )
280
+ self.intermediate = Intermediate(
281
+ self.encoder.out_channel // 2,
282
+ self.encoder.out_channel,
283
+ inter_layers,
284
+ n_blocks,
285
+ )
286
+ self.decoder = Decoder(
287
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
288
+ )
289
+
290
+ def forward(self, x):
291
+ x, concat_tensors = self.encoder(x)
292
+ x = self.intermediate(x)
293
+ x = self.decoder(x, concat_tensors)
294
+ return x
295
+
296
+
297
+ # Define a class for the end-to-end model
298
+ class E2E(nn.Module):
299
+ """
300
+ The end-to-end model.
301
+
302
+ Args:
303
+ n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
304
+ n_gru (int): Number of GRU layers.
305
+ kernel_size (tuple): Size of the average pooling kernel.
306
+ en_de_layers (int): Number of encoder/decoder layers.
307
+ inter_layers (int): Number of convolutional blocks in the intermediate layer.
308
+ in_channels (int): Number of input channels.
309
+ en_out_channels (int): Number of output channels for the first encoder block.
310
+ """
311
+
312
+ def __init__(
313
+ self,
314
+ n_blocks,
315
+ n_gru,
316
+ kernel_size,
317
+ en_de_layers=5,
318
+ inter_layers=4,
319
+ in_channels=1,
320
+ en_out_channels=16,
321
+ ):
322
+ super(E2E, self).__init__()
323
+ self.unet = DeepUnet(
324
+ kernel_size,
325
+ n_blocks,
326
+ en_de_layers,
327
+ inter_layers,
328
+ in_channels,
329
+ en_out_channels,
330
+ )
331
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
332
+ if n_gru:
333
+ self.fc = nn.Sequential(
334
+ BiGRU(3 * 128, 256, n_gru),
335
+ nn.Linear(512, N_CLASS),
336
+ nn.Dropout(0.25),
337
+ nn.Sigmoid(),
338
+ )
339
+ else:
340
+ self.fc = nn.Sequential(
341
+ nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
342
+ )
343
+
344
+ def forward(self, mel):
345
+ mel = mel.transpose(-1, -2).unsqueeze(1)
346
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
347
+ x = self.fc(x)
348
+ return x
349
+
350
+
351
+ # Define a class for the MelSpectrogram extractor
352
+ class MelSpectrogram(torch.nn.Module):
353
+ """
354
+ Extracts Mel-spectrogram features from audio.
355
+
356
+ Args:
357
+ is_half (bool): Whether to use half-precision floating-point numbers.
358
+ n_mel_channels (int): Number of Mel-frequency bands.
359
+ sample_rate (int): Sampling rate of the audio.
360
+ win_length (int): Length of the window function in samples.
361
+ hop_length (int): Hop size between frames in samples.
362
+ n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
363
+ mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
364
+ mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
365
+ clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.
366
+ """
367
+
368
+ def __init__(
369
+ self,
370
+ is_half,
371
+ n_mel_channels,
372
+ sample_rate,
373
+ win_length,
374
+ hop_length,
375
+ n_fft=None,
376
+ mel_fmin=0,
377
+ mel_fmax=None,
378
+ clamp=1e-5,
379
+ ):
380
+ super().__init__()
381
+ n_fft = win_length if n_fft is None else n_fft
382
+ self.hann_window = {}
383
+ mel_basis = mel(
384
+ sr=sample_rate,
385
+ n_fft=n_fft,
386
+ n_mels=n_mel_channels,
387
+ fmin=mel_fmin,
388
+ fmax=mel_fmax,
389
+ htk=True,
390
+ )
391
+ mel_basis = torch.from_numpy(mel_basis).float()
392
+ self.register_buffer("mel_basis", mel_basis)
393
+ self.n_fft = win_length if n_fft is None else n_fft
394
+ self.hop_length = hop_length
395
+ self.win_length = win_length
396
+ self.sample_rate = sample_rate
397
+ self.n_mel_channels = n_mel_channels
398
+ self.clamp = clamp
399
+ self.is_half = is_half
400
+
401
+ def forward(self, audio, keyshift=0, speed=1, center=True):
402
+ factor = 2 ** (keyshift / 12)
403
+ n_fft_new = int(np.round(self.n_fft * factor))
404
+ win_length_new = int(np.round(self.win_length * factor))
405
+ hop_length_new = int(np.round(self.hop_length * speed))
406
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
407
+ if keyshift_key not in self.hann_window:
408
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
409
+ audio.device
410
+ )
411
+ fft = torch.stft(
412
+ audio,
413
+ n_fft=n_fft_new,
414
+ hop_length=hop_length_new,
415
+ win_length=win_length_new,
416
+ window=self.hann_window[keyshift_key],
417
+ center=center,
418
+ return_complex=True,
419
+ )
420
+
421
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
422
+ if keyshift != 0:
423
+ size = self.n_fft // 2 + 1
424
+ resize = magnitude.size(1)
425
+ if resize < size:
426
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
427
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
428
+ mel_output = torch.matmul(self.mel_basis, magnitude)
429
+ if self.is_half:
430
+ mel_output = mel_output.half()
431
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
432
+ return log_mel_spec
433
+
434
+
435
+ # Define a class for the RMVPE0 predictor
436
+ class RMVPE0Predictor:
437
+ """
438
+ A predictor for fundamental frequency (F0) based on the RMVPE0 model.
439
+
440
+ Args:
441
+ model_path (str): Path to the RMVPE0 model file.
442
+ is_half (bool): Whether to use half-precision floating-point numbers.
443
+ device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
444
+ """
445
+
446
+ def __init__(self, model_path, is_half, device=None):
447
+ self.resample_kernel = {}
448
+ model = E2E(4, 1, (2, 2))
449
+ ckpt = torch.load(model_path, map_location="cpu")
450
+ model.load_state_dict(ckpt)
451
+ model.eval()
452
+ if is_half:
453
+ model = model.half()
454
+ self.model = model
455
+ self.resample_kernel = {}
456
+ self.is_half = is_half
457
+ self.device = device
458
+ self.mel_extractor = MelSpectrogram(
459
+ is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
460
+ ).to(device)
461
+ self.model = self.model.to(device)
462
+ cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
463
+ self.cents_mapping = np.pad(cents_mapping, (4, 4))
464
+
465
+ def mel2hidden(self, mel):
466
+ """
467
+ Converts Mel-spectrogram features to hidden representation.
468
+
469
+ Args:
470
+ mel (torch.Tensor): Mel-spectrogram features.
471
+ """
472
+ with torch.no_grad():
473
+ n_frames = mel.shape[-1]
474
+ mel = F.pad(
475
+ mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
476
+ )
477
+ hidden = self.model(mel)
478
+ return hidden[:, :n_frames]
479
+
480
+ def decode(self, hidden, thred=0.03):
481
+ """
482
+ Decodes hidden representation to F0.
483
+
484
+ Args:
485
+ hidden (np.ndarray): Hidden representation.
486
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
487
+ """
488
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
489
+ f0 = 10 * (2 ** (cents_pred / 1200))
490
+ f0[f0 == 10] = 0
491
+ return f0
492
+
493
+ def infer_from_audio(self, audio, thred=0.03):
494
+ """
495
+ Infers F0 from audio.
496
+
497
+ Args:
498
+ audio (np.ndarray): Audio signal.
499
+ thred (float, optional): Threshold for salience. Defaults to 0.03.
500
+ """
501
+ audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
502
+ mel = self.mel_extractor(audio, center=True)
503
+ hidden = self.mel2hidden(mel)
504
+ hidden = hidden.squeeze(0).cpu().numpy()
505
+ if self.is_half:
506
+ hidden = hidden.astype("float32")
507
+ f0 = self.decode(hidden, thred=thred)
508
+ return f0
509
+
510
+ def to_local_average_cents(self, salience, thred=0.05):
511
+ """
512
+ Converts salience to local average cents.
513
+
514
+ Args:
515
+ salience (np.ndarray): Salience values.
516
+ thred (float, optional): Threshold for salience. Defaults to 0.05.
517
+ """
518
+ center = np.argmax(salience, axis=1)
519
+ salience = np.pad(salience, ((0, 0), (4, 4)))
520
+ center += 4
521
+ todo_salience = []
522
+ todo_cents_mapping = []
523
+ starts = center - 4
524
+ ends = center + 5
525
+ for idx in range(salience.shape[0]):
526
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
527
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
528
+ todo_salience = np.array(todo_salience)
529
+ todo_cents_mapping = np.array(todo_cents_mapping)
530
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
531
+ weight_sum = np.sum(todo_salience, 1)
532
+ divided = product_sum / weight_sum
533
+ maxx = np.max(salience, axis=1)
534
+ divided[maxx <= thred] = 0
535
+ return divided
536
+
537
+
538
+ # Define a class for BiGRU (bidirectional GRU)
539
+ class BiGRU(nn.Module):
540
+ """
541
+ A bidirectional GRU layer.
542
+
543
+ Args:
544
+ input_features (int): Number of input features.
545
+ hidden_features (int): Number of hidden features.
546
+ num_layers (int): Number of GRU layers.
547
+ """
548
+
549
+ def __init__(self, input_features, hidden_features, num_layers):
550
+ super(BiGRU, self).__init__()
551
+ self.gru = nn.GRU(
552
+ input_features,
553
+ hidden_features,
554
+ num_layers=num_layers,
555
+ batch_first=True,
556
+ bidirectional=True,
557
+ )
558
+
559
+ def forward(self, x):
560
+ return self.gru(x)[0]
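
A short usage sketch for RMVPE0Predictor; the audio path is a placeholder, while the checkpoint path matches the one used by F0Extractor earlier in this commit. The mel extractor is built for 16 kHz input, so the audio is loaded at that rate:

import librosa
from rvc.lib.predictors.RMVPE import RMVPE0Predictor

audio, _ = librosa.load("voice.wav", sr=16000)
rmvpe = RMVPE0Predictor("rvc/models/predictors/rmvpe.pt", is_half=False, device="cpu")
f0 = rmvpe.infer_from_audio(audio, thred=0.03)   # F0 in Hz per frame, 0 where unvoiced
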
rvc/lib/tools/analyzer.py ADDED
@@ -0,0 +1,76 @@
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import librosa.display
4
+ import librosa
5
+
6
+
7
+ def calculate_features(y, sr):
8
+ stft = np.abs(librosa.stft(y))
9
+ duration = librosa.get_duration(y=y, sr=sr)
10
+ cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
11
+ bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
12
+ rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
13
+ return stft, duration, cent, bw, rolloff
14
+
15
+
16
+ def plot_title(title):
17
+ plt.suptitle(title, fontsize=16, fontweight="bold")
18
+
19
+
20
+ def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
21
+ plt.subplot(3, 1, 1)
22
+ plt.imshow(
23
+ librosa.amplitude_to_db(stft, ref=np.max),
24
+ origin="lower",
25
+ extent=[0, duration, 0, sr / 1000],
26
+ aspect="auto",
27
+ cmap=cmap, # Change the colormap here
28
+ )
29
+ plt.colorbar(format="%+2.0f dB")
30
+ plt.xlabel("Time (s)")
31
+ plt.ylabel("Frequency (kHz)")
32
+ plt.title("Spectrogram")
33
+
34
+
35
+ def plot_waveform(y, sr, duration):
36
+ plt.subplot(3, 1, 2)
37
+ librosa.display.waveshow(y, sr=sr)
38
+ plt.xlabel("Time (s)")
39
+ plt.ylabel("Amplitude")
40
+ plt.title("Waveform")
41
+
42
+
43
+ def plot_features(times, cent, bw, rolloff, duration):
44
+ plt.subplot(3, 1, 3)
45
+ plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
46
+ plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
47
+ plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
48
+ plt.xlabel("Time (s)")
49
+ plt.title("Spectral Features")
50
+ plt.legend()
51
+
52
+
53
+ def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
54
+ y, sr = librosa.load(audio_file)
55
+ stft, duration, cent, bw, rolloff = calculate_features(y, sr)
56
+
57
+ plt.figure(figsize=(12, 10))
58
+
59
+ plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
60
+ plot_spectrogram(y, sr, stft, duration)
61
+ plot_waveform(y, sr, duration)
62
+ plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)
63
+
64
+ plt.tight_layout()
65
+
66
+ if save_plot_path:
67
+ plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
68
+ plt.close()
69
+
70
+ audio_info = f"""Sample Rate: {sr}\nDuration: {(
71
+ str(round(duration, 2)) + " seconds"
72
+ if duration < 60
73
+ else str(round(duration / 60, 2)) + " minutes"
74
+ )}\nNumber of Samples: {len(y)}\nNative Sample Rate: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""
75
+
76
+ return audio_info, save_plot_path
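A hypothetical call into the analyzer above; the input path is a placeholder, and the logs/ directory is created first because the default plot path points into it:

    import os
    from rvc.lib.tools.analyzer import analyze_audio

    os.makedirs("logs", exist_ok=True)
    info, plot_path = analyze_audio("example.wav")  # placeholder audio file
    print(info)       # sample rate, duration, sample count, channel layout
    print(plot_path)  # logs/audio_analysis.png by default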
rvc/lib/tools/gdown.py ADDED
@@ -0,0 +1,354 @@
1
+ import os
2
+ import re
3
+ import six
4
+ import sys
5
+ import json
6
+ import tqdm
7
+ import time
8
+ import shutil
9
+ import warnings
10
+ import tempfile
11
+ import textwrap
12
+ import requests
13
+ from six.moves import urllib_parse
14
+
15
+
16
+ def indent(text, prefix):
17
+ """Indent each non-empty line of text with the given prefix."""
18
+ return "".join(
19
+ (prefix + line if line.strip() else line) for line in text.splitlines(True)
20
+ )
21
+
22
+
23
+ class FileURLRetrievalError(Exception):
24
+ pass
25
+
26
+
27
+ class FolderContentsMaximumLimitError(Exception):
28
+ pass
29
+
30
+
31
+ def parse_url(url, warning=True):
32
+ """Parse URLs especially for Google Drive links.
33
+
34
+ Args:
35
+ url: URL to parse.
36
+ warning: Whether to warn if the URL is not a download link.
37
+
38
+ Returns:
39
+ A tuple (file_id, is_download_link), where file_id is the ID of the
40
+ file on Google Drive, and is_download_link is a flag indicating
41
+ whether the URL is a download link.
42
+ """
43
+ parsed = urllib_parse.urlparse(url)
44
+ query = urllib_parse.parse_qs(parsed.query)
45
+ is_gdrive = parsed.hostname in ("drive.google.com", "docs.google.com")
46
+ is_download_link = parsed.path.endswith("/uc")
47
+
48
+ if not is_gdrive:
49
+ return None, is_download_link
50
+
51
+ file_id = query.get("id", [None])[0]
52
+ if file_id is None:
53
+ for pattern in (
54
+ r"^/file/d/(.*?)/(edit|view)$",
55
+ r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$",
56
+ r"^/document/d/(.*?)/(edit|htmlview|view)$",
57
+ r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
58
+ r"^/presentation/d/(.*?)/(edit|htmlview|view)$",
59
+ r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
60
+ r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$",
61
+ r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
62
+ ):
63
+ match = re.match(pattern, parsed.path)
64
+ if match:
65
+ file_id = match.group(1)
66
+ break
67
+
68
+ if warning and not is_download_link:
69
+ warnings.warn(
70
+ "You specified a Google Drive link that is not the correct link "
71
+ "to download a file. You might want to try `--fuzzy` option "
72
+ f"or the following url: https://drive.google.com/uc?id={file_id}"
73
+ )
74
+
75
+ return file_id, is_download_link
76
+
77
+
78
+ CHUNK_SIZE = 512 * 1024 # 512KB
79
+ HOME = os.path.expanduser("~")
80
+
81
+
82
+ def get_url_from_gdrive_confirmation(contents):
83
+ """Extract the download URL from a Google Drive confirmation page."""
84
+ for pattern in (
85
+ r'href="(\/uc\?export=download[^"]+)',
86
+ r'href="/open\?id=([^"]+)"',
87
+ r'"downloadUrl":"([^"]+)',
88
+ ):
89
+ match = re.search(pattern, contents)
90
+ if match:
91
+ url = match.group(1)
92
+ if pattern == r'href="/open\?id=([^"]+)"':
93
+ uuid = re.search(
94
+ r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"',
95
+ contents,
96
+ ).group(1)
97
+ url = (
98
+ "https://drive.usercontent.google.com/download?id="
99
+ + url
100
+ + "&confirm=t&uuid="
101
+ + uuid
102
+ )
103
+ elif pattern == r'"downloadUrl":"([^"]+)':
104
+ url = url.replace("\\u003d", "=").replace("\\u0026", "&")
105
+ else:
106
+ url = "https://docs.google.com" + url.replace("&amp;", "&")
107
+ return url
108
+
109
+ match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
110
+ if match:
111
+ error = match.group(1)
112
+ raise FileURLRetrievalError(error)
113
+
114
+ raise FileURLRetrievalError(
115
+ "Cannot retrieve the public link of the file. "
116
+ "You may need to change the permission to "
117
+ "'Anyone with the link', or have had many accesses."
118
+ )
119
+
120
+
121
+ def _get_session(proxy, use_cookies, return_cookies_file=False):
122
+ """Create a requests session with optional proxy and cookie handling."""
123
+ sess = requests.session()
124
+ sess.headers.update(
125
+ {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
126
+ )
127
+
128
+ if proxy is not None:
129
+ sess.proxies = {"http": proxy, "https": proxy}
130
+ print("Using proxy:", proxy, file=sys.stderr)
131
+
132
+ cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
133
+ if os.path.exists(cookies_file) and use_cookies:
134
+ with open(cookies_file) as f:
135
+ cookies = json.load(f)
136
+ for k, v in cookies:
137
+ sess.cookies[k] = v
138
+
139
+ return (sess, cookies_file) if return_cookies_file else sess
140
+
141
+
142
+ def download(
143
+ url=None,
144
+ output=None,
145
+ quiet=False,
146
+ proxy=None,
147
+ speed=None,
148
+ use_cookies=True,
149
+ verify=True,
150
+ id=None,
151
+ fuzzy=True,
152
+ resume=False,
153
+ format=None,
154
+ ):
155
+ """Download file from URL.
156
+
157
+ Parameters
158
+ ----------
159
+ url: str
160
+ URL. Google Drive URL is also supported.
161
+ output: str
162
+ Output filename. Default is basename of URL.
163
+ quiet: bool
164
+ Suppress terminal output. Default is False.
165
+ proxy: str
166
+ Proxy.
167
+ speed: float
168
+ Download byte size per second (e.g., 256KB/s = 256 * 1024).
169
+ use_cookies: bool
170
+ Flag to use cookies. Default is True.
171
+ verify: bool or string
172
+ Either a bool, in which case it controls whether the server's TLS
173
+ certificate is verified, or a string, in which case it must be a path
174
+ to a CA bundle to use. Default is True.
175
+ id: str
176
+ Google Drive's file ID.
177
+ fuzzy: bool
178
+ Fuzzy extraction of Google Drive's file Id. Default is True.
179
+ resume: bool
180
+ Resume the download from existing tmp file if possible.
181
+ Default is False.
182
+ format: str, optional
183
+ Format of Google Docs, Spreadsheets and Slides. Default is:
184
+ - Google Docs: 'docx'
185
+ - Google Spreadsheet: 'xlsx'
186
+ - Google Slides: 'pptx'
187
+
188
+ Returns
189
+ -------
190
+ output: str
191
+ Output filename.
192
+ """
193
+ if not (id is None) ^ (url is None):
194
+ raise ValueError("Either url or id has to be specified")
195
+ if id is not None:
196
+ url = f"https://drive.google.com/uc?id={id}"
197
+
198
+ url_origin = url
199
+
200
+ sess, cookies_file = _get_session(
201
+ proxy=proxy, use_cookies=use_cookies, return_cookies_file=True
202
+ )
203
+
204
+ gdrive_file_id, is_gdrive_download_link = parse_url(url, warning=not fuzzy)
205
+
206
+ if fuzzy and gdrive_file_id:
207
+ # overwrite the url with fuzzy match of a file id
208
+ url = f"https://drive.google.com/uc?id={gdrive_file_id}"
209
+ url_origin = url
210
+ is_gdrive_download_link = True
211
+
212
+ while True:
213
+ res = sess.get(url, stream=True, verify=verify)
214
+
215
+ if url == url_origin and res.status_code == 500:
216
+ # The file could be Google Docs or Spreadsheets.
217
+ url = f"https://drive.google.com/open?id={gdrive_file_id}"
218
+ continue
219
+
220
+ if res.headers["Content-Type"].startswith("text/html"):
221
+ title = re.search("<title>(.+)</title>", res.text)
222
+ if title:
223
+ title = title.group(1)
224
+ if title.endswith(" - Google Docs"):
225
+ url = f"https://docs.google.com/document/d/{gdrive_file_id}/export?format={'docx' if format is None else format}"
226
+ continue
227
+ if title.endswith(" - Google Sheets"):
228
+ url = f"https://docs.google.com/spreadsheets/d/{gdrive_file_id}/export?format={'xlsx' if format is None else format}"
229
+ continue
230
+ if title.endswith(" - Google Slides"):
231
+ url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
232
+ continue
233
+ elif (
234
+ "Content-Disposition" in res.headers
235
+ and res.headers["Content-Disposition"].endswith("pptx")
236
+ and format not in (None, "pptx")
237
+ ):
238
+ url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
239
+ continue
240
+
241
+ if use_cookies:
242
+ os.makedirs(os.path.dirname(cookies_file), exist_ok=True)
243
+ with open(cookies_file, "w") as f:
244
+ cookies = [
245
+ (k, v)
246
+ for k, v in sess.cookies.items()
247
+ if not k.startswith("download_warning_")
248
+ ]
249
+ json.dump(cookies, f, indent=2)
250
+
251
+ if "Content-Disposition" in res.headers:
252
+ # This is the file
253
+ break
254
+ if not (gdrive_file_id and is_gdrive_download_link):
255
+ break
256
+
257
+ # Need to redirect with confirmation
258
+ try:
259
+ url = get_url_from_gdrive_confirmation(res.text)
260
+ except FileURLRetrievalError as e:
261
+ message = (
262
+ "Failed to retrieve file url:\n\n"
263
+ "{}\n\n"
264
+ "You may still be able to access the file from the browser:"
265
+ f"\n\n\t{url_origin}\n\n"
266
+ "but Gdown can't. Please check connections and permissions."
267
+ ).format(indent("\n".join(textwrap.wrap(str(e))), prefix="\t"))
268
+ raise FileURLRetrievalError(message)
269
+
270
+ if gdrive_file_id and is_gdrive_download_link:
271
+ content_disposition = urllib_parse.unquote(res.headers["Content-Disposition"])
272
+ filename_from_url = (
273
+ re.search(r"filename\*=UTF-8''(.*)", content_disposition)
274
+ or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)
275
+ ).group(1)
276
+ filename_from_url = filename_from_url.replace(os.path.sep, "_")
277
+ else:
278
+ filename_from_url = os.path.basename(url)
279
+
280
+ output = output or filename_from_url
281
+
282
+ output_is_path = isinstance(output, six.string_types)
283
+ if output_is_path and output.endswith(os.path.sep):
284
+ os.makedirs(output, exist_ok=True)
285
+ output = os.path.join(output, filename_from_url)
286
+
287
+ if output_is_path:
288
+ temp_dir = os.path.dirname(output) or "."
289
+ prefix = os.path.basename(output)
290
+ existing_tmp_files = [
291
+ os.path.join(temp_dir, file)
292
+ for file in os.listdir(temp_dir)
293
+ if file.startswith(prefix)
294
+ ]
295
+ if resume and existing_tmp_files:
296
+ if len(existing_tmp_files) > 1:
297
+ print(
298
+ "There are multiple temporary files to resume:",
299
+ file=sys.stderr,
300
+ )
301
+ for file in existing_tmp_files:
302
+ print(f"\t{file}", file=sys.stderr)
303
+ print(
304
+ "Please remove them except one to resume downloading.",
305
+ file=sys.stderr,
306
+ )
307
+ return
308
+ tmp_file = existing_tmp_files[0]
309
+ else:
310
+ resume = False
311
+ tmp_file = tempfile.mktemp(
312
+ suffix=tempfile.template, prefix=prefix, dir=temp_dir
313
+ )
314
+ f = open(tmp_file, "ab")
315
+ else:
316
+ tmp_file = None
317
+ f = output
318
+
319
+ if tmp_file is not None and f.tell() != 0:
320
+ headers = {"Range": f"bytes={f.tell()}-"}
321
+ res = sess.get(url, headers=headers, stream=True, verify=verify)
322
+
323
+ if not quiet:
324
+ if resume:
325
+ print("Resume:", tmp_file, file=sys.stderr)
326
+ print(
327
+ "To:",
328
+ os.path.abspath(output) if output_is_path else output,
329
+ file=sys.stderr,
330
+ )
331
+
332
+ try:
333
+ total = int(res.headers.get("Content-Length", 0))
334
+ if not quiet:
335
+ pbar = tqdm.tqdm(total=total, unit="B", unit_scale=True)
336
+ t_start = time.time()
337
+ for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
338
+ f.write(chunk)
339
+ if not quiet:
340
+ pbar.update(len(chunk))
341
+ if speed is not None:
342
+ elapsed_time_expected = 1.0 * pbar.n / speed
343
+ elapsed_time = time.time() - t_start
344
+ if elapsed_time < elapsed_time_expected:
345
+ time.sleep(elapsed_time_expected - elapsed_time)
346
+ if not quiet:
347
+ pbar.close()
348
+ if tmp_file:
349
+ f.close()
350
+ shutil.move(tmp_file, output)
351
+ finally:
352
+ sess.close()
353
+
354
+ return output
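A minimal sketch of calling the bundled downloader above; the file id is a placeholder, and the target must be shared as "Anyone with the link":

    from rvc.lib.tools import gdown

    out = gdown.download(
        id="FILE_ID_PLACEHOLDER",  # hypothetical Google Drive file id
        output="model.zip",        # local destination filename
        quiet=False,
    )
    print("saved to", out)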
rvc/lib/tools/launch_tensorboard.py ADDED
@@ -0,0 +1,21 @@
1
+ import time
2
+ import logging
3
+ from tensorboard import program
4
+
5
+ log_path = "logs"
6
+
7
+
8
+ def launch_tensorboard_pipeline():
9
+ logging.getLogger("root").setLevel(logging.WARNING)
10
+ logging.getLogger("tensorboard").setLevel(logging.WARNING)
11
+
12
+ tb = program.TensorBoard()
13
+ tb.configure(argv=[None, "--logdir", log_path])
14
+ url = tb.launch()
15
+
16
+ print(
17
+ f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
18
+ )
19
+
20
+ while True:
21
+ time.sleep(600)
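launch_tensorboard_pipeline blocks forever in its sleep loop, so a caller that needs to keep working would typically run it in a daemon thread. A sketch (the thread wrapper is an assumption, not part of the module):

    import threading
    from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

    # Start TensorBoard without blocking the main program.
    t = threading.Thread(target=launch_tensorboard_pipeline, daemon=True)
    t.start()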
rvc/lib/tools/model_download.py ADDED
@@ -0,0 +1,385 @@
1
+ import os
2
+ import re
3
+ import six
4
+ import sys
5
+ import wget
6
+ import shutil
7
+ import zipfile
8
+ import requests
9
+ from bs4 import BeautifulSoup
10
+ from urllib.parse import unquote, urlencode, parse_qs, urlparse
11
+
12
+ now_dir = os.getcwd()
13
+ sys.path.append(now_dir)
14
+
15
+ from rvc.lib.utils import format_title
16
+ from rvc.lib.tools import gdown
17
+
18
+
19
+ def find_folder_parent(search_dir, folder_name):
20
+ for dirpath, dirnames, _ in os.walk(search_dir):
21
+ if folder_name in dirnames:
22
+ return os.path.abspath(dirpath)
23
+ return None
24
+
25
+
26
+ file_path = find_folder_parent(now_dir, "logs")
27
+ zips_path = os.path.join(file_path, "zips")
28
+
29
+
30
+ def search_pth_index(folder):
31
+ pth_paths = [
32
+ os.path.join(folder, file)
33
+ for file in os.listdir(folder)
34
+ if os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth")
35
+ ]
36
+ index_paths = [
37
+ os.path.join(folder, file)
38
+ for file in os.listdir(folder)
39
+ if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index")
40
+ ]
41
+
42
+ return pth_paths, index_paths
43
+
44
+
45
+ def get_mediafire_download_link(url):
46
+ response = requests.get(url)
47
+ response.raise_for_status()
48
+ soup = BeautifulSoup(response.text, "html.parser")
49
+ download_button = soup.find(
50
+ "a", {"class": "input popsok", "aria-label": "Download file"}
51
+ )
52
+ if download_button:
53
+ download_link = download_button.get("href")
54
+ return download_link
55
+ else:
56
+ return None
57
+
58
+
59
+ def download_from_url(url):
60
+ os.makedirs(zips_path, exist_ok=True)
61
+ if url != "":
62
+ if "drive.google.com" in url:
63
+ if "file/d/" in url:
64
+ file_id = url.split("file/d/")[1].split("/")[0]
65
+ elif "id=" in url:
66
+ file_id = url.split("id=")[1].split("&")[0]
67
+ else:
68
+ return None
69
+
70
+ if file_id:
71
+ os.chdir(zips_path)
72
+ try:
73
+ gdown.download(
74
+ f"https://drive.google.com/uc?id={file_id}",
75
+ quiet=True,
76
+ fuzzy=True,
77
+ )
78
+ except Exception as error:
79
+ error_message = str(
80
+ f"An error occurred downloading the file: {error}"
81
+ )
82
+ if (
83
+ "Too many users have viewed or downloaded this file recently"
84
+ in error_message
85
+ ):
86
+ os.chdir(now_dir)
87
+ return "too much use"
88
+ elif (
89
+ "Cannot retrieve the public link of the file." in error_message
90
+ ):
91
+ os.chdir(now_dir)
92
+ return "private link"
93
+ else:
94
+ print(error_message)
95
+ os.chdir(now_dir)
96
+ return None
97
+ elif "disk.yandex.ru" in url:
98
+ base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
99
+ public_key = url
100
+ final_url = base_url + urlencode(dict(public_key=public_key))
101
+ response = requests.get(final_url)
102
+ download_url = response.json()["href"]
103
+ download_response = requests.get(download_url)
104
+
105
+ if download_response.status_code == 200:
106
+ filename = parse_qs(urlparse(unquote(download_url)).query).get(
107
+ "filename", [""]
108
+ )[0]
109
+ if filename:
110
+ os.chdir(zips_path)
111
+ with open(filename, "wb") as f:
112
+ f.write(download_response.content)
113
+ else:
114
+ print("Failed to get filename from URL.")
115
+ return None
116
+
117
+ elif "pixeldrain.com" in url:
118
+ try:
119
+ file_id = url.split("pixeldrain.com/u/")[1]
120
+ os.chdir(zips_path)
121
+ print(file_id)
122
+ response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
123
+ if response.status_code == 200:
124
+ file_name = (
125
+ response.headers.get("Content-Disposition")
126
+ .split("filename=")[-1]
127
+ .strip('";')
128
+ )
129
+ os.makedirs(zips_path, exist_ok=True)
130
+ with open(os.path.join(zips_path, file_name), "wb") as newfile:
131
+ newfile.write(response.content)
132
+ os.chdir(file_path)
133
+ return "downloaded"
134
+ else:
135
+ os.chdir(file_path)
136
+ return None
137
+ except Exception as error:
138
+ print(f"An error occurred downloading the file: {error}")
139
+ os.chdir(file_path)
140
+ return None
141
+
142
+ elif "cdn.discordapp.com" in url:
143
+ file = requests.get(url)
144
+ os.chdir(zips_path)
145
+ if file.status_code == 200:
146
+ name = url.split("/")
147
+ with open(os.path.join(name[-1]), "wb") as newfile:
148
+ newfile.write(file.content)
149
+ else:
150
+ return None
151
+ elif "/blob/" in url or "/resolve/" in url:
152
+ os.chdir(zips_path)
153
+ if "/blob/" in url:
154
+ url = url.replace("/blob/", "/resolve/")
155
+
156
+ response = requests.get(url, stream=True)
157
+ if response.status_code == 200:
158
+ content_disposition = six.moves.urllib_parse.unquote(
159
+ response.headers["Content-Disposition"]
160
+ )
161
+ m = re.search(r'filename="([^"]+)"', content_disposition)
162
+ file_name = m.groups()[0]
163
+ file_name = file_name.replace(os.path.sep, "_")
164
+ total_size_in_bytes = int(response.headers.get("content-length", 0))
165
+ block_size = 1024
166
+ progress_bar_length = 50
167
+ progress = 0
168
+
169
+ with open(os.path.join(zips_path, file_name), "wb") as file:
170
+ for data in response.iter_content(block_size):
171
+ file.write(data)
172
+ progress += len(data)
173
+ progress_percent = int((progress / total_size_in_bytes) * 100)
174
+ num_dots = int(
175
+ (progress / total_size_in_bytes) * progress_bar_length
176
+ )
177
+ progress_bar = (
178
+ "["
179
+ + "." * num_dots
180
+ + " " * (progress_bar_length - num_dots)
181
+ + "]"
182
+ )
183
+ print(
184
+ f"{progress_percent}% {progress_bar} {progress}/{total_size_in_bytes} ",
185
+ end="\r",
186
+ )
187
+ if progress_percent == 100:
188
+ print("\n")
189
+
190
+ else:
191
+ os.chdir(now_dir)
192
+ return None
193
+ elif "/tree/main" in url:
194
+ os.chdir(zips_path)
195
+ response = requests.get(url)
196
+ soup = BeautifulSoup(response.content, "html.parser")
197
+ temp_url = ""
198
+ for link in soup.find_all("a", href=True):
199
+ if link["href"].endswith(".zip"):
200
+ temp_url = link["href"]
201
+ break
202
+ if temp_url:
203
+ url = temp_url
204
+ url = url.replace("blob", "resolve")
205
+ if "huggingface.co" not in url:
206
+ url = "https://huggingface.co" + url
207
+
208
+ wget.download(url)
209
+ else:
210
+ os.chdir(now_dir)
211
+ return None
212
+ elif "applio.org" in url:
213
+ parts = url.split("/")
214
+ id_with_query = parts[-1]
215
+ id_parts = id_with_query.split("?")
216
+ id_number = id_parts[0]
217
+
218
+ url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
219
+ headers = {
220
+ "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
221
+ }
222
+
223
+ params = {"id": f"eq.{id_number}"}
224
+ response = requests.get(url, headers=headers, params=params)
225
+ if response.status_code == 200:
226
+ json_response = response.json()
227
+ print(json_response)
228
+ if json_response:
229
+ link = json_response[0]["link"]
230
+ verify = download_from_url(link)
231
+ if verify == "downloaded":
232
+ return "downloaded"
233
+ else:
234
+ return None
235
+ else:
236
+ return None
237
+ else:
238
+ try:
239
+ os.chdir(zips_path)
240
+ wget.download(url)
241
+ except Exception as error:
242
+ os.chdir(now_dir)
243
+ print(f"An error occurred downloading the file: {error}")
244
+ return None
245
+
246
+ for currentPath, _, zipFiles in os.walk(zips_path):
247
+ for Files in zipFiles:
248
+ filePart = Files.split(".")
249
+ extensionFile = filePart[len(filePart) - 1]
250
+ filePart.pop()
251
+ nameFile = "_".join(filePart)
252
+ realPath = os.path.join(currentPath, Files)
253
+ os.rename(realPath, nameFile + "." + extensionFile)
254
+
255
+ os.chdir(now_dir)
256
+ return "downloaded"
257
+
258
+ os.chdir(now_dir)
259
+ return None
260
+
261
+
262
+ def extract_and_show_progress(zipfile_path, unzips_path):
263
+ try:
264
+ with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
265
+ for file_info in zip_ref.infolist():
266
+ zip_ref.extract(file_info, unzips_path)
267
+ os.remove(zipfile_path)
268
+ return True
269
+ except Exception as error:
270
+ print(f"An error occurred extracting the zip file: {error}")
271
+ return False
272
+
273
+
274
+ def unzip_file(zip_path, zip_file_name):
275
+ zip_file_path = os.path.join(zip_path, zip_file_name + ".zip")
276
+ extract_path = os.path.join(file_path, zip_file_name)
277
+ with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
278
+ zip_ref.extractall(extract_path)
279
+ os.remove(zip_file_path)
280
+
281
+
282
+ def model_download_pipeline(url: str):
283
+ try:
284
+ verify = download_from_url(url)
285
+ if verify == "downloaded":
286
+ extract_folder_path = ""
287
+ for filename in os.listdir(zips_path):
288
+ if filename.endswith(".zip"):
289
+ zipfile_path = os.path.join(zips_path, filename)
290
+ print("Proceeding with the extraction...")
291
+
292
+ model_zip = os.path.basename(zipfile_path)
293
+ model_name = format_title(model_zip.split(".zip")[0])
294
+ extract_folder_path = os.path.join(
295
+ "logs",
296
+ os.path.normpath(model_name),
297
+ )
298
+ success = extract_and_show_progress(
299
+ zipfile_path, extract_folder_path
300
+ )
301
+
302
+ macosx_path = os.path.join(extract_folder_path, "__MACOSX")
303
+ if os.path.exists(macosx_path):
304
+ shutil.rmtree(macosx_path)
305
+
306
+ subfolders = [
307
+ f
308
+ for f in os.listdir(extract_folder_path)
309
+ if os.path.isdir(os.path.join(extract_folder_path, f))
310
+ ]
311
+ if len(subfolders) == 1:
312
+ subfolder_path = os.path.join(
313
+ extract_folder_path, subfolders[0]
314
+ )
315
+ for item in os.listdir(subfolder_path):
316
+ s = os.path.join(subfolder_path, item)
317
+ d = os.path.join(extract_folder_path, item)
318
+ shutil.move(s, d)
319
+ os.rmdir(subfolder_path)
320
+
321
+ for item in os.listdir(extract_folder_path):
322
+ if ".pth" in item:
323
+ file_name = item.split(".pth")[0]
324
+ if file_name != model_name:
325
+ os.rename(
326
+ os.path.join(extract_folder_path, item),
327
+ os.path.join(
328
+ extract_folder_path, model_name + ".pth"
329
+ ),
330
+ )
331
+ else:
332
+ if "v2" not in item:
333
+ if "_nprobe_1_" in item and "_v1" in item:
334
+ file_name = item.split("_nprobe_1_")[1].split(
335
+ "_v1"
336
+ )[0]
337
+ if file_name != model_name:
338
+ new_file_name = (
339
+ item.split("_nprobe_1_")[0]
340
+ + "_nprobe_1_"
341
+ + model_name
342
+ + "_v1"
343
+ )
344
+ os.rename(
345
+ os.path.join(extract_folder_path, item),
346
+ os.path.join(
347
+ extract_folder_path,
348
+ new_file_name + ".index",
349
+ ),
350
+ )
351
+ else:
352
+ if "_nprobe_1_" in item and "_v2" in item:
353
+ file_name = item.split("_nprobe_1_")[1].split(
354
+ "_v2"
355
+ )[0]
356
+ if file_name != model_name:
357
+ new_file_name = (
358
+ item.split("_nprobe_1_")[0]
359
+ + "_nprobe_1_"
360
+ + model_name
361
+ + "_v2"
362
+ )
363
+ os.rename(
364
+ os.path.join(extract_folder_path, item),
365
+ os.path.join(
366
+ extract_folder_path,
367
+ new_file_name + ".index",
368
+ ),
369
+ )
370
+
371
+ if success:
372
+ print(f"Model {model_name} downloaded!")
373
+ else:
374
+ print(f"Error downloading {model_name}")
375
+ return "Error"
376
+ if extract_folder_path == "":
377
+ print("Zip file was not found.")
378
+ return "Error"
379
+ result = search_pth_index(extract_folder_path)
380
+ return result
381
+ else:
382
+ return "Error"
383
+ except Exception as error:
384
+ print(f"An unexpected error occurred: {error}")
385
+ return "Error"
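A hypothetical call into the pipeline above; the URL is a placeholder for a zip containing a .pth/.index pair, and the module expects a logs/ folder to exist under the working directory when it is imported:

    from rvc.lib.tools.model_download import model_download_pipeline

    result = model_download_pipeline("https://example.com/voice_model.zip")
    if result == "Error":
        print("download or extraction failed")
    else:
        pth_paths, index_paths = result  # lists returned by search_pth_index
        print(pth_paths, index_paths)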
rvc/lib/tools/prerequisites_download.py ADDED
@@ -0,0 +1,104 @@
1
+ import os
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from tqdm import tqdm
4
+ import requests
5
+
6
+ url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"
7
+
8
+ # Define the file lists
9
+ models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])]
10
+ embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])]
11
+ executables_list = [
12
+ ("", ["ffmpeg.exe", "ffprobe.exe"]),
13
+ ]
14
+
15
+ folder_mapping_list = {
16
+ "embedders/contentvec/": "rvc/models/embedders/contentvec/",
17
+ "predictors/": "rvc/models/predictors/",
18
+ "formant/": "rvc/models/formant/",
19
+ }
20
+
21
+
22
+ def get_file_size_all(file_list):
23
+ """
24
+ Calculate the total size of files to be downloaded, regardless of local existence.
25
+ """
26
+ total_size = 0
27
+ for remote_folder, files in file_list:
28
+ # Use the mapping if available; otherwise, use an empty local folder
29
+ local_folder = folder_mapping_list.get(remote_folder, "")
30
+ for file in files:
31
+ url = f"{url_base}/{remote_folder}{file}"
32
+ response = requests.head(url)
33
+ total_size += int(response.headers.get("content-length", 0))
34
+ return total_size
35
+
36
+
37
+ def download_file(url, destination_path, global_bar):
38
+ """
39
+ Download a file from the given URL to the specified destination path,
40
+ updating the global progress bar as data is downloaded.
41
+ """
42
+ dir_name = os.path.dirname(destination_path)
43
+ if dir_name:
44
+ os.makedirs(dir_name, exist_ok=True)
45
+ response = requests.get(url, stream=True)
46
+ block_size = 1024
47
+ with open(destination_path, "wb") as file:
48
+ for data in response.iter_content(block_size):
49
+ file.write(data)
50
+ global_bar.update(len(data))
51
+
52
+
53
+ def download_mapping_files(file_mapping_list, global_bar):
54
+ """
55
+ Download all files in the provided file mapping list using a thread pool executor,
56
+ and update the global progress bar as downloads progress.
57
+ This version downloads all files regardless of whether they already exist.
58
+ """
59
+ with ThreadPoolExecutor() as executor:
60
+ futures = []
61
+ for remote_folder, file_list in file_mapping_list:
62
+ local_folder = folder_mapping_list.get(remote_folder, "")
63
+ for file in file_list:
64
+ destination_path = os.path.join(local_folder, file)
65
+ url = f"{url_base}/{remote_folder}{file}"
66
+ futures.append(
67
+ executor.submit(download_file, url, destination_path, global_bar)
68
+ )
69
+ for future in futures:
70
+ future.result()
71
+
72
+
73
+ def calculate_total_size(models, exe):
74
+ """
75
+ Calculate the total size of all files to be downloaded based on selected categories.
76
+ """
77
+ total_size = 0
78
+ if models:
79
+ total_size += get_file_size_all(models_list)
80
+ total_size += get_file_size_all(embedders_list)
81
+ if exe and os.name == "nt":
82
+ total_size += get_file_size_all(executables_list)
83
+ return total_size
84
+
85
+
86
+ def prerequisites_download_pipeline(models, exe):
87
+ """
88
+ Manage the download pipeline for different categories of files.
89
+ """
90
+ total_size = calculate_total_size(models, exe)
91
+ if total_size > 0:
92
+ with tqdm(
93
+ total=total_size, unit="iB", unit_scale=True, desc="Downloading all files"
94
+ ) as global_bar:
95
+ if models:
96
+ download_mapping_files(models_list, global_bar)
97
+ download_mapping_files(embedders_list, global_bar)
98
+ if exe:
99
+ if os.name == "nt":
100
+ download_mapping_files(executables_list, global_bar)
101
+ else:
102
+ print("No executables needed for non-Windows systems.")
103
+ else:
104
+ print("No files to download.")
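A minimal usage sketch of the prerequisites pipeline above: fetch the predictor and embedder models (plus the ffmpeg executables on Windows) behind a single combined progress bar:

    from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline

    prerequisites_download_pipeline(models=True, exe=True)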
rvc/lib/tools/pretrained_selector.py ADDED
@@ -0,0 +1,63 @@
1
+ def pretrained_selector(pitch_guidance):
2
+ if pitch_guidance:
3
+ return {
4
+ "v1": {
5
+ 32000: (
6
+ "rvc/models/pretraineds/pretrained_v1/f0G32k.pth",
7
+ "rvc/models/pretraineds/pretrained_v1/f0D32k.pth",
8
+ ),
9
+ 40000: (
10
+ "rvc/models/pretraineds/pretrained_v1/f0G40k.pth",
11
+ "rvc/models/pretraineds/pretrained_v1/f0D40k.pth",
12
+ ),
13
+ 48000: (
14
+ "rvc/models/pretraineds/pretrained_v1/f0G48k.pth",
15
+ "rvc/models/pretraineds/pretrained_v1/f0D48k.pth",
16
+ ),
17
+ },
18
+ "v2": {
19
+ 32000: (
20
+ "rvc/models/pretraineds/pretrained_v2/f0G32k.pth",
21
+ "rvc/models/pretraineds/pretrained_v2/f0D32k.pth",
22
+ ),
23
+ 40000: (
24
+ "rvc/models/pretraineds/pretrained_v2/f0G40k.pth",
25
+ "rvc/models/pretraineds/pretrained_v2/f0D40k.pth",
26
+ ),
27
+ 48000: (
28
+ "rvc/models/pretraineds/pretrained_v2/f0G48k.pth",
29
+ "rvc/models/pretraineds/pretrained_v2/f0D48k.pth",
30
+ ),
31
+ },
32
+ }
33
+ else:
34
+ return {
35
+ "v1": {
36
+ 32000: (
37
+ "rvc/models/pretraineds/pretrained_v1/G32k.pth",
38
+ "rvc/models/pretraineds/pretrained_v1/D32k.pth",
39
+ ),
40
+ 40000: (
41
+ "rvc/models/pretraineds/pretrained_v1/G40k.pth",
42
+ "rvc/models/pretraineds/pretrained_v1/D40k.pth",
43
+ ),
44
+ 48000: (
45
+ "rvc/models/pretraineds/pretrained_v1/G48k.pth",
46
+ "rvc/models/pretraineds/pretrained_v1/D48k.pth",
47
+ ),
48
+ },
49
+ "v2": {
50
+ 32000: (
51
+ "rvc/models/pretraineds/pretrained_v2/G32k.pth",
52
+ "rvc/models/pretraineds/pretrained_v2/D32k.pth",
53
+ ),
54
+ 40000: (
55
+ "rvc/models/pretraineds/pretrained_v2/G40k.pth",
56
+ "rvc/models/pretraineds/pretrained_v2/D40k.pth",
57
+ ),
58
+ 48000: (
59
+ "rvc/models/pretraineds/pretrained_v2/G48k.pth",
60
+ "rvc/models/pretraineds/pretrained_v2/D48k.pth",
61
+ ),
62
+ },
63
+ }
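A small sketch of looking up pretrained generator/discriminator paths with the selector above; the version key and sample rate are example values:

    from rvc.lib.tools.pretrained_selector import pretrained_selector

    paths = pretrained_selector(pitch_guidance=True)
    g_path, d_path = paths["v2"][40000]
    print(g_path, d_path)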
rvc/lib/tools/split_audio.py ADDED
@@ -0,0 +1,56 @@
1
+ import numpy as np
2
+ import librosa
3
+
4
+
5
+ def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
6
+ """
7
+ Splits an audio signal into non-silent segments based on a silence threshold.
8
+
9
+ Parameters:
10
+ - audio (np.ndarray): The audio signal to split.
11
+ - sr (int): The sample rate of the input audio (default is 16000).
12
+ - silence_thresh (int): Silence threshold (default =-60dB)
13
+ - min_silence_len (int): Minimum silence duration (default 250ms).
14
+
15
+ Returns:
16
+ - list of np.ndarray: A list of audio segments.
17
+ - np.ndarray: The intervals where the audio was split.
18
+ """
19
+ frame_length = int(min_silence_len / 1000 * sr)
20
+ hop_length = frame_length // 2
21
+ intervals = librosa.effects.split(
22
+ audio, top_db=-silence_thresh, frame_length=frame_length, hop_length=hop_length
23
+ )
24
+ audio_segments = [audio[start:end] for start, end in intervals]
25
+
26
+ return audio_segments, intervals
27
+
28
+
29
+ def merge_audio(audio_segments, intervals, sr_orig, sr_new):
30
+ """
31
+ Merges audio segments back into a single audio signal, filling gaps with silence.
32
+
33
+ Parameters:
34
+ - audio_segments (list of np.ndarray): The non-silent audio segments.
35
+ - intervals (np.ndarray): The intervals used for splitting the original audio.
36
+ - sr_orig (int): The sample rate of the original audio
37
+ - sr_new (int): The sample rate of the model
38
+
39
+ Returns:
40
+ - np.ndarray: The merged audio signal with silent gaps restored.
41
+ """
42
+ sr_ratio = sr_new / sr_orig if sr_new > sr_orig else 1.0
43
+
44
+ merged_audio = np.zeros(
45
+ int(intervals[0][0] * sr_ratio if intervals[0][0] > 0 else 0),
46
+ dtype=audio_segments[0].dtype,
47
+ )
48
+
49
+ merged_audio = np.concatenate((merged_audio, audio_segments[0]))
50
+
51
+ for i in range(1, len(intervals)):
52
+ silence_duration = int((intervals[i][0] - intervals[i - 1][1]) * sr_ratio)
53
+ silence = np.zeros(silence_duration, dtype=audio_segments[0].dtype)
54
+ merged_audio = np.concatenate((merged_audio, silence, audio_segments[i]))
55
+
56
+ return merged_audio
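A hypothetical round trip with the helpers above: split a 16 kHz signal on silence, then stitch the segments back together at the model sample rate. The input file is a placeholder and is assumed to contain at least one non-silent interval:

    import librosa
    from rvc.lib.tools.split_audio import process_audio, merge_audio

    audio, sr = librosa.load("example.wav", sr=16000)  # placeholder input
    segments, intervals = process_audio(audio, sr=sr)
    merged = merge_audio(segments, intervals, sr_orig=sr, sr_new=40000)
    print(len(segments), merged.shape)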
rvc/lib/tools/tts.py ADDED
@@ -0,0 +1,29 @@
1
+ import sys
2
+ import asyncio
3
+ import edge_tts
4
+ import os
5
+
6
+
7
+ async def main():
8
+ # Parse command line arguments
9
+ tts_file = str(sys.argv[1])
10
+ text = str(sys.argv[2])
11
+ voice = str(sys.argv[3])
12
+ rate = int(sys.argv[4])
13
+ output_file = str(sys.argv[5])
14
+
15
+ rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
16
+ if tts_file and os.path.exists(tts_file):
17
+ text = ""
18
+ try:
19
+ with open(tts_file, "r", encoding="utf-8") as file:
20
+ text = file.read()
21
+ except UnicodeDecodeError:
22
+ with open(tts_file, "r") as file:
23
+ text = file.read()
24
+ await edge_tts.Communicate(text, voice, rate=rates).save(output_file)
25
+ print(f"TTS with {voice} completed. Output TTS file: '{output_file}'")
26
+
27
+
28
+ if __name__ == "__main__":
29
+ asyncio.run(main())
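The script above is driven purely by positional command-line arguments, much like the TTS helper in scrpt.py later invokes it. A sketch of calling it as a subprocess with example values (the voice name and output path are assumptions):

    import subprocess, sys

    subprocess.run([
        sys.executable, "rvc/lib/tools/tts.py",
        "",                  # tts_file: empty -> synthesize the text below
        "Hello from RVC.",   # text to speak
        "en-US-AriaNeural",  # example edge-tts voice
        "0",                 # rate adjustment in percent
        "tts_output.wav",    # output file
    ])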
rvc/lib/tools/tts_voices.json ADDED
The diff for this file is too large to render. See raw diff
 
rvc/lib/utils.py ADDED
@@ -0,0 +1,137 @@
1
+ import os, sys
2
+ import librosa
3
+ import soundfile as sf
4
+ import numpy as np
5
+ import re
6
+ import unicodedata
7
+ import wget
8
+ from pydub import AudioSegment
9
+ from torch import nn
10
+
11
+ import logging
12
+ from transformers import HubertModel
13
+ import warnings
14
+
15
+ # Remove this to see warnings about transformers models
16
+ warnings.filterwarnings("ignore")
17
+
18
+ logging.getLogger("fairseq").setLevel(logging.ERROR)
19
+ logging.getLogger("faiss.loader").setLevel(logging.ERROR)
20
+ logging.getLogger("transformers").setLevel(logging.ERROR)
21
+ logging.getLogger("torch").setLevel(logging.ERROR)
22
+
23
+ now_dir = os.getcwd()
24
+ sys.path.append(now_dir)
25
+
26
+ base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift")
27
+ stft = base_path + ".exe" if sys.platform == "win32" else base_path
28
+
29
+
30
+ class HubertModelWithFinalProj(HubertModel):
31
+ def __init__(self, config):
32
+ super().__init__(config)
33
+ self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
34
+
35
+
36
+ def load_audio(file, sample_rate):
37
+ try:
38
+ file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
39
+ audio, sr = sf.read(file)
40
+ if len(audio.shape) > 1:
41
+ audio = librosa.to_mono(audio.T)
42
+ if sr != sample_rate:
43
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
44
+ except Exception as error:
45
+ raise RuntimeError(f"An error occurred loading the audio: {error}")
46
+
47
+ return audio.flatten()
48
+
49
+
50
+ def load_audio_infer(
51
+ file,
52
+ sample_rate,
53
+ **kwargs,
54
+ ):
55
+ formant_shifting = kwargs.get("formant_shifting", False)
56
+ try:
57
+ file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
58
+ if not os.path.isfile(file):
59
+ raise FileNotFoundError(f"File not found: {file}")
60
+ audio, sr = sf.read(file)
61
+ if len(audio.shape) > 1:
62
+ audio = librosa.to_mono(audio.T)
63
+ if sr != sample_rate:
64
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
65
+ if formant_shifting:
66
+ formant_qfrency = kwargs.get("formant_qfrency", 0.8)
67
+ formant_timbre = kwargs.get("formant_timbre", 0.8)
68
+
69
+ from stftpitchshift import StftPitchShift
70
+
71
+ pitchshifter = StftPitchShift(1024, 32, sample_rate)
72
+ audio = pitchshifter.shiftpitch(
73
+ audio,
74
+ factors=1,
75
+ quefrency=formant_qfrency * 1e-3,
76
+ distortion=formant_timbre,
77
+ )
78
+ except Exception as error:
79
+ raise RuntimeError(f"An error occurred loading the audio: {error}")
80
+ return np.array(audio).flatten()
81
+
82
+
83
+ def format_title(title):
84
+ formatted_title = (
85
+ unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
86
+ )
87
+ formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
88
+ formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
89
+ formatted_title = re.sub(r"\s+", "_", formatted_title)
90
+ return formatted_title
91
+
92
+
93
+ def load_embedding(embedder_model, custom_embedder=None):
94
+ embedder_root = os.path.join(now_dir, "rvc", "models", "embedders")
95
+ embedding_list = {
96
+ "contentvec": os.path.join(embedder_root, "contentvec"),
97
+ "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
98
+ "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
99
+ "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
100
+ }
101
+
102
+ online_embedders = {
103
+ "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin",
104
+ "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin",
105
+ "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin",
106
+ "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin",
107
+ }
108
+
109
+ config_files = {
110
+ "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json",
111
+ "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json",
112
+ "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json",
113
+ "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json",
114
+ }
115
+
116
+ if embedder_model == "custom":
117
+ if os.path.exists(custom_embedder):
118
+ model_path = custom_embedder
119
+ else:
120
+ print(f"Custom embedder not found: {custom_embedder}, using contentvec")
121
+ model_path = embedding_list["contentvec"]
122
+ else:
123
+ model_path = embedding_list[embedder_model]
124
+ bin_file = os.path.join(model_path, "pytorch_model.bin")
125
+ json_file = os.path.join(model_path, "config.json")
126
+ os.makedirs(model_path, exist_ok=True)
127
+ if not os.path.exists(bin_file):
128
+ url = online_embedders[embedder_model]
129
+ print(f"Downloading {url} to {model_path}...")
130
+ wget.download(url, out=bin_file)
131
+ if not os.path.exists(json_file):
132
+ url = config_files[embedder_model]
133
+ print(f"Downloading {url} to {model_path}...")
134
+ wget.download(url, out=json_file)
135
+
136
+ models = HubertModelWithFinalProj.from_pretrained(model_path)
137
+ return models
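A small illustration of two helpers above; the audio path is hypothetical, and importing the module also pulls in transformers, so it is not free:

    from rvc.lib.utils import format_title, load_audio

    print(format_title("My Model (v2)!"))     # -> "My_Model_v2"
    audio = load_audio("example.wav", 16000)  # mono float array at 16 kHz
    print(audio.shape)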
rvc/lib/zluda.py ADDED
@@ -0,0 +1,43 @@
1
+ import torch
2
+
3
+ if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"):
4
+ _torch_stft = torch.stft
5
+
6
+ def z_stft(
7
+ audio: torch.Tensor,
8
+ n_fft: int,
9
+ hop_length: int = None,
10
+ win_length: int = None,
11
+ window: torch.Tensor = None,
12
+ center: bool = True,
13
+ pad_mode: str = "reflect",
14
+ normalized: bool = False,
15
+ onesided: bool = None,
16
+ return_complex: bool = None,
17
+ ):
18
+ sd = audio.device
19
+ return _torch_stft(
20
+ audio.to("cpu"),
21
+ n_fft=n_fft,
22
+ hop_length=hop_length,
23
+ win_length=win_length,
24
+ window=window.to("cpu"),
25
+ center=center,
26
+ pad_mode=pad_mode,
27
+ normalized=normalized,
28
+ onesided=onesided,
29
+ return_complex=return_complex,
30
+ ).to(sd)
31
+
32
+ def z_jit(f, *_, **__):
33
+ f.graph = torch._C.Graph()
34
+ return f
35
+
36
+ # hijacks
37
+ torch.stft = z_stft
38
+ torch.jit.script = z_jit
39
+ # disabling unsupported cudnn
40
+ torch.backends.cudnn.enabled = False
41
+ torch.backends.cuda.enable_flash_sdp(False)
42
+ torch.backends.cuda.enable_math_sdp(True)
43
+ torch.backends.cuda.enable_mem_efficient_sdp(False)
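The module above patches torch in place only when a ZLUDA device is detected, so applying it is just a matter of importing it once, early in the program:

    import rvc.lib.zluda  # noqa: F401  (side-effect import applies the patches)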
scrpt.py ADDED
@@ -0,0 +1,1897 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import argparse
5
+ import subprocess
6
+ from functools import lru_cache
7
+ from distutils.util import strtobool
8
+
9
+ now_dir = os.getcwd()
10
+ sys.path.append(now_dir)
11
+
12
+ current_script_directory = os.path.dirname(os.path.realpath(__file__))
13
+ logs_path = os.path.join(current_script_directory, "logs")
14
+
15
+ from rvc.lib.tools.analyzer import analyze_audio
16
+ from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
17
+ from rvc.lib.tools.model_download import model_download_pipeline
18
+
19
+ python = sys.executable
20
+
21
+
22
+ # Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
23
+ @lru_cache(maxsize=1) # Cache only one result since the file is static
24
+ def load_voices_data():
25
+ with open(
26
+ os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8"
27
+ ) as file:
28
+ return json.load(file)
29
+
30
+
31
+ voices_data = load_voices_data()
32
+ locales = list({voice["ShortName"] for voice in voices_data})
33
+
34
+
35
+ @lru_cache(maxsize=None)
36
+ def import_voice_converter():
37
+ from rvc.infer.infer import VoiceConverter
38
+
39
+ return VoiceConverter()
40
+
41
+
42
+ @lru_cache(maxsize=1)
43
+ def get_config():
44
+ from rvc.configs.config import Config
45
+
46
+ return Config()
47
+
48
+
49
+ # Infer
50
+ def run_infer_script(
51
+ pitch: int,
52
+ filter_radius: int,
53
+ index_rate: float,
54
+ volume_envelope: int,
55
+ protect: float,
56
+ hop_length: int,
57
+ f0_method: str,
58
+ input_path: str,
59
+ output_path: str,
60
+ pth_path: str,
61
+ index_path: str,
62
+ split_audio: bool,
63
+ f0_autotune: bool,
64
+ f0_autotune_strength: float,
65
+ clean_audio: bool,
66
+ clean_strength: float,
67
+ export_format: str,
68
+ f0_file: str,
69
+ embedder_model: str,
70
+ embedder_model_custom: str = None,
71
+ formant_shifting: bool = False,
72
+ formant_qfrency: float = 1.0,
73
+ formant_timbre: float = 1.0,
74
+ post_process: bool = False,
75
+ reverb: bool = False,
76
+ pitch_shift: bool = False,
77
+ limiter: bool = False,
78
+ gain: bool = False,
79
+ distortion: bool = False,
80
+ chorus: bool = False,
81
+ bitcrush: bool = False,
82
+ clipping: bool = False,
83
+ compressor: bool = False,
84
+ delay: bool = False,
85
+ reverb_room_size: float = 0.5,
86
+ reverb_damping: float = 0.5,
87
+ reverb_wet_gain: float = 0.5,
88
+ reverb_dry_gain: float = 0.5,
89
+ reverb_width: float = 0.5,
90
+ reverb_freeze_mode: float = 0.5,
91
+ pitch_shift_semitones: float = 0.0,
92
+ limiter_threshold: float = -6,
93
+ limiter_release_time: float = 0.01,
94
+ gain_db: float = 0.0,
95
+ distortion_gain: float = 25,
96
+ chorus_rate: float = 1.0,
97
+ chorus_depth: float = 0.25,
98
+ chorus_center_delay: float = 7,
99
+ chorus_feedback: float = 0.0,
100
+ chorus_mix: float = 0.5,
101
+ bitcrush_bit_depth: int = 8,
102
+ clipping_threshold: float = -6,
103
+ compressor_threshold: float = 0,
104
+ compressor_ratio: float = 1,
105
+ compressor_attack: float = 1.0,
106
+ compressor_release: float = 100,
107
+ delay_seconds: float = 0.5,
108
+ delay_feedback: float = 0.0,
109
+ delay_mix: float = 0.5,
110
+ sid: int = 0,
111
+ ):
112
+ kwargs = {
113
+ "audio_input_path": input_path,
114
+ "audio_output_path": output_path,
115
+ "model_path": pth_path,
116
+ "index_path": index_path,
117
+ "pitch": pitch,
118
+ "filter_radius": filter_radius,
119
+ "index_rate": index_rate,
120
+ "volume_envelope": volume_envelope,
121
+ "protect": protect,
122
+ "hop_length": hop_length,
123
+ "f0_method": f0_method,
124
+ "pth_path": pth_path,
125
+ "index_path": index_path,
126
+ "split_audio": split_audio,
127
+ "f0_autotune": f0_autotune,
128
+ "f0_autotune_strength": f0_autotune_strength,
129
+ "clean_audio": clean_audio,
130
+ "clean_strength": clean_strength,
131
+ "export_format": export_format,
132
+ "f0_file": f0_file,
133
+ "embedder_model": embedder_model,
134
+ "embedder_model_custom": embedder_model_custom,
135
+ "post_process": post_process,
136
+ "formant_shifting": formant_shifting,
137
+ "formant_qfrency": formant_qfrency,
138
+ "formant_timbre": formant_timbre,
139
+ "reverb": reverb,
140
+ "pitch_shift": pitch_shift,
141
+ "limiter": limiter,
142
+ "gain": gain,
143
+ "distortion": distortion,
144
+ "chorus": chorus,
145
+ "bitcrush": bitcrush,
146
+ "clipping": clipping,
147
+ "compressor": compressor,
148
+ "delay": delay,
149
+ "reverb_room_size": reverb_room_size,
150
+ "reverb_damping": reverb_damping,
151
+ "reverb_wet_level": reverb_wet_gain,
152
+ "reverb_dry_level": reverb_dry_gain,
153
+ "reverb_width": reverb_width,
154
+ "reverb_freeze_mode": reverb_freeze_mode,
155
+ "pitch_shift_semitones": pitch_shift_semitones,
156
+ "limiter_threshold": limiter_threshold,
157
+ "limiter_release": limiter_release_time,
158
+ "gain_db": gain_db,
159
+ "distortion_gain": distortion_gain,
160
+ "chorus_rate": chorus_rate,
161
+ "chorus_depth": chorus_depth,
162
+ "chorus_delay": chorus_center_delay,
163
+ "chorus_feedback": chorus_feedback,
164
+ "chorus_mix": chorus_mix,
165
+ "bitcrush_bit_depth": bitcrush_bit_depth,
166
+ "clipping_threshold": clipping_threshold,
167
+ "compressor_threshold": compressor_threshold,
168
+ "compressor_ratio": compressor_ratio,
169
+ "compressor_attack": compressor_attack,
170
+ "compressor_release": compressor_release,
171
+ "delay_seconds": delay_seconds,
172
+ "delay_feedback": delay_feedback,
173
+ "delay_mix": delay_mix,
174
+ "sid": sid,
175
+ }
176
+ infer_pipeline = import_voice_converter()
177
+ infer_pipeline.convert_audio(
178
+ **kwargs,
179
+ )
180
+ return f"File {input_path} inferred successfully.", output_path.replace(
181
+ ".wav", f".{export_format.lower()}"
182
+ )
183
+
184
+
185
+ # Batch infer
186
+ def run_batch_infer_script(
187
+ pitch: int,
188
+ filter_radius: int,
189
+ index_rate: float,
190
+ volume_envelope: int,
191
+ protect: float,
192
+ hop_length: int,
193
+ f0_method: str,
194
+ input_folder: str,
195
+ output_folder: str,
196
+ pth_path: str,
197
+ index_path: str,
198
+ split_audio: bool,
199
+ f0_autotune: bool,
200
+ f0_autotune_strength: float,
201
+ clean_audio: bool,
202
+ clean_strength: float,
203
+ export_format: str,
204
+ f0_file: str,
205
+ embedder_model: str,
206
+ embedder_model_custom: str = None,
207
+ formant_shifting: bool = False,
208
+ formant_qfrency: float = 1.0,
209
+ formant_timbre: float = 1.0,
210
+ post_process: bool = False,
211
+ reverb: bool = False,
212
+ pitch_shift: bool = False,
213
+ limiter: bool = False,
214
+ gain: bool = False,
215
+ distortion: bool = False,
216
+ chorus: bool = False,
217
+ bitcrush: bool = False,
218
+ clipping: bool = False,
219
+ compressor: bool = False,
220
+ delay: bool = False,
221
+ reverb_room_size: float = 0.5,
222
+ reverb_damping: float = 0.5,
223
+ reverb_wet_gain: float = 0.5,
224
+ reverb_dry_gain: float = 0.5,
225
+ reverb_width: float = 0.5,
226
+ reverb_freeze_mode: float = 0.5,
227
+ pitch_shift_semitones: float = 0.0,
228
+ limiter_threshold: float = -6,
229
+ limiter_release_time: float = 0.01,
230
+ gain_db: float = 0.0,
231
+ distortion_gain: float = 25,
232
+ chorus_rate: float = 1.0,
233
+ chorus_depth: float = 0.25,
234
+ chorus_center_delay: float = 7,
235
+ chorus_feedback: float = 0.0,
236
+ chorus_mix: float = 0.5,
237
+ bitcrush_bit_depth: int = 8,
238
+ clipping_threshold: float = -6,
239
+ compressor_threshold: float = 0,
240
+ compressor_ratio: float = 1,
241
+ compressor_attack: float = 1.0,
242
+ compressor_release: float = 100,
243
+ delay_seconds: float = 0.5,
244
+ delay_feedback: float = 0.0,
245
+ delay_mix: float = 0.5,
246
+ sid: int = 0,
247
+ ):
248
+ kwargs = {
249
+ "audio_input_paths": input_folder,
250
+ "audio_output_path": output_folder,
251
+ "model_path": pth_path,
252
+ "index_path": index_path,
253
+ "pitch": pitch,
254
+ "filter_radius": filter_radius,
255
+ "index_rate": index_rate,
256
+ "volume_envelope": volume_envelope,
257
+ "protect": protect,
258
+ "hop_length": hop_length,
259
+ "f0_method": f0_method,
260
+ "pth_path": pth_path,
261
+ "index_path": index_path,
262
+ "split_audio": split_audio,
263
+ "f0_autotune": f0_autotune,
264
+ "f0_autotune_strength": f0_autotune_strength,
265
+ "clean_audio": clean_audio,
266
+ "clean_strength": clean_strength,
267
+ "export_format": export_format,
268
+ "f0_file": f0_file,
269
+ "embedder_model": embedder_model,
270
+ "embedder_model_custom": embedder_model_custom,
271
+ "post_process": post_process,
272
+ "formant_shifting": formant_shifting,
273
+ "formant_qfrency": formant_qfrency,
274
+ "formant_timbre": formant_timbre,
275
+ "reverb": reverb,
276
+ "pitch_shift": pitch_shift,
277
+ "limiter": limiter,
278
+ "gain": gain,
279
+ "distortion": distortion,
280
+ "chorus": chorus,
281
+ "bitcrush": bitcrush,
282
+ "clipping": clipping,
283
+ "compressor": compressor,
284
+ "delay": delay,
285
+ "reverb_room_size": reverb_room_size,
286
+ "reverb_damping": reverb_damping,
287
+ "reverb_wet_level": reverb_wet_gain,
288
+ "reverb_dry_level": reverb_dry_gain,
289
+ "reverb_width": reverb_width,
290
+ "reverb_freeze_mode": reverb_freeze_mode,
291
+ "pitch_shift_semitones": pitch_shift_semitones,
292
+ "limiter_threshold": limiter_threshold,
293
+ "limiter_release": limiter_release_time,
294
+ "gain_db": gain_db,
295
+ "distortion_gain": distortion_gain,
296
+ "chorus_rate": chorus_rate,
297
+ "chorus_depth": chorus_depth,
298
+ "chorus_delay": chorus_center_delay,
299
+ "chorus_feedback": chorus_feedback,
300
+ "chorus_mix": chorus_mix,
301
+ "bitcrush_bit_depth": bitcrush_bit_depth,
302
+ "clipping_threshold": clipping_threshold,
303
+ "compressor_threshold": compressor_threshold,
304
+ "compressor_ratio": compressor_ratio,
305
+ "compressor_attack": compressor_attack,
306
+ "compressor_release": compressor_release,
307
+ "delay_seconds": delay_seconds,
308
+ "delay_feedback": delay_feedback,
309
+ "delay_mix": delay_mix,
310
+ "sid": sid,
311
+ }
312
+ infer_pipeline = import_voice_converter()
313
+ infer_pipeline.convert_audio_batch(
314
+ **kwargs,
315
+ )
316
+
317
+ return f"Files from {input_folder} inferred successfully."
318
+
319
+
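+ # Usage sketch: run_batch_infer_script backs the "batch_infer" subcommand. A minimal
+ # invocation (folder and model paths are placeholders) might be:
+ #   python scrpt.py batch_infer --input_folder audios/in --output_folder audios/out \
+ #       --pth_path logs/model/model.pth --index_path logs/model/model.index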
320
+ # TTS
321
+ def run_tts_script(
322
+ tts_file: str,
323
+ tts_text: str,
324
+ tts_voice: str,
325
+ tts_rate: int,
326
+ pitch: int,
327
+ filter_radius: int,
328
+ index_rate: float,
329
+ volume_envelope: int,
330
+ protect: float,
331
+ hop_length: int,
332
+ f0_method: str,
333
+ output_tts_path: str,
334
+ output_rvc_path: str,
335
+ pth_path: str,
336
+ index_path: str,
337
+ split_audio: bool,
338
+ f0_autotune: bool,
339
+ f0_autotune_strength: float,
340
+ clean_audio: bool,
341
+ clean_strength: float,
342
+ export_format: str,
343
+ f0_file: str,
344
+ embedder_model: str,
345
+ embedder_model_custom: str = None,
346
+ sid: int = 0,
347
+ ):
348
+
349
+ tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
350
+
351
+ if os.path.exists(output_tts_path):
352
+ os.remove(output_tts_path)
353
+
354
+ command_tts = [
355
+ *map(
356
+ str,
357
+ [
358
+ python,
359
+ tts_script_path,
360
+ tts_file,
361
+ tts_text,
362
+ tts_voice,
363
+ tts_rate,
364
+ output_tts_path,
365
+ ],
366
+ ),
367
+ ]
368
+ subprocess.run(command_tts)
369
+ infer_pipeline = import_voice_converter()
370
+ infer_pipeline.convert_audio(
371
+ pitch=pitch,
372
+ filter_radius=filter_radius,
373
+ index_rate=index_rate,
374
+ volume_envelope=volume_envelope,
375
+ protect=protect,
376
+ hop_length=hop_length,
377
+ f0_method=f0_method,
378
+ audio_input_path=output_tts_path,
379
+ audio_output_path=output_rvc_path,
380
+ model_path=pth_path,
381
+ index_path=index_path,
382
+ split_audio=split_audio,
383
+ f0_autotune=f0_autotune,
384
+ f0_autotune_strength=f0_autotune_strength,
385
+ clean_audio=clean_audio,
386
+ clean_strength=clean_strength,
387
+ export_format=export_format,
388
+ f0_file=f0_file,
389
+ embedder_model=embedder_model,
390
+ embedder_model_custom=embedder_model_custom,
391
+ sid=sid,
392
+ formant_shifting=None,
393
+ formant_qfrency=None,
394
+ formant_timbre=None,
395
+ post_process=None,
396
+ reverb=None,
397
+ pitch_shift=None,
398
+ limiter=None,
399
+ gain=None,
400
+ distortion=None,
401
+ chorus=None,
402
+ bitcrush=None,
403
+ clipping=None,
404
+ compressor=None,
405
+ delay=None,
406
+ sliders=None,
407
+ )
408
+
409
+ return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
410
+ ".wav", f".{export_format.lower()}"
411
+ )
412
+
413
+
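+ # Usage sketch: run_tts_script backs the "tts" subcommand. A minimal invocation might look
+ # like the following (the voice name is an example and must exist in
+ # rvc/lib/tools/tts_voices.json; the other paths are placeholders):
+ #   python scrpt.py tts --tts_file text.txt --tts_text "Hello world" \
+ #       --tts_voice en-US-AriaNeural --output_tts_path tts.wav --output_rvc_path out.wav \
+ #       --pth_path logs/model/model.pth --index_path logs/model/model.index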
414
+ # Model information
415
+ def run_model_information_script(pth_path: str):
416
+ model_info = model_information(pth_path)
+ print(model_info)
+ return model_info
418
+
419
+
420
+ # Model blender
421
+ def run_model_blender_script(
422
+ model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
423
+ ):
424
+ message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
425
+ return message, model_blended
426
+
427
+
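+ # Usage sketch: run_model_blender_script backs the "model_blender" subcommand; the model
+ # names and paths below are placeholders:
+ #   python scrpt.py model_blender --model_name fused_model \
+ #       --pth_path_1 logs/a/a.pth --pth_path_2 logs/b/b.pth --ratio 0.5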
428
+ # Tensorboard
429
+ def run_tensorboard_script():
430
+ launch_tensorboard_pipeline()
431
+
432
+
433
+ # Download
434
+ def run_download_script(model_link: str):
435
+ model_download_pipeline(model_link)
436
+ return f"Model downloaded successfully."
437
+
438
+
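+ # Usage sketch: run_download_script backs the "download" subcommand; the link below is a
+ # placeholder for a direct model URL:
+ #   python scrpt.py download --model_link "https://example.com/model.zip"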
439
+ # Audio analyzer
440
+ def run_audio_analyzer_script(
441
+ input_path: str, save_plot_path: str = "logs/audio_analysis.png"
442
+ ):
443
+ audio_info, plot_path = analyze_audio(input_path, save_plot_path)
444
+ print(
445
+ f"Audio info of {input_path}: {audio_info}",
446
+ f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
447
+ )
448
+ return audio_info, plot_path
449
+
450
+
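+ # Usage sketch: run_audio_analyzer_script backs the "audio_analyzer" subcommand, e.g.
+ # (the input path is a placeholder):
+ #   python scrpt.py audio_analyzer --input_path input.wav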
451
+ # Parse arguments
452
+ def parse_arguments():
453
+ parser = argparse.ArgumentParser(
454
+ description="Run the main.py script with specific parameters."
455
+ )
456
+ subparsers = parser.add_subparsers(
457
+ title="subcommands", dest="mode", help="Choose a mode"
458
+ )
459
+
460
+ # Parser for 'infer' mode
461
+ infer_parser = subparsers.add_parser("infer", help="Run inference")
462
+ pitch_description = (
463
+ "Set the pitch of the audio. Higher values result in a higher pitch."
464
+ )
465
+ infer_parser.add_argument(
466
+ "--pitch",
467
+ type=int,
468
+ help=pitch_description,
469
+ choices=range(-24, 25),
470
+ default=0,
471
+ )
472
+ filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio."
473
+ infer_parser.add_argument(
474
+ "--filter_radius",
475
+ type=int,
476
+ help=filter_radius_description,
477
+ choices=range(11),
478
+ default=3,
479
+ )
480
+ index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
481
+ infer_parser.add_argument(
482
+ "--index_rate",
483
+ type=float,
484
+ help=index_rate_description,
485
+ choices=[i / 100.0 for i in range(0, 101)],
486
+ default=0.3,
487
+ )
488
+ volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
489
+ infer_parser.add_argument(
490
+ "--volume_envelope",
491
+ type=float,
492
+ help=volume_envelope_description,
493
+ choices=[i / 100.0 for i in range(0, 101)],
494
+ default=1,
495
+ )
496
+ protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
497
+ infer_parser.add_argument(
498
+ "--protect",
499
+ type=float,
500
+ help=protect_description,
501
+ choices=[i / 1000.0 for i in range(0, 501)],
502
+ default=0.33,
503
+ )
504
+ hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
505
+ infer_parser.add_argument(
506
+ "--hop_length",
507
+ type=int,
508
+ help=hop_length_description,
509
+ choices=range(1, 513),
510
+ default=128,
511
+ )
512
+ f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
513
+ infer_parser.add_argument(
514
+ "--f0_method",
515
+ type=str,
516
+ help=f0_method_description,
517
+ choices=[
518
+ "crepe",
519
+ "crepe-tiny",
520
+ "rmvpe",
521
+ "fcpe",
522
+ "hybrid[crepe+rmvpe]",
523
+ "hybrid[crepe+fcpe]",
524
+ "hybrid[rmvpe+fcpe]",
525
+ "hybrid[crepe+rmvpe+fcpe]",
526
+ ],
527
+ default="rmvpe",
528
+ )
529
+ infer_parser.add_argument(
530
+ "--input_path",
531
+ type=str,
532
+ help="Full path to the input audio file.",
533
+ required=True,
534
+ )
535
+ infer_parser.add_argument(
536
+ "--output_path",
537
+ type=str,
538
+ help="Full path to the output audio file.",
539
+ required=True,
540
+ )
541
+ pth_path_description = "Full path to the RVC model file (.pth)."
542
+ infer_parser.add_argument(
543
+ "--pth_path", type=str, help=pth_path_description, required=True
544
+ )
545
+ index_path_description = "Full path to the index file (.index)."
546
+ infer_parser.add_argument(
547
+ "--index_path", type=str, help=index_path_description, required=True
548
+ )
549
+ split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
550
+ infer_parser.add_argument(
551
+ "--split_audio",
552
+ type=lambda x: bool(strtobool(x)),
553
+ choices=[True, False],
554
+ help=split_audio_description,
555
+ default=False,
556
+ )
557
+ f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
558
+ infer_parser.add_argument(
559
+ "--f0_autotune",
560
+ type=lambda x: bool(strtobool(x)),
561
+ choices=[True, False],
562
+ help=f0_autotune_description,
563
+ default=False,
564
+ )
565
+ f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
566
+ infer_parser.add_argument(
567
+ "--f0_autotune_strength",
568
+ type=float,
569
+ help=f0_autotune_strength_description,
570
+ choices=[(i / 10) for i in range(11)],
571
+ default=1.0,
572
+ )
573
+ clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
574
+ infer_parser.add_argument(
575
+ "--clean_audio",
576
+ type=lambda x: bool(strtobool(x)),
577
+ choices=[True, False],
578
+ help=clean_audio_description,
579
+ default=False,
580
+ )
581
+ clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
582
+ infer_parser.add_argument(
583
+ "--clean_strength",
584
+ type=float,
585
+ help=clean_strength_description,
586
+ choices=[(i / 10) for i in range(11)],
587
+ default=0.7,
588
+ )
589
+ export_format_description = "Select the desired output audio format."
590
+ infer_parser.add_argument(
591
+ "--export_format",
592
+ type=str,
593
+ help=export_format_description,
594
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
595
+ default="WAV",
596
+ )
597
+ embedder_model_description = (
598
+ "Choose the model used for generating speaker embeddings."
599
+ )
600
+ infer_parser.add_argument(
601
+ "--embedder_model",
602
+ type=str,
603
+ help=embedder_model_description,
604
+ choices=[
605
+ "contentvec",
606
+ "chinese-hubert-base",
607
+ "japanese-hubert-base",
608
+ "korean-hubert-base",
609
+ "custom",
610
+ ],
611
+ default="contentvec",
612
+ )
613
+ embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
614
+ infer_parser.add_argument(
615
+ "--embedder_model_custom",
616
+ type=str,
617
+ help=embedder_model_custom_description,
618
+ default=None,
619
+ )
620
+ f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
621
+ infer_parser.add_argument(
622
+ "--f0_file",
623
+ type=str,
624
+ help=f0_file_description,
625
+ default=None,
626
+ )
627
+ formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
628
+ infer_parser.add_argument(
629
+ "--formant_shifting",
630
+ type=lambda x: bool(strtobool(x)),
631
+ choices=[True, False],
632
+ help=formant_shifting_description,
633
+ default=False,
634
+ required=False,
635
+ )
636
+ formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
637
+ infer_parser.add_argument(
638
+ "--formant_qfrency",
639
+ type=float,
640
+ help=formant_qfrency_description,
641
+ default=1.0,
642
+ required=False,
643
+ )
644
+ formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
645
+ infer_parser.add_argument(
646
+ "--formant_timbre",
647
+ type=float,
648
+ help=formant_timbre_description,
649
+ default=1.0,
650
+ required=False,
651
+ )
652
+ sid_description = "Speaker ID for multi-speaker models."
653
+ infer_parser.add_argument(
654
+ "--sid",
655
+ type=int,
656
+ help=sid_description,
657
+ default=0,
658
+ required=False,
659
+ )
660
+ post_process_description = "Apply post-processing effects to the output audio."
661
+ infer_parser.add_argument(
662
+ "--post_process",
663
+ type=lambda x: bool(strtobool(x)),
664
+ choices=[True, False],
665
+ help=post_process_description,
666
+ default=False,
667
+ required=False,
668
+ )
669
+ reverb_description = "Apply reverb effect to the output audio."
670
+ infer_parser.add_argument(
671
+ "--reverb",
672
+ type=lambda x: bool(strtobool(x)),
673
+ choices=[True, False],
674
+ help=reverb_description,
675
+ default=False,
676
+ required=False,
677
+ )
678
+
679
+ pitch_shift_description = "Apply pitch shifting effect to the output audio."
680
+ infer_parser.add_argument(
681
+ "--pitch_shift",
682
+ type=lambda x: bool(strtobool(x)),
683
+ choices=[True, False],
684
+ help=pitch_shift_description,
685
+ default=False,
686
+ required=False,
687
+ )
688
+
689
+ limiter_description = "Apply limiter effect to the output audio."
690
+ infer_parser.add_argument(
691
+ "--limiter",
692
+ type=lambda x: bool(strtobool(x)),
693
+ choices=[True, False],
694
+ help=limiter_description,
695
+ default=False,
696
+ required=False,
697
+ )
698
+
699
+ gain_description = "Apply gain effect to the output audio."
700
+ infer_parser.add_argument(
701
+ "--gain",
702
+ type=lambda x: bool(strtobool(x)),
703
+ choices=[True, False],
704
+ help=gain_description,
705
+ default=False,
706
+ required=False,
707
+ )
708
+
709
+ distortion_description = "Apply distortion effect to the output audio."
710
+ infer_parser.add_argument(
711
+ "--distortion",
712
+ type=lambda x: bool(strtobool(x)),
713
+ choices=[True, False],
714
+ help=distortion_description,
715
+ default=False,
716
+ required=False,
717
+ )
718
+
719
+ chorus_description = "Apply chorus effect to the output audio."
720
+ infer_parser.add_argument(
721
+ "--chorus",
722
+ type=lambda x: bool(strtobool(x)),
723
+ choices=[True, False],
724
+ help=chorus_description,
725
+ default=False,
726
+ required=False,
727
+ )
728
+
729
+ bitcrush_description = "Apply bitcrush effect to the output audio."
730
+ infer_parser.add_argument(
731
+ "--bitcrush",
732
+ type=lambda x: bool(strtobool(x)),
733
+ choices=[True, False],
734
+ help=bitcrush_description,
735
+ default=False,
736
+ required=False,
737
+ )
738
+
739
+ clipping_description = "Apply clipping effect to the output audio."
740
+ infer_parser.add_argument(
741
+ "--clipping",
742
+ type=lambda x: bool(strtobool(x)),
743
+ choices=[True, False],
744
+ help=clipping_description,
745
+ default=False,
746
+ required=False,
747
+ )
748
+
749
+ compressor_description = "Apply compressor effect to the output audio."
750
+ infer_parser.add_argument(
751
+ "--compressor",
752
+ type=lambda x: bool(strtobool(x)),
753
+ choices=[True, False],
754
+ help=compressor_description,
755
+ default=False,
756
+ required=False,
757
+ )
758
+
759
+ delay_description = "Apply delay effect to the output audio."
760
+ infer_parser.add_argument(
761
+ "--delay",
762
+ type=lambda x: bool(strtobool(x)),
763
+ choices=[True, False],
764
+ help=delay_description,
765
+ default=False,
766
+ required=False,
767
+ )
768
+
769
+ reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
770
+ infer_parser.add_argument(
771
+ "--reverb_room_size",
772
+ type=float,
773
+ help=reverb_room_size_description,
774
+ default=0.5,
775
+ required=False,
776
+ )
777
+
778
+ reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
779
+ infer_parser.add_argument(
780
+ "--reverb_damping",
781
+ type=float,
782
+ help=reverb_damping_description,
783
+ default=0.5,
784
+ required=False,
785
+ )
786
+
787
+ reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
788
+ infer_parser.add_argument(
789
+ "--reverb_wet_gain",
790
+ type=float,
791
+ help=reverb_wet_gain_description,
792
+ default=0.5,
793
+ required=False,
794
+ )
795
+
796
+ reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
797
+ infer_parser.add_argument(
798
+ "--reverb_dry_gain",
799
+ type=float,
800
+ help=reverb_dry_gain_description,
801
+ default=0.5,
802
+ required=False,
803
+ )
804
+
805
+ reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
806
+ infer_parser.add_argument(
807
+ "--reverb_width",
808
+ type=float,
809
+ help=reverb_width_description,
810
+ default=0.5,
811
+ required=False,
812
+ )
813
+
814
+ reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
815
+ infer_parser.add_argument(
816
+ "--reverb_freeze_mode",
817
+ type=float,
818
+ help=reverb_freeze_mode_description,
819
+ default=0.5,
820
+ required=False,
821
+ )
822
+
823
+ pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
824
+ infer_parser.add_argument(
825
+ "--pitch_shift_semitones",
826
+ type=float,
827
+ help=pitch_shift_semitones_description,
828
+ default=0.0,
829
+ required=False,
830
+ )
831
+
832
+ limiter_threshold_description = "Control the threshold of the limiter effect in dB. Lower (more negative) values result in stronger limiting."
833
+ infer_parser.add_argument(
834
+ "--limiter_threshold",
835
+ type=float,
836
+ help=limiter_threshold_description,
837
+ default=-6,
838
+ required=False,
839
+ )
840
+
841
+ limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
842
+ infer_parser.add_argument(
843
+ "--limiter_release_time",
844
+ type=float,
845
+ help=limiter_release_time_description,
846
+ default=0.01,
847
+ required=False,
848
+ )
849
+
850
+ gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
851
+ infer_parser.add_argument(
852
+ "--gain_db",
853
+ type=float,
854
+ help=gain_db_description,
855
+ default=0.0,
856
+ required=False,
857
+ )
858
+
859
+ distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
860
+ infer_parser.add_argument(
861
+ "--distortion_gain",
862
+ type=float,
863
+ help=distortion_gain_description,
864
+ default=25,
865
+ required=False,
866
+ )
867
+
868
+ chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
869
+ infer_parser.add_argument(
870
+ "--chorus_rate",
871
+ type=float,
872
+ help=chorus_rate_description,
873
+ default=1.0,
874
+ required=False,
875
+ )
876
+
877
+ chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
878
+ infer_parser.add_argument(
879
+ "--chorus_depth",
880
+ type=float,
881
+ help=chorus_depth_description,
882
+ default=0.25,
883
+ required=False,
884
+ )
885
+
886
+ chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
887
+ infer_parser.add_argument(
888
+ "--chorus_center_delay",
889
+ type=float,
890
+ help=chorus_center_delay_description,
891
+ default=7,
892
+ required=False,
893
+ )
894
+
895
+ chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
896
+ infer_parser.add_argument(
897
+ "--chorus_feedback",
898
+ type=float,
899
+ help=chorus_feedback_description,
900
+ default=0.0,
901
+ required=False,
902
+ )
903
+
904
+ chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
905
+ infer_parser.add_argument(
906
+ "--chorus_mix",
907
+ type=float,
908
+ help=chorus_mix_description,
909
+ default=0.5,
910
+ required=False,
911
+ )
912
+
913
+ bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Lower values result in a more pronounced bitcrush effect."
914
+ infer_parser.add_argument(
915
+ "--bitcrush_bit_depth",
916
+ type=int,
917
+ help=bitcrush_bit_depth_description,
918
+ default=8,
919
+ required=False,
920
+ )
921
+
922
+ clipping_threshold_description = "Control the threshold of the clipping effect in dB. Lower (more negative) values result in stronger clipping."
923
+ infer_parser.add_argument(
924
+ "--clipping_threshold",
925
+ type=float,
926
+ help=clipping_threshold_description,
927
+ default=-6,
928
+ required=False,
929
+ )
930
+
931
+ compressor_threshold_description = "Control the threshold of the compressor effect in dB. Lower values result in more compression."
932
+ infer_parser.add_argument(
933
+ "--compressor_threshold",
934
+ type=float,
935
+ help=compressor_threshold_description,
936
+ default=0,
937
+ required=False,
938
+ )
939
+
940
+ compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
941
+ infer_parser.add_argument(
942
+ "--compressor_ratio",
943
+ type=float,
944
+ help=compressor_ratio_description,
945
+ default=1,
946
+ required=False,
947
+ )
948
+
949
+ compressor_attack_description = "Control the attack time of the compressor effect. Lower values make the compressor respond more quickly."
950
+ infer_parser.add_argument(
951
+ "--compressor_attack",
952
+ type=float,
953
+ help=compressor_attack_description,
954
+ default=1.0,
955
+ required=False,
956
+ )
957
+
958
+ compressor_release_description = "Control the release time of the compressor effect. Higher values result in a longer release time."
959
+ infer_parser.add_argument(
960
+ "--compressor_release",
961
+ type=float,
962
+ help=compressor_release_description,
963
+ default=100,
964
+ required=False,
965
+ )
966
+
967
+ delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
968
+ infer_parser.add_argument(
969
+ "--delay_seconds",
970
+ type=float,
971
+ help=delay_seconds_description,
972
+ default=0.5,
973
+ required=False,
974
+ )
975
+ delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
976
+ infer_parser.add_argument(
977
+ "--delay_feedback",
978
+ type=float,
979
+ help=delay_feedback_description,
980
+ default=0.0,
981
+ required=False,
982
+ )
983
+ delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
984
+ infer_parser.add_argument(
985
+ "--delay_mix",
986
+ type=float,
987
+ help=delay_mix_description,
988
+ default=0.5,
989
+ required=False,
990
+ )
991
+
992
+ # Parser for 'batch_infer' mode
993
+ batch_infer_parser = subparsers.add_parser(
994
+ "batch_infer",
995
+ help="Run batch inference",
996
+ )
997
+ batch_infer_parser.add_argument(
998
+ "--pitch",
999
+ type=int,
1000
+ help=pitch_description,
1001
+ choices=range(-24, 25),
1002
+ default=0,
1003
+ )
1004
+ batch_infer_parser.add_argument(
1005
+ "--filter_radius",
1006
+ type=int,
1007
+ help=filter_radius_description,
1008
+ choices=range(11),
1009
+ default=3,
1010
+ )
1011
+ batch_infer_parser.add_argument(
1012
+ "--index_rate",
1013
+ type=float,
1014
+ help=index_rate_description,
1015
+ choices=[i / 100.0 for i in range(0, 101)],
1016
+ default=0.3,
1017
+ )
1018
+ batch_infer_parser.add_argument(
1019
+ "--volume_envelope",
1020
+ type=float,
1021
+ help=volume_envelope_description,
1022
+ choices=[i / 100.0 for i in range(0, 101)],
1023
+ default=1,
1024
+ )
1025
+ batch_infer_parser.add_argument(
1026
+ "--protect",
1027
+ type=float,
1028
+ help=protect_description,
1029
+ choices=[i / 1000.0 for i in range(0, 501)],
1030
+ default=0.33,
1031
+ )
1032
+ batch_infer_parser.add_argument(
1033
+ "--hop_length",
1034
+ type=int,
1035
+ help=hop_length_description,
1036
+ choices=range(1, 513),
1037
+ default=128,
1038
+ )
1039
+ batch_infer_parser.add_argument(
1040
+ "--f0_method",
1041
+ type=str,
1042
+ help=f0_method_description,
1043
+ choices=[
1044
+ "crepe",
1045
+ "crepe-tiny",
1046
+ "rmvpe",
1047
+ "fcpe",
1048
+ "hybrid[crepe+rmvpe]",
1049
+ "hybrid[crepe+fcpe]",
1050
+ "hybrid[rmvpe+fcpe]",
1051
+ "hybrid[crepe+rmvpe+fcpe]",
1052
+ ],
1053
+ default="rmvpe",
1054
+ )
1055
+ batch_infer_parser.add_argument(
1056
+ "--input_folder",
1057
+ type=str,
1058
+ help="Path to the folder containing input audio files.",
1059
+ required=True,
1060
+ )
1061
+ batch_infer_parser.add_argument(
1062
+ "--output_folder",
1063
+ type=str,
1064
+ help="Path to the folder for saving output audio files.",
1065
+ required=True,
1066
+ )
1067
+ batch_infer_parser.add_argument(
1068
+ "--pth_path", type=str, help=pth_path_description, required=True
1069
+ )
1070
+ batch_infer_parser.add_argument(
1071
+ "--index_path", type=str, help=index_path_description, required=True
1072
+ )
1073
+ batch_infer_parser.add_argument(
1074
+ "--split_audio",
1075
+ type=lambda x: bool(strtobool(x)),
1076
+ choices=[True, False],
1077
+ help=split_audio_description,
1078
+ default=False,
1079
+ )
1080
+ batch_infer_parser.add_argument(
1081
+ "--f0_autotune",
1082
+ type=lambda x: bool(strtobool(x)),
1083
+ choices=[True, False],
1084
+ help=f0_autotune_description,
1085
+ default=False,
1086
+ )
1087
+ batch_infer_parser.add_argument(
1088
+ "--f0_autotune_strength",
1089
+ type=float,
1090
+ help=f0_autotune_strength_description,
1091
+ choices=[(i / 10) for i in range(11)],
1092
+ default=1.0,
1093
+ )
1094
+ batch_infer_parser.add_argument(
1095
+ "--clean_audio",
1096
+ type=lambda x: bool(strtobool(x)),
1097
+ choices=[True, False],
1098
+ help=clean_audio_description,
1099
+ default=False,
1100
+ )
1101
+ batch_infer_parser.add_argument(
1102
+ "--clean_strength",
1103
+ type=float,
1104
+ help=clean_strength_description,
1105
+ choices=[(i / 10) for i in range(11)],
1106
+ default=0.7,
1107
+ )
1108
+ batch_infer_parser.add_argument(
1109
+ "--export_format",
1110
+ type=str,
1111
+ help=export_format_description,
1112
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
1113
+ default="WAV",
1114
+ )
1115
+ batch_infer_parser.add_argument(
1116
+ "--embedder_model",
1117
+ type=str,
1118
+ help=embedder_model_description,
1119
+ choices=[
1120
+ "contentvec",
1121
+ "chinese-hubert-base",
1122
+ "japanese-hubert-base",
1123
+ "korean-hubert-base",
1124
+ "custom",
1125
+ ],
1126
+ default="contentvec",
1127
+ )
1128
+ batch_infer_parser.add_argument(
1129
+ "--embedder_model_custom",
1130
+ type=str,
1131
+ help=embedder_model_custom_description,
1132
+ default=None,
1133
+ )
1134
+ batch_infer_parser.add_argument(
1135
+ "--f0_file",
1136
+ type=str,
1137
+ help=f0_file_description,
1138
+ default=None,
1139
+ )
1140
+ batch_infer_parser.add_argument(
1141
+ "--formant_shifting",
1142
+ type=lambda x: bool(strtobool(x)),
1143
+ choices=[True, False],
1144
+ help=formant_shifting_description,
1145
+ default=False,
1146
+ required=False,
1147
+ )
1148
+ batch_infer_parser.add_argument(
1149
+ "--formant_qfrency",
1150
+ type=float,
1151
+ help=formant_qfrency_description,
1152
+ default=1.0,
1153
+ required=False,
1154
+ )
1155
+ batch_infer_parser.add_argument(
1156
+ "--formant_timbre",
1157
+ type=float,
1158
+ help=formant_timbre_description,
1159
+ default=1.0,
1160
+ required=False,
1161
+ )
1162
+ batch_infer_parser.add_argument(
1163
+ "--sid",
1164
+ type=int,
1165
+ help=sid_description,
1166
+ default=0,
1167
+ required=False,
1168
+ )
1169
+ batch_infer_parser.add_argument(
1170
+ "--post_process",
1171
+ type=lambda x: bool(strtobool(x)),
1172
+ choices=[True, False],
1173
+ help=post_process_description,
1174
+ default=False,
1175
+ required=False,
1176
+ )
1177
+ batch_infer_parser.add_argument(
1178
+ "--reverb",
1179
+ type=lambda x: bool(strtobool(x)),
1180
+ choices=[True, False],
1181
+ help=reverb_description,
1182
+ default=False,
1183
+ required=False,
1184
+ )
1185
+
1186
+ batch_infer_parser.add_argument(
1187
+ "--pitch_shift",
1188
+ type=lambda x: bool(strtobool(x)),
1189
+ choices=[True, False],
1190
+ help=pitch_shift_description,
1191
+ default=False,
1192
+ required=False,
1193
+ )
1194
+
1195
+ batch_infer_parser.add_argument(
1196
+ "--limiter",
1197
+ type=lambda x: bool(strtobool(x)),
1198
+ choices=[True, False],
1199
+ help=limiter_description,
1200
+ default=False,
1201
+ required=False,
1202
+ )
1203
+
1204
+ batch_infer_parser.add_argument(
1205
+ "--gain",
1206
+ type=lambda x: bool(strtobool(x)),
1207
+ choices=[True, False],
1208
+ help=gain_description,
1209
+ default=False,
1210
+ required=False,
1211
+ )
1212
+
1213
+ batch_infer_parser.add_argument(
1214
+ "--distortion",
1215
+ type=lambda x: bool(strtobool(x)),
1216
+ choices=[True, False],
1217
+ help=distortion_description,
1218
+ default=False,
1219
+ required=False,
1220
+ )
1221
+
1222
+ batch_infer_parser.add_argument(
1223
+ "--chorus",
1224
+ type=lambda x: bool(strtobool(x)),
1225
+ choices=[True, False],
1226
+ help=chorus_description,
1227
+ default=False,
1228
+ required=False,
1229
+ )
1230
+
1231
+ batch_infer_parser.add_argument(
1232
+ "--bitcrush",
1233
+ type=lambda x: bool(strtobool(x)),
1234
+ choices=[True, False],
1235
+ help=bitcrush_description,
1236
+ default=False,
1237
+ required=False,
1238
+ )
1239
+
1240
+ batch_infer_parser.add_argument(
1241
+ "--clipping",
1242
+ type=lambda x: bool(strtobool(x)),
1243
+ choices=[True, False],
1244
+ help=clipping_description,
1245
+ default=False,
1246
+ required=False,
1247
+ )
1248
+
1249
+ batch_infer_parser.add_argument(
1250
+ "--compressor",
1251
+ type=lambda x: bool(strtobool(x)),
1252
+ choices=[True, False],
1253
+ help=compressor_description,
1254
+ default=False,
1255
+ required=False,
1256
+ )
1257
+
1258
+ batch_infer_parser.add_argument(
1259
+ "--delay",
1260
+ type=lambda x: bool(strtobool(x)),
1261
+ choices=[True, False],
1262
+ help=delay_description,
1263
+ default=False,
1264
+ required=False,
1265
+ )
1266
+
1267
+ batch_infer_parser.add_argument(
1268
+ "--reverb_room_size",
1269
+ type=float,
1270
+ help=reverb_room_size_description,
1271
+ default=0.5,
1272
+ required=False,
1273
+ )
1274
+
1275
+ batch_infer_parser.add_argument(
1276
+ "--reverb_damping",
1277
+ type=float,
1278
+ help=reverb_damping_description,
1279
+ default=0.5,
1280
+ required=False,
1281
+ )
1282
+
1283
+ batch_infer_parser.add_argument(
1284
+ "--reverb_wet_gain",
1285
+ type=float,
1286
+ help=reverb_wet_gain_description,
1287
+ default=0.5,
1288
+ required=False,
1289
+ )
1290
+
1291
+ batch_infer_parser.add_argument(
1292
+ "--reverb_dry_gain",
1293
+ type=float,
1294
+ help=reverb_dry_gain_description,
1295
+ default=0.5,
1296
+ required=False,
1297
+ )
1298
+
1299
+ batch_infer_parser.add_argument(
1300
+ "--reverb_width",
1301
+ type=float,
1302
+ help=reverb_width_description,
1303
+ default=0.5,
1304
+ required=False,
1305
+ )
1306
+
1307
+ batch_infer_parser.add_argument(
1308
+ "--reverb_freeze_mode",
1309
+ type=float,
1310
+ help=reverb_freeze_mode_description,
1311
+ default=0.5,
1312
+ required=False,
1313
+ )
1314
+
1315
+ batch_infer_parser.add_argument(
1316
+ "--pitch_shift_semitones",
1317
+ type=float,
1318
+ help=pitch_shift_semitones_description,
1319
+ default=0.0,
1320
+ required=False,
1321
+ )
1322
+
1323
+ batch_infer_parser.add_argument(
1324
+ "--limiter_threshold",
1325
+ type=float,
1326
+ help=limiter_threshold_description,
1327
+ default=-6,
1328
+ required=False,
1329
+ )
1330
+
1331
+ batch_infer_parser.add_argument(
1332
+ "--limiter_release_time",
1333
+ type=float,
1334
+ help=limiter_release_time_description,
1335
+ default=0.01,
1336
+ required=False,
1337
+ )
1338
+ batch_infer_parser.add_argument(
1339
+ "--gain_db",
1340
+ type=float,
1341
+ help=gain_db_description,
1342
+ default=0.0,
1343
+ required=False,
1344
+ )
1345
+
1346
+ batch_infer_parser.add_argument(
1347
+ "--distortion_gain",
1348
+ type=float,
1349
+ help=distortion_gain_description,
1350
+ default=25,
1351
+ required=False,
1352
+ )
1353
+
1354
+ batch_infer_parser.add_argument(
1355
+ "--chorus_rate",
1356
+ type=float,
1357
+ help=chorus_rate_description,
1358
+ default=1.0,
1359
+ required=False,
1360
+ )
1361
+
1362
+ batch_infer_parser.add_argument(
1363
+ "--chorus_depth",
1364
+ type=float,
1365
+ help=chorus_depth_description,
1366
+ default=0.25,
1367
+ required=False,
1368
+ )
1369
+ batch_infer_parser.add_argument(
1370
+ "--chorus_center_delay",
1371
+ type=float,
1372
+ help=chorus_center_delay_description,
1373
+ default=7,
1374
+ required=False,
1375
+ )
1376
+
1377
+ batch_infer_parser.add_argument(
1378
+ "--chorus_feedback",
1379
+ type=float,
1380
+ help=chorus_feedback_description,
1381
+ default=0.0,
1382
+ required=False,
1383
+ )
1384
+
1385
+ batch_infer_parser.add_argument(
1386
+ "--chorus_mix",
1387
+ type=float,
1388
+ help=chorus_mix_description,
1389
+ default=0.5,
1390
+ required=False,
1391
+ )
1392
+
1393
+ batch_infer_parser.add_argument(
1394
+ "--bitcrush_bit_depth",
1395
+ type=int,
1396
+ help=bitcrush_bit_depth_description,
1397
+ default=8,
1398
+ required=False,
1399
+ )
1400
+
1401
+ batch_infer_parser.add_argument(
1402
+ "--clipping_threshold",
1403
+ type=float,
1404
+ help=clipping_threshold_description,
1405
+ default=-6,
1406
+ required=False,
1407
+ )
1408
+
1409
+ batch_infer_parser.add_argument(
1410
+ "--compressor_threshold",
1411
+ type=float,
1412
+ help=compressor_threshold_description,
1413
+ default=0,
1414
+ required=False,
1415
+ )
1416
+
1417
+ batch_infer_parser.add_argument(
1418
+ "--compressor_ratio",
1419
+ type=float,
1420
+ help=compressor_ratio_description,
1421
+ default=1,
1422
+ required=False,
1423
+ )
1424
+
1425
+ batch_infer_parser.add_argument(
1426
+ "--compressor_attack",
1427
+ type=float,
1428
+ help=compressor_attack_description,
1429
+ default=1.0,
1430
+ required=False,
1431
+ )
1432
+
1433
+ batch_infer_parser.add_argument(
1434
+ "--compressor_release",
1435
+ type=float,
1436
+ help=compressor_release_description,
1437
+ default=100,
1438
+ required=False,
1439
+ )
1440
+ batch_infer_parser.add_argument(
1441
+ "--delay_seconds",
1442
+ type=float,
1443
+ help=delay_seconds_description,
1444
+ default=0.5,
1445
+ required=False,
1446
+ )
1447
+ batch_infer_parser.add_argument(
1448
+ "--delay_feedback",
1449
+ type=float,
1450
+ help=delay_feedback_description,
1451
+ default=0.0,
1452
+ required=False,
1453
+ )
1454
+ batch_infer_parser.add_argument(
1455
+ "--delay_mix",
1456
+ type=float,
1457
+ help=delay_mix_description,
1458
+ default=0.5,
1459
+ required=False,
1460
+ )
1461
+
1462
+ # Parser for 'tts' mode
1463
+ tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
1464
+ tts_parser.add_argument(
1465
+ "--tts_file", type=str, help="File with a text to be synthesized", required=True
1466
+ )
1467
+ tts_parser.add_argument(
1468
+ "--tts_text", type=str, help="Text to be synthesized", required=True
1469
+ )
1470
+ tts_parser.add_argument(
1471
+ "--tts_voice",
1472
+ type=str,
1473
+ help="Voice to be used for TTS synthesis.",
1474
+ choices=locales,
1475
+ required=True,
1476
+ )
1477
+ tts_parser.add_argument(
1478
+ "--tts_rate",
1479
+ type=int,
1480
+ help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
1481
+ choices=range(-100, 101),
1482
+ default=0,
1483
+ )
1484
+ tts_parser.add_argument(
1485
+ "--pitch",
1486
+ type=int,
1487
+ help=pitch_description,
1488
+ choices=range(-24, 25),
1489
+ default=0,
1490
+ )
1491
+ tts_parser.add_argument(
1492
+ "--filter_radius",
1493
+ type=int,
1494
+ help=filter_radius_description,
1495
+ choices=range(11),
1496
+ default=3,
1497
+ )
1498
+ tts_parser.add_argument(
1499
+ "--index_rate",
1500
+ type=float,
1501
+ help=index_rate_description,
1502
+ choices=[(i / 10) for i in range(11)],
1503
+ default=0.3,
1504
+ )
1505
+ tts_parser.add_argument(
1506
+ "--volume_envelope",
1507
+ type=float,
1508
+ help=volume_envelope_description,
1509
+ choices=[(i / 10) for i in range(11)],
1510
+ default=1,
1511
+ )
1512
+ tts_parser.add_argument(
1513
+ "--protect",
1514
+ type=float,
1515
+ help=protect_description,
1516
+ choices=[i / 1000.0 for i in range(0, 501)],
1517
+ default=0.33,
1518
+ )
1519
+ tts_parser.add_argument(
1520
+ "--hop_length",
1521
+ type=int,
1522
+ help=hop_length_description,
1523
+ choices=range(1, 513),
1524
+ default=128,
1525
+ )
1526
+ tts_parser.add_argument(
1527
+ "--f0_method",
1528
+ type=str,
1529
+ help=f0_method_description,
1530
+ choices=[
1531
+ "crepe",
1532
+ "crepe-tiny",
1533
+ "rmvpe",
1534
+ "fcpe",
1535
+ "hybrid[crepe+rmvpe]",
1536
+ "hybrid[crepe+fcpe]",
1537
+ "hybrid[rmvpe+fcpe]",
1538
+ "hybrid[crepe+rmvpe+fcpe]",
1539
+ ],
1540
+ default="rmvpe",
1541
+ )
1542
+ tts_parser.add_argument(
1543
+ "--output_tts_path",
1544
+ type=str,
1545
+ help="Full path to save the synthesized TTS audio.",
1546
+ required=True,
1547
+ )
1548
+ tts_parser.add_argument(
1549
+ "--output_rvc_path",
1550
+ type=str,
1551
+ help="Full path to save the voice-converted audio using the synthesized TTS.",
1552
+ required=True,
1553
+ )
1554
+ tts_parser.add_argument(
1555
+ "--pth_path", type=str, help=pth_path_description, required=True
1556
+ )
1557
+ tts_parser.add_argument(
1558
+ "--index_path", type=str, help=index_path_description, required=True
1559
+ )
1560
+ tts_parser.add_argument(
1561
+ "--split_audio",
1562
+ type=lambda x: bool(strtobool(x)),
1563
+ choices=[True, False],
1564
+ help=split_audio_description,
1565
+ default=False,
1566
+ )
1567
+ tts_parser.add_argument(
1568
+ "--f0_autotune",
1569
+ type=lambda x: bool(strtobool(x)),
1570
+ choices=[True, False],
1571
+ help=f0_autotune_description,
1572
+ default=False,
1573
+ )
1574
+ tts_parser.add_argument(
1575
+ "--f0_autotune_strength",
1576
+ type=float,
1577
+ help=f0_autotune_strength_description,
1578
+ choices=[(i / 10) for i in range(11)],
1579
+ default=1.0,
1580
+ )
1581
+ tts_parser.add_argument(
1582
+ "--clean_audio",
1583
+ type=lambda x: bool(strtobool(x)),
1584
+ choices=[True, False],
1585
+ help=clean_audio_description,
1586
+ default=False,
1587
+ )
1588
+ tts_parser.add_argument(
1589
+ "--clean_strength",
1590
+ type=float,
1591
+ help=clean_strength_description,
1592
+ choices=[(i / 10) for i in range(11)],
1593
+ default=0.7,
1594
+ )
1595
+ tts_parser.add_argument(
1596
+ "--export_format",
1597
+ type=str,
1598
+ help=export_format_description,
1599
+ choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
1600
+ default="WAV",
1601
+ )
1602
+ tts_parser.add_argument(
1603
+ "--embedder_model",
1604
+ type=str,
1605
+ help=embedder_model_description,
1606
+ choices=[
1607
+ "contentvec",
1608
+ "chinese-hubert-base",
1609
+ "japanese-hubert-base",
1610
+ "korean-hubert-base",
1611
+ "custom",
1612
+ ],
1613
+ default="contentvec",
1614
+ )
1615
+ tts_parser.add_argument(
1616
+ "--embedder_model_custom",
1617
+ type=str,
1618
+ help=embedder_model_custom_description,
1619
+ default=None,
1620
+ )
1621
+ tts_parser.add_argument(
1622
+ "--f0_file",
1623
+ type=str,
1624
+ help=f0_file_description,
1625
+ default=None,
1626
+ )
1627
+
1628
+ # Parser for 'model_information' mode
1629
+ model_information_parser = subparsers.add_parser(
1630
+ "model_information", help="Display information about a trained model."
1631
+ )
1632
+ model_information_parser.add_argument(
1633
+ "--pth_path", type=str, help="Path to the .pth model file.", required=True
1634
+ )
1635
+
1636
+ # Parser for 'model_blender' mode
1637
+ model_blender_parser = subparsers.add_parser(
1638
+ "model_blender", help="Fuse two RVC models together."
1639
+ )
1640
+ model_blender_parser.add_argument(
1641
+ "--model_name", type=str, help="Name of the new fused model.", required=True
1642
+ )
1643
+ model_blender_parser.add_argument(
1644
+ "--pth_path_1",
1645
+ type=str,
1646
+ help="Path to the first .pth model file.",
1647
+ required=True,
1648
+ )
1649
+ model_blender_parser.add_argument(
1650
+ "--pth_path_2",
1651
+ type=str,
1652
+ help="Path to the second .pth model file.",
1653
+ required=True,
1654
+ )
1655
+ model_blender_parser.add_argument(
1656
+ "--ratio",
1657
+ type=float,
1658
+ help="Ratio for blending the two models (0.0 to 1.0).",
1659
+ choices=[(i / 10) for i in range(11)],
1660
+ default=0.5,
1661
+ )
1662
+
1663
+ # Parser for 'tensorboard' mode
1664
+ subparsers.add_parser(
1665
+ "tensorboard", help="Launch TensorBoard for monitoring training progress."
1666
+ )
1667
+
1668
+ # Parser for 'download' mode
1669
+ download_parser = subparsers.add_parser(
1670
+ "download", help="Download a model from a provided link."
1671
+ )
1672
+ download_parser.add_argument(
1673
+ "--model_link", type=str, help="Direct link to the model file.", required=True
1674
+ )
1675
+
1676
+ # Parser for 'prerequisites' mode
1677
+ prerequisites_parser = subparsers.add_parser(
1678
+ "prerequisites", help="Install prerequisites for RVC."
1679
+ )
1680
+ prerequisites_parser.add_argument(
1681
+ "--models",
1682
+ type=lambda x: bool(strtobool(x)),
1683
+ choices=[True, False],
1684
+ default=True,
1685
+ help="Download additional models.",
1686
+ )
1687
+ prerequisites_parser.add_argument(
1688
+ "--exe",
1689
+ type=lambda x: bool(strtobool(x)),
1690
+ choices=[True, False],
1691
+ default=True,
1692
+ help="Download required executables.",
1693
+ )
1694
+
1695
+ # Parser for 'audio_analyzer' mode
1696
+ audio_analyzer = subparsers.add_parser(
1697
+ "audio_analyzer", help="Analyze an audio file."
1698
+ )
1699
+ audio_analyzer.add_argument(
1700
+ "--input_path", type=str, help="Path to the input audio file.", required=True
1701
+ )
1702
+
1703
+ return parser.parse_args()
1704
+
1705
+
1706
+ def main():
1707
+ if len(sys.argv) == 1:
1708
+ print("Please run the script with '-h' for more information.")
1709
+ sys.exit(1)
1710
+
1711
+ args = parse_arguments()
1712
+
1713
+ try:
1714
+ if args.mode == "infer":
1715
+ run_infer_script(
1716
+ pitch=args.pitch,
1717
+ filter_radius=args.filter_radius,
1718
+ index_rate=args.index_rate,
1719
+ volume_envelope=args.volume_envelope,
1720
+ protect=args.protect,
1721
+ hop_length=args.hop_length,
1722
+ f0_method=args.f0_method,
1723
+ input_path=args.input_path,
1724
+ output_path=args.output_path,
1725
+ pth_path=args.pth_path,
1726
+ index_path=args.index_path,
1727
+ split_audio=args.split_audio,
1728
+ f0_autotune=args.f0_autotune,
1729
+ f0_autotune_strength=args.f0_autotune_strength,
1730
+ clean_audio=args.clean_audio,
1731
+ clean_strength=args.clean_strength,
1732
+ export_format=args.export_format,
1733
+ embedder_model=args.embedder_model,
1734
+ embedder_model_custom=args.embedder_model_custom,
1735
+ f0_file=args.f0_file,
1736
+ formant_shifting=args.formant_shifting,
1737
+ formant_qfrency=args.formant_qfrency,
1738
+ formant_timbre=args.formant_timbre,
1739
+ sid=args.sid,
1740
+ post_process=args.post_process,
1741
+ reverb=args.reverb,
1742
+ pitch_shift=args.pitch_shift,
1743
+ limiter=args.limiter,
1744
+ gain=args.gain,
1745
+ distortion=args.distortion,
1746
+ chorus=args.chorus,
1747
+ bitcrush=args.bitcrush,
1748
+ clipping=args.clipping,
1749
+ compressor=args.compressor,
1750
+ delay=args.delay,
1751
+ reverb_room_size=args.reverb_room_size,
1752
+ reverb_damping=args.reverb_damping,
1753
+ reverb_wet_gain=args.reverb_wet_gain,
1754
+ reverb_dry_gain=args.reverb_dry_gain,
1755
+ reverb_width=args.reverb_width,
1756
+ reverb_freeze_mode=args.reverb_freeze_mode,
1757
+ pitch_shift_semitones=args.pitch_shift_semitones,
1758
+ limiter_threshold=args.limiter_threshold,
1759
+ limiter_release_time=args.limiter_release_time,
1760
+ gain_db=args.gain_db,
1761
+ distortion_gain=args.distortion_gain,
1762
+ chorus_rate=args.chorus_rate,
1763
+ chorus_depth=args.chorus_depth,
1764
+ chorus_center_delay=args.chorus_center_delay,
1765
+ chorus_feedback=args.chorus_feedback,
1766
+ chorus_mix=args.chorus_mix,
1767
+ bitcrush_bit_depth=args.bitcrush_bit_depth,
1768
+ clipping_threshold=args.clipping_threshold,
1769
+ compressor_threshold=args.compressor_threshold,
1770
+ compressor_ratio=args.compressor_ratio,
1771
+ compressor_attack=args.compressor_attack,
1772
+ compressor_release=args.compressor_release,
1773
+ delay_seconds=args.delay_seconds,
1774
+ delay_feedback=args.delay_feedback,
1775
+ delay_mix=args.delay_mix,
1776
+ )
1777
+ elif args.mode == "batch_infer":
1778
+ run_batch_infer_script(
1779
+ pitch=args.pitch,
1780
+ filter_radius=args.filter_radius,
1781
+ index_rate=args.index_rate,
1782
+ volume_envelope=args.volume_envelope,
1783
+ protect=args.protect,
1784
+ hop_length=args.hop_length,
1785
+ f0_method=args.f0_method,
1786
+ input_folder=args.input_folder,
1787
+ output_folder=args.output_folder,
1788
+ pth_path=args.pth_path,
1789
+ index_path=args.index_path,
1790
+ split_audio=args.split_audio,
1791
+ f0_autotune=args.f0_autotune,
1792
+ f0_autotune_strength=args.f0_autotune_strength,
1793
+ clean_audio=args.clean_audio,
1794
+ clean_strength=args.clean_strength,
1795
+ export_format=args.export_format,
1796
+ embedder_model=args.embedder_model,
1797
+ embedder_model_custom=args.embedder_model_custom,
1798
+ f0_file=args.f0_file,
1799
+ formant_shifting=args.formant_shifting,
1800
+ formant_qfrency=args.formant_qfrency,
1801
+ formant_timbre=args.formant_timbre,
1802
+ sid=args.sid,
1803
+ post_process=args.post_process,
1804
+ reverb=args.reverb,
1805
+ pitch_shift=args.pitch_shift,
1806
+ limiter=args.limiter,
1807
+ gain=args.gain,
1808
+ distortion=args.distortion,
1809
+ chorus=args.chorus,
1810
+ bitcrush=args.bitcrush,
1811
+ clipping=args.clipping,
1812
+ compressor=args.compressor,
1813
+ delay=args.delay,
1814
+ reverb_room_size=args.reverb_room_size,
1815
+ reverb_damping=args.reverb_damping,
1816
+ reverb_wet_gain=args.reverb_wet_gain,
1817
+ reverb_dry_gain=args.reverb_dry_gain,
1818
+ reverb_width=args.reverb_width,
1819
+ reverb_freeze_mode=args.reverb_freeze_mode,
1820
+ pitch_shift_semitones=args.pitch_shift_semitones,
1821
+ limiter_threshold=args.limiter_threshold,
1822
+ limiter_release_time=args.limiter_release_time,
1823
+ gain_db=args.gain_db,
1824
+ distortion_gain=args.distortion_gain,
1825
+ chorus_rate=args.chorus_rate,
1826
+ chorus_depth=args.chorus_depth,
1827
+ chorus_center_delay=args.chorus_center_delay,
1828
+ chorus_feedback=args.chorus_feedback,
1829
+ chorus_mix=args.chorus_mix,
1830
+ bitcrush_bit_depth=args.bitcrush_bit_depth,
1831
+ clipping_threshold=args.clipping_threshold,
1832
+ compressor_threshold=args.compressor_threshold,
1833
+ compressor_ratio=args.compressor_ratio,
1834
+ compressor_attack=args.compressor_attack,
1835
+ compressor_release=args.compressor_release,
1836
+ delay_seconds=args.delay_seconds,
1837
+ delay_feedback=args.delay_feedback,
1838
+ delay_mix=args.delay_mix,
1839
+ )
1840
+ elif args.mode == "tts":
1841
+ run_tts_script(
1842
+ tts_file=args.tts_file,
1843
+ tts_text=args.tts_text,
1844
+ tts_voice=args.tts_voice,
1845
+ tts_rate=args.tts_rate,
1846
+ pitch=args.pitch,
1847
+ filter_radius=args.filter_radius,
1848
+ index_rate=args.index_rate,
1849
+ volume_envelope=args.volume_envelope,
1850
+ protect=args.protect,
1851
+ hop_length=args.hop_length,
1852
+ f0_method=args.f0_method,
1853
+ output_tts_path=args.output_tts_path,
1854
+ output_rvc_path=args.output_rvc_path,
1855
+ pth_path=args.pth_path,
1856
+ index_path=args.index_path,
1857
+ split_audio=args.split_audio,
1858
+ f0_autotune=args.f0_autotune,
1859
+ f0_autotune_strength=args.f0_autotune_strength,
1860
+ clean_audio=args.clean_audio,
1861
+ clean_strength=args.clean_strength,
1862
+ export_format=args.export_format,
1863
+ embedder_model=args.embedder_model,
1864
+ embedder_model_custom=args.embedder_model_custom,
1865
+ f0_file=args.f0_file,
1866
+ )
1867
+ elif args.mode == "model_information":
1868
+ run_model_information_script(
1869
+ pth_path=args.pth_path,
1870
+ )
1871
+ elif args.mode == "model_blender":
1872
+ run_model_blender_script(
1873
+ model_name=args.model_name,
1874
+ pth_path_1=args.pth_path_1,
1875
+ pth_path_2=args.pth_path_2,
1876
+ ratio=args.ratio,
1877
+ )
1878
+ elif args.mode == "tensorboard":
1879
+ run_tensorboard_script()
1880
+ elif args.mode == "download":
1881
+ run_download_script(
1882
+ model_link=args.model_link,
1883
+ )
1884
+ elif args.mode == "audio_analyzer":
1885
+ run_audio_analyzer_script(
1886
+ input_path=args.input_path,
1887
+ )
1888
+ except Exception as error:
1889
+ print(f"An error occurred during execution: {error}")
1890
+
1891
+ import traceback
1892
+
1893
+ traceback.print_exc()
1894
+
1895
+
1896
+ if __name__ == "__main__":
1897
+ main()
tabs/download/download.py ADDED
@@ -0,0 +1,111 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import shutil
5
+ import requests
6
+ import tempfile
7
+ import gradio as gr
8
+ import pandas as pd
9
+
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from tqdm import tqdm
12
+
13
+
14
+ now_dir = os.getcwd()
15
+ sys.path.append(now_dir)
16
+
17
+ from scrpt import run_download_script
18
+ from rvc.lib.utils import format_title
19
+
20
+
21
+
22
+ gradio_temp_dir = os.path.join(tempfile.gettempdir(), "gradio")
23
+
24
+ if os.path.exists(gradio_temp_dir):
25
+ shutil.rmtree(gradio_temp_dir)
26
+
27
+
28
+ def save_drop_model(dropbox):
29
+ if "pth" not in dropbox and "index" not in dropbox:
30
+ raise gr.Error(
31
+ message="The file you dropped is not a valid model file. Please try again."
32
+ )
33
+
34
+ file_name = format_title(os.path.basename(dropbox))
35
+ model_name = file_name
36
+
37
+ if ".pth" in model_name:
38
+ model_name = model_name.split(".pth")[0]
39
+ elif ".index" in model_name:
40
+ replacements = ["nprobe_1_", "_v1", "_v2", "added_"]
41
+ for rep in replacements:
42
+ model_name = model_name.replace(rep, "")
43
+ model_name = model_name.split(".index")[0]
44
+
45
+ model_path = os.path.join(now_dir, "logs", model_name)
46
+ if not os.path.exists(model_path):
47
+ os.makedirs(model_path)
48
+ if os.path.exists(os.path.join(model_path, file_name)):
49
+ os.remove(os.path.join(model_path, file_name))
50
+ shutil.move(dropbox, os.path.join(model_path, file_name))
51
+ print(f"{file_name} saved in {model_path}")
52
+ gr.Info(f"{file_name} saved in {model_path}")
53
+
54
+ return None
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+ def get_file_size(url):
63
+ response = requests.head(url, allow_redirects=True)
64
+ return int(response.headers.get("content-length", 0))
65
+
66
+
67
+ def download_file(url, destination_path, progress_bar):
68
+ os.makedirs(os.path.dirname(destination_path), exist_ok=True)
69
+ response = requests.get(url, stream=True)
70
+ block_size = 1024
71
+ with open(destination_path, "wb") as file:
72
+ for data in response.iter_content(block_size):
73
+ file.write(data)
74
+ progress_bar.update(len(data))
75
+
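+ # Usage sketch: the two helpers above can be combined to download a file with a progress
+ # bar; the URL and destination below are placeholders, not values used elsewhere in the repo:
+ #   url = "https://example.com/model.zip"
+ #   total = get_file_size(url)
+ #   with tqdm(total=total, unit="B", unit_scale=True) as bar:
+ #       download_file(url, os.path.join("logs", "model.zip"), bar)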
76
+
77
+
78
+
79
+
80
+
81
+
82
+ def download_tab():
83
+ with gr.Column():
84
+ gr.Markdown(value="## Download Model")
85
+ model_link = gr.Textbox(
86
+ label="Model Link",
87
+ placeholder="Introduce the model link",
88
+ interactive=True,
89
+ )
90
+ model_download_output_info = gr.Textbox(
91
+ label="Output Information",
92
+ info="The output information will be displayed here.",
93
+ value="",
94
+ max_lines=8,
95
+ interactive=False,
96
+ )
97
+ model_download_button = gr.Button("Download Model")
98
+ model_download_button.click(
99
+ fn=run_download_script,
100
+ inputs=[model_link],
101
+ outputs=[model_download_output_info],
102
+ )
103
+ gr.Markdown("## Drop files")
104
+ dropbox = gr.File(label="Drag your .pth file and .index file into this space. Drag one and then the other.", type="filepath")
105
+
106
+ dropbox.upload(
107
+ fn=save_drop_model,
108
+ inputs=[dropbox],
109
+ outputs=[dropbox],
110
+ )
111
+
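+ # Usage sketch: download_tab() only builds the Gradio components, so it is meant to be
+ # called from inside an existing Blocks context in the main app; one possible layout
+ # (the tab name is a placeholder) could be:
+ #   with gr.Blocks() as app:
+ #       with gr.Tab("Download"):
+ #           download_tab()
+ #   app.launch()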