Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
- .github/FUNDING.yml +3 -0
- .github/PULL_REQUEST_TEMPLATE.md +37 -0
- .github/workflows/code_formatter.yml +53 -0
- .github/workflows/python-app.yml +55 -0
- .gitignore +388 -0
- Advanced-RVC.ipynb +494 -0
- LICENSE +22 -0
- README.md +64 -12
- assets/config.json +5 -0
- assets/themes/loadThemes.py +119 -0
- assets/themes/themes_list.json +24 -0
- install.bat +87 -0
- models.py +5 -0
- requirements.txt +33 -0
- rvc/configs/config.py +179 -0
- rvc/configs/v1/32000.json +47 -0
- rvc/configs/v1/40000.json +47 -0
- rvc/configs/v1/48000.json +47 -0
- rvc/configs/v2/32000.json +43 -0
- rvc/configs/v2/40000.json +43 -0
- rvc/configs/v2/48000.json +43 -0
- rvc/infer/infer.py +495 -0
- rvc/infer/pipeline.py +708 -0
- rvc/lib/algorithm/__init__.py +0 -0
- rvc/lib/algorithm/attentions.py +243 -0
- rvc/lib/algorithm/commons.py +207 -0
- rvc/lib/algorithm/discriminators.py +160 -0
- rvc/lib/algorithm/encoders.py +218 -0
- rvc/lib/algorithm/generators.py +231 -0
- rvc/lib/algorithm/modules.py +124 -0
- rvc/lib/algorithm/normalization.py +31 -0
- rvc/lib/algorithm/nsf.py +196 -0
- rvc/lib/algorithm/residuals.py +250 -0
- rvc/lib/algorithm/synthesizers.py +237 -0
- rvc/lib/predictors/F0Extractor.py +100 -0
- rvc/lib/predictors/FCPE.py +920 -0
- rvc/lib/predictors/RMVPE.py +560 -0
- rvc/lib/tools/analyzer.py +76 -0
- rvc/lib/tools/gdown.py +354 -0
- rvc/lib/tools/launch_tensorboard.py +21 -0
- rvc/lib/tools/model_download.py +385 -0
- rvc/lib/tools/prerequisites_download.py +104 -0
- rvc/lib/tools/pretrained_selector.py +63 -0
- rvc/lib/tools/split_audio.py +56 -0
- rvc/lib/tools/tts.py +29 -0
- rvc/lib/tools/tts_voices.json +0 -0
- rvc/lib/utils.py +137 -0
- rvc/lib/zluda.py +43 -0
- scrpt.py +1897 -0
- tabs/download/download.py +111 -0
.github/FUNDING.yml
ADDED
@@ -0,0 +1,3 @@
github: TheNeodev, ArkanDash
ko_fi: arkandash
.github/PULL_REQUEST_TEMPLATE.md
ADDED
@@ -0,0 +1,37 @@
<!--- Provide a general summary of your changes in the Title above -->

## Description

<!--- Describe your changes in detail -->

## Motivation and Context

<!--- Why is this change required? What problem does it solve? -->
<!--- If it fixes an open issue, please link to the issue here. -->

## How has this been tested?

<!--- Please describe in detail how you tested your changes. -->
<!--- Include details of your testing environment, tests ran to see how -->
<!--- your change affects other areas of the code, etc. -->

## Screenshots (if appropriate):

## Types of changes

<!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->

- [ ] Bug fix (non-breaking change which fixes an issue)
- [ ] New feature (non-breaking change which adds functionality)
- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)

## Checklist:

<!--- Go over all the following points, and put an `x` in all the boxes that apply. -->
<!--- If you're unsure about any of these, don't hesitate to ask. We're here to help! -->

- [ ] My code follows the code style of this project.
- [ ] My change requires a change to the documentation.
- [ ] I have updated the documentation accordingly.
- [ ] I have added tests to cover my changes.
- [ ] All new and existing tests passed.
.github/workflows/code_formatter.yml
ADDED
@@ -0,0 +1,53 @@
name: Code Formatter

on:
  push:
    branches:
      - main

jobs:
  push_format:
    runs-on: ubuntu-latest

    permissions:
      contents: write
      pull-requests: write

    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{github.ref_name}}

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Black and autoflake
        run: pip install "black[jupyter]" autoflake

      - name: Run autoflake
        run: autoflake --in-place --recursive .

      - name: Run Black
        run: black . --exclude=".*\.ipynb$"

      - name: Commit Back
        continue-on-error: true
        id: commitback
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"
          git add --all
          git commit -m "chore(format): run black on ${{github.ref_name}}"

      - name: Create Pull Request
        if: steps.commitback.outcome == 'success'
        continue-on-error: true
        uses: peter-evans/create-pull-request@v5
        with:
          delete-branch: true
          body: "Automatically apply code formatter change"
          title: "chore(format): run black on ${{github.ref_name}}"
          commit-message: "chore(format): run black on ${{github.ref_name}}"
          branch: formatter/${{github.ref_name}}
.github/workflows/python-app.yml
ADDED
@@ -0,0 +1,55 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python application

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install torch, torchvision, torchaudio
        run: |
          pip install torch torchvision torchaudio

      - name: Install FFmpeg
        run: |
          sudo apt-get update
          sudo apt-get install -y ffmpeg

      - name: Install dependencies from requirements.txt
        run: |
          python -m pip install pip==24.0
          pip install -r requirements.txt

      - name: Download Hubert & RMVPE
        run: |
          sudo apt-get install -qq -y aria2
          aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d /home/runner/work/Advanced-RVC-Inference/Advanced-RVC-Inference -o hubert_base.pt
          aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d /home/runner/work/Advanced-RVC-Inference/Advanced-RVC-Inference -o rmvpe.pt

      - name: Test application
        run: |
          python infer.py &
          sleep 180

      - name: Exit application
        run: |
          pkill -f infer.py || true
.gitignore
ADDED
@@ -0,0 +1,388 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Oo]ut/
[Ll]og/
[Ll]ogs/
infer_pack\__pycache__
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# ASP.NET Scaffolding
ScaffoldingReadMe.txt

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# Fody - auto-generated XML schema
FodyWeavers.xsd

# build
build
monotonic_align/core.c
*.o
*.so
*.dll

# data
/config.json
/*.pth
*.wav
/monotonic_align/monotonic_align
/resources
/MoeGoe.spec
/dist/MoeGoe
/dist

.venv
.idea
app.py
infer-web.py
app-old.py
rmvpe.pt
hubert_base.pt
Advanced-RVC.ipynb
ADDED
@@ -0,0 +1,494 @@
Jupyter notebook (nbformat 4, Python 3 kernel; Colab metadata: GPU runtime "T4", include_colab_link: true). Cell sources:

[markdown cell]
<a href="https://colab.research.google.com/github/ArkanDash/Advanced-RVC-Inference/blob/master/Advanced-RVC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[markdown cell]
<h1><div align="center"> Advanced RVC Inference:

<big> for quicker and effortless model downloads

---

[Support](https://discord.gg/hvmsukmBHE) — [GitHub](https://github.com/ArkanDash/Advanced-RVC-Inference.git)

[code cell]
#@title Check GPU
!nvidia-smi

[code cell]
# @title Installation

from IPython.display import clear_output

url = "https://github.com/ArkanDash/Advanced-RVC-Inference.git"

!git clone $url /content/program_infer
clear_output()

%cd /content/program_infer

!pip install -r requirements.txt
!pip uninstall torch torchvision torchaudio -y
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cu121
clear_output()
print("Finished installing requirements!")

[code cell]
#@title Run WebUI

iyalah = "app.py"
print("running WebUI")
!python $iyalah --share

[markdown cell]
## Run NoUI
<div align="center">

•created by [NeoDev](https://github.com/TheNeodev)•

[code cell]
# @title Download model
# @markdown Hugging Face or Google Drive
model_link = "https://huggingface.co/Bredvige/Sonic2/resolve/main/Sonic.zip"  # @param {type:"string"}

!python scrpt.py download --model_link "{model_link}"

[code cell]
#@title run Advanced-RVC

import os
import sys
import yt_dlp
import subprocess
import logging
import json
from logging.handlers import RotatingFileHandler
from contextlib import suppress
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from pydub import AudioSegment
# Import the UVR separator. Ensure the module is available.
try:
    from audio_separator.separator import Separator
except ImportError:
    raise ImportError("Make sure the 'audio_separator' module is installed or in your working directory.")

from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline

if __name__ == "__main__":
    prerequisites_download_pipeline(models=True, exe=True)


# =============================================================================
# Logging Setup
# =============================================================================

def setup_logging(log_level=logging.DEBUG, log_file="kuro_rvc.log"):
    """
    Set up advanced logging with both console and rotating file handlers.
    """
    logger = logging.getLogger()
    logger.setLevel(log_level)

    # Formatter for both handlers
    formatter = logging.Formatter(
        fmt="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    # Console handler (INFO level and above)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)

    # Rotating file handler (DEBUG level and above)
    file_handler = RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=2)
    file_handler.setLevel(log_level)
    file_handler.setFormatter(formatter)

    # Clear existing handlers, then add ours
    if logger.hasHandlers():
        logger.handlers.clear()
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    logger.debug("...logging has been configured.")

# Initialize logging as early as possible
setup_logging()

# =============================================================================
# Directories and File Paths
# =============================================================================

current_dir = os.getcwd()
rvc_models_dir = os.path.join(current_dir, 'logs')
rvc_output_dir = os.path.join(current_dir, 'song_output')
download_dir = os.path.join(current_dir, "downloads")
uvr_output_dir = os.path.join(current_dir, "output_uvr")

# File paths for separated stems (using uvr_output_dir)
vocals_path = os.path.join(uvr_output_dir, 'Vocals.wav')
instrumental_path = os.path.join(uvr_output_dir, 'Instrumental.wav')
lead_vocals_path = os.path.join(uvr_output_dir, 'Lead_Vocals.wav')
backing_vocals_path = os.path.join(uvr_output_dir, 'Backing_Vocals.wav')

# File paths for RVC inference outputs
rvc_lead_output = os.path.join(rvc_output_dir, "rvc_result_lead.wav")
rvc_backing_output = os.path.join(rvc_output_dir, "rvc_result_backing.wav")

# Path to the RVC script (ensure it exists in the current directory)
rvc_cli_file = os.path.join(current_dir, "scrpt.py")
if not os.path.exists(rvc_cli_file):
    logging.error("scrpt.py not found in the current directory: %s", current_dir)
    raise FileNotFoundError("scrpt.py not found in the current directory.")

# =============================================================================
# Inference and Pipeline Parameters (Colab UI parameters below)
# =============================================================================

model_name = "Sonic"  # @param {type:"string"}
youtube_url = "https://youtu.be/eCkWlRL3_N0?si=y6xHAs1m8fYVLTUV"  # @param {type:"string"}
export_format = "WAV"  # @param ['WAV', 'MP3', 'FLAC', 'OGG', 'M4A']
f0_method = "hybrid[rmvpe+fcpe]"  # @param ["crepe", "crepe-tiny", "rmvpe", "fcpe", "hybrid[rmvpe+fcpe]"]
f0_up_key = 0  # @param {type:"slider", min:-24, max:24, step:0}
filter_radius = 3  # @param {type:"slider", min:0, max:10, step:0}
rms_mix_rate = 0.8  # @param {type:"slider", min:0.0, max:1.0, step:0.1}
protect = 0.5  # @param {type:"slider", min:0.0, max:0.5, step:0.1}
index_rate = 0.6  # @param {type:"slider", min:0.0, max:1.0, step:0.1}
hop_length = 128  # @param {type:"slider", min:1, max:512, step:0}
clean_strength = 0.7  # @param {type:"slider", min:0.0, max:1.0, step:0.1}
split_audio = False  # @param {type:"boolean"}
clean_audio = False  # @param {type:"boolean"}
f0_autotune = False  # @param {type:"boolean"}
backing_vocal_infer = False  # @param {type:"boolean"}
embedder_model = "contentvec"  # @param ["contentvec", "chinese-hubert-base", "japanese-hubert-base", "korean-hubert-base", "custom"]
embedder_model_custom = ""  # @param {type:"string"}
output_filename = f"aicover_{model_name}_opt"
logging.info("This code was written by [NeoDev](https://github.com/TheNeodev). Please credit if you copy or modify the code.")

# =============================================================================
# Function Definitions
# =============================================================================

def download_youtube_audio(url, download_dir):
    """
    Download audio from a YouTube URL and return the path(s) to the downloaded WAV file(s).
    """
    logging.debug("Starting YouTube audio download. URL: %s", url)
    os.makedirs(download_dir, exist_ok=True)
    outtmpl = os.path.join(download_dir, "%(title)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192"
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        if "entries" in info_dict:  # Playlist support
            downloaded_files = [os.path.join(download_dir, f"{entry['title']}.wav") for entry in info_dict["entries"] if entry]
        else:
            downloaded_files = os.path.join(download_dir, f"{info_dict['title']}.wav")
    logging.debug("Downloaded audio file(s): %s", downloaded_files)
    return downloaded_files

def separator_uvr(input_audio, output_dir):
    """
    Separate the input audio into instrumental and vocal stems,
    then further separate vocals into lead and backing vocals.
    Returns the paths to the lead and backing vocal files.
    """
    logging.debug("Starting UVR separation for file: %s", input_audio)
    os.makedirs(output_dir, exist_ok=True)

    # First separation: get instrumental and vocals
    uvr_separator = Separator(output_dir=output_dir)
    logging.debug("Loading first UVR model for instrumental/vocals separation.")
    uvr_separator.load_model('model_bs_roformer_ep_317_sdr_12.9755.ckpt')
    separated_files = uvr_separator.separate(input_audio)
    if len(separated_files) < 2:
        error_msg = "UVR separation did not produce expected files for instrumental/vocals."
        logging.error(error_msg)
        raise RuntimeError(error_msg)

    # Rename the separated files to our designated paths
    os.rename(os.path.join(output_dir, separated_files[0]), instrumental_path)
    os.rename(os.path.join(output_dir, separated_files[1]), vocals_path)
    logging.debug("Separated instrumental saved to: %s", instrumental_path)
    logging.debug("Separated vocals saved to: %s", vocals_path)

    # Second separation: split vocals into lead and backing
    logging.debug("Loading second UVR model for vocal splitting.")
    uvr_separator.load_model('mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt')
    separated_vocals = uvr_separator.separate(vocals_path)
    if len(separated_vocals) < 2:
        error_msg = "UVR separation did not produce expected files for vocal split."
        logging.error(error_msg)
        raise RuntimeError(error_msg)

    os.rename(os.path.join(output_dir, separated_vocals[0]), backing_vocals_path)
    os.rename(os.path.join(output_dir, separated_vocals[1]), lead_vocals_path)
    logging.debug("Separated backing vocals saved to: %s", backing_vocals_path)
    logging.debug("Separated lead vocals saved to: %s", lead_vocals_path)

    return lead_vocals_path, backing_vocals_path

def run_rvc(f0_up_key, filter_radius, rms_mix_rate, index_rate, hop_length, protect,
            f0_method, input_path, output_path, pth_file, index_file, split_audio,
            clean_audio, clean_strength, export_format, f0_autotune,
            embedder_model, embedder_model_custom):
    """
    Run the RVC inference pipeline via the rvc_cli.py script.
    """
    logging.debug("Preparing RVC inference command for input file: %s", input_path)
    command = [
        sys.executable, rvc_cli_file, "infer",
        "--pitch", str(f0_up_key),
        "--filter_radius", str(filter_radius),
        "--volume_envelope", str(rms_mix_rate),
        "--index_rate", str(index_rate),
        "--hop_length", str(hop_length),
        "--protect", str(protect),
        "--f0_method", f0_method,
        "--f0_autotune", str(f0_autotune),
        "--input_path", input_path,
        "--output_path", output_path,
        "--pth_path", pth_file,
        "--index_path", index_file,
        "--split_audio", str(split_audio),
        "--clean_audio", str(clean_audio),
        "--clean_strength", str(clean_strength),
        "--export_format", export_format,
        "--embedder_model", embedder_model,
        "--embedder_model_custom", embedder_model_custom
    ]
    logging.info("Running RVC inference. Command: %s", " ".join(command))
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        logging.debug("RVC inference stdout: %s", result.stdout)
        if result.stderr:
            logging.debug("RVC inference stderr: %s", result.stderr)
        logging.info("RVC inference completed for input: %s", input_path)
    except subprocess.CalledProcessError as e:
        logging.error("RVC inference failed for input: %s", input_path)
        logging.error("Error output: %s", e.stderr)
        raise e

def load_audio(file_path):
    """Load an audio file using pydub if it exists."""
    if file_path and os.path.exists(file_path):
        logging.debug("Loading audio file: %s", file_path)
        return AudioSegment.from_file(file_path)
    else:
        logging.warning("Audio file not found: %s", file_path)
        return None

# =============================================================================
# Main Execution Function
# =============================================================================

def main():
    logging.info("Starting Advanced-RVC pipeline.")

    # Check model folder and required model files
    model_folder = os.path.join(rvc_models_dir, model_name)
    if not os.path.exists(model_folder):
        error_msg = f"Model directory not found: {model_folder}"
        logging.error(error_msg)
        raise FileNotFoundError(error_msg)
    files_in_folder = os.listdir(model_folder)
    pth_filename = next((f for f in files_in_folder if f.endswith(".pth")), None)
    index_filename = next((f for f in files_in_folder if f.endswith(".index")), None)
    if not pth_filename or not index_filename:
        error_msg = "Required model files (.pth or .index) were not found in the model folder."
        logging.error(error_msg)
        raise FileNotFoundError(error_msg)
    pth_file = os.path.join(model_folder, pth_filename)
    index_file = os.path.join(model_folder, index_filename)
    logging.debug("Model files located. PTH: %s, Index: %s", pth_file, index_file)

    # Download audio from YouTube
    logging.info("Downloading audio from YouTube...")
    downloaded_audio = download_youtube_audio(youtube_url, download_dir)
    input_audio = downloaded_audio[0] if isinstance(downloaded_audio, list) else downloaded_audio
    if not os.path.exists(input_audio):
        error_msg = f"Downloaded audio file not found: {input_audio}"
        logging.error(error_msg)
        raise FileNotFoundError(error_msg)
    logging.info("Audio downloaded successfully: %s", input_audio)

    # Run UVR separation
    logging.info("Running UVR separation...")
    lead_vocals_file, backing_vocals_file = separator_uvr(input_audio, uvr_output_dir)
    logging.info("UVR separation completed. Lead vocals: %s, Backing vocals: %s", lead_vocals_file, backing_vocals_file)

    # Ensure the output directory for RVC exists
    os.makedirs(rvc_output_dir, exist_ok=True)

    # Run RVC inference for lead vocals
    logging.info("Running RVC inference for lead vocals...")
    run_rvc(f0_up_key, filter_radius, rms_mix_rate, index_rate, hop_length, protect,
            f0_method, lead_vocals_path, rvc_lead_output, pth_file, index_file,
            split_audio, clean_audio, clean_strength, export_format, f0_autotune,
            embedder_model, embedder_model_custom)

    # Optionally run RVC inference for backing vocals
    if backing_vocal_infer:
        logging.info("Running RVC inference for backing vocals...")
        run_rvc(f0_up_key, filter_radius, rms_mix_rate, index_rate, hop_length, protect,
                f0_method, backing_vocals_path, rvc_backing_output, pth_file, index_file,
                split_audio, clean_audio, clean_strength, export_format, f0_autotune,
                embedder_model, embedder_model_custom)

    logging.info("RVC pipeline complete.")

    # Load the separated/inferred tracks for final mix
    logging.info("Loading audio tracks for final mix.")
    lead_vocals_audio = load_audio(rvc_lead_output)
    instrumental_audio = load_audio(instrumental_path)
    # If backing inference was run, load its result; otherwise use separated backing vocals.
    backing_vocals_audio = load_audio(rvc_backing_output) if backing_vocal_infer else load_audio(backing_vocals_path)

    if not instrumental_audio:
        error_msg = "Instrumental track is required for mixing!"
        logging.error(error_msg)
        raise ValueError(error_msg)

    # Mix the audio tracks: overlay lead vocals and backing vocals onto the instrumental
    final_mix = instrumental_audio
    if lead_vocals_audio:
        logging.debug("Overlaying lead vocals onto instrumental.")
        final_mix = final_mix.overlay(lead_vocals_audio)
    if backing_vocals_audio:
        logging.debug("Overlaying backing vocals onto instrumental.")
        final_mix = final_mix.overlay(backing_vocals_audio)

    # Export final mix to file
    output_file = f"{output_filename}.{export_format.lower()}"
    final_mix.export(output_file, format=export_format.lower())
    logging.info("✅ Mixed file saved as: %s", output_file)
    print(f"✅ Mixed file saved as: {output_file}")

# =============================================================================
# Run the Pipeline if Executed as a Script
# =============================================================================

if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        logging.exception("An error occurred during execution: %s", e)
        raise

[code cell]
#@title play ur audio output

output_file = f"{output_filename}.{export_format.lower()}"

AudioSegment.from_file(output_file)
LICENSE
ADDED
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2023 arkandash
Copyright (c) 2025 NeoDev

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,12 +1,64 @@
<div align="center">

# Advanced RVC Inference

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ArkanDash/Advanced-RVC-Inference/blob/master/Advanced-RVC.ipynb)

</div>

## Information
<div align="center">
Advanced RVC Inference is a web UI built for fast and effortless RVC inference. It bundles a model downloader and a voice splitter.

Please support Applio; this inference tool would not be possible without it.<br />
[Applio](https://github.com/IAHispano/Applio)
</div>

## Features
- Support V1 & V2 Model ✅
- Youtube Audio Downloader ✅
- Audio-Separator (Voice Splitter) [Internet required for downloading model] ✅
- Model Downloader ✅
- Gradio WebUI ✅

## Installation

1. Install dependencies <br />
```bash
pip install torch torchvision torchaudio

python -m pip install -r requirements.txt
```
2. Install [ffmpeg](https://ffmpeg.org/)

3. Download the required models:

```bash
python models.py
```

## Run WebUI
```bash
python app.py
```
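
## Run from the command line (no UI)

The Colab notebook drives the same pipeline headlessly through `scrpt.py`, which exposes `download` and `infer` subcommands. The sketch below is a minimal example, not a full reference: the model link and audio paths are placeholders, and the flags shown are a subset of those the notebook's `run_rvc` helper passes to `scrpt.py infer` (others such as `--index_rate`, `--protect`, or `--clean_audio` can be added the same way).

```bash
# Fetch a voice model (.pth + .index packed in a zip) -- the link is a placeholder
python scrpt.py download --model_link "https://huggingface.co/<user>/<repo>/resolve/main/Model.zip"

# Convert a vocal track with the downloaded model -- paths are illustrative
python scrpt.py infer \
  --pitch 0 \
  --f0_method "rmvpe" \
  --input_path "downloads/vocals.wav" \
  --output_path "song_output/rvc_result.wav" \
  --pth_path "logs/MyModel/MyModel.pth" \
  --index_path "logs/MyModel/MyModel.index" \
  --export_format "WAV"
```

This is the same invocation the notebook builds with `subprocess` before mixing the converted vocals back over the instrumental.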

## Terms of Use

The use of the converted voice for the following purposes is prohibited.

* Criticizing or attacking individuals.

* Advocating for or opposing specific political positions, religions, or ideologies.

* Publicly displaying strongly stimulating expressions without proper zoning.

* Selling of voice models and generated voice clips.

* Impersonation of the original owner of the voice with malicious intentions to harm/hurt others.

* Fraudulent purposes that lead to identity theft or fraudulent phone calls.

## Disclaimer

I am not liable for any direct, indirect, consequential, incidental, or special damages arising out of or in any way connected with the use/misuse or inability to use this software.
assets/config.json
ADDED
@@ -0,0 +1,5 @@
{
  "theme": {
    "file": null,
    "class": "NoCrypt/miku"
  }
}
assets/themes/loadThemes.py
ADDED
@@ -0,0 +1,119 @@
import json
import os
import importlib
import gradio as gr

now_dir = os.getcwd()

folder = os.path.join(now_dir, "assets", "themes")
config_file = os.path.join(now_dir, "assets", "config.json")

import sys

sys.path.append(folder)


def get_class(filename):
    with open(filename, "r", encoding="utf8") as file:
        for line_number, line in enumerate(file, start=1):
            if "class " in line:
                found = line.split("class ")[1].split(":")[0].split("(")[0].strip()
                return found
    return None


def get_list():

    themes_from_files = [
        os.path.splitext(name)[0]
        for root, _, files in os.walk(folder, topdown=False)
        for name in files
        if name.endswith(".py") and root == folder and name != "loadThemes.py"
    ]

    json_file_path = os.path.join(folder, "themes_list.json")

    try:
        with open(json_file_path, "r", encoding="utf8") as json_file:
            themes_from_url = [item["id"] for item in json.load(json_file)]
    except FileNotFoundError:
        themes_from_url = []

    combined_themes = set(themes_from_files + themes_from_url)

    return list(combined_themes)


def select_theme(name):
    selected_file = name + ".py"
    full_path = os.path.join(folder, selected_file)

    if not os.path.exists(full_path):
        with open(config_file, "r", encoding="utf8") as json_file:
            config_data = json.load(json_file)

        config_data["theme"]["file"] = None
        config_data["theme"]["class"] = name

        with open(config_file, "w", encoding="utf8") as json_file:
            json.dump(config_data, json_file, indent=2)
        print(f"Theme {name} successfully selected, restart the App.")
        gr.Info(f"Theme {name} successfully selected, restart the App.")
        return

    class_found = get_class(full_path)
    if class_found:
        with open(config_file, "r", encoding="utf8") as json_file:
            config_data = json.load(json_file)

        config_data["theme"]["file"] = selected_file
        config_data["theme"]["class"] = class_found

        with open(config_file, "w", encoding="utf8") as json_file:
            json.dump(config_data, json_file, indent=2)
        print(f"Theme {name} successfully selected, restart the App.")
        gr.Info(f"Theme {name} successfully selected, restart the App.")
    else:
        print(f"Theme {name} was not found.")


def read_json():
    try:
        with open(config_file, "r", encoding="utf8") as json_file:
            data = json.load(json_file)
            selected_file = data["theme"]["file"]
            class_name = data["theme"]["class"]

        if selected_file is not None and class_name:
            return class_name
        elif selected_file is None and class_name:
            return class_name
        else:
            return "NoCrypt/miku"
    except Exception as error:
        print(f"An error occurred loading the theme: {error}")
        return "NoCrypt/miku"


def load_json():
    try:
        with open(config_file, "r", encoding="utf8") as json_file:
            data = json.load(json_file)
            selected_file = data["theme"]["file"]
            class_name = data["theme"]["class"]

        if selected_file is not None and class_name:
            module = importlib.import_module(selected_file[:-3])
            obtained_class = getattr(module, class_name)
            instance = obtained_class()
            print(f"Theme {class_name} successfully loaded.")
            return instance
        elif selected_file is None and class_name:
            return class_name
        else:
            print("The theme is incorrect.")
            return None
    except Exception as error:
        print(f"An error occurred loading the theme: {error}")
        return None
assets/themes/themes_list.json
ADDED
@@ -0,0 +1,24 @@
[
    {"id": "freddyaboulton/dracula_revamped"},
    {"id": "freddyaboulton/bad-theme-space"},
    {"id": "gradio/dracula_revamped"},
    {"id": "abidlabs/dracula_revamped"},
    {"id": "gradio/seafoam"},
    {"id": "gradio/monochrome"},
    {"id": "gradio/soft"},
    {"id": "gradio/default"},
    {"id": "dawood/microsoft_windows"},
    {"id": "ysharma/steampunk"},
    {"id": "ysharma/huggingface"},
    {"id": "gstaff/xkcd"},
    {"id": "JohnSmith9982/small_and_pretty"},
    {"id": "abidlabs/Lime"},
    {"id": "bethecloud/storj_theme"},
    {"id": "sudeepshouche/minimalist"},
    {"id": "knotdgaf/gradiotest"},
    {"id": "ParityError/Interstellar"},
    {"id": "ParityError/Anime"},
    {"id": "Ajaxon6255/Emerald_Isle"},
    {"id": "NoCrypt/miku"},
    {"id": "Hev832/Applio"}
]
install.bat
ADDED
@@ -0,0 +1,87 @@
@echo off
setlocal enabledelayedexpansion
title RVC CLI Installer

echo Welcome to the RVC CLI Installer!
echo.

set "INSTALL_DIR=%cd%"
set "MINICONDA_DIR=%UserProfile%\Miniconda3"
set "ENV_DIR=%INSTALL_DIR%\env"
set "MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py39_23.9.0-0-Windows-x86_64.exe"
set "CONDA_EXE=%MINICONDA_DIR%\Scripts\conda.exe"

call :cleanup
call :install_miniconda
call :create_conda_env
call :install_dependencies

echo RVC CLI has been installed successfully!
echo.
pause
exit /b 0

:cleanup
echo Cleaning up unnecessary files...
for %%F in (Makefile Dockerfile docker-compose.yaml *.sh) do if exist "%%F" del "%%F"
echo Cleanup complete.
echo.
exit /b 0

:install_miniconda
if exist "%CONDA_EXE%" (
    echo Miniconda already installed. Skipping installation.
    exit /b 0
)

echo Miniconda not found. Starting download and installation...
powershell -Command "& {Invoke-WebRequest -Uri '%MINICONDA_URL%' -OutFile 'miniconda.exe'}"
if not exist "miniconda.exe" goto :download_error

start /wait "" miniconda.exe /InstallationType=JustMe /RegisterPython=0 /S /D=%MINICONDA_DIR%
if errorlevel 1 goto :install_error

del miniconda.exe
echo Miniconda installation complete.
echo.
exit /b 0

:create_conda_env
echo Creating Conda environment...
call "%MINICONDA_DIR%\_conda.exe" create --no-shortcuts -y -k --prefix "%ENV_DIR%" python=3.9
if errorlevel 1 goto :error
echo Conda environment created successfully.
echo.

if exist "%ENV_DIR%\python.exe" (
    echo Installing specific pip version...
    "%ENV_DIR%\python.exe" -m pip install "pip<24.1"
    if errorlevel 1 goto :error
    echo Pip installation complete.
    echo.
)
exit /b 0

:install_dependencies
echo Installing dependencies...
call "%MINICONDA_DIR%\condabin\conda.bat" activate "%ENV_DIR%" || goto :error
pip install --upgrade setuptools || goto :error
pip install --no-cache-dir -r "%INSTALL_DIR%\requirements.txt" || goto :error
pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --upgrade --index-url https://download.pytorch.org/whl/cu121 || goto :error
call "%MINICONDA_DIR%\condabin\conda.bat" deactivate
echo Dependencies installation complete.
echo.
exit /b 0

:download_error
echo Download failed. Please check your internet connection and try again.
goto :error

:install_error
echo Miniconda installation failed.
goto :error

:error
echo An error occurred during installation. Please check the output above for details.
pause
exit /b 1
models.py
ADDED
@@ -0,0 +1,5 @@
+from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline
+
+
+print("downloading models...")
+prerequisites_download_pipeline(models=True, exe=True)
requirements.txt
ADDED
@@ -0,0 +1,33 @@
+pip>=23.3; sys_platform == 'darwin'
+wheel; sys_platform == 'darwin'
+PyYAML; sys_platform == 'darwin'
+tqdm
+wget
+ffmpeg-python>=0.2.0
+faiss-cpu==1.7.3
+soundfile==0.12.1
+noisereduce
+pedalboard
+stftpitchshift
+yt-dlp
+audio-separator[gpu]==0.28.5
+omegaconf>=2.0.6; sys_platform == 'darwin'
+numba; sys_platform == 'linux'
+numba==0.57.0; sys_platform == 'darwin' or sys_platform == 'win32'
+torchaudio==2.3.1
+torchvision==0.18.1
+torchcrepe==0.0.23
+torchfcpe
+libf0
+transformers==4.44.2
+matplotlib==3.7.2
+tensorboard
+gradio==4.44.0
+certifi>=2023.07.22; sys_platform == 'darwin'
+antlr4-python3-runtime==4.8; sys_platform == 'darwin'
+tensorboardX
+edge-tts==6.1.9
+pypresence
+beautifulsoup4
+flask
+typing
rvc/configs/config.py
ADDED
@@ -0,0 +1,179 @@
+import torch
+import json
+import os
+
+
+version_config_paths = [
+    os.path.join("v1", "32000.json"),
+    os.path.join("v1", "40000.json"),
+    os.path.join("v1", "48000.json"),
+    os.path.join("v2", "48000.json"),
+    os.path.join("v2", "40000.json"),
+    os.path.join("v2", "32000.json"),
+]
+
+
+def singleton(cls):
+    instances = {}
+
+    def get_instance(*args, **kwargs):
+        if cls not in instances:
+            instances[cls] = cls(*args, **kwargs)
+        return instances[cls]
+
+    return get_instance
+
+
+@singleton
+class Config:
+    def __init__(self):
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.is_half = self.device != "cpu"
+        self.gpu_name = (
+            torch.cuda.get_device_name(int(self.device.split(":")[-1]))
+            if self.device.startswith("cuda")
+            else None
+        )
+        self.json_config = self.load_config_json()
+        self.gpu_mem = None
+        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
+
+    def load_config_json(self) -> dict:
+        configs = {}
+        for config_file in version_config_paths:
+            config_path = os.path.join("rvc", "configs", config_file)
+            with open(config_path, "r") as f:
+                configs[config_file] = json.load(f)
+        return configs
+
+    def has_mps(self) -> bool:
+        # Check if Metal Performance Shaders are available - for macOS 12.3+.
+        return torch.backends.mps.is_available()
+
+    def has_xpu(self) -> bool:
+        # Check if XPU is available.
+        return hasattr(torch, "xpu") and torch.xpu.is_available()
+
+    def set_precision(self, precision):
+        if precision not in ["fp32", "fp16"]:
+            raise ValueError("Invalid precision type. Must be 'fp32' or 'fp16'.")
+
+        fp16_run_value = precision == "fp16"
+        preprocess_target_version = "3.7" if precision == "fp16" else "3.0"
+        preprocess_path = os.path.join(
+            os.path.dirname(__file__),
+            os.pardir,
+            "rvc",
+            "train",
+            "preprocess",
+            "preprocess.py",
+        )
+
+        for config_path in version_config_paths:
+            full_config_path = os.path.join("rvc", "configs", config_path)
+            try:
+                with open(full_config_path, "r") as f:
+                    config = json.load(f)
+                config["train"]["fp16_run"] = fp16_run_value
+                with open(full_config_path, "w") as f:
+                    json.dump(config, f, indent=4)
+            except FileNotFoundError:
+                print(f"File not found: {full_config_path}")
+
+        if os.path.exists(preprocess_path):
+            with open(preprocess_path, "r") as f:
+                preprocess_content = f.read()
+            preprocess_content = preprocess_content.replace(
+                "3.0" if precision == "fp16" else "3.7", preprocess_target_version
+            )
+            with open(preprocess_path, "w") as f:
+                f.write(preprocess_content)
+
+        return f"Overwritten preprocess and config.json to use {precision}."
+
+    def get_precision(self):
+        if not version_config_paths:
+            raise FileNotFoundError("No configuration paths provided.")
+
+        full_config_path = os.path.join("rvc", "configs", version_config_paths[0])
+        try:
+            with open(full_config_path, "r") as f:
+                config = json.load(f)
+            fp16_run_value = config["train"].get("fp16_run", False)
+            precision = "fp16" if fp16_run_value else "fp32"
+            return precision
+        except FileNotFoundError:
+            print(f"File not found: {full_config_path}")
+            return None
+
+    def device_config(self) -> tuple:
+        if self.device.startswith("cuda"):
+            self.set_cuda_config()
+        elif self.has_mps():
+            self.device = "mps"
+            self.is_half = False
+            self.set_precision("fp32")
+        else:
+            self.device = "cpu"
+            self.is_half = False
+            self.set_precision("fp32")
+
+        # Configuration for 6GB GPU memory
+        x_pad, x_query, x_center, x_max = (
+            (3, 10, 60, 65) if self.is_half else (1, 6, 38, 41)
+        )
+        if self.gpu_mem is not None and self.gpu_mem <= 4:
+            # Configuration for 5GB GPU memory
+            x_pad, x_query, x_center, x_max = (1, 5, 30, 32)
+
+        return x_pad, x_query, x_center, x_max
+
+    def set_cuda_config(self):
+        i_device = int(self.device.split(":")[-1])
+        self.gpu_name = torch.cuda.get_device_name(i_device)
+        low_end_gpus = ["16", "P40", "P10", "1060", "1070", "1080"]
+        if (
+            any(gpu in self.gpu_name for gpu in low_end_gpus)
+            and "V100" not in self.gpu_name.upper()
+        ):
+            self.is_half = False
+            self.set_precision("fp32")
+
+        self.gpu_mem = torch.cuda.get_device_properties(i_device).total_memory // (
+            1024**3
+        )
+
+
+def max_vram_gpu(gpu):
+    if torch.cuda.is_available():
+        gpu_properties = torch.cuda.get_device_properties(gpu)
+        total_memory_gb = round(gpu_properties.total_memory / 1024 / 1024 / 1024)
+        return total_memory_gb
+    else:
+        return "8"
+
+
+def get_gpu_info():
+    ngpu = torch.cuda.device_count()
+    gpu_infos = []
+    if torch.cuda.is_available() or ngpu != 0:
+        for i in range(ngpu):
+            gpu_name = torch.cuda.get_device_name(i)
+            mem = int(
+                torch.cuda.get_device_properties(i).total_memory / 1024 / 1024 / 1024
+                + 0.4
+            )
+            gpu_infos.append(f"{i}: {gpu_name} ({mem} GB)")
+    if len(gpu_infos) > 0:
+        gpu_info = "\n".join(gpu_infos)
+    else:
+        gpu_info = "Unfortunately, there is no compatible GPU available to support your training."
+    return gpu_info
+
+
+def get_number_of_gpus():
+    if torch.cuda.is_available():
+        num_gpus = torch.cuda.device_count()
+        return "-".join(map(str, range(num_gpus)))
+    else:
+        return "-"
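A quick illustration (not part of the diff) of how the singleton decorator above behaves: every call to Config() returns the same cached instance, and device_config() has already selected the padding/query window sizes at construction time. This sketch assumes the working directory is the repo root so the relative "rvc/configs/..." paths in load_config_json() resolve.

# Hypothetical usage sketch, not one of the uploaded files.
from rvc.configs.config import Config

config_a = Config()
config_b = Config()
assert config_a is config_b  # the singleton decorator caches the first instance

print(config_a.device)       # e.g. "cuda:0", "mps", or "cpu"
print(config_a.is_half)      # half precision only when a capable CUDA GPU is found
print(config_a.x_pad, config_a.x_query, config_a.x_center, config_a.x_max)
print(config_a.get_precision())  # reads fp16_run from the first config JSON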
rvc/configs/v1/32000.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sample_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "text_enc_hidden_dim": 256,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,4,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
rvc/configs/v1/40000.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sample_rate": 40000,
+    "filter_length": 2048,
+    "hop_length": 400,
+    "win_length": 2048,
+    "n_mel_channels": 125,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "text_enc_hidden_dim": 256,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
rvc/configs/v1/48000.json
ADDED
@@ -0,0 +1,47 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 11520,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sample_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "text_enc_hidden_dim": 256,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
rvc/configs/v2/32000.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sample_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "text_enc_hidden_dim": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [20,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
rvc/configs/v2/40000.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sample_rate": 40000,
+    "filter_length": 2048,
+    "hop_length": 400,
+    "win_length": 2048,
+    "n_mel_channels": 125,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "text_enc_hidden_dim": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
rvc/configs/v2/48000.json
ADDED
@@ -0,0 +1,43 @@
+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 17280,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sample_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "text_enc_hidden_dim": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [12,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [24,20,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}
rvc/infer/infer.py
ADDED
@@ -0,0 +1,495 @@
+import os
+import sys
+import time
+import torch
+import librosa
+import logging
+import traceback
+import numpy as np
+import soundfile as sf
+import noisereduce as nr
+from pedalboard import (
+    Pedalboard,
+    Chorus,
+    Distortion,
+    Reverb,
+    PitchShift,
+    Limiter,
+    Gain,
+    Bitcrush,
+    Clipping,
+    Compressor,
+    Delay,
+)
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from rvc.infer.pipeline import Pipeline as VC
+from rvc.lib.utils import load_audio_infer, load_embedding
+from rvc.lib.tools.split_audio import process_audio, merge_audio
+from rvc.lib.algorithm.synthesizers import Synthesizer
+from rvc.configs.config import Config
+
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.getLogger("httpcore").setLevel(logging.WARNING)
+logging.getLogger("faiss").setLevel(logging.WARNING)
+logging.getLogger("faiss.loader").setLevel(logging.WARNING)
+
+
+class VoiceConverter:
+    """
+    A class for performing voice conversion using the Retrieval-Based Voice Conversion (RVC) method.
+    """
+
+    def __init__(self):
+        """
+        Initializes the VoiceConverter with default configuration, and sets up models and parameters.
+        """
+        self.config = Config()  # Load RVC configuration
+        self.hubert_model = (
+            None  # Initialize the Hubert model (for embedding extraction)
+        )
+        self.last_embedder_model = None  # Last used embedder model
+        self.tgt_sr = None  # Target sampling rate for the output audio
+        self.net_g = None  # Generator network for voice conversion
+        self.vc = None  # Voice conversion pipeline instance
+        self.cpt = None  # Checkpoint for loading model weights
+        self.version = None  # Model version
+        self.n_spk = None  # Number of speakers in the model
+        self.use_f0 = None  # Whether the model uses F0
+        self.loaded_model = None
+
+    def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
+        """
+        Loads the HuBERT model for speaker embedding extraction.
+
+        Args:
+            embedder_model (str): Path to the pre-trained HuBERT model.
+            embedder_model_custom (str): Path to the custom HuBERT model.
+        """
+        self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
+        self.hubert_model.to(self.config.device)
+        self.hubert_model = (
+            self.hubert_model.half()
+            if self.config.is_half
+            else self.hubert_model.float()
+        )
+        self.hubert_model.eval()
+
+    @staticmethod
+    def remove_audio_noise(data, sr, reduction_strength=0.7):
+        """
+        Removes noise from an audio file using the NoiseReduce library.
+
+        Args:
+            data (numpy.ndarray): The audio data as a NumPy array.
+            sr (int): The sample rate of the audio data.
+            reduction_strength (float): Strength of the noise reduction. Default is 0.7.
+        """
+        try:
+            reduced_noise = nr.reduce_noise(
+                y=data, sr=sr, prop_decrease=reduction_strength
+            )
+            return reduced_noise
+        except Exception as error:
+            print(f"An error occurred removing audio noise: {error}")
+            return None
+
+    @staticmethod
+    def convert_audio_format(input_path, output_path, output_format):
+        """
+        Converts an audio file to a specified output format.
+
+        Args:
+            input_path (str): Path to the input audio file.
+            output_path (str): Path to the output audio file.
+            output_format (str): Desired audio format (e.g., "WAV", "MP3").
+        """
+        try:
+            if output_format != "WAV":
+                print(f"Saving audio as {output_format}...")
+                audio, sample_rate = librosa.load(input_path, sr=None)
+                common_sample_rates = [
+                    8000,
+                    11025,
+                    12000,
+                    16000,
+                    22050,
+                    24000,
+                    32000,
+                    44100,
+                    48000,
+                ]
+                target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
+                audio = librosa.resample(
+                    audio, orig_sr=sample_rate, target_sr=target_sr
+                )
+                sf.write(output_path, audio, target_sr, format=output_format.lower())
+            return output_path
+        except Exception as error:
+            print(f"An error occurred converting the audio format: {error}")
+
+    @staticmethod
+    def post_process_audio(
+        audio_input,
+        sample_rate,
+        **kwargs,
+    ):
+        board = Pedalboard()
+        if kwargs.get("reverb", False):
+            reverb = Reverb(
+                room_size=kwargs.get("reverb_room_size", 0.5),
+                damping=kwargs.get("reverb_damping", 0.5),
+                wet_level=kwargs.get("reverb_wet_level", 0.33),
+                dry_level=kwargs.get("reverb_dry_level", 0.4),
+                width=kwargs.get("reverb_width", 1.0),
+                freeze_mode=kwargs.get("reverb_freeze_mode", 0),
+            )
+            board.append(reverb)
+        if kwargs.get("pitch_shift", False):
+            pitch_shift = PitchShift(semitones=kwargs.get("pitch_shift_semitones", 0))
+            board.append(pitch_shift)
+        if kwargs.get("limiter", False):
+            limiter = Limiter(
+                threshold_db=kwargs.get("limiter_threshold", -6),
+                release_ms=kwargs.get("limiter_release", 0.05),
+            )
+            board.append(limiter)
+        if kwargs.get("gain", False):
+            gain = Gain(gain_db=kwargs.get("gain_db", 0))
+            board.append(gain)
+        if kwargs.get("distortion", False):
+            distortion = Distortion(drive_db=kwargs.get("distortion_gain", 25))
+            board.append(distortion)
+        if kwargs.get("chorus", False):
+            chorus = Chorus(
+                rate_hz=kwargs.get("chorus_rate", 1.0),
+                depth=kwargs.get("chorus_depth", 0.25),
+                centre_delay_ms=kwargs.get("chorus_delay", 7),
+                feedback=kwargs.get("chorus_feedback", 0.0),
+                mix=kwargs.get("chorus_mix", 0.5),
+            )
+            board.append(chorus)
+        if kwargs.get("bitcrush", False):
+            bitcrush = Bitcrush(bit_depth=kwargs.get("bitcrush_bit_depth", 8))
+            board.append(bitcrush)
+        if kwargs.get("clipping", False):
+            clipping = Clipping(threshold_db=kwargs.get("clipping_threshold", 0))
+            board.append(clipping)
+        if kwargs.get("compressor", False):
+            compressor = Compressor(
+                threshold_db=kwargs.get("compressor_threshold", 0),
+                ratio=kwargs.get("compressor_ratio", 1),
+                attack_ms=kwargs.get("compressor_attack", 1.0),
+                release_ms=kwargs.get("compressor_release", 100),
+            )
+            board.append(compressor)
+        if kwargs.get("delay", False):
+            delay = Delay(
+                delay_seconds=kwargs.get("delay_seconds", 0.5),
+                feedback=kwargs.get("delay_feedback", 0.0),
+                mix=kwargs.get("delay_mix", 0.5),
+            )
+            board.append(delay)
+        return board(audio_input, sample_rate)
+
+    def convert_audio(
+        self,
+        audio_input_path: str,
+        audio_output_path: str,
+        model_path: str,
+        index_path: str,
+        pitch: int = 0,
+        f0_file: str = None,
+        f0_method: str = "rmvpe",
+        index_rate: float = 0.75,
+        volume_envelope: float = 1,
+        protect: float = 0.5,
+        hop_length: int = 128,
+        split_audio: bool = False,
+        f0_autotune: bool = False,
+        f0_autotune_strength: float = 1,
+        filter_radius: int = 3,
+        embedder_model: str = "contentvec",
+        embedder_model_custom: str = None,
+        clean_audio: bool = False,
+        clean_strength: float = 0.5,
+        export_format: str = "WAV",
+        upscale_audio: bool = False,
+        post_process: bool = False,
+        resample_sr: int = 0,
+        sid: int = 0,
+        **kwargs,
+    ):
+        """
+        Performs voice conversion on the input audio.
+
+        Args:
+            pitch (int): Key for F0 up-sampling.
+            filter_radius (int): Radius for filtering.
+            index_rate (float): Rate for index matching.
+            volume_envelope (int): RMS mix rate.
+            protect (float): Protection rate for certain audio segments.
+            hop_length (int): Hop length for audio processing.
+            f0_method (str): Method for F0 extraction.
+            audio_input_path (str): Path to the input audio file.
+            audio_output_path (str): Path to the output audio file.
+            model_path (str): Path to the voice conversion model.
+            index_path (str): Path to the index file.
+            split_audio (bool): Whether to split the audio for processing.
+            f0_autotune (bool): Whether to use F0 autotune.
+            clean_audio (bool): Whether to clean the audio.
+            clean_strength (float): Strength of the audio cleaning.
+            export_format (str): Format for exporting the audio.
+            upscale_audio (bool): Whether to upscale the audio.
+            f0_file (str): Path to the F0 file.
+            embedder_model (str): Path to the embedder model.
+            embedder_model_custom (str): Path to the custom embedder model.
+            resample_sr (int, optional): Resample sampling rate. Default is 0.
+            sid (int, optional): Speaker ID. Default is 0.
+            **kwargs: Additional keyword arguments.
+        """
+        self.get_vc(model_path, sid)
+        try:
+            start_time = time.time()
+            print(f"Converting audio '{audio_input_path}'...")
+
+            audio = load_audio_infer(
+                audio_input_path,
+                16000,
+                **kwargs,
+            )
+            audio_max = np.abs(audio).max() / 0.95
+
+            if audio_max > 1:
+                audio /= audio_max
+
+            if not self.hubert_model or embedder_model != self.last_embedder_model:
+                self.load_hubert(embedder_model, embedder_model_custom)
+                self.last_embedder_model = embedder_model
+
+            file_index = (
+                index_path.strip()
+                .strip('"')
+                .strip("\n")
+                .strip('"')
+                .strip()
+                .replace("trained", "added")
+            )
+
+            if self.tgt_sr != resample_sr >= 16000:
+                self.tgt_sr = resample_sr
+
+            if split_audio:
+                chunks, intervals = process_audio(audio, 16000)
+                print(f"Audio split into {len(chunks)} chunks for processing.")
+            else:
+                chunks = []
+                chunks.append(audio)
+
+            converted_chunks = []
+            for c in chunks:
+                audio_opt = self.vc.pipeline(
+                    model=self.hubert_model,
+                    net_g=self.net_g,
+                    sid=sid,
+                    audio=c,
+                    pitch=pitch,
+                    f0_method=f0_method,
+                    file_index=file_index,
+                    index_rate=index_rate,
+                    pitch_guidance=self.use_f0,
+                    filter_radius=filter_radius,
+                    volume_envelope=volume_envelope,
+                    version=self.version,
+                    protect=protect,
+                    hop_length=hop_length,
+                    f0_autotune=f0_autotune,
+                    f0_autotune_strength=f0_autotune_strength,
+                    f0_file=f0_file,
+                )
+                converted_chunks.append(audio_opt)
+                if split_audio:
+                    print(f"Converted audio chunk {len(converted_chunks)}")
+
+            if split_audio:
+                audio_opt = merge_audio(converted_chunks, intervals, 16000, self.tgt_sr)
+            else:
+                audio_opt = converted_chunks[0]
+
+            if clean_audio:
+                cleaned_audio = self.remove_audio_noise(
+                    audio_opt, self.tgt_sr, clean_strength
+                )
+                if cleaned_audio is not None:
+                    audio_opt = cleaned_audio
+
+            if post_process:
+                audio_opt = self.post_process_audio(
+                    audio_input=audio_opt,
+                    sample_rate=self.tgt_sr,
+                    **kwargs,
+                )
+
+            sf.write(audio_output_path, audio_opt, self.tgt_sr, format="WAV")
+            output_path_format = audio_output_path.replace(
+                ".wav", f".{export_format.lower()}"
+            )
+            audio_output_path = self.convert_audio_format(
+                audio_output_path, output_path_format, export_format
+            )
+
+            elapsed_time = time.time() - start_time
+            print(
+                f"Conversion completed at '{audio_output_path}' in {elapsed_time:.2f} seconds."
+            )
+        except Exception as error:
+            print(f"An error occurred during audio conversion: {error}")
+            print(traceback.format_exc())
+
+    def convert_audio_batch(
+        self,
+        audio_input_paths: str,
+        audio_output_path: str,
+        **kwargs,
+    ):
+        """
+        Performs voice conversion on a batch of input audio files.
+
+        Args:
+            audio_input_paths (str): List of paths to the input audio files.
+            audio_output_path (str): Path to the output audio file.
+            resample_sr (int, optional): Resample sampling rate. Default is 0.
+            sid (int, optional): Speaker ID. Default is 0.
+            **kwargs: Additional keyword arguments.
+        """
+        pid = os.getpid()
+        try:
+            with open(
+                os.path.join(now_dir, "assets", "infer_pid.txt"), "w"
+            ) as pid_file:
+                pid_file.write(str(pid))
+            start_time = time.time()
+            print(f"Converting audio batch '{audio_input_paths}'...")
+            audio_files = [
+                f
+                for f in os.listdir(audio_input_paths)
+                if f.endswith(
+                    (
+                        "wav",
+                        "mp3",
+                        "flac",
+                        "ogg",
+                        "opus",
+                        "m4a",
+                        "mp4",
+                        "aac",
+                        "alac",
+                        "wma",
+                        "aiff",
+                        "webm",
+                        "ac3",
+                    )
+                )
+            ]
+            print(f"Detected {len(audio_files)} audio files for inference.")
+            for a in audio_files:
+                new_input = os.path.join(audio_input_paths, a)
+                new_output = os.path.splitext(a)[0] + "_output.wav"
+                new_output = os.path.join(audio_output_path, new_output)
+                if os.path.exists(new_output):
+                    continue
+                self.convert_audio(
+                    audio_input_path=new_input,
+                    audio_output_path=new_output,
+                    **kwargs,
+                )
+            print(f"Conversion completed at '{audio_input_paths}'.")
+            elapsed_time = time.time() - start_time
+            print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
+        except Exception as error:
+            print(f"An error occurred during audio batch conversion: {error}")
+            print(traceback.format_exc())
+        finally:
+            os.remove(os.path.join(now_dir, "assets", "infer_pid.txt"))
+
+    def get_vc(self, weight_root, sid):
+        """
+        Loads the voice conversion model and sets up the pipeline.
+
+        Args:
+            weight_root (str): Path to the model weights.
+            sid (int): Speaker ID.
+        """
+        if sid == "" or sid == []:
+            self.cleanup_model()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+        if not self.loaded_model or self.loaded_model != weight_root:
+            self.load_model(weight_root)
+            if self.cpt is not None:
+                self.setup_network()
+                self.setup_vc_instance()
+            self.loaded_model = weight_root
+
+    def cleanup_model(self):
+        """
+        Cleans up the model and releases resources.
+        """
+        if self.hubert_model is not None:
+            del self.net_g, self.n_spk, self.vc, self.hubert_model, self.tgt_sr
+            self.hubert_model = self.net_g = self.n_spk = self.vc = self.tgt_sr = None
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+
+        del self.net_g, self.cpt
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        self.cpt = None
+
+    def load_model(self, weight_root):
+        """
+        Loads the model weights from the specified path.
+
+        Args:
+            weight_root (str): Path to the model weights.
+        """
+        self.cpt = (
+            torch.load(weight_root, map_location="cpu")
+            if os.path.isfile(weight_root)
+            else None
+        )
+
+    def setup_network(self):
+        """
+        Sets up the network configuration based on the loaded checkpoint.
+        """
+        if self.cpt is not None:
+            self.tgt_sr = self.cpt["config"][-1]
+            self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0]
+            self.use_f0 = self.cpt.get("f0", 1)
+
+            self.version = self.cpt.get("version", "v1")
+            self.text_enc_hidden_dim = 768 if self.version == "v2" else 256
+            self.net_g = Synthesizer(
+                *self.cpt["config"],
+                use_f0=self.use_f0,
+                text_enc_hidden_dim=self.text_enc_hidden_dim,
+                is_half=self.config.is_half,
+            )
+            del self.net_g.enc_q
+            self.net_g.load_state_dict(self.cpt["weight"], strict=False)
+            self.net_g.eval().to(self.config.device)
+            self.net_g = (
+                self.net_g.half() if self.config.is_half else self.net_g.float()
+            )
+
+    def setup_vc_instance(self):
+        """
+        Sets up the voice conversion pipeline instance based on the target sampling rate and configuration.
+        """
+        if self.cpt is not None:
+            self.vc = VC(self.tgt_sr, self.config)
+            self.n_spk = self.cpt["config"][-3]
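A minimal, hypothetical call into the class above, not part of the uploaded files: the model and index paths are placeholders, and the keyword values simply mirror the defaults of convert_audio().

# Hypothetical usage sketch (paths are placeholders).
from rvc.infer.infer import VoiceConverter

converter = VoiceConverter()
converter.convert_audio(
    audio_input_path="input.wav",
    audio_output_path="output.wav",
    model_path="logs/my_model/my_model.pth",
    index_path="logs/my_model/added_my_model.index",
    pitch=0,                # semitone shift applied to the extracted F0
    f0_method="rmvpe",      # or "crepe", "crepe-tiny", "fcpe", "hybrid[rmvpe+fcpe]"
    index_rate=0.75,        # blend of retrieved speaker features from the FAISS index
    protect=0.5,            # values below 0.5 enable the pitch-protection blending
    split_audio=False,
    export_format="WAV",
)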
rvc/infer/pipeline.py
ADDED
@@ -0,0 +1,708 @@
+import os
+import gc
+import re
+import sys
+import torch
+import torch.nn.functional as F
+import torchcrepe
+import faiss
+import librosa
+import numpy as np
+from scipy import signal
+from torch import Tensor
+
+now_dir = os.getcwd()
+sys.path.append(now_dir)
+
+from rvc.lib.predictors.RMVPE import RMVPE0Predictor
+from rvc.lib.predictors.FCPE import FCPEF0Predictor
+
+import logging
+
+logging.getLogger("faiss").setLevel(logging.WARNING)
+
+# Constants for high-pass filter
+FILTER_ORDER = 5
+CUTOFF_FREQUENCY = 48  # Hz
+SAMPLE_RATE = 16000  # Hz
+bh, ah = signal.butter(
+    N=FILTER_ORDER, Wn=CUTOFF_FREQUENCY, btype="high", fs=SAMPLE_RATE
+)
+
+input_audio_path2wav = {}
+
+
+class AudioProcessor:
+    """
+    A class for processing audio signals, specifically for adjusting RMS levels.
+    """
+
+    def change_rms(
+        source_audio: np.ndarray,
+        source_rate: int,
+        target_audio: np.ndarray,
+        target_rate: int,
+        rate: float,
+    ) -> np.ndarray:
+        """
+        Adjust the RMS level of target_audio to match the RMS of source_audio, with a given blending rate.
+
+        Args:
+            source_audio: The source audio signal as a NumPy array.
+            source_rate: The sampling rate of the source audio.
+            target_audio: The target audio signal to adjust.
+            target_rate: The sampling rate of the target audio.
+            rate: The blending rate between the source and target RMS levels.
+        """
+        # Calculate RMS of both audio data
+        rms1 = librosa.feature.rms(
+            y=source_audio,
+            frame_length=source_rate // 2 * 2,
+            hop_length=source_rate // 2,
+        )
+        rms2 = librosa.feature.rms(
+            y=target_audio,
+            frame_length=target_rate // 2 * 2,
+            hop_length=target_rate // 2,
+        )
+
+        # Interpolate RMS to match target audio length
+        rms1 = F.interpolate(
+            torch.from_numpy(rms1).float().unsqueeze(0),
+            size=target_audio.shape[0],
+            mode="linear",
+        ).squeeze()
+        rms2 = F.interpolate(
+            torch.from_numpy(rms2).float().unsqueeze(0),
+            size=target_audio.shape[0],
+            mode="linear",
+        ).squeeze()
+        rms2 = torch.maximum(rms2, torch.zeros_like(rms2) + 1e-6)
+
+        # Adjust target audio RMS based on the source audio RMS
+        adjusted_audio = (
+            target_audio
+            * (torch.pow(rms1, 1 - rate) * torch.pow(rms2, rate - 1)).numpy()
+        )
+        return adjusted_audio
+
+
+class Autotune:
+    """
+    A class for applying autotune to a given fundamental frequency (F0) contour.
+    """
+
+    def __init__(self, ref_freqs):
+        """
+        Initializes the Autotune class with a set of reference frequencies.
+
+        Args:
+            ref_freqs: A list of reference frequencies representing musical notes.
+        """
+        self.ref_freqs = ref_freqs
+        self.note_dict = self.ref_freqs  # No interpolation needed
+
+    def autotune_f0(self, f0, f0_autotune_strength):
+        """
+        Autotunes a given F0 contour by snapping each frequency to the closest reference frequency.
+
+        Args:
+            f0: The input F0 contour as a NumPy array.
+        """
+        autotuned_f0 = np.zeros_like(f0)
+        for i, freq in enumerate(f0):
+            closest_note = min(self.note_dict, key=lambda x: abs(x - freq))
+            autotuned_f0[i] = freq + (closest_note - freq) * f0_autotune_strength
+        return autotuned_f0
+
+
+class Pipeline:
+    """
+    The main pipeline class for performing voice conversion, including preprocessing, F0 estimation,
+    voice conversion using a model, and post-processing.
+    """
+
+    def __init__(self, tgt_sr, config):
+        """
+        Initializes the Pipeline class with target sampling rate and configuration parameters.
+
+        Args:
+            tgt_sr: The target sampling rate for the output audio.
+            config: A configuration object containing various parameters for the pipeline.
+        """
+        self.x_pad = config.x_pad
+        self.x_query = config.x_query
+        self.x_center = config.x_center
+        self.x_max = config.x_max
+        self.is_half = config.is_half
+        self.sample_rate = 16000
+        self.window = 160
+        self.t_pad = self.sample_rate * self.x_pad
+        self.t_pad_tgt = tgt_sr * self.x_pad
+        self.t_pad2 = self.t_pad * 2
+        self.t_query = self.sample_rate * self.x_query
+        self.t_center = self.sample_rate * self.x_center
+        self.t_max = self.sample_rate * self.x_max
+        self.time_step = self.window / self.sample_rate * 1000
+        self.f0_min = 50
+        self.f0_max = 1100
+        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
+        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
+        self.device = config.device
+        self.ref_freqs = [
+            49.00,  # G1
+            51.91,  # G#1 / Ab1
+            55.00,  # A1
+            58.27,  # A#1 / Bb1
+            61.74,  # B1
+            65.41,  # C2
+            69.30,  # C#2 / Db2
+            73.42,  # D2
+            77.78,  # D#2 / Eb2
+            82.41,  # E2
+            87.31,  # F2
+            92.50,  # F#2 / Gb2
+            98.00,  # G2
+            103.83,  # G#2 / Ab2
+            110.00,  # A2
+            116.54,  # A#2 / Bb2
+            123.47,  # B2
+            130.81,  # C3
+            138.59,  # C#3 / Db3
+            146.83,  # D3
+            155.56,  # D#3 / Eb3
+            164.81,  # E3
+            174.61,  # F3
+            185.00,  # F#3 / Gb3
+            196.00,  # G3
+            207.65,  # G#3 / Ab3
+            220.00,  # A3
+            233.08,  # A#3 / Bb3
+            246.94,  # B3
+            261.63,  # C4
+            277.18,  # C#4 / Db4
+            293.66,  # D4
+            311.13,  # D#4 / Eb4
+            329.63,  # E4
+            349.23,  # F4
+            369.99,  # F#4 / Gb4
+            392.00,  # G4
+            415.30,  # G#4 / Ab4
+            440.00,  # A4
+            466.16,  # A#4 / Bb4
+            493.88,  # B4
+            523.25,  # C5
+            554.37,  # C#5 / Db5
+            587.33,  # D5
+            622.25,  # D#5 / Eb5
+            659.25,  # E5
+            698.46,  # F5
+            739.99,  # F#5 / Gb5
+            783.99,  # G5
+            830.61,  # G#5 / Ab5
+            880.00,  # A5
+            932.33,  # A#5 / Bb5
+            987.77,  # B5
+            1046.50,  # C6
+        ]
+        self.autotune = Autotune(self.ref_freqs)
+        self.note_dict = self.autotune.note_dict
+        self.model_rmvpe = RMVPE0Predictor(
+            os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
+            is_half=self.is_half,
+            device=self.device,
+        )
+
+    def get_f0_crepe(
+        self,
+        x,
+        f0_min,
+        f0_max,
+        p_len,
+        hop_length,
+        model="full",
+    ):
+        """
+        Estimates the fundamental frequency (F0) of a given audio signal using the Crepe model.
+
+        Args:
+            x: The input audio signal as a NumPy array.
+            f0_min: Minimum F0 value to consider.
+            f0_max: Maximum F0 value to consider.
+            p_len: Desired length of the F0 output.
+            hop_length: Hop length for the Crepe model.
+            model: Crepe model size to use ("full" or "tiny").
+        """
+        x = x.astype(np.float32)
+        x /= np.quantile(np.abs(x), 0.999)
+        audio = torch.from_numpy(x).to(self.device, copy=True)
+        audio = torch.unsqueeze(audio, dim=0)
+        if audio.ndim == 2 and audio.shape[0] > 1:
+            audio = torch.mean(audio, dim=0, keepdim=True).detach()
+        audio = audio.detach()
+        pitch: Tensor = torchcrepe.predict(
+            audio,
+            self.sample_rate,
+            hop_length,
+            f0_min,
+            f0_max,
+            model,
+            batch_size=hop_length * 2,
+            device=self.device,
+            pad=True,
+        )
+        p_len = p_len or x.shape[0] // hop_length
+        source = np.array(pitch.squeeze(0).cpu().float().numpy())
+        source[source < 0.001] = np.nan
+        target = np.interp(
+            np.arange(0, len(source) * p_len, len(source)) / p_len,
+            np.arange(0, len(source)),
+            source,
+        )
+        f0 = np.nan_to_num(target)
+        return f0
+
+    def get_f0_hybrid(
+        self,
+        methods_str,
+        x,
+        f0_min,
+        f0_max,
+        p_len,
+        hop_length,
+    ):
+        """
+        Estimates the fundamental frequency (F0) using a hybrid approach combining multiple methods.
+
+        Args:
+            methods_str: A string specifying the methods to combine (e.g., "hybrid[crepe+rmvpe]").
+            x: The input audio signal as a NumPy array.
+            f0_min: Minimum F0 value to consider.
+            f0_max: Maximum F0 value to consider.
+            p_len: Desired length of the F0 output.
+            hop_length: Hop length for F0 estimation methods.
+        """
+        methods_str = re.search("hybrid\[(.+)\]", methods_str)
+        if methods_str:
+            methods = [method.strip() for method in methods_str.group(1).split("+")]
+        f0_computation_stack = []
+        print(f"Calculating f0 pitch estimations for methods: {', '.join(methods)}")
+        x = x.astype(np.float32)
+        x /= np.quantile(np.abs(x), 0.999)
+        for method in methods:
+            f0 = None
+            if method == "crepe":
+                f0 = self.get_f0_crepe_computation(
+                    x, f0_min, f0_max, p_len, int(hop_length)
+                )
+            elif method == "rmvpe":
+                f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+                f0 = f0[1:]
+            elif method == "fcpe":
+                self.model_fcpe = FCPEF0Predictor(
+                    os.path.join("rvc", "models", "predictors", "fcpe.pt"),
+                    f0_min=int(f0_min),
+                    f0_max=int(f0_max),
+                    dtype=torch.float32,
+                    device=self.device,
+                    sample_rate=self.sample_rate,
+                    threshold=0.03,
+                )
+                f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
+                del self.model_fcpe
+                gc.collect()
+            f0_computation_stack.append(f0)
+
+        f0_computation_stack = [fc for fc in f0_computation_stack if fc is not None]
+        f0_median_hybrid = None
+        if len(f0_computation_stack) == 1:
+            f0_median_hybrid = f0_computation_stack[0]
+        else:
+            f0_median_hybrid = np.nanmedian(f0_computation_stack, axis=0)
+        return f0_median_hybrid
+
+    def get_f0(
+        self,
+        input_audio_path,
+        x,
+        p_len,
+        pitch,
+        f0_method,
+        filter_radius,
+        hop_length,
+        f0_autotune,
+        f0_autotune_strength,
+        inp_f0=None,
+    ):
+        """
+        Estimates the fundamental frequency (F0) of a given audio signal using various methods.
+
+        Args:
+            input_audio_path: Path to the input audio file.
+            x: The input audio signal as a NumPy array.
+            p_len: Desired length of the F0 output.
+            pitch: Key to adjust the pitch of the F0 contour.
+            f0_method: Method to use for F0 estimation (e.g., "crepe").
+            filter_radius: Radius for median filtering the F0 contour.
+            hop_length: Hop length for F0 estimation methods.
+            f0_autotune: Whether to apply autotune to the F0 contour.
+            inp_f0: Optional input F0 contour to use instead of estimating.
+        """
+        global input_audio_path2wav
+        if f0_method == "crepe":
+            f0 = self.get_f0_crepe(x, self.f0_min, self.f0_max, p_len, int(hop_length))
+        elif f0_method == "crepe-tiny":
+            f0 = self.get_f0_crepe(
+                x, self.f0_min, self.f0_max, p_len, int(hop_length), "tiny"
+            )
+        elif f0_method == "rmvpe":
+            f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
+        elif f0_method == "fcpe":
+            self.model_fcpe = FCPEF0Predictor(
+                os.path.join("rvc", "models", "predictors", "fcpe.pt"),
+                f0_min=int(self.f0_min),
+                f0_max=int(self.f0_max),
+                dtype=torch.float32,
+                device=self.device,
+                sample_rate=self.sample_rate,
+                threshold=0.03,
+            )
+            f0 = self.model_fcpe.compute_f0(x, p_len=p_len)
+            del self.model_fcpe
+            gc.collect()
+        elif "hybrid" in f0_method:
+            input_audio_path2wav[input_audio_path] = x.astype(np.double)
+            f0 = self.get_f0_hybrid(
+                f0_method,
+                x,
+                self.f0_min,
+                self.f0_max,
+                p_len,
+                hop_length,
+            )
+
+        if f0_autotune is True:
+            f0 = Autotune.autotune_f0(self, f0, f0_autotune_strength)
+
+        f0 *= pow(2, pitch / 12)
+        tf0 = self.sample_rate // self.window
+        if inp_f0 is not None:
+            delta_t = np.round(
+                (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1
+            ).astype("int16")
+            replace_f0 = np.interp(
+                list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1]
+            )
+            shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0]
+            f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[
+                :shape
+            ]
+        f0bak = f0.copy()
+        f0_mel = 1127 * np.log(1 + f0 / 700)
+        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (
+            self.f0_mel_max - self.f0_mel_min
+        ) + 1
+        f0_mel[f0_mel <= 1] = 1
+        f0_mel[f0_mel > 255] = 255
+        f0_coarse = np.rint(f0_mel).astype(int)
+
+        return f0_coarse, f0bak
+
+    def voice_conversion(
+        self,
+        model,
+        net_g,
+        sid,
+        audio0,
+        pitch,
+        pitchf,
+        index,
+        big_npy,
+        index_rate,
+        version,
+        protect,
+    ):
+        """
+        Performs voice conversion on a given audio segment.
+
+        Args:
+            model: The feature extractor model.
+            net_g: The generative model for synthesizing speech.
+            sid: Speaker ID for the target voice.
+            audio0: The input audio segment.
+            pitch: Quantized F0 contour for pitch guidance.
+            pitchf: Original F0 contour for pitch guidance.
+            index: FAISS index for speaker embedding retrieval.
+            big_npy: Speaker embeddings stored in a NumPy array.
+            index_rate: Blending rate for speaker embedding retrieval.
+            version: Model version ("v1" or "v2").
+            protect: Protection level for preserving the original pitch.
+        """
+        with torch.no_grad():
+            pitch_guidance = pitch != None and pitchf != None
+            # prepare source audio
+            feats = (
+                torch.from_numpy(audio0).half()
+                if self.is_half
+                else torch.from_numpy(audio0).float()
+            )
+            feats = feats.mean(-1) if feats.dim() == 2 else feats
+            assert feats.dim() == 1, feats.dim()
+            feats = feats.view(1, -1).to(self.device)
+            # extract features
+            feats = model(feats)["last_hidden_state"]
+            feats = (
+                model.final_proj(feats[0]).unsqueeze(0) if version == "v1" else feats
+            )
+            # make a copy for pitch guidance and protection
+            feats0 = feats.clone() if pitch_guidance else None
+            if (
+                index
+            ):  # set by parent function, only true if index is available, loaded, and index rate > 0
+                feats = self._retrieve_speaker_embeddings(
+                    feats, index, big_npy, index_rate
+                )
+            # feature upsampling
+            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
+            # adjust the length if the audio is short
+            p_len = min(audio0.shape[0] // self.window, feats.shape[1])
+            if pitch_guidance:
+                feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                    0, 2, 1
+                )
+                pitch, pitchf = pitch[:, :p_len], pitchf[:, :p_len]
+                # Pitch protection blending
+                if protect < 0.5:
+                    pitchff = pitchf.clone()
+                    pitchff[pitchf > 0] = 1
+                    pitchff[pitchf < 1] = protect
+                    feats = feats * pitchff.unsqueeze(-1) + feats0 * (
+                        1 - pitchff.unsqueeze(-1)
+                    )
+                    feats = feats.to(feats0.dtype)
+            else:
+                pitch, pitchf = None, None
+            p_len = torch.tensor([p_len], device=self.device).long()
+            audio1 = (
+                (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0])
+                .data.cpu()
+                .float()
+                .numpy()
+            )
+            # clean up
+            del feats, feats0, p_len
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+            return audio1
+
+    def _retrieve_speaker_embeddings(self, feats, index, big_npy, index_rate):
+        npy = feats[0].cpu().numpy()
npy = npy.astype("float32") if self.is_half else npy
|
503 |
+
score, ix = index.search(npy, k=8)
|
504 |
+
weight = np.square(1 / score)
|
505 |
+
weight /= weight.sum(axis=1, keepdims=True)
|
506 |
+
npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
|
507 |
+
npy = npy.astype("float16") if self.is_half else npy
|
508 |
+
feats = (
|
509 |
+
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
|
510 |
+
+ (1 - index_rate) * feats
|
511 |
+
)
|
512 |
+
return feats
|
513 |
+
|
514 |
+
def pipeline(
|
515 |
+
self,
|
516 |
+
model,
|
517 |
+
net_g,
|
518 |
+
sid,
|
519 |
+
audio,
|
520 |
+
pitch,
|
521 |
+
f0_method,
|
522 |
+
file_index,
|
523 |
+
index_rate,
|
524 |
+
pitch_guidance,
|
525 |
+
filter_radius,
|
526 |
+
volume_envelope,
|
527 |
+
version,
|
528 |
+
protect,
|
529 |
+
hop_length,
|
530 |
+
f0_autotune,
|
531 |
+
f0_autotune_strength,
|
532 |
+
f0_file,
|
533 |
+
):
|
534 |
+
"""
|
535 |
+
The main pipeline function for performing voice conversion.
|
536 |
+
|
537 |
+
Args:
|
538 |
+
model: The feature extractor model.
|
539 |
+
net_g: The generative model for synthesizing speech.
|
540 |
+
sid: Speaker ID for the target voice.
|
541 |
+
audio: The input audio signal.
|
542 |
+
input_audio_path: Path to the input audio file.
|
543 |
+
pitch: Key to adjust the pitch of the F0 contour.
|
544 |
+
f0_method: Method to use for F0 estimation.
|
545 |
+
file_index: Path to the FAISS index file for speaker embedding retrieval.
|
546 |
+
index_rate: Blending rate for speaker embedding retrieval.
|
547 |
+
pitch_guidance: Whether to use pitch guidance during voice conversion.
|
548 |
+
filter_radius: Radius for median filtering the F0 contour.
|
549 |
+
tgt_sr: Target sampling rate for the output audio.
|
550 |
+
resample_sr: Resampling rate for the output audio.
|
551 |
+
volume_envelope: Blending rate for adjusting the RMS level of the output audio.
|
552 |
+
version: Model version.
|
553 |
+
protect: Protection level for preserving the original pitch.
|
554 |
+
hop_length: Hop length for F0 estimation methods.
|
555 |
+
f0_autotune: Whether to apply autotune to the F0 contour.
|
556 |
+
f0_file: Path to a file containing an F0 contour to use.
|
557 |
+
"""
|
558 |
+
if file_index != "" and os.path.exists(file_index) and index_rate > 0:
|
559 |
+
try:
|
560 |
+
index = faiss.read_index(file_index)
|
561 |
+
big_npy = index.reconstruct_n(0, index.ntotal)
|
562 |
+
except Exception as error:
|
563 |
+
print(f"An error occurred reading the FAISS index: {error}")
|
564 |
+
index = big_npy = None
|
565 |
+
else:
|
566 |
+
index = big_npy = None
|
567 |
+
audio = signal.filtfilt(bh, ah, audio)
|
568 |
+
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")
|
569 |
+
opt_ts = []
|
570 |
+
if audio_pad.shape[0] > self.t_max:
|
571 |
+
audio_sum = np.zeros_like(audio)
|
572 |
+
for i in range(self.window):
|
573 |
+
audio_sum += audio_pad[i : i - self.window]
|
574 |
+
for t in range(self.t_center, audio.shape[0], self.t_center):
|
575 |
+
opt_ts.append(
|
576 |
+
t
|
577 |
+
- self.t_query
|
578 |
+
+ np.where(
|
579 |
+
np.abs(audio_sum[t - self.t_query : t + self.t_query])
|
580 |
+
== np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
|
581 |
+
)[0][0]
|
582 |
+
)
|
583 |
+
s = 0
|
584 |
+
audio_opt = []
|
585 |
+
t = None
|
586 |
+
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
|
587 |
+
p_len = audio_pad.shape[0] // self.window
|
588 |
+
inp_f0 = None
|
589 |
+
if hasattr(f0_file, "name"):
|
590 |
+
try:
|
591 |
+
with open(f0_file.name, "r") as f:
|
592 |
+
lines = f.read().strip("\n").split("\n")
|
593 |
+
inp_f0 = []
|
594 |
+
for line in lines:
|
595 |
+
inp_f0.append([float(i) for i in line.split(",")])
|
596 |
+
inp_f0 = np.array(inp_f0, dtype="float32")
|
597 |
+
except Exception as error:
|
598 |
+
print(f"An error occurred reading the F0 file: {error}")
|
599 |
+
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
600 |
+
if pitch_guidance:
|
601 |
+
pitch, pitchf = self.get_f0(
|
602 |
+
"input_audio_path", # questionable purpose of making a key for an array
|
603 |
+
audio_pad,
|
604 |
+
p_len,
|
605 |
+
pitch,
|
606 |
+
f0_method,
|
607 |
+
filter_radius,
|
608 |
+
hop_length,
|
609 |
+
f0_autotune,
|
610 |
+
f0_autotune_strength,
|
611 |
+
inp_f0,
|
612 |
+
)
|
613 |
+
pitch = pitch[:p_len]
|
614 |
+
pitchf = pitchf[:p_len]
|
615 |
+
if self.device == "mps":
|
616 |
+
pitchf = pitchf.astype(np.float32)
|
617 |
+
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
618 |
+
pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
|
619 |
+
for t in opt_ts:
|
620 |
+
t = t // self.window * self.window
|
621 |
+
if pitch_guidance:
|
622 |
+
audio_opt.append(
|
623 |
+
self.voice_conversion(
|
624 |
+
model,
|
625 |
+
net_g,
|
626 |
+
sid,
|
627 |
+
audio_pad[s : t + self.t_pad2 + self.window],
|
628 |
+
pitch[:, s // self.window : (t + self.t_pad2) // self.window],
|
629 |
+
pitchf[:, s // self.window : (t + self.t_pad2) // self.window],
|
630 |
+
index,
|
631 |
+
big_npy,
|
632 |
+
index_rate,
|
633 |
+
version,
|
634 |
+
protect,
|
635 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
636 |
+
)
|
637 |
+
else:
|
638 |
+
audio_opt.append(
|
639 |
+
self.voice_conversion(
|
640 |
+
model,
|
641 |
+
net_g,
|
642 |
+
sid,
|
643 |
+
audio_pad[s : t + self.t_pad2 + self.window],
|
644 |
+
None,
|
645 |
+
None,
|
646 |
+
index,
|
647 |
+
big_npy,
|
648 |
+
index_rate,
|
649 |
+
version,
|
650 |
+
protect,
|
651 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
652 |
+
)
|
653 |
+
s = t
|
654 |
+
if pitch_guidance:
|
655 |
+
audio_opt.append(
|
656 |
+
self.voice_conversion(
|
657 |
+
model,
|
658 |
+
net_g,
|
659 |
+
sid,
|
660 |
+
audio_pad[t:],
|
661 |
+
pitch[:, t // self.window :] if t is not None else pitch,
|
662 |
+
pitchf[:, t // self.window :] if t is not None else pitchf,
|
663 |
+
index,
|
664 |
+
big_npy,
|
665 |
+
index_rate,
|
666 |
+
version,
|
667 |
+
protect,
|
668 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
669 |
+
)
|
670 |
+
else:
|
671 |
+
audio_opt.append(
|
672 |
+
self.voice_conversion(
|
673 |
+
model,
|
674 |
+
net_g,
|
675 |
+
sid,
|
676 |
+
audio_pad[t:],
|
677 |
+
None,
|
678 |
+
None,
|
679 |
+
index,
|
680 |
+
big_npy,
|
681 |
+
index_rate,
|
682 |
+
version,
|
683 |
+
protect,
|
684 |
+
)[self.t_pad_tgt : -self.t_pad_tgt]
|
685 |
+
)
|
686 |
+
audio_opt = np.concatenate(audio_opt)
|
687 |
+
if volume_envelope != 1:
|
688 |
+
audio_opt = AudioProcessor.change_rms(
|
689 |
+
audio, self.sample_rate, audio_opt, self.sample_rate, volume_envelope
|
690 |
+
)
|
691 |
+
# if resample_sr >= self.sample_rate and tgt_sr != resample_sr:
|
692 |
+
# audio_opt = librosa.resample(
|
693 |
+
# audio_opt, orig_sr=tgt_sr, target_sr=resample_sr
|
694 |
+
# )
|
695 |
+
# audio_max = np.abs(audio_opt).max() / 0.99
|
696 |
+
# max_int16 = 32768
|
697 |
+
# if audio_max > 1:
|
698 |
+
# max_int16 /= audio_max
|
699 |
+
# audio_opt = (audio_opt * 32768).astype(np.int16)
|
700 |
+
audio_max = np.abs(audio_opt).max() / 0.99
|
701 |
+
if audio_max > 1:
|
702 |
+
audio_opt /= audio_max
|
703 |
+
if pitch_guidance:
|
704 |
+
del pitch, pitchf
|
705 |
+
del sid
|
706 |
+
if torch.cuda.is_available():
|
707 |
+
torch.cuda.empty_cache()
|
708 |
+
return audio_opt
|
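
A quick illustration (not part of the upload) of the coarse-F0 quantization performed at the end of get_f0 above. The 50-1100 Hz bounds used for f0_min/f0_max are an assumption for the sake of the example; the pipeline takes its actual bounds from its constructor.

import numpy as np

# Assumed bounds (typical RVC defaults, not read from the class here).
f0_min, f0_max = 50.0, 1100.0
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = np.array([0.0, 110.0, 220.0, 440.0])   # Hz, 0 = unvoiced frame
f0_mel = 1127 * np.log(1 + f0 / 700)        # Hz -> mel
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
f0_mel = np.clip(f0_mel, 1, 255)
f0_coarse = np.rint(f0_mel).astype(int)     # 1..255 bins fed to the pitch embedding
print(f0_coarse)                            # unvoiced frames collapse to bin 1
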
rvc/lib/algorithm/__init__.py
ADDED
File without changes
rvc/lib/algorithm/attentions.py
ADDED
@@ -0,0 +1,243 @@
import math
import torch
from rvc.lib.algorithm.commons import convert_pad_shape


class MultiHeadAttention(torch.nn.Module):
    """
    Multi-head attention module with optional relative positional encoding and proximal bias.

    Args:
        channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        n_heads (int): Number of attention heads.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to None.
        heads_share (bool, optional): Whether to share relative positional embeddings across heads. Defaults to True.
        block_length (int, optional): Block length for local attention. Defaults to None.
        proximal_bias (bool, optional): Whether to use proximal bias in self-attention. Defaults to False.
        proximal_init (bool, optional): Whether to initialize the key projection weights the same as query projection weights. Defaults to False.
    """

    def __init__(
        self,
        channels,
        out_channels,
        n_heads,
        p_dropout=0.0,
        window_size=None,
        heads_share=True,
        block_length=None,
        proximal_bias=False,
        proximal_init=False,
    ):
        super().__init__()
        assert (
            channels % n_heads == 0
        ), "Channels must be divisible by the number of heads."

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.k_channels = channels // n_heads
        self.window_size = window_size
        self.block_length = block_length
        self.proximal_bias = proximal_bias

        # Define projections
        self.conv_q = torch.nn.Conv1d(channels, channels, 1)
        self.conv_k = torch.nn.Conv1d(channels, channels, 1)
        self.conv_v = torch.nn.Conv1d(channels, channels, 1)
        self.conv_o = torch.nn.Conv1d(channels, out_channels, 1)

        self.drop = torch.nn.Dropout(p_dropout)

        # Relative positional encodings
        if window_size:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = torch.nn.Parameter(
                torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
                * rel_stddev
            )
            self.emb_rel_v = torch.nn.Parameter(
                torch.randn(n_heads_rel, 2 * window_size + 1, self.k_channels)
                * rel_stddev
            )

        # Initialize weights
        torch.nn.init.xavier_uniform_(self.conv_q.weight)
        torch.nn.init.xavier_uniform_(self.conv_k.weight)
        torch.nn.init.xavier_uniform_(self.conv_v.weight)
        torch.nn.init.xavier_uniform_(self.conv_o.weight)

        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        # Compute query, key, value projections
        q, k, v = self.conv_q(x), self.conv_k(c), self.conv_v(c)

        # Compute attention
        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        # Final output projection
        return self.conv_o(x)

    def attention(self, query, key, value, mask=None):
        # Reshape and compute scaled dot-product attention
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))

        if self.window_size:
            assert t_s == t_t, "Relative attention only supports self-attention."
            scores += self._compute_relative_scores(query, t_s)

        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias only supports self-attention."
            scores += self._attention_bias_proximal(t_s).to(scores.device, scores.dtype)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length:
                block_mask = (
                    torch.ones_like(scores)
                    .triu(-self.block_length)
                    .tril(self.block_length)
                )
                scores = scores.masked_fill(block_mask == 0, -1e4)

        # Apply softmax and dropout
        p_attn = self.drop(torch.nn.functional.softmax(scores, dim=-1))

        # Compute attention output
        output = torch.matmul(p_attn, value)

        if self.window_size:
            output += self._apply_relative_values(p_attn, t_s)

        return output.transpose(2, 3).contiguous().view(b, d, t_t), p_attn

    def _compute_relative_scores(self, query, length):
        rel_emb = self._get_relative_embeddings(self.emb_rel_k, length)
        rel_logits = self._matmul_with_relative_keys(
            query / math.sqrt(self.k_channels), rel_emb
        )
        return self._relative_position_to_absolute_position(rel_logits)

    def _apply_relative_values(self, p_attn, length):
        rel_weights = self._absolute_position_to_relative_position(p_attn)
        rel_emb = self._get_relative_embeddings(self.emb_rel_v, length)
        return self._matmul_with_relative_values(rel_weights, rel_emb)

    # Helper methods
    def _matmul_with_relative_values(self, x, y):
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, embeddings, length):
        pad_length = max(length - (self.window_size + 1), 0)
        start = max((self.window_size + 1) - length, 0)
        end = start + 2 * length - 1

        if pad_length > 0:
            embeddings = torch.nn.functional.pad(
                embeddings,
                convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
            )
        return embeddings[:, start:end]

    def _relative_position_to_absolute_position(self, x):
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
        )
        x_flat = x.view(batch, heads, length * 2 * length)
        x_flat = torch.nn.functional.pad(
            x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
        )
        return x_flat.view(batch, heads, length + 1, 2 * length - 1)[
            :, :, :length, length - 1 :
        ]

    def _absolute_position_to_relative_position(self, x):
        batch, heads, length, _ = x.size()
        x = torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
        )
        x_flat = x.view(batch, heads, length**2 + length * (length - 1))
        x_flat = torch.nn.functional.pad(
            x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]])
        )
        return x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        r = torch.arange(length, dtype=torch.float32)
        diff = r.unsqueeze(0) - r.unsqueeze(1)
        return -torch.log1p(torch.abs(diff)).unsqueeze(0).unsqueeze(0)


class FFN(torch.nn.Module):
    """
    Feed-forward network module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        filter_channels (int): Number of filter channels in the convolution layers.
        kernel_size (int): Kernel size of the convolution layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        activation (str, optional): Activation function to use. Defaults to None.
        causal (bool, optional): Whether to use causal padding in the convolution layers. Defaults to False.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        filter_channels,
        kernel_size,
        p_dropout=0.0,
        activation=None,
        causal=False,
    ):
        super().__init__()
        self.padding_fn = self._causal_padding if causal else self._same_padding

        self.conv_1 = torch.nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = torch.nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = torch.nn.Dropout(p_dropout)

        self.activation = activation

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding_fn(x * x_mask))
        x = self._apply_activation(x)
        x = self.drop(x)
        x = self.conv_2(self.padding_fn(x * x_mask))
        return x * x_mask

    def _apply_activation(self, x):
        if self.activation == "gelu":
            return x * torch.sigmoid(1.702 * x)
        return torch.relu(x)

    def _causal_padding(self, x):
        pad_l, pad_r = self.conv_1.kernel_size[0] - 1, 0
        return torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [pad_l, pad_r]])
        )

    def _same_padding(self, x):
        pad = (self.conv_1.kernel_size[0] - 1) // 2
        return torch.nn.functional.pad(
            x, convert_pad_shape([[0, 0], [0, 0], [pad, pad]])
        )
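
For reference, a minimal sketch (not part of the upload) of exercising the MultiHeadAttention and FFN blocks above with dummy tensors; channel, head and time sizes are arbitrary illustrative values.

import torch
from rvc.lib.algorithm.attentions import MultiHeadAttention, FFN

x = torch.randn(2, 192, 100)                  # (batch, channels, time)
attn = MultiHeadAttention(channels=192, out_channels=192, n_heads=2, window_size=10)
mask = torch.ones(2, 1, 100, 100)             # all-ones mask = full self-attention
y = attn(x, x, attn_mask=mask)                # query and key/value share the same tensor
print(y.shape)                                # torch.Size([2, 192, 100])

ffn = FFN(in_channels=192, out_channels=192, filter_channels=768, kernel_size=1)
x_mask = torch.ones(2, 1, 100)
print(ffn(y, x_mask).shape)                   # torch.Size([2, 192, 100])
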
rvc/lib/algorithm/commons.py
ADDED
@@ -0,0 +1,207 @@
import math
import torch
from typing import List, Optional


def init_weights(m, mean=0.0, std=0.01):
    """
    Initialize the weights of a module.

    Args:
        m: The module to initialize.
        mean: The mean of the normal distribution.
        std: The standard deviation of the normal distribution.
    """
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """
    Calculate the padding needed for a convolution.

    Args:
        kernel_size: The size of the kernel.
        dilation: The dilation of the convolution.
    """
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    """
    Convert the pad shape to a list of integers.

    Args:
        pad_shape: The pad shape.
    """
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """
    Calculate the KL divergence between two distributions.

    Args:
        m_p: The mean of the first distribution.
        logs_p: The log of the standard deviation of the first distribution.
        m_q: The mean of the second distribution.
        logs_q: The log of the standard deviation of the second distribution.
    """
    kl = (logs_q - logs_p) - 0.5
    kl += (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return kl


def slice_segments(
    x: torch.Tensor, ids_str: torch.Tensor, segment_size: int = 4, dim: int = 2
):
    """
    Slice segments from a tensor, handling tensors with different numbers of dimensions.

    Args:
        x (torch.Tensor): The tensor to slice.
        ids_str (torch.Tensor): The starting indices of the segments.
        segment_size (int, optional): The size of each segment. Defaults to 4.
        dim (int, optional): The dimension to slice across (2D or 3D tensors). Defaults to 2.
    """
    if dim == 2:
        ret = torch.zeros_like(x[:, :segment_size])
    elif dim == 3:
        ret = torch.zeros_like(x[:, :, :segment_size])

    for i in range(x.size(0)):
        idx_str = ids_str[i].item()
        idx_end = idx_str + segment_size
        if dim == 2:
            ret[i] = x[i, idx_str:idx_end]
        else:
            ret[i] = x[i, :, idx_str:idx_end]

    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """
    Randomly slice segments from a tensor.

    Args:
        x: The tensor to slice.
        x_lengths: The lengths of the sequences.
        segment_size: The size of each segment.
    """
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size, dim=3)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """
    Generate a 1D timing signal.

    Args:
        length: The length of the signal.
        channels: The number of channels of the signal.
        min_timescale: The minimum timescale.
        max_timescale: The maximum timescale.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        num_timescales - 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = torch.nn.functional.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def subsequent_mask(length):
    """
    Generate a subsequent mask.

    Args:
        length: The length of the sequence.
    """
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """
    Fused add tanh sigmoid multiply operation.

    Args:
        input_a: The first input tensor.
        input_b: The second input tensor.
        n_channels: The number of channels.
    """
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
    """
    Convert the pad shape to a list of integers.

    Args:
        pad_shape: The pad shape.
    """
    return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()


def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
    """
    Generate a sequence mask.

    Args:
        length: The lengths of the sequences.
        max_length: The maximum length of the sequences.
    """
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def clip_grad_value(parameters, clip_value, norm_type=2):
    """
    Clip the gradients of a list of parameters.

    Args:
        parameters: The list of parameters to clip.
        clip_value: The maximum value of the gradients.
        norm_type: The type of norm to use for clipping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
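
A small sketch (not part of the upload) of the two helpers from commons.py that the surrounding modules lean on most, sequence_mask and rand_slice_segments, run on dummy tensors.

import torch
from rvc.lib.algorithm.commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([6, 4])
mask = sequence_mask(lengths, max_length=8)        # (2, 8) boolean mask, True inside each sequence
print(mask.int())

x = torch.randn(2, 192, 8)                         # (batch, channels, frames)
segments, start_idxs = rand_slice_segments(x, x_lengths=lengths, segment_size=4)
print(segments.shape, start_idxs)                  # torch.Size([2, 192, 4]) and per-item start frames
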
rvc/lib/algorithm/discriminators.py
ADDED
@@ -0,0 +1,160 @@
import torch
from torch.nn.utils.parametrizations import spectral_norm, weight_norm

from rvc.lib.algorithm.commons import get_padding
from rvc.lib.algorithm.residuals import LRELU_SLOPE


class MultiPeriodDiscriminator(torch.nn.Module):
    """
    Multi-period discriminator.

    This class implements a multi-period discriminator, which is used to
    discriminate between real and fake audio signals. The discriminator
    is composed of a series of convolutional layers that are applied to
    the input signal at different periods.

    Args:
        version (str): Model version; V1 uses periods [2, 3, 5, 7, 11, 17], V2 uses [2, 3, 5, 7, 11, 17, 23, 37].
        use_spectral_norm (bool): Whether to use spectral normalization.
            Defaults to False.
    """

    def __init__(self, version, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = (
            [2, 3, 5, 7, 11, 17] if version == "v1" else [2, 3, 5, 7, 11, 17, 23, 37]
        )
        self.discriminators = torch.nn.ModuleList(
            [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
            + [DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods]
        )

    def forward(self, y, y_hat):
        """
        Forward pass of the multi-period discriminator.

        Args:
            y (torch.Tensor): Real audio signal.
            y_hat (torch.Tensor): Fake audio signal.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for d in self.discriminators:
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    """
    Discriminator for the short-term component.

    This class implements a discriminator for the short-term component
    of the audio signal. The discriminator is composed of a series of
    convolutional layers that are applied to the input signal.
    """

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = torch.nn.ModuleList(
            [
                norm_f(torch.nn.Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(torch.nn.Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(torch.nn.Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(torch.nn.Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(torch.nn.Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(torch.nn.Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(torch.nn.Conv1d(1024, 1, 3, 1, padding=1))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """
        Forward pass of the discriminator.

        Args:
            x (torch.Tensor): Input audio signal.
        """
        fmap = []
        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap


class DiscriminatorP(torch.nn.Module):
    """
    Discriminator for the long-term component.

    This class implements a discriminator for the long-term component
    of the audio signal. The discriminator is composed of a series of
    convolutional layers that are applied to the input signal at a given
    period.

    Args:
        period (int): Period of the discriminator.
        kernel_size (int): Kernel size of the convolutional layers.
            Defaults to 5.
        stride (int): Stride of the convolutional layers. Defaults to 3.
        use_spectral_norm (bool): Whether to use spectral normalization.
            Defaults to False.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        in_channels = [1, 32, 128, 512, 1024]
        out_channels = [32, 128, 512, 1024, 1024]

        self.convs = torch.nn.ModuleList(
            [
                norm_f(
                    torch.nn.Conv2d(
                        in_ch,
                        out_ch,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(get_padding(kernel_size, 1), 0),
                    )
                )
                for in_ch, out_ch in zip(in_channels, out_channels)
            ]
        )

        self.conv_post = norm_f(torch.nn.Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu = torch.nn.LeakyReLU(LRELU_SLOPE)

    def forward(self, x):
        """
        Forward pass of the discriminator.

        Args:
            x (torch.Tensor): Input audio signal.
        """
        fmap = []
        b, c, t = x.shape
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
        x = x.view(b, c, -1, self.period)

        for conv in self.convs:
            x = self.lrelu(conv(x))
            fmap.append(x)

        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)
        return x, fmap
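
A minimal sketch (not part of the upload) of driving the MultiPeriodDiscriminator above with dummy waveforms; batch size and sample count are arbitrary.

import torch
from rvc.lib.algorithm.discriminators import MultiPeriodDiscriminator

mpd = MultiPeriodDiscriminator(version="v2")
real = torch.randn(1, 1, 8192)          # (batch, 1, samples)
fake = torch.randn(1, 1, 8192)
y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(real, fake)
print(len(y_d_rs))                      # 9 sub-discriminators for v2: 1 DiscriminatorS + 8 DiscriminatorP
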
rvc/lib/algorithm/encoders.py
ADDED
@@ -0,0 +1,218 @@
import math
import torch
from typing import Optional

from rvc.lib.algorithm.commons import sequence_mask
from rvc.lib.algorithm.modules import WaveNet
from rvc.lib.algorithm.normalization import LayerNorm
from rvc.lib.algorithm.attentions import FFN, MultiHeadAttention


class Encoder(torch.nn.Module):
    """
    Encoder module for the Transformer model.

    Args:
        hidden_channels (int): Number of hidden channels in the encoder.
        filter_channels (int): Number of filter channels in the feed-forward network.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int, optional): Kernel size of the convolution layers in the feed-forward network. Defaults to 1.
        p_dropout (float, optional): Dropout probability. Defaults to 0.0.
        window_size (int, optional): Window size for relative positional encoding. Defaults to 10.
    """

    def __init__(
        self,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size=1,
        p_dropout=0.0,
        window_size=10,
    ):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = torch.nn.Dropout(p_dropout)
        self.attn_layers = torch.nn.ModuleList()
        self.norm_layers_1 = torch.nn.ModuleList()
        self.ffn_layers = torch.nn.ModuleList()
        self.norm_layers_2 = torch.nn.ModuleList()
        for i in range(self.n_layers):
            self.attn_layers.append(
                MultiHeadAttention(
                    hidden_channels,
                    hidden_channels,
                    n_heads,
                    p_dropout=p_dropout,
                    window_size=window_size,
                )
            )
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(
                    hidden_channels,
                    hidden_channels,
                    filter_channels,
                    kernel_size,
                    p_dropout=p_dropout,
                )
            )
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class TextEncoder(torch.nn.Module):
    """Text Encoder with configurable embedding dimension.

    Args:
        out_channels (int): Output channels of the encoder.
        hidden_channels (int): Hidden channels of the encoder.
        filter_channels (int): Filter channels of the encoder.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of encoder layers.
        kernel_size (int): Kernel size of the convolutional layers.
        p_dropout (float): Dropout probability.
        embedding_dim (int): Embedding dimension for phone embeddings (v1 = 256, v2 = 768).
        f0 (bool, optional): Whether to use F0 embedding. Defaults to True.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        embedding_dim,
        f0=True,
    ):
        super(TextEncoder, self).__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.emb_phone = torch.nn.Linear(embedding_dim, hidden_channels)
        self.lrelu = torch.nn.LeakyReLU(0.1, inplace=True)
        if f0:
            self.emb_pitch = torch.nn.Embedding(256, hidden_channels)
        self.encoder = Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
        )
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
    ):
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(sequence_mask(lengths, x.size(2)), 1).to(x.dtype)
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask


class PosteriorEncoder(torch.nn.Module):
    """Posterior Encoder for inferring latent representation.

    Args:
        in_channels (int): Number of channels in the input.
        out_channels (int): Number of channels in the output.
        hidden_channels (int): Number of hidden channels in the encoder.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the encoder.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
    ):
        super(PosteriorEncoder, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = torch.nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            gin_channels=gin_channels,
        )
        self.proj = torch.nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(
        self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
    ):
        x_mask = torch.unsqueeze(sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def remove_weight_norm(self):
        """Removes weight normalization from the encoder."""
        self.enc.remove_weight_norm()

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        for hook in self.enc._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.enc)
        return self
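
A hedged sketch (not part of the upload) of a TextEncoder forward pass. The hyperparameters below are illustrative values in the spirit of the v2 setup (768-dimensional phone features, 192 hidden channels), not values read from the bundled configs.

import torch
from rvc.lib.algorithm.encoders import TextEncoder

enc = TextEncoder(
    out_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0,
    embedding_dim=768, f0=True,
)
phone = torch.randn(1, 120, 768)            # (batch, frames, embedding_dim) HuBERT-style features
pitch = torch.randint(1, 256, (1, 120))     # coarse F0 bins, as produced by the pipeline
lengths = torch.tensor([120])
m, logs, x_mask = enc(phone, pitch, lengths)
print(m.shape, logs.shape, x_mask.shape)    # (1, 192, 120), (1, 192, 120), (1, 1, 120)
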
rvc/lib/algorithm/generators.py
ADDED
@@ -0,0 +1,231 @@
import torch
import numpy as np
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm
from typing import Optional

from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
from rvc.lib.algorithm.commons import init_weights


class Generator(torch.nn.Module):
    """Generator for synthesizing audio.

    Args:
        initial_channel (int): Number of channels in the initial convolutional layer.
        resblock (str): Type of residual block to use (1 or 2).
        resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
        resblock_dilation_sizes (list): Dilation rates of the residual blocks.
        upsample_rates (list): Upsampling rates.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels=0,
    ):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = torch.nn.Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock = ResBlock1 if resblock == "1" else ResBlock2

        self.ups = torch.nn.ModuleList()
        self.resblocks = torch.nn.ModuleList()

        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    torch.nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        upsample_initial_channel // (2 ** (i + 1)),
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(
                zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels

        x = torch.nn.functional.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        # Iterate both the upsampling and residual-block lists (there is no
        # combined `ups_and_resblocks` attribute on this class).
        for l in [*self.ups, *self.resblocks]:
            for hook in l._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(l)
        return self

    def remove_weight_norm(self):
        """Removes weight normalization from the upsampling and residual blocks."""
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()


class SineGenerator(torch.nn.Module):
    """
    A sine wave generator that synthesizes waveforms with optional harmonic overtones and noise.

    Args:
        sampling_rate (int): The sampling rate in Hz.
        num_harmonics (int, optional): The number of harmonic overtones to include. Defaults to 0.
        sine_amplitude (float, optional): The amplitude of the sine waveform. Defaults to 0.1.
        noise_stddev (float, optional): The standard deviation of Gaussian noise. Defaults to 0.003.
        voiced_threshold (float, optional): F0 threshold for distinguishing voiced/unvoiced frames. Defaults to 0.
    """

    def __init__(
        self,
        sampling_rate: int,
        num_harmonics: int = 0,
        sine_amplitude: float = 0.1,
        noise_stddev: float = 0.003,
        voiced_threshold: float = 0.0,
    ):
        super(SineGenerator, self).__init__()
        self.sampling_rate = sampling_rate
        self.num_harmonics = num_harmonics
        self.sine_amplitude = sine_amplitude
        self.noise_stddev = noise_stddev
        self.voiced_threshold = voiced_threshold
        self.waveform_dim = self.num_harmonics + 1  # fundamental + harmonics

    def _compute_voiced_unvoiced(self, f0: torch.Tensor) -> torch.Tensor:
        """
        Generate a binary mask to indicate voiced/unvoiced frames.

        Args:
            f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length).
        """
        uv_mask = (f0 > self.voiced_threshold).float()
        return uv_mask

    def _generate_sine_wave(
        self, f0: torch.Tensor, upsampling_factor: int
    ) -> torch.Tensor:
        """
        Generate sine waves for the fundamental frequency and its harmonics.

        Args:
            f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1).
            upsampling_factor (int): Upsampling factor.
        """
        batch_size, length, _ = f0.shape

        # Create an upsampling grid
        upsampling_grid = torch.arange(
            1, upsampling_factor + 1, dtype=f0.dtype, device=f0.device
        )

        # Calculate phase increments
        phase_increments = (f0 / self.sampling_rate) * upsampling_grid
        phase_remainder = torch.fmod(phase_increments[:, :-1, -1:] + 0.5, 1.0) - 0.5
        cumulative_phase = phase_remainder.cumsum(dim=1).fmod(1.0).to(f0.dtype)
        phase_increments += torch.nn.functional.pad(
            cumulative_phase, (0, 0, 1, 0), mode="constant"
        )

        # Reshape to match the sine wave shape
        phase_increments = phase_increments.reshape(batch_size, -1, 1)

        # Scale for harmonics (out-of-place so the result can broadcast to
        # (batch, samples, num_harmonics + 1); an in-place multiply would fail
        # whenever num_harmonics > 0)
        harmonic_scale = torch.arange(
            1, self.waveform_dim + 1, dtype=f0.dtype, device=f0.device
        ).reshape(1, 1, -1)
        phase_increments = phase_increments * harmonic_scale

        # Add random phase offset (except for the fundamental)
        random_phase = torch.rand(1, 1, self.waveform_dim, device=f0.device)
        random_phase[..., 0] = 0  # Fundamental frequency has no random offset
        phase_increments += random_phase

        # Generate sine waves
        sine_waves = torch.sin(2 * np.pi * phase_increments)
        return sine_waves

    def forward(self, f0: torch.Tensor, upsampling_factor: int):
        """
        Forward pass to generate sine waveforms with noise and voiced/unvoiced masking.

        Args:
            f0 (torch.Tensor): Fundamental frequency tensor (batch_size, length, 1).
            upsampling_factor (int): Upsampling factor.
        """
        with torch.no_grad():
            # Expand `f0` to include waveform dimensions
            f0 = f0.unsqueeze(-1)

            # Generate sine waves
            sine_waves = (
                self._generate_sine_wave(f0, upsampling_factor) * self.sine_amplitude
            )

            # Compute voiced/unvoiced mask
            voiced_mask = self._compute_voiced_unvoiced(f0)

            # Upsample voiced/unvoiced mask
            voiced_mask = torch.nn.functional.interpolate(
                voiced_mask.transpose(2, 1),
                scale_factor=float(upsampling_factor),
                mode="nearest",
            ).transpose(2, 1)

            # Compute noise amplitude
            noise_amplitude = voiced_mask * self.noise_stddev + (1 - voiced_mask) * (
                self.sine_amplitude / 3
            )

            # Add Gaussian noise
            noise = noise_amplitude * torch.randn_like(sine_waves)

            # Combine sine waves and noise
            sine_waveforms = sine_waves * voiced_mask + noise

            return sine_waveforms, voiced_mask, noise
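
A small sketch (not part of the upload) of the SineGenerator above on a constant 220 Hz contour, using the default of zero extra harmonics; the frame count and hop size are arbitrary illustrative values.

import torch
from rvc.lib.algorithm.generators import SineGenerator

sine_gen = SineGenerator(sampling_rate=48000)
f0 = torch.full((1, 100), 220.0)                   # 100 frames of a 220 Hz contour
sine, voiced_mask, noise = sine_gen(f0, upsampling_factor=480)
print(sine.shape)                                  # torch.Size([1, 48000, 1]): one waveform per harmonic band
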
rvc/lib/algorithm/modules.py
ADDED
@@ -0,0 +1,124 @@
import torch
from rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply


class WaveNet(torch.nn.Module):
    """WaveNet residual blocks as used in WaveGlow.

    Args:
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Size of the convolutional kernel.
        dilation_rate (int): Dilation rate of the convolution.
        n_layers (int): Number of convolutional layers.
        gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
        p_dropout (float, optional): Dropout probability. Defaults to 0.
    """

    def __init__(
        self,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        gin_channels=0,
        p_dropout=0,
    ):
        super().__init__()
        assert kernel_size % 2 == 1, "Kernel size must be odd for proper padding."

        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout
        self.n_channels_tensor = torch.IntTensor([hidden_channels])  # Static tensor

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = torch.nn.Dropout(p_dropout)

        # Conditional layer for global conditioning
        if gin_channels:
            self.cond_layer = torch.nn.utils.parametrizations.weight_norm(
                torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
                name="weight",
            )

        # Precompute dilations and paddings
        dilations = [dilation_rate**i for i in range(n_layers)]
        paddings = [(kernel_size * d - d) // 2 for d in dilations]

        # Initialize layers
        for i in range(n_layers):
            self.in_layers.append(
                torch.nn.utils.parametrizations.weight_norm(
                    torch.nn.Conv1d(
                        hidden_channels,
                        2 * hidden_channels,
                        kernel_size,
                        dilation=dilations[i],
                        padding=paddings[i],
                    ),
                    name="weight",
                )
            )

            res_skip_channels = (
                hidden_channels if i == n_layers - 1 else 2 * hidden_channels
            )
            self.res_skip_layers.append(
                torch.nn.utils.parametrizations.weight_norm(
                    torch.nn.Conv1d(hidden_channels, res_skip_channels, 1),
                    name="weight",
                )
            )

    def forward(self, x, x_mask, g=None):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor (batch_size, hidden_channels, time_steps).
            x_mask (torch.Tensor): Mask tensor (batch_size, 1, time_steps).
            g (torch.Tensor, optional): Conditioning tensor (batch_size, gin_channels, time_steps).
        """
        output = x.clone().zero_()

        # Apply conditional layer if global conditioning is provided
        g = self.cond_layer(g) if g is not None else None

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            g_l = (
                g[
                    :,
                    i * 2 * self.hidden_channels : (i + 1) * 2 * self.hidden_channels,
                    :,
                ]
                if g is not None
                else 0
            )

            # Activation with fused Tanh-Sigmoid
            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, self.n_channels_tensor)
            acts = self.drop(acts)

            # Residual and skip connections
            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, : self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts

        return output * x_mask

    def remove_weight_norm(self):
        """Remove weight normalization from the module."""
        if self.gin_channels:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
for layer in self.in_layers:
|
122 |
+
torch.nn.utils.remove_weight_norm(layer)
|
123 |
+
for layer in self.res_skip_layers:
|
124 |
+
torch.nn.utils.remove_weight_norm(layer)
|
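A brief sketch (illustrative, not part of the upload) of driving the WaveNet block above with a global conditioning vector; the shapes follow its docstring, and the particular channel counts are assumptions:

# Illustrative only; channel counts are assumed values.
import torch
from rvc.lib.algorithm.modules import WaveNet

net = WaveNet(hidden_channels=192, kernel_size=5, dilation_rate=1, n_layers=16, gin_channels=256)
x = torch.randn(2, 192, 400)     # (batch, hidden_channels, time_steps)
x_mask = torch.ones(2, 1, 400)   # (batch, 1, time_steps)
g = torch.randn(2, 256, 1)       # global conditioning, broadcast over time
out = net(x, x_mask, g=g)        # same shape as x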
rvc/lib/algorithm/normalization.py
ADDED
@@ -0,0 +1,31 @@
import torch


class LayerNorm(torch.nn.Module):
    """Layer normalization module.

    Args:
        channels (int): Number of channels.
        eps (float, optional): Epsilon value for numerical stability. Defaults to 1e-5.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = torch.nn.Parameter(torch.ones(channels))
        self.beta = torch.nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
        """
        # Transpose to (batch_size, time_steps, channels) for layer_norm
        x = x.transpose(1, -1)
        x = torch.nn.functional.layer_norm(
            x, (x.size(-1),), self.gamma, self.beta, self.eps
        )
        # Transpose back to (batch_size, channels, time_steps)
        return x.transpose(1, -1)
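A quick sketch (illustrative, not from the diff) showing that this LayerNorm normalizes over the channel dimension of channel-first tensors:

import torch
from rvc.lib.algorithm.normalization import LayerNorm

ln = LayerNorm(192)
x = torch.randn(4, 192, 250)   # (batch, channels, time)
y = ln(x)                      # same shape; normalized across the 192 channels at each time step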
rvc/lib/algorithm/nsf.py
ADDED
@@ -0,0 +1,196 @@
import math
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm
from typing import Optional

from rvc.lib.algorithm.generators import SineGenerator
from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
from rvc.lib.algorithm.commons import init_weights


class SourceModuleHnNSF(torch.nn.Module):
    """
    Source Module for harmonic-plus-noise excitation.

    Args:
        sample_rate (int): Sampling rate in Hz.
        harmonic_num (int, optional): Number of harmonics above F0. Defaults to 0.
        sine_amp (float, optional): Amplitude of sine source signal. Defaults to 0.1.
        add_noise_std (float, optional): Standard deviation of additive Gaussian noise. Defaults to 0.003.
        voiced_threshod (float, optional): Threshold to set voiced/unvoiced given F0. Defaults to 0.
        is_half (bool, optional): Whether to use half precision. Defaults to True.
    """

    def __init__(
        self,
        sample_rate,
        harmonic_num=0,
        sine_amp=0.1,
        add_noise_std=0.003,
        voiced_threshod=0,
        is_half=True,
    ):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std
        self.is_half = is_half

        self.l_sin_gen = SineGenerator(
            sample_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
        )
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x: torch.Tensor, upsample_factor: int = 1):
        sine_wavs, uv, _ = self.l_sin_gen(x, upsample_factor)
        sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge, None, None


class GeneratorNSF(torch.nn.Module):
    """
    Generator for synthesizing audio using the NSF (Neural Source Filter) approach.

    Args:
        initial_channel (int): Number of channels in the initial convolutional layer.
        resblock (str): Type of residual block to use (1 or 2).
        resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
        resblock_dilation_sizes (list): Dilation rates of the residual blocks.
        upsample_rates (list): Upsampling rates.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
        gin_channels (int): Number of channels for the global conditioning input.
        sr (int): Sampling rate.
        is_half (bool, optional): Whether to use half precision. Defaults to False.
    """

    def __init__(
        self,
        initial_channel,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        gin_channels,
        sr,
        is_half=False,
    ):
        super(GeneratorNSF, self).__init__()

        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
        self.m_source = SourceModuleHnNSF(
            sample_rate=sr, harmonic_num=0, is_half=is_half
        )

        self.conv_pre = torch.nn.Conv1d(
            initial_channel, upsample_initial_channel, 7, 1, padding=3
        )
        resblock_cls = ResBlock1 if resblock == "1" else ResBlock2

        self.ups = torch.nn.ModuleList()
        self.noise_convs = torch.nn.ModuleList()

        channels = [
            upsample_initial_channel // (2 ** (i + 1))
            for i in range(len(upsample_rates))
        ]
        stride_f0s = [
            math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
            for i in range(len(upsample_rates))
        ]

        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    torch.nn.ConvTranspose1d(
                        upsample_initial_channel // (2**i),
                        channels[i],
                        k,
                        u,
                        padding=(k - u) // 2,
                    )
                )
            )

            self.noise_convs.append(
                torch.nn.Conv1d(
                    1,
                    channels[i],
                    kernel_size=(stride_f0s[i] * 2 if stride_f0s[i] > 1 else 1),
                    stride=stride_f0s[i],
                    padding=(stride_f0s[i] // 2 if stride_f0s[i] > 1 else 0),
                )
            )

        self.resblocks = torch.nn.ModuleList(
            [
                resblock_cls(channels[i], k, d)
                for i in range(len(self.ups))
                for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes)
            ]
        )

        self.conv_post = torch.nn.Conv1d(channels[-1], 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)

        self.upp = math.prod(upsample_rates)
        self.lrelu_slope = LRELU_SLOPE

    def forward(self, x, f0, g: Optional[torch.Tensor] = None):
        har_source, _, _ = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)

        if g is not None:
            x = x + self.cond(g)

        for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
            x = ups(x)
            x = x + noise_convs(har_source)

            xs = sum(
                [
                    resblock(x)
                    for j, resblock in enumerate(self.resblocks)
                    if j in range(i * self.num_kernels, (i + 1) * self.num_kernels)
                ]
            )
            x = xs / self.num_kernels

        x = torch.nn.functional.leaky_relu(x)
        x = torch.tanh(self.conv_post(x))
        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()

    def __prepare_scriptable__(self):
        for l in self.ups:
            for hook in l._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    remove_weight_norm(l)
        for l in self.resblocks:
            for hook in l._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    remove_weight_norm(l)
        return self
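A small sketch (not part of the upload; shapes and values are assumptions) of the harmonic source on its own, mirroring how GeneratorNSF.forward calls it before feeding the excitation to the noise_convs branches:

# Hypothetical sketch; frame count, F0 value, and upsample factor are assumptions.
import torch
from rvc.lib.algorithm.nsf import SourceModuleHnNSF

source = SourceModuleHnNSF(sample_rate=40000, harmonic_num=0)
f0 = torch.full((1, 125), 180.0)             # frame-level F0 in Hz
har, _, _ = source(f0, upsample_factor=320)  # sample-level excitation
har = har.transpose(1, 2)                    # (batch, 1, samples), as used inside GeneratorNSF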
rvc/lib/algorithm/residuals.py
ADDED
@@ -0,0 +1,250 @@
from typing import Optional
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm

from rvc.lib.algorithm.modules import WaveNet
from rvc.lib.algorithm.commons import get_padding, init_weights

LRELU_SLOPE = 0.1


def create_conv1d_layer(channels, kernel_size, dilation):
    return weight_norm(
        torch.nn.Conv1d(
            channels,
            channels,
            kernel_size,
            1,
            dilation=dilation,
            padding=get_padding(kernel_size, dilation),
        )
    )


def apply_mask(tensor, mask):
    return tensor * mask if mask is not None else tensor


class ResBlockBase(torch.nn.Module):
    def __init__(self, channels, kernel_size, dilations):
        super(ResBlockBase, self).__init__()
        self.convs1 = torch.nn.ModuleList(
            [create_conv1d_layer(channels, kernel_size, d) for d in dilations]
        )
        self.convs1.apply(init_weights)

        self.convs2 = torch.nn.ModuleList(
            [create_conv1d_layer(channels, kernel_size, 1) for _ in dilations]
        )
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
            xt = apply_mask(xt, x_mask)
            xt = torch.nn.functional.leaky_relu(c1(xt), LRELU_SLOPE)
            xt = apply_mask(xt, x_mask)
            xt = c2(xt)
            x = xt + x
        return apply_mask(x, x_mask)

    def remove_weight_norm(self):
        for conv in self.convs1 + self.convs2:
            remove_weight_norm(conv)


class ResBlock1(ResBlockBase):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__(channels, kernel_size, dilation)


class ResBlock2(ResBlockBase):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__(channels, kernel_size, dilation)


class Flip(torch.nn.Module):
    """Flip module for flow-based models.

    This module flips the input along the time dimension.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ResidualCouplingBlock(torch.nn.Module):
    """Residual Coupling Block for normalizing flow.

    Args:
        channels (int): Number of channels in the input.
        hidden_channels (int): Number of hidden channels in the coupling layer.
        kernel_size (int): Kernel size of the convolutional layers.
        dilation_rate (int): Dilation rate of the convolutional layers.
        n_layers (int): Number of layers in the coupling layer.
        n_flows (int, optional): Number of coupling layers in the block. Defaults to 4.
        gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        n_flows=4,
        gin_channels=0,
    ):
        super(ResidualCouplingBlock, self).__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = torch.nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                ResidualCouplingLayer(
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=gin_channels,
                    mean_only=True,
                )
            )
            self.flows.append(Flip())

    def forward(
        self,
        x: torch.Tensor,
        x_mask: torch.Tensor,
        g: Optional[torch.Tensor] = None,
        reverse: bool = False,
    ):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow.forward(x, x_mask, g=g, reverse=reverse)
        return x

    def remove_weight_norm(self):
        """Removes weight normalization from the coupling layers."""
        for i in range(self.n_flows):
            self.flows[i * 2].remove_weight_norm()

    def __prepare_scriptable__(self):
        """Prepares the module for scripting."""
        for i in range(self.n_flows):
            for hook in self.flows[i * 2]._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(self.flows[i * 2])

        return self


class ResidualCouplingLayer(torch.nn.Module):
    """Residual coupling layer for flow-based models.

    Args:
        channels (int): Number of channels.
        hidden_channels (int): Number of hidden channels.
        kernel_size (int): Size of the convolutional kernel.
        dilation_rate (int): Dilation rate of the convolution.
        n_layers (int): Number of convolutional layers.
        p_dropout (float, optional): Dropout probability. Defaults to 0.
        gin_channels (int, optional): Number of conditioning channels. Defaults to 0.
        mean_only (bool, optional): Whether to use mean-only coupling. Defaults to False.
    """

    def __init__(
        self,
        channels,
        hidden_channels,
        kernel_size,
        dilation_rate,
        n_layers,
        p_dropout=0,
        gin_channels=0,
        mean_only=False,
    ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = torch.nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WaveNet(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        self.post = torch.nn.Conv1d(
            hidden_channels, self.half_channels * (2 - mean_only), 1
        )
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward pass.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, channels, time_steps).
            x_mask (torch.Tensor): Mask tensor of shape (batch_size, 1, time_steps).
            g (torch.Tensor, optional): Conditioning tensor of shape (batch_size, gin_channels, time_steps).
                Defaults to None.
            reverse (bool, optional): Whether to reverse the operation. Defaults to False.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x

    def remove_weight_norm(self):
        """Remove weight normalization from the module."""
        self.enc.remove_weight_norm()
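An illustrative round-trip (not in the diff; channel counts are assumptions) showing that the flow built by ResidualCouplingBlock is invertible: applying the forward direction and then the reverse direction recovers the input.

# Illustrative only; sizes are assumed values.
import torch
from rvc.lib.algorithm.residuals import ResidualCouplingBlock

flow = ResidualCouplingBlock(channels=192, hidden_channels=192, kernel_size=5,
                             dilation_rate=1, n_layers=3)
x = torch.randn(1, 192, 120)
x_mask = torch.ones(1, 1, 120)
z = flow(x, x_mask)                    # forward direction
x_rec = flow(z, x_mask, reverse=True)  # inverse direction
print(torch.allclose(x, x_rec, atol=1e-4))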
rvc/lib/algorithm/synthesizers.py
ADDED
@@ -0,0 +1,237 @@
import torch
from typing import Optional

from rvc.lib.algorithm.nsf import GeneratorNSF
from rvc.lib.algorithm.generators import Generator
from rvc.lib.algorithm.commons import slice_segments, rand_slice_segments
from rvc.lib.algorithm.residuals import ResidualCouplingBlock
from rvc.lib.algorithm.encoders import TextEncoder, PosteriorEncoder


class Synthesizer(torch.nn.Module):
    """
    Base Synthesizer model.

    Args:
        spec_channels (int): Number of channels in the spectrogram.
        segment_size (int): Size of the audio segment.
        inter_channels (int): Number of channels in the intermediate layers.
        hidden_channels (int): Number of channels in the hidden layers.
        filter_channels (int): Number of channels in the filter layers.
        n_heads (int): Number of attention heads.
        n_layers (int): Number of layers in the encoder.
        kernel_size (int): Size of the convolution kernel.
        p_dropout (float): Dropout probability.
        resblock (str): Type of residual block.
        resblock_kernel_sizes (list): Kernel sizes for the residual blocks.
        resblock_dilation_sizes (list): Dilation sizes for the residual blocks.
        upsample_rates (list): Upsampling rates for the decoder.
        upsample_initial_channel (int): Number of channels in the initial upsampling layer.
        upsample_kernel_sizes (list): Kernel sizes for the upsampling layers.
        spk_embed_dim (int): Dimension of the speaker embedding.
        gin_channels (int): Number of channels in the global conditioning vector.
        sr (int): Sampling rate of the audio.
        use_f0 (bool): Whether to use F0 information.
        text_enc_hidden_dim (int): Hidden dimension for the text encoder.
        kwargs: Additional keyword arguments.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        sr,
        use_f0,
        text_enc_hidden_dim=768,
        **kwargs
    ):
        super(Synthesizer, self).__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = float(p_dropout)
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.use_f0 = use_f0

        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            float(p_dropout),
            text_enc_hidden_dim,
            f0=use_f0,
        )

        if use_f0:
            self.dec = GeneratorNSF(
                inter_channels,
                resblock,
                resblock_kernel_sizes,
                resblock_dilation_sizes,
                upsample_rates,
                upsample_initial_channel,
                upsample_kernel_sizes,
                gin_channels=gin_channels,
                sr=sr,
                is_half=kwargs["is_half"],
            )
        else:
            self.dec = Generator(
                inter_channels,
                resblock,
                resblock_kernel_sizes,
                resblock_dilation_sizes,
                upsample_rates,
                upsample_initial_channel,
                upsample_kernel_sizes,
                gin_channels=gin_channels,
            )

        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        self.emb_g = torch.nn.Embedding(self.spk_embed_dim, gin_channels)

    def remove_weight_norm(self):
        """Removes weight normalization from the model."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def __prepare_scriptable__(self):
        for hook in self.dec._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.dec)
        for hook in self.flow._forward_pre_hooks.values():
            if (
                hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                and hook.__class__.__name__ == "WeightNorm"
            ):
                torch.nn.utils.remove_weight_norm(self.flow)
        if hasattr(self, "enc_q"):
            for hook in self.enc_q._forward_pre_hooks.values():
                if (
                    hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
                    and hook.__class__.__name__ == "WeightNorm"
                ):
                    torch.nn.utils.remove_weight_norm(self.enc_q)
        return self

    @torch.jit.ignore
    def forward(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: Optional[torch.Tensor] = None,
        pitchf: Optional[torch.Tensor] = None,
        y: torch.Tensor = None,
        y_lengths: torch.Tensor = None,
        ds: Optional[torch.Tensor] = None,
    ):
        """
        Forward pass of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            pitchf (torch.Tensor, optional): Fine-grained pitch sequence.
            y (torch.Tensor, optional): Target spectrogram.
            y_lengths (torch.Tensor, optional): Lengths of the target spectrograms.
            ds (torch.Tensor, optional): Speaker embedding. Defaults to None.
        """
        g = self.emb_g(ds).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        if y is not None:
            z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
            z_p = self.flow(z, y_mask, g=g)
            z_slice, ids_slice = rand_slice_segments(z, y_lengths, self.segment_size)
            if self.use_f0:
                pitchf = slice_segments(pitchf, ids_slice, self.segment_size, 2)
                o = self.dec(z_slice, pitchf, g=g)
            else:
                o = self.dec(z_slice, g=g)
            return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
        else:
            return None, None, x_mask, None, (None, None, m_p, logs_p, None, None)

    @torch.jit.export
    def infer(
        self,
        phone: torch.Tensor,
        phone_lengths: torch.Tensor,
        pitch: Optional[torch.Tensor] = None,
        nsff0: Optional[torch.Tensor] = None,
        sid: torch.Tensor = None,
        rate: Optional[torch.Tensor] = None,
    ):
        """
        Inference of the model.

        Args:
            phone (torch.Tensor): Phoneme sequence.
            phone_lengths (torch.Tensor): Lengths of the phoneme sequences.
            pitch (torch.Tensor, optional): Pitch sequence.
            nsff0 (torch.Tensor, optional): Fine-grained pitch sequence.
            sid (torch.Tensor): Speaker embedding.
            rate (torch.Tensor, optional): Rate for time-stretching. Defaults to None.
        """
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if rate is not None:
            assert isinstance(rate, torch.Tensor)
            head = int(z_p.shape[2] * (1.0 - rate.item()))
            z_p = z_p[:, :, head:]
            x_mask = x_mask[:, :, head:]
            if self.use_f0:
                nsff0 = nsff0[:, head:]
        if self.use_f0:
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            o = self.dec(z * x_mask, nsff0, g=g)
        else:
            z = self.flow(z_p, x_mask, g=g, reverse=True)
            o = self.dec(z * x_mask, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
rvc/lib/predictors/F0Extractor.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import dataclasses
|
2 |
+
import pathlib
|
3 |
+
import libf0
|
4 |
+
import librosa
|
5 |
+
import numpy as np
|
6 |
+
import resampy
|
7 |
+
import torch
|
8 |
+
import torchcrepe
|
9 |
+
import torchfcpe
|
10 |
+
import os
|
11 |
+
|
12 |
+
# from tools.anyf0.rmvpe import RMVPE
|
13 |
+
from rvc.lib.predictors.RMVPE import RMVPE0Predictor
|
14 |
+
from rvc.configs.config import Config
|
15 |
+
|
16 |
+
config = Config()
|
17 |
+
|
18 |
+
|
19 |
+
@dataclasses.dataclass
|
20 |
+
class F0Extractor:
|
21 |
+
wav_path: pathlib.Path
|
22 |
+
sample_rate: int = 44100
|
23 |
+
hop_length: int = 512
|
24 |
+
f0_min: int = 50
|
25 |
+
f0_max: int = 1600
|
26 |
+
method: str = "rmvpe"
|
27 |
+
x: np.ndarray = dataclasses.field(init=False)
|
28 |
+
|
29 |
+
def __post_init__(self):
|
30 |
+
self.x, self.sample_rate = librosa.load(self.wav_path, sr=self.sample_rate)
|
31 |
+
|
32 |
+
@property
|
33 |
+
def hop_size(self) -> float:
|
34 |
+
return self.hop_length / self.sample_rate
|
35 |
+
|
36 |
+
@property
|
37 |
+
def wav16k(self) -> np.ndarray:
|
38 |
+
return resampy.resample(self.x, self.sample_rate, 16000)
|
39 |
+
|
40 |
+
def extract_f0(self) -> np.ndarray:
|
41 |
+
f0 = None
|
42 |
+
method = self.method
|
43 |
+
if method == "crepe":
|
44 |
+
wav16k_torch = torch.FloatTensor(self.wav16k).unsqueeze(0).to(config.device)
|
45 |
+
f0 = torchcrepe.predict(
|
46 |
+
wav16k_torch,
|
47 |
+
sample_rate=16000,
|
48 |
+
hop_length=160,
|
49 |
+
batch_size=512,
|
50 |
+
fmin=self.f0_min,
|
51 |
+
fmax=self.f0_max,
|
52 |
+
device=config.device,
|
53 |
+
)
|
54 |
+
f0 = f0[0].cpu().numpy()
|
55 |
+
elif method == "fcpe":
|
56 |
+
audio = librosa.to_mono(self.x)
|
57 |
+
audio_length = len(audio)
|
58 |
+
f0_target_length = (audio_length // self.hop_length) + 1
|
59 |
+
audio = (
|
60 |
+
torch.from_numpy(audio)
|
61 |
+
.float()
|
62 |
+
.unsqueeze(0)
|
63 |
+
.unsqueeze(-1)
|
64 |
+
.to(config.device)
|
65 |
+
)
|
66 |
+
model = torchfcpe.spawn_bundled_infer_model(device=config.device)
|
67 |
+
|
68 |
+
f0 = model.infer(
|
69 |
+
audio,
|
70 |
+
sr=self.sample_rate,
|
71 |
+
decoder_mode="local_argmax",
|
72 |
+
threshold=0.006,
|
73 |
+
f0_min=self.f0_min,
|
74 |
+
f0_max=self.f0_max,
|
75 |
+
interp_uv=False,
|
76 |
+
output_interp_target_length=f0_target_length,
|
77 |
+
)
|
78 |
+
f0 = f0.squeeze().cpu().numpy()
|
79 |
+
elif method == "rmvpe":
|
80 |
+
model_rmvpe = RMVPE0Predictor(
|
81 |
+
os.path.join("rvc", "models", "predictors", "rmvpe.pt"),
|
82 |
+
is_half=config.is_half,
|
83 |
+
device=config.device,
|
84 |
+
# hop_length=80
|
85 |
+
)
|
86 |
+
f0 = model_rmvpe.infer_from_audio(self.wav16k, thred=0.03)
|
87 |
+
|
88 |
+
else:
|
89 |
+
raise ValueError(f"Unknown method: {self.method}")
|
90 |
+
return libf0.hz_to_cents(f0, librosa.midi_to_hz(0))
|
91 |
+
|
92 |
+
def plot_f0(self, f0):
|
93 |
+
from matplotlib import pyplot as plt
|
94 |
+
|
95 |
+
plt.figure(figsize=(10, 4))
|
96 |
+
plt.plot(f0)
|
97 |
+
plt.title(self.method)
|
98 |
+
plt.xlabel("Time (frames)")
|
99 |
+
plt.ylabel("F0 (cents)")
|
100 |
+
plt.show()
|
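A short usage sketch for the extractor above (illustrative; the audio file name is hypothetical and the rmvpe.pt weights referenced in the class are assumed to be downloaded):

from rvc.lib.predictors.F0Extractor import F0Extractor

extractor = F0Extractor("vocals.wav", sample_rate=44100, method="rmvpe")
f0_cents = extractor.extract_f0()   # F0 contour in cents
extractor.plot_f0(f0_cents)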
rvc/lib/predictors/FCPE.py
ADDED
@@ -0,0 +1,920 @@
from typing import Union

import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.parametrizations import weight_norm
from torchaudio.transforms import Resample
import os
import librosa
import soundfile as sf
import torch.utils.data
from librosa.filters import mel as librosa_mel_fn
import math
from functools import partial

from einops import rearrange, repeat
from local_attention import LocalAttention
from torch import nn

os.environ["LRU_CACHE_CAPACITY"] = "3"


def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """Loads wav file to torch tensor."""
    try:
        data, sample_rate = sf.read(full_path, always_2d=True)
    except Exception as error:
        print(f"An error occurred loading {full_path}: {error}")
        if return_empty_on_exception:
            return [], sample_rate or target_sr or 48000
        else:
            raise

    data = data[:, 0] if len(data.shape) > 1 else data
    assert len(data) > 2

    # Normalize data
    max_mag = (
        -np.iinfo(data.dtype).min
        if np.issubdtype(data.dtype, np.integer)
        else max(np.amax(data), -np.amin(data))
    )
    max_mag = (
        (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)
    )
    data = torch.FloatTensor(data.astype(np.float32)) / max_mag

    # Handle exceptions and resample
    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
        return [], sample_rate or target_sr or 48000
    if target_sr is not None and sample_rate != target_sr:
        data = torch.from_numpy(
            librosa.core.resample(
                data.numpy(), orig_sr=sample_rate, target_sr=target_sr
            )
        )
        sample_rate = target_sr

    return data, sample_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


class STFT:
    def __init__(
        self,
        sr=22050,
        n_mels=80,
        n_fft=1024,
        win_size=1024,
        hop_length=256,
        fmin=20,
        fmax=11025,
        clip_val=1e-5,
    ):
        self.target_sr = sr
        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val
        self.mel_basis = {}
        self.hann_window = {}

    def get_mel(self, y, keyshift=0, speed=1, center=False, train=False):
        sample_rate = self.target_sr
        n_mels = self.n_mels
        n_fft = self.n_fft
        win_size = self.win_size
        hop_length = self.hop_length
        fmin = self.fmin
        fmax = self.fmax
        clip_val = self.clip_val

        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(n_fft * factor))
        win_size_new = int(np.round(win_size * factor))
        hop_length_new = int(np.round(hop_length * speed))

        # Optimize mel_basis and hann_window caching
        mel_basis = self.mel_basis if not train else {}
        hann_window = self.hann_window if not train else {}

        mel_basis_key = str(fmax) + "_" + str(y.device)
        if mel_basis_key not in mel_basis:
            mel = librosa_mel_fn(
                sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax
            )
            mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)

        keyshift_key = str(keyshift) + "_" + str(y.device)
        if keyshift_key not in hann_window:
            hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)

        # Padding and STFT
        pad_left = (win_size_new - hop_length_new) // 2
        pad_right = max(
            (win_size_new - hop_length_new + 1) // 2,
            win_size_new - y.size(-1) - pad_left,
        )
        mode = "reflect" if pad_right < y.size(-1) else "constant"
        y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
        y = y.squeeze(1)

        spec = torch.stft(
            y,
            n_fft_new,
            hop_length=hop_length_new,
            win_length=win_size_new,
            window=hann_window[keyshift_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))

        # Handle keyshift and mel conversion
        if keyshift != 0:
            size = n_fft // 2 + 1
            resize = spec.size(1)
            spec = (
                F.pad(spec, (0, 0, 0, size - resize))
                if resize < size
                else spec[:, :size, :]
            )
            spec = spec * win_size / win_size_new
        spec = torch.matmul(mel_basis[mel_basis_key], spec)
        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
        return spec

    def __call__(self, audiopath):
        audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
        spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
        return spect


stft = STFT()


def softmax_kernel(
    data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None
):
    b, h, *_ = data.shape

    # Normalize data
    data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.0

    # Project data
    ratio = projection_matrix.shape[0] ** -0.5
    projection = repeat(projection_matrix, "j d -> b h j d", b=b, h=h)
    projection = projection.type_as(data)
    data_dash = torch.einsum("...id,...jd->...ij", (data_normalizer * data), projection)

    # Calculate diagonal data
    diag_data = data**2
    diag_data = torch.sum(diag_data, dim=-1)
    diag_data = (diag_data / 2.0) * (data_normalizer**2)
    diag_data = diag_data.unsqueeze(dim=-1)

    # Apply softmax
    if is_query:
        data_dash = ratio * (
            torch.exp(
                data_dash
                - diag_data
                - torch.max(data_dash, dim=-1, keepdim=True).values
            )
            + eps
        )
    else:
        data_dash = ratio * (torch.exp(data_dash - diag_data + eps))

    return data_dash.type_as(data)


def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    unstructured_block = torch.randn((cols, cols), device=device)
    q, r = torch.linalg.qr(unstructured_block.cpu(), mode="reduced")
    q, r = map(lambda t: t.to(device), (q, r))

    if qr_uniform_q:
        d = torch.diag(r, 0)
        q *= d.sign()
    return q.t()


def exists(val):
    return val is not None


def empty(tensor):
    return tensor.numel() == 0


def default(val, d):
    return val if exists(val) else d


def cast_tuple(val):
    return (val,) if not isinstance(val, tuple) else val


class PCmer(nn.Module):
    def __init__(
        self,
        num_layers,
        num_heads,
        dim_model,
        dim_keys,
        dim_values,
        residual_dropout,
        attention_dropout,
    ):
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.dim_values = dim_values
        self.dim_keys = dim_keys
        self.residual_dropout = residual_dropout
        self.attention_dropout = attention_dropout

        self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])

    def forward(self, phone, mask=None):
        for layer in self._layers:
            phone = layer(phone, mask)
        return phone


class _EncoderLayer(nn.Module):
    def __init__(self, parent: PCmer):
        super().__init__()
        self.conformer = ConformerConvModule(parent.dim_model)
        self.norm = nn.LayerNorm(parent.dim_model)
        self.dropout = nn.Dropout(parent.residual_dropout)
        self.attn = SelfAttention(
            dim=parent.dim_model, heads=parent.num_heads, causal=False
        )

    def forward(self, phone, mask=None):
        phone = phone + (self.attn(self.norm(phone), mask=mask))
        phone = phone + (self.conformer(phone))
        return phone


def calc_same_padding(kernel_size):
    pad = kernel_size // 2
    return (pad, pad - (kernel_size + 1) % 2)


class Swish(nn.Module):
    def forward(self, x):
        return x * x.sigmoid()


class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, "dims must be a tuple of two dimensions"
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class GLU(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        out, gate = x.chunk(2, dim=self.dim)
        return out * gate.sigmoid()


class DepthWiseConv1d(nn.Module):
    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        x = F.pad(x, self.padding)
        return self.conv(x)


class ConformerConvModule(nn.Module):
    def __init__(
        self, dim, causal=False, expansion_factor=2, kernel_size=31, dropout=0.0
    ):
        super().__init__()

        inner_dim = dim * expansion_factor
        padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)

        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(
                inner_dim, inner_dim, kernel_size=kernel_size, padding=padding
            ),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


def linear_attention(q, k, v):
    if v is None:
        out = torch.einsum("...ed,...nd->...ne", k, q)
        return out
    else:
        k_cumsum = k.sum(dim=-2)
        D_inv = 1.0 / (torch.einsum("...nd,...d->...n", q, k_cumsum.type_as(q)) + 1e-8)
        context = torch.einsum("...nd,...ne->...de", k, v)
        out = torch.einsum("...de,...nd,...n->...ne", context, q, D_inv)
        return out


def gaussian_orthogonal_random_matrix(
    nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None
):
    nb_full_blocks = int(nb_rows / nb_columns)
    block_list = []

    for _ in range(nb_full_blocks):
        q = orthogonal_matrix_chunk(
            nb_columns, qr_uniform_q=qr_uniform_q, device=device
        )
        block_list.append(q)

    remaining_rows = nb_rows - nb_full_blocks * nb_columns
    if remaining_rows > 0:
        q = orthogonal_matrix_chunk(
            nb_columns, qr_uniform_q=qr_uniform_q, device=device
        )
        block_list.append(q[:remaining_rows])

    final_matrix = torch.cat(block_list)

    if scaling == 0:
        multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
    elif scaling == 1:
        multiplier = math.sqrt((float(nb_columns))) * torch.ones(
            (nb_rows,), device=device
        )
    else:
        raise ValueError(f"Invalid scaling {scaling}")

    return torch.diag(multiplier) @ final_matrix


class FastAttention(nn.Module):
    def __init__(
        self,
        dim_heads,
        nb_features=None,
        ortho_scaling=0,
        causal=False,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        no_projection=False,
    ):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        self.create_projection = partial(
            gaussian_orthogonal_random_matrix,
            nb_rows=self.nb_features,
            nb_columns=dim_heads,
            scaling=ortho_scaling,
            qr_uniform_q=qr_uniform_q,
        )
        projection_matrix = self.create_projection()
        self.register_buffer("projection_matrix", projection_matrix)

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn
        self.no_projection = no_projection
        self.causal = causal

    @torch.no_grad()
    def redraw_projection_matrix(self):
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        device = q.device

        if self.no_projection:
            q = q.softmax(dim=-1)
            k = torch.exp(k) if self.causal else k.softmax(dim=-2)
        else:
            create_kernel = partial(
                softmax_kernel, projection_matrix=self.projection_matrix, device=device
            )
            q = create_kernel(q, is_query=True)
            k = create_kernel(k, is_query=False)

        attn_fn = linear_attention if not self.causal else self.causal_linear_fn

        if v is None:
            out = attn_fn(q, k, None)
            return out
        else:
            out = attn_fn(q, k, v)
            return out


class SelfAttention(nn.Module):
    def __init__(
        self,
        dim,
        causal=False,
        heads=8,
        dim_head=64,
        local_heads=0,
        local_window_size=256,
        nb_features=None,
        feature_redraw_interval=1000,
        generalized_attention=False,
        kernel_fn=nn.ReLU(),
        qr_uniform_q=False,
        dropout=0.0,
        no_projection=False,
    ):
        super().__init__()
        assert dim % heads == 0, "dimension must be divisible by number of heads"
        dim_head = default(dim_head, dim // heads)
        inner_dim = dim_head * heads
        self.fast_attention = FastAttention(
            dim_head,
            nb_features,
            causal=causal,
            generalized_attention=generalized_attention,
            kernel_fn=kernel_fn,
            qr_uniform_q=qr_uniform_q,
            no_projection=no_projection,
        )

        self.heads = heads
        self.global_heads = heads - local_heads
        self.local_attn = (
            LocalAttention(
                window_size=local_window_size,
                causal=causal,
                autopad=True,
                dropout=dropout,
                look_forward=int(not causal),
                rel_pos_emb_config=(dim_head, local_heads),
            )
            if local_heads > 0
            else None
        )

        self.to_q = nn.Linear(dim, inner_dim)
        self.to_k = nn.Linear(dim, inner_dim)
        self.to_v = nn.Linear(dim, inner_dim)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    @torch.no_grad()
    def redraw_projection_matrix(self):
        self.fast_attention.redraw_projection_matrix()

    def forward(
        self,
        x,
        context=None,
        mask=None,
        context_mask=None,
        name=None,
        inference=False,
        **kwargs,
    ):
        _, _, _, h, gh = *x.shape, self.heads, self.global_heads

        cross_attend = exists(context)
        context = default(context, x)
        context_mask = default(context_mask, mask) if not cross_attend else context_mask
        q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)

        q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v))
        (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))

        attn_outs = []
        if not empty(q):
            if exists(context_mask):
                global_mask = context_mask[:, None, :, None]
                v.masked_fill_(~global_mask, 0.0)
            if cross_attend:
                pass  # TODO: Implement cross-attention
            else:
                out = self.fast_attention(q, k, v)
            attn_outs.append(out)

        if not empty(lq):
            assert (
                not cross_attend
            ), "local attention is not compatible with cross attention"
            out = self.local_attn(lq, lk, lv, input_mask=mask)
            attn_outs.append(out)

        out = torch.cat(attn_outs, dim=1)
        out = rearrange(out, "b h n d -> b n (h d)")
        out = self.to_out(out)
        return self.dropout(out)


def l2_regularization(model, l2_alpha):
    l2_loss = []
    for module in model.modules():
        if type(module) is nn.Conv2d:
            l2_loss.append((module.weight**2).sum() / 2.0)
    return l2_alpha * sum(l2_loss)


class FCPE(nn.Module):
    def __init__(
        self,
        input_channel=128,
        out_dims=360,
        n_layers=12,
        n_chans=512,
        use_siren=False,
        use_full=False,
        loss_mse_scale=10,
        loss_l2_regularization=False,
        loss_l2_regularization_scale=1,
        loss_grad1_mse=False,
        loss_grad1_mse_scale=1,
        f0_max=1975.5,
        f0_min=32.70,
        confidence=False,
        threshold=0.05,
        use_input_conv=True,
    ):
        super().__init__()
        if use_siren is True:
            raise ValueError("Siren is not supported yet.")
        if use_full is True:
            raise ValueError("Full model is not supported yet.")

        self.loss_mse_scale = loss_mse_scale if (loss_mse_scale is not None) else 10
        self.loss_l2_regularization = (
            loss_l2_regularization if (loss_l2_regularization is not None) else False
        )
        self.loss_l2_regularization_scale = (
            loss_l2_regularization_scale
            if (loss_l2_regularization_scale is not None)
            else 1
        )
604 |
+
)
|
605 |
+
self.loss_grad1_mse = loss_grad1_mse if (loss_grad1_mse is not None) else False
|
606 |
+
self.loss_grad1_mse_scale = (
|
607 |
+
loss_grad1_mse_scale if (loss_grad1_mse_scale is not None) else 1
|
608 |
+
)
|
609 |
+
self.f0_max = f0_max if (f0_max is not None) else 1975.5
|
610 |
+
self.f0_min = f0_min if (f0_min is not None) else 32.70
|
611 |
+
self.confidence = confidence if (confidence is not None) else False
|
612 |
+
self.threshold = threshold if (threshold is not None) else 0.05
|
613 |
+
self.use_input_conv = use_input_conv if (use_input_conv is not None) else True
|
614 |
+
|
615 |
+
self.cent_table_b = torch.Tensor(
|
616 |
+
np.linspace(
|
617 |
+
self.f0_to_cent(torch.Tensor([f0_min]))[0],
|
618 |
+
self.f0_to_cent(torch.Tensor([f0_max]))[0],
|
619 |
+
out_dims,
|
620 |
+
)
|
621 |
+
)
|
622 |
+
self.register_buffer("cent_table", self.cent_table_b)
|
623 |
+
|
624 |
+
# conv in stack
|
625 |
+
_leaky = nn.LeakyReLU()
|
626 |
+
self.stack = nn.Sequential(
|
627 |
+
nn.Conv1d(input_channel, n_chans, 3, 1, 1),
|
628 |
+
nn.GroupNorm(4, n_chans),
|
629 |
+
_leaky,
|
630 |
+
nn.Conv1d(n_chans, n_chans, 3, 1, 1),
|
631 |
+
)
|
632 |
+
|
633 |
+
# transformer
|
634 |
+
self.decoder = PCmer(
|
635 |
+
num_layers=n_layers,
|
636 |
+
num_heads=8,
|
637 |
+
dim_model=n_chans,
|
638 |
+
dim_keys=n_chans,
|
639 |
+
dim_values=n_chans,
|
640 |
+
residual_dropout=0.1,
|
641 |
+
attention_dropout=0.1,
|
642 |
+
)
|
643 |
+
self.norm = nn.LayerNorm(n_chans)
|
644 |
+
|
645 |
+
# out
|
646 |
+
self.n_out = out_dims
|
647 |
+
self.dense_out = weight_norm(nn.Linear(n_chans, self.n_out))
|
648 |
+
|
649 |
+
def forward(
|
650 |
+
self, mel, infer=True, gt_f0=None, return_hz_f0=False, cdecoder="local_argmax"
|
651 |
+
):
|
652 |
+
if cdecoder == "argmax":
|
653 |
+
self.cdecoder = self.cents_decoder
|
654 |
+
elif cdecoder == "local_argmax":
|
655 |
+
self.cdecoder = self.cents_local_decoder
|
656 |
+
|
657 |
+
x = (
|
658 |
+
self.stack(mel.transpose(1, 2)).transpose(1, 2)
|
659 |
+
if self.use_input_conv
|
660 |
+
else mel
|
661 |
+
)
|
662 |
+
x = self.decoder(x)
|
663 |
+
x = self.norm(x)
|
664 |
+
x = self.dense_out(x)
|
665 |
+
x = torch.sigmoid(x)
|
666 |
+
|
667 |
+
if not infer:
|
668 |
+
gt_cent_f0 = self.f0_to_cent(gt_f0)
|
669 |
+
gt_cent_f0 = self.gaussian_blurred_cent(gt_cent_f0)
|
670 |
+
loss_all = self.loss_mse_scale * F.binary_cross_entropy(x, gt_cent_f0)
|
671 |
+
if self.loss_l2_regularization:
|
672 |
+
loss_all = loss_all + l2_regularization(
|
673 |
+
model=self, l2_alpha=self.loss_l2_regularization_scale
|
674 |
+
)
|
675 |
+
x = loss_all
|
676 |
+
if infer:
|
677 |
+
x = self.cdecoder(x)
|
678 |
+
x = self.cent_to_f0(x)
|
679 |
+
x = (1 + x / 700).log() if not return_hz_f0 else x
|
680 |
+
|
681 |
+
return x
|
682 |
+
|
683 |
+
def cents_decoder(self, y, mask=True):
|
684 |
+
B, N, _ = y.size()
|
685 |
+
ci = self.cent_table[None, None, :].expand(B, N, -1)
|
686 |
+
rtn = torch.sum(ci * y, dim=-1, keepdim=True) / torch.sum(
|
687 |
+
y, dim=-1, keepdim=True
|
688 |
+
)
|
689 |
+
if mask:
|
690 |
+
confident = torch.max(y, dim=-1, keepdim=True)[0]
|
691 |
+
confident_mask = torch.ones_like(confident)
|
692 |
+
confident_mask[confident <= self.threshold] = float("-INF")
|
693 |
+
rtn = rtn * confident_mask
|
694 |
+
return (rtn, confident) if self.confidence else rtn
|
695 |
+
|
696 |
+
def cents_local_decoder(self, y, mask=True):
|
697 |
+
B, N, _ = y.size()
|
698 |
+
ci = self.cent_table[None, None, :].expand(B, N, -1)
|
699 |
+
confident, max_index = torch.max(y, dim=-1, keepdim=True)
|
700 |
+
local_argmax_index = torch.arange(0, 9).to(max_index.device) + (max_index - 4)
|
701 |
+
local_argmax_index = torch.clamp(local_argmax_index, 0, self.n_out - 1)
|
702 |
+
ci_l = torch.gather(ci, -1, local_argmax_index)
|
703 |
+
y_l = torch.gather(y, -1, local_argmax_index)
|
704 |
+
rtn = torch.sum(ci_l * y_l, dim=-1, keepdim=True) / torch.sum(
|
705 |
+
y_l, dim=-1, keepdim=True
|
706 |
+
)
|
707 |
+
if mask:
|
708 |
+
confident_mask = torch.ones_like(confident)
|
709 |
+
confident_mask[confident <= self.threshold] = float("-INF")
|
710 |
+
rtn = rtn * confident_mask
|
711 |
+
return (rtn, confident) if self.confidence else rtn
|
712 |
+
|
713 |
+
def cent_to_f0(self, cent):
|
714 |
+
return 10.0 * 2 ** (cent / 1200.0)
|
715 |
+
|
716 |
+
def f0_to_cent(self, f0):
|
717 |
+
return 1200.0 * torch.log2(f0 / 10.0)
|
718 |
+
|
719 |
+
def gaussian_blurred_cent(self, cents):
|
720 |
+
mask = (cents > 0.1) & (cents < (1200.0 * np.log2(self.f0_max / 10.0)))
|
721 |
+
B, N, _ = cents.size()
|
722 |
+
ci = self.cent_table[None, None, :].expand(B, N, -1)
|
723 |
+
return torch.exp(-torch.square(ci - cents) / 1250) * mask.float()
|
724 |
+
|
725 |
+
|
726 |
+
class FCPEInfer:
|
727 |
+
def __init__(self, model_path, device=None, dtype=torch.float32):
|
728 |
+
if device is None:
|
729 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
730 |
+
self.device = device
|
731 |
+
ckpt = torch.load(model_path, map_location=torch.device(self.device))
|
732 |
+
self.args = DotDict(ckpt["config"])
|
733 |
+
self.dtype = dtype
|
734 |
+
model = FCPE(
|
735 |
+
input_channel=self.args.model.input_channel,
|
736 |
+
out_dims=self.args.model.out_dims,
|
737 |
+
n_layers=self.args.model.n_layers,
|
738 |
+
n_chans=self.args.model.n_chans,
|
739 |
+
use_siren=self.args.model.use_siren,
|
740 |
+
use_full=self.args.model.use_full,
|
741 |
+
loss_mse_scale=self.args.loss.loss_mse_scale,
|
742 |
+
loss_l2_regularization=self.args.loss.loss_l2_regularization,
|
743 |
+
loss_l2_regularization_scale=self.args.loss.loss_l2_regularization_scale,
|
744 |
+
loss_grad1_mse=self.args.loss.loss_grad1_mse,
|
745 |
+
loss_grad1_mse_scale=self.args.loss.loss_grad1_mse_scale,
|
746 |
+
f0_max=self.args.model.f0_max,
|
747 |
+
f0_min=self.args.model.f0_min,
|
748 |
+
confidence=self.args.model.confidence,
|
749 |
+
)
|
750 |
+
model.to(self.device).to(self.dtype)
|
751 |
+
model.load_state_dict(ckpt["model"])
|
752 |
+
model.eval()
|
753 |
+
self.model = model
|
754 |
+
self.wav2mel = Wav2Mel(self.args, dtype=self.dtype, device=self.device)
|
755 |
+
|
756 |
+
@torch.no_grad()
|
757 |
+
def __call__(self, audio, sr, threshold=0.05):
|
758 |
+
self.model.threshold = threshold
|
759 |
+
audio = audio[None, :]
|
760 |
+
mel = self.wav2mel(audio=audio, sample_rate=sr).to(self.dtype)
|
761 |
+
f0 = self.model(mel=mel, infer=True, return_hz_f0=True)
|
762 |
+
return f0
|
763 |
+
|
764 |
+
|
765 |
+
class Wav2Mel:
|
766 |
+
def __init__(self, args, device=None, dtype=torch.float32):
|
767 |
+
self.sample_rate = args.mel.sampling_rate
|
768 |
+
self.hop_size = args.mel.hop_size
|
769 |
+
if device is None:
|
770 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
771 |
+
self.device = device
|
772 |
+
self.dtype = dtype
|
773 |
+
self.stft = STFT(
|
774 |
+
args.mel.sampling_rate,
|
775 |
+
args.mel.num_mels,
|
776 |
+
args.mel.n_fft,
|
777 |
+
args.mel.win_size,
|
778 |
+
args.mel.hop_size,
|
779 |
+
args.mel.fmin,
|
780 |
+
args.mel.fmax,
|
781 |
+
)
|
782 |
+
self.resample_kernel = {}
|
783 |
+
|
784 |
+
def extract_nvstft(self, audio, keyshift=0, train=False):
|
785 |
+
mel = self.stft.get_mel(audio, keyshift=keyshift, train=train).transpose(1, 2)
|
786 |
+
return mel
|
787 |
+
|
788 |
+
def extract_mel(self, audio, sample_rate, keyshift=0, train=False):
|
789 |
+
audio = audio.to(self.dtype).to(self.device)
|
790 |
+
if sample_rate == self.sample_rate:
|
791 |
+
audio_res = audio
|
792 |
+
else:
|
793 |
+
key_str = str(sample_rate)
|
794 |
+
if key_str not in self.resample_kernel:
|
795 |
+
self.resample_kernel[key_str] = Resample(
|
796 |
+
sample_rate, self.sample_rate, lowpass_filter_width=128
|
797 |
+
)
|
798 |
+
self.resample_kernel[key_str] = (
|
799 |
+
self.resample_kernel[key_str].to(self.dtype).to(self.device)
|
800 |
+
)
|
801 |
+
audio_res = self.resample_kernel[key_str](audio)
|
802 |
+
|
803 |
+
mel = self.extract_nvstft(
|
804 |
+
audio_res, keyshift=keyshift, train=train
|
805 |
+
) # B, n_frames, bins
|
806 |
+
n_frames = int(audio.shape[1] // self.hop_size) + 1
|
807 |
+
mel = (
|
808 |
+
torch.cat((mel, mel[:, -1:, :]), 1) if n_frames > int(mel.shape[1]) else mel
|
809 |
+
)
|
810 |
+
mel = mel[:, :n_frames, :] if n_frames < int(mel.shape[1]) else mel
|
811 |
+
return mel
|
812 |
+
|
813 |
+
def __call__(self, audio, sample_rate, keyshift=0, train=False):
|
814 |
+
return self.extract_mel(audio, sample_rate, keyshift=keyshift, train=train)
|
815 |
+
|
816 |
+
|
817 |
+
class DotDict(dict):
|
818 |
+
def __getattr__(*args):
|
819 |
+
val = dict.get(*args)
|
820 |
+
return DotDict(val) if type(val) is dict else val
|
821 |
+
|
822 |
+
__setattr__ = dict.__setitem__
|
823 |
+
__delattr__ = dict.__delitem__
|
824 |
+
|
825 |
+
|
826 |
+
class F0Predictor(object):
|
827 |
+
def compute_f0(self, wav, p_len):
|
828 |
+
pass
|
829 |
+
|
830 |
+
def compute_f0_uv(self, wav, p_len):
|
831 |
+
pass
|
832 |
+
|
833 |
+
|
834 |
+
class FCPEF0Predictor(F0Predictor):
|
835 |
+
def __init__(
|
836 |
+
self,
|
837 |
+
model_path,
|
838 |
+
hop_length=512,
|
839 |
+
f0_min=50,
|
840 |
+
f0_max=1100,
|
841 |
+
dtype=torch.float32,
|
842 |
+
device=None,
|
843 |
+
sample_rate=44100,
|
844 |
+
threshold=0.05,
|
845 |
+
):
|
846 |
+
self.fcpe = FCPEInfer(model_path, device=device, dtype=dtype)
|
847 |
+
self.hop_length = hop_length
|
848 |
+
self.f0_min = f0_min
|
849 |
+
self.f0_max = f0_max
|
850 |
+
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
|
851 |
+
self.threshold = threshold
|
852 |
+
self.sample_rate = sample_rate
|
853 |
+
self.dtype = dtype
|
854 |
+
self.name = "fcpe"
|
855 |
+
|
856 |
+
def repeat_expand(
|
857 |
+
self,
|
858 |
+
content: Union[torch.Tensor, np.ndarray],
|
859 |
+
target_len: int,
|
860 |
+
mode: str = "nearest",
|
861 |
+
):
|
862 |
+
ndim = content.ndim
|
863 |
+
content = (
|
864 |
+
content[None, None]
|
865 |
+
if ndim == 1
|
866 |
+
else content[None] if ndim == 2 else content
|
867 |
+
)
|
868 |
+
assert content.ndim == 3
|
869 |
+
is_np = isinstance(content, np.ndarray)
|
870 |
+
content = torch.from_numpy(content) if is_np else content
|
871 |
+
results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)
|
872 |
+
results = results.numpy() if is_np else results
|
873 |
+
return results[0, 0] if ndim == 1 else results[0] if ndim == 2 else results
|
874 |
+
|
875 |
+
def post_process(self, x, sample_rate, f0, pad_to):
|
876 |
+
f0 = (
|
877 |
+
torch.from_numpy(f0).float().to(x.device)
|
878 |
+
if isinstance(f0, np.ndarray)
|
879 |
+
else f0
|
880 |
+
)
|
881 |
+
f0 = self.repeat_expand(f0, pad_to) if pad_to is not None else f0
|
882 |
+
|
883 |
+
vuv_vector = torch.zeros_like(f0)
|
884 |
+
vuv_vector[f0 > 0.0] = 1.0
|
885 |
+
vuv_vector[f0 <= 0.0] = 0.0
|
886 |
+
|
887 |
+
nzindex = torch.nonzero(f0).squeeze()
|
888 |
+
f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
|
889 |
+
time_org = self.hop_length / sample_rate * nzindex.cpu().numpy()
|
890 |
+
time_frame = np.arange(pad_to) * self.hop_length / sample_rate
|
891 |
+
|
892 |
+
vuv_vector = F.interpolate(vuv_vector[None, None, :], size=pad_to)[0][0]
|
893 |
+
|
894 |
+
if f0.shape[0] <= 0:
|
895 |
+
return np.zeros(pad_to), vuv_vector.cpu().numpy()
|
896 |
+
if f0.shape[0] == 1:
|
897 |
+
return np.ones(pad_to) * f0[0], vuv_vector.cpu().numpy()
|
898 |
+
|
899 |
+
f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
|
900 |
+
return f0, vuv_vector.cpu().numpy()
|
901 |
+
|
902 |
+
def compute_f0(self, wav, p_len=None):
|
903 |
+
x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
|
904 |
+
p_len = x.shape[0] // self.hop_length if p_len is None else p_len
|
905 |
+
f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
|
906 |
+
if torch.all(f0 == 0):
|
907 |
+
return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
|
908 |
+
f0.cpu().numpy() if p_len is None else np.zeros(p_len)
|
909 |
+
)
|
910 |
+
return self.post_process(x, self.sample_rate, f0, p_len)[0]
|
911 |
+
|
912 |
+
def compute_f0_uv(self, wav, p_len=None):
|
913 |
+
x = torch.FloatTensor(wav).to(self.dtype).to(self.device)
|
914 |
+
p_len = x.shape[0] // self.hop_length if p_len is None else p_len
|
915 |
+
f0 = self.fcpe(x, sr=self.sample_rate, threshold=self.threshold)[0, :, 0]
|
916 |
+
if torch.all(f0 == 0):
|
917 |
+
return f0.cpu().numpy() if p_len is None else np.zeros(p_len), (
|
918 |
+
f0.cpu().numpy() if p_len is None else np.zeros(p_len)
|
919 |
+
)
|
920 |
+
return self.post_process(x, self.sample_rate, f0, p_len)
|
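A minimal usage sketch of the FCPEF0Predictor defined above. The checkpoint path, the 16 kHz load rate, and the librosa-based loading are illustrative assumptions, not values taken from this upload:

```python
# Hypothetical example: extract an F0 curve with FCPEF0Predictor.
# "fcpe.pt" and "example.wav" are placeholder paths (assumptions).
import librosa
from rvc.lib.predictors.FCPE import FCPEF0Predictor

wav, sr = librosa.load("example.wav", sr=16000, mono=True)  # mono float32 audio

predictor = FCPEF0Predictor(
    "fcpe.pt",          # assumed local FCPE checkpoint
    hop_length=160,     # illustrative hop size at 16 kHz (~10 ms frames)
    sample_rate=sr,
    threshold=0.03,
    device="cpu",
)

# f0: per-frame pitch in Hz (0 where unvoiced); uv: voiced/unvoiced mask.
f0, uv = predictor.compute_f0_uv(wav)
print(f0.shape, uv.shape)
```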
rvc/lib/predictors/RMVPE.py
ADDED
@@ -0,0 +1,560 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from librosa.filters import mel
|
7 |
+
from typing import List
|
8 |
+
|
9 |
+
# Constants for readability
|
10 |
+
N_MELS = 128
|
11 |
+
N_CLASS = 360
|
12 |
+
|
13 |
+
|
14 |
+
# Define a helper function for creating convolutional blocks
|
15 |
+
class ConvBlockRes(nn.Module):
|
16 |
+
"""
|
17 |
+
A convolutional block with residual connection.
|
18 |
+
|
19 |
+
Args:
|
20 |
+
in_channels (int): Number of input channels.
|
21 |
+
out_channels (int): Number of output channels.
|
22 |
+
momentum (float): Momentum for batch normalization.
|
23 |
+
"""
|
24 |
+
|
25 |
+
def __init__(self, in_channels, out_channels, momentum=0.01):
|
26 |
+
super(ConvBlockRes, self).__init__()
|
27 |
+
self.conv = nn.Sequential(
|
28 |
+
nn.Conv2d(
|
29 |
+
in_channels=in_channels,
|
30 |
+
out_channels=out_channels,
|
31 |
+
kernel_size=(3, 3),
|
32 |
+
stride=(1, 1),
|
33 |
+
padding=(1, 1),
|
34 |
+
bias=False,
|
35 |
+
),
|
36 |
+
nn.BatchNorm2d(out_channels, momentum=momentum),
|
37 |
+
nn.ReLU(),
|
38 |
+
nn.Conv2d(
|
39 |
+
in_channels=out_channels,
|
40 |
+
out_channels=out_channels,
|
41 |
+
kernel_size=(3, 3),
|
42 |
+
stride=(1, 1),
|
43 |
+
padding=(1, 1),
|
44 |
+
bias=False,
|
45 |
+
),
|
46 |
+
nn.BatchNorm2d(out_channels, momentum=momentum),
|
47 |
+
nn.ReLU(),
|
48 |
+
)
|
49 |
+
if in_channels != out_channels:
|
50 |
+
self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
|
51 |
+
self.is_shortcut = True
|
52 |
+
else:
|
53 |
+
self.is_shortcut = False
|
54 |
+
|
55 |
+
def forward(self, x):
|
56 |
+
if self.is_shortcut:
|
57 |
+
return self.conv(x) + self.shortcut(x)
|
58 |
+
else:
|
59 |
+
return self.conv(x) + x
|
60 |
+
|
61 |
+
|
62 |
+
# Define a class for residual encoder blocks
|
63 |
+
class ResEncoderBlock(nn.Module):
|
64 |
+
"""
|
65 |
+
A residual encoder block.
|
66 |
+
|
67 |
+
Args:
|
68 |
+
in_channels (int): Number of input channels.
|
69 |
+
out_channels (int): Number of output channels.
|
70 |
+
kernel_size (tuple): Size of the average pooling kernel.
|
71 |
+
n_blocks (int): Number of convolutional blocks in the block.
|
72 |
+
momentum (float): Momentum for batch normalization.
|
73 |
+
"""
|
74 |
+
|
75 |
+
def __init__(
|
76 |
+
self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
|
77 |
+
):
|
78 |
+
super(ResEncoderBlock, self).__init__()
|
79 |
+
self.n_blocks = n_blocks
|
80 |
+
self.conv = nn.ModuleList()
|
81 |
+
self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
|
82 |
+
for _ in range(n_blocks - 1):
|
83 |
+
self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
|
84 |
+
self.kernel_size = kernel_size
|
85 |
+
if self.kernel_size is not None:
|
86 |
+
self.pool = nn.AvgPool2d(kernel_size=kernel_size)
|
87 |
+
|
88 |
+
def forward(self, x):
|
89 |
+
for i in range(self.n_blocks):
|
90 |
+
x = self.conv[i](x)
|
91 |
+
if self.kernel_size is not None:
|
92 |
+
return x, self.pool(x)
|
93 |
+
else:
|
94 |
+
return x
|
95 |
+
|
96 |
+
|
97 |
+
# Define a class for the encoder
|
98 |
+
class Encoder(nn.Module):
|
99 |
+
"""
|
100 |
+
The encoder part of the DeepUnet.
|
101 |
+
|
102 |
+
Args:
|
103 |
+
in_channels (int): Number of input channels.
|
104 |
+
in_size (int): Size of the input tensor.
|
105 |
+
n_encoders (int): Number of encoder blocks.
|
106 |
+
kernel_size (tuple): Size of the average pooling kernel.
|
107 |
+
n_blocks (int): Number of convolutional blocks in each encoder block.
|
108 |
+
out_channels (int): Number of output channels for the first encoder block.
|
109 |
+
momentum (float): Momentum for batch normalization.
|
110 |
+
"""
|
111 |
+
|
112 |
+
def __init__(
|
113 |
+
self,
|
114 |
+
in_channels,
|
115 |
+
in_size,
|
116 |
+
n_encoders,
|
117 |
+
kernel_size,
|
118 |
+
n_blocks,
|
119 |
+
out_channels=16,
|
120 |
+
momentum=0.01,
|
121 |
+
):
|
122 |
+
super(Encoder, self).__init__()
|
123 |
+
self.n_encoders = n_encoders
|
124 |
+
self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
|
125 |
+
self.layers = nn.ModuleList()
|
126 |
+
self.latent_channels = []
|
127 |
+
for i in range(self.n_encoders):
|
128 |
+
self.layers.append(
|
129 |
+
ResEncoderBlock(
|
130 |
+
in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
|
131 |
+
)
|
132 |
+
)
|
133 |
+
self.latent_channels.append([out_channels, in_size])
|
134 |
+
in_channels = out_channels
|
135 |
+
out_channels *= 2
|
136 |
+
in_size //= 2
|
137 |
+
self.out_size = in_size
|
138 |
+
self.out_channel = out_channels
|
139 |
+
|
140 |
+
def forward(self, x: torch.Tensor):
|
141 |
+
concat_tensors: List[torch.Tensor] = []
|
142 |
+
x = self.bn(x)
|
143 |
+
for i in range(self.n_encoders):
|
144 |
+
t, x = self.layers[i](x)
|
145 |
+
concat_tensors.append(t)
|
146 |
+
return x, concat_tensors
|
147 |
+
|
148 |
+
|
149 |
+
# Define a class for the intermediate layer
|
150 |
+
class Intermediate(nn.Module):
|
151 |
+
"""
|
152 |
+
The intermediate layer of the DeepUnet.
|
153 |
+
|
154 |
+
Args:
|
155 |
+
in_channels (int): Number of input channels.
|
156 |
+
out_channels (int): Number of output channels.
|
157 |
+
n_inters (int): Number of convolutional blocks in the intermediate layer.
|
158 |
+
n_blocks (int): Number of convolutional blocks in each intermediate block.
|
159 |
+
momentum (float): Momentum for batch normalization.
|
160 |
+
"""
|
161 |
+
|
162 |
+
def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
|
163 |
+
super(Intermediate, self).__init__()
|
164 |
+
self.n_inters = n_inters
|
165 |
+
self.layers = nn.ModuleList()
|
166 |
+
self.layers.append(
|
167 |
+
ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
|
168 |
+
)
|
169 |
+
for _ in range(self.n_inters - 1):
|
170 |
+
self.layers.append(
|
171 |
+
ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
|
172 |
+
)
|
173 |
+
|
174 |
+
def forward(self, x):
|
175 |
+
for i in range(self.n_inters):
|
176 |
+
x = self.layers[i](x)
|
177 |
+
return x
|
178 |
+
|
179 |
+
|
180 |
+
# Define a class for residual decoder blocks
|
181 |
+
class ResDecoderBlock(nn.Module):
|
182 |
+
"""
|
183 |
+
A residual decoder block.
|
184 |
+
|
185 |
+
Args:
|
186 |
+
in_channels (int): Number of input channels.
|
187 |
+
out_channels (int): Number of output channels.
|
188 |
+
stride (tuple): Stride for transposed convolution.
|
189 |
+
n_blocks (int): Number of convolutional blocks in the block.
|
190 |
+
momentum (float): Momentum for batch normalization.
|
191 |
+
"""
|
192 |
+
|
193 |
+
def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
|
194 |
+
super(ResDecoderBlock, self).__init__()
|
195 |
+
out_padding = (0, 1) if stride == (1, 2) else (1, 1)
|
196 |
+
self.n_blocks = n_blocks
|
197 |
+
self.conv1 = nn.Sequential(
|
198 |
+
nn.ConvTranspose2d(
|
199 |
+
in_channels=in_channels,
|
200 |
+
out_channels=out_channels,
|
201 |
+
kernel_size=(3, 3),
|
202 |
+
stride=stride,
|
203 |
+
padding=(1, 1),
|
204 |
+
output_padding=out_padding,
|
205 |
+
bias=False,
|
206 |
+
),
|
207 |
+
nn.BatchNorm2d(out_channels, momentum=momentum),
|
208 |
+
nn.ReLU(),
|
209 |
+
)
|
210 |
+
self.conv2 = nn.ModuleList()
|
211 |
+
self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
|
212 |
+
for _ in range(n_blocks - 1):
|
213 |
+
self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
|
214 |
+
|
215 |
+
def forward(self, x, concat_tensor):
|
216 |
+
x = self.conv1(x)
|
217 |
+
x = torch.cat((x, concat_tensor), dim=1)
|
218 |
+
for i in range(self.n_blocks):
|
219 |
+
x = self.conv2[i](x)
|
220 |
+
return x
|
221 |
+
|
222 |
+
|
223 |
+
# Define a class for the decoder
|
224 |
+
class Decoder(nn.Module):
|
225 |
+
"""
|
226 |
+
The decoder part of the DeepUnet.
|
227 |
+
|
228 |
+
Args:
|
229 |
+
in_channels (int): Number of input channels.
|
230 |
+
n_decoders (int): Number of decoder blocks.
|
231 |
+
stride (tuple): Stride for transposed convolution.
|
232 |
+
n_blocks (int): Number of convolutional blocks in each decoder block.
|
233 |
+
momentum (float): Momentum for batch normalization.
|
234 |
+
"""
|
235 |
+
|
236 |
+
def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
|
237 |
+
super(Decoder, self).__init__()
|
238 |
+
self.layers = nn.ModuleList()
|
239 |
+
self.n_decoders = n_decoders
|
240 |
+
for _ in range(self.n_decoders):
|
241 |
+
out_channels = in_channels // 2
|
242 |
+
self.layers.append(
|
243 |
+
ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
|
244 |
+
)
|
245 |
+
in_channels = out_channels
|
246 |
+
|
247 |
+
def forward(self, x, concat_tensors):
|
248 |
+
for i in range(self.n_decoders):
|
249 |
+
x = self.layers[i](x, concat_tensors[-1 - i])
|
250 |
+
return x
|
251 |
+
|
252 |
+
|
253 |
+
# Define a class for the DeepUnet architecture
|
254 |
+
class DeepUnet(nn.Module):
|
255 |
+
"""
|
256 |
+
The DeepUnet architecture.
|
257 |
+
|
258 |
+
Args:
|
259 |
+
kernel_size (tuple): Size of the average pooling kernel.
|
260 |
+
n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
|
261 |
+
en_de_layers (int): Number of encoder/decoder layers.
|
262 |
+
inter_layers (int): Number of convolutional blocks in the intermediate layer.
|
263 |
+
in_channels (int): Number of input channels.
|
264 |
+
en_out_channels (int): Number of output channels for the first encoder block.
|
265 |
+
"""
|
266 |
+
|
267 |
+
def __init__(
|
268 |
+
self,
|
269 |
+
kernel_size,
|
270 |
+
n_blocks,
|
271 |
+
en_de_layers=5,
|
272 |
+
inter_layers=4,
|
273 |
+
in_channels=1,
|
274 |
+
en_out_channels=16,
|
275 |
+
):
|
276 |
+
super(DeepUnet, self).__init__()
|
277 |
+
self.encoder = Encoder(
|
278 |
+
in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
|
279 |
+
)
|
280 |
+
self.intermediate = Intermediate(
|
281 |
+
self.encoder.out_channel // 2,
|
282 |
+
self.encoder.out_channel,
|
283 |
+
inter_layers,
|
284 |
+
n_blocks,
|
285 |
+
)
|
286 |
+
self.decoder = Decoder(
|
287 |
+
self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
|
288 |
+
)
|
289 |
+
|
290 |
+
def forward(self, x):
|
291 |
+
x, concat_tensors = self.encoder(x)
|
292 |
+
x = self.intermediate(x)
|
293 |
+
x = self.decoder(x, concat_tensors)
|
294 |
+
return x
|
295 |
+
|
296 |
+
|
297 |
+
# Define a class for the end-to-end model
|
298 |
+
class E2E(nn.Module):
|
299 |
+
"""
|
300 |
+
The end-to-end model.
|
301 |
+
|
302 |
+
Args:
|
303 |
+
n_blocks (int): Number of convolutional blocks in each encoder/decoder block.
|
304 |
+
n_gru (int): Number of GRU layers.
|
305 |
+
kernel_size (tuple): Size of the average pooling kernel.
|
306 |
+
en_de_layers (int): Number of encoder/decoder layers.
|
307 |
+
inter_layers (int): Number of convolutional blocks in the intermediate layer.
|
308 |
+
in_channels (int): Number of input channels.
|
309 |
+
en_out_channels (int): Number of output channels for the first encoder block.
|
310 |
+
"""
|
311 |
+
|
312 |
+
def __init__(
|
313 |
+
self,
|
314 |
+
n_blocks,
|
315 |
+
n_gru,
|
316 |
+
kernel_size,
|
317 |
+
en_de_layers=5,
|
318 |
+
inter_layers=4,
|
319 |
+
in_channels=1,
|
320 |
+
en_out_channels=16,
|
321 |
+
):
|
322 |
+
super(E2E, self).__init__()
|
323 |
+
self.unet = DeepUnet(
|
324 |
+
kernel_size,
|
325 |
+
n_blocks,
|
326 |
+
en_de_layers,
|
327 |
+
inter_layers,
|
328 |
+
in_channels,
|
329 |
+
en_out_channels,
|
330 |
+
)
|
331 |
+
self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
|
332 |
+
if n_gru:
|
333 |
+
self.fc = nn.Sequential(
|
334 |
+
BiGRU(3 * 128, 256, n_gru),
|
335 |
+
nn.Linear(512, N_CLASS),
|
336 |
+
nn.Dropout(0.25),
|
337 |
+
nn.Sigmoid(),
|
338 |
+
)
|
339 |
+
else:
|
340 |
+
self.fc = nn.Sequential(
|
341 |
+
nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid()
|
342 |
+
)
|
343 |
+
|
344 |
+
def forward(self, mel):
|
345 |
+
mel = mel.transpose(-1, -2).unsqueeze(1)
|
346 |
+
x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
|
347 |
+
x = self.fc(x)
|
348 |
+
return x
|
349 |
+
|
350 |
+
|
351 |
+
# Define a class for the MelSpectrogram extractor
|
352 |
+
class MelSpectrogram(torch.nn.Module):
|
353 |
+
"""
|
354 |
+
Extracts Mel-spectrogram features from audio.
|
355 |
+
|
356 |
+
Args:
|
357 |
+
is_half (bool): Whether to use half-precision floating-point numbers.
|
358 |
+
n_mel_channels (int): Number of Mel-frequency bands.
|
359 |
+
sample_rate (int): Sampling rate of the audio.
|
360 |
+
win_length (int): Length of the window function in samples.
|
361 |
+
hop_length (int): Hop size between frames in samples.
|
362 |
+
n_fft (int, optional): Length of the FFT window. Defaults to None, which uses win_length.
|
363 |
+
mel_fmin (int, optional): Minimum frequency for the Mel filter bank. Defaults to 0.
|
364 |
+
mel_fmax (int, optional): Maximum frequency for the Mel filter bank. Defaults to None.
|
365 |
+
clamp (float, optional): Minimum value for clamping the Mel-spectrogram. Defaults to 1e-5.
|
366 |
+
"""
|
367 |
+
|
368 |
+
def __init__(
|
369 |
+
self,
|
370 |
+
is_half,
|
371 |
+
n_mel_channels,
|
372 |
+
sample_rate,
|
373 |
+
win_length,
|
374 |
+
hop_length,
|
375 |
+
n_fft=None,
|
376 |
+
mel_fmin=0,
|
377 |
+
mel_fmax=None,
|
378 |
+
clamp=1e-5,
|
379 |
+
):
|
380 |
+
super().__init__()
|
381 |
+
n_fft = win_length if n_fft is None else n_fft
|
382 |
+
self.hann_window = {}
|
383 |
+
mel_basis = mel(
|
384 |
+
sr=sample_rate,
|
385 |
+
n_fft=n_fft,
|
386 |
+
n_mels=n_mel_channels,
|
387 |
+
fmin=mel_fmin,
|
388 |
+
fmax=mel_fmax,
|
389 |
+
htk=True,
|
390 |
+
)
|
391 |
+
mel_basis = torch.from_numpy(mel_basis).float()
|
392 |
+
self.register_buffer("mel_basis", mel_basis)
|
393 |
+
self.n_fft = win_length if n_fft is None else n_fft
|
394 |
+
self.hop_length = hop_length
|
395 |
+
self.win_length = win_length
|
396 |
+
self.sample_rate = sample_rate
|
397 |
+
self.n_mel_channels = n_mel_channels
|
398 |
+
self.clamp = clamp
|
399 |
+
self.is_half = is_half
|
400 |
+
|
401 |
+
def forward(self, audio, keyshift=0, speed=1, center=True):
|
402 |
+
factor = 2 ** (keyshift / 12)
|
403 |
+
n_fft_new = int(np.round(self.n_fft * factor))
|
404 |
+
win_length_new = int(np.round(self.win_length * factor))
|
405 |
+
hop_length_new = int(np.round(self.hop_length * speed))
|
406 |
+
keyshift_key = str(keyshift) + "_" + str(audio.device)
|
407 |
+
if keyshift_key not in self.hann_window:
|
408 |
+
self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
|
409 |
+
audio.device
|
410 |
+
)
|
411 |
+
fft = torch.stft(
|
412 |
+
audio,
|
413 |
+
n_fft=n_fft_new,
|
414 |
+
hop_length=hop_length_new,
|
415 |
+
win_length=win_length_new,
|
416 |
+
window=self.hann_window[keyshift_key],
|
417 |
+
center=center,
|
418 |
+
return_complex=True,
|
419 |
+
)
|
420 |
+
|
421 |
+
magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
|
422 |
+
if keyshift != 0:
|
423 |
+
size = self.n_fft // 2 + 1
|
424 |
+
resize = magnitude.size(1)
|
425 |
+
if resize < size:
|
426 |
+
magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
|
427 |
+
magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
|
428 |
+
mel_output = torch.matmul(self.mel_basis, magnitude)
|
429 |
+
if self.is_half:
|
430 |
+
mel_output = mel_output.half()
|
431 |
+
log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
|
432 |
+
return log_mel_spec
|
433 |
+
|
434 |
+
|
435 |
+
# Define a class for the RMVPE0 predictor
|
436 |
+
class RMVPE0Predictor:
|
437 |
+
"""
|
438 |
+
A predictor for fundamental frequency (F0) based on the RMVPE0 model.
|
439 |
+
|
440 |
+
Args:
|
441 |
+
model_path (str): Path to the RMVPE0 model file.
|
442 |
+
is_half (bool): Whether to use half-precision floating-point numbers.
|
443 |
+
device (str, optional): Device to use for computation. Defaults to None, which uses CUDA if available.
|
444 |
+
"""
|
445 |
+
|
446 |
+
def __init__(self, model_path, is_half, device=None):
|
447 |
+
self.resample_kernel = {}
|
448 |
+
model = E2E(4, 1, (2, 2))
|
449 |
+
ckpt = torch.load(model_path, map_location="cpu")
|
450 |
+
model.load_state_dict(ckpt)
|
451 |
+
model.eval()
|
452 |
+
if is_half:
|
453 |
+
model = model.half()
|
454 |
+
self.model = model
|
455 |
+
self.resample_kernel = {}
|
456 |
+
self.is_half = is_half
|
457 |
+
self.device = device
|
458 |
+
self.mel_extractor = MelSpectrogram(
|
459 |
+
is_half, N_MELS, 16000, 1024, 160, None, 30, 8000
|
460 |
+
).to(device)
|
461 |
+
self.model = self.model.to(device)
|
462 |
+
cents_mapping = 20 * np.arange(N_CLASS) + 1997.3794084376191
|
463 |
+
self.cents_mapping = np.pad(cents_mapping, (4, 4))
|
464 |
+
|
465 |
+
def mel2hidden(self, mel):
|
466 |
+
"""
|
467 |
+
Converts Mel-spectrogram features to hidden representation.
|
468 |
+
|
469 |
+
Args:
|
470 |
+
mel (torch.Tensor): Mel-spectrogram features.
|
471 |
+
"""
|
472 |
+
with torch.no_grad():
|
473 |
+
n_frames = mel.shape[-1]
|
474 |
+
mel = F.pad(
|
475 |
+
mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect"
|
476 |
+
)
|
477 |
+
hidden = self.model(mel)
|
478 |
+
return hidden[:, :n_frames]
|
479 |
+
|
480 |
+
def decode(self, hidden, thred=0.03):
|
481 |
+
"""
|
482 |
+
Decodes hidden representation to F0.
|
483 |
+
|
484 |
+
Args:
|
485 |
+
hidden (np.ndarray): Hidden representation.
|
486 |
+
thred (float, optional): Threshold for salience. Defaults to 0.03.
|
487 |
+
"""
|
488 |
+
cents_pred = self.to_local_average_cents(hidden, thred=thred)
|
489 |
+
f0 = 10 * (2 ** (cents_pred / 1200))
|
490 |
+
f0[f0 == 10] = 0
|
491 |
+
return f0
|
492 |
+
|
493 |
+
def infer_from_audio(self, audio, thred=0.03):
|
494 |
+
"""
|
495 |
+
Infers F0 from audio.
|
496 |
+
|
497 |
+
Args:
|
498 |
+
audio (np.ndarray): Audio signal.
|
499 |
+
thred (float, optional): Threshold for salience. Defaults to 0.03.
|
500 |
+
"""
|
501 |
+
audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0)
|
502 |
+
mel = self.mel_extractor(audio, center=True)
|
503 |
+
hidden = self.mel2hidden(mel)
|
504 |
+
hidden = hidden.squeeze(0).cpu().numpy()
|
505 |
+
if self.is_half:
|
506 |
+
hidden = hidden.astype("float32")
|
507 |
+
f0 = self.decode(hidden, thred=thred)
|
508 |
+
return f0
|
509 |
+
|
510 |
+
def to_local_average_cents(self, salience, thred=0.05):
|
511 |
+
"""
|
512 |
+
Converts salience to local average cents.
|
513 |
+
|
514 |
+
Args:
|
515 |
+
salience (np.ndarray): Salience values.
|
516 |
+
thred (float, optional): Threshold for salience. Defaults to 0.05.
|
517 |
+
"""
|
518 |
+
center = np.argmax(salience, axis=1)
|
519 |
+
salience = np.pad(salience, ((0, 0), (4, 4)))
|
520 |
+
center += 4
|
521 |
+
todo_salience = []
|
522 |
+
todo_cents_mapping = []
|
523 |
+
starts = center - 4
|
524 |
+
ends = center + 5
|
525 |
+
for idx in range(salience.shape[0]):
|
526 |
+
todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
|
527 |
+
todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
|
528 |
+
todo_salience = np.array(todo_salience)
|
529 |
+
todo_cents_mapping = np.array(todo_cents_mapping)
|
530 |
+
product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
|
531 |
+
weight_sum = np.sum(todo_salience, 1)
|
532 |
+
divided = product_sum / weight_sum
|
533 |
+
maxx = np.max(salience, axis=1)
|
534 |
+
divided[maxx <= thred] = 0
|
535 |
+
return divided
|
536 |
+
|
537 |
+
|
538 |
+
# Define a class for BiGRU (bidirectional GRU)
|
539 |
+
class BiGRU(nn.Module):
|
540 |
+
"""
|
541 |
+
A bidirectional GRU layer.
|
542 |
+
|
543 |
+
Args:
|
544 |
+
input_features (int): Number of input features.
|
545 |
+
hidden_features (int): Number of hidden features.
|
546 |
+
num_layers (int): Number of GRU layers.
|
547 |
+
"""
|
548 |
+
|
549 |
+
def __init__(self, input_features, hidden_features, num_layers):
|
550 |
+
super(BiGRU, self).__init__()
|
551 |
+
self.gru = nn.GRU(
|
552 |
+
input_features,
|
553 |
+
hidden_features,
|
554 |
+
num_layers=num_layers,
|
555 |
+
batch_first=True,
|
556 |
+
bidirectional=True,
|
557 |
+
)
|
558 |
+
|
559 |
+
def forward(self, x):
|
560 |
+
return self.gru(x)[0]
|
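For comparison, a sketch of driving the RMVPE0Predictor above on a 16 kHz clip; the rmvpe.pt path is an assumed local checkpoint, not something this diff downloads:

```python
# Hypothetical example: frame-level F0 with RMVPE0Predictor.
import librosa
from rvc.lib.predictors.RMVPE import RMVPE0Predictor

audio, _ = librosa.load("example.wav", sr=16000, mono=True)  # model expects 16 kHz mono

predictor = RMVPE0Predictor("rmvpe.pt", is_half=False, device="cpu")
f0 = predictor.infer_from_audio(audio, thred=0.03)  # Hz per 10 ms frame, 0 = unvoiced
print(f0.shape)
```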
rvc/lib/tools/analyzer.py
ADDED
@@ -0,0 +1,76 @@
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
import librosa.display
|
4 |
+
import librosa
|
5 |
+
|
6 |
+
|
7 |
+
def calculate_features(y, sr):
|
8 |
+
stft = np.abs(librosa.stft(y))
|
9 |
+
duration = librosa.get_duration(y=y, sr=sr)
|
10 |
+
cent = librosa.feature.spectral_centroid(S=stft, sr=sr)[0]
|
11 |
+
bw = librosa.feature.spectral_bandwidth(S=stft, sr=sr)[0]
|
12 |
+
rolloff = librosa.feature.spectral_rolloff(S=stft, sr=sr)[0]
|
13 |
+
return stft, duration, cent, bw, rolloff
|
14 |
+
|
15 |
+
|
16 |
+
def plot_title(title):
|
17 |
+
plt.suptitle(title, fontsize=16, fontweight="bold")
|
18 |
+
|
19 |
+
|
20 |
+
def plot_spectrogram(y, sr, stft, duration, cmap="inferno"):
|
21 |
+
plt.subplot(3, 1, 1)
|
22 |
+
plt.imshow(
|
23 |
+
librosa.amplitude_to_db(stft, ref=np.max),
|
24 |
+
origin="lower",
|
25 |
+
extent=[0, duration, 0, sr / 1000],
|
26 |
+
aspect="auto",
|
27 |
+
cmap=cmap, # Change the colormap here
|
28 |
+
)
|
29 |
+
plt.colorbar(format="%+2.0f dB")
|
30 |
+
plt.xlabel("Time (s)")
|
31 |
+
plt.ylabel("Frequency (kHz)")
|
32 |
+
plt.title("Spectrogram")
|
33 |
+
|
34 |
+
|
35 |
+
def plot_waveform(y, sr, duration):
|
36 |
+
plt.subplot(3, 1, 2)
|
37 |
+
librosa.display.waveshow(y, sr=sr)
|
38 |
+
plt.xlabel("Time (s)")
|
39 |
+
plt.ylabel("Amplitude")
|
40 |
+
plt.title("Waveform")
|
41 |
+
|
42 |
+
|
43 |
+
def plot_features(times, cent, bw, rolloff, duration):
|
44 |
+
plt.subplot(3, 1, 3)
|
45 |
+
plt.plot(times, cent, label="Spectral Centroid (kHz)", color="b")
|
46 |
+
plt.plot(times, bw, label="Spectral Bandwidth (kHz)", color="g")
|
47 |
+
plt.plot(times, rolloff, label="Spectral Rolloff (kHz)", color="r")
|
48 |
+
plt.xlabel("Time (s)")
|
49 |
+
plt.title("Spectral Features")
|
50 |
+
plt.legend()
|
51 |
+
|
52 |
+
|
53 |
+
def analyze_audio(audio_file, save_plot_path="logs/audio_analysis.png"):
|
54 |
+
y, sr = librosa.load(audio_file)
|
55 |
+
stft, duration, cent, bw, rolloff = calculate_features(y, sr)
|
56 |
+
|
57 |
+
plt.figure(figsize=(12, 10))
|
58 |
+
|
59 |
+
plot_title("Audio Analysis" + " - " + audio_file.split("/")[-1])
|
60 |
+
plot_spectrogram(y, sr, stft, duration)
|
61 |
+
plot_waveform(y, sr, duration)
|
62 |
+
plot_features(librosa.times_like(cent), cent, bw, rolloff, duration)
|
63 |
+
|
64 |
+
plt.tight_layout()
|
65 |
+
|
66 |
+
if save_plot_path:
|
67 |
+
plt.savefig(save_plot_path, bbox_inches="tight", dpi=300)
|
68 |
+
plt.close()
|
69 |
+
|
70 |
+
audio_info = f"""Sample Rate: {sr}\nDuration: {(
|
71 |
+
str(round(duration, 2)) + " seconds"
|
72 |
+
if duration < 60
|
73 |
+
else str(round(duration / 60, 2)) + " minutes"
|
74 |
+
)}\nNumber of Samples: {len(y)}\nNative Sample Rate: {librosa.get_samplerate(audio_file)}\nChannels: {"Mono (1)" if y.ndim == 1 else "Stereo (2)"}"""
|
75 |
+
|
76 |
+
return audio_info, save_plot_path
|
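A short sketch of calling analyze_audio; the input filename is a placeholder, and the logs directory is created up front because the default plot path points inside it:

```python
# Hypothetical example: one-shot audio analysis report + plot.
import os
from rvc.lib.tools.analyzer import analyze_audio

os.makedirs("logs", exist_ok=True)  # default save_plot_path is logs/audio_analysis.png
info, plot_path = analyze_audio("example.wav")
print(info)
print("Plot written to:", plot_path)
```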
rvc/lib/tools/gdown.py
ADDED
@@ -0,0 +1,354 @@
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import six
|
4 |
+
import sys
|
5 |
+
import json
|
6 |
+
import tqdm
|
7 |
+
import time
|
8 |
+
import shutil
|
9 |
+
import warnings
|
10 |
+
import tempfile
|
11 |
+
import textwrap
|
12 |
+
import requests
|
13 |
+
from six.moves import urllib_parse
|
14 |
+
|
15 |
+
|
16 |
+
def indent(text, prefix):
|
17 |
+
"""Indent each non-empty line of text with the given prefix."""
|
18 |
+
return "".join(
|
19 |
+
(prefix + line if line.strip() else line) for line in text.splitlines(True)
|
20 |
+
)
|
21 |
+
|
22 |
+
|
23 |
+
class FileURLRetrievalError(Exception):
|
24 |
+
pass
|
25 |
+
|
26 |
+
|
27 |
+
class FolderContentsMaximumLimitError(Exception):
|
28 |
+
pass
|
29 |
+
|
30 |
+
|
31 |
+
def parse_url(url, warning=True):
|
32 |
+
"""Parse URLs especially for Google Drive links.
|
33 |
+
|
34 |
+
Args:
|
35 |
+
url: URL to parse.
|
36 |
+
warning: Whether to warn if the URL is not a download link.
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
A tuple (file_id, is_download_link), where file_id is the ID of the
|
40 |
+
file on Google Drive, and is_download_link is a flag indicating
|
41 |
+
whether the URL is a download link.
|
42 |
+
"""
|
43 |
+
parsed = urllib_parse.urlparse(url)
|
44 |
+
query = urllib_parse.parse_qs(parsed.query)
|
45 |
+
is_gdrive = parsed.hostname in ("drive.google.com", "docs.google.com")
|
46 |
+
is_download_link = parsed.path.endswith("/uc")
|
47 |
+
|
48 |
+
if not is_gdrive:
|
49 |
+
return None, is_download_link
|
50 |
+
|
51 |
+
file_id = query.get("id", [None])[0]
|
52 |
+
if file_id is None:
|
53 |
+
for pattern in (
|
54 |
+
r"^/file/d/(.*?)/(edit|view)$",
|
55 |
+
r"^/file/u/[0-9]+/d/(.*?)/(edit|view)$",
|
56 |
+
r"^/document/d/(.*?)/(edit|htmlview|view)$",
|
57 |
+
r"^/document/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
|
58 |
+
r"^/presentation/d/(.*?)/(edit|htmlview|view)$",
|
59 |
+
r"^/presentation/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
|
60 |
+
r"^/spreadsheets/d/(.*?)/(edit|htmlview|view)$",
|
61 |
+
r"^/spreadsheets/u/[0-9]+/d/(.*?)/(edit|htmlview|view)$",
|
62 |
+
):
|
63 |
+
match = re.match(pattern, parsed.path)
|
64 |
+
if match:
|
65 |
+
file_id = match.group(1)
|
66 |
+
break
|
67 |
+
|
68 |
+
if warning and not is_download_link:
|
69 |
+
warnings.warn(
|
70 |
+
"You specified a Google Drive link that is not the correct link "
|
71 |
+
"to download a file. You might want to try `--fuzzy` option "
|
72 |
+
f"or the following url: https://drive.google.com/uc?id={file_id}"
|
73 |
+
)
|
74 |
+
|
75 |
+
return file_id, is_download_link
|
76 |
+
|
77 |
+
|
78 |
+
CHUNK_SIZE = 512 * 1024 # 512KB
|
79 |
+
HOME = os.path.expanduser("~")
|
80 |
+
|
81 |
+
|
82 |
+
def get_url_from_gdrive_confirmation(contents):
|
83 |
+
"""Extract the download URL from a Google Drive confirmation page."""
|
84 |
+
for pattern in (
|
85 |
+
r'href="(\/uc\?export=download[^"]+)',
|
86 |
+
r'href="/open\?id=([^"]+)"',
|
87 |
+
r'"downloadUrl":"([^"]+)',
|
88 |
+
):
|
89 |
+
match = re.search(pattern, contents)
|
90 |
+
if match:
|
91 |
+
url = match.group(1)
|
92 |
+
if pattern == r'href="/open\?id=([^"]+)"':
|
93 |
+
uuid = re.search(
|
94 |
+
r'<input\s+type="hidden"\s+name="uuid"\s+value="([^"]+)"',
|
95 |
+
contents,
|
96 |
+
).group(1)
|
97 |
+
url = (
|
98 |
+
"https://drive.usercontent.google.com/download?id="
|
99 |
+
+ url
|
100 |
+
+ "&confirm=t&uuid="
|
101 |
+
+ uuid
|
102 |
+
)
|
103 |
+
elif pattern == r'"downloadUrl":"([^"]+)':
|
104 |
+
url = url.replace("\\u003d", "=").replace("\\u0026", "&")
|
105 |
+
else:
|
106 |
+
url = "https://docs.google.com" + url.replace("&", "&")
|
107 |
+
return url
|
108 |
+
|
109 |
+
match = re.search(r'<p class="uc-error-subcaption">(.*)</p>', contents)
|
110 |
+
if match:
|
111 |
+
error = match.group(1)
|
112 |
+
raise FileURLRetrievalError(error)
|
113 |
+
|
114 |
+
raise FileURLRetrievalError(
|
115 |
+
"Cannot retrieve the public link of the file. "
|
116 |
+
"You may need to change the permission to "
|
117 |
+
"'Anyone with the link', or have had many accesses."
|
118 |
+
)
|
119 |
+
|
120 |
+
|
121 |
+
def _get_session(proxy, use_cookies, return_cookies_file=False):
|
122 |
+
"""Create a requests session with optional proxy and cookie handling."""
|
123 |
+
sess = requests.session()
|
124 |
+
sess.headers.update(
|
125 |
+
{"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)"}
|
126 |
+
)
|
127 |
+
|
128 |
+
if proxy is not None:
|
129 |
+
sess.proxies = {"http": proxy, "https": proxy}
|
130 |
+
print("Using proxy:", proxy, file=sys.stderr)
|
131 |
+
|
132 |
+
cookies_file = os.path.join(HOME, ".cache/gdown/cookies.json")
|
133 |
+
if os.path.exists(cookies_file) and use_cookies:
|
134 |
+
with open(cookies_file) as f:
|
135 |
+
cookies = json.load(f)
|
136 |
+
for k, v in cookies:
|
137 |
+
sess.cookies[k] = v
|
138 |
+
|
139 |
+
return (sess, cookies_file) if return_cookies_file else sess
|
140 |
+
|
141 |
+
|
142 |
+
def download(
|
143 |
+
url=None,
|
144 |
+
output=None,
|
145 |
+
quiet=False,
|
146 |
+
proxy=None,
|
147 |
+
speed=None,
|
148 |
+
use_cookies=True,
|
149 |
+
verify=True,
|
150 |
+
id=None,
|
151 |
+
fuzzy=True,
|
152 |
+
resume=False,
|
153 |
+
format=None,
|
154 |
+
):
|
155 |
+
"""Download file from URL.
|
156 |
+
|
157 |
+
Parameters
|
158 |
+
----------
|
159 |
+
url: str
|
160 |
+
URL. Google Drive URL is also supported.
|
161 |
+
output: str
|
162 |
+
Output filename. Default is basename of URL.
|
163 |
+
quiet: bool
|
164 |
+
Suppress terminal output. Default is False.
|
165 |
+
proxy: str
|
166 |
+
Proxy.
|
167 |
+
speed: float
|
168 |
+
Download byte size per second (e.g., 256KB/s = 256 * 1024).
|
169 |
+
use_cookies: bool
|
170 |
+
Flag to use cookies. Default is True.
|
171 |
+
verify: bool or string
|
172 |
+
Either a bool, in which case it controls whether the server's TLS
|
173 |
+
certificate is verified, or a string, in which case it must be a path
|
174 |
+
to a CA bundle to use. Default is True.
|
175 |
+
id: str
|
176 |
+
Google Drive's file ID.
|
177 |
+
fuzzy: bool
|
178 |
+
Fuzzy extraction of Google Drive's file Id. Default is False.
|
179 |
+
resume: bool
|
180 |
+
Resume the download from existing tmp file if possible.
|
181 |
+
Default is False.
|
182 |
+
format: str, optional
|
183 |
+
Format of Google Docs, Spreadsheets and Slides. Default is:
|
184 |
+
- Google Docs: 'docx'
|
185 |
+
- Google Spreadsheet: 'xlsx'
|
186 |
+
- Google Slides: 'pptx'
|
187 |
+
|
188 |
+
Returns
|
189 |
+
-------
|
190 |
+
output: str
|
191 |
+
Output filename.
|
192 |
+
"""
|
193 |
+
if not (id is None) ^ (url is None):
|
194 |
+
raise ValueError("Either url or id has to be specified")
|
195 |
+
if id is not None:
|
196 |
+
url = f"https://drive.google.com/uc?id={id}"
|
197 |
+
|
198 |
+
url_origin = url
|
199 |
+
|
200 |
+
sess, cookies_file = _get_session(
|
201 |
+
proxy=proxy, use_cookies=use_cookies, return_cookies_file=True
|
202 |
+
)
|
203 |
+
|
204 |
+
gdrive_file_id, is_gdrive_download_link = parse_url(url, warning=not fuzzy)
|
205 |
+
|
206 |
+
if fuzzy and gdrive_file_id:
|
207 |
+
# overwrite the url with fuzzy match of a file id
|
208 |
+
url = f"https://drive.google.com/uc?id={gdrive_file_id}"
|
209 |
+
url_origin = url
|
210 |
+
is_gdrive_download_link = True
|
211 |
+
|
212 |
+
while True:
|
213 |
+
res = sess.get(url, stream=True, verify=verify)
|
214 |
+
|
215 |
+
if url == url_origin and res.status_code == 500:
|
216 |
+
# The file could be Google Docs or Spreadsheets.
|
217 |
+
url = f"https://drive.google.com/open?id={gdrive_file_id}"
|
218 |
+
continue
|
219 |
+
|
220 |
+
if res.headers["Content-Type"].startswith("text/html"):
|
221 |
+
title = re.search("<title>(.+)</title>", res.text)
|
222 |
+
if title:
|
223 |
+
title = title.group(1)
|
224 |
+
if title.endswith(" - Google Docs"):
|
225 |
+
url = f"https://docs.google.com/document/d/{gdrive_file_id}/export?format={'docx' if format is None else format}"
|
226 |
+
continue
|
227 |
+
if title.endswith(" - Google Sheets"):
|
228 |
+
url = f"https://docs.google.com/spreadsheets/d/{gdrive_file_id}/export?format={'xlsx' if format is None else format}"
|
229 |
+
continue
|
230 |
+
if title.endswith(" - Google Slides"):
|
231 |
+
url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
|
232 |
+
continue
|
233 |
+
elif (
|
234 |
+
"Content-Disposition" in res.headers
|
235 |
+
and res.headers["Content-Disposition"].endswith("pptx")
|
236 |
+
and format not in (None, "pptx")
|
237 |
+
):
|
238 |
+
url = f"https://docs.google.com/presentation/d/{gdrive_file_id}/export?format={'pptx' if format is None else format}"
|
239 |
+
continue
|
240 |
+
|
241 |
+
if use_cookies:
|
242 |
+
os.makedirs(os.path.dirname(cookies_file), exist_ok=True)
|
243 |
+
with open(cookies_file, "w") as f:
|
244 |
+
cookies = [
|
245 |
+
(k, v)
|
246 |
+
for k, v in sess.cookies.items()
|
247 |
+
if not k.startswith("download_warning_")
|
248 |
+
]
|
249 |
+
json.dump(cookies, f, indent=2)
|
250 |
+
|
251 |
+
if "Content-Disposition" in res.headers:
|
252 |
+
# This is the file
|
253 |
+
break
|
254 |
+
if not (gdrive_file_id and is_gdrive_download_link):
|
255 |
+
break
|
256 |
+
|
257 |
+
# Need to redirect with confirmation
|
258 |
+
try:
|
259 |
+
url = get_url_from_gdrive_confirmation(res.text)
|
260 |
+
except FileURLRetrievalError as e:
|
261 |
+
message = (
|
262 |
+
"Failed to retrieve file url:\n\n"
|
263 |
+
"{}\n\n"
|
264 |
+
"You may still be able to access the file from the browser:"
|
265 |
+
f"\n\n\t{url_origin}\n\n"
|
266 |
+
"but Gdown can't. Please check connections and permissions."
|
267 |
+
).format(indent("\n".join(textwrap.wrap(str(e))), prefix="\t"))
|
268 |
+
raise FileURLRetrievalError(message)
|
269 |
+
|
270 |
+
if gdrive_file_id and is_gdrive_download_link:
|
271 |
+
content_disposition = urllib_parse.unquote(res.headers["Content-Disposition"])
|
272 |
+
filename_from_url = (
|
273 |
+
re.search(r"filename\*=UTF-8''(.*)", content_disposition)
|
274 |
+
or re.search(r'filename=["\']?(.*?)["\']?$', content_disposition)
|
275 |
+
).group(1)
|
276 |
+
filename_from_url = filename_from_url.replace(os.path.sep, "_")
|
277 |
+
else:
|
278 |
+
filename_from_url = os.path.basename(url)
|
279 |
+
|
280 |
+
output = output or filename_from_url
|
281 |
+
|
282 |
+
output_is_path = isinstance(output, six.string_types)
|
283 |
+
if output_is_path and output.endswith(os.path.sep):
|
284 |
+
os.makedirs(output, exist_ok=True)
|
285 |
+
output = os.path.join(output, filename_from_url)
|
286 |
+
|
287 |
+
if output_is_path:
|
288 |
+
temp_dir = os.path.dirname(output) or "."
|
289 |
+
prefix = os.path.basename(output)
|
290 |
+
existing_tmp_files = [
|
291 |
+
os.path.join(temp_dir, file)
|
292 |
+
for file in os.listdir(temp_dir)
|
293 |
+
if file.startswith(prefix)
|
294 |
+
]
|
295 |
+
if resume and existing_tmp_files:
|
296 |
+
if len(existing_tmp_files) > 1:
|
297 |
+
print(
|
298 |
+
"There are multiple temporary files to resume:",
|
299 |
+
file=sys.stderr,
|
300 |
+
)
|
301 |
+
for file in existing_tmp_files:
|
302 |
+
print(f"\t{file}", file=sys.stderr)
|
303 |
+
print(
|
304 |
+
"Please remove them except one to resume downloading.",
|
305 |
+
file=sys.stderr,
|
306 |
+
)
|
307 |
+
return
|
308 |
+
tmp_file = existing_tmp_files[0]
|
309 |
+
else:
|
310 |
+
resume = False
|
311 |
+
tmp_file = tempfile.mktemp(
|
312 |
+
suffix=tempfile.template, prefix=prefix, dir=temp_dir
|
313 |
+
)
|
314 |
+
f = open(tmp_file, "ab")
|
315 |
+
else:
|
316 |
+
tmp_file = None
|
317 |
+
f = output
|
318 |
+
|
319 |
+
if tmp_file is not None and f.tell() != 0:
|
320 |
+
headers = {"Range": f"bytes={f.tell()}-"}
|
321 |
+
res = sess.get(url, headers=headers, stream=True, verify=verify)
|
322 |
+
|
323 |
+
if not quiet:
|
324 |
+
if resume:
|
325 |
+
print("Resume:", tmp_file, file=sys.stderr)
|
326 |
+
print(
|
327 |
+
"To:",
|
328 |
+
os.path.abspath(output) if output_is_path else output,
|
329 |
+
file=sys.stderr,
|
330 |
+
)
|
331 |
+
|
332 |
+
try:
|
333 |
+
total = int(res.headers.get("Content-Length", 0))
|
334 |
+
if not quiet:
|
335 |
+
pbar = tqdm.tqdm(total=total, unit="B", unit_scale=True)
|
336 |
+
t_start = time.time()
|
337 |
+
for chunk in res.iter_content(chunk_size=CHUNK_SIZE):
|
338 |
+
f.write(chunk)
|
339 |
+
if not quiet:
|
340 |
+
pbar.update(len(chunk))
|
341 |
+
if speed is not None:
|
342 |
+
elapsed_time_expected = 1.0 * pbar.n / speed
|
343 |
+
elapsed_time = time.time() - t_start
|
344 |
+
if elapsed_time < elapsed_time_expected:
|
345 |
+
time.sleep(elapsed_time_expected - elapsed_time)
|
346 |
+
if not quiet:
|
347 |
+
pbar.close()
|
348 |
+
if tmp_file:
|
349 |
+
f.close()
|
350 |
+
shutil.move(tmp_file, output)
|
351 |
+
finally:
|
352 |
+
sess.close()
|
353 |
+
|
354 |
+
return output
|
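Sketch of the bundled gdown helper; the Drive URL below uses a FILE_ID placeholder and the downloads directory is an assumption:

```python
# Hypothetical example: fetch a shared Google Drive file with the vendored gdown.
import os
from rvc.lib.tools import gdown

output_path = gdown.download(
    url="https://drive.google.com/file/d/FILE_ID/view?usp=sharing",  # FILE_ID is a placeholder
    output=os.path.join("downloads", ""),  # trailing separator keeps the remote filename
    quiet=False,
    fuzzy=True,  # extract the file id from a share link automatically
)
print(output_path)
```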
rvc/lib/tools/launch_tensorboard.py
ADDED
@@ -0,0 +1,21 @@
1 |
+
import time
|
2 |
+
import logging
|
3 |
+
from tensorboard import program
|
4 |
+
|
5 |
+
log_path = "logs"
|
6 |
+
|
7 |
+
|
8 |
+
def launch_tensorboard_pipeline():
|
9 |
+
logging.getLogger("root").setLevel(logging.WARNING)
|
10 |
+
logging.getLogger("tensorboard").setLevel(logging.WARNING)
|
11 |
+
|
12 |
+
tb = program.TensorBoard()
|
13 |
+
tb.configure(argv=[None, "--logdir", log_path])
|
14 |
+
url = tb.launch()
|
15 |
+
|
16 |
+
print(
|
17 |
+
f"Access the tensorboard using the following link:\n{url}?pinnedCards=%5B%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fd%2Ftotal%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fkl%22%7D%2C%7B%22plugin%22%3A%22scalars%22%2C%22tag%22%3A%22loss%2Fg%2Fmel%22%7D%5D"
|
18 |
+
)
|
19 |
+
|
20 |
+
while True:
|
21 |
+
time.sleep(600)
|
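Since launch_tensorboard_pipeline() blocks in its sleep loop, a caller that needs to keep working would typically run it on a daemon thread; a small sketch (not something this diff itself does):

import threading
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline

# Keep the blocking sleep loop off the main thread.
threading.Thread(target=launch_tensorboard_pipeline, daemon=True).start()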
rvc/lib/tools/model_download.py
ADDED
@@ -0,0 +1,385 @@
import os
import re
import six
import sys
import wget
import shutil
import zipfile
import requests
from bs4 import BeautifulSoup
from urllib.parse import unquote, urlencode, parse_qs, urlparse

now_dir = os.getcwd()
sys.path.append(now_dir)

from rvc.lib.utils import format_title
from rvc.lib.tools import gdown


def find_folder_parent(search_dir, folder_name):
    for dirpath, dirnames, _ in os.walk(search_dir):
        if folder_name in dirnames:
            return os.path.abspath(dirpath)
    return None


file_path = find_folder_parent(now_dir, "logs")
zips_path = os.path.join(file_path, "zips")


def search_pth_index(folder):
    pth_paths = [
        os.path.join(folder, file)
        for file in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, file)) and file.endswith(".pth")
    ]
    index_paths = [
        os.path.join(folder, file)
        for file in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, file)) and file.endswith(".index")
    ]

    return pth_paths, index_paths


def get_mediafire_download_link(url):
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    download_button = soup.find(
        "a", {"class": "input popsok", "aria-label": "Download file"}
    )
    if download_button:
        download_link = download_button.get("href")
        return download_link
    else:
        return None


def download_from_url(url):
    os.makedirs(zips_path, exist_ok=True)
    if url != "":
        if "drive.google.com" in url:
            if "file/d/" in url:
                file_id = url.split("file/d/")[1].split("/")[0]
            elif "id=" in url:
                file_id = url.split("id=")[1].split("&")[0]
            else:
                return None

            if file_id:
                os.chdir(zips_path)
                try:
                    gdown.download(
                        f"https://drive.google.com/uc?id={file_id}",
                        quiet=True,
                        fuzzy=True,
                    )
                except Exception as error:
                    error_message = str(
                        f"An error occurred downloading the file: {error}"
                    )
                    if (
                        "Too many users have viewed or downloaded this file recently"
                        in error_message
                    ):
                        os.chdir(now_dir)
                        return "too much use"
                    elif (
                        "Cannot retrieve the public link of the file." in error_message
                    ):
                        os.chdir(now_dir)
                        return "private link"
                    else:
                        print(error_message)
                        os.chdir(now_dir)
                        return None
        elif "disk.yandex.ru" in url:
            base_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
            public_key = url
            final_url = base_url + urlencode(dict(public_key=public_key))
            response = requests.get(final_url)
            download_url = response.json()["href"]
            download_response = requests.get(download_url)

            if download_response.status_code == 200:
                filename = parse_qs(urlparse(unquote(download_url)).query).get(
                    "filename", [""]
                )[0]
                if filename:
                    os.chdir(zips_path)
                    with open(filename, "wb") as f:
                        f.write(download_response.content)
            else:
                print("Failed to get filename from URL.")
                return None

        elif "pixeldrain.com" in url:
            try:
                file_id = url.split("pixeldrain.com/u/")[1]
                os.chdir(zips_path)
                print(file_id)
                response = requests.get(f"https://pixeldrain.com/api/file/{file_id}")
                if response.status_code == 200:
                    file_name = (
                        response.headers.get("Content-Disposition")
                        .split("filename=")[-1]
                        .strip('";')
                    )
                    os.makedirs(zips_path, exist_ok=True)
                    with open(os.path.join(zips_path, file_name), "wb") as newfile:
                        newfile.write(response.content)
                    os.chdir(file_path)
                    return "downloaded"
                else:
                    os.chdir(file_path)
                    return None
            except Exception as error:
                print(f"An error occurred downloading the file: {error}")
                os.chdir(file_path)
                return None

        elif "cdn.discordapp.com" in url:
            file = requests.get(url)
            os.chdir(zips_path)
            if file.status_code == 200:
                name = url.split("/")
                with open(os.path.join(name[-1]), "wb") as newfile:
                    newfile.write(file.content)
            else:
                return None
        elif "/blob/" in url or "/resolve/" in url:
            os.chdir(zips_path)
            if "/blob/" in url:
                url = url.replace("/blob/", "/resolve/")

            response = requests.get(url, stream=True)
            if response.status_code == 200:
                content_disposition = six.moves.urllib_parse.unquote(
                    response.headers["Content-Disposition"]
                )
                m = re.search(r'filename="([^"]+)"', content_disposition)
                file_name = m.groups()[0]
                file_name = file_name.replace(os.path.sep, "_")
                total_size_in_bytes = int(response.headers.get("content-length", 0))
                block_size = 1024
                progress_bar_length = 50
                progress = 0

                with open(os.path.join(zips_path, file_name), "wb") as file:
                    for data in response.iter_content(block_size):
                        file.write(data)
                        progress += len(data)
                        progress_percent = int((progress / total_size_in_bytes) * 100)
                        num_dots = int(
                            (progress / total_size_in_bytes) * progress_bar_length
                        )
                        progress_bar = (
                            "["
                            + "." * num_dots
                            + " " * (progress_bar_length - num_dots)
                            + "]"
                        )
                        print(
                            f"{progress_percent}% {progress_bar} {progress}/{total_size_in_bytes} ",
                            end="\r",
                        )
                        if progress_percent == 100:
                            print("\n")

            else:
                os.chdir(now_dir)
                return None
        elif "/tree/main" in url:
            os.chdir(zips_path)
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")
            temp_url = ""
            for link in soup.find_all("a", href=True):
                if link["href"].endswith(".zip"):
                    temp_url = link["href"]
                    break
            if temp_url:
                url = temp_url
                url = url.replace("blob", "resolve")
                if "huggingface.co" not in url:
                    url = "https://huggingface.co" + url

                wget.download(url)
            else:
                os.chdir(now_dir)
                return None
        elif "applio.org" in url:
            parts = url.split("/")
            id_with_query = parts[-1]
            id_parts = id_with_query.split("?")
            id_number = id_parts[0]

            url = "https://cjtfqzjfdimgpvpwhzlv.supabase.co/rest/v1/models"
            headers = {
                "apikey": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImNqdGZxempmZGltZ3B2cHdoemx2Iiwicm9sZSI6ImFub24iLCJpYXQiOjE2OTUxNjczODgsImV4cCI6MjAxMDc0MzM4OH0.7z5WMIbjR99c2Ooc0ma7B_FyGq10G8X-alkCYTkKR10"
            }

            params = {"id": f"eq.{id_number}"}
            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                json_response = response.json()
                print(json_response)
                if json_response:
                    link = json_response[0]["link"]
                    verify = download_from_url(link)
                    if verify == "downloaded":
                        return "downloaded"
                    else:
                        return None
            else:
                return None
        else:
            try:
                os.chdir(zips_path)
                wget.download(url)
            except Exception as error:
                os.chdir(now_dir)
                print(f"An error occurred downloading the file: {error}")
                return None

        for currentPath, _, zipFiles in os.walk(zips_path):
            for Files in zipFiles:
                filePart = Files.split(".")
                extensionFile = filePart[len(filePart) - 1]
                filePart.pop()
                nameFile = "_".join(filePart)
                realPath = os.path.join(currentPath, Files)
                os.rename(realPath, nameFile + "." + extensionFile)

        os.chdir(now_dir)
        return "downloaded"

    os.chdir(now_dir)
    return None


def extract_and_show_progress(zipfile_path, unzips_path):
    try:
        with zipfile.ZipFile(zipfile_path, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                zip_ref.extract(file_info, unzips_path)
        os.remove(zipfile_path)
        return True
    except Exception as error:
        print(f"An error occurred extracting the zip file: {error}")
        return False


def unzip_file(zip_path, zip_file_name):
    zip_file_path = os.path.join(zip_path, zip_file_name + ".zip")
    extract_path = os.path.join(file_path, zip_file_name)
    with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
        zip_ref.extractall(extract_path)
    os.remove(zip_file_path)


def model_download_pipeline(url: str):
    try:
        verify = download_from_url(url)
        if verify == "downloaded":
            extract_folder_path = ""
            for filename in os.listdir(zips_path):
                if filename.endswith(".zip"):
                    zipfile_path = os.path.join(zips_path, filename)
                    print("Proceeding with the extraction...")

                    model_zip = os.path.basename(zipfile_path)
                    model_name = format_title(model_zip.split(".zip")[0])
                    extract_folder_path = os.path.join(
                        "logs",
                        os.path.normpath(model_name),
                    )
                    success = extract_and_show_progress(
                        zipfile_path, extract_folder_path
                    )

                    macosx_path = os.path.join(extract_folder_path, "__MACOSX")
                    if os.path.exists(macosx_path):
                        shutil.rmtree(macosx_path)

                    subfolders = [
                        f
                        for f in os.listdir(extract_folder_path)
                        if os.path.isdir(os.path.join(extract_folder_path, f))
                    ]
                    if len(subfolders) == 1:
                        subfolder_path = os.path.join(
                            extract_folder_path, subfolders[0]
                        )
                        for item in os.listdir(subfolder_path):
                            s = os.path.join(subfolder_path, item)
                            d = os.path.join(extract_folder_path, item)
                            shutil.move(s, d)
                        os.rmdir(subfolder_path)

                    for item in os.listdir(extract_folder_path):
                        if ".pth" in item:
                            file_name = item.split(".pth")[0]
                            if file_name != model_name:
                                os.rename(
                                    os.path.join(extract_folder_path, item),
                                    os.path.join(
                                        extract_folder_path, model_name + ".pth"
                                    ),
                                )
                        else:
                            if "v2" not in item:
                                if "_nprobe_1_" in item and "_v1" in item:
                                    file_name = item.split("_nprobe_1_")[1].split(
                                        "_v1"
                                    )[0]
                                    if file_name != model_name:
                                        new_file_name = (
                                            item.split("_nprobe_1_")[0]
                                            + "_nprobe_1_"
                                            + model_name
                                            + "_v1"
                                        )
                                        os.rename(
                                            os.path.join(extract_folder_path, item),
                                            os.path.join(
                                                extract_folder_path,
                                                new_file_name + ".index",
                                            ),
                                        )
                            else:
                                if "_nprobe_1_" in item and "_v2" in item:
                                    file_name = item.split("_nprobe_1_")[1].split(
                                        "_v2"
                                    )[0]
                                    if file_name != model_name:
                                        new_file_name = (
                                            item.split("_nprobe_1_")[0]
                                            + "_nprobe_1_"
                                            + model_name
                                            + "_v2"
                                        )
                                        os.rename(
                                            os.path.join(extract_folder_path, item),
                                            os.path.join(
                                                extract_folder_path,
                                                new_file_name + ".index",
                                            ),
                                        )

                    if success:
                        print(f"Model {model_name} downloaded!")
                    else:
                        print(f"Error downloading {model_name}")
                        return "Error"
            if extract_folder_path == "":
                print("Zip file was not found.")
                return "Error"
            result = search_pth_index(extract_folder_path)
            return result
        else:
            return "Error"
    except Exception as error:
        print(f"An unexpected error occurred: {error}")
        return "Error"
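On success, model_download_pipeline() ends by returning the (pth_paths, index_paths) tuple from search_pth_index, and the string "Error" otherwise. A minimal caller sketch, with a placeholder URL:

from rvc.lib.tools.model_download import model_download_pipeline

result = model_download_pipeline("https://example.com/some_model.zip")  # placeholder URL
if result == "Error":
    print("Download or extraction failed.")
else:
    pth_paths, index_paths = result
    print("Weights:", pth_paths, "Indexes:", index_paths)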
rvc/lib/tools/prerequisites_download.py
ADDED
@@ -0,0 +1,104 @@
import os
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import requests

url_base = "https://huggingface.co/IAHispano/Applio/resolve/main/Resources"

# Define the file lists
models_list = [("predictors/", ["rmvpe.pt", "fcpe.pt"])]
embedders_list = [("embedders/contentvec/", ["pytorch_model.bin", "config.json"])]
executables_list = [
    ("", ["ffmpeg.exe", "ffprobe.exe"]),
]

folder_mapping_list = {
    "embedders/contentvec/": "rvc/models/embedders/contentvec/",
    "predictors/": "rvc/models/predictors/",
    "formant/": "rvc/models/formant/",
}


def get_file_size_all(file_list):
    """
    Calculate the total size of files to be downloaded, regardless of local existence.
    """
    total_size = 0
    for remote_folder, files in file_list:
        # Use the mapping if available; otherwise, use an empty local folder
        local_folder = folder_mapping_list.get(remote_folder, "")
        for file in files:
            url = f"{url_base}/{remote_folder}{file}"
            response = requests.head(url)
            total_size += int(response.headers.get("content-length", 0))
    return total_size


def download_file(url, destination_path, global_bar):
    """
    Download a file from the given URL to the specified destination path,
    updating the global progress bar as data is downloaded.
    """
    dir_name = os.path.dirname(destination_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    response = requests.get(url, stream=True)
    block_size = 1024
    with open(destination_path, "wb") as file:
        for data in response.iter_content(block_size):
            file.write(data)
            global_bar.update(len(data))


def download_mapping_files(file_mapping_list, global_bar):
    """
    Download all files in the provided file mapping list using a thread pool executor,
    and update the global progress bar as downloads progress.
    This version downloads all files regardless of whether they already exist.
    """
    with ThreadPoolExecutor() as executor:
        futures = []
        for remote_folder, file_list in file_mapping_list:
            local_folder = folder_mapping_list.get(remote_folder, "")
            for file in file_list:
                destination_path = os.path.join(local_folder, file)
                url = f"{url_base}/{remote_folder}{file}"
                futures.append(
                    executor.submit(download_file, url, destination_path, global_bar)
                )
        for future in futures:
            future.result()


def calculate_total_size(models, exe):
    """
    Calculate the total size of all files to be downloaded based on selected categories.
    """
    total_size = 0
    if models:
        total_size += get_file_size_all(models_list)
        total_size += get_file_size_all(embedders_list)
    if exe and os.name == "nt":
        total_size += get_file_size_all(executables_list)
    return total_size


def prerequisites_download_pipeline(models, exe):
    """
    Manage the download pipeline for different categories of files.
    """
    total_size = calculate_total_size(models, exe)
    if total_size > 0:
        with tqdm(
            total=total_size, unit="iB", unit_scale=True, desc="Downloading all files"
        ) as global_bar:
            if models:
                download_mapping_files(models_list, global_bar)
                download_mapping_files(embedders_list, global_bar)
            if exe:
                if os.name == "nt":
                    download_mapping_files(executables_list, global_bar)
                else:
                    print("No executables needed for non-Windows systems.")
    else:
        print("No files to download.")
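A minimal invocation sketch for the pipeline above; models=True pulls the pitch predictors plus the ContentVec embedder, and exe only matters on Windows:

from rvc.lib.tools.prerequisites_download import prerequisites_download_pipeline

# Download rmvpe.pt, fcpe.pt and the ContentVec embedder; skip the Windows-only ffmpeg binaries.
prerequisites_download_pipeline(models=True, exe=False)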
rvc/lib/tools/pretrained_selector.py
ADDED
@@ -0,0 +1,63 @@
def pretrained_selector(pitch_guidance):
    if pitch_guidance == True:
        return {
            "v1": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v1/f0G32k.pth",
                    "rvc/models/pretraineds/pretrained_v1/f0D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v1/f0G40k.pth",
                    "rvc/models/pretraineds/pretrained_v1/f0D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v1/f0G48k.pth",
                    "rvc/models/pretraineds/pretrained_v1/f0D48k.pth",
                ),
            },
            "v2": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v2/f0G32k.pth",
                    "rvc/models/pretraineds/pretrained_v2/f0D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v2/f0G40k.pth",
                    "rvc/models/pretraineds/pretrained_v2/f0D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v2/f0G48k.pth",
                    "rvc/models/pretraineds/pretrained_v2/f0D48k.pth",
                ),
            },
        }
    elif pitch_guidance == False:
        return {
            "v1": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v1/G32k.pth",
                    "rvc/models/pretraineds/pretrained_v1/D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v1/G40k.pth",
                    "rvc/models/pretraineds/pretrained_v1/D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v1/G48k.pth",
                    "rvc/models/pretraineds/pretrained_v1/D48k.pth",
                ),
            },
            "v2": {
                32000: (
                    "rvc/models/pretraineds/pretrained_v2/G32k.pth",
                    "rvc/models/pretraineds/pretrained_v2/D32k.pth",
                ),
                40000: (
                    "rvc/models/pretraineds/pretrained_v2/G40k.pth",
                    "rvc/models/pretraineds/pretrained_v2/D40k.pth",
                ),
                48000: (
                    "rvc/models/pretraineds/pretrained_v2/G48k.pth",
                    "rvc/models/pretraineds/pretrained_v2/D48k.pth",
                ),
            },
        }
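The selector is just a nested lookup table keyed by model version and sample rate; for example, picking the v2 40 kHz pair for a pitch-guided model:

from rvc.lib.tools.pretrained_selector import pretrained_selector

# v2 pretrained generator/discriminator at 40 kHz, trained with pitch guidance (f0).
generator_path, discriminator_path = pretrained_selector(True)["v2"][40000]
print(generator_path, discriminator_path)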
rvc/lib/tools/split_audio.py
ADDED
@@ -0,0 +1,56 @@
import numpy as np
import librosa


def process_audio(audio, sr=16000, silence_thresh=-60, min_silence_len=250):
    """
    Splits an audio signal into segments using a fixed frame size and hop size.

    Parameters:
    - audio (np.ndarray): The audio signal to split.
    - sr (int): The sample rate of the input audio (default is 16000).
    - silence_thresh (int): Silence threshold (default =-60dB)
    - min_silence_len (int): Minimum silence duration (default 250ms).

    Returns:
    - list of np.ndarray: A list of audio segments.
    - np.ndarray: The intervals where the audio was split.
    """
    frame_length = int(min_silence_len / 1000 * sr)
    hop_length = frame_length // 2
    intervals = librosa.effects.split(
        audio, top_db=-silence_thresh, frame_length=frame_length, hop_length=hop_length
    )
    audio_segments = [audio[start:end] for start, end in intervals]

    return audio_segments, intervals


def merge_audio(audio_segments, intervals, sr_orig, sr_new):
    """
    Merges audio segments back into a single audio signal, filling gaps with silence.

    Parameters:
    - audio_segments (list of np.ndarray): The non-silent audio segments.
    - intervals (np.ndarray): The intervals used for splitting the original audio.
    - sr_orig (int): The sample rate of the original audio
    - sr_new (int): The sample rate of the model

    Returns:
    - np.ndarray: The merged audio signal with silent gaps restored.
    """
    sr_ratio = sr_new / sr_orig if sr_new > sr_orig else 1.0

    merged_audio = np.zeros(
        int(intervals[0][0] * sr_ratio if intervals[0][0] > 0 else 0),
        dtype=audio_segments[0].dtype,
    )

    merged_audio = np.concatenate((merged_audio, audio_segments[0]))

    for i in range(1, len(intervals)):
        silence_duration = int((intervals[i][0] - intervals[i - 1][1]) * sr_ratio)
        silence = np.zeros(silence_duration, dtype=audio_segments[0].dtype)
        merged_audio = np.concatenate((merged_audio, silence, audio_segments[i]))

    return merged_audio
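A small round-trip sketch of the two helpers above, using a synthetic signal with a silent gap (same sample rate in and out, so no stretching is applied):

import numpy as np
from rvc.lib.tools.split_audio import process_audio, merge_audio

sr = 16000
# One second of quiet noise, a one-second silent gap, then another second of noise.
audio = np.concatenate([
    0.1 * np.random.randn(sr).astype(np.float32),
    np.zeros(sr, dtype=np.float32),
    0.1 * np.random.randn(sr).astype(np.float32),
])

segments, intervals = process_audio(audio, sr=sr)
# Re-insert silence between the detected segments.
restored = merge_audio(segments, intervals, sr_orig=sr, sr_new=sr)
print(len(segments), restored.shape)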
rvc/lib/tools/tts.py
ADDED
@@ -0,0 +1,29 @@
import sys
import asyncio
import edge_tts
import os


async def main():
    # Parse command line arguments
    tts_file = str(sys.argv[1])
    text = str(sys.argv[2])
    voice = str(sys.argv[3])
    rate = int(sys.argv[4])
    output_file = str(sys.argv[5])

    rates = f"+{rate}%" if rate >= 0 else f"{rate}%"
    if tts_file and os.path.exists(tts_file):
        text = ""
        try:
            with open(tts_file, "r", encoding="utf-8") as file:
                text = file.read()
        except UnicodeDecodeError:
            with open(tts_file, "r") as file:
                text = file.read()
    await edge_tts.Communicate(text, voice, rate=rates).save(output_file)
    print(f"TTS with {voice} completed. Output TTS file: '{output_file}'")


if __name__ == "__main__":
    asyncio.run(main())
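The script is driven positionally (tts_file, text, voice, rate, output path), which is how the repo's CLI wrapper calls it via subprocess. A hedged example invocation; the voice name is just one edge-tts stock voice and the output path is a placeholder:

import os
import subprocess
import sys

# Positional arguments, in the order tts.py reads them from sys.argv.
subprocess.run([
    sys.executable, os.path.join("rvc", "lib", "tools", "tts.py"),
    "",                         # no text file, so the text argument below is used
    "Hello from edge-tts.",     # text
    "en-US-AriaNeural",         # voice (any ShortName from tts_voices.json)
    "0",                        # rate offset in percent
    "tts_output.wav",           # output path (placeholder)
])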
rvc/lib/tools/tts_voices.json
ADDED
The diff for this file is too large to render.
rvc/lib/utils.py
ADDED
@@ -0,0 +1,137 @@
import os, sys
import librosa
import soundfile as sf
import numpy as np
import re
import unicodedata
import wget
from pydub import AudioSegment
from torch import nn

import logging
from transformers import HubertModel
import warnings

# Remove this to see warnings about transformers models
warnings.filterwarnings("ignore")

logging.getLogger("fairseq").setLevel(logging.ERROR)
logging.getLogger("faiss.loader").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

now_dir = os.getcwd()
sys.path.append(now_dir)

base_path = os.path.join(now_dir, "rvc", "models", "formant", "stftpitchshift")
stft = base_path + ".exe" if sys.platform == "win32" else base_path


class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)


def load_audio(file, sample_rate):
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")

    return audio.flatten()


def load_audio_infer(
    file,
    sample_rate,
    **kwargs,
):
    formant_shifting = kwargs.get("formant_shifting", False)
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not os.path.isfile(file):
            raise FileNotFoundError(f"File not found: {file}")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
        if formant_shifting:
            formant_qfrency = kwargs.get("formant_qfrency", 0.8)
            formant_timbre = kwargs.get("formant_timbre", 0.8)

            from stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            audio = pitchshifter.shiftpitch(
                audio,
                factors=1,
                quefrency=formant_qfrency * 1e-3,
                distortion=formant_timbre,
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")
    return np.array(audio).flatten()


def format_title(title):
    formatted_title = (
        unicodedata.normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
    )
    formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
    formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title)
    formatted_title = re.sub(r"\s+", "_", formatted_title)
    return formatted_title


def load_embedding(embedder_model, custom_embedder=None):
    embedder_root = os.path.join(now_dir, "rvc", "models", "embedders")
    embedding_list = {
        "contentvec": os.path.join(embedder_root, "contentvec"),
        "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
        "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
        "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
    }

    online_embedders = {
        "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/pytorch_model.bin",
        "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin",
        "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin",
        "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin",
    }

    config_files = {
        "contentvec": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/contentvec/config.json",
        "chinese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/chinese_hubert_base/config.json",
        "japanese-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/japanese_hubert_base/config.json",
        "korean-hubert-base": "https://huggingface.co/IAHispano/Applio/resolve/main/Resources/embedders/korean_hubert_base/config.json",
    }

    if embedder_model == "custom":
        if os.path.exists(custom_embedder):
            model_path = custom_embedder
        else:
            print(f"Custom embedder not found: {custom_embedder}, using contentvec")
            model_path = embedding_list["contentvec"]
    else:
        model_path = embedding_list[embedder_model]
        bin_file = os.path.join(model_path, "pytorch_model.bin")
        json_file = os.path.join(model_path, "config.json")
        os.makedirs(model_path, exist_ok=True)
        if not os.path.exists(bin_file):
            url = online_embedders[embedder_model]
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=bin_file)
        if not os.path.exists(json_file):
            url = config_files[embedder_model]
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=json_file)

    models = HubertModelWithFinalProj.from_pretrained(model_path)
    return models
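A short usage sketch for two of the helpers above; the input path is a placeholder, and importing the module pulls in its heavy dependencies (torch, transformers):

from rvc.lib.utils import format_title, load_audio

print(format_title("My Model (v2)!"))           # punctuation stripped, spaces become underscores
audio = load_audio("path/to/input.wav", 16000)  # placeholder path; returns a mono float array at 16 kHz
print(audio.shape)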
rvc/lib/zluda.py
ADDED
@@ -0,0 +1,43 @@
import torch

if torch.cuda.is_available() and torch.cuda.get_device_name().endswith("[ZLUDA]"):
    _torch_stft = torch.stft

    def z_stft(
        audio: torch.Tensor,
        n_fft: int,
        hop_length: int = None,
        win_length: int = None,
        window: torch.Tensor = None,
        center: bool = True,
        pad_mode: str = "reflect",
        normalized: bool = False,
        onesided: bool = None,
        return_complex: bool = None,
    ):
        sd = audio.device
        return _torch_stft(
            audio.to("cpu"),
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window.to("cpu"),
            center=center,
            pad_mode=pad_mode,
            normalized=normalized,
            onesided=onesided,
            return_complex=return_complex,
        ).to(sd)

    def z_jit(f, *_, **__):
        f.graph = torch._C.Graph()
        return f

    # hijacks
    torch.stft = z_stft
    torch.jit.script = z_jit
    # disabling unsupported cudnn
    torch.backends.cudnn.enabled = False
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
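The module patches torch at import time when a ZLUDA device is detected, so it only needs to be imported once, before any STFT or TorchScript use; a minimal sketch:

import rvc.lib.zluda  # noqa: F401  - side-effect import; nothing is referenced directly
import torch          # torch.stft / torch.jit.script are now the patched versions on ZLUDA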
scrpt.py
ADDED
@@ -0,0 +1,1897 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import json
|
4 |
+
import argparse
|
5 |
+
import subprocess
|
6 |
+
from functools import lru_cache
|
7 |
+
from distutils.util import strtobool
|
8 |
+
|
9 |
+
now_dir = os.getcwd()
|
10 |
+
sys.path.append(now_dir)
|
11 |
+
|
12 |
+
current_script_directory = os.path.dirname(os.path.realpath(__file__))
|
13 |
+
logs_path = os.path.join(current_script_directory, "logs")
|
14 |
+
|
15 |
+
from rvc.lib.tools.analyzer import analyze_audio
|
16 |
+
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
|
17 |
+
from rvc.lib.tools.model_download import model_download_pipeline
|
18 |
+
|
19 |
+
python = sys.executable
|
20 |
+
|
21 |
+
|
22 |
+
# Get TTS Voices -> https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=6A5AA1D4EAFF4E9FB37E23D68491D6F4
|
23 |
+
@lru_cache(maxsize=1) # Cache only one result since the file is static
|
24 |
+
def load_voices_data():
|
25 |
+
with open(
|
26 |
+
os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8"
|
27 |
+
) as file:
|
28 |
+
return json.load(file)
|
29 |
+
|
30 |
+
|
31 |
+
voices_data = load_voices_data()
|
32 |
+
locales = list({voice["ShortName"] for voice in voices_data})
|
33 |
+
|
34 |
+
|
35 |
+
@lru_cache(maxsize=None)
|
36 |
+
def import_voice_converter():
|
37 |
+
from rvc.infer.infer import VoiceConverter
|
38 |
+
|
39 |
+
return VoiceConverter()
|
40 |
+
|
41 |
+
|
42 |
+
@lru_cache(maxsize=1)
|
43 |
+
def get_config():
|
44 |
+
from rvc.configs.config import Config
|
45 |
+
|
46 |
+
return Config()
|
47 |
+
|
48 |
+
|
49 |
+
# Infer
|
50 |
+
def run_infer_script(
|
51 |
+
pitch: int,
|
52 |
+
filter_radius: int,
|
53 |
+
index_rate: float,
|
54 |
+
volume_envelope: int,
|
55 |
+
protect: float,
|
56 |
+
hop_length: int,
|
57 |
+
f0_method: str,
|
58 |
+
input_path: str,
|
59 |
+
output_path: str,
|
60 |
+
pth_path: str,
|
61 |
+
index_path: str,
|
62 |
+
split_audio: bool,
|
63 |
+
f0_autotune: bool,
|
64 |
+
f0_autotune_strength: float,
|
65 |
+
clean_audio: bool,
|
66 |
+
clean_strength: float,
|
67 |
+
export_format: str,
|
68 |
+
f0_file: str,
|
69 |
+
embedder_model: str,
|
70 |
+
embedder_model_custom: str = None,
|
71 |
+
formant_shifting: bool = False,
|
72 |
+
formant_qfrency: float = 1.0,
|
73 |
+
formant_timbre: float = 1.0,
|
74 |
+
post_process: bool = False,
|
75 |
+
reverb: bool = False,
|
76 |
+
pitch_shift: bool = False,
|
77 |
+
limiter: bool = False,
|
78 |
+
gain: bool = False,
|
79 |
+
distortion: bool = False,
|
80 |
+
chorus: bool = False,
|
81 |
+
bitcrush: bool = False,
|
82 |
+
clipping: bool = False,
|
83 |
+
compressor: bool = False,
|
84 |
+
delay: bool = False,
|
85 |
+
reverb_room_size: float = 0.5,
|
86 |
+
reverb_damping: float = 0.5,
|
87 |
+
reverb_wet_gain: float = 0.5,
|
88 |
+
reverb_dry_gain: float = 0.5,
|
89 |
+
reverb_width: float = 0.5,
|
90 |
+
reverb_freeze_mode: float = 0.5,
|
91 |
+
pitch_shift_semitones: float = 0.0,
|
92 |
+
limiter_threshold: float = -6,
|
93 |
+
limiter_release_time: float = 0.01,
|
94 |
+
gain_db: float = 0.0,
|
95 |
+
distortion_gain: float = 25,
|
96 |
+
chorus_rate: float = 1.0,
|
97 |
+
chorus_depth: float = 0.25,
|
98 |
+
chorus_center_delay: float = 7,
|
99 |
+
chorus_feedback: float = 0.0,
|
100 |
+
chorus_mix: float = 0.5,
|
101 |
+
bitcrush_bit_depth: int = 8,
|
102 |
+
clipping_threshold: float = -6,
|
103 |
+
compressor_threshold: float = 0,
|
104 |
+
compressor_ratio: float = 1,
|
105 |
+
compressor_attack: float = 1.0,
|
106 |
+
compressor_release: float = 100,
|
107 |
+
delay_seconds: float = 0.5,
|
108 |
+
delay_feedback: float = 0.0,
|
109 |
+
delay_mix: float = 0.5,
|
110 |
+
sid: int = 0,
|
111 |
+
):
|
112 |
+
kwargs = {
|
113 |
+
"audio_input_path": input_path,
|
114 |
+
"audio_output_path": output_path,
|
115 |
+
"model_path": pth_path,
|
116 |
+
"index_path": index_path,
|
117 |
+
"pitch": pitch,
|
118 |
+
"filter_radius": filter_radius,
|
119 |
+
"index_rate": index_rate,
|
120 |
+
"volume_envelope": volume_envelope,
|
121 |
+
"protect": protect,
|
122 |
+
"hop_length": hop_length,
|
123 |
+
"f0_method": f0_method,
|
124 |
+
"pth_path": pth_path,
|
125 |
+
"index_path": index_path,
|
126 |
+
"split_audio": split_audio,
|
127 |
+
"f0_autotune": f0_autotune,
|
128 |
+
"f0_autotune_strength": f0_autotune_strength,
|
129 |
+
"clean_audio": clean_audio,
|
130 |
+
"clean_strength": clean_strength,
|
131 |
+
"export_format": export_format,
|
132 |
+
"f0_file": f0_file,
|
133 |
+
"embedder_model": embedder_model,
|
134 |
+
"embedder_model_custom": embedder_model_custom,
|
135 |
+
"post_process": post_process,
|
136 |
+
"formant_shifting": formant_shifting,
|
137 |
+
"formant_qfrency": formant_qfrency,
|
138 |
+
"formant_timbre": formant_timbre,
|
139 |
+
"reverb": reverb,
|
140 |
+
"pitch_shift": pitch_shift,
|
141 |
+
"limiter": limiter,
|
142 |
+
"gain": gain,
|
143 |
+
"distortion": distortion,
|
144 |
+
"chorus": chorus,
|
145 |
+
"bitcrush": bitcrush,
|
146 |
+
"clipping": clipping,
|
147 |
+
"compressor": compressor,
|
148 |
+
"delay": delay,
|
149 |
+
"reverb_room_size": reverb_room_size,
|
150 |
+
"reverb_damping": reverb_damping,
|
151 |
+
"reverb_wet_level": reverb_wet_gain,
|
152 |
+
"reverb_dry_level": reverb_dry_gain,
|
153 |
+
"reverb_width": reverb_width,
|
154 |
+
"reverb_freeze_mode": reverb_freeze_mode,
|
155 |
+
"pitch_shift_semitones": pitch_shift_semitones,
|
156 |
+
"limiter_threshold": limiter_threshold,
|
157 |
+
"limiter_release": limiter_release_time,
|
158 |
+
"gain_db": gain_db,
|
159 |
+
"distortion_gain": distortion_gain,
|
160 |
+
"chorus_rate": chorus_rate,
|
161 |
+
"chorus_depth": chorus_depth,
|
162 |
+
"chorus_delay": chorus_center_delay,
|
163 |
+
"chorus_feedback": chorus_feedback,
|
164 |
+
"chorus_mix": chorus_mix,
|
165 |
+
"bitcrush_bit_depth": bitcrush_bit_depth,
|
166 |
+
"clipping_threshold": clipping_threshold,
|
167 |
+
"compressor_threshold": compressor_threshold,
|
168 |
+
"compressor_ratio": compressor_ratio,
|
169 |
+
"compressor_attack": compressor_attack,
|
170 |
+
"compressor_release": compressor_release,
|
171 |
+
"delay_seconds": delay_seconds,
|
172 |
+
"delay_feedback": delay_feedback,
|
173 |
+
"delay_mix": delay_mix,
|
174 |
+
"sid": sid,
|
175 |
+
}
|
176 |
+
infer_pipeline = import_voice_converter()
|
177 |
+
infer_pipeline.convert_audio(
|
178 |
+
**kwargs,
|
179 |
+
)
|
180 |
+
return f"File {input_path} inferred successfully.", output_path.replace(
|
181 |
+
".wav", f".{export_format.lower()}"
|
182 |
+
)
|
183 |
+
|
184 |
+
|
185 |
+
# Batch infer
|
186 |
+
def run_batch_infer_script(
|
187 |
+
pitch: int,
|
188 |
+
filter_radius: int,
|
189 |
+
index_rate: float,
|
190 |
+
volume_envelope: int,
|
191 |
+
protect: float,
|
192 |
+
hop_length: int,
|
193 |
+
f0_method: str,
|
194 |
+
input_folder: str,
|
195 |
+
output_folder: str,
|
196 |
+
pth_path: str,
|
197 |
+
index_path: str,
|
198 |
+
split_audio: bool,
|
199 |
+
f0_autotune: bool,
|
200 |
+
f0_autotune_strength: float,
|
201 |
+
clean_audio: bool,
|
202 |
+
clean_strength: float,
|
203 |
+
export_format: str,
|
204 |
+
f0_file: str,
|
205 |
+
embedder_model: str,
|
206 |
+
embedder_model_custom: str = None,
|
207 |
+
formant_shifting: bool = False,
|
208 |
+
formant_qfrency: float = 1.0,
|
209 |
+
formant_timbre: float = 1.0,
|
210 |
+
post_process: bool = False,
|
211 |
+
reverb: bool = False,
|
212 |
+
pitch_shift: bool = False,
|
213 |
+
limiter: bool = False,
|
214 |
+
gain: bool = False,
|
215 |
+
distortion: bool = False,
|
216 |
+
chorus: bool = False,
|
217 |
+
bitcrush: bool = False,
|
218 |
+
clipping: bool = False,
|
219 |
+
compressor: bool = False,
|
220 |
+
delay: bool = False,
|
221 |
+
reverb_room_size: float = 0.5,
|
222 |
+
reverb_damping: float = 0.5,
|
223 |
+
reverb_wet_gain: float = 0.5,
|
224 |
+
reverb_dry_gain: float = 0.5,
|
225 |
+
reverb_width: float = 0.5,
|
226 |
+
reverb_freeze_mode: float = 0.5,
|
227 |
+
pitch_shift_semitones: float = 0.0,
|
228 |
+
limiter_threshold: float = -6,
|
229 |
+
limiter_release_time: float = 0.01,
|
230 |
+
gain_db: float = 0.0,
|
231 |
+
distortion_gain: float = 25,
|
232 |
+
chorus_rate: float = 1.0,
|
233 |
+
chorus_depth: float = 0.25,
|
234 |
+
chorus_center_delay: float = 7,
|
235 |
+
chorus_feedback: float = 0.0,
|
236 |
+
chorus_mix: float = 0.5,
|
237 |
+
bitcrush_bit_depth: int = 8,
|
238 |
+
clipping_threshold: float = -6,
|
239 |
+
compressor_threshold: float = 0,
|
240 |
+
compressor_ratio: float = 1,
|
241 |
+
compressor_attack: float = 1.0,
|
242 |
+
compressor_release: float = 100,
|
243 |
+
delay_seconds: float = 0.5,
|
244 |
+
delay_feedback: float = 0.0,
|
245 |
+
delay_mix: float = 0.5,
|
246 |
+
sid: int = 0,
|
247 |
+
):
|
248 |
+
kwargs = {
|
249 |
+
"audio_input_paths": input_folder,
|
250 |
+
"audio_output_path": output_folder,
|
251 |
+
"model_path": pth_path,
|
252 |
+
"index_path": index_path,
|
253 |
+
"pitch": pitch,
|
254 |
+
"filter_radius": filter_radius,
|
255 |
+
"index_rate": index_rate,
|
256 |
+
"volume_envelope": volume_envelope,
|
257 |
+
"protect": protect,
|
258 |
+
"hop_length": hop_length,
|
259 |
+
"f0_method": f0_method,
|
260 |
+
"pth_path": pth_path,
|
261 |
+
"index_path": index_path,
|
262 |
+
"split_audio": split_audio,
|
263 |
+
"f0_autotune": f0_autotune,
|
264 |
+
"f0_autotune_strength": f0_autotune_strength,
|
265 |
+
"clean_audio": clean_audio,
|
266 |
+
"clean_strength": clean_strength,
|
267 |
+
"export_format": export_format,
|
268 |
+
"f0_file": f0_file,
|
269 |
+
"embedder_model": embedder_model,
|
270 |
+
"embedder_model_custom": embedder_model_custom,
|
271 |
+
"post_process": post_process,
|
272 |
+
"formant_shifting": formant_shifting,
|
273 |
+
"formant_qfrency": formant_qfrency,
|
274 |
+
"formant_timbre": formant_timbre,
|
275 |
+
"reverb": reverb,
|
276 |
+
"pitch_shift": pitch_shift,
|
277 |
+
"limiter": limiter,
|
278 |
+
"gain": gain,
|
279 |
+
"distortion": distortion,
|
280 |
+
"chorus": chorus,
|
281 |
+
"bitcrush": bitcrush,
|
282 |
+
"clipping": clipping,
|
283 |
+
"compressor": compressor,
|
284 |
+
"delay": delay,
|
285 |
+
"reverb_room_size": reverb_room_size,
|
286 |
+
"reverb_damping": reverb_damping,
|
287 |
+
"reverb_wet_level": reverb_wet_gain,
|
288 |
+
"reverb_dry_level": reverb_dry_gain,
|
289 |
+
"reverb_width": reverb_width,
|
290 |
+
"reverb_freeze_mode": reverb_freeze_mode,
|
291 |
+
"pitch_shift_semitones": pitch_shift_semitones,
|
292 |
+
"limiter_threshold": limiter_threshold,
|
293 |
+
"limiter_release": limiter_release_time,
|
294 |
+
"gain_db": gain_db,
|
295 |
+
"distortion_gain": distortion_gain,
|
296 |
+
"chorus_rate": chorus_rate,
|
297 |
+
"chorus_depth": chorus_depth,
|
298 |
+
"chorus_delay": chorus_center_delay,
|
299 |
+
"chorus_feedback": chorus_feedback,
|
300 |
+
"chorus_mix": chorus_mix,
|
301 |
+
"bitcrush_bit_depth": bitcrush_bit_depth,
|
302 |
+
"clipping_threshold": clipping_threshold,
|
303 |
+
"compressor_threshold": compressor_threshold,
|
304 |
+
"compressor_ratio": compressor_ratio,
|
305 |
+
"compressor_attack": compressor_attack,
|
306 |
+
"compressor_release": compressor_release,
|
307 |
+
"delay_seconds": delay_seconds,
|
308 |
+
"delay_feedback": delay_feedback,
|
309 |
+
"delay_mix": delay_mix,
|
310 |
+
"sid": sid,
|
311 |
+
}
|
312 |
+
infer_pipeline = import_voice_converter()
|
313 |
+
infer_pipeline.convert_audio_batch(
|
314 |
+
**kwargs,
|
315 |
+
)
|
316 |
+
|
317 |
+
return f"Files from {input_folder} inferred successfully."
|
318 |
+
|
319 |
+
|
320 |
+
# TTS
|
321 |
+
def run_tts_script(
|
322 |
+
tts_file: str,
|
323 |
+
tts_text: str,
|
324 |
+
tts_voice: str,
|
325 |
+
tts_rate: int,
|
326 |
+
pitch: int,
|
327 |
+
filter_radius: int,
|
328 |
+
index_rate: float,
|
329 |
+
volume_envelope: int,
|
330 |
+
protect: float,
|
331 |
+
hop_length: int,
|
332 |
+
f0_method: str,
|
333 |
+
output_tts_path: str,
|
334 |
+
output_rvc_path: str,
|
335 |
+
pth_path: str,
|
336 |
+
index_path: str,
|
337 |
+
split_audio: bool,
|
338 |
+
f0_autotune: bool,
|
339 |
+
f0_autotune_strength: float,
|
340 |
+
clean_audio: bool,
|
341 |
+
clean_strength: float,
|
342 |
+
export_format: str,
|
343 |
+
f0_file: str,
|
344 |
+
embedder_model: str,
|
345 |
+
embedder_model_custom: str = None,
|
346 |
+
sid: int = 0,
|
347 |
+
):
|
348 |
+
|
349 |
+
tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
|
350 |
+
|
351 |
+
if os.path.exists(output_tts_path):
|
352 |
+
os.remove(output_tts_path)
|
353 |
+
|
354 |
+
command_tts = [
|
355 |
+
*map(
|
356 |
+
str,
|
357 |
+
[
|
358 |
+
python,
|
359 |
+
tts_script_path,
|
360 |
+
tts_file,
|
361 |
+
tts_text,
|
362 |
+
tts_voice,
|
363 |
+
tts_rate,
|
364 |
+
output_tts_path,
|
365 |
+
],
|
366 |
+
),
|
367 |
+
]
|
368 |
+
subprocess.run(command_tts)
|
369 |
+
infer_pipeline = import_voice_converter()
|
370 |
+
infer_pipeline.convert_audio(
|
371 |
+
pitch=pitch,
|
372 |
+
filter_radius=filter_radius,
|
373 |
+
index_rate=index_rate,
|
374 |
+
volume_envelope=volume_envelope,
|
375 |
+
protect=protect,
|
376 |
+
hop_length=hop_length,
|
377 |
+
f0_method=f0_method,
|
378 |
+
audio_input_path=output_tts_path,
|
379 |
+
audio_output_path=output_rvc_path,
|
380 |
+
model_path=pth_path,
|
381 |
+
index_path=index_path,
|
382 |
+
split_audio=split_audio,
|
383 |
+
f0_autotune=f0_autotune,
|
384 |
+
f0_autotune_strength=f0_autotune_strength,
|
385 |
+
clean_audio=clean_audio,
|
386 |
+
clean_strength=clean_strength,
|
387 |
+
export_format=export_format,
|
388 |
+
f0_file=f0_file,
|
389 |
+
embedder_model=embedder_model,
|
390 |
+
embedder_model_custom=embedder_model_custom,
|
391 |
+
sid=sid,
|
392 |
+
formant_shifting=None,
|
393 |
+
formant_qfrency=None,
|
394 |
+
formant_timbre=None,
|
395 |
+
post_process=None,
|
396 |
+
reverb=None,
|
397 |
+
pitch_shift=None,
|
398 |
+
limiter=None,
|
399 |
+
gain=None,
|
400 |
+
distortion=None,
|
401 |
+
chorus=None,
|
402 |
+
bitcrush=None,
|
403 |
+
clipping=None,
|
404 |
+
compressor=None,
|
405 |
+
delay=None,
|
406 |
+
sliders=None,
|
407 |
+
)
|
408 |
+
|
409 |
+
return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
|
410 |
+
".wav", f".{export_format.lower()}"
|
411 |
+
)
|
412 |
+
|
413 |
+
|
414 |
+
# Model information
|
415 |
+
def run_model_information_script(pth_path: str):
|
416 |
+
print(model_information(pth_path))
|
417 |
+
return model_information(pth_path)
|
418 |
+
|
419 |
+
|
420 |
+
# Model blender
|
421 |
+
def run_model_blender_script(
|
422 |
+
model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
|
423 |
+
):
|
424 |
+
message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
|
425 |
+
return message, model_blended
|
426 |
+
|
427 |
+
|
428 |
+
# Tensorboard
|
429 |
+
def run_tensorboard_script():
|
430 |
+
launch_tensorboard_pipeline()
|
431 |
+
|
432 |
+
|
433 |
+
# Download
|
434 |
+
def run_download_script(model_link: str):
|
435 |
+
model_download_pipeline(model_link)
|
436 |
+
return f"Model downloaded successfully."
|
437 |
+
|
438 |
+
|
439 |
+
# Audio analyzer
|
440 |
+
def run_audio_analyzer_script(
|
441 |
+
input_path: str, save_plot_path: str = "logs/audio_analysis.png"
|
442 |
+
):
|
443 |
+
audio_info, plot_path = analyze_audio(input_path, save_plot_path)
|
444 |
+
print(
|
445 |
+
f"Audio info of {input_path}: {audio_info}",
|
446 |
+
f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
|
447 |
+
)
|
448 |
+
return audio_info, plot_path
|
449 |
+
|
450 |
+
|
451 |
+
# Parse arguments
|
452 |
+
def parse_arguments():
|
453 |
+
parser = argparse.ArgumentParser(
|
454 |
+
description="Run the main.py script with specific parameters."
|
455 |
+
)
|
456 |
+
subparsers = parser.add_subparsers(
|
457 |
+
title="subcommands", dest="mode", help="Choose a mode"
|
458 |
+
)
|
459 |
+
|
460 |
+
# Parser for 'infer' mode
|
461 |
+
infer_parser = subparsers.add_parser("infer", help="Run inference")
|
462 |
+
pitch_description = (
|
463 |
+
"Set the pitch of the audio. Higher values result in a higher pitch."
|
464 |
+
)
|
465 |
+
infer_parser.add_argument(
|
466 |
+
"--pitch",
|
467 |
+
type=int,
|
468 |
+
help=pitch_description,
|
469 |
+
choices=range(-24, 25),
|
470 |
+
default=0,
|
471 |
+
)
|
472 |
+
filter_radius_description = "Apply median filtering to the extracted pitch values if this value is greater than or equal to three. This can help reduce breathiness in the output audio."
|
473 |
+
infer_parser.add_argument(
|
474 |
+
"--filter_radius",
|
475 |
+
type=int,
|
476 |
+
help=filter_radius_description,
|
477 |
+
choices=range(11),
|
478 |
+
default=3,
|
479 |
+
)
|
480 |
+
index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
|
481 |
+
infer_parser.add_argument(
|
482 |
+
"--index_rate",
|
483 |
+
type=float,
|
484 |
+
help=index_rate_description,
|
485 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
486 |
+
default=0.3,
|
487 |
+
)
|
488 |
+
volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
|
489 |
+
infer_parser.add_argument(
|
490 |
+
"--volume_envelope",
|
491 |
+
type=float,
|
492 |
+
help=volume_envelope_description,
|
493 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
494 |
+
default=1,
|
495 |
+
)
|
496 |
+
protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
|
497 |
+
infer_parser.add_argument(
|
498 |
+
"--protect",
|
499 |
+
type=float,
|
500 |
+
help=protect_description,
|
501 |
+
choices=[i / 1000.0 for i in range(0, 501)],
|
502 |
+
default=0.33,
|
503 |
+
)
|
504 |
+
hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
|
505 |
+
infer_parser.add_argument(
|
506 |
+
"--hop_length",
|
507 |
+
type=int,
|
508 |
+
help=hop_length_description,
|
509 |
+
choices=range(1, 513),
|
510 |
+
default=128,
|
511 |
+
)
|
512 |
+
f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
|
513 |
+
infer_parser.add_argument(
|
514 |
+
"--f0_method",
|
515 |
+
type=str,
|
516 |
+
help=f0_method_description,
|
517 |
+
choices=[
|
518 |
+
"crepe",
|
519 |
+
"crepe-tiny",
|
520 |
+
"rmvpe",
|
521 |
+
"fcpe",
|
522 |
+
"hybrid[crepe+rmvpe]",
|
523 |
+
"hybrid[crepe+fcpe]",
|
524 |
+
"hybrid[rmvpe+fcpe]",
|
525 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
526 |
+
],
|
527 |
+
default="rmvpe",
|
528 |
+
)
|
529 |
+
infer_parser.add_argument(
|
530 |
+
"--input_path",
|
531 |
+
type=str,
|
532 |
+
help="Full path to the input audio file.",
|
533 |
+
required=True,
|
534 |
+
)
|
535 |
+
infer_parser.add_argument(
|
536 |
+
"--output_path",
|
537 |
+
type=str,
|
538 |
+
help="Full path to the output audio file.",
|
539 |
+
required=True,
|
540 |
+
)
|
541 |
+
pth_path_description = "Full path to the RVC model file (.pth)."
|
542 |
+
infer_parser.add_argument(
|
543 |
+
"--pth_path", type=str, help=pth_path_description, required=True
|
544 |
+
)
|
545 |
+
index_path_description = "Full path to the index file (.index)."
|
546 |
+
infer_parser.add_argument(
|
547 |
+
"--index_path", type=str, help=index_path_description, required=True
|
548 |
+
)
|
549 |
+
split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
|
550 |
+
infer_parser.add_argument(
|
551 |
+
"--split_audio",
|
552 |
+
type=lambda x: bool(strtobool(x)),
|
553 |
+
choices=[True, False],
|
554 |
+
help=split_audio_description,
|
555 |
+
default=False,
|
556 |
+
)
|
557 |
+
f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
|
558 |
+
infer_parser.add_argument(
|
559 |
+
"--f0_autotune",
|
560 |
+
type=lambda x: bool(strtobool(x)),
|
561 |
+
choices=[True, False],
|
562 |
+
help=f0_autotune_description,
|
563 |
+
default=False,
|
564 |
+
)
|
565 |
+
f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
|
566 |
+
infer_parser.add_argument(
|
567 |
+
"--f0_autotune_strength",
|
568 |
+
type=float,
|
569 |
+
help=f0_autotune_strength_description,
|
570 |
+
choices=[(i / 10) for i in range(11)],
|
571 |
+
default=1.0,
|
572 |
+
)
|
573 |
+
clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
|
574 |
+
infer_parser.add_argument(
|
575 |
+
"--clean_audio",
|
576 |
+
type=lambda x: bool(strtobool(x)),
|
577 |
+
choices=[True, False],
|
578 |
+
help=clean_audio_description,
|
579 |
+
default=False,
|
580 |
+
)
|
581 |
+
clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
|
582 |
+
infer_parser.add_argument(
|
583 |
+
"--clean_strength",
|
584 |
+
type=float,
|
585 |
+
help=clean_strength_description,
|
586 |
+
choices=[(i / 10) for i in range(11)],
|
587 |
+
default=0.7,
|
588 |
+
)
|
589 |
+
export_format_description = "Select the desired output audio format."
|
590 |
+
infer_parser.add_argument(
|
591 |
+
"--export_format",
|
592 |
+
type=str,
|
593 |
+
help=export_format_description,
|
594 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
595 |
+
default="WAV",
|
596 |
+
)
|
597 |
+
embedder_model_description = (
|
598 |
+
"Choose the model used for generating speaker embeddings."
|
599 |
+
)
|
600 |
+
infer_parser.add_argument(
|
601 |
+
"--embedder_model",
|
602 |
+
type=str,
|
603 |
+
help=embedder_model_description,
|
604 |
+
choices=[
|
605 |
+
"contentvec",
|
606 |
+
"chinese-hubert-base",
|
607 |
+
"japanese-hubert-base",
|
608 |
+
"korean-hubert-base",
|
609 |
+
"custom",
|
610 |
+
],
|
611 |
+
default="contentvec",
|
612 |
+
)
|
613 |
+
embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
|
614 |
+
infer_parser.add_argument(
|
615 |
+
"--embedder_model_custom",
|
616 |
+
type=str,
|
617 |
+
help=embedder_model_custom_description,
|
618 |
+
default=None,
|
619 |
+
)
|
620 |
+
f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
|
621 |
+
infer_parser.add_argument(
|
622 |
+
"--f0_file",
|
623 |
+
type=str,
|
624 |
+
help=f0_file_description,
|
625 |
+
default=None,
|
626 |
+
)
|
627 |
+
formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
|
628 |
+
infer_parser.add_argument(
|
629 |
+
"--formant_shifting",
|
630 |
+
type=lambda x: bool(strtobool(x)),
|
631 |
+
choices=[True, False],
|
632 |
+
help=formant_shifting_description,
|
633 |
+
default=False,
|
634 |
+
required=False,
|
635 |
+
)
|
636 |
+
formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
|
637 |
+
infer_parser.add_argument(
|
638 |
+
"--formant_qfrency",
|
639 |
+
type=float,
|
640 |
+
help=formant_qfrency_description,
|
641 |
+
default=1.0,
|
642 |
+
required=False,
|
643 |
+
)
|
644 |
+
formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
|
645 |
+
infer_parser.add_argument(
|
646 |
+
"--formant_timbre",
|
647 |
+
type=float,
|
648 |
+
help=formant_timbre_description,
|
649 |
+
default=1.0,
|
650 |
+
required=False,
|
651 |
+
)
|
652 |
+
sid_description = "Speaker ID for multi-speaker models."
|
653 |
+
infer_parser.add_argument(
|
654 |
+
"--sid",
|
655 |
+
type=int,
|
656 |
+
help=sid_description,
|
657 |
+
default=0,
|
658 |
+
required=False,
|
659 |
+
)
|
660 |
+
post_process_description = "Apply post-processing effects to the output audio."
|
661 |
+
infer_parser.add_argument(
|
662 |
+
"--post_process",
|
663 |
+
type=lambda x: bool(strtobool(x)),
|
664 |
+
choices=[True, False],
|
665 |
+
help=post_process_description,
|
666 |
+
default=False,
|
667 |
+
required=False,
|
668 |
+
)
|
669 |
+
reverb_description = "Apply reverb effect to the output audio."
|
670 |
+
infer_parser.add_argument(
|
671 |
+
"--reverb",
|
672 |
+
type=lambda x: bool(strtobool(x)),
|
673 |
+
choices=[True, False],
|
674 |
+
help=reverb_description,
|
675 |
+
default=False,
|
676 |
+
required=False,
|
677 |
+
)
|
678 |
+
|
679 |
+
pitch_shift_description = "Apply pitch shifting effect to the output audio."
|
680 |
+
infer_parser.add_argument(
|
681 |
+
"--pitch_shift",
|
682 |
+
type=lambda x: bool(strtobool(x)),
|
683 |
+
choices=[True, False],
|
684 |
+
help=pitch_shift_description,
|
685 |
+
default=False,
|
686 |
+
required=False,
|
687 |
+
)
|
688 |
+
|
689 |
+
limiter_description = "Apply limiter effect to the output audio."
|
690 |
+
infer_parser.add_argument(
|
691 |
+
"--limiter",
|
692 |
+
type=lambda x: bool(strtobool(x)),
|
693 |
+
choices=[True, False],
|
694 |
+
help=limiter_description,
|
695 |
+
default=False,
|
696 |
+
required=False,
|
697 |
+
)
|
698 |
+
|
699 |
+
gain_description = "Apply gain effect to the output audio."
|
700 |
+
infer_parser.add_argument(
|
701 |
+
"--gain",
|
702 |
+
type=lambda x: bool(strtobool(x)),
|
703 |
+
choices=[True, False],
|
704 |
+
help=gain_description,
|
705 |
+
default=False,
|
706 |
+
required=False,
|
707 |
+
)
|
708 |
+
|
709 |
+
distortion_description = "Apply distortion effect to the output audio."
|
710 |
+
infer_parser.add_argument(
|
711 |
+
"--distortion",
|
712 |
+
type=lambda x: bool(strtobool(x)),
|
713 |
+
choices=[True, False],
|
714 |
+
help=distortion_description,
|
715 |
+
default=False,
|
716 |
+
required=False,
|
717 |
+
)
|
718 |
+
|
719 |
+
chorus_description = "Apply chorus effect to the output audio."
|
720 |
+
infer_parser.add_argument(
|
721 |
+
"--chorus",
|
722 |
+
type=lambda x: bool(strtobool(x)),
|
723 |
+
choices=[True, False],
|
724 |
+
help=chorus_description,
|
725 |
+
default=False,
|
726 |
+
required=False,
|
727 |
+
)
|
728 |
+
|
729 |
+
bitcrush_description = "Apply bitcrush effect to the output audio."
|
730 |
+
infer_parser.add_argument(
|
731 |
+
"--bitcrush",
|
732 |
+
type=lambda x: bool(strtobool(x)),
|
733 |
+
choices=[True, False],
|
734 |
+
help=bitcrush_description,
|
735 |
+
default=False,
|
736 |
+
required=False,
|
737 |
+
)
|
738 |
+
|
739 |
+
clipping_description = "Apply clipping effect to the output audio."
|
740 |
+
infer_parser.add_argument(
|
741 |
+
"--clipping",
|
742 |
+
type=lambda x: bool(strtobool(x)),
|
743 |
+
choices=[True, False],
|
744 |
+
help=clipping_description,
|
745 |
+
default=False,
|
746 |
+
required=False,
|
747 |
+
)
|
748 |
+
|
749 |
+
compressor_description = "Apply compressor effect to the output audio."
|
750 |
+
infer_parser.add_argument(
|
751 |
+
"--compressor",
|
752 |
+
type=lambda x: bool(strtobool(x)),
|
753 |
+
choices=[True, False],
|
754 |
+
help=compressor_description,
|
755 |
+
default=False,
|
756 |
+
required=False,
|
757 |
+
)
|
758 |
+
|
759 |
+
delay_description = "Apply delay effect to the output audio."
|
760 |
+
infer_parser.add_argument(
|
761 |
+
"--delay",
|
762 |
+
type=lambda x: bool(strtobool(x)),
|
763 |
+
choices=[True, False],
|
764 |
+
help=delay_description,
|
765 |
+
default=False,
|
766 |
+
required=False,
|
767 |
+
)
|
768 |
+
|
769 |
+
reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
|
770 |
+
infer_parser.add_argument(
|
771 |
+
"--reverb_room_size",
|
772 |
+
type=float,
|
773 |
+
help=reverb_room_size_description,
|
774 |
+
default=0.5,
|
775 |
+
required=False,
|
776 |
+
)
|
777 |
+
|
778 |
+
reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
|
779 |
+
infer_parser.add_argument(
|
780 |
+
"--reverb_damping",
|
781 |
+
type=float,
|
782 |
+
help=reverb_damping_description,
|
783 |
+
default=0.5,
|
784 |
+
required=False,
|
785 |
+
)
|
786 |
+
|
787 |
+
reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
|
788 |
+
infer_parser.add_argument(
|
789 |
+
"--reverb_wet_gain",
|
790 |
+
type=float,
|
791 |
+
help=reverb_wet_gain_description,
|
792 |
+
default=0.5,
|
793 |
+
required=False,
|
794 |
+
)
|
795 |
+
|
796 |
+
reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
|
797 |
+
infer_parser.add_argument(
|
798 |
+
"--reverb_dry_gain",
|
799 |
+
type=float,
|
800 |
+
help=reverb_dry_gain_description,
|
801 |
+
default=0.5,
|
802 |
+
required=False,
|
803 |
+
)
|
804 |
+
|
805 |
+
reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
|
806 |
+
infer_parser.add_argument(
|
807 |
+
"--reverb_width",
|
808 |
+
type=float,
|
809 |
+
help=reverb_width_description,
|
810 |
+
default=0.5,
|
811 |
+
required=False,
|
812 |
+
)
|
813 |
+
|
814 |
+
reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
|
815 |
+
infer_parser.add_argument(
|
816 |
+
"--reverb_freeze_mode",
|
817 |
+
type=float,
|
818 |
+
help=reverb_freeze_mode_description,
|
819 |
+
default=0.5,
|
820 |
+
required=False,
|
821 |
+
)
|
822 |
+
|
823 |
+
pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
|
824 |
+
infer_parser.add_argument(
|
825 |
+
"--pitch_shift_semitones",
|
826 |
+
type=float,
|
827 |
+
help=pitch_shift_semitones_description,
|
828 |
+
default=0.0,
|
829 |
+
required=False,
|
830 |
+
)
|
831 |
+
|
832 |
+
limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
|
833 |
+
infer_parser.add_argument(
|
834 |
+
"--limiter_threshold",
|
835 |
+
type=float,
|
836 |
+
help=limiter_threshold_description,
|
837 |
+
default=-6,
|
838 |
+
required=False,
|
839 |
+
)
|
840 |
+
|
841 |
+
limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
|
842 |
+
infer_parser.add_argument(
|
843 |
+
"--limiter_release_time",
|
844 |
+
type=float,
|
845 |
+
help=limiter_release_time_description,
|
846 |
+
default=0.01,
|
847 |
+
required=False,
|
848 |
+
)
|
849 |
+
|
850 |
+
gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
|
851 |
+
infer_parser.add_argument(
|
852 |
+
"--gain_db",
|
853 |
+
type=float,
|
854 |
+
help=gain_db_description,
|
855 |
+
default=0.0,
|
856 |
+
required=False,
|
857 |
+
)
|
858 |
+
|
859 |
+
distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
|
860 |
+
infer_parser.add_argument(
|
861 |
+
"--distortion_gain",
|
862 |
+
type=float,
|
863 |
+
help=distortion_gain_description,
|
864 |
+
default=25,
|
865 |
+
required=False,
|
866 |
+
)
|
867 |
+
|
868 |
+
chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
|
869 |
+
infer_parser.add_argument(
|
870 |
+
"--chorus_rate",
|
871 |
+
type=float,
|
872 |
+
help=chorus_rate_description,
|
873 |
+
default=1.0,
|
874 |
+
required=False,
|
875 |
+
)
|
876 |
+
|
877 |
+
chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
|
878 |
+
infer_parser.add_argument(
|
879 |
+
"--chorus_depth",
|
880 |
+
type=float,
|
881 |
+
help=chorus_depth_description,
|
882 |
+
default=0.25,
|
883 |
+
required=False,
|
884 |
+
)
|
885 |
+
|
886 |
+
chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
|
887 |
+
infer_parser.add_argument(
|
888 |
+
"--chorus_center_delay",
|
889 |
+
type=float,
|
890 |
+
help=chorus_center_delay_description,
|
891 |
+
default=7,
|
892 |
+
required=False,
|
893 |
+
)
|
894 |
+
|
895 |
+
chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
|
896 |
+
infer_parser.add_argument(
|
897 |
+
"--chorus_feedback",
|
898 |
+
type=float,
|
899 |
+
help=chorus_feedback_description,
|
900 |
+
default=0.0,
|
901 |
+
required=False,
|
902 |
+
)
|
903 |
+
|
904 |
+
chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
|
905 |
+
infer_parser.add_argument(
|
906 |
+
"--chorus_mix",
|
907 |
+
type=float,
|
908 |
+
help=chorus_mix_description,
|
909 |
+
default=0.5,
|
910 |
+
required=False,
|
911 |
+
)
|
912 |
+
|
913 |
+
bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
|
914 |
+
infer_parser.add_argument(
|
915 |
+
"--bitcrush_bit_depth",
|
916 |
+
type=int,
|
917 |
+
help=bitcrush_bit_depth_description,
|
918 |
+
default=8,
|
919 |
+
required=False,
|
920 |
+
)
|
921 |
+
|
922 |
+
clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
|
923 |
+
infer_parser.add_argument(
|
924 |
+
"--clipping_threshold",
|
925 |
+
type=float,
|
926 |
+
help=clipping_threshold_description,
|
927 |
+
default=-6,
|
928 |
+
required=False,
|
929 |
+
)
|
930 |
+
|
931 |
+
compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
|
932 |
+
infer_parser.add_argument(
|
933 |
+
"--compressor_threshold",
|
934 |
+
type=float,
|
935 |
+
help=compressor_threshold_description,
|
936 |
+
default=0,
|
937 |
+
required=False,
|
938 |
+
)
|
939 |
+
|
940 |
+
compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
|
941 |
+
infer_parser.add_argument(
|
942 |
+
"--compressor_ratio",
|
943 |
+
type=float,
|
944 |
+
help=compressor_ratio_description,
|
945 |
+
default=1,
|
946 |
+
required=False,
|
947 |
+
)
|
948 |
+
|
949 |
+
compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
|
950 |
+
infer_parser.add_argument(
|
951 |
+
"--compressor_attack",
|
952 |
+
type=float,
|
953 |
+
help=compressor_attack_description,
|
954 |
+
default=1.0,
|
955 |
+
required=False,
|
956 |
+
)
|
957 |
+
|
958 |
+
compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
|
959 |
+
infer_parser.add_argument(
|
960 |
+
"--compressor_release",
|
961 |
+
type=float,
|
962 |
+
help=compressor_release_description,
|
963 |
+
default=100,
|
964 |
+
required=False,
|
965 |
+
)
|
966 |
+
|
967 |
+
delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
|
968 |
+
infer_parser.add_argument(
|
969 |
+
"--delay_seconds",
|
970 |
+
type=float,
|
971 |
+
help=delay_seconds_description,
|
972 |
+
default=0.5,
|
973 |
+
required=False,
|
974 |
+
)
|
975 |
+
delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
|
976 |
+
infer_parser.add_argument(
|
977 |
+
"--delay_feedback",
|
978 |
+
type=float,
|
979 |
+
help=delay_feedback_description,
|
980 |
+
default=0.0,
|
981 |
+
required=False,
|
982 |
+
)
|
983 |
+
delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
|
984 |
+
infer_parser.add_argument(
|
985 |
+
"--delay_mix",
|
986 |
+
type=float,
|
987 |
+
help=delay_mix_description,
|
988 |
+
default=0.5,
|
989 |
+
required=False,
|
990 |
+
)
|
991 |
+
|
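# Illustrative CLI sketch for this 'infer' mode (all paths are placeholders), assuming
# the parser is exposed through `python scrpt.py`:
#   python scrpt.py infer --input_path input.wav --output_path output.wav \
#       --pth_path logs/model/model.pth --index_path logs/model/model.index \
#       --f0_method rmvpe --export_format WAV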
992 |
+
# Parser for 'batch_infer' mode
|
993 |
+
batch_infer_parser = subparsers.add_parser(
|
994 |
+
"batch_infer",
|
995 |
+
help="Run batch inference",
|
996 |
+
)
|
997 |
+
batch_infer_parser.add_argument(
|
998 |
+
"--pitch",
|
999 |
+
type=int,
|
1000 |
+
help=pitch_description,
|
1001 |
+
choices=range(-24, 25),
|
1002 |
+
default=0,
|
1003 |
+
)
|
1004 |
+
batch_infer_parser.add_argument(
|
1005 |
+
"--filter_radius",
|
1006 |
+
type=int,
|
1007 |
+
help=filter_radius_description,
|
1008 |
+
choices=range(11),
|
1009 |
+
default=3,
|
1010 |
+
)
|
1011 |
+
batch_infer_parser.add_argument(
|
1012 |
+
"--index_rate",
|
1013 |
+
type=float,
|
1014 |
+
help=index_rate_description,
|
1015 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
1016 |
+
default=0.3,
|
1017 |
+
)
|
1018 |
+
batch_infer_parser.add_argument(
|
1019 |
+
"--volume_envelope",
|
1020 |
+
type=float,
|
1021 |
+
help=volume_envelope_description,
|
1022 |
+
choices=[i / 100.0 for i in range(0, 101)],
|
1023 |
+
default=1,
|
1024 |
+
)
|
1025 |
+
batch_infer_parser.add_argument(
|
1026 |
+
"--protect",
|
1027 |
+
type=float,
|
1028 |
+
help=protect_description,
|
1029 |
+
choices=[i / 1000.0 for i in range(0, 501)],
|
1030 |
+
default=0.33,
|
1031 |
+
)
|
1032 |
+
batch_infer_parser.add_argument(
|
1033 |
+
"--hop_length",
|
1034 |
+
type=int,
|
1035 |
+
help=hop_length_description,
|
1036 |
+
choices=range(1, 513),
|
1037 |
+
default=128,
|
1038 |
+
)
|
1039 |
+
batch_infer_parser.add_argument(
|
1040 |
+
"--f0_method",
|
1041 |
+
type=str,
|
1042 |
+
help=f0_method_description,
|
1043 |
+
choices=[
|
1044 |
+
"crepe",
|
1045 |
+
"crepe-tiny",
|
1046 |
+
"rmvpe",
|
1047 |
+
"fcpe",
|
1048 |
+
"hybrid[crepe+rmvpe]",
|
1049 |
+
"hybrid[crepe+fcpe]",
|
1050 |
+
"hybrid[rmvpe+fcpe]",
|
1051 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
1052 |
+
],
|
1053 |
+
default="rmvpe",
|
1054 |
+
)
|
1055 |
+
batch_infer_parser.add_argument(
|
1056 |
+
"--input_folder",
|
1057 |
+
type=str,
|
1058 |
+
help="Path to the folder containing input audio files.",
|
1059 |
+
required=True,
|
1060 |
+
)
|
1061 |
+
batch_infer_parser.add_argument(
|
1062 |
+
"--output_folder",
|
1063 |
+
type=str,
|
1064 |
+
help="Path to the folder for saving output audio files.",
|
1065 |
+
required=True,
|
1066 |
+
)
|
1067 |
+
batch_infer_parser.add_argument(
|
1068 |
+
"--pth_path", type=str, help=pth_path_description, required=True
|
1069 |
+
)
|
1070 |
+
batch_infer_parser.add_argument(
|
1071 |
+
"--index_path", type=str, help=index_path_description, required=True
|
1072 |
+
)
|
1073 |
+
batch_infer_parser.add_argument(
|
1074 |
+
"--split_audio",
|
1075 |
+
type=lambda x: bool(strtobool(x)),
|
1076 |
+
choices=[True, False],
|
1077 |
+
help=split_audio_description,
|
1078 |
+
default=False,
|
1079 |
+
)
|
1080 |
+
batch_infer_parser.add_argument(
|
1081 |
+
"--f0_autotune",
|
1082 |
+
type=lambda x: bool(strtobool(x)),
|
1083 |
+
choices=[True, False],
|
1084 |
+
help=f0_autotune_description,
|
1085 |
+
default=False,
|
1086 |
+
)
|
1087 |
+
batch_infer_parser.add_argument(
|
1088 |
+
"--f0_autotune_strength",
|
1089 |
+
type=float,
|
1090 |
+
help=f0_autotune_strength_description,
|
1091 |
+
choices=[(i / 10) for i in range(11)],
|
1092 |
+
default=1.0,
|
1093 |
+
)
|
1094 |
+
batch_infer_parser.add_argument(
|
1095 |
+
"--clean_audio",
|
1096 |
+
type=lambda x: bool(strtobool(x)),
|
1097 |
+
choices=[True, False],
|
1098 |
+
help=clean_audio_description,
|
1099 |
+
default=False,
|
1100 |
+
)
|
1101 |
+
batch_infer_parser.add_argument(
|
1102 |
+
"--clean_strength",
|
1103 |
+
type=float,
|
1104 |
+
help=clean_strength_description,
|
1105 |
+
choices=[(i / 10) for i in range(11)],
|
1106 |
+
default=0.7,
|
1107 |
+
)
|
1108 |
+
batch_infer_parser.add_argument(
|
1109 |
+
"--export_format",
|
1110 |
+
type=str,
|
1111 |
+
help=export_format_description,
|
1112 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
1113 |
+
default="WAV",
|
1114 |
+
)
|
1115 |
+
batch_infer_parser.add_argument(
|
1116 |
+
"--embedder_model",
|
1117 |
+
type=str,
|
1118 |
+
help=embedder_model_description,
|
1119 |
+
choices=[
|
1120 |
+
"contentvec",
|
1121 |
+
"chinese-hubert-base",
|
1122 |
+
"japanese-hubert-base",
|
1123 |
+
"korean-hubert-base",
|
1124 |
+
"custom",
|
1125 |
+
],
|
1126 |
+
default="contentvec",
|
1127 |
+
)
|
1128 |
+
batch_infer_parser.add_argument(
|
1129 |
+
"--embedder_model_custom",
|
1130 |
+
type=str,
|
1131 |
+
help=embedder_model_custom_description,
|
1132 |
+
default=None,
|
1133 |
+
)
|
1134 |
+
batch_infer_parser.add_argument(
|
1135 |
+
"--f0_file",
|
1136 |
+
type=str,
|
1137 |
+
help=f0_file_description,
|
1138 |
+
default=None,
|
1139 |
+
)
|
1140 |
+
batch_infer_parser.add_argument(
|
1141 |
+
"--formant_shifting",
|
1142 |
+
type=lambda x: bool(strtobool(x)),
|
1143 |
+
choices=[True, False],
|
1144 |
+
help=formant_shifting_description,
|
1145 |
+
default=False,
|
1146 |
+
required=False,
|
1147 |
+
)
|
1148 |
+
batch_infer_parser.add_argument(
|
1149 |
+
"--formant_qfrency",
|
1150 |
+
type=float,
|
1151 |
+
help=formant_qfrency_description,
|
1152 |
+
default=1.0,
|
1153 |
+
required=False,
|
1154 |
+
)
|
1155 |
+
batch_infer_parser.add_argument(
|
1156 |
+
"--formant_timbre",
|
1157 |
+
type=float,
|
1158 |
+
help=formant_timbre_description,
|
1159 |
+
default=1.0,
|
1160 |
+
required=False,
|
1161 |
+
)
|
1162 |
+
batch_infer_parser.add_argument(
|
1163 |
+
"--sid",
|
1164 |
+
type=int,
|
1165 |
+
help=sid_description,
|
1166 |
+
default=0,
|
1167 |
+
required=False,
|
1168 |
+
)
|
1169 |
+
batch_infer_parser.add_argument(
|
1170 |
+
"--post_process",
|
1171 |
+
type=lambda x: bool(strtobool(x)),
|
1172 |
+
choices=[True, False],
|
1173 |
+
help=post_process_description,
|
1174 |
+
default=False,
|
1175 |
+
required=False,
|
1176 |
+
)
|
1177 |
+
batch_infer_parser.add_argument(
|
1178 |
+
"--reverb",
|
1179 |
+
type=lambda x: bool(strtobool(x)),
|
1180 |
+
choices=[True, False],
|
1181 |
+
help=reverb_description,
|
1182 |
+
default=False,
|
1183 |
+
required=False,
|
1184 |
+
)
|
1185 |
+
|
1186 |
+
batch_infer_parser.add_argument(
|
1187 |
+
"--pitch_shift",
|
1188 |
+
type=lambda x: bool(strtobool(x)),
|
1189 |
+
choices=[True, False],
|
1190 |
+
help=pitch_shift_description,
|
1191 |
+
default=False,
|
1192 |
+
required=False,
|
1193 |
+
)
|
1194 |
+
|
1195 |
+
batch_infer_parser.add_argument(
|
1196 |
+
"--limiter",
|
1197 |
+
type=lambda x: bool(strtobool(x)),
|
1198 |
+
choices=[True, False],
|
1199 |
+
help=limiter_description,
|
1200 |
+
default=False,
|
1201 |
+
required=False,
|
1202 |
+
)
|
1203 |
+
|
1204 |
+
batch_infer_parser.add_argument(
|
1205 |
+
"--gain",
|
1206 |
+
type=lambda x: bool(strtobool(x)),
|
1207 |
+
choices=[True, False],
|
1208 |
+
help=gain_description,
|
1209 |
+
default=False,
|
1210 |
+
required=False,
|
1211 |
+
)
|
1212 |
+
|
1213 |
+
batch_infer_parser.add_argument(
|
1214 |
+
"--distortion",
|
1215 |
+
type=lambda x: bool(strtobool(x)),
|
1216 |
+
choices=[True, False],
|
1217 |
+
help=distortion_description,
|
1218 |
+
default=False,
|
1219 |
+
required=False,
|
1220 |
+
)
|
1221 |
+
|
1222 |
+
batch_infer_parser.add_argument(
|
1223 |
+
"--chorus",
|
1224 |
+
type=lambda x: bool(strtobool(x)),
|
1225 |
+
choices=[True, False],
|
1226 |
+
help=chorus_description,
|
1227 |
+
default=False,
|
1228 |
+
required=False,
|
1229 |
+
)
|
1230 |
+
|
1231 |
+
batch_infer_parser.add_argument(
|
1232 |
+
"--bitcrush",
|
1233 |
+
type=lambda x: bool(strtobool(x)),
|
1234 |
+
choices=[True, False],
|
1235 |
+
help=bitcrush_description,
|
1236 |
+
default=False,
|
1237 |
+
required=False,
|
1238 |
+
)
|
1239 |
+
|
1240 |
+
batch_infer_parser.add_argument(
|
1241 |
+
"--clipping",
|
1242 |
+
type=lambda x: bool(strtobool(x)),
|
1243 |
+
choices=[True, False],
|
1244 |
+
help=clipping_description,
|
1245 |
+
default=False,
|
1246 |
+
required=False,
|
1247 |
+
)
|
1248 |
+
|
1249 |
+
batch_infer_parser.add_argument(
|
1250 |
+
"--compressor",
|
1251 |
+
type=lambda x: bool(strtobool(x)),
|
1252 |
+
choices=[True, False],
|
1253 |
+
help=compressor_description,
|
1254 |
+
default=False,
|
1255 |
+
required=False,
|
1256 |
+
)
|
1257 |
+
|
1258 |
+
batch_infer_parser.add_argument(
|
1259 |
+
"--delay",
|
1260 |
+
type=lambda x: bool(strtobool(x)),
|
1261 |
+
choices=[True, False],
|
1262 |
+
help=delay_description,
|
1263 |
+
default=False,
|
1264 |
+
required=False,
|
1265 |
+
)
|
1266 |
+
|
1267 |
+
batch_infer_parser.add_argument(
|
1268 |
+
"--reverb_room_size",
|
1269 |
+
type=float,
|
1270 |
+
help=reverb_room_size_description,
|
1271 |
+
default=0.5,
|
1272 |
+
required=False,
|
1273 |
+
)
|
1274 |
+
|
1275 |
+
batch_infer_parser.add_argument(
|
1276 |
+
"--reverb_damping",
|
1277 |
+
type=float,
|
1278 |
+
help=reverb_damping_description,
|
1279 |
+
default=0.5,
|
1280 |
+
required=False,
|
1281 |
+
)
|
1282 |
+
|
1283 |
+
batch_infer_parser.add_argument(
|
1284 |
+
"--reverb_wet_gain",
|
1285 |
+
type=float,
|
1286 |
+
help=reverb_wet_gain_description,
|
1287 |
+
default=0.5,
|
1288 |
+
required=False,
|
1289 |
+
)
|
1290 |
+
|
1291 |
+
batch_infer_parser.add_argument(
|
1292 |
+
"--reverb_dry_gain",
|
1293 |
+
type=float,
|
1294 |
+
help=reverb_dry_gain_description,
|
1295 |
+
default=0.5,
|
1296 |
+
required=False,
|
1297 |
+
)
|
1298 |
+
|
1299 |
+
batch_infer_parser.add_argument(
|
1300 |
+
"--reverb_width",
|
1301 |
+
type=float,
|
1302 |
+
help=reverb_width_description,
|
1303 |
+
default=0.5,
|
1304 |
+
required=False,
|
1305 |
+
)
|
1306 |
+
|
1307 |
+
batch_infer_parser.add_argument(
|
1308 |
+
"--reverb_freeze_mode",
|
1309 |
+
type=float,
|
1310 |
+
help=reverb_freeze_mode_description,
|
1311 |
+
default=0.5,
|
1312 |
+
required=False,
|
1313 |
+
)
|
1314 |
+
|
1315 |
+
batch_infer_parser.add_argument(
|
1316 |
+
"--pitch_shift_semitones",
|
1317 |
+
type=float,
|
1318 |
+
help=pitch_shift_semitones_description,
|
1319 |
+
default=0.0,
|
1320 |
+
required=False,
|
1321 |
+
)
|
1322 |
+
|
1323 |
+
batch_infer_parser.add_argument(
|
1324 |
+
"--limiter_threshold",
|
1325 |
+
type=float,
|
1326 |
+
help=limiter_threshold_description,
|
1327 |
+
default=-6,
|
1328 |
+
required=False,
|
1329 |
+
)
|
1330 |
+
|
1331 |
+
batch_infer_parser.add_argument(
|
1332 |
+
"--limiter_release_time",
|
1333 |
+
type=float,
|
1334 |
+
help=limiter_release_time_description,
|
1335 |
+
default=0.01,
|
1336 |
+
required=False,
|
1337 |
+
)
|
1338 |
+
batch_infer_parser.add_argument(
|
1339 |
+
"--gain_db",
|
1340 |
+
type=float,
|
1341 |
+
help=gain_db_description,
|
1342 |
+
default=0.0,
|
1343 |
+
required=False,
|
1344 |
+
)
|
1345 |
+
|
1346 |
+
batch_infer_parser.add_argument(
|
1347 |
+
"--distortion_gain",
|
1348 |
+
type=float,
|
1349 |
+
help=distortion_gain_description,
|
1350 |
+
default=25,
|
1351 |
+
required=False,
|
1352 |
+
)
|
1353 |
+
|
1354 |
+
batch_infer_parser.add_argument(
|
1355 |
+
"--chorus_rate",
|
1356 |
+
type=float,
|
1357 |
+
help=chorus_rate_description,
|
1358 |
+
default=1.0,
|
1359 |
+
required=False,
|
1360 |
+
)
|
1361 |
+
|
1362 |
+
batch_infer_parser.add_argument(
|
1363 |
+
"--chorus_depth",
|
1364 |
+
type=float,
|
1365 |
+
help=chorus_depth_description,
|
1366 |
+
default=0.25,
|
1367 |
+
required=False,
|
1368 |
+
)
|
1369 |
+
batch_infer_parser.add_argument(
|
1370 |
+
"--chorus_center_delay",
|
1371 |
+
type=float,
|
1372 |
+
help=chorus_center_delay_description,
|
1373 |
+
default=7,
|
1374 |
+
required=False,
|
1375 |
+
)
|
1376 |
+
|
1377 |
+
batch_infer_parser.add_argument(
|
1378 |
+
"--chorus_feedback",
|
1379 |
+
type=float,
|
1380 |
+
help=chorus_feedback_description,
|
1381 |
+
default=0.0,
|
1382 |
+
required=False,
|
1383 |
+
)
|
1384 |
+
|
1385 |
+
batch_infer_parser.add_argument(
|
1386 |
+
"--chorus_mix",
|
1387 |
+
type=float,
|
1388 |
+
help=chorus_mix_description,
|
1389 |
+
default=0.5,
|
1390 |
+
required=False,
|
1391 |
+
)
|
1392 |
+
|
1393 |
+
batch_infer_parser.add_argument(
|
1394 |
+
"--bitcrush_bit_depth",
|
1395 |
+
type=int,
|
1396 |
+
help=bitcrush_bit_depth_description,
|
1397 |
+
default=8,
|
1398 |
+
required=False,
|
1399 |
+
)
|
1400 |
+
|
1401 |
+
batch_infer_parser.add_argument(
|
1402 |
+
"--clipping_threshold",
|
1403 |
+
type=float,
|
1404 |
+
help=clipping_threshold_description,
|
1405 |
+
default=-6,
|
1406 |
+
required=False,
|
1407 |
+
)
|
1408 |
+
|
1409 |
+
batch_infer_parser.add_argument(
|
1410 |
+
"--compressor_threshold",
|
1411 |
+
type=float,
|
1412 |
+
help=compressor_threshold_description,
|
1413 |
+
default=0,
|
1414 |
+
required=False,
|
1415 |
+
)
|
1416 |
+
|
1417 |
+
batch_infer_parser.add_argument(
|
1418 |
+
"--compressor_ratio",
|
1419 |
+
type=float,
|
1420 |
+
help=compressor_ratio_description,
|
1421 |
+
default=1,
|
1422 |
+
required=False,
|
1423 |
+
)
|
1424 |
+
|
1425 |
+
batch_infer_parser.add_argument(
|
1426 |
+
"--compressor_attack",
|
1427 |
+
type=float,
|
1428 |
+
help=compressor_attack_description,
|
1429 |
+
default=1.0,
|
1430 |
+
required=False,
|
1431 |
+
)
|
1432 |
+
|
1433 |
+
batch_infer_parser.add_argument(
|
1434 |
+
"--compressor_release",
|
1435 |
+
type=float,
|
1436 |
+
help=compressor_release_description,
|
1437 |
+
default=100,
|
1438 |
+
required=False,
|
1439 |
+
)
|
1440 |
+
batch_infer_parser.add_argument(
|
1441 |
+
"--delay_seconds",
|
1442 |
+
type=float,
|
1443 |
+
help=delay_seconds_description,
|
1444 |
+
default=0.5,
|
1445 |
+
required=False,
|
1446 |
+
)
|
1447 |
+
batch_infer_parser.add_argument(
|
1448 |
+
"--delay_feedback",
|
1449 |
+
type=float,
|
1450 |
+
help=delay_feedback_description,
|
1451 |
+
default=0.0,
|
1452 |
+
required=False,
|
1453 |
+
)
|
1454 |
+
batch_infer_parser.add_argument(
|
1455 |
+
"--delay_mix",
|
1456 |
+
type=float,
|
1457 |
+
help=delay_mix_description,
|
1458 |
+
default=0.5,
|
1459 |
+
required=False,
|
1460 |
+
)
|
1461 |
+
|
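# Illustrative CLI sketch for this 'batch_infer' mode (folders and paths are placeholders):
#   python scrpt.py batch_infer --input_folder assets/audios --output_folder assets/outputs \
#       --pth_path logs/model/model.pth --index_path logs/model/model.index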
1462 |
+
# Parser for 'tts' mode
|
1463 |
+
tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
|
1464 |
+
tts_parser.add_argument(
|
1465 |
+
"--tts_file", type=str, help="File with a text to be synthesized", required=True
|
1466 |
+
)
|
1467 |
+
tts_parser.add_argument(
|
1468 |
+
"--tts_text", type=str, help="Text to be synthesized", required=True
|
1469 |
+
)
|
1470 |
+
tts_parser.add_argument(
|
1471 |
+
"--tts_voice",
|
1472 |
+
type=str,
|
1473 |
+
help="Voice to be used for TTS synthesis.",
|
1474 |
+
choices=locales,
|
1475 |
+
required=True,
|
1476 |
+
)
|
1477 |
+
tts_parser.add_argument(
|
1478 |
+
"--tts_rate",
|
1479 |
+
type=int,
|
1480 |
+
help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
|
1481 |
+
choices=range(-100, 101),
|
1482 |
+
default=0,
|
1483 |
+
)
|
1484 |
+
tts_parser.add_argument(
|
1485 |
+
"--pitch",
|
1486 |
+
type=int,
|
1487 |
+
help=pitch_description,
|
1488 |
+
choices=range(-24, 25),
|
1489 |
+
default=0,
|
1490 |
+
)
|
1491 |
+
tts_parser.add_argument(
|
1492 |
+
"--filter_radius",
|
1493 |
+
type=int,
|
1494 |
+
help=filter_radius_description,
|
1495 |
+
choices=range(11),
|
1496 |
+
default=3,
|
1497 |
+
)
|
1498 |
+
tts_parser.add_argument(
|
1499 |
+
"--index_rate",
|
1500 |
+
type=float,
|
1501 |
+
help=index_rate_description,
|
1502 |
+
choices=[(i / 10) for i in range(11)],
|
1503 |
+
default=0.3,
|
1504 |
+
)
|
1505 |
+
tts_parser.add_argument(
|
1506 |
+
"--volume_envelope",
|
1507 |
+
type=float,
|
1508 |
+
help=volume_envelope_description,
|
1509 |
+
choices=[(i / 10) for i in range(11)],
|
1510 |
+
default=1,
|
1511 |
+
)
|
1512 |
+
tts_parser.add_argument(
|
1513 |
+
"--protect",
|
1514 |
+
type=float,
|
1515 |
+
help=protect_description,
|
1516 |
+
choices=[(i / 10) for i in range(6)],
|
1517 |
+
default=0.33,
|
1518 |
+
)
|
1519 |
+
tts_parser.add_argument(
|
1520 |
+
"--hop_length",
|
1521 |
+
type=int,
|
1522 |
+
help=hop_length_description,
|
1523 |
+
choices=range(1, 513),
|
1524 |
+
default=128,
|
1525 |
+
)
|
1526 |
+
tts_parser.add_argument(
|
1527 |
+
"--f0_method",
|
1528 |
+
type=str,
|
1529 |
+
help=f0_method_description,
|
1530 |
+
choices=[
|
1531 |
+
"crepe",
|
1532 |
+
"crepe-tiny",
|
1533 |
+
"rmvpe",
|
1534 |
+
"fcpe",
|
1535 |
+
"hybrid[crepe+rmvpe]",
|
1536 |
+
"hybrid[crepe+fcpe]",
|
1537 |
+
"hybrid[rmvpe+fcpe]",
|
1538 |
+
"hybrid[crepe+rmvpe+fcpe]",
|
1539 |
+
],
|
1540 |
+
default="rmvpe",
|
1541 |
+
)
|
1542 |
+
tts_parser.add_argument(
|
1543 |
+
"--output_tts_path",
|
1544 |
+
type=str,
|
1545 |
+
help="Full path to save the synthesized TTS audio.",
|
1546 |
+
required=True,
|
1547 |
+
)
|
1548 |
+
tts_parser.add_argument(
|
1549 |
+
"--output_rvc_path",
|
1550 |
+
type=str,
|
1551 |
+
help="Full path to save the voice-converted audio using the synthesized TTS.",
|
1552 |
+
required=True,
|
1553 |
+
)
|
1554 |
+
tts_parser.add_argument(
|
1555 |
+
"--pth_path", type=str, help=pth_path_description, required=True
|
1556 |
+
)
|
1557 |
+
tts_parser.add_argument(
|
1558 |
+
"--index_path", type=str, help=index_path_description, required=True
|
1559 |
+
)
|
1560 |
+
tts_parser.add_argument(
|
1561 |
+
"--split_audio",
|
1562 |
+
type=lambda x: bool(strtobool(x)),
|
1563 |
+
choices=[True, False],
|
1564 |
+
help=split_audio_description,
|
1565 |
+
default=False,
|
1566 |
+
)
|
1567 |
+
tts_parser.add_argument(
|
1568 |
+
"--f0_autotune",
|
1569 |
+
type=lambda x: bool(strtobool(x)),
|
1570 |
+
choices=[True, False],
|
1571 |
+
help=f0_autotune_description,
|
1572 |
+
default=False,
|
1573 |
+
)
|
1574 |
+
tts_parser.add_argument(
|
1575 |
+
"--f0_autotune_strength",
|
1576 |
+
type=float,
|
1577 |
+
help=f0_autotune_strength_description,
|
1578 |
+
choices=[(i / 10) for i in range(11)],
|
1579 |
+
default=1.0,
|
1580 |
+
)
|
1581 |
+
tts_parser.add_argument(
|
1582 |
+
"--clean_audio",
|
1583 |
+
type=lambda x: bool(strtobool(x)),
|
1584 |
+
choices=[True, False],
|
1585 |
+
help=clean_audio_description,
|
1586 |
+
default=False,
|
1587 |
+
)
|
1588 |
+
tts_parser.add_argument(
|
1589 |
+
"--clean_strength",
|
1590 |
+
type=float,
|
1591 |
+
help=clean_strength_description,
|
1592 |
+
choices=[(i / 10) for i in range(11)],
|
1593 |
+
default=0.7,
|
1594 |
+
)
|
1595 |
+
tts_parser.add_argument(
|
1596 |
+
"--export_format",
|
1597 |
+
type=str,
|
1598 |
+
help=export_format_description,
|
1599 |
+
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
1600 |
+
default="WAV",
|
1601 |
+
)
|
1602 |
+
tts_parser.add_argument(
|
1603 |
+
"--embedder_model",
|
1604 |
+
type=str,
|
1605 |
+
help=embedder_model_description,
|
1606 |
+
choices=[
|
1607 |
+
"contentvec",
|
1608 |
+
"chinese-hubert-base",
|
1609 |
+
"japanese-hubert-base",
|
1610 |
+
"korean-hubert-base",
|
1611 |
+
"custom",
|
1612 |
+
],
|
1613 |
+
default="contentvec",
|
1614 |
+
)
|
1615 |
+
tts_parser.add_argument(
|
1616 |
+
"--embedder_model_custom",
|
1617 |
+
type=str,
|
1618 |
+
help=embedder_model_custom_description,
|
1619 |
+
default=None,
|
1620 |
+
)
|
1621 |
+
tts_parser.add_argument(
|
1622 |
+
"--f0_file",
|
1623 |
+
type=str,
|
1624 |
+
help=f0_file_description,
|
1625 |
+
default=None,
|
1626 |
+
)
|
1627 |
+
|
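# Illustrative CLI sketch for this 'tts' mode (paths are placeholders; the voice name is
# assumed to be one of the entries in rvc/lib/tools/tts_voices.json):
#   python scrpt.py tts --tts_file text.txt --tts_text "Hello world" \
#       --tts_voice en-US-AriaNeural --tts_rate 0 \
#       --output_tts_path tts.wav --output_rvc_path tts_rvc.wav \
#       --pth_path logs/model/model.pth --index_path logs/model/model.index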
1628 |
+
# Parser for 'model_information' mode
|
1629 |
+
model_information_parser = subparsers.add_parser(
|
1630 |
+
"model_information", help="Display information about a trained model."
|
1631 |
+
)
|
1632 |
+
model_information_parser.add_argument(
|
1633 |
+
"--pth_path", type=str, help="Path to the .pth model file.", required=True
|
1634 |
+
)
|
1635 |
+
|
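# Illustrative CLI sketch for this 'model_information' mode (the path is a placeholder):
#   python scrpt.py model_information --pth_path logs/model/model.pth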
1636 |
+
# Parser for 'model_blender' mode
|
1637 |
+
model_blender_parser = subparsers.add_parser(
|
1638 |
+
"model_blender", help="Fuse two RVC models together."
|
1639 |
+
)
|
1640 |
+
model_blender_parser.add_argument(
|
1641 |
+
"--model_name", type=str, help="Name of the new fused model.", required=True
|
1642 |
+
)
|
1643 |
+
model_blender_parser.add_argument(
|
1644 |
+
"--pth_path_1",
|
1645 |
+
type=str,
|
1646 |
+
help="Path to the first .pth model file.",
|
1647 |
+
required=True,
|
1648 |
+
)
|
1649 |
+
model_blender_parser.add_argument(
|
1650 |
+
"--pth_path_2",
|
1651 |
+
type=str,
|
1652 |
+
help="Path to the second .pth model file.",
|
1653 |
+
required=True,
|
1654 |
+
)
|
1655 |
+
model_blender_parser.add_argument(
|
1656 |
+
"--ratio",
|
1657 |
+
type=float,
|
1658 |
+
help="Ratio for blending the two models (0.0 to 1.0).",
|
1659 |
+
choices=[(i / 10) for i in range(11)],
|
1660 |
+
default=0.5,
|
1661 |
+
)
|
1662 |
+
|
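# Illustrative CLI sketch for this 'model_blender' mode (names and paths are placeholders):
#   python scrpt.py model_blender --model_name fused_model \
#       --pth_path_1 logs/model_a/model_a.pth --pth_path_2 logs/model_b/model_b.pth --ratio 0.5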
1663 |
+
# Parser for 'tensorboard' mode
|
1664 |
+
subparsers.add_parser(
|
1665 |
+
"tensorboard", help="Launch TensorBoard for monitoring training progress."
|
1666 |
+
)
|
1667 |
+
|
1668 |
+
# Parser for 'download' mode
|
1669 |
+
download_parser = subparsers.add_parser(
|
1670 |
+
"download", help="Download a model from a provided link."
|
1671 |
+
)
|
1672 |
+
download_parser.add_argument(
|
1673 |
+
"--model_link", type=str, help="Direct link to the model file.", required=True
|
1674 |
+
)
|
1675 |
+
|
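# Illustrative CLI sketch for this 'download' mode (the link is a placeholder):
#   python scrpt.py download --model_link "https://huggingface.co/<user>/<repo>/resolve/main/model.zip"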
1676 |
+
# Parser for 'prerequisites' mode
|
1677 |
+
prerequisites_parser = subparsers.add_parser(
|
1678 |
+
"prerequisites", help="Install prerequisites for RVC."
|
1679 |
+
)
|
1680 |
+
prerequisites_parser.add_argument(
|
1681 |
+
"--models",
|
1682 |
+
type=lambda x: bool(strtobool(x)),
|
1683 |
+
choices=[True, False],
|
1684 |
+
default=True,
|
1685 |
+
help="Download additional models.",
|
1686 |
+
)
|
1687 |
+
prerequisites_parser.add_argument(
|
1688 |
+
"--exe",
|
1689 |
+
type=lambda x: bool(strtobool(x)),
|
1690 |
+
choices=[True, False],
|
1691 |
+
default=True,
|
1692 |
+
help="Download required executables.",
|
1693 |
+
)
|
1694 |
+
|
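# Illustrative CLI sketch for this 'prerequisites' mode (boolean flags are parsed with strtobool):
#   python scrpt.py prerequisites --models True --exe False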
1695 |
+
# Parser for 'audio_analyzer' mode
|
1696 |
+
audio_analyzer = subparsers.add_parser(
|
1697 |
+
"audio_analyzer", help="Analyze an audio file."
|
1698 |
+
)
|
1699 |
+
audio_analyzer.add_argument(
|
1700 |
+
"--input_path", type=str, help="Path to the input audio file.", required=True
|
1701 |
+
)
|
1702 |
+
|
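# Illustrative CLI sketch for this 'audio_analyzer' mode (the path is a placeholder):
#   python scrpt.py audio_analyzer --input_path input.wav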
1703 |
+
return parser.parse_args()
|
1704 |
+
|
1705 |
+
|
1706 |
+
def main():
|
1707 |
+
if len(sys.argv) == 1:
|
1708 |
+
print("Please run the script with '-h' for more information.")
|
1709 |
+
sys.exit(1)
|
1710 |
+
|
1711 |
+
args = parse_arguments()
|
1712 |
+
|
1713 |
+
try:
|
1714 |
+
if args.mode == "infer":
|
1715 |
+
run_infer_script(
|
1716 |
+
pitch=args.pitch,
|
1717 |
+
filter_radius=args.filter_radius,
|
1718 |
+
index_rate=args.index_rate,
|
1719 |
+
volume_envelope=args.volume_envelope,
|
1720 |
+
protect=args.protect,
|
1721 |
+
hop_length=args.hop_length,
|
1722 |
+
f0_method=args.f0_method,
|
1723 |
+
input_path=args.input_path,
|
1724 |
+
output_path=args.output_path,
|
1725 |
+
pth_path=args.pth_path,
|
1726 |
+
index_path=args.index_path,
|
1727 |
+
split_audio=args.split_audio,
|
1728 |
+
f0_autotune=args.f0_autotune,
|
1729 |
+
f0_autotune_strength=args.f0_autotune_strength,
|
1730 |
+
clean_audio=args.clean_audio,
|
1731 |
+
clean_strength=args.clean_strength,
|
1732 |
+
export_format=args.export_format,
|
1733 |
+
embedder_model=args.embedder_model,
|
1734 |
+
embedder_model_custom=args.embedder_model_custom,
|
1735 |
+
f0_file=args.f0_file,
|
1736 |
+
formant_shifting=args.formant_shifting,
|
1737 |
+
formant_qfrency=args.formant_qfrency,
|
1738 |
+
formant_timbre=args.formant_timbre,
|
1739 |
+
sid=args.sid,
|
1740 |
+
post_process=args.post_process,
|
1741 |
+
reverb=args.reverb,
|
1742 |
+
pitch_shift=args.pitch_shift,
|
1743 |
+
limiter=args.limiter,
|
1744 |
+
gain=args.gain,
|
1745 |
+
distortion=args.distortion,
|
1746 |
+
chorus=args.chorus,
|
1747 |
+
bitcrush=args.bitcrush,
|
1748 |
+
clipping=args.clipping,
|
1749 |
+
compressor=args.compressor,
|
1750 |
+
delay=args.delay,
|
1751 |
+
reverb_room_size=args.reverb_room_size,
|
1752 |
+
reverb_damping=args.reverb_damping,
|
1753 |
+
reverb_wet_gain=args.reverb_wet_gain,
|
1754 |
+
reverb_dry_gain=args.reverb_dry_gain,
|
1755 |
+
reverb_width=args.reverb_width,
|
1756 |
+
reverb_freeze_mode=args.reverb_freeze_mode,
|
1757 |
+
pitch_shift_semitones=args.pitch_shift_semitones,
|
1758 |
+
limiter_threshold=args.limiter_threshold,
|
1759 |
+
limiter_release_time=args.limiter_release_time,
|
1760 |
+
gain_db=args.gain_db,
|
1761 |
+
distortion_gain=args.distortion_gain,
|
1762 |
+
chorus_rate=args.chorus_rate,
|
1763 |
+
chorus_depth=args.chorus_depth,
|
1764 |
+
chorus_center_delay=args.chorus_center_delay,
|
1765 |
+
chorus_feedback=args.chorus_feedback,
|
1766 |
+
chorus_mix=args.chorus_mix,
|
1767 |
+
bitcrush_bit_depth=args.bitcrush_bit_depth,
|
1768 |
+
clipping_threshold=args.clipping_threshold,
|
1769 |
+
compressor_threshold=args.compressor_threshold,
|
1770 |
+
compressor_ratio=args.compressor_ratio,
|
1771 |
+
compressor_attack=args.compressor_attack,
|
1772 |
+
compressor_release=args.compressor_release,
|
1773 |
+
delay_seconds=args.delay_seconds,
|
1774 |
+
delay_feedback=args.delay_feedback,
|
1775 |
+
delay_mix=args.delay_mix,
|
1776 |
+
)
|
1777 |
+
elif args.mode == "batch_infer":
|
1778 |
+
run_batch_infer_script(
|
1779 |
+
pitch=args.pitch,
|
1780 |
+
filter_radius=args.filter_radius,
|
1781 |
+
index_rate=args.index_rate,
|
1782 |
+
volume_envelope=args.volume_envelope,
|
1783 |
+
protect=args.protect,
|
1784 |
+
hop_length=args.hop_length,
|
1785 |
+
f0_method=args.f0_method,
|
1786 |
+
input_folder=args.input_folder,
|
1787 |
+
output_folder=args.output_folder,
|
1788 |
+
pth_path=args.pth_path,
|
1789 |
+
index_path=args.index_path,
|
1790 |
+
split_audio=args.split_audio,
|
1791 |
+
f0_autotune=args.f0_autotune,
|
1792 |
+
f0_autotune_strength=args.f0_autotune_strength,
|
1793 |
+
clean_audio=args.clean_audio,
|
1794 |
+
clean_strength=args.clean_strength,
|
1795 |
+
export_format=args.export_format,
|
1796 |
+
embedder_model=args.embedder_model,
|
1797 |
+
embedder_model_custom=args.embedder_model_custom,
|
1798 |
+
f0_file=args.f0_file,
|
1799 |
+
formant_shifting=args.formant_shifting,
|
1800 |
+
formant_qfrency=args.formant_qfrency,
|
1801 |
+
formant_timbre=args.formant_timbre,
|
1802 |
+
sid=args.sid,
|
1803 |
+
post_process=args.post_process,
|
1804 |
+
reverb=args.reverb,
|
1805 |
+
pitch_shift=args.pitch_shift,
|
1806 |
+
limiter=args.limiter,
|
1807 |
+
gain=args.gain,
|
1808 |
+
distortion=args.distortion,
|
1809 |
+
chorus=args.chorus,
|
1810 |
+
bitcrush=args.bitcrush,
|
1811 |
+
clipping=args.clipping,
|
1812 |
+
compressor=args.compressor,
|
1813 |
+
delay=args.delay,
|
1814 |
+
reverb_room_size=args.reverb_room_size,
|
1815 |
+
reverb_damping=args.reverb_damping,
|
1816 |
+
reverb_wet_gain=args.reverb_wet_gain,
|
1817 |
+
reverb_dry_gain=args.reverb_dry_gain,
|
1818 |
+
reverb_width=args.reverb_width,
|
1819 |
+
reverb_freeze_mode=args.reverb_freeze_mode,
|
1820 |
+
pitch_shift_semitones=args.pitch_shift_semitones,
|
1821 |
+
limiter_threshold=args.limiter_threshold,
|
1822 |
+
limiter_release_time=args.limiter_release_time,
|
1823 |
+
gain_db=args.gain_db,
|
1824 |
+
distortion_gain=args.distortion_gain,
|
1825 |
+
chorus_rate=args.chorus_rate,
|
1826 |
+
chorus_depth=args.chorus_depth,
|
1827 |
+
chorus_center_delay=args.chorus_center_delay,
|
1828 |
+
chorus_feedback=args.chorus_feedback,
|
1829 |
+
chorus_mix=args.chorus_mix,
|
1830 |
+
bitcrush_bit_depth=args.bitcrush_bit_depth,
|
1831 |
+
clipping_threshold=args.clipping_threshold,
|
1832 |
+
compressor_threshold=args.compressor_threshold,
|
1833 |
+
compressor_ratio=args.compressor_ratio,
|
1834 |
+
compressor_attack=args.compressor_attack,
|
1835 |
+
compressor_release=args.compressor_release,
|
1836 |
+
delay_seconds=args.delay_seconds,
|
1837 |
+
delay_feedback=args.delay_feedback,
|
1838 |
+
delay_mix=args.delay_mix,
|
1839 |
+
)
|
1840 |
+
elif args.mode == "tts":
|
1841 |
+
run_tts_script(
|
1842 |
+
tts_file=args.tts_file,
|
1843 |
+
tts_text=args.tts_text,
|
1844 |
+
tts_voice=args.tts_voice,
|
1845 |
+
tts_rate=args.tts_rate,
|
1846 |
+
pitch=args.pitch,
|
1847 |
+
filter_radius=args.filter_radius,
|
1848 |
+
index_rate=args.index_rate,
|
1849 |
+
volume_envelope=args.volume_envelope,
|
1850 |
+
protect=args.protect,
|
1851 |
+
hop_length=args.hop_length,
|
1852 |
+
f0_method=args.f0_method,
|
1853 |
+
output_tts_path=args.output_tts_path,
|
1854 |
+
output_rvc_path=args.output_rvc_path,
|
1855 |
+
pth_path=args.pth_path,
|
1856 |
+
index_path=args.index_path,
|
1857 |
+
split_audio=args.split_audio,
|
1858 |
+
f0_autotune=args.f0_autotune,
|
1859 |
+
f0_autotune_strength=args.f0_autotune_strength,
|
1860 |
+
clean_audio=args.clean_audio,
|
1861 |
+
clean_strength=args.clean_strength,
|
1862 |
+
export_format=args.export_format,
|
1863 |
+
embedder_model=args.embedder_model,
|
1864 |
+
embedder_model_custom=args.embedder_model_custom,
|
1865 |
+
f0_file=args.f0_file,
|
1866 |
+
)
|
1867 |
+
elif args.mode == "model_information":
|
1868 |
+
run_model_information_script(
|
1869 |
+
pth_path=args.pth_path,
|
1870 |
+
)
|
1871 |
+
elif args.mode == "model_blender":
|
1872 |
+
run_model_blender_script(
|
1873 |
+
model_name=args.model_name,
|
1874 |
+
pth_path_1=args.pth_path_1,
|
1875 |
+
pth_path_2=args.pth_path_2,
|
1876 |
+
ratio=args.ratio,
|
1877 |
+
)
|
1878 |
+
elif args.mode == "tensorboard":
|
1879 |
+
run_tensorboard_script()
|
1880 |
+
elif args.mode == "download":
|
1881 |
+
run_download_script(
|
1882 |
+
model_link=args.model_link,
|
1883 |
+
)
|
1884 |
+
elif args.mode == "audio_analyzer":
|
1885 |
+
run_audio_analyzer_script(
|
1886 |
+
input_path=args.input_path,
|
1887 |
+
)
|
1888 |
+
except Exception as error:
|
1889 |
+
print(f"An error occurred during execution: {error}")
|
1890 |
+
|
1891 |
+
import traceback
|
1892 |
+
|
1893 |
+
traceback.print_exc()
|
1894 |
+
|
1895 |
+
|
1896 |
+
if __name__ == "__main__":
|
1897 |
+
main()
|
tabs/download/download.py
ADDED
@@ -0,0 +1,111 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import json
|
4 |
+
import shutil
|
5 |
+
import requests
|
6 |
+
import tempfile
|
7 |
+
import gradio as gr
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
from concurrent.futures import ThreadPoolExecutor
|
11 |
+
from tqdm import tqdm
|
12 |
+
|
13 |
+
|
14 |
+
now_dir = os.getcwd()
|
15 |
+
sys.path.append(now_dir)
|
16 |
+
|
17 |
+
from scrpt import run_download_script
|
18 |
+
from rvc.lib.utils import format_title
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
gradio_temp_dir = os.path.join(tempfile.gettempdir(), "gradio")
|
23 |
+
|
24 |
+
if os.path.exists(gradio_temp_dir):
|
25 |
+
shutil.rmtree(gradio_temp_dir)
|
26 |
+
|
27 |
+
|
28 |
+
def save_drop_model(dropbox):
|
29 |
+
if "pth" not in dropbox and "index" not in dropbox:
|
30 |
+
raise gr.Error(
|
31 |
+
message="The file you dropped is not a valid model file. Please try again."
|
32 |
+
)
|
33 |
+
|
34 |
+
file_name = format_title(os.path.basename(dropbox))
|
35 |
+
model_name = file_name
|
36 |
+
|
37 |
+
if ".pth" in model_name:
|
38 |
+
model_name = model_name.split(".pth")[0]
|
39 |
+
elif ".index" in model_name:
|
40 |
+
replacements = ["nprobe_1_", "_v1", "_v2", "added_"]
|
41 |
+
for rep in replacements:
|
42 |
+
model_name = model_name.replace(rep, "")
|
43 |
+
model_name = model_name.split(".index")[0]
|
44 |
+
|
45 |
+
model_path = os.path.join(now_dir, "logs", model_name)
|
46 |
+
if not os.path.exists(model_path):
|
47 |
+
os.makedirs(model_path)
|
48 |
+
if os.path.exists(os.path.join(model_path, file_name)):
|
49 |
+
os.remove(os.path.join(model_path, file_name))
|
50 |
+
shutil.move(dropbox, os.path.join(model_path, file_name))
|
51 |
+
print(f"{file_name} saved in {model_path}")
|
52 |
+
gr.Info(f"{file_name} saved in {model_path}")
|
53 |
+
|
54 |
+
return None
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
def get_file_size(url):
|
63 |
+
response = requests.head(url)
|
64 |
+
return int(response.headers.get("content-length", 0))
|
65 |
+
|
66 |
+
|
67 |
+
def download_file(url, destination_path, progress_bar):
|
68 |
+
os.makedirs(os.path.dirname(destination_path), exist_ok=True)
|
69 |
+
response = requests.get(url, stream=True)
|
70 |
+
block_size = 1024
|
71 |
+
with open(destination_path, "wb") as file:
|
72 |
+
for data in response.iter_content(block_size):
|
73 |
+
file.write(data)
|
74 |
+
progress_bar.update(len(data))
|
75 |
+
|
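# Illustrative usage sketch (URL and destination are placeholders): download a file with a
# tqdm progress bar sized from the server-reported Content-Length.
#   url = "https://example.com/model.pth"
#   with tqdm(total=get_file_size(url), unit="B", unit_scale=True) as progress_bar:
#       download_file(url, os.path.join("logs", "example", "model.pth"), progress_bar)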
76 |
+
|
77 |
+
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
def download_tab():
|
83 |
+
with gr.Column():
|
84 |
+
gr.Markdown(value="## Download Model")
|
85 |
+
model_link = gr.Textbox(
|
86 |
+
label="Model Link",
|
87 |
+
placeholder="Introduce the model link",
|
88 |
+
interactive=True,
|
89 |
+
)
|
90 |
+
model_download_output_info = gr.Textbox(
|
91 |
+
label="Output Information",
|
92 |
+
info="The output information will be displayed here.",
|
93 |
+
value="",
|
94 |
+
max_lines=8,
|
95 |
+
interactive=False,
|
96 |
+
)
|
97 |
+
model_download_button = gr.Button("Download Model")
|
98 |
+
model_download_button.click(
|
99 |
+
fn=run_download_script,
|
100 |
+
inputs=[model_link],
|
101 |
+
outputs=[model_download_output_info],
|
102 |
+
)
|
103 |
+
gr.Markdown("## Drop files")
|
104 |
+
dropbox = gr.File(label="Drag your .pth file and .index file into this space. Drag one and then the other.", type="filepath")
|
105 |
+
|
106 |
+
dropbox.upload(
|
107 |
+
fn=save_drop_model,
|
108 |
+
inputs=[dropbox],
|
109 |
+
outputs=[dropbox],
|
110 |
+
)
|
111 |
+
|