Yusen committed
Commit 819617f
1 Parent(s): addcd8b

update sovits to 4.1

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. pretrain/nsf_hifigan/put_nsf_hifigan_ckpt_here → .attach_pid27237 +0 -0
  2. .gitignore +165 -0
  3. .idea/.gitignore +3 -3
  4. .idea/inspectionProfiles/profiles_settings.xml +0 -0
  5. .idea/misc.xml +3 -3
  6. .idea/modules.xml +7 -7
  7. .idea/vcs.xml +5 -5
  8. .ruff.toml +4 -0
  9. G_20800.pth +0 -3
  10. LICENSE +661 -0
  11. README.md +1 -1
  12. README_zh_CN.md +532 -0
  13. cluster/__init__.py +1 -1
  14. cluster/kmeans.py +204 -201
  15. cluster/train_cluster.py +13 -15
  16. compress_model.py +71 -0
  17. configs/diffusion.yaml +51 -0
  18. config.json → configs_template/config_template.json +17 -45
  19. configs_template/diffusion_template.yaml +51 -0
  20. data_utils.py +184 -0
  21. diffusion/data_loaders.py +12 -8
  22. diffusion/diffusion.py +90 -11
  23. diffusion/diffusion_onnx.py +13 -11
  24. diffusion/dpm_solver_pytorch.py +425 -319
  25. diffusion/how to export onnx.md +3 -3
  26. diffusion/infer_gt_mel.py +1 -1
  27. diffusion/logger/saver.py +6 -11
  28. diffusion/logger/utils.py +5 -4
  29. diffusion/onnx_export.py +235 -226
  30. diffusion/solver.py +44 -23
  31. diffusion/uni_pc.py +733 -0
  32. diffusion/unit2mel.py +78 -11
  33. diffusion/vocoder.py +4 -3
  34. edgetts/tts.py +47 -0
  35. edgetts/tts_voices.py +306 -0
  36. flask_api.py +60 -0
  37. flask_api_full_song.py +55 -0
  38. inference/infer_tool.py +164 -41
  39. inference/infer_tool_grad.py +20 -35
  40. inference_main.py +155 -0
  41. modules/DSConv.py +76 -0
  42. modules/F0Predictor/CrepeF0Predictor.py +4 -2
  43. modules/F0Predictor/DioF0Predictor.py +22 -34
  44. modules/F0Predictor/HarvestF0Predictor.py +21 -34
  45. modules/F0Predictor/PMF0Predictor.py +22 -33
  46. modules/F0Predictor/crepe.py +11 -11
  47. modules/attentions.py +2 -4
  48. modules/commons.py +6 -11
  49. modules/enhancer.py +4 -2
  50. modules/losses.py +1 -4
pretrain/nsf_hifigan/put_nsf_hifigan_ckpt_here → .attach_pid27237 RENAMED
File without changes
.gitignore ADDED
@@ -0,0 +1,165 @@

# Created by https://www.toptal.com/developers/gitignore/api/python
# Edit at https://www.toptal.com/developers/gitignore?templates=python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
checkpoints/
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# End of https://www.toptal.com/developers/gitignore/api/python

/shelf/
/workspace.xml

dataset
dataset_raw
raw
results
inference/chunks_temp.json
logs
hubert/checkpoint_best_legacy_500.pt
configs/config.json
filelists/test.txt
filelists/train.txt
filelists/val.txt
.idea/
.vscode/
.idea/modules.xml
.idea/so-vits-svc.iml
.idea/vcs.xml
.idea/inspectionProfiles/profiles_settings.xml
.idea/inspectionProfiles/Project_Default.xml
pretrain/
.vscode/launch.json
.idea/.gitignore CHANGED
@@ -1,3 +1,3 @@
- # Default ignored files
- /shelf/
- /workspace.xml
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/inspectionProfiles/profiles_settings.xml CHANGED
File without changes
.idea/misc.xml CHANGED
@@ -1,4 +1,4 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
-   <component name="ProjectRootManager" version="2" project-jdk-name="sovits" project-jdk-type="Python SDK" />
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (so-vits-svc)" project-jdk-type="Python SDK" />
  </project>
.idea/modules.xml CHANGED
@@ -1,8 +1,8 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
-   <component name="ProjectModuleManager">
-     <modules>
-       <module fileurl="file://$PROJECT_DIR$/.idea/sovits_app.iml" filepath="$PROJECT_DIR$/.idea/sovits_app.iml" />
-     </modules>
-   </component>
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/so-vits-svc.iml" filepath="$PROJECT_DIR$/.idea/so-vits-svc.iml" />
+     </modules>
+   </component>
  </project>
.idea/vcs.xml CHANGED
@@ -1,6 +1,6 @@
- <?xml version="1.0" encoding="UTF-8"?>
- <project version="4">
-   <component name="VcsDirectoryMappings">
-     <mapping directory="" vcs="Git" />
-   </component>
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
  </project>
.ruff.toml ADDED
@@ -0,0 +1,4 @@
select = ["E", "F", "I"]

# Never enforce `E501` (line length violations).
ignore = ["E501", "E741"]
G_20800.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:98cb2002a3da538cff9ec9bea9e61caffd5620c27a5e0d5f6329dbb0a4bfb433
- size 627905373
LICENSE ADDED
@@ -0,0 +1,661 @@
                    GNU AFFERO GENERAL PUBLIC LICENSE
                       Version 3, 19 November 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.

  A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate.  Many developers of free software are heartened and
encouraged by the resulting cooperation.  However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.

  The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community.  It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server.  Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.

  An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals.  This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU Affero General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit.  Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.

  6. Conveying Non-Source Forms.

  You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:

    a) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by the
    Corresponding Source fixed on a durable physical medium
    customarily used for software interchange.

    b) Convey the object code in, or embodied in, a physical product
    (including a physical distribution medium), accompanied by a
    written offer, valid for at least three years and valid for as
    long as you offer spare parts or customer support for that product
    model, to give anyone who possesses the object code either (1) a
    copy of the Corresponding Source for all the software in the
    product that is covered by this License, on a durable physical
    medium customarily used for software interchange, for a price no
    more than your reasonable cost of physically performing this
    conveying of source, or (2) access to copy the
    Corresponding Source from a network server at no charge.

    c) Convey individual copies of the object code with a copy of the
    written offer to provide the Corresponding Source.  This
    alternative is allowed only occasionally and noncommercially, and
    only if you received the object code with such an offer, in accord
    with subsection 6b.

    d) Convey the object code by offering access from a designated
    place (gratis or for a charge), and offer equivalent access to the
    Corresponding Source in the same way through the same place at no
    further charge.  You need not require recipients to copy the
    Corresponding Source along with the object code.  If the place to
    copy the object code is a network server, the Corresponding Source
    may be on a different server (operated by you or a third party)
    that supports equivalent copying facilities, provided you maintain
    clear directions next to the object code saying where to find the
    Corresponding Source.  Regardless of what server hosts the
    Corresponding Source, you remain obligated to ensure that it is
    available for as long as needed to satisfy these requirements.

    e) Convey the object code using peer-to-peer transmission, provided
    you inform other peers where the object code and Corresponding
    Source of the work are being offered to the general public at no
    charge under subsection 6d.

  A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.

  A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling.  In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage.  For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product.  A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.

  "Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source.  The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.

  If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information.  But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).

  The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed.  Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.

  Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.

  7. Additional Terms.

  "Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law.  If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.

  When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it.  (Additional permissions may be written to require their own
removal in certain cases when you modify the work.)  You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.

  Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:

    a) Disclaiming warranty or limiting liability differently from the
    terms of sections 15 and 16 of this License; or

    b) Requiring preservation of specified reasonable legal notices or
    author attributions in that material or in the Appropriate Legal
    Notices displayed by works containing it; or

    c) Prohibiting misrepresentation of the origin of that material, or
    requiring that modified versions of such material be marked in
    reasonable ways as different from the original version; or

    d) Limiting the use for publicity purposes of names of licensors or
    authors of the material; or

    e) Declining to grant rights under trademark law for use of some
    trade names, trademarks, or service marks; or

    f) Requiring indemnification of licensors and authors of that
    material by anyone who conveys the material (or modified versions of
    it) with contractual assumptions of liability to the recipient, for
    any liability that these contractual assumptions directly impose on
    those licensors and authors.

  All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10.  If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term.  If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.

  If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.

  Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.

  8. Termination.

  You may not propagate or modify a covered work except as expressly
provided under this License.  Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).

  However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.

  Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.

  Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License.  If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.

  9. Acceptance Not Required for Having Copies.

  You are not required to accept this License in order to receive or
run a copy of the Program.  Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance.  However,
nothing other than this License grants you permission to propagate or
modify any covered work.  These actions infringe copyright if you do
not accept this License.  Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.

  10. Automatic Licensing of Downstream Recipients.

  Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License.  You are not responsible
for enforcing compliance by third parties with this License.

  An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations.  If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.

  You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License.  For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.

  11. Patents.

  A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based.  The
work thus licensed is called the contributor's "contributor version".

  A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version.  For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.

  Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.

  In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement).  To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.

  If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients.  "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.

  If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.

  A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License.  You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.

  Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.

  12. No Surrender of Others' Freedom.

  If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License.  If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all.  For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.

  13. Remote Network Interaction; Use with the GNU General Public License.

  Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software.  This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.

  Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work.  The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.

  14. Revised Versions of this License.

  The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time.  Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.

  Each version is given a distinguishing version number.  If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation.  If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.

  If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.

  Later license versions may give you additional or different
permissions.  However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.

  15. Disclaimer of Warranty.

  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.

  16. Limitation of Liability.

  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.

  17. Interpretation of Sections 15 and 16.

  If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.

                     END OF TERMS AND CONDITIONS

            How to Apply These Terms to Your New Programs

  If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.

  To do so, attach the following notices to the program.  It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.

    <one line to give the program's name and a brief idea of what it does.>
    Copyright (C) <year>  <name of author>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.

Also add information on how to contact you by electronic and paper mail.

  If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source.  For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code.  There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.

  You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: green
  colorTo: indigo
  sdk: gradio
  sdk_version: 3.32.0
- app_file: app.py
+ app_file: webUI2.py
  pinned: false
  license: apache-2.0
  ---
README_zh_CN.md ADDED
@@ -0,0 +1,532 @@
# SoftVC VITS Singing Voice Conversion

[**English**](./README.md) | [**中文简体**](./README_zh_CN.md)

#### ✨ An inference frontend with an F0 curve editor and a speaker-mix timeline editor (the use for Onnx models): [MoeVoiceStudio](https://github.com/NaruseMioShirakana/MoeVoiceStudio)

#### ✨ A recommended fork with improved interaction: [34j/so-vits-svc-fork](https://github.com/34j/so-vits-svc-fork)

#### ✨ A client that supports real-time conversion: [w-okada/voice-changer](https://github.com/w-okada/voice-changer)

**This project is fundamentally different from VITS: VITS is TTS, while this project is SVC. This project cannot do TTS, VITS cannot do SVC, and the two projects' models are not interchangeable at all.**

## Important Notice

This project was developed to let the developers' favorite anime characters sing; anything involving real people runs counter to the developers' intent.

## Disclaimer

This project is an open-source, offline project. All members of SvcDevelopTeam and all developers and maintainers of this project (hereinafter, contributors) have no control over it. The contributors have never provided any organization or individual with any form of help, including but not limited to dataset extraction, dataset processing, compute, training support, or inference. The contributors do not and cannot know what users use this project for. Therefore, all AI models and synthesized audio based on this project are unrelated to the contributors, and all resulting problems are borne by the user.

This project runs fully offline and cannot collect any user information or obtain user input data. Consequently, the contributors are unaware of any user inputs or models and are not responsible for any user input.

This project is only a framework. It has no voice-synthesis capability by itself, and all functionality requires users to train their own models. It ships with no models, and any redistributed derivative projects have nothing to do with the contributors.

## 📏 Terms of Use

# Warning: Please resolve dataset licensing issues yourself. Training with unauthorized datasets is forbidden! You bear full responsibility for any problem caused by training with unauthorized datasets; it has nothing to do with the repository, its maintainers, or the svc develop team!

1. This project was established for academic exchange and is intended for communication and learning only; it is not intended for production environments.
2. Any video published to a video platform that uses sovits must clearly credit, in its description, the input vocals or audio used for the conversion. For example, if you convert vocals separated from someone else's published video/audio, you must link the original video or music; if you use your own voice, or audio synthesized by another singing-voice synthesis engine, you must also say so in the description.
3. You bear full responsibility for any infringement caused by the input source. When using another commercial singing-voice synthesis product as the input source, make sure you comply with its terms of use; note that many singing-voice synthesis engines' terms explicitly forbid using their output as a conversion input source!
4. It is forbidden to use this project for illegal activities or for religious or political purposes; the project maintainers firmly oppose such uses. If you disagree with this clause, you may not use the project.
5. Continued use constitutes agreement to the terms stated in this README. This README has fulfilled its duty to advise and is not responsible for subsequent problems.
6. If you use this project for any other undertaking, please contact and inform the author of this repository in advance. Many thanks.

## 📝 Model Overview

A singing-voice conversion model: speech features of the source audio are extracted with the SoftVC content encoder and fed into VITS together with F0, replacing the original text input, to perform singing-voice conversion. The vocoder is replaced with [NSF HiFiGAN](https://github.com/openvpi/DiffSinger/tree/refactor/modules/nsf_hifigan) to fix interrupted audio.

### 🆕 What's new in 4.1-Stable

+ Feature input replaced with the 12th-layer Transformer output of [Content Vec](https://github.com/auspicious3000/contentvec), remaining compatible with the 4.0 branch
+ Updated shallow diffusion; a shallow diffusion model can be used to improve audio quality
+ Added support for the whisper speech encoder
+ Added static/dynamic voice blending
+ Added loudness embedding
+ Added feature retrieval, from [RVC](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)

### 🆕 Compatibility with 4.0 models

+ 4.0 models can be supported by editing their config.json: add a speech_encoder field to the model section of config.json, as shown below

```
"model": {
    .........
    "ssl_dim": 256,
    "n_speakers": 200,
    "speech_encoder":"vec256l9"
}
```
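For illustration, a minimal sketch of making that edit programmatically is shown below; the path `configs/config.json` and the `vec256l9` value are assumptions that should match your own setup:

```python
import json

# Patch a 4.0-era config.json so a 4.1 codebase can load the model.
# Assumes the config lives at configs/config.json and the model was
# trained on 256-dim ContentVec features ("vec256l9") -- adjust both.
path = "configs/config.json"
with open(path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Only add the field if it is missing, so a 4.1 config is left untouched.
config["model"].setdefault("speech_encoder", "vec256l9")

with open(path, "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=2)
```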

### 🆕 About shallow diffusion
![Diagram](shadowdiffusion.png)

## 💬 About the Python Version

After testing, we believe `Python 3.8.9` can run the project stably.

## 📥 Pre-downloaded Model Files

#### **Required**

**Choose one of the following encoders:**

##### **1. If you use contentvec as the speech encoder (recommended)**

`vec768l12` and `vec256l9` require this encoder.

+ contentvec: [checkpoint_best_legacy_500.pt](https://ibm.box.com/s/z1wgl1stco8ffooyatzdwsqn2psd9lrr)
  + Place it under the `pretrain` directory

Alternatively, download the ContentVec below, which is only 199MB but performs the same:
+ contentvec: [hubert_base.pt](https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt)
  + Rename the file to `checkpoint_best_legacy_500.pt` and place it under the `pretrain` directory

```shell
# contentvec
wget -P pretrain/ http://obs.cstcloud.cn/share/obs/sankagenkeshi/checkpoint_best_legacy_500.pt
# Alternatively, download it manually and place it in the pretrain directory
```

##### **2. If you use hubertsoft as the speech encoder**
+ soft vc hubert: [hubert-soft-0d54a1f4.pt](https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt)
  + Place it under the `pretrain` directory

##### **3. If you use Whisper-ppg as the speech encoder**
+ Download [medium.pt](https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt); this model matches `whisper-ppg`
+ Download [large-v2.pt](https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt); this model matches `whisper-ppg-large`
+ Place them under the `pretrain` directory

##### **4. If you use cnhubertlarge as the speech encoder**
+ Download [chinese-hubert-large-fairseq-ckpt.pt](https://huggingface.co/TencentGameMate/chinese-hubert-large/resolve/main/chinese-hubert-large-fairseq-ckpt.pt)
+ Place it under the `pretrain` directory

##### **5. If you use dphubert as the speech encoder**
+ Download [DPHuBERT-sp0.75.pth](https://huggingface.co/pyf98/DPHuBERT/resolve/main/DPHuBERT-sp0.75.pth)
+ Place it under the `pretrain` directory

##### **6. If you use WavLM as the speech encoder**
+ Download [WavLM-Base+.pt](https://valle.blob.core.windows.net/share/wavlm/WavLM-Base+.pt?sv=2020-08-04&st=2023-03-01T07%3A51%3A05Z&se=2033-03-02T07%3A51%3A00Z&sr=c&sp=rl&sig=QJXmSJG9DbMKf48UDIU1MfzIro8HQOf3sqlNXiflY1I%3D); this model matches `wavlmbase+`
+ Place it under the `pretrain` directory

##### **7. If you use OnnxHubert/ContentVec as the speech encoder**
+ Download [MoeSS-SUBModel](https://huggingface.co/NaruseMioShirakana/MoeSS-SUBModel/tree/main)
+ Place it under the `pretrain` directory

#### **Encoder List**
- "vec768l12"
- "vec256l9"
- "vec256l9-onnx"
- "vec256l12-onnx"
- "vec768l9-onnx"
- "vec768l12-onnx"
- "hubertsoft-onnx"
- "hubertsoft"
- "whisper-ppg"
- "cnhubertlarge"
- "dphubert"
- "whisper-ppg-large"
- "wavlmbase+"

#### **Optional (strongly recommended)**

+ Pretrained base model files: `G_0.pth` `D_0.pth`
  + Place them under the `logs/44k` directory

+ Pretrained diffusion base model file: `model_0.pt`
  + Place it under the `logs/44k/diffusion` directory

Obtain the Sovits base models from svc-develop-team (TBD) or anywhere else.

The diffusion model uses the Diffusion Model from [Diffusion-SVC](https://github.com/CNChTu/Diffusion-SVC); its base model is interchangeable with the Diffusion-SVC diffusion base model, which you can obtain from [Diffusion-SVC](https://github.com/CNChTu/Diffusion-SVC).

Although base models generally do not cause copyright problems, please be mindful all the same: for example, ask the author in advance, or check whether the author has explicitly stated the permitted uses in the model description.

#### **Optional (choose as needed)**

If you use the `NSF-HIFIGAN enhancer` or `shallow diffusion`, you need to download the pretrained NSF-HIFIGAN model; otherwise you can skip it.

+ Pretrained NSF-HIFIGAN vocoder: [nsf_hifigan_20221211.zip](https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip)
  + After unzipping, place the four files under the `pretrain/nsf_hifigan` directory

```shell
# nsf_hifigan
wget -P pretrain/ https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip
unzip -od pretrain/nsf_hifigan pretrain/nsf_hifigan_20221211.zip
# Alternatively, download it manually and place it in the pretrain/nsf_hifigan directory
# URL: https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1
```

## 📊 Dataset Preparation

Simply place the dataset into the `dataset_raw` directory with the following file structure:

```
dataset_raw
├───speaker0
│   ├───xxx1-xxx1.wav
│   ├───...
│   └───Lxx-0xx8.wav
└───speaker1
    ├───xx2-0xxx2.wav
    ├───...
    └───xxx7-xxx007.wav
```

Speaker names can be customized:

```
dataset_raw
└───suijiSUI
    ├───1.wav
    ├───...
    └───25788785-20221210-200143-856_01_(Vocals)_0_0.wav
```
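As a quick sanity check before preprocessing, a small script along these lines can verify the layout; the `dataset_raw` path is the default from above, and the 5-15 s duration bounds anticipate the slicing advice in the next section:

```python
import os

import soundfile as sf

# Minimal layout check for dataset_raw/<speaker>/<clip>.wav.
# Duration bounds follow the 5-15 s slicing advice below; adjust as needed.
root = "dataset_raw"
for speaker in sorted(os.listdir(root)):
    spk_dir = os.path.join(root, speaker)
    if not os.path.isdir(spk_dir):
        continue
    for name in sorted(os.listdir(spk_dir)):
        if not name.endswith(".wav"):
            print(f"skipping non-wav file: {speaker}/{name}")
            continue
        info = sf.info(os.path.join(spk_dir, name))
        if not 5.0 <= info.duration <= 15.0:
            print(f"{speaker}/{name}: {info.duration:.1f}s is outside 5-15s")
```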

## 🛠️ Data Preprocessing

### 0. Slicing the audio

Slice the audio into `5s - 15s` clips; slightly longer is fine, but very long clips may exhaust GPU memory during training or even during preprocessing.

You can use [audio-slicer-GUI](https://github.com/flutydeer/audio-slicer) or [audio-slicer-CLI](https://github.com/openvpi/audio-slicer).

In general only the `Minimum Interval` needs adjusting: for ordinary spoken material the default usually suffices, while for sung material it can be lowered to `100` or even `50`.

After slicing, manually delete clips that are too long or too short.

**If you train with the Whisper-ppg speech encoder, all slices must be shorter than 30s.**

### 1. Resampling to 44100Hz mono

```shell
python resample.py
```

#### Note

Although this project ships resample.py for resampling, mono conversion, and loudness matching, the default loudness matching normalizes to 0db, which may degrade audio quality. Moreover, python's loudness-matching package pyloudnorm cannot limit the signal level, which can cause clipping. It is therefore recommended to do loudness matching in professional audio software such as `adobe audition`. If you have already matched loudness with other software, add `--skip_loudnorm` when running the command above to skip the loudness-matching step, e.g.:

```shell
python resample.py --skip_loudnorm
```
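To illustrate the clipping issue described above, here is a minimal loudness-matching sketch with an explicit peak guard; the -16 LUFS target and the file names are assumptions, not project defaults:

```python
import numpy as np
import pyloudnorm as pyln
import soundfile as sf

# Measure integrated loudness, then normalize toward a target.
# pyloudnorm applies plain gain with no limiter, so we clamp the gain
# ourselves to keep the peak below full scale and avoid clipping.
data, rate = sf.read("raw/input.wav")
meter = pyln.Meter(rate)
loudness = meter.integrated_loudness(data)

target_lufs = -16.0  # assumed target; resample.py's default differs
gain_db = target_lufs - loudness
peak = np.max(np.abs(data))
max_gain_db = -20 * np.log10(peak) - 1.0  # leave 1 dB of headroom
gain_db = min(gain_db, max_gain_db)

sf.write("raw/output.wav", data * (10 ** (gain_db / 20)), rate)
```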
214
+
215
+ ### 2. 自动划分训练集、验证集,以及自动生成配置文件
216
+
217
+ ```shell
218
+ python preprocess_flist_config.py --speech_encoder vec768l12
219
+ ```
220
+
221
+ speech_encoder拥有以下选择
222
+
223
+ ```
224
+ vec768l12
225
+ vec256l9
226
+ hubertsoft
227
+ whisper-ppg
228
+ whisper-ppg-large
229
+ cnhubertlarge
230
+ dphubert
231
+ wavlmbase+
232
+ ```
233
+
234
+ 如果省略speech_encoder参数,默认值为vec768l12
235
+
236
+ **使用响度嵌入**
237
+
238
+ 若使用响度嵌入,需要增加`--vol_aug`参数,比如:
239
+
240
+ ```shell
241
+ python preprocess_flist_config.py --speech_encoder vec768l12 --vol_aug
242
+ ```
243
+
244
+ 使用后训练出的模型将匹配到输入源响度,否则为训练集响度。
245
+
246
+ #### 此时可以在生成的config.json与diffusion.yaml修改部分参数
247
+
248
+ ##### config.json
249
+
250
+ * `keep_ckpts`:训练时保留最后几个模型,`0`为保留所有,默认只保留最后`3`个
251
+
252
+ * `all_in_mem`:加载所有数据集到内存中,某些平台的硬盘IO过于低下、同时内存容量 **远大于** 数据集体积时可以启用
253
+
254
+ * `batch_size`:单次训练加载到GPU的数据量,调整到低于显存容量的大小即可
255
+
256
+ * `vocoder_name` : 选择一种声码器,默认为`nsf-hifigan`.
257
+
258
+ ##### diffusion.yaml
259
+
260
+ * `cache_all_data`: load the entire dataset into RAM; enable this when disk IO on your platform is very slow and your RAM is **much larger** than the dataset
261
+
262
+ * `duration`: audio slice duration during training, tunable to your GPU memory. **Note: this value must be less than the duration of the shortest audio clip in the training set!**
263
+
264
+ * `batch_size`: amount of data loaded onto the GPU per step; tune it down until it fits within your GPU memory
265
+
266
+ * `timesteps`: total number of steps of the diffusion model; the default is 1000.
267
+
268
+ * `k_step_max`: train only `k_step_max` diffusion steps to save training time; this value must be less than `timesteps`, and 0 trains the full diffusion model. **Note: if the full diffusion model is not trained, diffusion-only inference will not be possible!**
269
+
270
+ ##### **Vocoder list**
271
+
272
+ ```
273
+ nsf-hifigan
274
+ nsf-snake-hifigan
275
+ ```
276
+
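+ A minimal sketch (standard library only) of editing the generated config in place, e.g. to shrink `batch_size`:
+
+ ```python
+ import json
+
+ with open("configs/config.json", "r", encoding="utf-8") as f:
+     config = json.load(f)
+
+ config["train"]["batch_size"] = 6   # fit a smaller GPU
+ config["train"]["keep_ckpts"] = 3   # keep only the last 3 checkpoints
+
+ with open("configs/config.json", "w", encoding="utf-8") as f:
+     json.dump(config, f, indent=2, ensure_ascii=False)
+ ```
+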
277
+ ### 3. Generate hubert and f0
278
+
279
+ ```shell
280
+ python preprocess_hubert_f0.py --f0_predictor dio
281
+ ```
282
+
283
+ f0_predictor has four options
284
+
285
+ ```
286
+ crepe
287
+ dio
288
+ pm
289
+ harvest
290
+ ```
291
+
292
+ If the training set is very noisy, use crepe to extract f0
293
+
294
+ If the f0_predictor argument is omitted, the default is dio (a rough sketch of dio-based extraction follows below)
295
+
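+ For intuition, a rough sketch of dio-style f0 extraction (assuming the pyworld package, which the repo's DioF0Predictor builds on; the file name and thresholds are illustrative):
+
+ ```python
+ import numpy as np
+ import pyworld
+ import soundfile as sf
+
+ x, fs = sf.read("raw/example.wav")   # assumes a mono file
+ x = x.astype(np.float64)
+ # dio gives a coarse f0 track; stonemask refines it
+ f0, t = pyworld.dio(x, fs, f0_floor=50.0, f0_ceil=1100.0)
+ f0 = pyworld.stonemask(x, f0, t, fs)
+ print(f0[:10])                       # 0.0 marks unvoiced frames
+ ```
+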
296
+ If you need the shallow diffusion feature (optional), add the `--use_diff` argument, for example
297
+
298
+ ```shell
299
+ python preprocess_hubert_f0.py --f0_predictor dio --use_diff
300
+ ```
301
+
302
+ After completing the steps above, the dataset directory contains the preprocessed data, and the dataset_raw folder can be deleted
303
+
304
+ ## 🏋️‍ Training
305
+
306
+ ### Main model training
307
+
308
+ ```shell
309
+ python train.py -c configs/config.json -m 44k
310
+ ```
311
+
312
+ ### Diffusion model (optional)
313
+
314
+ If you need the shallow diffusion feature, you must train the diffusion model. To do so:
315
+
316
+ ```shell
317
+ python train_diff.py -c configs/diffusion.yaml
318
+ ```
319
+
320
+ After training finishes, the model files are saved under the `logs/44k` directory, and the diffusion model under `logs/44k/diffusion`
321
+
322
+ ## 🤖 Inference
323
+
324
+ Use [inference_main.py](inference_main.py)
325
+
326
+ ```shell
327
+ # Example
328
+ python inference_main.py -m "logs/44k/G_30400.pth" -c "configs/config.json" -n "君の知らない物語-src.wav" -t 0 -s "nen"
329
+ ```
330
+
331
+ Required arguments:
332
+ + `-m` | `--model_path`: path to the model
333
+ + `-c` | `--config_path`: path to the configuration file
334
+ + `-n` | `--clean_names`: list of wav file names, placed in the raw folder
335
+ + `-t` | `--trans`: pitch shift, positive or negative (in semitones)
336
+ + `-s` | `--spk_list`: target speaker name(s) for synthesis
337
+ + `-cl` | `--clip`: forced audio clipping; the default 0 means automatic clipping; unit: seconds
338
+
339
+ Optional arguments: some are detailed in the next section
340
+ + `-lg` | `--linear_gradient`: crossfade length between two audio clips; if vocals sound discontinuous after forced clipping, adjust this value; if they sound continuous, keep the default 0; unit: seconds
341
+ + `-f0p` | `--f0_predictor`: F0 predictor to use, one of crepe, pm, dio, harvest; the default is pm (note: crepe applies mean filtering to the raw F0)
342
+ + `-a` | `--auto_predict_f0`: automatic pitch prediction for voice conversion; do not enable this when converting singing voices, as it will go badly out of tune
343
+ + `-cm` | `--cluster_model_path`: path to the cluster model or feature retrieval index; leave empty to automatically use the default path for each scheme's model; if you did not train clustering or feature retrieval, this can be anything
344
+ + `-cr` | `--cluster_infer_ratio`: ratio of the clustering or feature retrieval scheme, range 0-1; if you did not train a cluster model or feature retrieval, keep the default 0
345
+ + `-eh` | `--enhance`: whether to use the NSF_HIFIGAN enhancer; it can somewhat improve audio quality for models trained on small datasets, but has a negative effect on well-trained models; disabled by default
346
+ + `-shd` | `--shallow_diffusion`: whether to use shallow diffusion, which can fix some electronic-sounding artifacts; disabled by default; when enabled, the NSF_HIFIGAN enhancer is disabled
347
+ + `-usm` | `--use_spk_mix`: whether to use speaker fusion / dynamic voice blending
348
+ + `-lea` | `--loudness_envelope_adjustment`: ratio for replacing the output loudness envelope with the input source's; the closer to 1, the more the output envelope is used
349
+ + `-fr` | `--feature_retrieval`: whether to use feature retrieval; if enabled, the cluster model is disabled, and the cm and cr arguments become the index path and blend ratio for feature retrieval
350
+
351
+ Shallow diffusion settings (a conceptual sketch follows after this list):
352
+ + `-dm` | `--diffusion_model_path`: path to the diffusion model
353
+ + `-dc` | `--diffusion_config_path`: path to the diffusion model's configuration file
354
+ + `-ks` | `--k_step`: number of diffusion steps; larger values get closer to the pure diffusion model's output; the default is 100
355
+ + `-od` | `--only_diffusion`: diffusion-only mode, which does not load the sovits model and infers with the diffusion model alone
356
+ + `-se` | `--second_encoding`: re-encode the original audio before shallow diffusion; a hit-or-miss option that sometimes helps and sometimes hurts
357
+
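+ Conceptually, shallow diffusion noises the mel produced by the sovits model forward to step `k_step`, then runs only the last `k_step` reverse steps with the diffusion model. A minimal sketch of that idea (using the `q_sample`/`p_sample` methods of the `GaussianDiffusion` class in `diffusion/diffusion.py`; the repo's actual inference path adds speed-up samplers on top):
+
+ ```python
+ import torch
+
+ def shallow_diffuse(diff_model, sovits_mel, cond, k_step=100):
+     # forward process: noise the sovits mel up to step k_step - 1
+     t = torch.tensor([k_step - 1], device=sovits_mel.device).long()
+     x = diff_model.q_sample(x_start=sovits_mel, t=t)
+     # reverse process: denoise only the last k_step steps
+     b = sovits_mel.shape[0]
+     for i in reversed(range(k_step)):
+         step = torch.full((b,), i, device=sovits_mel.device, dtype=torch.long)
+         x = diff_model.p_sample(x, step, cond=cond)
+     return x
+ ```
+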
358
+ ### Attention!
359
+
360
+ If you infer with the `whisper-ppg` speech encoder, set `--clip` to 25 and `-lg` to 1. Otherwise inference will not work properly.
361
+
362
+ ## 🤔 Optional Features
363
+
364
+ If you are already satisfied with the results, or the following does not make sense to you, you can ignore the rest; it does not affect model usage (these options have a fairly small impact; they may help on some specific data, but in most cases the difference is barely noticeable)
365
+
366
+ ### Automatic f0 prediction
367
+
368
+ The 4.0 model training also trains an f0 predictor. For voice conversion you can enable automatic pitch prediction, or fall back to manual pitch if the result is poor. Do not enable this feature when converting singing voices!!! It will go badly out of tune!!
369
+ + Set auto_predict_f0 to true in inference_main
370
+
371
+ ### Cluster-based timbre leakage control
372
+
373
+ Introduction: the clustering scheme can reduce timbre leakage, making the trained model sound more like the target timbre (though not dramatically so), but clustering alone degrades articulation (noticeably slurred). This model adopts a fusion approach that linearly controls the ratio between the clustered and non-clustered schemes, i.e. you can manually tune the balance between "sounding like the target timbre" and "clear articulation" to find a good trade-off
374
+
375
+ No changes to the earlier steps are needed; you only need to additionally train a cluster model. The benefit is limited, but so is the training cost
376
+
377
+ + Training:
378
+   + Train on a machine with decent CPU performance; in my experience, on a 6-core CPU on Tencent Cloud, training takes about 4 minutes per speaker
379
+   + Run `python cluster/train_cluster.py`; the model output will be in `logs/44k/kmeans_10000.pt`
380
+   + The cluster model can currently be trained on GPU by running `python cluster/train_cluster.py --gpu`
381
+ + Inference:
382
+   + Set `cluster_model_path` in `inference_main.py` to the model output file; if left empty, it defaults to `logs/44k/kmeans_10000.pt`
383
+   + Set `cluster_infer_ratio` in `inference_main.py`: `0` means no clustering at all, `1` means clustering only; `0.5` is usually a good value (see the sketch below)
384
+
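+ A rough sketch of how that ratio blends features (assuming k-means centroids like those saved by `cluster/train_cluster.py`; illustrative only, not the repo's exact code path):
+
+ ```python
+ import torch
+
+ def blend_with_cluster(feat, cluster_centers, ratio=0.5):
+     # feat: [frames, dim]; cluster_centers: [n_clusters, dim]
+     dists = torch.cdist(feat, cluster_centers)          # [frames, n_clusters]
+     quantized = cluster_centers[dists.argmin(dim=-1)]   # nearest centroid per frame
+     # ratio=0 keeps the raw features, ratio=1 uses only cluster centroids
+     return ratio * quantized + (1 - ratio) * feat
+ ```
+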
385
+ ### Feature retrieval
386
+
387
+ Introduction: like the clustering scheme, it reduces timbre leakage, with slightly better articulation than clustering, but it slows down inference. It adopts the same fusion approach, linearly controlling the ratio between retrieved and raw features
388
+
389
+ + Training:
390
+   After generating hubert and f0, run:
391
+
392
+ ```shell
393
+ python train_index.py -c configs/config.json
394
+ ```
395
+
396
+ The model output will be in `logs/44k/feature_and_index.pkl`
397
+
398
+ + Inference:
399
+   + Pass `--feature_retrieval` first, which switches the clustering scheme over to feature retrieval
400
+   + Set `cluster_model_path` in `inference_main.py` to the model output file; if left empty, it defaults to `logs/44k/feature_and_index.pkl`
401
+   + Set `cluster_infer_ratio` in `inference_main.py`: `0` means no feature retrieval at all, `1` means feature retrieval only; `0.5` is usually a good value (see the sketch below)
402
+
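+ A rough sketch of the retrieval idea (assuming a faiss index built over training-set features, in the spirit of the RVC approach; the names and weighting scheme here are illustrative, not the repo's exact implementation):
+
+ ```python
+ import numpy as np
+
+ def retrieve_blend(feat, index, stored_feats, k=8, ratio=0.5):
+     # feat: [frames, dim]; index: a faiss index built on stored_feats
+     feat = np.ascontiguousarray(feat, dtype=np.float32)
+     scores, ids = index.search(feat, k)            # top-k nearest training frames
+     weights = np.square(1.0 / (scores + 1e-8))
+     weights /= weights.sum(axis=1, keepdims=True)  # normalize per query frame
+     retrieved = np.sum(stored_feats[ids] * weights[:, :, None], axis=1)
+     # ratio=0 keeps the raw features, ratio=1 uses only retrieved features
+     return ratio * retrieved + (1 - ratio) * feat
+ ```
+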
403
+ ### [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/svc-develop-team/so-vits-svc/blob/4.1-Stable/sovits4_for_colab.ipynb) [sovits4_for_colab.ipynb](https://colab.research.google.com/github/svc-develop-team/so-vits-svc/blob/4.1-Stable/sovits4_for_colab.ipynb)
404
+
405
+ ## 🗜️ Model Compression
406
+
407
+ The generated model contains the information needed to resume training. If you are sure you will not train it further, you can strip this information to obtain a final model about 1/3 of the size.
408
+
409
+ Use [compress_model.py](compress_model.py)
410
+
411
+ ```shell
412
+ # Example
413
+ python compress_model.py -c="configs/config.json" -i="logs/44k/G_30400.pth" -o="logs/44k/release.pth"
414
+ ```
415
+
416
+ ## 👨‍🔧 Voice Blending
417
+
418
+ ### Static voice blending
419
+
420
+ **See the static voice blending feature under Tools/Experimental Features in `webUI.py`.**
421
+
422
+ Introduction: this feature merges multiple voice models into one (as a convex or linear combination of the model parameters), creating voices that do not exist in reality (a sketch follows after the list below)
423
+ **Note:**
424
+
425
+ 1. This feature only supports single-speaker models
426
+ 2. If you insist on using multi-speaker models, all models must have the same number of speakers, so that voices under the same SpeakerID can be blended
427
+ 3. Make sure the model field in the config.json of every model to be blended is identical
428
+ 4. The output blended model can use the config.json of any of the input models, but cluster models cannot be used with it
429
+ 5. When batch-uploading models, it is best to put them in one folder and select them together
430
+ 6. The recommended blend ratio range is 0-100; other values also work, but linear combination mode may then behave unpredictably
431
+ 7. After blending, the file is saved in the project root directory as output.pth
432
+ 8. Convex combination mode applies Softmax to the blend ratios so they sum to 1, whereas linear combination mode does not
433
+
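+ A minimal sketch of the convex-combination idea behind static blending (the paths are hypothetical; assumes single-speaker checkpoints whose config.json model fields match, and casts every tensor to float for simplicity; not the webUI implementation):
+
+ ```python
+ import torch
+
+ paths = ["logs/44k/G_a.pth", "logs/44k/G_b.pth"]   # hypothetical model paths
+ models = [torch.load(p, map_location="cpu")["model"] for p in paths]
+
+ raw = torch.tensor([0.7, 0.3])
+ weights = torch.softmax(raw, dim=0)   # convex mode: weights sum to 1
+ # linear mode would use `raw` directly, without normalization
+
+ merged = {k: sum(w * m[k].float() for w, m in zip(weights, models))
+           for k in models[0]}
+ torch.save({"model": merged}, "output.pth")
+ ```
+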
434
+ ### Dynamic voice blending
435
+
436
+ **See the introduction to dynamic voice blending in `spkmix.py`**
437
+
438
+ Rules for writing a speaker mix track (see the sketch below):
439
+
440
+ Speaker ID : \[\[start time 1, end time 1, start value 1, end value 1], [start time 2, end time 2, start value 2, end value 2]]
441
+
442
+ Each start time must equal the previous end time; the first start time must be 0 and the last end time must be 1 (times range over 0-1)
443
+
444
+ Every speaker must be filled in; for unused speakers, just fill in \[\[0., 1., 0., 0.]]
445
+
446
+ The blend values can be arbitrary; within each time segment they change linearly from the start value to the end value. The code internally normalizes the combination to sum to 1 (the convex combination condition), so you can use it with confidence
447
+
448
+ To enable dynamic voice blending at inference time, pass the `--use_spk_mix` argument
449
+
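+ A hypothetical two-speaker `spk_mix_map` following the rules above (speaker 0 fades out while speaker 1 fades in; illustrative only, see spkmix.py for the real template):
+
+ ```python
+ spk_mix_map = {
+     0: [[0.0, 0.5, 1.0, 0.5], [0.5, 1.0, 0.5, 0.0]],  # start at full, fade to 0
+     1: [[0.0, 0.5, 0.0, 0.5], [0.5, 1.0, 0.5, 1.0]],  # fade in to full
+ }
+ ```
+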
450
+ ## 📤 Onnx Export
451
+
452
+ Use [onnx_export.py](onnx_export.py)
453
+
454
+ + Create a new folder: `checkpoints`, and open it
455
+ + Inside the `checkpoints` folder, create a project folder named after your project, e.g. `aziplayer`
456
+ + Rename your model to `model.pth` and your configuration file to `config.json`, and place them in the `aziplayer` folder you just created
457
+ + In [onnx_export.py](onnx_export.py), change `"NyaruTaffy"` in `path = "NyaruTaffy"` to your project name, i.e. `path = "aziplayer"` (use `onnx_export_speaker_mix` for ONNX export with speaker-blending support)
458
+ + Run [onnx_export.py](onnx_export.py)
459
+ + Wait for it to finish; a `model.onnx` will be generated in your project folder, which is the exported model (see the loading sketch below)
460
+
461
+ Note: for the Hubert Onnx model, please use the one provided by MoeSS; it currently cannot be exported on your own (Hubert in fairseq contains many operators unsupported by onnx and constant-related constructs, which either raise errors on export or yield a model with broken input/output shapes and results)
462
+
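+ A minimal check that the exported model loads (assuming the onnxruntime package; input names vary by export script, so inspect them rather than hard-coding):
+
+ ```python
+ import onnxruntime as ort
+
+ sess = ort.InferenceSession("checkpoints/aziplayer/model.onnx",
+                             providers=["CPUExecutionProvider"])
+ for inp in sess.get_inputs():
+     print(inp.name, inp.shape, inp.type)
+ ```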
463
+
464
+ ## 📎 References and Papers
465
+
466
+ | URL | Name | Title | Source code |
467
+ | --- | ----------- | ----- | --------------------- |
468
+ |[2106.06103](https://arxiv.org/abs/2106.06103) | VITS (Synthesizer)| Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech | [jaywalnut310/vits](https://github.com/jaywalnut310/vits) |
469
+ |[2111.02392](https://arxiv.org/abs/2111.02392) | SoftVC (Speech Encoder)| A Comparison of Discrete and Soft Speech Units for Improved Voice Conversion | [bshall/hubert](https://github.com/bshall/hubert) |
470
+ |[2204.09224](https://arxiv.org/abs/2204.09224) | ContentVec (Speech Encoder)| ContentVec: An Improved Self-Supervised Speech Representation by Disentangling Speakers | [auspicious3000/contentvec](https://github.com/auspicious3000/contentvec) |
471
+ |[2212.04356](https://arxiv.org/abs/2212.04356) | Whisper (Speech Encoder) | Robust Speech Recognition via Large-Scale Weak Supervision | [openai/whisper](https://github.com/openai/whisper) |
472
+ |[2110.13900](https://arxiv.org/abs/2110.13900) | WavLM (Speech Encoder) | WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing | [microsoft/unilm/wavlm](https://github.com/microsoft/unilm/tree/master/wavlm) |
473
+ |[2305.17651](https://arxiv.org/abs/2305.17651) | DPHubert (Speech Encoder) | DPHuBERT: Joint Distillation and Pruning of Self-Supervised Speech Models | [pyf98/DPHuBERT](https://github.com/pyf98/DPHuBERT) |
474
+ |[DOI:10.21437/Interspeech.2017-68](http://dx.doi.org/10.21437/Interspeech.2017-68) | Harvest (F0 Predictor) | Harvest: A high-performance fundamental frequency estimator from speech signals | [mmorise/World/harvest](https://github.com/mmorise/World/blob/master/src/harvest.cpp) |
475
+ |[aes35-000039](https://www.aes.org/e-lib/online/browse.cfm?elib=15165) | Dio (F0 Predictor) | Fast and reliable F0 estimation method based on the period extraction of vocal fold vibration of singing voice and speech | [mmorise/World/dio](https://github.com/mmorise/World/blob/master/src/dio.cpp) |
476
+ |[8461329](https://ieeexplore.ieee.org/document/8461329) | Crepe (F0 Predictor) | Crepe: A Convolutional Representation for Pitch Estimation | [maxrmorrison/torchcrepe](https://github.com/maxrmorrison/torchcrepe) |
477
+ |[DOI:10.1016/j.wocn.2018.07.001](https://doi.org/10.1016/j.wocn.2018.07.001) | Parselmouth (F0 Predictor) | Introducing Parselmouth: A Python interface to Praat | [YannickJadoul/Parselmouth](https://github.com/YannickJadoul/Parselmouth) |
478
+ |[2010.05646](https://arxiv.org/abs/2010.05646) | HIFIGAN (Vocoder) | HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis | [jik876/hifi-gan](https://github.com/jik876/hifi-gan) |
479
+ |[1810.11946](https://arxiv.org/abs/1810.11946) | NSF (Vocoder) | Neural source-filter-based waveform model for statistical parametric speech synthesis | [openvpi/DiffSinger/modules/nsf_hifigan](https://github.com/openvpi/DiffSinger/tree/refactor/modules/nsf_hifigan) |
480
+ |[2006.08195](https://arxiv.org/abs/2006.08195) | Snake (Vocoder) | Neural Networks Fail to Learn Periodic Functions and How to Fix It | [EdwardDixon/snake](https://github.com/EdwardDixon/snake) |
481
+ |[2105.02446v3](https://arxiv.org/abs/2105.02446v3) | Shallow Diffusion (PostProcessing)| DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism | [CNChTu/Diffusion-SVC](https://github.com/CNChTu/Diffusion-SVC) |
482
+ |[K-means](https://citeseerx.ist.psu.edu/viewdoc/download;jsessionid=01D65490BADCC216F350D06F84D721AD?doi=10.1.1.308.8619&rep=rep1&type=pdf) | Feature K-means Clustering (PreProcessing)| Some methods for classification and analysis of multivariate observations | This repository |
483
+ | | Feature TopK Retrieval (PreProcessing)| Retrieval based Voice Conversion | [RVC-Project/Retrieval-based-Voice-Conversion-WebUI](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) |
484
+
485
+ ## ☀️ Previous Contributors
486
+
487
+ For certain reasons the original author deleted the repository. When this repository was rebuilt, a member of the organization inadvertently re-uploaded all files directly, wiping out the previous contributor history, so a list of previous contributors is re-added to the README here
488
+
489
+ *Some members are not listed, at their own request*
490
+
491
+ <table>
492
+ <tr>
493
+ <td align="center"><a href="https://github.com/MistEO"><img src="https://avatars.githubusercontent.com/u/18511905?v=4" width="100px;" alt=""/><br /><sub><b>MistEO</b></sub></a><br /></td>
494
+ <td align="center"><a href="https://github.com/XiaoMiku01"><img src="https://avatars.githubusercontent.com/u/54094119?v=4" width="100px;" alt=""/><br /><sub><b>XiaoMiku01</b></sub></a><br /></td>
495
+ <td align="center"><a href="https://github.com/ForsakenRei"><img src="https://avatars.githubusercontent.com/u/23041178?v=4" width="100px;" alt=""/><br /><sub><b>しぐれ</b></sub></a><br /></td>
496
+ <td align="center"><a href="https://github.com/TomoGaSukunai"><img src="https://avatars.githubusercontent.com/u/25863522?v=4" width="100px;" alt=""/><br /><sub><b>TomoGaSukunai</b></sub></a><br /></td>
497
+ <td align="center"><a href="https://github.com/Plachtaa"><img src="https://avatars.githubusercontent.com/u/112609742?v=4" width="100px;" alt=""/><br /><sub><b>Plachtaa</b></sub></a><br /></td>
498
+ <td align="center"><a href="https://github.com/zdxiaoda"><img src="https://avatars.githubusercontent.com/u/45501959?v=4" width="100px;" alt=""/><br /><sub><b>zd小达</b></sub></a><br /></td>
499
+ <td align="center"><a href="https://github.com/Archivoice"><img src="https://avatars.githubusercontent.com/u/107520869?v=4" width="100px;" alt=""/><br /><sub><b>凍聲響世</b></sub></a><br /></td>
500
+ </tr>
501
+ </table>
502
+
503
+ ## 📚 Some Legal References
504
+
505
+ #### Any country, region, organization, or individual using this project must comply with the following laws
506
+
507
+ #### The Civil Code of the People's Republic of China
508
+
509
+ ##### Article 1019
510
+
511
+ No organization or individual may infringe upon another person's portrait rights by defacing or defaming their image, or by forging it through information technology. Unless otherwise provided by law, no one may produce, use, or publish the portrait of a portrait-rights holder without their consent. Without the consent of the portrait-rights holder, the holder of rights in a portrait work may not use or publish the portrait by publication, reproduction, distribution, rental, exhibition, or other means. The protection of a natural person's voice is governed by reference to the relevant provisions on portrait rights.
512
+
513
+ ##### Article 1024
514
+
515
+ [Right of reputation] Civil subjects enjoy the right of reputation. No organization or individual may infringe upon another's right of reputation by insult, defamation, or other means.
516
+
517
+ ##### Article 1027
518
+
519
+ [Works infringing the right of reputation] Where a published literary or artistic work depicts real people and events or a specific person, and contains insulting or defamatory content that infringes another's right of reputation, the injured party has the right to demand that the author bear civil liability according to law. Where such a work does not depict a specific person, and merely some of its plot resembles that person's circumstances, no civil liability is borne.
520
+
521
+ #### [Constitution of the People's Republic of China](http://www.gov.cn/guoqing/2018-03/22/content_5276318.htm)
522
+
523
+ #### [Criminal Law of the People's Republic of China](http://gongbao.court.gov.cn/Details/f8e30d0689b23f57bfc782d21035c3.html?sw=中华人民共和国刑法)
524
+
525
+ #### [Civil Code of the People's Republic of China](http://gongbao.court.gov.cn/Details/51eb6750b8361f79be8f90d09bc202.html)
526
+
527
+ #### [Contract Law of the People's Republic of China](http://www.npc.gov.cn/zgrdw/npc/lfzt/rlyw/2016-07/01/content_1992739.htm)
528
+
529
+ ## 💪 Thanks to All Contributors
530
+ <a href="https://github.com/svc-develop-team/so-vits-svc/graphs/contributors" target="_blank">
531
+ <img src="https://contrib.rocks/image?repo=svc-develop-team/so-vits-svc" />
532
+ </a>
cluster/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- import numpy as np
2
  import torch
3
  from sklearn.cluster import KMeans
4
 
 
5
  def get_cluster_model(ckpt_path):
6
  checkpoint = torch.load(ckpt_path)
7
  kmeans_dict = {}
 
 
1
  import torch
2
  from sklearn.cluster import KMeans
3
 
4
+
5
  def get_cluster_model(ckpt_path):
6
  checkpoint = torch.load(ckpt_path)
7
  kmeans_dict = {}
cluster/kmeans.py CHANGED
@@ -1,201 +1,204 @@
1
- import math,pdb
2
- import torch,pynvml
3
- from torch.nn.functional import normalize
4
- from time import time
5
- import numpy as np
6
- # device=torch.device("cuda:0")
7
- def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
8
- """ Picks k points in the data based on the kmeans++ method.
9
-
10
- Parameters
11
- ----------
12
- data : torch.Tensor
13
- Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
14
- data, rank 2 multidimensional data, in which case one
15
- row is one observation.
16
- k : int
17
- Number of samples to generate.
18
- sample_size : int
19
- sample data to avoid memory overflow during calculation
20
-
21
- Returns
22
- -------
23
- init : ndarray
24
- A 'k' by 'N' containing the initial centroids.
25
-
26
- References
27
- ----------
28
- .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
29
- careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
30
- on Discrete Algorithms, 2007.
31
- .. [2] scipy/cluster/vq.py: _kpp
32
- """
33
- batch_size=data.shape[0]
34
- if batch_size>sample_size:
35
- data = data[torch.randint(0, batch_size,[sample_size], device=data.device)]
36
- dims = data.shape[1] if len(data.shape) > 1 else 1
37
- init = torch.zeros((k, dims)).to(data.device)
38
- r = torch.distributions.uniform.Uniform(0, 1)
39
- for i in range(k):
40
- if i == 0:
41
- init[i, :] = data[torch.randint(data.shape[0], [1])]
42
- else:
43
- D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
44
- probs = D2 / torch.sum(D2)
45
- cumprobs = torch.cumsum(probs, dim=0)
46
- init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
47
- return init
48
- class KMeansGPU:
49
- '''
50
- Kmeans clustering algorithm implemented with PyTorch
51
-
52
- Parameters:
53
- n_clusters: int,
54
- Number of clusters
55
-
56
- max_iter: int, default: 100
57
- Maximum number of iterations
58
-
59
- tol: float, default: 0.0001
60
- Tolerance
61
-
62
- verbose: int, default: 0
63
- Verbosity
64
-
65
- mode: {'euclidean', 'cosine'}, default: 'euclidean'
66
- Type of distance measure
67
-
68
- init_method: {'random', 'point', '++'}
69
- Type of initialization
70
-
71
- minibatch: {None, int}, default: None
72
- Batch size of MinibatchKmeans algorithm
73
- if None perform full KMeans algorithm
74
-
75
- Attributes:
76
- centroids: torch.Tensor, shape: [n_clusters, n_features]
77
- cluster centroids
78
- '''
79
- def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean",device=torch.device("cuda:0")):
80
- self.n_clusters = n_clusters
81
- self.max_iter = max_iter
82
- self.tol = tol
83
- self.verbose = verbose
84
- self.mode = mode
85
- self.device=device
86
- pynvml.nvmlInit()
87
- gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
88
- info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
89
- self.minibatch=int(33e6/self.n_clusters*info.free/ 1024 / 1024 / 1024)
90
- print("free_mem/GB:",info.free/ 1024 / 1024 / 1024,"minibatch:",self.minibatch)
91
-
92
- @staticmethod
93
- def cos_sim(a, b):
94
- """
95
- Compute cosine similarity of 2 sets of vectors
96
-
97
- Parameters:
98
- a: torch.Tensor, shape: [m, n_features]
99
-
100
- b: torch.Tensor, shape: [n, n_features]
101
- """
102
- return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)
103
-
104
- @staticmethod
105
- def euc_sim(a, b):
106
- """
107
- Compute euclidean similarity of 2 sets of vectors
108
- Parameters:
109
- a: torch.Tensor, shape: [m, n_features]
110
- b: torch.Tensor, shape: [n, n_features]
111
- """
112
- return 2 * a @ b.transpose(-2, -1) -(a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
113
-
114
- def max_sim(self, a, b):
115
- """
116
- Compute maximum similarity (or minimum distance) of each vector
117
- in a with all of the vectors in b
118
- Parameters:
119
- a: torch.Tensor, shape: [m, n_features]
120
- b: torch.Tensor, shape: [n, n_features]
121
- """
122
- if self.mode == 'cosine':
123
- sim_func = self.cos_sim
124
- elif self.mode == 'euclidean':
125
- sim_func = self.euc_sim
126
- sim = sim_func(a, b)
127
- max_sim_v, max_sim_i = sim.max(dim=-1)
128
- return max_sim_v, max_sim_i
129
-
130
- def fit_predict(self, X):
131
- """
132
- Combination of fit() and predict() methods.
133
- This is faster than calling fit() and predict() seperately.
134
- Parameters:
135
- X: torch.Tensor, shape: [n_samples, n_features]
136
- centroids: {torch.Tensor, None}, default: None
137
- if given, centroids will be initialized with given tensor
138
- if None, centroids will be randomly chosen from X
139
- Return:
140
- labels: torch.Tensor, shape: [n_samples]
141
-
142
- mini_=33kk/k*remain
143
- mini=min(mini_,fea_shape)
144
- offset=log2(k/1000)*1.5
145
- kpp_all=min(mini_*10/offset,fea_shape)
146
- kpp_sample=min(mini_/12/offset,fea_shape)
147
- """
148
- assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
149
- assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
150
- assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features] "
151
- # print("verbose:%s"%self.verbose)
152
-
153
- offset = np.power(1.5,np.log(self.n_clusters / 1000))/np.log(2)
154
- with torch.no_grad():
155
- batch_size= X.shape[0]
156
- # print(self.minibatch, int(self.minibatch * 10 / offset), batch_size)
157
- start_time = time()
158
- if (self.minibatch*10//offset< batch_size):
159
- x = X[torch.randint(0, batch_size,[int(self.minibatch*10/offset)])].to(self.device)
160
- else:
161
- x = X.to(self.device)
162
- # print(x.device)
163
- self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch/12/offset),batch_size))
164
- del x
165
- torch.cuda.empty_cache()
166
- # self.centroids = self.centroids.to(self.device)
167
- num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)#全1
168
- closest = None#[3098036]#int64
169
- if(self.minibatch>=batch_size//2 and self.minibatch<batch_size):
170
- X = X[torch.randint(0, batch_size,[self.minibatch])].to(self.device)
171
- elif(self.minibatch>=batch_size):
172
- X=X.to(self.device)
173
- for i in range(self.max_iter):
174
- iter_time = time()
175
- if self.minibatch<batch_size//2:#可用minibatch数太小,每次都得从内存倒腾到显存
176
- x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
177
- else:#否则直接全部缓存
178
- x = X
179
-
180
- closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)#[3098036]#int64#0~999
181
- matched_clusters, counts = closest.unique(return_counts=True)#int64#1k
182
- expanded_closest = closest[None].expand(self.n_clusters, -1)#[1000, 3098036]#int16#0~999
183
- mask = (expanded_closest==torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)#==后者是int64*1000
184
- c_grad = mask @ x / mask.sum(-1)[..., :, None]
185
- c_grad[c_grad!=c_grad] = 0 # remove NaNs
186
- error = (c_grad - self.centroids).pow(2).sum()
187
- if self.minibatch is not None:
188
- lr = 1/num_points_in_clusters[:,None] * 0.9 + 0.1
189
- else:
190
- lr = 1
191
- matched_clusters=matched_clusters.long()
192
- num_points_in_clusters[matched_clusters] += counts#IndexError: tensors used as indices must be long, byte or bool tensors
193
- self.centroids = self.centroids * (1-lr) + c_grad * lr
194
- if self.verbose >= 2:
195
- print('iter:', i, 'error:', error.item(), 'time spent:', round(time()-iter_time, 4))
196
- if error <= self.tol:
197
- break
198
-
199
- if self.verbose >= 1:
200
- print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
201
- return closest
 
 
 
 
1
+ from time import time
2
+
3
+ import numpy as np
4
+ import pynvml
5
+ import torch
6
+ from torch.nn.functional import normalize
7
+
8
+
9
+ # device=torch.device("cuda:0")
10
+ def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
11
+ """ Picks k points in the data based on the kmeans++ method.
12
+
13
+ Parameters
14
+ ----------
15
+ data : torch.Tensor
16
+ Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
17
+ data, rank 2 multidimensional data, in which case one
18
+ row is one observation.
19
+ k : int
20
+ Number of samples to generate.
21
+ sample_size : int
22
+ sample data to avoid memory overflow during calculation
23
+
24
+ Returns
25
+ -------
26
+ init : ndarray
27
+ A 'k' by 'N' containing the initial centroids.
28
+
29
+ References
30
+ ----------
31
+ .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
32
+ careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
33
+ on Discrete Algorithms, 2007.
34
+ .. [2] scipy/cluster/vq.py: _kpp
35
+ """
36
+ batch_size=data.shape[0]
37
+ if batch_size>sample_size:
38
+ data = data[torch.randint(0, batch_size,[sample_size], device=data.device)]
39
+ dims = data.shape[1] if len(data.shape) > 1 else 1
40
+ init = torch.zeros((k, dims)).to(data.device)
41
+ r = torch.distributions.uniform.Uniform(0, 1)
42
+ for i in range(k):
43
+ if i == 0:
44
+ init[i, :] = data[torch.randint(data.shape[0], [1])]
45
+ else:
46
+ D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
47
+ probs = D2 / torch.sum(D2)
48
+ cumprobs = torch.cumsum(probs, dim=0)
49
+ init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
50
+ return init
51
+ class KMeansGPU:
52
+ '''
53
+ Kmeans clustering algorithm implemented with PyTorch
54
+
55
+ Parameters:
56
+ n_clusters: int,
57
+ Number of clusters
58
+
59
+ max_iter: int, default: 100
60
+ Maximum number of iterations
61
+
62
+ tol: float, default: 0.0001
63
+ Tolerance
64
+
65
+ verbose: int, default: 0
66
+ Verbosity
67
+
68
+ mode: {'euclidean', 'cosine'}, default: 'euclidean'
69
+ Type of distance measure
70
+
71
+ init_method: {'random', 'point', '++'}
72
+ Type of initialization
73
+
74
+ minibatch: {None, int}, default: None
75
+ Batch size of MinibatchKmeans algorithm
76
+ if None perform full KMeans algorithm
77
+
78
+ Attributes:
79
+ centroids: torch.Tensor, shape: [n_clusters, n_features]
80
+ cluster centroids
81
+ '''
82
+ def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean",device=torch.device("cuda:0")):
83
+ self.n_clusters = n_clusters
84
+ self.max_iter = max_iter
85
+ self.tol = tol
86
+ self.verbose = verbose
87
+ self.mode = mode
88
+ self.device=device
89
+ pynvml.nvmlInit()
90
+ gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
91
+ info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
92
+ self.minibatch=int(33e6/self.n_clusters*info.free/ 1024 / 1024 / 1024)
93
+ print("free_mem/GB:",info.free/ 1024 / 1024 / 1024,"minibatch:",self.minibatch)
94
+
95
+ @staticmethod
96
+ def cos_sim(a, b):
97
+ """
98
+ Compute cosine similarity of 2 sets of vectors
99
+
100
+ Parameters:
101
+ a: torch.Tensor, shape: [m, n_features]
102
+
103
+ b: torch.Tensor, shape: [n, n_features]
104
+ """
105
+ return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)
106
+
107
+ @staticmethod
108
+ def euc_sim(a, b):
109
+ """
110
+ Compute euclidean similarity of 2 sets of vectors
111
+ Parameters:
112
+ a: torch.Tensor, shape: [m, n_features]
113
+ b: torch.Tensor, shape: [n, n_features]
114
+ """
115
+ return 2 * a @ b.transpose(-2, -1) -(a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
116
+
117
+ def max_sim(self, a, b):
118
+ """
119
+ Compute maximum similarity (or minimum distance) of each vector
120
+ in a with all of the vectors in b
121
+ Parameters:
122
+ a: torch.Tensor, shape: [m, n_features]
123
+ b: torch.Tensor, shape: [n, n_features]
124
+ """
125
+ if self.mode == 'cosine':
126
+ sim_func = self.cos_sim
127
+ elif self.mode == 'euclidean':
128
+ sim_func = self.euc_sim
129
+ sim = sim_func(a, b)
130
+ max_sim_v, max_sim_i = sim.max(dim=-1)
131
+ return max_sim_v, max_sim_i
132
+
133
+ def fit_predict(self, X):
134
+ """
135
+ Combination of fit() and predict() methods.
136
+ This is faster than calling fit() and predict() seperately.
137
+ Parameters:
138
+ X: torch.Tensor, shape: [n_samples, n_features]
139
+ centroids: {torch.Tensor, None}, default: None
140
+ if given, centroids will be initialized with given tensor
141
+ if None, centroids will be randomly chosen from X
142
+ Return:
143
+ labels: torch.Tensor, shape: [n_samples]
144
+
145
+ mini_=33kk/k*remain
146
+ mini=min(mini_,fea_shape)
147
+ offset=log2(k/1000)*1.5
148
+ kpp_all=min(mini_*10/offset,fea_shape)
149
+ kpp_sample=min(mini_/12/offset,fea_shape)
150
+ """
151
+ assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
152
+ assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
153
+ assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features] "
154
+ # print("verbose:%s"%self.verbose)
155
+
156
+ offset = np.power(1.5,np.log(self.n_clusters / 1000))/np.log(2)
157
+ with torch.no_grad():
158
+ batch_size= X.shape[0]
159
+ # print(self.minibatch, int(self.minibatch * 10 / offset), batch_size)
160
+ start_time = time()
161
+ if (self.minibatch*10//offset< batch_size):
162
+ x = X[torch.randint(0, batch_size,[int(self.minibatch*10/offset)])].to(self.device)
163
+ else:
164
+ x = X.to(self.device)
165
+ # print(x.device)
166
+ self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch/12/offset),batch_size))
167
+ del x
168
+ torch.cuda.empty_cache()
169
+ # self.centroids = self.centroids.to(self.device)
170
+ num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)#全1
171
+ closest = None#[3098036]#int64
172
+ if(self.minibatch>=batch_size//2 and self.minibatch<batch_size):
173
+ X = X[torch.randint(0, batch_size,[self.minibatch])].to(self.device)
174
+ elif(self.minibatch>=batch_size):
175
+ X=X.to(self.device)
176
+ for i in range(self.max_iter):
177
+ iter_time = time()
178
+ if self.minibatch<batch_size//2:#可用minibatch数太小,每次都得从内存倒腾到显存
179
+ x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
180
+ else:#否则直接全部缓存
181
+ x = X
182
+
183
+ closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)#[3098036]#int64#0~999
184
+ matched_clusters, counts = closest.unique(return_counts=True)#int64#1k
185
+ expanded_closest = closest[None].expand(self.n_clusters, -1)#[1000, 3098036]#int16#0~999
186
+ mask = (expanded_closest==torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)#==后者是int64*1000
187
+ c_grad = mask @ x / mask.sum(-1)[..., :, None]
188
+ c_grad[c_grad!=c_grad] = 0 # remove NaNs
189
+ error = (c_grad - self.centroids).pow(2).sum()
190
+ if self.minibatch is not None:
191
+ lr = 1/num_points_in_clusters[:,None] * 0.9 + 0.1
192
+ else:
193
+ lr = 1
194
+ matched_clusters=matched_clusters.long()
195
+ num_points_in_clusters[matched_clusters] += counts#IndexError: tensors used as indices must be long, byte or bool tensors
196
+ self.centroids = self.centroids * (1-lr) + c_grad * lr
197
+ if self.verbose >= 2:
198
+ print('iter:', i, 'error:', error.item(), 'time spent:', round(time()-iter_time, 4))
199
+ if error <= self.tol:
200
+ break
201
+
202
+ if self.verbose >= 1:
203
+ print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
204
+ return closest
cluster/train_cluster.py CHANGED
@@ -1,19 +1,17 @@
1
- import time,pdb
2
- import tqdm
3
- from time import time as ttime
4
  import os
 
5
  from pathlib import Path
6
- import logging
7
- import argparse
8
- from kmeans import KMeansGPU
9
- import torch
10
  import numpy as np
11
- from sklearn.cluster import KMeans,MiniBatchKMeans
 
 
 
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
15
- from time import time as ttime
16
- import pynvml,torch
17
 
18
  def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False): # gpu minibatch performs poorly; the library supports it, but it is not considered here
19
  logger.info(f"Loading features from {in_dir}")
@@ -29,7 +27,7 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
29
  features = features.astype(np.float32)
30
  logger.info(f"Clustering features of shape: {features.shape}")
31
  t = time.time()
32
- if(use_gpu==False):
33
  if use_minibatch:
34
  kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
35
  else:
@@ -37,14 +35,14 @@ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=
37
  else:
38
  kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
39
  features=torch.from_numpy(features)#.to(device)
40
- labels = kmeans.fit_predict(features)#
41
 
42
  print(time.time()-t, "s")
43
 
44
  x = {
45
- "n_features_in_": kmeans.n_features_in_ if use_gpu==False else features.shape[1],
46
- "_n_threads": kmeans._n_threads if use_gpu==False else 4,
47
- "cluster_centers_": kmeans.cluster_centers_ if use_gpu==False else kmeans.centroids.cpu().numpy(),
48
  }
49
  print("end")
50
 
 
1
+ import argparse
2
+ import logging
 
3
  import os
4
+ import time
5
  from pathlib import Path
6
+
 
 
 
7
  import numpy as np
8
+ import torch
9
+ import tqdm
10
+ from kmeans import KMeansGPU
11
+ from sklearn.cluster import KMeans, MiniBatchKMeans
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
 
 
15
 
16
  def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False,use_gpu=False): # gpu minibatch performs poorly; the library supports it, but it is not considered here
17
  logger.info(f"Loading features from {in_dir}")
 
27
  features = features.astype(np.float32)
28
  logger.info(f"Clustering features of shape: {features.shape}")
29
  t = time.time()
30
+ if(use_gpu is False):
31
  if use_minibatch:
32
  kmeans = MiniBatchKMeans(n_clusters=n_clusters,verbose=verbose, batch_size=4096, max_iter=80).fit(features)
33
  else:
 
35
  else:
36
  kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0,max_iter=500,tol=1e-2)#
37
  features=torch.from_numpy(features)#.to(device)
38
+ kmeans.fit_predict(features)#
39
 
40
  print(time.time()-t, "s")
41
 
42
  x = {
43
+ "n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
44
+ "_n_threads": kmeans._n_threads if use_gpu is False else 4,
45
+ "cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
46
  }
47
  print("end")
48
 
compress_model.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import OrderedDict
2
+
3
+ import torch
4
+
5
+ import utils
6
+ from models import SynthesizerTrn
7
+
8
+
9
+ def copyStateDict(state_dict):
10
+ if list(state_dict.keys())[0].startswith('module'):
11
+ start_idx = 1
12
+ else:
13
+ start_idx = 0
14
+ new_state_dict = OrderedDict()
15
+ for k, v in state_dict.items():
16
+ name = ','.join(k.split('.')[start_idx:])
17
+ new_state_dict[name] = v
18
+ return new_state_dict
19
+
20
+
21
+ def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
22
+ hps = utils.get_hparams_from_file(config)
23
+
24
+ net_g = SynthesizerTrn(hps.data.filter_length // 2 + 1,
25
+ hps.train.segment_size // hps.data.hop_length,
26
+ **hps.model)
27
+
28
+ optim_g = torch.optim.AdamW(net_g.parameters(),
29
+ hps.train.learning_rate,
30
+ betas=hps.train.betas,
31
+ eps=hps.train.eps)
32
+
33
+ state_dict_g = torch.load(input_model, map_location="cpu")
34
+ new_dict_g = copyStateDict(state_dict_g)
35
+ keys = []
36
+ for k, v in new_dict_g['model'].items():
37
+ keys.append(k)
38
+
39
+ new_dict_g = {k: new_dict_g['model'][k].half() for k in keys} if ishalf else {k: new_dict_g['model'][k] for k in keys}
40
+
41
+ torch.save(
42
+ {
43
+ 'model': new_dict_g,
44
+ 'iteration': 0,
45
+ 'optimizer': optim_g.state_dict(),
46
+ 'learning_rate': 0.0001
47
+ }, output_model)
48
+
49
+
50
+ if __name__ == "__main__":
51
+ import argparse
52
+ parser = argparse.ArgumentParser()
53
+ parser.add_argument("-c",
54
+ "--config",
55
+ type=str,
56
+ default='configs/config.json')
57
+ parser.add_argument("-i", "--input", type=str)
58
+ parser.add_argument("-o", "--output", type=str, default=None)
59
+ parser.add_argument('-hf', '--half', action='store_true', default=False, help='Save as FP16')
60
+
61
+ args = parser.parse_args()
62
+
63
+ output = args.output
64
+
65
+ if output is None:
66
+ import os.path
67
+ filename, ext = os.path.splitext(args.input)
68
+ half = "_half" if args.half else ""
69
+ output = filename + "_release" + half + ext
70
+
71
+ removeOptimizer(args.config, args.input, args.half, output)
configs/diffusion.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ block_size: 512
3
+ cnhubertsoft_gate: 10
4
+ duration: 2
5
+ encoder: vec768l12
6
+ encoder_hop_size: 320
7
+ encoder_out_channels: 768
8
+ encoder_sample_rate: 16000
9
+ extensions:
10
+ - wav
11
+ sampling_rate: 44100
12
+ training_files: filelists/train.txt
13
+ unit_interpolate_mode: nearest
14
+ validation_files: filelists/val.txt
15
+ device: cuda
16
+ env:
17
+ expdir: logs/44k/diffusion
18
+ gpu_id: 0
19
+ infer:
20
+ method: dpm-solver++
21
+ speedup: 10
22
+ model:
23
+ k_step_max: 0
24
+ n_chans: 512
25
+ n_hidden: 256
26
+ n_layers: 20
27
+ n_spk: 1
28
+ timesteps: 1000
29
+ type: Diffusion
30
+ use_pitch_aug: true
31
+ spk:
32
+ ATRI: 0
33
+ train:
34
+ amp_dtype: fp32
35
+ batch_size: 48
36
+ cache_all_data: true
37
+ cache_device: cpu
38
+ cache_fp16: true
39
+ decay_step: 100000
40
+ epochs: 100000
41
+ gamma: 0.5
42
+ interval_force_save: 10000
43
+ interval_log: 10
44
+ interval_val: 2000
45
+ lr: 0.0002
46
+ num_workers: 2
47
+ save_opt: false
48
+ weight_decay: 0
49
+ vocoder:
50
+ ckpt: pretrain/nsf_hifigan/model
51
+ type: nsf-hifigan
config.json → configs_template/config_template.json RENAMED
@@ -24,7 +24,7 @@
24
  "port": "8001",
25
  "keep_ckpts": 3,
26
  "all_in_mem": false,
27
- "vol_aug": false
28
  },
29
  "data": {
30
  "training_files": "filelists/train.txt",
@@ -37,7 +37,7 @@
37
  "n_mel_channels": 80,
38
  "mel_fmin": 0.0,
39
  "mel_fmax": 22050,
40
- "unit_interpolate_mode": "nearest"
41
  },
42
  "model": {
43
  "inter_channels": 192,
@@ -48,58 +48,30 @@
48
  "kernel_size": 3,
49
  "p_dropout": 0.1,
50
  "resblock": "1",
51
- "resblock_kernel_sizes": [
52
- 3,
53
- 7,
54
- 11
55
- ],
56
- "resblock_dilation_sizes": [
57
- [
58
- 1,
59
- 3,
60
- 5
61
- ],
62
- [
63
- 1,
64
- 3,
65
- 5
66
- ],
67
- [
68
- 1,
69
- 3,
70
- 5
71
- ]
72
- ],
73
- "upsample_rates": [
74
- 8,
75
- 8,
76
- 2,
77
- 2,
78
- 2
79
- ],
80
  "upsample_initial_channel": 512,
81
- "upsample_kernel_sizes": [
82
- 16,
83
- 16,
84
- 4,
85
- 4,
86
- 4
87
- ],
88
  "n_layers_q": 3,
89
  "n_flow_layer": 4,
90
  "use_spectral_norm": false,
91
  "gin_channels": 768,
92
  "ssl_dim": 768,
93
- "n_speakers": 1,
94
- "vocoder_name": "nsf-hifigan",
95
- "speech_encoder": "vec768l12",
96
- "speaker_embedding": false,
97
- "vol_embedding": false,
98
- "use_depthwise_conv": false,
99
  "flow_share_parameter": false,
100
  "use_automatic_f0_prediction": true
101
  },
102
  "spk": {
103
- "ATRI": 0
 
 
 
 
104
  }
105
  }
 
24
  "port": "8001",
25
  "keep_ckpts": 3,
26
  "all_in_mem": false,
27
+ "vol_aug":false
28
  },
29
  "data": {
30
  "training_files": "filelists/train.txt",
 
37
  "n_mel_channels": 80,
38
  "mel_fmin": 0.0,
39
  "mel_fmax": 22050,
40
+ "unit_interpolate_mode":"nearest"
41
  },
42
  "model": {
43
  "inter_channels": 192,
 
48
  "kernel_size": 3,
49
  "p_dropout": 0.1,
50
  "resblock": "1",
51
+ "resblock_kernel_sizes": [3,7,11],
52
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
53
+ "upsample_rates": [ 8, 8, 2, 2, 2],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  "upsample_initial_channel": 512,
55
+ "upsample_kernel_sizes": [16,16, 4, 4, 4],
 
 
 
 
 
 
56
  "n_layers_q": 3,
57
  "n_flow_layer": 4,
58
  "use_spectral_norm": false,
59
  "gin_channels": 768,
60
  "ssl_dim": 768,
61
+ "n_speakers": 200,
62
+ "vocoder_name":"nsf-hifigan",
63
+ "speech_encoder":"vec768l12",
64
+ "speaker_embedding":false,
65
+ "vol_embedding":false,
66
+ "use_depthwise_conv":false,
67
  "flow_share_parameter": false,
68
  "use_automatic_f0_prediction": true
69
  },
70
  "spk": {
71
+ "nyaru": 0,
72
+ "huiyu": 1,
73
+ "nen": 2,
74
+ "paimon": 3,
75
+ "yunhao": 4
76
  }
77
  }
configs_template/diffusion_template.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ sampling_rate: 44100
3
+ block_size: 512 # Equal to hop_length
4
+ duration: 2 # Audio duration during training, must be less than the duration of the shortest audio clip
5
+ encoder: 'vec768l12' # 'hubertsoft', 'vec256l9', 'vec768l12'
6
+ cnhubertsoft_gate: 10
7
+ encoder_sample_rate: 16000
8
+ encoder_hop_size: 320
9
+ encoder_out_channels: 768 # 256 if using 'hubertsoft'
10
+ training_files: "filelists/train.txt"
11
+ validation_files: "filelists/val.txt"
12
+ extensions: # List of extension included in the data collection
13
+ - wav
14
+ unit_interpolate_mode: "nearest"
15
+ model:
16
+ type: 'Diffusion'
17
+ n_layers: 20
18
+ n_chans: 512
19
+ n_hidden: 256
20
+ use_pitch_aug: true
21
+ timesteps : 1000
22
+ k_step_max: 0 # must <= timesteps, If it is 0, train all
23
+ n_spk: 1 # max number of different speakers
24
+ device: cuda
25
+ vocoder:
26
+ type: 'nsf-hifigan'
27
+ ckpt: 'pretrain/nsf_hifigan/model'
28
+ infer:
29
+ speedup: 10
30
+ method: 'dpm-solver++' # 'pndm' or 'dpm-solver' or 'ddim' or 'unipc' or 'dpm-solver++'
31
+ env:
32
+ expdir: logs/44k/diffusion
33
+ gpu_id: 0
34
+ train:
35
+ num_workers: 2 # If your cpu and gpu are both very strong, set to 0 may be faster!
36
+ amp_dtype: fp32 # fp32, fp16 or bf16 (fp16 or bf16 may be faster if it is supported by your gpu)
37
+ batch_size: 48
38
+ cache_all_data: true # Save Internal-Memory or Graphics-Memory if it is false, but may be slow
39
+ cache_device: 'cpu' # Set to 'cuda' to cache the data into the Graphics-Memory, fastest speed for strong gpu
40
+ cache_fp16: true
41
+ epochs: 100000
42
+ interval_log: 10
43
+ interval_val: 2000
44
+ interval_force_save: 10000
45
+ lr: 0.0002
46
+ decay_step: 100000
47
+ gamma: 0.5
48
+ weight_decay: 0
49
+ save_opt: false
50
+ spk:
51
+ 'nyaru': 0
data_utils.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.data
7
+
8
+ import utils
9
+ from modules.mel_processing import spectrogram_torch
10
+ from utils import load_filepaths_and_text, load_wav_to_torch
11
+
12
+ # import h5py
13
+
14
+
15
+ """Multi speaker version"""
16
+
17
+
18
+ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
19
+ """
20
+ 1) loads audio, speaker_id, text pairs
21
+ 2) normalizes text and converts them to sequences of integers
22
+ 3) computes spectrograms from audio files.
23
+ """
24
+
25
+ def __init__(self, audiopaths, hparams, all_in_mem: bool = False, vol_aug: bool = True):
26
+ self.audiopaths = load_filepaths_and_text(audiopaths)
27
+ self.hparams = hparams
28
+ self.max_wav_value = hparams.data.max_wav_value
29
+ self.sampling_rate = hparams.data.sampling_rate
30
+ self.filter_length = hparams.data.filter_length
31
+ self.hop_length = hparams.data.hop_length
32
+ self.win_length = hparams.data.win_length
33
+ self.unit_interpolate_mode = hparams.data.unit_interpolate_mode
34
+ self.sampling_rate = hparams.data.sampling_rate
35
+ self.use_sr = hparams.train.use_sr
36
+ self.spec_len = hparams.train.max_speclen
37
+ self.spk_map = hparams.spk
38
+ self.vol_emb = hparams.model.vol_embedding
39
+ self.vol_aug = hparams.train.vol_aug and vol_aug
40
+ random.seed(1234)
41
+ random.shuffle(self.audiopaths)
42
+
43
+ self.all_in_mem = all_in_mem
44
+ if self.all_in_mem:
45
+ self.cache = [self.get_audio(p[0]) for p in self.audiopaths]
46
+
47
+ def get_audio(self, filename):
48
+ filename = filename.replace("\\", "/")
49
+ audio, sampling_rate = load_wav_to_torch(filename)
50
+ if sampling_rate != self.sampling_rate:
51
+ raise ValueError("{} SR doesn't match target {} SR".format(
52
+ sampling_rate, self.sampling_rate))
53
+ audio_norm = audio / self.max_wav_value
54
+ audio_norm = audio_norm.unsqueeze(0)
55
+ spec_filename = filename.replace(".wav", ".spec.pt")
56
+
57
+ # Ideally, all data generated after Mar 25 should have .spec.pt
58
+ if os.path.exists(spec_filename):
59
+ spec = torch.load(spec_filename)
60
+ else:
61
+ spec = spectrogram_torch(audio_norm, self.filter_length,
62
+ self.sampling_rate, self.hop_length, self.win_length,
63
+ center=False)
64
+ spec = torch.squeeze(spec, 0)
65
+ torch.save(spec, spec_filename)
66
+
67
+ spk = filename.split("/")[-2]
68
+ spk = torch.LongTensor([self.spk_map[spk]])
69
+
70
+ f0, uv = np.load(filename + ".f0.npy",allow_pickle=True)
71
+
72
+ f0 = torch.FloatTensor(np.array(f0,dtype=float))
73
+ uv = torch.FloatTensor(np.array(uv,dtype=float))
74
+
75
+ c = torch.load(filename+ ".soft.pt")
76
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[0], mode=self.unit_interpolate_mode)
77
+ if self.vol_emb:
78
+ volume_path = filename + ".vol.npy"
79
+ volume = np.load(volume_path)
80
+ volume = torch.from_numpy(volume).float()
81
+ else:
82
+ volume = None
83
+
84
+ lmin = min(c.size(-1), spec.size(-1))
85
+ assert abs(c.size(-1) - spec.size(-1)) < 3, (c.size(-1), spec.size(-1), f0.shape, filename)
86
+ assert abs(audio_norm.shape[1]-lmin * self.hop_length) < 3 * self.hop_length
87
+ spec, c, f0, uv = spec[:, :lmin], c[:, :lmin], f0[:lmin], uv[:lmin]
88
+ audio_norm = audio_norm[:, :lmin * self.hop_length]
89
+ if volume is not None:
90
+ volume = volume[:lmin]
91
+ return c, f0, spec, audio_norm, spk, uv, volume
92
+
93
+ def random_slice(self, c, f0, spec, audio_norm, spk, uv, volume):
94
+ # if spec.shape[1] < 30:
95
+ # print("skip too short audio:", filename)
96
+ # return None
97
+
98
+ if random.choice([True, False]) and self.vol_aug and volume is not None:
99
+ max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
100
+ max_shift = min(1, np.log10(1/max_amp))
101
+ log10_vol_shift = random.uniform(-1, max_shift)
102
+ audio_norm = audio_norm * (10 ** log10_vol_shift)
103
+ volume = volume * (10 ** log10_vol_shift)
104
+ spec = spectrogram_torch(audio_norm,
105
+ self.hparams.data.filter_length,
106
+ self.hparams.data.sampling_rate,
107
+ self.hparams.data.hop_length,
108
+ self.hparams.data.win_length,
109
+ center=False)[0]
110
+
111
+ if spec.shape[1] > 800:
112
+ start = random.randint(0, spec.shape[1]-800)
113
+ end = start + 790
114
+ spec, c, f0, uv = spec[:, start:end], c[:, start:end], f0[start:end], uv[start:end]
115
+ audio_norm = audio_norm[:, start * self.hop_length : end * self.hop_length]
116
+ if volume is not None:
117
+ volume = volume[start:end]
118
+ return c, f0, spec, audio_norm, spk, uv,volume
119
+
120
+ def __getitem__(self, index):
121
+ if self.all_in_mem:
122
+ return self.random_slice(*self.cache[index])
123
+ else:
124
+ return self.random_slice(*self.get_audio(self.audiopaths[index][0]))
125
+
126
+ def __len__(self):
127
+ return len(self.audiopaths)
128
+
129
+
130
+ class TextAudioCollate:
131
+
132
+ def __call__(self, batch):
133
+ batch = [b for b in batch if b is not None]
134
+
135
+ input_lengths, ids_sorted_decreasing = torch.sort(
136
+ torch.LongTensor([x[0].shape[1] for x in batch]),
137
+ dim=0, descending=True)
138
+
139
+ max_c_len = max([x[0].size(1) for x in batch])
140
+ max_wav_len = max([x[3].size(1) for x in batch])
141
+
142
+ lengths = torch.LongTensor(len(batch))
143
+
144
+ c_padded = torch.FloatTensor(len(batch), batch[0][0].shape[0], max_c_len)
145
+ f0_padded = torch.FloatTensor(len(batch), max_c_len)
146
+ spec_padded = torch.FloatTensor(len(batch), batch[0][2].shape[0], max_c_len)
147
+ wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
148
+ spkids = torch.LongTensor(len(batch), 1)
149
+ uv_padded = torch.FloatTensor(len(batch), max_c_len)
150
+ volume_padded = torch.FloatTensor(len(batch), max_c_len)
151
+
152
+ c_padded.zero_()
153
+ spec_padded.zero_()
154
+ f0_padded.zero_()
155
+ wav_padded.zero_()
156
+ uv_padded.zero_()
157
+ volume_padded.zero_()
158
+
159
+ for i in range(len(ids_sorted_decreasing)):
160
+ row = batch[ids_sorted_decreasing[i]]
161
+
162
+ c = row[0]
163
+ c_padded[i, :, :c.size(1)] = c
164
+ lengths[i] = c.size(1)
165
+
166
+ f0 = row[1]
167
+ f0_padded[i, :f0.size(0)] = f0
168
+
169
+ spec = row[2]
170
+ spec_padded[i, :, :spec.size(1)] = spec
171
+
172
+ wav = row[3]
173
+ wav_padded[i, :, :wav.size(1)] = wav
174
+
175
+ spkids[i, 0] = row[4]
176
+
177
+ uv = row[5]
178
+ uv_padded[i, :uv.size(0)] = uv
179
+ volume = row[6]
180
+ if volume is not None:
181
+ volume_padded[i, :volume.size(0)] = volume
182
+ else :
183
+ volume_padded = None
184
+ return c_padded, f0_padded, spec_padded, wav_padded, spkids, lengths, uv_padded, volume_padded
diffusion/data_loaders.py CHANGED
@@ -1,13 +1,14 @@
1
  import os
2
  import random
3
- import re
4
- import numpy as np
5
  import librosa
 
6
  import torch
7
- import random
8
- from utils import repeat_expand_2d
9
- from tqdm import tqdm
10
  from torch.utils.data import Dataset
 
 
 
 
11
 
12
  def traverse_dir(
13
  root_dir,
@@ -63,6 +64,7 @@ def get_data_loaders(args, whole_audio=False):
63
  spk=args.spk,
64
  device=args.train.cache_device,
65
  fp16=args.train.cache_fp16,
 
66
  use_aug=True)
67
  loader_train = torch.utils.data.DataLoader(
68
  data_train ,
@@ -81,6 +83,7 @@ def get_data_loaders(args, whole_audio=False):
81
  whole_audio=True,
82
  spk=args.spk,
83
  extensions=args.data.extensions,
 
84
  n_spk=args.model.n_spk)
85
  loader_valid = torch.utils.data.DataLoader(
86
  data_valid,
@@ -107,6 +110,7 @@ class AudioDataset(Dataset):
107
  device='cpu',
108
  fp16=False,
109
  use_aug=False,
 
110
  ):
111
  super().__init__()
112
 
@@ -118,6 +122,7 @@ class AudioDataset(Dataset):
118
  self.use_aug = use_aug
119
  self.data_buffer={}
120
  self.pitch_aug_dict = {}
 
121
  # np.load(os.path.join(self.path_root, 'pitch_aug_dict.npy'), allow_pickle=True).item()
122
  if load_all_data:
123
  print('Load all the data filelists:', filelists)
@@ -126,7 +131,6 @@ class AudioDataset(Dataset):
126
  with open(filelists,"r") as f:
127
  self.paths = f.read().splitlines()
128
  for name_ext in tqdm(self.paths, total=len(self.paths)):
129
- name = os.path.splitext(name_ext)[0]
130
  path_audio = name_ext
131
  duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
132
 
@@ -171,7 +175,7 @@ class AudioDataset(Dataset):
171
  path_units = name_ext + ".soft.pt"
172
  units = torch.load(path_units).to(device)
173
  units = units[0]
174
- units = repeat_expand_2d(units,f0.size(0)).transpose(0,1)
175
 
176
  if fp16:
177
  mel = mel.half()
@@ -263,7 +267,7 @@ class AudioDataset(Dataset):
263
  path_units = name_ext + ".soft.pt"
264
  units = torch.load(path_units)
265
  units = units[0]
266
- units = repeat_expand_2d(units,f0.size(0)).transpose(0,1)
267
 
268
  units = units[start_frame : start_frame + units_frame_len]
269
 
 
1
  import os
2
  import random
3
+
 
4
  import librosa
5
+ import numpy as np
6
  import torch
 
 
 
7
  from torch.utils.data import Dataset
8
+ from tqdm import tqdm
9
+
10
+ from utils import repeat_expand_2d
11
+
12
 
13
  def traverse_dir(
14
  root_dir,
 
64
  spk=args.spk,
65
  device=args.train.cache_device,
66
  fp16=args.train.cache_fp16,
67
+ unit_interpolate_mode = args.data.unit_interpolate_mode,
68
  use_aug=True)
69
  loader_train = torch.utils.data.DataLoader(
70
  data_train ,
 
83
  whole_audio=True,
84
  spk=args.spk,
85
  extensions=args.data.extensions,
86
+ unit_interpolate_mode = args.data.unit_interpolate_mode,
87
  n_spk=args.model.n_spk)
88
  loader_valid = torch.utils.data.DataLoader(
89
  data_valid,
 
110
  device='cpu',
111
  fp16=False,
112
  use_aug=False,
113
+ unit_interpolate_mode = 'left'
114
  ):
115
  super().__init__()
116
 
 
122
  self.use_aug = use_aug
123
  self.data_buffer={}
124
  self.pitch_aug_dict = {}
125
+ self.unit_interpolate_mode = unit_interpolate_mode
126
  # np.load(os.path.join(self.path_root, 'pitch_aug_dict.npy'), allow_pickle=True).item()
127
  if load_all_data:
128
  print('Load all the data filelists:', filelists)
 
131
  with open(filelists,"r") as f:
132
  self.paths = f.read().splitlines()
133
  for name_ext in tqdm(self.paths, total=len(self.paths)):
 
134
  path_audio = name_ext
135
  duration = librosa.get_duration(filename = path_audio, sr = self.sample_rate)
136
 
 
175
  path_units = name_ext + ".soft.pt"
176
  units = torch.load(path_units).to(device)
177
  units = units[0]
178
+ units = repeat_expand_2d(units,f0.size(0),unit_interpolate_mode).transpose(0,1)
179
 
180
  if fp16:
181
  mel = mel.half()
 
267
  path_units = name_ext + ".soft.pt"
268
  units = torch.load(path_units)
269
  units = units[0]
270
+ units = repeat_expand_2d(units,f0.size(0),self.unit_interpolate_mode).transpose(0,1)
271
 
272
  units = units[start_frame : start_frame + units_frame_len]
273
 
diffusion/diffusion.py CHANGED
@@ -1,10 +1,10 @@
1
  from collections import deque
2
  from functools import partial
3
  from inspect import isfunction
4
- import torch.nn.functional as F
5
- import librosa.sequence
6
  import numpy as np
7
  import torch
 
8
  from torch import nn
9
  from tqdm import tqdm
10
 
@@ -26,8 +26,10 @@ def extract(a, t, x_shape):
26
 
27
 
28
  def noise_like(shape, device, repeat=False):
29
- repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
30
- noise = lambda: torch.randn(shape, device=device)
 
 
31
  return repeat_noise() if repeat else noise()
32
 
33
 
@@ -67,6 +69,7 @@ class GaussianDiffusion(nn.Module):
67
  max_beta=0.02,
68
  spec_min=-12,
69
  spec_max=2):
 
70
  super().__init__()
71
  self.denoise_fn = denoise_fn
72
  self.out_dims = out_dims
@@ -78,7 +81,7 @@ class GaussianDiffusion(nn.Module):
78
 
79
  timesteps, = betas.shape
80
  self.num_timesteps = int(timesteps)
81
- self.k_step = k_step
82
 
83
  self.noise_list = deque(maxlen=4)
84
 
@@ -139,6 +142,18 @@ class GaussianDiffusion(nn.Module):
139
  model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
140
  return model_mean, posterior_variance, posterior_log_variance
141
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  @torch.no_grad()
143
  def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
144
  b, *_, device = *x.shape, x.device
@@ -239,8 +254,12 @@ class GaussianDiffusion(nn.Module):
239
  x = self.q_sample(x_start=norm_spec, t=torch.tensor([t - 1], device=device).long())
240
 
241
  if method is not None and infer_speedup > 1:
242
- if method == 'dpm-solver':
243
- from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
 
 
 
 
244
  # 1. Define the noise schedule.
245
  noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
246
 
@@ -267,17 +286,20 @@ class GaussianDiffusion(nn.Module):
267
  # (We recommend singlestep DPM-Solver for unconditional sampling)
268
  # You can adjust the `steps` to balance the computation
269
  # costs and the sample quality.
270
- dpm_solver = DPM_Solver(model_fn, noise_schedule)
271
-
 
 
 
272
  steps = t // infer_speedup
273
  if use_tqdm:
274
  self.bar = tqdm(desc="sample time step", total=steps)
275
  x = dpm_solver.sample(
276
  x,
277
  steps=steps,
278
- order=3,
279
  skip_type="time_uniform",
280
- method="singlestep",
281
  )
282
  if use_tqdm:
283
  self.bar.close()
@@ -298,6 +320,63 @@ class GaussianDiffusion(nn.Module):
298
  x, torch.full((b,), i, device=device, dtype=torch.long),
299
  infer_speedup, cond=cond
300
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  else:
302
  raise NotImplementedError(method)
303
  else:
 
1
  from collections import deque
2
  from functools import partial
3
  from inspect import isfunction
4
+
 
5
  import numpy as np
6
  import torch
7
+ import torch.nn.functional as F
8
  from torch import nn
9
  from tqdm import tqdm
10
 
 
26
 
27
 
28
  def noise_like(shape, device, repeat=False):
29
+ def repeat_noise():
30
+ return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
31
+ def noise():
32
+ return torch.randn(shape, device=device)
33
  return repeat_noise() if repeat else noise()
34
 
35
 
 
69
  max_beta=0.02,
70
  spec_min=-12,
71
  spec_max=2):
72
+
73
  super().__init__()
74
  self.denoise_fn = denoise_fn
75
  self.out_dims = out_dims
 
81
 
82
  timesteps, = betas.shape
83
  self.num_timesteps = int(timesteps)
84
+ self.k_step = k_step if k_step>0 and k_step<timesteps else timesteps
85
 
86
  self.noise_list = deque(maxlen=4)
87
 
 
142
  model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
143
  return model_mean, posterior_variance, posterior_log_variance
144
 
145
+ @torch.no_grad()
146
+ def p_sample_ddim(self, x, t, interval, cond):
147
+ """
148
+ Use the DDIM sampling method (Denoising Diffusion Implicit Models, Song et al., 2020)
149
+ """
150
+ a_t = extract(self.alphas_cumprod, t, x.shape)
151
+ a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
152
+
153
+ noise_pred = self.denoise_fn(x, t, cond=cond)
154
+ x_prev = a_prev.sqrt() * (x / a_t.sqrt() + (((1 - a_prev) / a_prev).sqrt()-((1 - a_t) / a_t).sqrt()) * noise_pred)
155
+ return x_prev
156
+
157
  @torch.no_grad()
158
  def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
159
  b, *_, device = *x.shape, x.device
 
         x = self.q_sample(x_start=norm_spec, t=torch.tensor([t - 1], device=device).long())

         if method is not None and infer_speedup > 1:
+            if method == 'dpm-solver' or method == 'dpm-solver++':
+                from .dpm_solver_pytorch import (
+                    DPM_Solver,
+                    NoiseScheduleVP,
+                    model_wrapper,
+                )
                 # 1. Define the noise schedule.
                 noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])

                 # (We recommend singlestep DPM-Solver for unconditional sampling)
                 # You can adjust the `steps` to balance the computation
                 # costs and the sample quality.
+                if method == 'dpm-solver':
+                    dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
+                elif method == 'dpm-solver++':
+                    dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
+
                 steps = t // infer_speedup
                 if use_tqdm:
                     self.bar = tqdm(desc="sample time step", total=steps)
                 x = dpm_solver.sample(
                     x,
                     steps=steps,
+                    order=2,
                     skip_type="time_uniform",
+                    method="multistep",
                 )
                 if use_tqdm:
                     self.bar.close()

                         x, torch.full((b,), i, device=device, dtype=torch.long),
                         infer_speedup, cond=cond
                     )
+            elif method == 'ddim':
+                if use_tqdm:
+                    for i in tqdm(
+                            reversed(range(0, t, infer_speedup)), desc='sample time step',
+                            total=t // infer_speedup,
+                    ):
+                        x = self.p_sample_ddim(
+                            x, torch.full((b,), i, device=device, dtype=torch.long),
+                            infer_speedup, cond=cond
+                        )
+                else:
+                    for i in reversed(range(0, t, infer_speedup)):
+                        x = self.p_sample_ddim(
+                            x, torch.full((b,), i, device=device, dtype=torch.long),
+                            infer_speedup, cond=cond
+                        )
+            elif method == 'unipc':
+                from .uni_pc import NoiseScheduleVP, UniPC, model_wrapper
+                # 1. Define the noise schedule.
+                noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
+
+                # 2. Convert your discrete-time `model` to the continuous-time
+                # noise prediction model. Here is an example for a diffusion model
+                # `model` with the noise prediction type ("noise").
+                def my_wrapper(fn):
+                    def wrapped(x, t, **kwargs):
+                        ret = fn(x, t, **kwargs)
+                        if use_tqdm:
+                            self.bar.update(1)
+                        return ret
+
+                    return wrapped
+
+                model_fn = model_wrapper(
+                    my_wrapper(self.denoise_fn),
+                    noise_schedule,
+                    model_type="noise",  # or "x_start" or "v" or "score"
+                    model_kwargs={"cond": cond}
+                )
+
+                # 3. Define uni_pc and sample by multistep UniPC.
+                # You can adjust the `steps` to balance the computation
+                # costs and the sample quality.
+                uni_pc = UniPC(model_fn, noise_schedule, variant='bh2')
+
+                steps = t // infer_speedup
+                if use_tqdm:
+                    self.bar = tqdm(desc="sample time step", total=steps)
+                x = uni_pc.sample(
+                    x,
+                    steps=steps,
+                    order=2,
+                    skip_type="time_uniform",
+                    method="multistep",
+                )
+                if use_tqdm:
+                    self.bar.close()
             else:
                 raise NotImplementedError(method)
         else:
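Taken together, these hunks extend the `method` dispatch from {'dpm-solver', 'pndm'} to also accept 'dpm-solver++', 'ddim', and 'unipc'. A hedged usage sketch follows; the enclosing `forward` signature sits outside the hunks shown, so the argument names below are assumptions based on the visible code:

```python
# Hypothetical call into the GaussianDiffusion module with the new sampler
# choices. infer_speedup=10 turns e.g. 1000 diffusion steps into 100 solver steps.
mel = diffusion_model(
    cond,                # conditioning features
    infer=True,
    infer_speedup=10,
    method='unipc',      # or 'dpm-solver', 'dpm-solver++', 'ddim', 'pndm'
    use_tqdm=True,
)
```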
diffusion/diffusion_onnx.py CHANGED
@@ -1,15 +1,14 @@
 from collections import deque
 from functools import partial
 from inspect import isfunction
-import torch.nn.functional as F
-import librosa.sequence
 import numpy as np
-from torch.nn import Conv1d
-from torch.nn import Mish
 import torch
 from torch import nn
 from tqdm import tqdm
-import math


 def exists(x):
@@ -27,8 +26,10 @@ def extract(a, t):


 def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
-    noise = lambda: torch.randn(shape, device=device)
     return repeat_noise() if repeat else noise()


@@ -389,7 +390,11 @@ class GaussianDiffusion(nn.Module):

         if method is not None and infer_speedup > 1:
             if method == 'dpm-solver':
-                from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
                 # 1. Define the noise schedule.
                 noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])

@@ -576,9 +581,6 @@ class GaussianDiffusion(nn.Module):
         plms_noise_stage = torch.tensor(0, dtype=torch.long, device=device)
         noise_list = torch.zeros((0, 1, 1, self.mel_bins, n_frames), device=device)

-        ot = step_range[0]
-        ot_1 = torch.full((1,), ot, device=device, dtype=torch.long)
-
         for t in step_range:
             t_1 = torch.full((1,), t, device=device, dtype=torch.long)
             noise_pred = self.denoise_fn(x, t_1, cond)
 
+import math
 from collections import deque
 from functools import partial
 from inspect import isfunction
+
 import numpy as np
 import torch
+import torch.nn.functional as F
 from torch import nn
+from torch.nn import Conv1d, Mish
 from tqdm import tqdm


 def exists(x):

 def noise_like(shape, device, repeat=False):
+    def repeat_noise():
+        return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+    def noise():
+        return torch.randn(shape, device=device)
     return repeat_noise() if repeat else noise()

         if method is not None and infer_speedup > 1:
             if method == 'dpm-solver':
+                from .dpm_solver_pytorch import (
+                    DPM_Solver,
+                    NoiseScheduleVP,
+                    model_wrapper,
+                )
                 # 1. Define the noise schedule.
                 noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])

         plms_noise_stage = torch.tensor(0, dtype=torch.long, device=device)
         noise_list = torch.zeros((0, 1, 1, self.mel_bins, n_frames), device=device)

         for t in step_range:
             t_1 = torch.full((1,), t, device=device, dtype=torch.long)
             noise_pred = self.denoise_fn(x, t_1, cond)
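The lambda-to-inner-def rewrite of `noise_like` in both files is behavior-preserving: `repeat=True` draws one noise sample and tiles it over the batch, `repeat=False` draws independent noise per element. A minimal standalone check (my example, not in the commit):

```python
import torch

def noise_like(shape, device, repeat=False):
    # repeat=True: one noise sample shared by the whole batch;
    # repeat=False: independent noise for every batch element.
    def repeat_noise():
        return torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
    def noise():
        return torch.randn(shape, device=device)
    return repeat_noise() if repeat else noise()

x = noise_like((4, 1, 100, 128), device="cpu", repeat=True)
assert torch.equal(x[0], x[3])  # identical noise across the batch
```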
diffusion/dpm_solver_pytorch.py CHANGED
@@ -1,5 +1,3 @@
-import math
-
 import torch


@@ -11,7 +9,8 @@ class NoiseScheduleVP:
             alphas_cumprod=None,
             continuous_beta_0=0.1,
             continuous_beta_1=20.,
-    ):
         """Create a wrapper class for the forward SDE (VP type).

         ***
@@ -46,7 +45,7 @@ class NoiseScheduleVP:
             betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
             alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)

-        Note that we always have alphas_cumprod = cumprod(betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.

         **Important**: Please pay special attention to the args for `alphas_cumprod`:
         The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
@@ -59,21 +58,19 @@ class NoiseScheduleVP:

         2. For continuous-time DPMs:

-        We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
-        schedule are the default settings in DDPM and improved-DDPM:

         Args:
             beta_min: A `float` number. The smallest beta for the linear schedule.
             beta_max: A `float` number. The largest beta for the linear schedule.
-            cosine_s: A `float` number. The hyperparameter in the cosine schedule.
-            cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
             T: A `float` number. The ending time of the forward process.

         ===============================================================

         Args:
             schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
-                'linear' or 'cosine' for continuous-time DPMs.
         Returns:
             A wrapper object of the forward SDE (VP type).

@@ -92,10 +89,8 @@ class NoiseScheduleVP:

         """

-        if schedule not in ['discrete', 'linear', 'cosine']:
-            raise ValueError(
-                "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
-                    schedule))

         self.schedule = schedule
         if schedule == 'discrete':
@@ -104,40 +99,37 @@ class NoiseScheduleVP:
             else:
                 assert alphas_cumprod is not None
                 log_alphas = 0.5 * torch.log(alphas_cumprod)
-            self.total_N = len(log_alphas)
             self.T = 1.
-            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1))
-            self.log_alpha_array = log_alphas.reshape((1, -1,))
         else:
             self.total_N = 1000
             self.beta_0 = continuous_beta_0
             self.beta_1 = continuous_beta_1
-            self.cosine_s = 0.008
-            self.cosine_beta_max = 999.
-            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (
-                    1. + self.cosine_s) / math.pi - self.cosine_s
-            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
-            self.schedule = schedule
-            if schedule == 'cosine':
-                # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
-                # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
-                self.T = 0.9946
-            else:
-                self.T = 1.

     def marginal_log_mean_coeff(self, t):
         """
         Compute log(alpha_t) of a given continuous-time label t in [0, T].
         """
         if self.schedule == 'discrete':
-            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device),
-                                  self.log_alpha_array.to(t.device)).reshape((-1))
         elif self.schedule == 'linear':
             return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
-        elif self.schedule == 'cosine':
-            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
-            log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
-            return log_alpha_t

     def marginal_alpha(self, t):
         """
@@ -165,32 +157,25 @@ class NoiseScheduleVP:
         """
         if self.schedule == 'linear':
             tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            Delta = self.beta_0 ** 2 + tmp
             return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
         elif self.schedule == 'discrete':
             log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
-            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]),
-                               torch.flip(self.t_array.to(lamb.device), [1]))
             return t.reshape((-1,))
-        else:
-            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
-            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (
-                    1. + self.cosine_s) / math.pi - self.cosine_s
-            t = t_fn(log_alpha)
-            return t


 def model_wrapper(
-        model,
-        noise_schedule,
-        model_type="noise",
-        model_kwargs={},
-        guidance_type="uncond",
-        condition=None,
-        unconditional_condition=None,
-        guidance_scale=1.,
-        classifier_fn=None,
-        classifier_kwargs={},
 ):
     """Create a wrapper function for the noise prediction model.

@@ -293,8 +278,6 @@ def model_wrapper(
         return t_continuous

     def noise_pred_fn(x, t_continuous, cond=None):
-        if t_continuous.reshape((-1,)).shape[0] == 1:
-            t_continuous = t_continuous.expand((x.shape[0]))
         t_input = get_model_input_time(t_continuous)
         if cond is None:
             output = model(x, t_input, **model_kwargs)
@@ -304,16 +287,13 @@ def model_wrapper(
             return output
         elif model_type == "x_start":
             alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
-            dims = x.dim()
-            return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
         elif model_type == "v":
             alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
-            dims = x.dim()
-            return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
         elif model_type == "score":
             sigma_t = noise_schedule.marginal_std(t_continuous)
-            dims = x.dim()
-            return -expand_dims(sigma_t, dims) * output

     def cond_grad_fn(x, t_input):
         """
@@ -328,8 +308,6 @@ def model_wrapper(
         """
         The noise prediction model function that is used for DPM-Solver.
         """
-        if t_continuous.reshape((-1,)).shape[0] == 1:
-            t_continuous = t_continuous.expand((x.shape[0]))
         if guidance_type == "uncond":
             return noise_pred_fn(x, t_continuous)
         elif guidance_type == "classifier":
@@ -338,7 +316,7 @@ def model_wrapper(
             cond_grad = cond_grad_fn(x, t_input)
             sigma_t = noise_schedule.marginal_std(t_continuous)
             noise = noise_pred_fn(x, t_continuous)
-            return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
         elif guidance_type == "classifier-free":
             if guidance_scale == 1. or unconditional_condition is None:
                 return noise_pred_fn(x, t_continuous, cond=condition)
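For orientation, this is how the repo itself drives `model_wrapper` (a sketch mirroring the call shown in the `diffusion.py` hunks above; `denoise_fn`, `betas`, and `cond` stand in for the caller's objects):

```python
# Wrap a discrete-time epsilon-prediction denoiser for the solver.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=betas)
model_fn = model_wrapper(
    denoise_fn,
    noise_schedule,
    model_type="noise",           # the denoiser predicts epsilon
    model_kwargs={"cond": cond},  # passed through to the denoiser
    guidance_type="uncond",       # no classifier(-free) guidance here
)
```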
@@ -349,20 +327,34 @@ def model_wrapper(
             noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
             return noise_uncond + guidance_scale * (noise - noise_uncond)

-    assert model_type in ["noise", "x_start", "v"]
     assert guidance_type in ["uncond", "classifier", "classifier-free"]
     return model_fn


 class DPM_Solver:
-    def __init__(self, model_fn, noise_schedule, predict_x0=False, thresholding=False, max_val=1.):
         """Construct a DPM-Solver.

-        We support both the noise prediction model ("predicting epsilon") and the data prediction model ("predicting x0").
-        If `predict_x0` is False, we use the solver for the noise prediction model (DPM-Solver).
-        If `predict_x0` is True, we use the solver for the data prediction model (DPM-Solver++).
-        In such case, we further support the "dynamic thresholding" in [1] when `thresholding` is True.
-        The "dynamic thresholding" can greatly improve the sample quality for pixel-space DPMs with large guidance scales.

         Args:
             model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
@@ -370,18 +362,65 @@ class DPM_Solver:
                 def model_fn(x, t_continuous):
                     return noise
                 ``
             noise_schedule: A noise schedule object, such as NoiseScheduleVP.
-            predict_x0: A `bool`. If true, use the data prediction model; else, use the noise prediction model.
-            thresholding: A `bool`. Valid when `predict_x0` is True. Whether to use the "dynamic thresholding" in [1].
-            max_val: A `float`. Valid when both `predict_x0` and `thresholding` are True. The max value for thresholding.
-
-        [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
         """
-        self.model = model_fn
         self.noise_schedule = noise_schedule
-        self.predict_x0 = predict_x0
-        self.thresholding = thresholding
-        self.max_val = max_val

     def noise_prediction_fn(self, x, t):
         """
@@ -391,24 +430,20 @@ class DPM_Solver:

     def data_prediction_fn(self, x, t):
         """
-        Return the data prediction model (with thresholding).
         """
         noise = self.noise_prediction_fn(x, t)
-        dims = x.dim()
         alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
-        x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims)
-        if self.thresholding:
-            p = 0.995  # A hyperparameter in the paper of "Imagen" [1].
-            s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
-            s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims)
-            x0 = torch.clamp(x0, -s, s) / s
         return x0

     def model_fn(self, x, t):
         """
         Convert the model to the noise prediction model or the data prediction model.
         """
-        if self.predict_x0:
             return self.data_prediction_fn(x, t)
         else:
             return self.noise_prediction_fn(x, t)
@@ -437,11 +472,10 @@ class DPM_Solver:
             return torch.linspace(t_T, t_0, N + 1).to(device)
         elif skip_type == 'time_quadratic':
             t_order = 2
-            t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device)
             return t
         else:
-            raise ValueError(
-                "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))

     def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
         """
@@ -478,32 +512,31 @@ class DPM_Solver:
         if order == 3:
             K = steps // 3 + 1
             if steps % 3 == 0:
-                orders = [3, ] * (K - 2) + [2, 1]
             elif steps % 3 == 1:
-                orders = [3, ] * (K - 1) + [1]
             else:
-                orders = [3, ] * (K - 1) + [2]
         elif order == 2:
             if steps % 2 == 0:
                 K = steps // 2
-                orders = [2, ] * K
             else:
                 K = steps // 2 + 1
-                orders = [2, ] * (K - 1) + [1]
         elif order == 1:
             K = 1
-            orders = [1, ] * steps
         else:
             raise ValueError("'order' must be '1' or '2' or '3'.")
         if skip_type == 'logSNR':
             # To reproduce the results in DPM-Solver paper
             timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
         else:
-            timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[
-                torch.cumsum(torch.tensor([0, ] + orders), dim=0).to(device)]
         return timesteps_outer, orders

-    def denoise_fn(self, x, s):
         """
         Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
         """
@@ -515,8 +548,8 @@

         Args:
             x: A pytorch tensor. The initial value at time `s`.
-            s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
             model_s: A pytorch tensor. The model function evaluated at time `s`.
                 If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
             return_intermediate: A `bool`. If true, also return the model value at time `s`.
@@ -524,20 +557,19 @@
             x_t: A pytorch tensor. The approximated solution at time `t`.
         """
         ns = self.noise_schedule
-        dims = x.dim()
         lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
         h = lambda_t - lambda_s
         log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
         sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)

-        if self.predict_x0:
             phi_1 = torch.expm1(-h)
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
-                expand_dims(sigma_t / sigma_s, dims) * x
-                - expand_dims(alpha_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
@@ -548,70 +580,66 @@
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_t = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                - expand_dims(sigma_t * phi_1, dims) * model_s
             )
             if return_intermediate:
                 return x_t, {'model_s': model_s}
             else:
                 return x_t

-    def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False,
-                                            solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-2 from time `s` to time `t`.

         Args:
             x: A pytorch tensor. The initial value at time `s`.
-            s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
             r1: A `float`. The hyperparameter of the second-order solver.
             model_s: A pytorch tensor. The model function evaluated at time `s`.
                 If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
             return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
         Returns:
             x_t: A pytorch tensor. The approximated solution at time `t`.
         """
-        if solver_type not in ['dpm_solver', 'taylor']:
-            raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
         if r1 is None:
             r1 = 0.5
         ns = self.noise_schedule
-        dims = x.dim()
         lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
         h = lambda_t - lambda_s
         lambda_s1 = lambda_s + r1 * h
         s1 = ns.inverse_lambda(lambda_s1)
-        log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(
-            s1), ns.marginal_log_mean_coeff(t)
         sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
         alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)

-        if self.predict_x0:
             phi_11 = torch.expm1(-r1 * h)
             phi_1 = torch.expm1(-h)

             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
-                expand_dims(sigma_s1 / sigma_s, dims) * x
-                - expand_dims(alpha_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
-            if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * (
-                        model_s1 - model_s)
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
@@ -620,36 +648,35 @@
             if model_s is None:
                 model_s = self.model_fn(x, s)
             x_s1 = (
-                expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
-                - expand_dims(sigma_s1 * phi_11, dims) * model_s
             )
             model_s1 = self.model_fn(x_s1, s1)
-            if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s)
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s)
                 )
         if return_intermediate:
             return x_t, {'model_s': model_s, 'model_s1': model_s1}
         else:
             return x_t

-    def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
-                                           return_intermediate=False, solver_type='dpm_solver'):
         """
         Singlestep solver DPM-Solver-3 from time `s` to time `t`.

         Args:
             x: A pytorch tensor. The initial value at time `s`.
-            s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
             r1: A `float`. The hyperparameter of the third-order solver.
             r2: A `float`. The hyperparameter of the third-order solver.
             model_s: A pytorch tensor. The model function evaluated at time `s`.
@@ -657,32 +684,29 @@
             model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
                 If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
             return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
         Returns:
             x_t: A pytorch tensor. The approximated solution at time `t`.
         """
-        if solver_type not in ['dpm_solver', 'taylor']:
-            raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
         if r1 is None:
             r1 = 1. / 3.
         if r2 is None:
             r2 = 2. / 3.
         ns = self.noise_schedule
-        dims = x.dim()
         lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
         h = lambda_t - lambda_s
         lambda_s1 = lambda_s + r1 * h
         lambda_s2 = lambda_s + r2 * h
         s1 = ns.inverse_lambda(lambda_s1)
         s2 = ns.inverse_lambda(lambda_s2)
-        log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(
-            s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
-        sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(
-            s2), ns.marginal_std(t)
         alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)

-        if self.predict_x0:
             phi_11 = torch.expm1(-r1 * h)
             phi_12 = torch.expm1(-r2 * h)
             phi_1 = torch.expm1(-h)
@@ -694,21 +718,21 @@
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
-                    expand_dims(sigma_s1 / sigma_s, dims) * x
-                    - expand_dims(alpha_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
-                expand_dims(sigma_s2 / sigma_s, dims) * x
-                - expand_dims(alpha_s2 * phi_12, dims) * model_s
-                + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
-            if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
@@ -716,10 +740,10 @@
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
-                    expand_dims(sigma_t / sigma_s, dims) * x
-                    - expand_dims(alpha_t * phi_1, dims) * model_s
-                    + expand_dims(alpha_t * phi_2, dims) * D1
-                    - expand_dims(alpha_t * phi_3, dims) * D2
                 )
         else:
             phi_11 = torch.expm1(r1 * h)
@@ -733,21 +757,21 @@
                 model_s = self.model_fn(x, s)
             if model_s1 is None:
                 x_s1 = (
-                    expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x
-                    - expand_dims(sigma_s1 * phi_11, dims) * model_s
                 )
                 model_s1 = self.model_fn(x_s1, s1)
             x_s2 = (
-                expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x
-                - expand_dims(sigma_s2 * phi_12, dims) * model_s
-                - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s)
             )
             model_s2 = self.model_fn(x_s2, s2)
-            if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - (1. / r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s)
                 )
             elif solver_type == 'taylor':
                 D1_0 = (1. / r1) * (model_s1 - model_s)
@@ -755,10 +779,10 @@
                 D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
                 D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x
-                    - expand_dims(sigma_t * phi_1, dims) * model_s
-                    - expand_dims(sigma_t * phi_2, dims) * D1
-                    - expand_dims(sigma_t * phi_3, dims) * D2
                 )

         if return_intermediate:
@@ -766,28 +790,26 @@
         else:
             return x_t

-    def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpm_solver"):
         """
         Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.

         Args:
             x: A pytorch tensor. The initial value at time `s`.
             model_prev_list: A list of pytorch tensor. The previous computed model values.
-            t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
         Returns:
             x_t: A pytorch tensor. The approximated solution at time `t`.
         """
-        if solver_type not in ['dpm_solver', 'taylor']:
-            raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
         ns = self.noise_schedule
-        dims = x.dim()
-        model_prev_1, model_prev_0 = model_prev_list
-        t_prev_1, t_prev_0 = t_prev_list
-        lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(
-            t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
@@ -795,55 +817,55 @@
         h_0 = lambda_prev_0 - lambda_prev_1
         h = lambda_t - lambda_prev_0
         r0 = h_0 / h
-        D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1)
-        if self.predict_x0:
-            if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(sigma_t / sigma_prev_0, dims) * x
-                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                    - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(sigma_t / sigma_prev_0, dims) * x
-                    - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                    + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
                 )
         else:
-            if solver_type == 'dpm_solver':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                    - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
                 )
             elif solver_type == 'taylor':
                 x_t = (
-                    expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                    - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                    - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
                 )
         return x_t

-    def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpm_solver'):
         """
         Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.

         Args:
             x: A pytorch tensor. The initial value at time `s`.
             model_prev_list: A list of pytorch tensor. The previous computed model values.
-            t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
         Returns:
             x_t: A pytorch tensor. The approximated solution at time `t`.
         """
         ns = self.noise_schedule
-        dims = x.dim()
         model_prev_2, model_prev_1, model_prev_0 = model_prev_list
         t_prev_2, t_prev_1, t_prev_0 = t_prev_list
-        lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(
-            t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
         log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
         sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
         alpha_t = torch.exp(log_alpha_t)
@@ -852,39 +874,44 @@
         h_0 = lambda_prev_0 - lambda_prev_1
         h = lambda_t - lambda_prev_0
         r0, r1 = h_0 / h, h_1 / h
-        D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1)
-        D1_1 = expand_dims(1. / r1, dims) * (model_prev_1 - model_prev_2)
-        D1 = D1_0 + expand_dims(r0 / (r0 + r1), dims) * (D1_0 - D1_1)
-        D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1)
-        if self.predict_x0:
             x_t = (
-                expand_dims(sigma_t / sigma_prev_0, dims) * x
-                - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
-                + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1
-                - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h ** 2 - 0.5), dims) * D2
             )
         else:
             x_t = (
-                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
-                - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
-                - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1
-                - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h ** 2 - 0.5), dims) * D2
             )
         return x_t

-    def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None,
-                                     r2=None):
         """
         Singlestep DPM-Solver with the order `order` from time `s` to time `t`.

         Args:
             x: A pytorch tensor. The initial value at time `s`.
-            s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
             order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
             return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
             r1: A `float`. The hyperparameter of the second-order or third-order solver.
             r2: A `float`. The hyperparameter of the third-order solver.
         Returns:
@@ -893,26 +920,24 @@
         if order == 1:
             return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
         elif order == 2:
-            return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate,
-                                                            solver_type=solver_type, r1=r1)
         elif order == 3:
-            return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate,
-                                                           solver_type=solver_type, r1=r1, r2=r2)
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))

-    def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpm_solver'):
         """
         Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.

         Args:
             x: A pytorch tensor. The initial value at time `s`.
             model_prev_list: A list of pytorch tensor. The previous computed model values.
-            t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
-            t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
             order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
         Returns:
             x_t: A pytorch tensor. The approximated solution at time `t`.
         """
@@ -925,8 +950,7 @@
         else:
             raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))

-    def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
-                            solver_type='dpm_solver'):
         """
         The adaptive step size solver based on singlestep DPM-Solver.

@@ -941,15 +965,15 @@
             theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
             t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
                 current time and `t_0` is less than `t_err`. The default setting is 1e-5.
-            solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
-                The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
         Returns:
             x_0: A pytorch tensor. The approximated solution at time `t_0`.

         [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
         """
         ns = self.noise_schedule
-        s = t_T * torch.ones((x.shape[0],)).to(x)
         lambda_s = ns.marginal_lambda(s)
         lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
         h = h_init * torch.ones_like(s).to(x)
@@ -957,18 +981,16 @@
         nfe = 0
         if order == 2:
             r1 = 0.5
-            lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
-                                                                                               solver_type=solver_type,
-                                                                                               **kwargs)
         elif order == 3:
             r1, r2 = 1. / 3., 2. / 3.
-            lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1,
-                                                                                    return_intermediate=True,
-                                                                                    solver_type=solver_type)
-            higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2,
-                                                                                              solver_type=solver_type,
-                                                                                              **kwargs)
         else:
             raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
         while torch.abs((s - t_0)).mean() > t_err:
@@ -976,7 +998,8 @@
             x_lower, lower_noise_kwargs = lower_update(x, s, t)
             x_higher = higher_update(x, s, t, **lower_noise_kwargs)
             delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
-            norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
             E = norm_fn((x_higher - x_lower) / delta).max()
             if torch.all(E <= 1.):
                 x = x_higher
@@ -988,10 +1011,45 @@
         print('adaptive solver nfe', nfe)
         return x

-    def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform',
-               method='singlestep', denoise=False, solver_type='dpm_solver', atol=0.0078,
-               rtol=0.05,
-               ):
         """
         Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.

@@ -1040,15 +1098,19 @@
         Some advice for choosing the algorithm:
             - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
-                Use singlestep DPM-Solver ("DPM-Solver-fast" in the paper) with `order = 3`.
-                e.g.
-                >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=False)
                 >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
                         skip_type='time_uniform', method='singlestep')
             - For **guided sampling with large guidance scale** by DPMs:
-                Use multistep DPM-Solver with `predict_x0 = True` and `order = 2`.
                 e.g.
-                >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True)
                 >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
                         skip_type='time_uniform', method='multistep')

@@ -1074,72 +1136,116 @@
             order: A `int`. The order of DPM-Solver.
             skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
             method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
-            denoise: A `bool`. Whether to denoise at the final step. Default is False.
-                If `denoise` is True, the total NFE is (`steps` + 1).
-            solver_type: A `str`. The taylor expansion type for the solver. `dpm_solver` or `taylor`. We recommend `dpm_solver`.
             atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
             rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
         Returns:
             x_end: A pytorch tensor. The approximated solution at time `t_end`.

         """
         t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
         t_T = self.noise_schedule.T if t_start is None else t_start
         device = x.device
-        if method == 'adaptive':
-            with torch.no_grad():
-                x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
-                                             solver_type=solver_type)
-        elif method == 'multistep':
-            assert steps >= order
-            timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
-            assert timesteps.shape[0] - 1 == steps
-            with torch.no_grad():
-                vec_t = timesteps[0].expand((x.shape[0]))
-                model_prev_list = [self.model_fn(x, vec_t)]
-                t_prev_list = [vec_t]
                 # Init the first `order` values by lower order multistep DPM-Solver.
-                for init_order in range(1, order):
-                    vec_t = timesteps[init_order].expand(x.shape[0])
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order,
-                                                         solver_type=solver_type)
-                    model_prev_list.append(self.model_fn(x, vec_t))
-                    t_prev_list.append(vec_t)
                 # Compute the remaining values by `order`-th order multistep DPM-Solver.
                 for step in range(order, steps + 1):
-                    vec_t = timesteps[step].expand(x.shape[0])
-                    x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, order,
-                                                         solver_type=solver_type)
                     for i in range(order - 1):
                         t_prev_list[i] = t_prev_list[i + 1]
                         model_prev_list[i] = model_prev_list[i + 1]
-                    t_prev_list[-1] = vec_t
                     # We do not need to evaluate the final model value.
                     if step < steps:
-                        model_prev_list[-1] = self.model_fn(x, vec_t)
-        elif method in ['singlestep', 'singlestep_fixed']:
-            if method == 'singlestep':
-                timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order,
-                                                                                              skip_type=skip_type,
-                                                                                              t_T=t_T, t_0=t_0,
-                                                                                              device=device)
-            elif method == 'singlestep_fixed':
-                K = steps // order
-                orders = [order, ] * K
-                timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
-            for i, order in enumerate(orders):
-                t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1]
-                timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(),
-                                                      N=order, device=device)
-                lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
-                vec_s, vec_t = t_T_inner.repeat(x.shape[0]), t_0_inner.repeat(x.shape[0])
-                h = lambda_inner[-1] - lambda_inner[0]
-                r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
-                r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
-                x = self.singlestep_dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type, r1=r1, r2=r2)
-        if denoise:
-            x = self.denoise_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
-        return x


#############################################################
@@ -1198,4 +1304,4 @@
     Returns:
         a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
     """
-    return v[(...,) + (None,) * (dims - 1)]
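`expand_dims` itself is unchanged apart from formatting; it right-pads a shape-(N,) tensor with singleton axes so it broadcasts against the sample batch. A quick illustration (my example, not in the commit):

```python
import torch

def expand_dims(v, dims):
    # (N,) -> (N, 1, ..., 1) with `dims` total dimensions
    return v[(...,) + (None,) * (dims - 1)]

v = torch.ones(4)
assert expand_dims(v, 4).shape == (4, 1, 1, 1)
```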
 
 
 
 import torch


             alphas_cumprod=None,
             continuous_beta_0=0.1,
             continuous_beta_1=20.,
+            dtype=torch.float32,
+    ):
         """Create a wrapper class for the forward SDE (VP type).

         ***

             betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
             alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)

+        Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.

         **Important**: Please pay special attention to the args for `alphas_cumprod`:
         The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that

         2. For continuous-time DPMs:

+        We support the linear VPSDE for the continuous time setting. The hyperparameters for the noise
+        schedule are the default settings in Yang Song's ScoreSDE:

         Args:
             beta_min: A `float` number. The smallest beta for the linear schedule.
             beta_max: A `float` number. The largest beta for the linear schedule.
             T: A `float` number. The ending time of the forward process.

         ===============================================================

         Args:
             schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
+                'linear' for continuous-time DPMs.
         Returns:
             A wrapper object of the forward SDE (VP type).

         """

+        if schedule not in ['discrete', 'linear']:
+            raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear'".format(schedule))

         self.schedule = schedule
         if schedule == 'discrete':

             else:
                 assert alphas_cumprod is not None
                 log_alphas = 0.5 * torch.log(alphas_cumprod)
             self.T = 1.
+            self.log_alpha_array = self.numerical_clip_alpha(log_alphas).reshape((1, -1,)).to(dtype=dtype)
+            self.total_N = self.log_alpha_array.shape[1]
+            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
         else:
+            self.T = 1.
             self.total_N = 1000
             self.beta_0 = continuous_beta_0
             self.beta_1 = continuous_beta_1
+
+    def numerical_clip_alpha(self, log_alphas, clipped_lambda=-5.1):
+        """
+        For some beta schedules such as the cosine schedule, the log-SNR has numerical issues.
+        We clip the log-SNR near t=T within -5.1 to ensure stability.
+        Such a trick is very useful for diffusion models with the cosine schedule, such as i-DDPM, guided-diffusion and GLIDE.
+        """
+        log_sigmas = 0.5 * torch.log(1. - torch.exp(2. * log_alphas))
+        lambs = log_alphas - log_sigmas
+        idx = torch.searchsorted(torch.flip(lambs, [0]), clipped_lambda)
+        if idx > 0:
+            log_alphas = log_alphas[:-idx]
+        return log_alphas

     def marginal_log_mean_coeff(self, t):
         """
         Compute log(alpha_t) of a given continuous-time label t in [0, T].
         """
         if self.schedule == 'discrete':
+            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
         elif self.schedule == 'linear':
             return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0

     def marginal_alpha(self, t):
         """
 
         """
         if self.schedule == 'linear':
             tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            Delta = self.beta_0**2 + tmp
             return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
         elif self.schedule == 'discrete':
             log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
+            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
             return t.reshape((-1,))


 def model_wrapper(
+    model,
+    noise_schedule,
+    model_type="noise",
+    model_kwargs={},
+    guidance_type="uncond",
+    condition=None,
+    unconditional_condition=None,
+    guidance_scale=1.,
+    classifier_fn=None,
+    classifier_kwargs={},
 ):
     """Create a wrapper function for the noise prediction model.

         return t_continuous

     def noise_pred_fn(x, t_continuous, cond=None):
         t_input = get_model_input_time(t_continuous)
         if cond is None:
             output = model(x, t_input, **model_kwargs)

             return output
         elif model_type == "x_start":
             alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
+            return (x - expand_dims(alpha_t, x.dim()) * output) / expand_dims(sigma_t, x.dim())
         elif model_type == "v":
             alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
+            return expand_dims(alpha_t, x.dim()) * output + expand_dims(sigma_t, x.dim()) * x
         elif model_type == "score":
             sigma_t = noise_schedule.marginal_std(t_continuous)
+            return -expand_dims(sigma_t, x.dim()) * output

     def cond_grad_fn(x, t_input):
         """

         """
         The noise prediction model function that is used for DPM-Solver.
         """
         if guidance_type == "uncond":
             return noise_pred_fn(x, t_continuous)
         elif guidance_type == "classifier":

             cond_grad = cond_grad_fn(x, t_input)
             sigma_t = noise_schedule.marginal_std(t_continuous)
             noise = noise_pred_fn(x, t_continuous)
+            return noise - guidance_scale * expand_dims(sigma_t, x.dim()) * cond_grad
         elif guidance_type == "classifier-free":
             if guidance_scale == 1. or unconditional_condition is None:
                 return noise_pred_fn(x, t_continuous, cond=condition)

             noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
             return noise_uncond + guidance_scale * (noise - noise_uncond)

+    assert model_type in ["noise", "x_start", "v", "score"]
     assert guidance_type in ["uncond", "classifier", "classifier-free"]
     return model_fn


 class DPM_Solver:
+    def __init__(
+        self,
+        model_fn,
+        noise_schedule,
+        algorithm_type="dpmsolver++",
+        correcting_x0_fn=None,
+        correcting_xt_fn=None,
+        thresholding_max_val=1.,
+        dynamic_thresholding_ratio=0.995,
+    ):
         """Construct a DPM-Solver.

+        We support both DPM-Solver (`algorithm_type="dpmsolver"`) and DPM-Solver++ (`algorithm_type="dpmsolver++"`).
+
+        We also support the "dynamic thresholding" method in Imagen[1]. For pixel-space diffusion models, you
+        can set both `algorithm_type="dpmsolver++"` and `correcting_x0_fn="dynamic_thresholding"` to use the
+        dynamic thresholding. The "dynamic thresholding" can greatly improve the sample quality for pixel-space
+        DPMs with large guidance scales. Note that the thresholding method is **unsuitable** for latent-space
+        DPMs (such as stable-diffusion).
+
+        To support advanced algorithms in image-to-image applications, we also support corrector functions for
+        both x0 and xt.

         Args:
             model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):

                 def model_fn(x, t_continuous):
                     return noise
                 ``
+                The shape of `x` is `(batch_size, **shape)`, and the shape of `t_continuous` is `(batch_size,)`.
             noise_schedule: A noise schedule object, such as NoiseScheduleVP.
+            algorithm_type: A `str`. Either "dpmsolver" or "dpmsolver++".
+            correcting_x0_fn: A `str` or a function with the following format:
+                ```
+                def correcting_x0_fn(x0, t):
+                    x0_new = ...
+                    return x0_new
+                ```
+                This function is to correct the outputs of the data prediction model at each sampling step. e.g.,
+                ```
+                x0_pred = data_pred_model(xt, t)
+                if correcting_x0_fn is not None:
+                    x0_pred = correcting_x0_fn(x0_pred, t)
+                xt_1 = update(x0_pred, xt, t)
+                ```
+                If `correcting_x0_fn="dynamic_thresholding"`, we use the dynamic thresholding proposed in Imagen[1].
+            correcting_xt_fn: A function with the following format:
+                ```
+                def correcting_xt_fn(xt, t, step):
+                    x_new = ...
+                    return x_new
+                ```
+                This function is to correct the intermediate samples xt at each sampling step. e.g.,
+                ```
+                xt = ...
+                xt = correcting_xt_fn(xt, t, step)
+                ```
+            thresholding_max_val: A `float`. The max value for thresholding.
+                Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
+            dynamic_thresholding_ratio: A `float`. The ratio for dynamic thresholding (see Imagen[1] for details).
+                Valid only when use `dpmsolver++` and `correcting_x0_fn="dynamic_thresholding"`.
+
+        [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour,
+            Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models
+            with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
         """
+        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
         self.noise_schedule = noise_schedule
+        assert algorithm_type in ["dpmsolver", "dpmsolver++"]
+        self.algorithm_type = algorithm_type
+        if correcting_x0_fn == "dynamic_thresholding":
+            self.correcting_x0_fn = self.dynamic_thresholding_fn
+        else:
+            self.correcting_x0_fn = correcting_x0_fn
+        self.correcting_xt_fn = correcting_xt_fn
+        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
+        self.thresholding_max_val = thresholding_max_val
+
+    def dynamic_thresholding_fn(self, x0, t):
+        """
+        The dynamic thresholding method.
+        """
+        dims = x0.dim()
+        p = self.dynamic_thresholding_ratio
+        s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
+        s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
+        x0 = torch.clamp(x0, -s, s) / s
+        return x0

     def noise_prediction_fn(self, x, t):
         """

     def data_prediction_fn(self, x, t):
         """
+        Return the data prediction model (with corrector).
         """
         noise = self.noise_prediction_fn(x, t)
         alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
+        x0 = (x - sigma_t * noise) / alpha_t
+        if self.correcting_x0_fn is not None:
+            x0 = self.correcting_x0_fn(x0, t)
         return x0

     def model_fn(self, x, t):
         """
         Convert the model to the noise prediction model or the data prediction model.
         """
+        if self.algorithm_type == "dpmsolver++":
             return self.data_prediction_fn(x, t)
         else:
             return self.noise_prediction_fn(x, t)
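With the corrector plumbing above, Imagen-style dynamic thresholding becomes an opt-in flag rather than a constructor boolean. A hedged construction sketch, grounded in the new `__init__` signature (useful for pixel-space models; this repo's mel-spectrogram diffusion does not enable it):

```python
dpm_solver = DPM_Solver(
    model_fn, noise_schedule,
    algorithm_type="dpmsolver++",
    correcting_x0_fn="dynamic_thresholding",  # clamp x0 predictions each step
    dynamic_thresholding_ratio=0.995,
    thresholding_max_val=1.,
)
```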
 
472
  return torch.linspace(t_T, t_0, N + 1).to(device)
473
  elif skip_type == 'time_quadratic':
474
  t_order = 2
475
+ t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
476
  return t
477
  else:
478
+ raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
 
479
 
480
  def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
481
  """
 
512
  if order == 3:
513
  K = steps // 3 + 1
514
  if steps % 3 == 0:
515
+ orders = [3,] * (K - 2) + [2, 1]
516
  elif steps % 3 == 1:
517
+ orders = [3,] * (K - 1) + [1]
518
  else:
519
+ orders = [3,] * (K - 1) + [2]
520
  elif order == 2:
521
  if steps % 2 == 0:
522
  K = steps // 2
523
+ orders = [2,] * K
524
  else:
525
  K = steps // 2 + 1
526
+ orders = [2,] * (K - 1) + [1]
527
  elif order == 1:
528
  K = 1
529
+ orders = [1,] * steps
530
  else:
531
  raise ValueError("'order' must be '1' or '2' or '3'.")
532
  if skip_type == 'logSNR':
533
  # To reproduce the results in DPM-Solver paper
534
  timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
535
  else:
536
+ timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders), 0).to(device)]
 
537
  return timesteps_outer, orders
538
 
539
+ def denoise_to_zero_fn(self, x, s):
540
  """
541
  Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
542
  """
 
548
 
549
  Args:
550
  x: A pytorch tensor. The initial value at time `s`.
551
+ s: A pytorch tensor. The starting time, with the shape (1,).
552
+ t: A pytorch tensor. The ending time, with the shape (1,).
553
  model_s: A pytorch tensor. The model function evaluated at time `s`.
554
  If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
555
  return_intermediate: A `bool`. If true, also return the model value at time `s`.
 
557
  x_t: A pytorch tensor. The approximated solution at time `t`.
558
  """
559
  ns = self.noise_schedule
 
560
  lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
561
  h = lambda_t - lambda_s
562
  log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
563
  sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
564
  alpha_t = torch.exp(log_alpha_t)
565
 
566
+ if self.algorithm_type == "dpmsolver++":
567
  phi_1 = torch.expm1(-h)
568
  if model_s is None:
569
  model_s = self.model_fn(x, s)
570
  x_t = (
571
+ sigma_t / sigma_s * x
572
+ - alpha_t * phi_1 * model_s
573
  )
574
  if return_intermediate:
575
  return x_t, {'model_s': model_s}
 
580
  if model_s is None:
581
  model_s = self.model_fn(x, s)
582
  x_t = (
583
+ torch.exp(log_alpha_t - log_alpha_s) * x
584
+ - (sigma_t * phi_1) * model_s
585
  )
586
  if return_intermediate:
587
  return x_t, {'model_s': model_s}
588
  else:
589
  return x_t
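In formulas, with $h = \lambda_t - \lambda_s$, the two branches above compute the data-prediction and noise-prediction updates respectively:

$$x_t = \frac{\sigma_t}{\sigma_s}\,x_s - \alpha_t\left(e^{-h}-1\right)x_\theta(x_s, s) \quad\text{(dpmsolver++)}, \qquad x_t = \frac{\alpha_t}{\alpha_s}\,x_s - \sigma_t\left(e^{h}-1\right)\epsilon_\theta(x_s, s) \quad\text{(dpmsolver)}.$$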
590
 
591
+ def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, solver_type='dpmsolver'):
 
592
  """
593
  Singlestep solver DPM-Solver-2 from time `s` to time `t`.
594
 
595
  Args:
596
  x: A pytorch tensor. The initial value at time `s`.
597
+ s: A pytorch tensor. The starting time, with the shape (1,).
598
+ t: A pytorch tensor. The ending time, with the shape (1,).
599
  r1: A `float`. The hyperparameter of the second-order solver.
600
  model_s: A pytorch tensor. The model function evaluated at time `s`.
601
  If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
602
  return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
603
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
604
+ The type slightly affects performance. We recommend using the 'dpmsolver' type.
605
  Returns:
606
  x_t: A pytorch tensor. The approximated solution at time `t`.
607
  """
608
+ if solver_type not in ['dpmsolver', 'taylor']:
609
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
610
  if r1 is None:
611
  r1 = 0.5
612
  ns = self.noise_schedule
 
613
  lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
614
  h = lambda_t - lambda_s
615
  lambda_s1 = lambda_s + r1 * h
616
  s1 = ns.inverse_lambda(lambda_s1)
617
+ log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(t)
 
618
  sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
619
  alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
620
 
621
+ if self.algorithm_type == "dpmsolver++":
622
  phi_11 = torch.expm1(-r1 * h)
623
  phi_1 = torch.expm1(-h)
624
 
625
  if model_s is None:
626
  model_s = self.model_fn(x, s)
627
  x_s1 = (
628
+ (sigma_s1 / sigma_s) * x
629
+ - (alpha_s1 * phi_11) * model_s
630
  )
631
  model_s1 = self.model_fn(x_s1, s1)
632
+ if solver_type == 'dpmsolver':
633
  x_t = (
634
+ (sigma_t / sigma_s) * x
635
+ - (alpha_t * phi_1) * model_s
636
+ - (0.5 / r1) * (alpha_t * phi_1) * (model_s1 - model_s)
637
  )
638
  elif solver_type == 'taylor':
639
  x_t = (
640
+ (sigma_t / sigma_s) * x
641
+ - (alpha_t * phi_1) * model_s
642
+ + (1. / r1) * (alpha_t * (phi_1 / h + 1.)) * (model_s1 - model_s)
 
643
  )
644
  else:
645
  phi_11 = torch.expm1(r1 * h)
 
648
  if model_s is None:
649
  model_s = self.model_fn(x, s)
650
  x_s1 = (
651
+ torch.exp(log_alpha_s1 - log_alpha_s) * x
652
+ - (sigma_s1 * phi_11) * model_s
653
  )
654
  model_s1 = self.model_fn(x_s1, s1)
655
+ if solver_type == 'dpmsolver':
656
  x_t = (
657
+ torch.exp(log_alpha_t - log_alpha_s) * x
658
+ - (sigma_t * phi_1) * model_s
659
+ - (0.5 / r1) * (sigma_t * phi_1) * (model_s1 - model_s)
660
  )
661
  elif solver_type == 'taylor':
662
  x_t = (
663
+ torch.exp(log_alpha_t - log_alpha_s) * x
664
+ - (sigma_t * phi_1) * model_s
665
+ - (1. / r1) * (sigma_t * (phi_1 / h - 1.)) * (model_s1 - model_s)
666
  )
667
  if return_intermediate:
668
  return x_t, {'model_s': model_s, 'model_s1': model_s1}
669
  else:
670
  return x_t
671
 
672
+ def singlestep_dpm_solver_third_update(self, x, s, t, r1=1./3., r2=2./3., model_s=None, model_s1=None, return_intermediate=False, solver_type='dpmsolver'):
 
673
  """
674
  Singlestep solver DPM-Solver-3 from time `s` to time `t`.
675
 
676
  Args:
677
  x: A pytorch tensor. The initial value at time `s`.
678
+ s: A pytorch tensor. The starting time, with the shape (1,).
679
+ t: A pytorch tensor. The ending time, with the shape (1,).
680
  r1: A `float`. The hyperparameter of the third-order solver.
681
  r2: A `float`. The hyperparameter of the third-order solver.
682
  model_s: A pytorch tensor. The model function evaluated at time `s`.
 
684
  model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
685
  If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
686
  return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
687
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
688
+ The type slightly affects performance. We recommend using the 'dpmsolver' type.
689
  Returns:
690
  x_t: A pytorch tensor. The approximated solution at time `t`.
691
  """
692
+ if solver_type not in ['dpmsolver', 'taylor']:
693
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
694
  if r1 is None:
695
  r1 = 1. / 3.
696
  if r2 is None:
697
  r2 = 2. / 3.
698
  ns = self.noise_schedule
 
699
  lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
700
  h = lambda_t - lambda_s
701
  lambda_s1 = lambda_s + r1 * h
702
  lambda_s2 = lambda_s + r2 * h
703
  s1 = ns.inverse_lambda(lambda_s1)
704
  s2 = ns.inverse_lambda(lambda_s2)
705
+ log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
706
+ sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(s2), ns.marginal_std(t)
 
 
707
  alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
708
 
709
+ if self.algorithm_type == "dpmsolver++":
710
  phi_11 = torch.expm1(-r1 * h)
711
  phi_12 = torch.expm1(-r2 * h)
712
  phi_1 = torch.expm1(-h)
 
718
  model_s = self.model_fn(x, s)
719
  if model_s1 is None:
720
  x_s1 = (
721
+ (sigma_s1 / sigma_s) * x
722
+ - (alpha_s1 * phi_11) * model_s
723
  )
724
  model_s1 = self.model_fn(x_s1, s1)
725
  x_s2 = (
726
+ (sigma_s2 / sigma_s) * x
727
+ - (alpha_s2 * phi_12) * model_s
728
+ + r2 / r1 * (alpha_s2 * phi_22) * (model_s1 - model_s)
729
  )
730
  model_s2 = self.model_fn(x_s2, s2)
731
+ if solver_type == 'dpmsolver':
732
  x_t = (
733
+ (sigma_t / sigma_s) * x
734
+ - (alpha_t * phi_1) * model_s
735
+ + (1. / r2) * (alpha_t * phi_2) * (model_s2 - model_s)
736
  )
737
  elif solver_type == 'taylor':
738
  D1_0 = (1. / r1) * (model_s1 - model_s)
 
740
  D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
741
  D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
742
  x_t = (
743
+ (sigma_t / sigma_s) * x
744
+ - (alpha_t * phi_1) * model_s
745
+ + (alpha_t * phi_2) * D1
746
+ - (alpha_t * phi_3) * D2
747
  )
748
  else:
749
  phi_11 = torch.expm1(r1 * h)
 
757
  model_s = self.model_fn(x, s)
758
  if model_s1 is None:
759
  x_s1 = (
760
+ (torch.exp(log_alpha_s1 - log_alpha_s)) * x
761
+ - (sigma_s1 * phi_11) * model_s
762
  )
763
  model_s1 = self.model_fn(x_s1, s1)
764
  x_s2 = (
765
+ (torch.exp(log_alpha_s2 - log_alpha_s)) * x
766
+ - (sigma_s2 * phi_12) * model_s
767
+ - r2 / r1 * (sigma_s2 * phi_22) * (model_s1 - model_s)
768
  )
769
  model_s2 = self.model_fn(x_s2, s2)
770
+ if solver_type == 'dpmsolver':
771
  x_t = (
772
+ (torch.exp(log_alpha_t - log_alpha_s)) * x
773
+ - (sigma_t * phi_1) * model_s
774
+ - (1. / r2) * (sigma_t * phi_2) * (model_s2 - model_s)
775
  )
776
  elif solver_type == 'taylor':
777
  D1_0 = (1. / r1) * (model_s1 - model_s)
 
779
  D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
780
  D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
781
  x_t = (
782
+ (torch.exp(log_alpha_t - log_alpha_s)) * x
783
+ - (sigma_t * phi_1) * model_s
784
+ - (sigma_t * phi_2) * D1
785
+ - (sigma_t * phi_3) * D2
786
  )
787
 
788
  if return_intermediate:
 
790
  else:
791
  return x_t
792
 
793
+ def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpmsolver"):
794
  """
795
  Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.
796
 
797
  Args:
798
  x: A pytorch tensor. The initial value at time `s`.
799
  model_prev_list: A list of pytorch tensor. The previous computed model values.
800
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
801
+ t: A pytorch tensor. The ending time, with the shape (1,).
802
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
803
+ The type slightly affects performance. We recommend using the 'dpmsolver' type.
804
  Returns:
805
  x_t: A pytorch tensor. The approximated solution at time `t`.
806
  """
807
+ if solver_type not in ['dpmsolver', 'taylor']:
808
+ raise ValueError("'solver_type' must be either 'dpmsolver' or 'taylor', got {}".format(solver_type))
809
  ns = self.noise_schedule
810
+ model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
811
+ t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
812
+ lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
 
 
813
  log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
814
  sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
815
  alpha_t = torch.exp(log_alpha_t)
 
817
  h_0 = lambda_prev_0 - lambda_prev_1
818
  h = lambda_t - lambda_prev_0
819
  r0 = h_0 / h
820
+ D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)
821
+ if self.algorithm_type == "dpmsolver++":
822
+ phi_1 = torch.expm1(-h)
823
+ if solver_type == 'dpmsolver':
824
  x_t = (
825
+ (sigma_t / sigma_prev_0) * x
826
+ - (alpha_t * phi_1) * model_prev_0
827
+ - 0.5 * (alpha_t * phi_1) * D1_0
828
  )
829
  elif solver_type == 'taylor':
830
  x_t = (
831
+ (sigma_t / sigma_prev_0) * x
832
+ - (alpha_t * phi_1) * model_prev_0
833
+ + (alpha_t * (phi_1 / h + 1.)) * D1_0
834
  )
835
  else:
836
+ phi_1 = torch.expm1(h)
837
+ if solver_type == 'dpmsolver':
838
  x_t = (
839
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
840
+ - (sigma_t * phi_1) * model_prev_0
841
+ - 0.5 * (sigma_t * phi_1) * D1_0
842
  )
843
  elif solver_type == 'taylor':
844
  x_t = (
845
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
846
+ - (sigma_t * phi_1) * model_prev_0
847
+ - (sigma_t * (phi_1 / h - 1.)) * D1_0
848
  )
849
  return x_t
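Written out for the `dpmsolver` type in the DPM-Solver++ branch, with $h = \lambda_t - \lambda_{t_{i-1}}$, $r_0 = h_0/h$ and the first-order difference $D^{(1)}_0 = \big(x^\theta_{i-1} - x^\theta_{i-2}\big)/r_0$, the update above reads:

$$x_t = \frac{\sigma_t}{\sigma_{t_{i-1}}}\,x - \alpha_t\left(e^{-h}-1\right)x^\theta_{i-1} - \tfrac{1}{2}\,\alpha_t\left(e^{-h}-1\right)D^{(1)}_0.$$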
850
 
851
+ def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpmsolver'):
852
  """
853
  Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.
854
 
855
  Args:
856
  x: A pytorch tensor. The initial value at time `s`.
857
  model_prev_list: A list of pytorch tensor. The previous computed model values.
858
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
859
+ t: A pytorch tensor. The ending time, with the shape (1,).
860
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
861
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
862
  Returns:
863
  x_t: A pytorch tensor. The approximated solution at time `t`.
864
  """
865
  ns = self.noise_schedule
 
866
  model_prev_2, model_prev_1, model_prev_0 = model_prev_list
867
  t_prev_2, t_prev_1, t_prev_0 = t_prev_list
868
+ lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
 
869
  log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
870
  sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
871
  alpha_t = torch.exp(log_alpha_t)
 
874
  h_0 = lambda_prev_0 - lambda_prev_1
875
  h = lambda_t - lambda_prev_0
876
  r0, r1 = h_0 / h, h_1 / h
877
+ D1_0 = (1. / r0) * (model_prev_0 - model_prev_1)
878
+ D1_1 = (1. / r1) * (model_prev_1 - model_prev_2)
879
+ D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
880
+ D2 = (1. / (r0 + r1)) * (D1_0 - D1_1)
881
+ if self.algorithm_type == "dpmsolver++":
882
+ phi_1 = torch.expm1(-h)
883
+ phi_2 = phi_1 / h + 1.
884
+ phi_3 = phi_2 / h - 0.5
885
  x_t = (
886
+ (sigma_t / sigma_prev_0) * x
887
+ - (alpha_t * phi_1) * model_prev_0
888
+ + (alpha_t * phi_2) * D1
889
+ - (alpha_t * phi_3) * D2
890
  )
891
  else:
892
+ phi_1 = torch.expm1(h)
893
+ phi_2 = phi_1 / h - 1.
894
+ phi_3 = phi_2 / h - 0.5
895
  x_t = (
896
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
897
+ - (sigma_t * phi_1) * model_prev_0
898
+ - (sigma_t * phi_2) * D1
899
+ - (sigma_t * phi_3) * D2
900
  )
901
  return x_t
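The coefficients above play the role of the $\varphi$-functions of exponential integrators, built recursively from `expm1`:

$$\varphi_1 = e^{-h}-1,\qquad \varphi_2 = \frac{\varphi_1}{h}+1,\qquad \varphi_3 = \frac{\varphi_2}{h}-\frac{1}{2} \quad\text{(dpmsolver++ branch)},$$

while the plain `dpmsolver` branch uses $\varphi_1 = e^{h}-1$, $\varphi_2 = \varphi_1/h - 1$, $\varphi_3 = \varphi_2/h - 1/2$.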
902
 
903
+ def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpmsolver', r1=None, r2=None):
 
904
  """
905
  Singlestep DPM-Solver with the order `order` from time `s` to time `t`.
906
 
907
  Args:
908
  x: A pytorch tensor. The initial value at time `s`.
909
+ s: A pytorch tensor. The starting time, with the shape (1,).
910
+ t: A pytorch tensor. The ending time, with the shape (1,).
911
  order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
912
  return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
913
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
914
+ The type slightly impacts the performance. We recommend to use 'dpmsolver' type.
915
  r1: A `float`. The hyperparameter of the second-order or third-order solver.
916
  r2: A `float`. The hyperparameter of the third-order solver.
917
  Returns:
 
920
  if order == 1:
921
  return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)
922
  elif order == 2:
923
+ return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1)
 
924
  elif order == 3:
925
+ return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate, solver_type=solver_type, r1=r1, r2=r2)
 
926
  else:
927
  raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
928
 
929
+ def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpmsolver'):
930
  """
931
  Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`.
932
 
933
  Args:
934
  x: A pytorch tensor. The initial value at time `s`.
935
  model_prev_list: A list of pytorch tensor. The previous computed model values.
936
+ t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (1,)
937
+ t: A pytorch tensor. The ending time, with the shape (1,).
938
  order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
939
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
940
+ The type slightly affects performance. We recommend using the 'dpmsolver' type.
941
  Returns:
942
  x_t: A pytorch tensor. The approximated solution at time `t`.
943
  """
 
950
  else:
951
  raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
952
 
953
+ def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type='dpmsolver'):
 
954
  """
955
  The adaptive step size solver based on singlestep DPM-Solver.
956
 
 
965
  theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, following [1].
966
  t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
967
  current time and `t_0` is less than `t_err`. The default setting is 1e-5.
968
+ solver_type: either 'dpmsolver' or 'taylor'. The type for the high-order solvers.
969
+ The type slightly affects performance. We recommend using the 'dpmsolver' type.
970
  Returns:
971
  x_0: A pytorch tensor. The approximated solution at time `t_0`.
972
 
973
  [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
974
  """
975
  ns = self.noise_schedule
976
+ s = t_T * torch.ones((1,)).to(x)
977
  lambda_s = ns.marginal_lambda(s)
978
  lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
979
  h = h_init * torch.ones_like(s).to(x)
 
981
  nfe = 0
982
  if order == 2:
983
  r1 = 0.5
984
+ def lower_update(x, s, t):
985
+ return self.dpm_solver_first_update(x, s, t, return_intermediate=True)
986
+ def higher_update(x, s, t, **kwargs):
987
+ return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
988
  elif order == 3:
989
  r1, r2 = 1. / 3., 2. / 3.
990
+ def lower_update(x, s, t):
991
+ return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, return_intermediate=True, solver_type=solver_type)
992
+ def higher_update(x, s, t, **kwargs):
993
+ return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
 
 
994
  else:
995
  raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
996
  while torch.abs((s - t_0)).mean() > t_err:
 
998
  x_lower, lower_noise_kwargs = lower_update(x, s, t)
999
  x_higher = higher_update(x, s, t, **lower_noise_kwargs)
1000
  delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
1001
+ def norm_fn(v):
1002
+ return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
1003
  E = norm_fn((x_higher - x_lower) / delta).max()
1004
  if torch.all(E <= 1.):
1005
  x = x_higher
 
1011
  print('adaptive solver nfe', nfe)
1012
  return x
1013
 
1014
+ def add_noise(self, x, t, noise=None):
1015
+ """
1016
+ Compute the noised input xt = alpha_t * x + sigma_t * noise.
1017
+
1018
+ Args:
1019
+ x: A `torch.Tensor` with shape `(batch_size, *shape)`.
1020
+ t: A `torch.Tensor` with shape `(t_size,)`.
1021
+ Returns:
1022
+ xt with shape `(t_size, batch_size, *shape)`.
1023
+ """
1024
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
1025
+ if noise is None:
1026
+ noise = torch.randn((t.shape[0], *x.shape), device=x.device)
1027
+ x = x.reshape((-1, *x.shape))
1028
+ xt = expand_dims(alpha_t, x.dim()) * x + expand_dims(sigma_t, x.dim()) * noise
1029
+ if t.shape[0] == 1:
1030
+ return xt.squeeze(0)
1031
+ else:
1032
+ return xt
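A hypothetical usage sketch (assuming `solver` is an already-constructed `DPM_Solver` and `x0` a clean sample; both names are placeholders):

```python
# Diffuse a clean sample x0 forward to noise level t = 0.5.
t = torch.tensor([0.5])
xt = solver.add_noise(x0, t)   # xt = alpha_t * x0 + sigma_t * noise
# Since t.shape[0] == 1, the t axis is squeezed and xt.shape == x0.shape.
```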
1033
+
1034
+ def inverse(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
1035
+ method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
1036
+ atol=0.0078, rtol=0.05, return_intermediate=False,
1037
+ ):
1038
+ """
1039
+ Inverse the sample `x` from time `t_start` to `t_end` by DPM-Solver.
1040
+ For discrete-time DPMs, we use `t_start=1/N`, where `N` is the total number of time steps used during training.
1041
+ """
1042
+ t_0 = 1. / self.noise_schedule.total_N if t_start is None else t_start
1043
+ t_T = self.noise_schedule.T if t_end is None else t_end
1044
+ assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of the betas array"
1045
+ return self.sample(x, steps=steps, t_start=t_0, t_end=t_T, order=order, skip_type=skip_type,
1046
+ method=method, lower_order_final=lower_order_final, denoise_to_zero=denoise_to_zero, solver_type=solver_type,
1047
+ atol=atol, rtol=rtol, return_intermediate=return_intermediate)
1048
+
1049
+ def sample(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
1050
+ method='multistep', lower_order_final=True, denoise_to_zero=False, solver_type='dpmsolver',
1051
+ atol=0.0078, rtol=0.05, return_intermediate=False,
1052
+ ):
1053
  """
1054
  Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
1055
 
 
1098
 
1099
  Some advice on choosing the algorithm:
1100
  - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
1101
+ Use singlestep DPM-Solver or DPM-Solver++ ("DPM-Solver-fast" in the paper) with `order = 3`.
1102
+ e.g., DPM-Solver:
1103
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver")
1104
+ >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
1105
+ skip_type='time_uniform', method='singlestep')
1106
+ e.g., DPM-Solver++:
1107
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
1108
  >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
1109
  skip_type='time_uniform', method='singlestep')
1110
  - For **guided sampling with large guidance scale** by DPMs:
1111
+ Use multistep DPM-Solver with `algorithm_type="dpmsolver++"` and `order = 2`.
1112
  e.g.
1113
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
1114
  >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
1115
  skip_type='time_uniform', method='multistep')
1116
 
 
1136
  order: A `int`. The order of DPM-Solver.
1137
  skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
1138
  method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
1139
+ denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
1140
+ Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).
1141
+
1142
+ This trick was first proposed by DDPM (https://arxiv.org/abs/2006.11239) and
1143
+ score_sde (https://arxiv.org/abs/2011.13456). It can improve the FID
1144
+ when sampling with diffusion SDEs for low-resolution images
1145
+ (such as CIFAR-10). However, we observed that it does not matter for
1146
+ high-resolution images. As it needs an additional NFE, we do not recommend
1147
+ it for high-resolution images.
1148
+ lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
1149
+ Only valid for `method=multistep` and `steps < 15`. We empirically find that
1150
+ this trick is key to stabilizing sampling with DPM-Solver at very few steps
1151
+ (especially for steps <= 10). So we recommend setting it to `True`.
1152
+ solver_type: A `str`. The Taylor expansion type for the solver, either `dpmsolver` or `taylor`. We recommend `dpmsolver`.
1153
  atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
1154
  rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
1155
+ return_intermediate: A `bool`. Whether to save the xt at each step.
1156
+ When set to `True`, the method returns a tuple (x0, intermediates); when set to `False`, it returns only x0.
1157
  Returns:
1158
  x_end: A pytorch tensor. The approximated solution at time `t_end`.
1159
 
1160
  """
1161
  t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
1162
  t_T = self.noise_schedule.T if t_start is None else t_start
1163
+ assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of the betas array"
1164
+ if return_intermediate:
1165
+ assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when saving intermediate values"
1166
+ if self.correcting_xt_fn is not None:
1167
+ assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when correcting_xt_fn is not None"
1168
  device = x.device
1169
+ intermediates = []
1170
+ with torch.no_grad():
1171
+ if method == 'adaptive':
1172
+ x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type)
1173
+ elif method == 'multistep':
1174
+ assert steps >= order
1175
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
1176
+ assert timesteps.shape[0] - 1 == steps
1177
+ # Init the initial values.
1178
+ step = 0
1179
+ t = timesteps[step]
1180
+ t_prev_list = [t]
1181
+ model_prev_list = [self.model_fn(x, t)]
1182
+ if self.correcting_xt_fn is not None:
1183
+ x = self.correcting_xt_fn(x, t, step)
1184
+ if return_intermediate:
1185
+ intermediates.append(x)
1186
  # Init the first `order` values by lower order multistep DPM-Solver.
1187
+ for step in range(1, order):
1188
+ t = timesteps[step]
1189
+ x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, step, solver_type=solver_type)
1190
+ if self.correcting_xt_fn is not None:
1191
+ x = self.correcting_xt_fn(x, t, step)
1192
+ if return_intermediate:
1193
+ intermediates.append(x)
1194
+ t_prev_list.append(t)
1195
+ model_prev_list.append(self.model_fn(x, t))
1196
  # Compute the remaining values by `order`-th order multistep DPM-Solver.
1197
  for step in range(order, steps + 1):
1198
+ t = timesteps[step]
1199
+ # We only use lower order for steps < 10
1200
+ if lower_order_final and steps < 10:
1201
+ step_order = min(order, steps + 1 - step)
1202
+ else:
1203
+ step_order = order
1204
+ x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, t, step_order, solver_type=solver_type)
1205
+ if self.correcting_xt_fn is not None:
1206
+ x = self.correcting_xt_fn(x, t, step)
1207
+ if return_intermediate:
1208
+ intermediates.append(x)
1209
  for i in range(order - 1):
1210
  t_prev_list[i] = t_prev_list[i + 1]
1211
  model_prev_list[i] = model_prev_list[i + 1]
1212
+ t_prev_list[-1] = t
1213
  # We do not need to evaluate the final model value.
1214
  if step < steps:
1215
+ model_prev_list[-1] = self.model_fn(x, t)
1216
+ elif method in ['singlestep', 'singlestep_fixed']:
1217
+ if method == 'singlestep':
1218
+ timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order, skip_type=skip_type, t_T=t_T, t_0=t_0, device=device)
1219
+ elif method == 'singlestep_fixed':
1220
+ K = steps // order
1221
+ orders = [order,] * K
1222
+ timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
1223
+ for step, order in enumerate(orders):
1224
+ s, t = timesteps_outer[step], timesteps_outer[step + 1]
1225
+ timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=s.item(), t_0=t.item(), N=order, device=device)
1226
+ lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
1227
+ h = lambda_inner[-1] - lambda_inner[0]
1228
+ r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
1229
+ r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
1230
+ x = self.singlestep_dpm_solver_update(x, s, t, order, solver_type=solver_type, r1=r1, r2=r2)
1231
+ if self.correcting_xt_fn is not None:
1232
+ x = self.correcting_xt_fn(x, t, step)
1233
+ if return_intermediate:
1234
+ intermediates.append(x)
1235
+ else:
1236
+ raise ValueError("Got wrong method {}".format(method))
1237
+ if denoise_to_zero:
1238
+ t = torch.ones((1,)).to(device) * t_0
1239
+ x = self.denoise_to_zero_fn(x, t)
1240
+ if self.correcting_xt_fn is not None:
1241
+ x = self.correcting_xt_fn(x, t, step + 1)
1242
+ if return_intermediate:
1243
+ intermediates.append(x)
1244
+ if return_intermediate:
1245
+ return x, intermediates
1246
+ else:
1247
+ return x
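Putting the pieces together, a minimal end-to-end sketch in the spirit of the docstring examples (`net` and `betas` are assumed placeholders, not part of this repository; `net(x, t)` must predict noise and accept continuous time labels):

```python
import torch

ns = NoiseScheduleVP('discrete', betas=betas)       # betas: 1-D training beta schedule
solver = DPM_Solver(lambda x, t: net(x, t), ns, algorithm_type="dpmsolver++")
x_T = torch.randn(4, 3, 32, 32)                     # start from pure Gaussian noise
x_0 = solver.sample(x_T, steps=20, order=2,
                    skip_type='time_uniform', method='multistep')
```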
1248
+
1249
 
1250
 
1251
  #############################################################
 
1304
  Returns:
1305
  a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
1306
  """
1307
+ return v[(...,) + (None,)*(dims - 1)]
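For example (the shape follows directly from the indexing above):

```python
v = torch.tensor([1., 2.])   # shape [2], e.g. one coefficient per batch element
expand_dims(v, 4).shape      # torch.Size([2, 1, 1, 1]); broadcasts against [2, C, H, W]
```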
diffusion/how to export onnx.md CHANGED
@@ -1,4 +1,4 @@
1
- - Open [onnx_export](onnx_export.py)
2
- - project_name = "dddsp" change "project_name" to your project name
3
- - model_path = f'{project_name}/model_500000.pt' change "model_path" to your model path
4
  - Run
 
1
+ - Open [onnx_export](onnx_export.py)
2
+ - Set `project_name` (default "dddsp") to your project name
3
+ - Set `model_path` (default f'{project_name}/model_500000.pt') to your model path
4
  - Run
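For reference, the two lines in question look like this (defaults shown; the values are yours to change):

```python
project_name = "dddsp"                           # change to your project name
model_path = f'{project_name}/model_500000.pt'   # change to your model path
```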
diffusion/infer_gt_mel.py CHANGED
@@ -1,6 +1,6 @@
1
- import numpy as np
2
  import torch
3
  import torch.nn.functional as F
 
4
  from diffusion.unit2mel import load_model_vocoder
5
 
6
 
 
 
1
  import torch
2
  import torch.nn.functional as F
3
+
4
  from diffusion.unit2mel import load_model_vocoder
5
 
6
 
diffusion/logger/saver.py CHANGED
@@ -2,16 +2,16 @@
2
  author: wayn391@mastertones
3
  '''
4
 
 
5
  import os
6
- import json
7
  import time
8
- import yaml
9
- import datetime
10
- import torch
11
  import matplotlib.pyplot as plt
12
- from . import utils
 
13
  from torch.utils.tensorboard import SummaryWriter
14
 
 
15
  class Saver(object):
16
  def __init__(
17
  self,
@@ -125,12 +125,7 @@ class Saver(object):
125
  torch.save({
126
  'global_step': self.global_step,
127
  'model': model.state_dict()}, path_pt)
128
-
129
- # to json
130
- if to_json:
131
- path_json = os.path.join(
132
- self.expdir , name+'.json')
133
- utils.to_json(path_params, path_json)
134
 
135
  def delete_model(self, name='model', postfix=''):
136
  # path
 
2
  author: wayn391@mastertones
3
  '''
4
 
5
+ import datetime
6
  import os
 
7
  import time
8
+
 
 
9
  import matplotlib.pyplot as plt
10
+ import torch
11
+ import yaml
12
  from torch.utils.tensorboard import SummaryWriter
13
 
14
+
15
  class Saver(object):
16
  def __init__(
17
  self,
 
125
  torch.save({
126
  'global_step': self.global_step,
127
  'model': model.state_dict()}, path_pt)
128
+
 
 
 
 
 
129
 
130
  def delete_model(self, name='model', postfix=''):
131
  # path
diffusion/logger/utils.py CHANGED
@@ -1,8 +1,9 @@
1
- import os
2
- import yaml
3
  import json
4
- import pickle
 
5
  import torch
 
 
6
 
7
  def traverse_dir(
8
  root_dir,
@@ -121,6 +122,6 @@ def load_model(
121
  ckpt = torch.load(path_pt, map_location=torch.device(device))
122
  global_step = ckpt['global_step']
123
  model.load_state_dict(ckpt['model'], strict=False)
124
- if ckpt.get('optimizer') != None:
125
  optimizer.load_state_dict(ckpt['optimizer'])
126
  return global_step, model, optimizer
 
 
 
1
  import json
2
+ import os
3
+
4
  import torch
5
+ import yaml
6
+
7
 
8
  def traverse_dir(
9
  root_dir,
 
122
  ckpt = torch.load(path_pt, map_location=torch.device(device))
123
  global_step = ckpt['global_step']
124
  model.load_state_dict(ckpt['model'], strict=False)
125
+ if ckpt.get("optimizer") is not None:
126
  optimizer.load_state_dict(ckpt['optimizer'])
127
  return global_step, model, optimizer
diffusion/onnx_export.py CHANGED
@@ -1,226 +1,235 @@
1
- from diffusion_onnx import GaussianDiffusion
2
- import os
3
- import yaml
4
- import torch
5
- import torch.nn as nn
6
- import numpy as np
7
- from wavenet import WaveNet
8
- import torch.nn.functional as F
9
- import diffusion
10
-
11
- class DotDict(dict):
12
- def __getattr__(*args):
13
- val = dict.get(*args)
14
- return DotDict(val) if type(val) is dict else val
15
-
16
- __setattr__ = dict.__setitem__
17
- __delattr__ = dict.__delitem__
18
-
19
-
20
- def load_model_vocoder(
21
- model_path,
22
- device='cpu'):
23
- config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
24
- with open(config_file, "r") as config:
25
- args = yaml.safe_load(config)
26
- args = DotDict(args)
27
-
28
- # load model
29
- model = Unit2Mel(
30
- args.data.encoder_out_channels,
31
- args.model.n_spk,
32
- args.model.use_pitch_aug,
33
- 128,
34
- args.model.n_layers,
35
- args.model.n_chans,
36
- args.model.n_hidden)
37
-
38
- print(' [Loading] ' + model_path)
39
- ckpt = torch.load(model_path, map_location=torch.device(device))
40
- model.to(device)
41
- model.load_state_dict(ckpt['model'])
42
- model.eval()
43
- return model, args
44
-
45
-
46
- class Unit2Mel(nn.Module):
47
- def __init__(
48
- self,
49
- input_channel,
50
- n_spk,
51
- use_pitch_aug=False,
52
- out_dims=128,
53
- n_layers=20,
54
- n_chans=384,
55
- n_hidden=256):
56
- super().__init__()
57
- self.unit_embed = nn.Linear(input_channel, n_hidden)
58
- self.f0_embed = nn.Linear(1, n_hidden)
59
- self.volume_embed = nn.Linear(1, n_hidden)
60
- if use_pitch_aug:
61
- self.aug_shift_embed = nn.Linear(1, n_hidden, bias=False)
62
- else:
63
- self.aug_shift_embed = None
64
- self.n_spk = n_spk
65
- if n_spk is not None and n_spk > 1:
66
- self.spk_embed = nn.Embedding(n_spk, n_hidden)
67
-
68
- # diffusion
69
- self.decoder = GaussianDiffusion(out_dims, n_layers, n_chans, n_hidden)
70
- self.hidden_size = n_hidden
71
- self.speaker_map = torch.zeros((self.n_spk,1,1,n_hidden))
72
-
73
-
74
-
75
- def forward(self, units, mel2ph, f0, volume, g = None):
76
-
77
- '''
78
- input:
79
- B x n_frames x n_unit
80
- return:
81
- dict of B x n_frames x feat
82
- '''
83
-
84
- decoder_inp = F.pad(units, [0, 0, 1, 0])
85
- mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, units.shape[-1]])
86
- units = torch.gather(decoder_inp, 1, mel2ph_) # [B, T, H]
87
-
88
- x = self.unit_embed(units) + self.f0_embed((1 + f0.unsqueeze(-1) / 700).log()) + self.volume_embed(volume.unsqueeze(-1))
89
-
90
- if self.n_spk is not None and self.n_spk > 1: # [N, S] * [S, B, 1, H]
91
- g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
92
- g = g * self.speaker_map # [N, S, B, 1, H]
93
- g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
94
- g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
95
- x = x.transpose(1, 2) + g
96
- return x
97
- else:
98
- return x.transpose(1, 2)
99
-
100
-
101
- def init_spkembed(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
102
- gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
103
-
104
- '''
105
- input:
106
- B x n_frames x n_unit
107
- return:
108
- dict of B x n_frames x feat
109
- '''
110
- x = self.unit_embed(units) + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
111
- if self.n_spk is not None and self.n_spk > 1:
112
- if spk_mix_dict is not None:
113
- spk_embed_mix = torch.zeros((1,1,self.hidden_size))
114
- for k, v in spk_mix_dict.items():
115
- spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
116
- spk_embeddd = self.spk_embed(spk_id_torch)
117
- self.speaker_map[k] = spk_embeddd
118
- spk_embed_mix = spk_embed_mix + v * spk_embeddd
119
- x = x + spk_embed_mix
120
- else:
121
- x = x + self.spk_embed(spk_id - 1)
122
- self.speaker_map = self.speaker_map.unsqueeze(0)
123
- self.speaker_map = self.speaker_map.detach()
124
- return x.transpose(1, 2)
125
-
126
- def OnnxExport(self, project_name=None, init_noise=None, export_encoder=True, export_denoise=True, export_pred=True, export_after=True):
127
- hubert_hidden_size = 768
128
- n_frames = 100
129
- hubert = torch.randn((1, n_frames, hubert_hidden_size))
130
- mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
131
- f0 = torch.randn((1, n_frames))
132
- volume = torch.randn((1, n_frames))
133
- spk_mix = []
134
- spks = {}
135
- if self.n_spk is not None and self.n_spk > 1:
136
- for i in range(self.n_spk):
137
- spk_mix.append(1.0/float(self.n_spk))
138
- spks.update({i:1.0/float(self.n_spk)})
139
- spk_mix = torch.tensor(spk_mix)
140
- spk_mix = spk_mix.repeat(n_frames, 1)
141
- orgouttt = self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
142
- outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
143
- if export_encoder:
144
- torch.onnx.export(
145
- self,
146
- (hubert, mel2ph, f0, volume, spk_mix),
147
- f"{project_name}_encoder.onnx",
148
- input_names=["hubert", "mel2ph", "f0", "volume", "spk_mix"],
149
- output_names=["mel_pred"],
150
- dynamic_axes={
151
- "hubert": [1],
152
- "f0": [1],
153
- "volume": [1],
154
- "mel2ph": [1],
155
- "spk_mix": [0],
156
- },
157
- opset_version=16
158
- )
159
-
160
- self.decoder.OnnxExport(project_name, init_noise=init_noise, export_denoise=export_denoise, export_pred=export_pred, export_after=export_after)
161
-
162
- def ExportOnnx(self, project_name=None):
163
- hubert_hidden_size = 768
164
- n_frames = 100
165
- hubert = torch.randn((1, n_frames, hubert_hidden_size))
166
- mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
167
- f0 = torch.randn((1, n_frames))
168
- volume = torch.randn((1, n_frames))
169
- spk_mix = []
170
- spks = {}
171
- if self.n_spk is not None and self.n_spk > 1:
172
- for i in range(self.n_spk):
173
- spk_mix.append(1.0/float(self.n_spk))
174
- spks.update({i:1.0/float(self.n_spk)})
175
- spk_mix = torch.tensor(spk_mix)
176
- orgouttt = self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
177
- outtt = self.forward(hubert, mel2ph, f0, volume, spk_mix)
178
-
179
- torch.onnx.export(
180
- self,
181
- (hubert, mel2ph, f0, volume, spk_mix),
182
- f"{project_name}_encoder.onnx",
183
- input_names=["hubert", "mel2ph", "f0", "volume", "spk_mix"],
184
- output_names=["mel_pred"],
185
- dynamic_axes={
186
- "hubert": [1],
187
- "f0": [1],
188
- "volume": [1],
189
- "mel2ph": [1]
190
- },
191
- opset_version=16
192
- )
193
-
194
- condition = torch.randn(1,self.decoder.n_hidden,n_frames)
195
- noise = torch.randn((1, 1, self.decoder.mel_bins, condition.shape[2]), dtype=torch.float32)
196
- pndm_speedup = torch.LongTensor([100])
197
- K_steps = torch.LongTensor([1000])
198
- self.decoder = torch.jit.script(self.decoder)
199
- self.decoder(condition, noise, pndm_speedup, K_steps)
200
-
201
- torch.onnx.export(
202
- self.decoder,
203
- (condition, noise, pndm_speedup, K_steps),
204
- f"{project_name}_diffusion.onnx",
205
- input_names=["condition", "noise", "pndm_speedup", "K_steps"],
206
- output_names=["mel"],
207
- dynamic_axes={
208
- "condition": [2],
209
- "noise": [3],
210
- },
211
- opset_version=16
212
- )
213
-
214
-
215
- if __name__ == "__main__":
216
- project_name = "dddsp"
217
- model_path = f'{project_name}/model_500000.pt'
218
-
219
- model, _ = load_model_vocoder(model_path)
220
-
221
- # Export Diffusion separately (requires MoeSS/MoeVoiceStudio, or writing your own PNDM/DPM sampling)
222
- model.OnnxExport(project_name, export_encoder=True, export_denoise=True, export_pred=True, export_after=True)
223
-
224
- # Merged Diffusion export (Encoder and Diffusion remain separate; feed the Encoder output and the initial noise directly into Diffusion)
225
- # model.ExportOnnx(project_name)
226
-
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import yaml
8
+ from diffusion_onnx import GaussianDiffusion
9
+
10
+
11
+ class DotDict(dict):
12
+ def __getattr__(*args):
13
+ val = dict.get(*args)
14
+ return DotDict(val) if type(val) is dict else val
15
+
16
+ __setattr__ = dict.__setitem__
17
+ __delattr__ = dict.__delitem__
18
+
19
+
20
+ def load_model_vocoder(
21
+ model_path,
22
+ device='cpu'):
23
+ config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
24
+ with open(config_file, "r") as config:
25
+ args = yaml.safe_load(config)
26
+ args = DotDict(args)
27
+
28
+ # load model
29
+ model = Unit2Mel(
30
+ args.data.encoder_out_channels,
31
+ args.model.n_spk,
32
+ args.model.use_pitch_aug,
33
+ 128,
34
+ args.model.n_layers,
35
+ args.model.n_chans,
36
+ args.model.n_hidden,
37
+ args.model.timesteps,
38
+ args.model.k_step_max)
39
+
40
+ print(' [Loading] ' + model_path)
41
+ ckpt = torch.load(model_path, map_location=torch.device(device))
42
+ model.to(device)
43
+ model.load_state_dict(ckpt['model'])
44
+ model.eval()
45
+ return model, args
46
+
47
+
48
+ class Unit2Mel(nn.Module):
49
+ def __init__(
50
+ self,
51
+ input_channel,
52
+ n_spk,
53
+ use_pitch_aug=False,
54
+ out_dims=128,
55
+ n_layers=20,
56
+ n_chans=384,
57
+ n_hidden=256,
58
+ timesteps=1000,
59
+ k_step_max=1000):
60
+ super().__init__()
61
+
62
+ self.unit_embed = nn.Linear(input_channel, n_hidden)
63
+ self.f0_embed = nn.Linear(1, n_hidden)
64
+ self.volume_embed = nn.Linear(1, n_hidden)
65
+ if use_pitch_aug:
66
+ self.aug_shift_embed = nn.Linear(1, n_hidden, bias=False)
67
+ else:
68
+ self.aug_shift_embed = None
69
+ self.n_spk = n_spk
70
+ if n_spk is not None and n_spk > 1:
71
+ self.spk_embed = nn.Embedding(n_spk, n_hidden)
72
+
73
+ self.timesteps = timesteps if timesteps is not None else 1000
74
+ self.k_step_max = k_step_max if k_step_max is not None and k_step_max > 0 and k_step_max < self.timesteps else self.timesteps
75
+
76
+
77
+ # diffusion
78
+ self.decoder = GaussianDiffusion(out_dims, n_layers, n_chans, n_hidden, self.timesteps, self.k_step_max)
79
+ self.hidden_size = n_hidden
80
+ self.speaker_map = torch.zeros((self.n_spk,1,1,n_hidden))
81
+
82
+
83
+
84
+ def forward(self, units, mel2ph, f0, volume, g = None):
85
+
86
+ '''
87
+ input:
88
+ B x n_frames x n_unit
89
+ return:
90
+ dict of B x n_frames x feat
91
+ '''
92
+
93
+ decoder_inp = F.pad(units, [0, 0, 1, 0])
94
+ mel2ph_ = mel2ph.unsqueeze(2).repeat([1, 1, units.shape[-1]])
95
+ units = torch.gather(decoder_inp, 1, mel2ph_) # [B, T, H]
96
+
97
+ x = self.unit_embed(units) + self.f0_embed((1 + f0.unsqueeze(-1) / 700).log()) + self.volume_embed(volume.unsqueeze(-1))
98
+
99
+ if self.n_spk is not None and self.n_spk > 1: # [N, S] * [S, B, 1, H]
100
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
101
+ g = g * self.speaker_map # [N, S, B, 1, H]
102
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
103
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
104
+ x = x.transpose(1, 2) + g
105
+ return x
106
+ else:
107
+ return x.transpose(1, 2)
108
+
109
+
110
+ def init_spkembed(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
111
+ gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
112
+
113
+ '''
114
+ input:
115
+ B x n_frames x n_unit
116
+ return:
117
+ dict of B x n_frames x feat
118
+ '''
119
+ x = self.unit_embed(units) + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
120
+ if self.n_spk is not None and self.n_spk > 1:
121
+ if spk_mix_dict is not None:
122
+ spk_embed_mix = torch.zeros((1,1,self.hidden_size))
123
+ for k, v in spk_mix_dict.items():
124
+ spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
125
+ spk_embeddd = self.spk_embed(spk_id_torch)
126
+ self.speaker_map[k] = spk_embeddd
127
+ spk_embed_mix = spk_embed_mix + v * spk_embeddd
128
+ x = x + spk_embed_mix
129
+ else:
130
+ x = x + self.spk_embed(spk_id - 1)
131
+ self.speaker_map = self.speaker_map.unsqueeze(0)
132
+ self.speaker_map = self.speaker_map.detach()
133
+ return x.transpose(1, 2)
134
+
135
+ def OnnxExport(self, project_name=None, init_noise=None, export_encoder=True, export_denoise=True, export_pred=True, export_after=True):
136
+ hubert_hidden_size = 768
137
+ n_frames = 100
138
+ hubert = torch.randn((1, n_frames, hubert_hidden_size))
139
+ mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
140
+ f0 = torch.randn((1, n_frames))
141
+ volume = torch.randn((1, n_frames))
142
+ spk_mix = []
143
+ spks = {}
144
+ if self.n_spk is not None and self.n_spk > 1:
145
+ for i in range(self.n_spk):
146
+ spk_mix.append(1.0/float(self.n_spk))
147
+ spks.update({i:1.0/float(self.n_spk)})
148
+ spk_mix = torch.tensor(spk_mix)
149
+ spk_mix = spk_mix.repeat(n_frames, 1)
150
+ self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
151
+ self.forward(hubert, mel2ph, f0, volume, spk_mix)
152
+ if export_encoder:
153
+ torch.onnx.export(
154
+ self,
155
+ (hubert, mel2ph, f0, volume, spk_mix),
156
+ f"{project_name}_encoder.onnx",
157
+ input_names=["hubert", "mel2ph", "f0", "volume", "spk_mix"],
158
+ output_names=["mel_pred"],
159
+ dynamic_axes={
160
+ "hubert": [1],
161
+ "f0": [1],
162
+ "volume": [1],
163
+ "mel2ph": [1],
164
+ "spk_mix": [0],
165
+ },
166
+ opset_version=16
167
+ )
168
+
169
+ self.decoder.OnnxExport(project_name, init_noise=init_noise, export_denoise=export_denoise, export_pred=export_pred, export_after=export_after)
170
+
171
+ def ExportOnnx(self, project_name=None):
172
+ hubert_hidden_size = 768
173
+ n_frames = 100
174
+ hubert = torch.randn((1, n_frames, hubert_hidden_size))
175
+ mel2ph = torch.arange(end=n_frames).unsqueeze(0).long()
176
+ f0 = torch.randn((1, n_frames))
177
+ volume = torch.randn((1, n_frames))
178
+ spk_mix = []
179
+ spks = {}
180
+ if self.n_spk is not None and self.n_spk > 1:
181
+ for i in range(self.n_spk):
182
+ spk_mix.append(1.0/float(self.n_spk))
183
+ spks.update({i:1.0/float(self.n_spk)})
184
+ spk_mix = torch.tensor(spk_mix)
185
+ self.orgforward(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
186
+ self.forward(hubert, mel2ph, f0, volume, spk_mix)
187
+
188
+ torch.onnx.export(
189
+ self,
190
+ (hubert, mel2ph, f0, volume, spk_mix),
191
+ f"{project_name}_encoder.onnx",
192
+ input_names=["hubert", "mel2ph", "f0", "volume", "spk_mix"],
193
+ output_names=["mel_pred"],
194
+ dynamic_axes={
195
+ "hubert": [1],
196
+ "f0": [1],
197
+ "volume": [1],
198
+ "mel2ph": [1]
199
+ },
200
+ opset_version=16
201
+ )
202
+
203
+ condition = torch.randn(1,self.decoder.n_hidden,n_frames)
204
+ noise = torch.randn((1, 1, self.decoder.mel_bins, condition.shape[2]), dtype=torch.float32)
205
+ pndm_speedup = torch.LongTensor([100])
206
+ K_steps = torch.LongTensor([1000])
207
+ self.decoder = torch.jit.script(self.decoder)
208
+ self.decoder(condition, noise, pndm_speedup, K_steps)
209
+
210
+ torch.onnx.export(
211
+ self.decoder,
212
+ (condition, noise, pndm_speedup, K_steps),
213
+ f"{project_name}_diffusion.onnx",
214
+ input_names=["condition", "noise", "pndm_speedup", "K_steps"],
215
+ output_names=["mel"],
216
+ dynamic_axes={
217
+ "condition": [2],
218
+ "noise": [3],
219
+ },
220
+ opset_version=16
221
+ )
222
+
223
+
224
+ if __name__ == "__main__":
225
+ project_name = "dddsp"
226
+ model_path = f'{project_name}/model_500000.pt'
227
+
228
+ model, _ = load_model_vocoder(model_path)
229
+
230
+ # 分开Diffusion导出(需要使用MoeSS/MoeVoiceStudio或者自己编写Pndm/Dpm采样)
231
+ model.OnnxExport(project_name, export_encoder=True, export_denoise=True, export_pred=True, export_after=True)
232
+
233
+ # 合并Diffusion导出(Encoder和Diffusion分开,直接将Encoder的结果和初始噪声输入Diffusion即可)
234
+ # model.ExportOnnx(project_name)
235
+
diffusion/solver.py CHANGED
@@ -1,13 +1,15 @@
1
- import os
2
  import time
 
 
3
  import numpy as np
4
  import torch
5
- import librosa
6
- from diffusion.logger.saver import Saver
7
- from diffusion.logger import utils
8
  from torch import autocast
9
  from torch.cuda.amp import GradScaler
10
 
 
 
 
 
11
  def test(args, model, vocoder, loader_test, saver):
12
  print(' [*] testing...')
13
  model.eval()
@@ -40,10 +42,12 @@ def test(args, model, vocoder, loader_test, saver):
40
  data['f0'],
41
  data['volume'],
42
  data['spk_id'],
43
- gt_spec=None,
44
  infer=True,
45
  infer_speedup=args.infer.speedup,
46
- method=args.infer.method)
 
 
47
  signal = vocoder.infer(mel, data['f0'])
48
  ed_time = time.time()
49
 
@@ -62,7 +66,8 @@ def test(args, model, vocoder, loader_test, saver):
62
  data['volume'],
63
  data['spk_id'],
64
  gt_spec=data['mel'],
65
- infer=False)
 
66
  test_loss += loss.item()
67
 
68
  # log mel
@@ -121,11 +126,11 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
121
  # forward
122
  if dtype == torch.float32:
123
  loss = model(data['units'].float(), data['f0'], data['volume'], data['spk_id'],
124
- aug_shift = data['aug_shift'], gt_spec=data['mel'].float(), infer=False)
125
  else:
126
  with autocast(device_type=args.device, dtype=dtype):
127
  loss = model(data['units'], data['f0'], data['volume'], data['spk_id'],
128
- aug_shift = data['aug_shift'], gt_spec=data['mel'], infer=False)
129
 
130
  # handle nan loss
131
  if torch.isnan(loss):
@@ -171,25 +176,41 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
171
  optimizer_save = optimizer if args.train.save_opt else None
172
 
173
  # save latest
 
 
 
 
 
 
 
 
174
  saver.save_model(model, optimizer_save, postfix=f'{saver.global_step}')
175
  last_val_step = saver.global_step - args.train.interval_val
176
  if last_val_step % args.train.interval_force_save != 0:
177
  saver.delete_model(postfix=f'{last_val_step}')
178
 
179
- # run testing set
180
- test_loss = test(args, model, vocoder, loader_test, saver)
181
-
182
- # log loss
183
- saver.log_info(
184
- ' --- <validation> --- \nloss: {:.3f}. '.format(
185
- test_loss,
186
- )
187
- )
188
-
189
- saver.log_value({
190
- 'validation/loss': test_loss
191
- })
192
-
 
 
 
 
 
 
 
 
193
  model.train()
194
 
195
 
 
 
1
  import time
2
+
3
+ import librosa
4
  import numpy as np
5
  import torch
 
 
 
6
  from torch import autocast
7
  from torch.cuda.amp import GradScaler
8
 
9
+ from diffusion.logger import utils
10
+ from diffusion.logger.saver import Saver
11
+
12
+
13
  def test(args, model, vocoder, loader_test, saver):
14
  print(' [*] testing...')
15
  model.eval()
 
42
  data['f0'],
43
  data['volume'],
44
  data['spk_id'],
45
+ gt_spec=None if model.k_step_max == model.timesteps else data['mel'],
46
  infer=True,
47
  infer_speedup=args.infer.speedup,
48
+ method=args.infer.method,
49
+ k_step=model.k_step_max
50
+ )
51
  signal = vocoder.infer(mel, data['f0'])
52
  ed_time = time.time()
53
 
 
66
  data['volume'],
67
  data['spk_id'],
68
  gt_spec=data['mel'],
69
+ infer=False,
70
+ k_step=model.k_step_max)
71
  test_loss += loss.item()
72
 
73
  # log mel
 
126
  # forward
127
  if dtype == torch.float32:
128
  loss = model(data['units'].float(), data['f0'], data['volume'], data['spk_id'],
129
+ aug_shift = data['aug_shift'], gt_spec=data['mel'].float(), infer=False, k_step=model.k_step_max)
130
  else:
131
  with autocast(device_type=args.device, dtype=dtype):
132
  loss = model(data['units'], data['f0'], data['volume'], data['spk_id'],
133
+ aug_shift = data['aug_shift'], gt_spec=data['mel'], infer=False, k_step=model.k_step_max)
134
 
135
  # handle nan loss
136
  if torch.isnan(loss):
 
176
  optimizer_save = optimizer if args.train.save_opt else None
177
 
178
  # save latest
179
+ import os
180
+ import re
181
+ file_name_list = os.listdir("./logs/44k/diffusion/")
182
+ for i in range(len(file_name_list)):
183
+ if re.search(r"\.pt$", file_name_list[i]):
184
+ os.remove("./logs/44k/diffusion/" + file_name_list[i])
185
+ pwd = os.getcwd()
186
+
187
  saver.save_model(model, optimizer_save, postfix=f'{saver.global_step}')
188
  last_val_step = saver.global_step - args.train.interval_val
189
  if last_val_step % args.train.interval_force_save != 0:
190
  saver.delete_model(postfix=f'{last_val_step}')
191
 
192
+ # # run testing set
193
+ # test_loss = test(args, model, vocoder, loader_test, saver)
194
+ #
195
+ # # log loss
196
+ # saver.log_info(
197
+ # ' --- <validation> --- \nloss: {:.3f}. '.format(
198
+ # test_loss,
199
+ # )
200
+ # )
201
+ #
202
+ # saver.log_value({
203
+ # 'validation/loss': test_loss
204
+ # })
205
+
206
+ if os.path.exists("/content/so-vits-svc/TMP"):
207
+ os.chdir("/content/so-vits-svc/TMP")
208
+ os.system("git pull")
209
+ os.system("git add .")
210
+ os.system('git commit -m "update"')
211
+ os.system("git push")
212
+ os.chdir(pwd)
213
+
214
  model.train()
215
 
216
 
diffusion/uni_pc.py ADDED
@@ -0,0 +1,733 @@
1
+ import math
2
+
3
+ import torch
4
+
5
+
6
+ class NoiseScheduleVP:
7
+ def __init__(
8
+ self,
9
+ schedule='discrete',
10
+ betas=None,
11
+ alphas_cumprod=None,
12
+ continuous_beta_0=0.1,
13
+ continuous_beta_1=20.,
14
+ dtype=torch.float32,
15
+ ):
16
+ """Create a wrapper class for the forward SDE (VP type).
17
+ ***
18
+ Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
19
+ We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
20
+ ***
21
+ The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
22
+ We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
23
+ Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
24
+ log_alpha_t = self.marginal_log_mean_coeff(t)
25
+ sigma_t = self.marginal_std(t)
26
+ lambda_t = self.marginal_lambda(t)
27
+ Moreover, as lambda(t) is an invertible function, we also support its inverse function:
28
+ t = self.inverse_lambda(lambda_t)
29
+ ===============================================================
30
+ We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
31
+ 1. For discrete-time DPMs:
32
+ For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
33
+ t_i = (i + 1) / N
34
+ e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
35
+ We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
36
+ Args:
37
+ betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
38
+ alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
39
+ Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
40
+ **Important**: Please pay special attention for the args for `alphas_cumprod`:
41
+ The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
42
+ q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
43
+ Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
44
+ alpha_{t_n} = \sqrt{\hat{alpha_n}},
45
+ and
46
+ log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
47
+ 2. For continuous-time DPMs:
48
+ We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
49
+ schedule are the default settings in DDPM and improved-DDPM:
50
+ Args:
51
+ beta_min: A `float` number. The smallest beta for the linear schedule.
52
+ beta_max: A `float` number. The largest beta for the linear schedule.
53
+ cosine_s: A `float` number. The hyperparameter in the cosine schedule.
54
+ cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
55
+ T: A `float` number. The ending time of the forward process.
56
+ ===============================================================
57
+ Args:
58
+ schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
59
+ 'linear' or 'cosine' for continuous-time DPMs.
60
+ Returns:
61
+ A wrapper object of the forward SDE (VP type).
62
+
63
+ ===============================================================
64
+ Example:
65
+ # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
66
+ >>> ns = NoiseScheduleVP('discrete', betas=betas)
67
+ # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
68
+ >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
69
+ # For continuous-time DPMs (VPSDE), linear schedule:
70
+ >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
71
+ """
72
+
73
+ if schedule not in ['discrete', 'linear', 'cosine']:
74
+ raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule))
75
+
76
+ self.schedule = schedule
77
+ if schedule == 'discrete':
78
+ if betas is not None:
79
+ log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
80
+ else:
81
+ assert alphas_cumprod is not None
82
+ log_alphas = 0.5 * torch.log(alphas_cumprod)
83
+ self.total_N = len(log_alphas)
84
+ self.T = 1.
85
+ self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)).to(dtype=dtype)
86
+ self.log_alpha_array = log_alphas.reshape((1, -1,)).to(dtype=dtype)
87
+ else:
88
+ self.total_N = 1000
89
+ self.beta_0 = continuous_beta_0
90
+ self.beta_1 = continuous_beta_1
91
+ self.cosine_s = 0.008
92
+ self.cosine_beta_max = 999.
93
+ self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
94
+ self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
95
+ self.schedule = schedule
96
+ if schedule == 'cosine':
97
+ # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
98
+ # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
99
+ self.T = 0.9946
100
+ else:
101
+ self.T = 1.
102
+
103
+ def marginal_log_mean_coeff(self, t):
104
+ """
105
+ Compute log(alpha_t) of a given continuous-time label t in [0, T].
106
+ """
107
+ if self.schedule == 'discrete':
108
+ return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
109
+ elif self.schedule == 'linear':
110
+ return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
111
+ elif self.schedule == 'cosine':
112
+ def log_alpha_fn(s):
113
+ return torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
114
+ log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
115
+ return log_alpha_t
116
+
117
+ def marginal_alpha(self, t):
118
+ """
119
+ Compute alpha_t of a given continuous-time label t in [0, T].
120
+ """
121
+ return torch.exp(self.marginal_log_mean_coeff(t))
122
+
123
+ def marginal_std(self, t):
124
+ """
125
+ Compute sigma_t of a given continuous-time label t in [0, T].
126
+ """
127
+ return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
128
+
129
+ def marginal_lambda(self, t):
130
+ """
131
+ Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
132
+ """
133
+ log_mean_coeff = self.marginal_log_mean_coeff(t)
134
+ log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
135
+ return log_mean_coeff - log_std
136
+
137
+ def inverse_lambda(self, lamb):
138
+ """
139
+ Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
140
+ """
141
+ if self.schedule == 'linear':
142
+ tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
143
+ Delta = self.beta_0**2 + tmp
144
+ return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
145
+ elif self.schedule == 'discrete':
146
+ log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
147
+ t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
148
+ return t.reshape((-1,))
149
+ else:
150
+ log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
151
+ def t_fn(log_alpha_t):
152
+ return torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2.0 * (1.0 + self.cosine_s) / math.pi - self.cosine_s
153
+ t = t_fn(log_alpha)
154
+ return t
155
+
156
+
157
+ def model_wrapper(
158
+ model,
159
+ noise_schedule,
160
+ model_type="noise",
161
+ model_kwargs={},
162
+ guidance_type="uncond",
163
+ condition=None,
164
+ unconditional_condition=None,
165
+ guidance_scale=1.,
166
+ classifier_fn=None,
167
+ classifier_kwargs={},
168
+ ):
169
+ """Create a wrapper function for the noise prediction model.
170
+ """
171
+
172
+ def get_model_input_time(t_continuous):
173
+ """
174
+ Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
175
+ For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
176
+ For continuous-time DPMs, we just use `t_continuous`.
177
+ """
178
+ if noise_schedule.schedule == 'discrete':
179
+ return (t_continuous - 1. / noise_schedule.total_N) * noise_schedule.total_N
180
+ else:
181
+ return t_continuous
182
+
183
+ def noise_pred_fn(x, t_continuous, cond=None):
184
+ t_input = get_model_input_time(t_continuous)
185
+ if cond is None:
186
+ output = model(x, t_input, **model_kwargs)
187
+ else:
188
+ output = model(x, t_input, cond, **model_kwargs)
189
+ if model_type == "noise":
190
+ return output
191
+ elif model_type == "x_start":
192
+ alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
193
+ return (x - alpha_t * output) / sigma_t
194
+ elif model_type == "v":
195
+ alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
196
+ return alpha_t * output + sigma_t * x
197
+ elif model_type == "score":
198
+ sigma_t = noise_schedule.marginal_std(t_continuous)
199
+ return -sigma_t * output
200
+
201
+ def cond_grad_fn(x, t_input):
202
+ """
203
+ Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
204
+ """
205
+ with torch.enable_grad():
206
+ x_in = x.detach().requires_grad_(True)
207
+ log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
208
+ return torch.autograd.grad(log_prob.sum(), x_in)[0]
209
+
210
+ def model_fn(x, t_continuous):
211
+ """
212
+ The noise prediction model function that is used for DPM-Solver.
213
+ """
214
+ if guidance_type == "uncond":
215
+ return noise_pred_fn(x, t_continuous)
216
+ elif guidance_type == "classifier":
217
+ assert classifier_fn is not None
218
+ t_input = get_model_input_time(t_continuous)
219
+ cond_grad = cond_grad_fn(x, t_input)
220
+ sigma_t = noise_schedule.marginal_std(t_continuous)
221
+ noise = noise_pred_fn(x, t_continuous)
222
+ return noise - guidance_scale * sigma_t * cond_grad
223
+ elif guidance_type == "classifier-free":
224
+ if guidance_scale == 1. or unconditional_condition is None:
225
+ return noise_pred_fn(x, t_continuous, cond=condition)
226
+ else:
227
+ x_in = torch.cat([x] * 2)
228
+ t_in = torch.cat([t_continuous] * 2)
229
+ c_in = torch.cat([unconditional_condition, condition])
230
+ noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
231
+ return noise_uncond + guidance_scale * (noise - noise_uncond)
232
+
233
+ assert model_type in ["noise", "x_start", "v"]
234
+ assert guidance_type in ["uncond", "classifier", "classifier-free"]
235
+ return model_fn
236
+
237
+
238
+ class UniPC:
239
+ def __init__(
240
+ self,
241
+ model_fn,
242
+ noise_schedule,
243
+ algorithm_type="data_prediction",
244
+ correcting_x0_fn=None,
245
+ correcting_xt_fn=None,
246
+ thresholding_max_val=1.,
247
+ dynamic_thresholding_ratio=0.995,
248
+ variant='bh1'
249
+ ):
250
+ """Construct a UniPC.
251
+
252
+ We support both data_prediction and noise_prediction.
253
+ """
254
+ self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
255
+ self.noise_schedule = noise_schedule
256
+ assert algorithm_type in ["data_prediction", "noise_prediction"]
257
+
258
+ if correcting_x0_fn == "dynamic_thresholding":
259
+ self.correcting_x0_fn = self.dynamic_thresholding_fn
260
+ else:
261
+ self.correcting_x0_fn = correcting_x0_fn
262
+
263
+ self.correcting_xt_fn = correcting_xt_fn
264
+ self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
265
+ self.thresholding_max_val = thresholding_max_val
266
+
267
+ self.variant = variant
268
+ self.predict_x0 = algorithm_type == "data_prediction"
269
+
270
+ def dynamic_thresholding_fn(self, x0, t=None):
271
+ """
272
+ The dynamic thresholding method.
273
+ """
274
+ dims = x0.dim()
275
+ p = self.dynamic_thresholding_ratio
276
+ s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
277
+ s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims)
278
+ x0 = torch.clamp(x0, -s, s) / s
279
+ return x0
280
+
281
+ def noise_prediction_fn(self, x, t):
282
+ """
283
+ Return the noise prediction model.
284
+ """
285
+ return self.model(x, t)
286
+
287
+ def data_prediction_fn(self, x, t):
288
+ """
289
+ Return the data prediction model (with corrector).
290
+ """
291
+ noise = self.noise_prediction_fn(x, t)
292
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
293
+ x0 = (x - sigma_t * noise) / alpha_t
294
+ if self.correcting_x0_fn is not None:
295
+ x0 = self.correcting_x0_fn(x0)
296
+ return x0
297
+
298
+ def model_fn(self, x, t):
299
+ """
300
+ Convert the model to the noise prediction model or the data prediction model.
301
+ """
302
+ if self.predict_x0:
303
+ return self.data_prediction_fn(x, t)
304
+ else:
305
+ return self.noise_prediction_fn(x, t)
306
+
307
+ def get_time_steps(self, skip_type, t_T, t_0, N, device):
308
+ """Compute the intermediate time steps for sampling.
309
+ """
310
+ if skip_type == 'logSNR':
311
+ lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
312
+ lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
313
+ logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
314
+ return self.noise_schedule.inverse_lambda(logSNR_steps)
315
+ elif skip_type == 'time_uniform':
316
+ return torch.linspace(t_T, t_0, N + 1).to(device)
317
+ elif skip_type == 'time_quadratic':
318
+ t_order = 2
319
+ t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
320
+ return t
321
+ else:
322
+ raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
323
+
324
+ def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
325
+ """
326
+ Get the order of each step for sampling by the singlestep DPM-Solver.
327
+ """
328
+ if order == 3:
329
+ K = steps // 3 + 1
330
+ if steps % 3 == 0:
331
+ orders = [3,] * (K - 2) + [2, 1]
332
+ elif steps % 3 == 1:
333
+ orders = [3,] * (K - 1) + [1]
334
+ else:
335
+ orders = [3,] * (K - 1) + [2]
336
+ elif order == 2:
337
+ if steps % 2 == 0:
338
+ K = steps // 2
339
+ orders = [2,] * K
340
+ else:
341
+ K = steps // 2 + 1
342
+ orders = [2,] * (K - 1) + [1]
343
+ elif order == 1:
344
+ K = steps
345
+ orders = [1,] * steps
346
+ else:
347
+ raise ValueError("'order' must be '1' or '2' or '3'.")
348
+ if skip_type == 'logSNR':
349
+ # To reproduce the results in DPM-Solver paper
350
+ timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
351
+ else:
352
+ timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders), 0).to(device)]
353
+ return timesteps_outer, orders
354
+
355
+ def denoise_to_zero_fn(self, x, s):
356
+ """
357
+ Denoise at the final step, which is equivalent to solving the ODE from lambda_s to infinity with a first-order discretization.
358
+ """
359
+ return self.data_prediction_fn(x, s)
360
+
361
+ def multistep_uni_pc_update(self, x, model_prev_list, t_prev_list, t, order, **kwargs):
362
+ if len(t.shape) == 0:
363
+ t = t.view(-1)
364
+ if 'bh' in self.variant:
365
+ return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
366
+ else:
367
+ assert self.variant == 'vary_coeff'
368
+ return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
369
+
370
+ def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
371
+ #print(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
372
+ ns = self.noise_schedule
373
+ assert order <= len(model_prev_list)
374
+
375
+ # first compute rks
376
+ t_prev_0 = t_prev_list[-1]
377
+ lambda_prev_0 = ns.marginal_lambda(t_prev_0)
378
+ lambda_t = ns.marginal_lambda(t)
379
+ model_prev_0 = model_prev_list[-1]
380
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
381
+ log_alpha_t = ns.marginal_log_mean_coeff(t)
382
+ alpha_t = torch.exp(log_alpha_t)
383
+
384
+ h = lambda_t - lambda_prev_0
385
+
386
+ rks = []
387
+ D1s = []
388
+ for i in range(1, order):
389
+ t_prev_i = t_prev_list[-(i + 1)]
390
+ model_prev_i = model_prev_list[-(i + 1)]
391
+ lambda_prev_i = ns.marginal_lambda(t_prev_i)
392
+ rk = (lambda_prev_i - lambda_prev_0) / h
393
+ rks.append(rk)
394
+ D1s.append((model_prev_i - model_prev_0) / rk)
395
+
396
+ rks.append(1.)
397
+ rks = torch.tensor(rks, device=x.device)
398
+
399
+ K = len(rks)
400
+ # build C matrix
401
+ C = []
402
+
403
+ col = torch.ones_like(rks)
404
+ for k in range(1, K + 1):
405
+ C.append(col)
406
+ col = col * rks / (k + 1)
407
+ C = torch.stack(C, dim=1)
408
+
409
+ if len(D1s) > 0:
410
+ D1s = torch.stack(D1s, dim=1) # (B, K - 1, C, H, W)
411
+ C_inv_p = torch.linalg.inv(C[:-1, :-1])
412
+ A_p = C_inv_p
413
+
414
+ if use_corrector:
415
+ #print('using corrector')
416
+ C_inv = torch.linalg.inv(C)
417
+ A_c = C_inv
418
+
419
+ hh = -h if self.predict_x0 else h
420
+ h_phi_1 = torch.expm1(hh)
421
+ h_phi_ks = []
422
+ factorial_k = 1
423
+ h_phi_k = h_phi_1
424
+ for k in range(1, K + 2):
425
+ h_phi_ks.append(h_phi_k)
426
+ h_phi_k = h_phi_k / hh - 1 / factorial_k
427
+ factorial_k *= (k + 1)
428
+
429
+ model_t = None
430
+ if self.predict_x0:
431
+ x_t_ = (
432
+ sigma_t / sigma_prev_0 * x
433
+ - alpha_t * h_phi_1 * model_prev_0
434
+ )
435
+ # now predictor
436
+ x_t = x_t_
437
+ if len(D1s) > 0:
438
+ # compute the residuals for predictor
439
+ for k in range(K - 1):
440
+ x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
441
+ # now corrector
442
+ if use_corrector:
443
+ model_t = self.model_fn(x_t, t)
444
+ D1_t = (model_t - model_prev_0)
445
+ x_t = x_t_
446
+ k = 0
447
+ for k in range(K - 1):
448
+ x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
449
+ x_t = x_t - alpha_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
450
+ else:
451
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
452
+ x_t_ = (
453
+ (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
454
+ - (sigma_t * h_phi_1) * model_prev_0
455
+ )
456
+ # now predictor
457
+ x_t = x_t_
458
+ if len(D1s) > 0:
459
+ # compute the residuals for predictor
460
+ for k in range(K - 1):
461
+ x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
462
+ # now corrector
463
+ if use_corrector:
464
+ model_t = self.model_fn(x_t, t)
465
+ D1_t = (model_t - model_prev_0)
466
+ x_t = x_t_
467
+ k = 0
468
+ for k in range(K - 1):
469
+ x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
470
+ x_t = x_t - sigma_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
471
+ return x_t, model_t
472
+
473
+ def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
474
+ #print(f'using unified predictor-corrector with order {order} (solver type: B(h))')
475
+ ns = self.noise_schedule
476
+ assert order <= len(model_prev_list)
477
+
478
+ # first compute rks
479
+ t_prev_0 = t_prev_list[-1]
480
+ lambda_prev_0 = ns.marginal_lambda(t_prev_0)
481
+ lambda_t = ns.marginal_lambda(t)
482
+ model_prev_0 = model_prev_list[-1]
483
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
484
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
485
+ alpha_t = torch.exp(log_alpha_t)
486
+
487
+ h = lambda_t - lambda_prev_0
488
+
489
+ rks = []
490
+ D1s = []
491
+ for i in range(1, order):
492
+ t_prev_i = t_prev_list[-(i + 1)]
493
+ model_prev_i = model_prev_list[-(i + 1)]
494
+ lambda_prev_i = ns.marginal_lambda(t_prev_i)
495
+ rk = (lambda_prev_i - lambda_prev_0) / h
496
+ rks.append(rk)
497
+ D1s.append((model_prev_i - model_prev_0) / rk)
498
+
499
+ rks.append(1.)
500
+ rks = torch.tensor(rks, device=x.device)
501
+
502
+ R = []
503
+ b = []
504
+
505
+ hh = -h if self.predict_x0 else h
506
+ h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
507
+ h_phi_k = h_phi_1 / hh - 1
508
+
509
+ factorial_i = 1
510
+
511
+ if self.variant == 'bh1':
512
+ B_h = hh
513
+ elif self.variant == 'bh2':
514
+ B_h = torch.expm1(hh)
515
+ else:
516
+ raise NotImplementedError()
517
+
518
+ for i in range(1, order + 1):
519
+ R.append(torch.pow(rks, i - 1))
520
+ b.append(h_phi_k * factorial_i / B_h)
521
+ factorial_i *= (i + 1)
522
+ h_phi_k = h_phi_k / hh - 1 / factorial_i
523
+
524
+ R = torch.stack(R)
525
+ b = torch.cat(b)
526
+
527
+ # now predictor
528
+ use_predictor = len(D1s) > 0 and x_t is None
529
+ if len(D1s) > 0:
530
+ D1s = torch.stack(D1s, dim=1) # (B, K - 1, C, H, W)
531
+ if x_t is None:
532
+ # for order 2, we use a simplified version
533
+ if order == 2:
534
+ rhos_p = torch.tensor([0.5], device=b.device)
535
+ else:
536
+ rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
537
+ else:
538
+ D1s = None
539
+
540
+ if use_corrector:
541
+ #print('using corrector')
542
+ # for order 1, we use a simplified version
543
+ if order == 1:
544
+ rhos_c = torch.tensor([0.5], device=b.device)
545
+ else:
546
+ rhos_c = torch.linalg.solve(R, b)
547
+
548
+ model_t = None
549
+ if self.predict_x0:
550
+ x_t_ = (
551
+ sigma_t / sigma_prev_0 * x
552
+ - alpha_t * h_phi_1 * model_prev_0
553
+ )
554
+
555
+ if x_t is None:
556
+ if use_predictor:
557
+ pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
558
+ else:
559
+ pred_res = 0
560
+ x_t = x_t_ - alpha_t * B_h * pred_res
561
+
562
+ if use_corrector:
563
+ model_t = self.model_fn(x_t, t)
564
+ if D1s is not None:
565
+ corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
566
+ else:
567
+ corr_res = 0
568
+ D1_t = (model_t - model_prev_0)
569
+ x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
570
+ else:
571
+ x_t_ = (
572
+ torch.exp(log_alpha_t - log_alpha_prev_0) * x
573
+ - sigma_t * h_phi_1 * model_prev_0
574
+ )
575
+ if x_t is None:
576
+ if use_predictor:
577
+ pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
578
+ else:
579
+ pred_res = 0
580
+ x_t = x_t_ - sigma_t * B_h * pred_res
581
+
582
+ if use_corrector:
583
+ model_t = self.model_fn(x_t, t)
584
+ if D1s is not None:
585
+ corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
586
+ else:
587
+ corr_res = 0
588
+ D1_t = (model_t - model_prev_0)
589
+ x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
590
+ return x_t, model_t
591
+
592
+ def sample(self, x, steps=20, t_start=None, t_end=None, order=2, skip_type='time_uniform',
593
+ method='multistep', lower_order_final=True, denoise_to_zero=False, atol=0.0078, rtol=0.05, return_intermediate=False,
594
+ ):
595
+ """
596
+ Compute the sample at time `t_end` by UniPC, given the initial `x` at time `t_start`.
597
+ """
598
+ t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
599
+ t_T = self.noise_schedule.T if t_start is None else t_start
600
+ assert t_0 > 0 and t_T > 0, "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"
601
+ if return_intermediate:
602
+ assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when saving intermediate values"
603
+ if self.correcting_xt_fn is not None:
604
+ assert method in ['multistep', 'singlestep', 'singlestep_fixed'], "Cannot use adaptive solver when correcting_xt_fn is not None"
605
+ device = x.device
606
+ intermediates = []
607
+ with torch.no_grad():
608
+ if method == 'multistep':
609
+ assert steps >= order
610
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
611
+ assert timesteps.shape[0] - 1 == steps
612
+ # Init the initial values.
613
+ step = 0
614
+ t = timesteps[step]
615
+ t_prev_list = [t]
616
+ model_prev_list = [self.model_fn(x, t)]
617
+ if self.correcting_xt_fn is not None:
618
+ x = self.correcting_xt_fn(x, t, step)
619
+ if return_intermediate:
620
+ intermediates.append(x)
621
+
622
+ # Init the first `order` values by lower order multistep UniPC.
623
+ for step in range(1, order):
624
+ t = timesteps[step]
625
+ x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, t, step, use_corrector=True)
626
+ if model_x is None:
627
+ model_x = self.model_fn(x, t)
628
+ if self.correcting_xt_fn is not None:
629
+ x = self.correcting_xt_fn(x, t, step)
630
+ if return_intermediate:
631
+ intermediates.append(x)
632
+ t_prev_list.append(t)
633
+ model_prev_list.append(model_x)
634
+
635
+ # Compute the remaining values by `order`-th order multistep DPM-Solver.
636
+ for step in range(order, steps + 1):
637
+ t = timesteps[step]
638
+ if lower_order_final:
639
+ step_order = min(order, steps + 1 - step)
640
+ else:
641
+ step_order = order
642
+ if step == steps:
643
+ #print('do not run corrector at the last step')
644
+ use_corrector = False
645
+ else:
646
+ use_corrector = True
647
+ x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, t, step_order, use_corrector=use_corrector)
648
+ if self.correcting_xt_fn is not None:
649
+ x = self.correcting_xt_fn(x, t, step)
650
+ if return_intermediate:
651
+ intermediates.append(x)
652
+ for i in range(order - 1):
653
+ t_prev_list[i] = t_prev_list[i + 1]
654
+ model_prev_list[i] = model_prev_list[i + 1]
655
+ t_prev_list[-1] = t
656
+ # We do not need to evaluate the final model value.
657
+ if step < steps:
658
+ if model_x is None:
659
+ model_x = self.model_fn(x, t)
660
+ model_prev_list[-1] = model_x
661
+ else:
662
+ raise ValueError("Got wrong method {}".format(method))
663
+
664
+ if denoise_to_zero:
665
+ t = torch.ones((1,)).to(device) * t_0
666
+ x = self.denoise_to_zero_fn(x, t)
667
+ if self.correcting_xt_fn is not None:
668
+ x = self.correcting_xt_fn(x, t, step + 1)
669
+ if return_intermediate:
670
+ intermediates.append(x)
671
+ if return_intermediate:
672
+ return x, intermediates
673
+ else:
674
+ return x
675
+
676
+
677
+ #############################################################
678
+ # other utility functions
679
+ #############################################################
680
+
681
+ def interpolate_fn(x, xp, yp):
682
+ """
683
+ A piecewise linear function y = f(x), using xp and yp as keypoints.
684
+ We implement f(x) in a differentiable way (i.e. applicable for autograd).
685
+ The function f(x) is well-defined for all x. (For x beyond the bounds of xp, we use the outermost points of xp to define the linear function.)
686
+
687
+ Args:
688
+ x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
689
+ xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
690
+ yp: PyTorch tensor with shape [C, K].
691
+ Returns:
692
+ The function values f(x), with shape [N, C].
693
+ """
694
+ N, K = x.shape[0], xp.shape[1]
695
+ all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
696
+ sorted_all_x, x_indices = torch.sort(all_x, dim=2)
697
+ x_idx = torch.argmin(x_indices, dim=2)
698
+ cand_start_idx = x_idx - 1
699
+ start_idx = torch.where(
700
+ torch.eq(x_idx, 0),
701
+ torch.tensor(1, device=x.device),
702
+ torch.where(
703
+ torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
704
+ ),
705
+ )
706
+ end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
707
+ start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
708
+ end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
709
+ start_idx2 = torch.where(
710
+ torch.eq(x_idx, 0),
711
+ torch.tensor(0, device=x.device),
712
+ torch.where(
713
+ torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
714
+ ),
715
+ )
716
+ y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
717
+ start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
718
+ end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
719
+ cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
720
+ return cand
721
+
722
+
723
+ def expand_dims(v, dims):
724
+ """
725
+ Expand the tensor `v` to the dim `dims`.
726
+
727
+ Args:
728
+ `v`: a PyTorch tensor with shape [N].
729
+ `dims`: an `int`.
730
+ Returns:
731
+ a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
732
+ """
733
+ return v[(...,) + (None,)*(dims - 1)]
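
For reference, a minimal sketch of how the three pieces above compose into a sampler. The toy `net` and the beta schedule are illustrative placeholders, not part of this repository; within the repo, the diffusion module does the equivalent wiring internally when the 'unipc' sampling method is selected.

    import torch

    # Stand-in noise predictor for illustration only; a real model is a trained network net(x, t).
    def net(x, t):
        return torch.zeros_like(x)

    betas = torch.linspace(1e-4, 2e-2, 1000)      # illustrative discrete DDPM betas
    ns = NoiseScheduleVP('discrete', betas=betas)
    model_fn = model_wrapper(net, ns, model_type="noise", guidance_type="uncond")
    sampler = UniPC(model_fn, ns, algorithm_type="data_prediction", variant='bh1')

    x_T = torch.randn(4, 3, 32, 32)               # start from pure noise
    x_0 = sampler.sample(x_T, steps=10, order=2, skip_type='time_uniform', method='multistep')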
diffusion/unit2mel.py CHANGED
@@ -1,11 +1,14 @@
1
  import os
2
- import yaml
 
3
  import torch
4
  import torch.nn as nn
5
- import numpy as np
 
6
  from .diffusion import GaussianDiffusion
7
- from .wavenet import WaveNet
8
  from .vocoder import Vocoder
 
 
9
 
10
  class DotDict(dict):
11
  def __getattr__(*args):
@@ -21,9 +24,11 @@ def load_model_vocoder(
21
  device='cpu',
22
  config_path = None
23
  ):
24
- if config_path is None: config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
25
- else: config_file = config_path
26
-
 
 
27
  with open(config_file, "r") as config:
28
  args = yaml.safe_load(config)
29
  args = DotDict(args)
@@ -39,13 +44,17 @@ def load_model_vocoder(
39
  vocoder.dimension,
40
  args.model.n_layers,
41
  args.model.n_chans,
42
- args.model.n_hidden)
 
 
 
43
 
44
  print(' [Loading] ' + model_path)
45
  ckpt = torch.load(model_path, map_location=torch.device(device))
46
  model.to(device)
47
  model.load_state_dict(ckpt['model'])
48
  model.eval()
 
49
  return model, vocoder, args
50
 
51
 
@@ -58,7 +67,10 @@ class Unit2Mel(nn.Module):
58
  out_dims=128,
59
  n_layers=20,
60
  n_chans=384,
61
- n_hidden=256):
 
 
 
62
  super().__init__()
63
  self.unit_embed = nn.Linear(input_channel, n_hidden)
64
  self.f0_embed = nn.Linear(1, n_hidden)
@@ -70,9 +82,51 @@ class Unit2Mel(nn.Module):
70
  self.n_spk = n_spk
71
  if n_spk is not None and n_spk > 1:
72
  self.spk_embed = nn.Embedding(n_spk, n_hidden)
73
-
 
 
 
 
74
  # diffusion
75
- self.decoder = GaussianDiffusion(WaveNet(out_dims, n_layers, n_chans, n_hidden), out_dims=out_dims)
76
 
77
  def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
78
  gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
@@ -84,6 +138,12 @@ class Unit2Mel(nn.Module):
84
  dict of B x n_frames x feat
85
  '''
86
 
87
  x = self.unit_embed(units) + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
88
  if self.n_spk is not None and self.n_spk > 1:
89
  if spk_mix_dict is not None:
@@ -91,7 +151,14 @@ class Unit2Mel(nn.Module):
91
  spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
92
  x = x + v * self.spk_embed(spk_id_torch)
93
  else:
94
- x = x + self.spk_embed(spk_id)
  if self.aug_shift_embed is not None and aug_shift is not None:
96
  x = x + self.aug_shift_embed(aug_shift / 5)
97
  x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm)
 
1
  import os
2
+
3
+ import numpy as np
4
  import torch
5
  import torch.nn as nn
6
+ import yaml
7
+
8
  from .diffusion import GaussianDiffusion
 
9
  from .vocoder import Vocoder
10
+ from .wavenet import WaveNet
11
+
12
 
13
  class DotDict(dict):
14
  def __getattr__(*args):
 
24
  device='cpu',
25
  config_path = None
26
  ):
27
+ if config_path is None:
28
+ config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
29
+ else:
30
+ config_file = config_path
31
+
32
  with open(config_file, "r") as config:
33
  args = yaml.safe_load(config)
34
  args = DotDict(args)
 
44
  vocoder.dimension,
45
  args.model.n_layers,
46
  args.model.n_chans,
47
+ args.model.n_hidden,
48
+ args.model.timesteps,
49
+ args.model.k_step_max
50
+ )
51
 
52
  print(' [Loading] ' + model_path)
53
  ckpt = torch.load(model_path, map_location=torch.device(device))
54
  model.to(device)
55
  model.load_state_dict(ckpt['model'])
56
  model.eval()
57
+ print(f'Loaded diffusion model, sampler is {args.infer.method}, speedup: {args.infer.speedup} ')
58
  return model, vocoder, args
59
 
60
 
 
67
  out_dims=128,
68
  n_layers=20,
69
  n_chans=384,
70
+ n_hidden=256,
71
+ timesteps=1000,
72
+ k_step_max=1000
73
+ ):
74
  super().__init__()
75
  self.unit_embed = nn.Linear(input_channel, n_hidden)
76
  self.f0_embed = nn.Linear(1, n_hidden)
 
82
  self.n_spk = n_spk
83
  if n_spk is not None and n_spk > 1:
84
  self.spk_embed = nn.Embedding(n_spk, n_hidden)
85
+
86
+ self.timesteps = timesteps if timesteps is not None else 1000
87
+ self.k_step_max = k_step_max if k_step_max is not None and k_step_max>0 and k_step_max<self.timesteps else self.timesteps
88
+
89
+ self.n_hidden = n_hidden
90
  # diffusion
91
+ self.decoder = GaussianDiffusion(WaveNet(out_dims, n_layers, n_chans, n_hidden),timesteps=self.timesteps,k_step=self.k_step_max, out_dims=out_dims)
92
+ self.input_channel = input_channel
93
+
94
+ def init_spkembed(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
95
+ gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
96
+
97
+ '''
98
+ input:
99
+ B x n_frames x n_unit
100
+ return:
101
+ dict of B x n_frames x feat
102
+ '''
103
+ x = self.unit_embed(units) + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
104
+ if self.n_spk is not None and self.n_spk > 1:
105
+ if spk_mix_dict is not None:
106
+ spk_embed_mix = torch.zeros((1, 1, self.n_hidden))
107
+ for k, v in spk_mix_dict.items():
108
+ spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
109
+ spk_embeddd = self.spk_embed(spk_id_torch)
110
+ self.speaker_map[k] = spk_embeddd
111
+ spk_embed_mix = spk_embed_mix + v * spk_embeddd
112
+ x = x + spk_embed_mix
113
+ else:
114
+ x = x + self.spk_embed(spk_id - 1)
115
+ self.speaker_map = self.speaker_map.unsqueeze(0)
116
+ self.speaker_map = self.speaker_map.detach()
117
+ return x.transpose(1, 2)
118
+
119
+ def init_spkmix(self, n_spk):
120
+ self.speaker_map = torch.zeros((n_spk,1,1,self.n_hidden))
121
+ hubert_hidden_size = self.input_channel
122
+ n_frames = 10
123
+ hubert = torch.randn((1, n_frames, hubert_hidden_size))
124
+ f0 = torch.randn((1, n_frames))
125
+ volume = torch.randn((1, n_frames))
126
+ spks = {}
127
+ for i in range(n_spk):
128
+ spks.update({i:1.0/float(self.n_spk)})
129
+ self.init_spkembed(hubert, f0.unsqueeze(-1), volume.unsqueeze(-1), spk_mix_dict=spks)
130
 
131
  def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shift = None,
132
  gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=300, use_tqdm=True):
 
138
  dict of B x n_frames x feat
139
  '''
140
 
141
+ if not self.training and gt_spec is not None and k_step>self.k_step_max:
142
+ raise Exception("The shallow diffusion k_step is greater than the maximum diffusion k_step(k_step_max)!")
143
+
144
+ if not self.training and gt_spec is None and self.k_step_max!=self.timesteps:
145
+ raise Exception("This model can only be used for shallow diffusion and can not infer alone!")
146
+
147
  x = self.unit_embed(units) + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
148
  if self.n_spk is not None and self.n_spk > 1:
149
  if spk_mix_dict is not None:
 
151
  spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
152
  x = x + v * self.spk_embed(spk_id_torch)
153
  else:
154
+ if spk_id.shape[1] > 1:
155
+ g = spk_id.reshape((spk_id.shape[0], spk_id.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
156
+ g = g * self.speaker_map # [N, S, B, 1, H]
157
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
158
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
159
+ x = x + g
160
+ else:
161
+ x = x + self.spk_embed(spk_id)
162
  if self.aug_shift_embed is not None and aug_shift is not None:
163
  x = x + self.aug_shift_embed(aug_shift / 5)
164
  x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm)
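
To make the new timesteps/k_step_max semantics concrete, a hedged toy example (all sizes are illustrative, and the constructor's three leading arguments are assumed to match the load_model_vocoder call above). A model trained with k_step_max < timesteps is shallow-diffusion-only: it must be given a gt_spec to start from, and k_step may not exceed k_step_max.

    import torch

    # Toy model: k_step_max=100 < timesteps=1000, i.e. shallow diffusion only.
    toy = Unit2Mel(input_channel=256, n_spk=1, use_pitch_aug=False, out_dims=128,
                   n_layers=3, n_chans=64, n_hidden=64, timesteps=1000, k_step_max=100)
    toy.eval()

    units  = torch.randn(1, 10, 256)           # B x n_frames x n_unit
    f0     = 100 + 200 * torch.rand(1, 10, 1)  # positive f0, in Hz
    volume = torch.rand(1, 10, 1)
    coarse = torch.randn(1, 10, 128)           # a coarse mel produced by another model

    mel = toy(units, f0, volume, gt_spec=coarse, infer=True, k_step=100)  # OK
    # toy(units, f0, volume, gt_spec=coarse, infer=True, k_step=300)      # raises: k_step > k_step_max
    # toy(units, f0, volume, gt_spec=None, infer=True)                    # raises: shallow-only model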
diffusion/vocoder.py CHANGED
@@ -1,9 +1,10 @@
1
  import torch
2
- from vdecoder.nsf_hifigan.nvSTFT import STFT
3
- from vdecoder.nsf_hifigan.models import load_model,load_config
4
  from torchaudio.transforms import Resample
5
 
6
-
 
 
 
7
  class Vocoder:
8
  def __init__(self, vocoder_type, vocoder_ckpt, device = None):
9
  if device is None:
 
1
  import torch
 
 
2
  from torchaudio.transforms import Resample
3
 
4
+ from vdecoder.nsf_hifigan.models import load_config, load_model
5
+ from vdecoder.nsf_hifigan.nvSTFT import STFT
6
+
7
+
8
  class Vocoder:
9
  def __init__(self, vocoder_type, vocoder_ckpt, device = None):
10
  if device is None:
edgetts/tts.py ADDED
@@ -0,0 +1,47 @@
1
+ import asyncio
2
+ import random
3
+ import sys
4
+
5
+ import edge_tts
6
+ from edge_tts import VoicesManager
7
+ from langdetect import DetectorFactory, detect
8
+
9
+ DetectorFactory.seed = 0
10
+
11
+ TEXT = sys.argv[1]
12
+ LANG = detect(TEXT) if sys.argv[2] == "Auto" else sys.argv[2]
13
+ RATE = sys.argv[3]
14
+ VOLUME = sys.argv[4]
15
+ GENDER = sys.argv[5] if len(sys.argv) == 6 else None
16
+ OUTPUT_FILE = "tts.wav"
17
+
18
+ print("Running TTS...")
19
+ print(f"Text: {TEXT}, Language: {LANG}, Gender: {GENDER}, Rate: {RATE}, Volume: {VOLUME}")
20
+
21
+ async def _main() -> None:
22
+ voices = await VoicesManager.create()
23
+ if GENDER is not None:
24
+ # From "zh-cn" to "zh-CN" etc.
25
+ if LANG == "zh-cn" or LANG == "zh-tw":
26
+ LOCALE = LANG[:-2] + LANG[-2:].upper()
27
+ voice = voices.find(Gender=GENDER, Locale=LOCALE)
28
+ else:
29
+ voice = voices.find(Gender=GENDER, Language=LANG)
30
+ VOICE = random.choice(voice)["Name"]
31
+ print(f"Using random {LANG} voice: {VOICE}")
32
+ else:
33
+ VOICE = LANG
34
+
35
+ communicate = edge_tts.Communicate(text = TEXT, voice = VOICE, rate = RATE, volume = VOLUME)
36
+ await communicate.save(OUTPUT_FILE)
37
+
38
+ if __name__ == "__main__":
39
+ if sys.platform.startswith("win"):
40
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
41
+ asyncio.run(_main())
42
+ else:
43
+ loop = asyncio.get_event_loop_policy().get_event_loop()
44
+ try:
45
+ loop.run_until_complete(_main())
46
+ finally:
47
+ loop.close()
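
A typical invocation (argument order: text, language or voice, rate, volume, optional gender; rate and volume use the edge-tts offset syntax, e.g. +0%):

    python edgetts/tts.py "Hello there" Auto +0% +0% Male

When no gender is given, the second argument is taken verbatim as the voice name, e.g. one of the keys listed in tts_voices.py below.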
edgetts/tts_voices.py ADDED
@@ -0,0 +1,306 @@
1
+ #List of Supported Voices for edge_TTS
2
+ SUPPORTED_VOICES = {
3
+ 'zh-CN-XiaoxiaoNeural': 'zh-CN',
4
+ 'zh-CN-XiaoyiNeural': 'zh-CN',
5
+ 'zh-CN-YunjianNeural': 'zh-CN',
6
+ 'zh-CN-YunxiNeural': 'zh-CN',
7
+ 'zh-CN-YunxiaNeural': 'zh-CN',
8
+ 'zh-CN-YunyangNeural': 'zh-CN',
9
+ 'zh-HK-HiuGaaiNeural': 'zh-HK',
10
+ 'zh-HK-HiuMaanNeural': 'zh-HK',
11
+ 'zh-HK-WanLungNeural': 'zh-HK',
12
+ 'zh-TW-HsiaoChenNeural': 'zh-TW',
13
+ 'zh-TW-YunJheNeural': 'zh-TW',
14
+ 'zh-TW-HsiaoYuNeural': 'zh-TW',
15
+ 'af-ZA-AdriNeural': 'af-ZA',
16
+ 'af-ZA-WillemNeural': 'af-ZA',
17
+ 'am-ET-AmehaNeural': 'am-ET',
18
+ 'am-ET-MekdesNeural': 'am-ET',
19
+ 'ar-AE-FatimaNeural': 'ar-AE',
20
+ 'ar-AE-HamdanNeural': 'ar-AE',
21
+ 'ar-BH-AliNeural': 'ar-BH',
22
+ 'ar-BH-LailaNeural': 'ar-BH',
23
+ 'ar-DZ-AminaNeural': 'ar-DZ',
24
+ 'ar-DZ-IsmaelNeural': 'ar-DZ',
25
+ 'ar-EG-SalmaNeural': 'ar-EG',
26
+ 'ar-EG-ShakirNeural': 'ar-EG',
27
+ 'ar-IQ-BasselNeural': 'ar-IQ',
28
+ 'ar-IQ-RanaNeural': 'ar-IQ',
29
+ 'ar-JO-SanaNeural': 'ar-JO',
30
+ 'ar-JO-TaimNeural': 'ar-JO',
31
+ 'ar-KW-FahedNeural': 'ar-KW',
32
+ 'ar-KW-NouraNeural': 'ar-KW',
33
+ 'ar-LB-LaylaNeural': 'ar-LB',
34
+ 'ar-LB-RamiNeural': 'ar-LB',
35
+ 'ar-LY-ImanNeural': 'ar-LY',
36
+ 'ar-LY-OmarNeural': 'ar-LY',
37
+ 'ar-MA-JamalNeural': 'ar-MA',
38
+ 'ar-MA-MounaNeural': 'ar-MA',
39
+ 'ar-OM-AbdullahNeural': 'ar-OM',
40
+ 'ar-OM-AyshaNeural': 'ar-OM',
41
+ 'ar-QA-AmalNeural': 'ar-QA',
42
+ 'ar-QA-MoazNeural': 'ar-QA',
43
+ 'ar-SA-HamedNeural': 'ar-SA',
44
+ 'ar-SA-ZariyahNeural': 'ar-SA',
45
+ 'ar-SY-AmanyNeural': 'ar-SY',
46
+ 'ar-SY-LaithNeural': 'ar-SY',
47
+ 'ar-TN-HediNeural': 'ar-TN',
48
+ 'ar-TN-ReemNeural': 'ar-TN',
49
+ 'ar-YE-MaryamNeural': 'ar-YE',
50
+ 'ar-YE-SalehNeural': 'ar-YE',
51
+ 'az-AZ-BabekNeural': 'az-AZ',
52
+ 'az-AZ-BanuNeural': 'az-AZ',
53
+ 'bg-BG-BorislavNeural': 'bg-BG',
54
+ 'bg-BG-KalinaNeural': 'bg-BG',
55
+ 'bn-BD-NabanitaNeural': 'bn-BD',
56
+ 'bn-BD-PradeepNeural': 'bn-BD',
57
+ 'bn-IN-BashkarNeural': 'bn-IN',
58
+ 'bn-IN-TanishaaNeural': 'bn-IN',
59
+ 'bs-BA-GoranNeural': 'bs-BA',
60
+ 'bs-BA-VesnaNeural': 'bs-BA',
61
+ 'ca-ES-EnricNeural': 'ca-ES',
62
+ 'ca-ES-JoanaNeural': 'ca-ES',
63
+ 'cs-CZ-AntoninNeural': 'cs-CZ',
64
+ 'cs-CZ-VlastaNeural': 'cs-CZ',
65
+ 'cy-GB-AledNeural': 'cy-GB',
66
+ 'cy-GB-NiaNeural': 'cy-GB',
67
+ 'da-DK-ChristelNeural': 'da-DK',
68
+ 'da-DK-JeppeNeural': 'da-DK',
69
+ 'de-AT-IngridNeural': 'de-AT',
70
+ 'de-AT-JonasNeural': 'de-AT',
71
+ 'de-CH-JanNeural': 'de-CH',
72
+ 'de-CH-LeniNeural': 'de-CH',
73
+ 'de-DE-AmalaNeural': 'de-DE',
74
+ 'de-DE-ConradNeural': 'de-DE',
75
+ 'de-DE-KatjaNeural': 'de-DE',
76
+ 'de-DE-KillianNeural': 'de-DE',
77
+ 'el-GR-AthinaNeural': 'el-GR',
78
+ 'el-GR-NestorasNeural': 'el-GR',
79
+ 'en-AU-NatashaNeural': 'en-AU',
80
+ 'en-AU-WilliamNeural': 'en-AU',
81
+ 'en-CA-ClaraNeural': 'en-CA',
82
+ 'en-CA-LiamNeural': 'en-CA',
83
+ 'en-GB-LibbyNeural': 'en-GB',
84
+ 'en-GB-MaisieNeural': 'en-GB',
85
+ 'en-GB-RyanNeural': 'en-GB',
86
+ 'en-GB-SoniaNeural': 'en-GB',
87
+ 'en-GB-ThomasNeural': 'en-GB',
88
+ 'en-HK-SamNeural': 'en-HK',
89
+ 'en-HK-YanNeural': 'en-HK',
90
+ 'en-IE-ConnorNeural': 'en-IE',
91
+ 'en-IE-EmilyNeural': 'en-IE',
92
+ 'en-IN-NeerjaNeural': 'en-IN',
93
+ 'en-IN-PrabhatNeural': 'en-IN',
94
+ 'en-KE-AsiliaNeural': 'en-KE',
95
+ 'en-KE-ChilembaNeural': 'en-KE',
96
+ 'en-NG-AbeoNeural': 'en-NG',
97
+ 'en-NG-EzinneNeural': 'en-NG',
98
+ 'en-NZ-MitchellNeural': 'en-NZ',
99
+ 'en-NZ-MollyNeural': 'en-NZ',
100
+ 'en-PH-JamesNeural': 'en-PH',
101
+ 'en-PH-RosaNeural': 'en-PH',
102
+ 'en-SG-LunaNeural': 'en-SG',
103
+ 'en-SG-WayneNeural': 'en-SG',
104
+ 'en-TZ-ElimuNeural': 'en-TZ',
105
+ 'en-TZ-ImaniNeural': 'en-TZ',
106
+ 'en-US-AnaNeural': 'en-US',
107
+ 'en-US-AriaNeural': 'en-US',
108
+ 'en-US-ChristopherNeural': 'en-US',
109
+ 'en-US-EricNeural': 'en-US',
110
+ 'en-US-GuyNeural': 'en-US',
111
+ 'en-US-JennyNeural': 'en-US',
112
+ 'en-US-MichelleNeural': 'en-US',
113
+ 'en-ZA-LeahNeural': 'en-ZA',
114
+ 'en-ZA-LukeNeural': 'en-ZA',
115
+ 'es-AR-ElenaNeural': 'es-AR',
116
+ 'es-AR-TomasNeural': 'es-AR',
117
+ 'es-BO-MarceloNeural': 'es-BO',
118
+ 'es-BO-SofiaNeural': 'es-BO',
119
+ 'es-CL-CatalinaNeural': 'es-CL',
120
+ 'es-CL-LorenzoNeural': 'es-CL',
121
+ 'es-CO-GonzaloNeural': 'es-CO',
122
+ 'es-CO-SalomeNeural': 'es-CO',
123
+ 'es-CR-JuanNeural': 'es-CR',
124
+ 'es-CR-MariaNeural': 'es-CR',
125
+ 'es-CU-BelkysNeural': 'es-CU',
126
+ 'es-CU-ManuelNeural': 'es-CU',
127
+ 'es-DO-EmilioNeural': 'es-DO',
128
+ 'es-DO-RamonaNeural': 'es-DO',
129
+ 'es-EC-AndreaNeural': 'es-EC',
130
+ 'es-EC-LuisNeural': 'es-EC',
131
+ 'es-ES-AlvaroNeural': 'es-ES',
132
+ 'es-ES-ElviraNeural': 'es-ES',
133
+ 'es-ES-ManuelEsCUNeural': 'es-ES',
134
+ 'es-GQ-JavierNeural': 'es-GQ',
135
+ 'es-GQ-TeresaNeural': 'es-GQ',
136
+ 'es-GT-AndresNeural': 'es-GT',
137
+ 'es-GT-MartaNeural': 'es-GT',
138
+ 'es-HN-CarlosNeural': 'es-HN',
139
+ 'es-HN-KarlaNeural': 'es-HN',
140
+ 'es-MX-DaliaNeural': 'es-MX',
141
+ 'es-MX-JorgeNeural': 'es-MX',
142
+ 'es-MX-LorenzoEsCLNeural': 'es-MX',
143
+ 'es-NI-FedericoNeural': 'es-NI',
144
+ 'es-NI-YolandaNeural': 'es-NI',
145
+ 'es-PA-MargaritaNeural': 'es-PA',
146
+ 'es-PA-RobertoNeural': 'es-PA',
147
+ 'es-PE-AlexNeural': 'es-PE',
148
+ 'es-PE-CamilaNeural': 'es-PE',
149
+ 'es-PR-KarinaNeural': 'es-PR',
150
+ 'es-PR-VictorNeural': 'es-PR',
151
+ 'es-PY-MarioNeural': 'es-PY',
152
+ 'es-PY-TaniaNeural': 'es-PY',
153
+ 'es-SV-LorenaNeural': 'es-SV',
154
+ 'es-SV-RodrigoNeural': 'es-SV',
155
+ 'es-US-AlonsoNeural': 'es-US',
156
+ 'es-US-PalomaNeural': 'es-US',
157
+ 'es-UY-MateoNeural': 'es-UY',
158
+ 'es-UY-ValentinaNeural': 'es-UY',
159
+ 'es-VE-PaolaNeural': 'es-VE',
160
+ 'es-VE-SebastianNeural': 'es-VE',
161
+ 'et-EE-AnuNeural': 'et-EE',
162
+ 'et-EE-KertNeural': 'et-EE',
163
+ 'fa-IR-DilaraNeural': 'fa-IR',
164
+ 'fa-IR-FaridNeural': 'fa-IR',
165
+ 'fi-FI-HarriNeural': 'fi-FI',
166
+ 'fi-FI-NooraNeural': 'fi-FI',
167
+ 'fil-PH-AngeloNeural': 'fil-PH',
168
+ 'fil-PH-BlessicaNeural': 'fil-PH',
169
+ 'fr-BE-CharlineNeural': 'fr-BE',
170
+ 'fr-BE-GerardNeural': 'fr-BE',
171
+ 'fr-CA-AntoineNeural': 'fr-CA',
172
+ 'fr-CA-JeanNeural': 'fr-CA',
173
+ 'fr-CA-SylvieNeural': 'fr-CA',
174
+ 'fr-CH-ArianeNeural': 'fr-CH',
175
+ 'fr-CH-FabriceNeural': 'fr-CH',
176
+ 'fr-FR-DeniseNeural': 'fr-FR',
177
+ 'fr-FR-EloiseNeural': 'fr-FR',
178
+ 'fr-FR-HenriNeural': 'fr-FR',
179
+ 'ga-IE-ColmNeural': 'ga-IE',
180
+ 'ga-IE-OrlaNeural': 'ga-IE',
181
+ 'gl-ES-RoiNeural': 'gl-ES',
182
+ 'gl-ES-SabelaNeural': 'gl-ES',
183
+ 'gu-IN-DhwaniNeural': 'gu-IN',
184
+ 'gu-IN-NiranjanNeural': 'gu-IN',
185
+ 'he-IL-AvriNeural': 'he-IL',
186
+ 'he-IL-HilaNeural': 'he-IL',
187
+ 'hi-IN-MadhurNeural': 'hi-IN',
188
+ 'hi-IN-SwaraNeural': 'hi-IN',
189
+ 'hr-HR-GabrijelaNeural': 'hr-HR',
190
+ 'hr-HR-SreckoNeural': 'hr-HR',
191
+ 'hu-HU-NoemiNeural': 'hu-HU',
192
+ 'hu-HU-TamasNeural': 'hu-HU',
193
+ 'id-ID-ArdiNeural': 'id-ID',
194
+ 'id-ID-GadisNeural': 'id-ID',
195
+ 'is-IS-GudrunNeural': 'is-IS',
196
+ 'is-IS-GunnarNeural': 'is-IS',
197
+ 'it-IT-DiegoNeural': 'it-IT',
198
+ 'it-IT-ElsaNeural': 'it-IT',
199
+ 'it-IT-IsabellaNeural': 'it-IT',
200
+ 'ja-JP-KeitaNeural': 'ja-JP',
201
+ 'ja-JP-NanamiNeural': 'ja-JP',
202
+ 'jv-ID-DimasNeural': 'jv-ID',
203
+ 'jv-ID-SitiNeural': 'jv-ID',
204
+ 'ka-GE-EkaNeural': 'ka-GE',
205
+ 'ka-GE-GiorgiNeural': 'ka-GE',
206
+ 'kk-KZ-AigulNeural': 'kk-KZ',
207
+ 'kk-KZ-DauletNeural': 'kk-KZ',
208
+ 'km-KH-PisethNeural': 'km-KH',
209
+ 'km-KH-SreymomNeural': 'km-KH',
210
+ 'kn-IN-GaganNeural': 'kn-IN',
211
+ 'kn-IN-SapnaNeural': 'kn-IN',
212
+ 'ko-KR-InJoonNeural': 'ko-KR',
213
+ 'ko-KR-SunHiNeural': 'ko-KR',
214
+ 'lo-LA-ChanthavongNeural': 'lo-LA',
215
+ 'lo-LA-KeomanyNeural': 'lo-LA',
216
+ 'lt-LT-LeonasNeural': 'lt-LT',
217
+ 'lt-LT-OnaNeural': 'lt-LT',
218
+ 'lv-LV-EveritaNeural': 'lv-LV',
219
+ 'lv-LV-NilsNeural': 'lv-LV',
220
+ 'mk-MK-AleksandarNeural': 'mk-MK',
221
+ 'mk-MK-MarijaNeural': 'mk-MK',
222
+ 'ml-IN-MidhunNeural': 'ml-IN',
223
+ 'ml-IN-SobhanaNeural': 'ml-IN',
224
+ 'mn-MN-BataaNeural': 'mn-MN',
225
+ 'mn-MN-YesuiNeural': 'mn-MN',
226
+ 'mr-IN-AarohiNeural': 'mr-IN',
227
+ 'mr-IN-ManoharNeural': 'mr-IN',
228
+ 'ms-MY-OsmanNeural': 'ms-MY',
229
+ 'ms-MY-YasminNeural': 'ms-MY',
230
+ 'mt-MT-GraceNeural': 'mt-MT',
231
+ 'mt-MT-JosephNeural': 'mt-MT',
232
+ 'my-MM-NilarNeural': 'my-MM',
233
+ 'my-MM-ThihaNeural': 'my-MM',
234
+ 'nb-NO-FinnNeural': 'nb-NO',
235
+ 'nb-NO-PernilleNeural': 'nb-NO',
236
+ 'ne-NP-HemkalaNeural': 'ne-NP',
237
+ 'ne-NP-SagarNeural': 'ne-NP',
238
+ 'nl-BE-ArnaudNeural': 'nl-BE',
239
+ 'nl-BE-DenaNeural': 'nl-BE',
240
+ 'nl-NL-ColetteNeural': 'nl-NL',
241
+ 'nl-NL-FennaNeural': 'nl-NL',
242
+ 'nl-NL-MaartenNeural': 'nl-NL',
243
+ 'pl-PL-MarekNeural': 'pl-PL',
244
+ 'pl-PL-ZofiaNeural': 'pl-PL',
245
+ 'ps-AF-GulNawazNeural': 'ps-AF',
246
+ 'ps-AF-LatifaNeural': 'ps-AF',
247
+ 'pt-BR-AntonioNeural': 'pt-BR',
248
+ 'pt-BR-FranciscaNeural': 'pt-BR',
249
+ 'pt-PT-DuarteNeural': 'pt-PT',
250
+ 'pt-PT-RaquelNeural': 'pt-PT',
251
+ 'ro-RO-AlinaNeural': 'ro-RO',
252
+ 'ro-RO-EmilNeural': 'ro-RO',
253
+ 'ru-RU-DmitryNeural': 'ru-RU',
254
+ 'ru-RU-SvetlanaNeural': 'ru-RU',
255
+ 'si-LK-SameeraNeural': 'si-LK',
256
+ 'si-LK-ThiliniNeural': 'si-LK',
257
+ 'sk-SK-LukasNeural': 'sk-SK',
258
+ 'sk-SK-ViktoriaNeural': 'sk-SK',
259
+ 'sl-SI-PetraNeural': 'sl-SI',
260
+ 'sl-SI-RokNeural': 'sl-SI',
261
+ 'so-SO-MuuseNeural': 'so-SO',
262
+ 'so-SO-UbaxNeural': 'so-SO',
263
+ 'sq-AL-AnilaNeural': 'sq-AL',
264
+ 'sq-AL-IlirNeural': 'sq-AL',
265
+ 'sr-RS-NicholasNeural': 'sr-RS',
266
+ 'sr-RS-SophieNeural': 'sr-RS',
267
+ 'su-ID-JajangNeural': 'su-ID',
268
+ 'su-ID-TutiNeural': 'su-ID',
269
+ 'sv-SE-MattiasNeural': 'sv-SE',
270
+ 'sv-SE-SofieNeural': 'sv-SE',
271
+ 'sw-KE-RafikiNeural': 'sw-KE',
272
+ 'sw-KE-ZuriNeural': 'sw-KE',
273
+ 'sw-TZ-DaudiNeural': 'sw-TZ',
274
+ 'sw-TZ-RehemaNeural': 'sw-TZ',
275
+ 'ta-IN-PallaviNeural': 'ta-IN',
276
+ 'ta-IN-ValluvarNeural': 'ta-IN',
277
+ 'ta-LK-KumarNeural': 'ta-LK',
278
+ 'ta-LK-SaranyaNeural': 'ta-LK',
279
+ 'ta-MY-KaniNeural': 'ta-MY',
280
+ 'ta-MY-SuryaNeural': 'ta-MY',
281
+ 'ta-SG-AnbuNeural': 'ta-SG',
282
+ 'ta-SG-VenbaNeural': 'ta-SG',
283
+ 'te-IN-MohanNeural': 'te-IN',
284
+ 'te-IN-ShrutiNeural': 'te-IN',
285
+ 'th-TH-NiwatNeural': 'th-TH',
286
+ 'th-TH-PremwadeeNeural': 'th-TH',
287
+ 'tr-TR-AhmetNeural': 'tr-TR',
288
+ 'tr-TR-EmelNeural': 'tr-TR',
289
+ 'uk-UA-OstapNeural': 'uk-UA',
290
+ 'uk-UA-PolinaNeural': 'uk-UA',
291
+ 'ur-IN-GulNeural': 'ur-IN',
292
+ 'ur-IN-SalmanNeural': 'ur-IN',
293
+ 'ur-PK-AsadNeural': 'ur-PK',
294
+ 'ur-PK-UzmaNeural': 'ur-PK',
295
+ 'uz-UZ-MadinaNeural': 'uz-UZ',
296
+ 'uz-UZ-SardorNeural': 'uz-UZ',
297
+ 'vi-VN-HoaiMyNeural': 'vi-VN',
298
+ 'vi-VN-NamMinhNeural': 'vi-VN',
299
+ 'zu-ZA-ThandoNeural': 'zu-ZA',
300
+ 'zu-ZA-ThembaNeural': 'zu-ZA',
301
+ }
302
+
303
+ SUPPORTED_LANGUAGES = [
304
+ "Auto",
305
+ *SUPPORTED_VOICES.keys()
306
+ ]
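
The table resolves a full neural voice name to its locale, and SUPPORTED_LANGUAGES simply prepends 'Auto' for automatic language detection; for example:

    from edgetts.tts_voices import SUPPORTED_LANGUAGES, SUPPORTED_VOICES

    print(SUPPORTED_VOICES['ja-JP-NanamiNeural'])  # -> 'ja-JP'
    print(SUPPORTED_LANGUAGES[0])                  # -> 'Auto'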
flask_api.py ADDED
@@ -0,0 +1,60 @@
1
+ import io
2
+ import logging
3
+
4
+ import soundfile
5
+ import torch
6
+ import torchaudio
7
+ from flask import Flask, request, send_file
8
+ from flask_cors import CORS
9
+
10
+ from inference.infer_tool import RealTimeVC, Svc
11
+
12
+ app = Flask(__name__)
13
+
14
+ CORS(app)
15
+
16
+ logging.getLogger('numba').setLevel(logging.WARNING)
17
+
18
+
19
+ @app.route("/voiceChangeModel", methods=["POST"])
20
+ def voice_change_model():
21
+ request_form = request.form
22
+ wave_file = request.files.get("sample", None)
23
+ # Pitch shift information
24
+ f_pitch_change = float(request_form.get("fPitchChange", 0))
25
+ # Sample rate required by the DAW
26
+ daw_sample = int(float(request_form.get("sampleRate", 0)))
27
+ speaker_id = int(float(request_form.get("sSpeakId", 0)))
28
+ # Get the wav file from the HTTP request and convert it
29
+ input_wav_path = io.BytesIO(wave_file.read())
30
+
31
+ # Model inference
32
+ if raw_infer:
33
+ # out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path)
34
+ out_audio, out_sr = svc_model.infer(speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
35
+ auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
36
+ tar_audio = torchaudio.functional.resample(out_audio, svc_model.target_sample, daw_sample)
37
+ else:
38
+ out_audio = svc.process(svc_model, speaker_id, f_pitch_change, input_wav_path, cluster_infer_ratio=0,
39
+ auto_predict_f0=False, noice_scale=0.4, f0_filter=False)
40
+ tar_audio = torchaudio.functional.resample(torch.from_numpy(out_audio), svc_model.target_sample, daw_sample)
41
+ # Return the audio
42
+ out_wav_path = io.BytesIO()
43
+ soundfile.write(out_wav_path, tar_audio.cpu().numpy(), daw_sample, format="wav")
44
+ out_wav_path.seek(0)
45
+ return send_file(out_wav_path, download_name="temp.wav", as_attachment=True)
46
+
47
+
48
+ if __name__ == '__main__':
49
+ # True: synthesize by direct slicing; False: cross-fade between chunks
50
+ # Setting the VST plugin's slice time to 0.3-0.5s lowers latency; direct slicing can pop at the seams, while cross-fading causes slight audible overlap
51
+ # Pick whichever method is acceptable, or raise the VST maximum slice time to 1s; it is set to True here: higher latency but more stable quality
52
+ raw_infer = True
53
+ # Each model corresponds to exactly one config
54
+ model_name = "logs/32k/G_174000-Copy1.pth"
55
+ config_name = "configs/config.json"
56
+ cluster_model_path = "logs/44k/kmeans_10000.pt"
57
+ svc_model = Svc(model_name, config_name, cluster_model_path=cluster_model_path)
58
+ svc = RealTimeVC()
59
+ # This port matches the VST plugin; changing it is not recommended
60
+ app.run(port=6842, host="0.0.0.0", debug=False, threaded=False)
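
A hypothetical client for the /voiceChangeModel endpoint above (the field names mirror the handler; input.wav is a placeholder):

    import requests

    with open('input.wav', 'rb') as f:
        r = requests.post(
            'http://127.0.0.1:6842/voiceChangeModel',
            files={'sample': ('input.wav', f, 'audio/wav')},
            data={'fPitchChange': 0, 'sampleRate': 44100, 'sSpeakId': 0},
        )
    with open('out.wav', 'wb') as f:
        f.write(r.content)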
flask_api_full_song.py ADDED
@@ -0,0 +1,55 @@
1
+ import io
2
+
3
+ import numpy as np
4
+ import soundfile
5
+ from flask import Flask, request, send_file
6
+
7
+ from inference import infer_tool, slicer
8
+
9
+ app = Flask(__name__)
10
+
11
+
12
+ @app.route("/wav2wav", methods=["POST"])
13
+ def wav2wav():
14
+ request_form = request.form
15
+ audio_path = request_form.get("audio_path", None) # Path to the wav file
16
+ tran = int(float(request_form.get("tran", 0))) # Pitch shift
17
+ spk = request_form.get("spk", 0) # Speaker (id or name, depending on your config)
18
+ wav_format = request_form.get("wav_format", 'wav') # Output file format
19
+ infer_tool.format_wav(audio_path)
20
+ chunks = slicer.cut(audio_path, db_thresh=-40)
21
+ audio_data, audio_sr = slicer.chunks2audio(audio_path, chunks)
22
+
23
+ audio = []
24
+ for (slice_tag, data) in audio_data:
25
+ print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
26
+
27
+ length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
28
+ if slice_tag:
29
+ print('jump empty segment')
30
+ _audio = np.zeros(length)
31
+ else:
32
+ # pad
33
+ pad_len = int(audio_sr * 0.5)
34
+ data = np.concatenate([np.zeros([pad_len]), data, np.zeros([pad_len])])
35
+ raw_path = io.BytesIO()
36
+ soundfile.write(raw_path, data, audio_sr, format="wav")
37
+ raw_path.seek(0)
38
+ out_audio, out_sr = svc_model.infer(spk, tran, raw_path)
39
+ svc_model.clear_empty()
40
+ _audio = out_audio.cpu().numpy()
41
+ pad_len = int(svc_model.target_sample * 0.5)
42
+ _audio = _audio[pad_len:-pad_len]
43
+
44
+ audio.extend(list(infer_tool.pad_array(_audio, length)))
45
+ out_wav_path = io.BytesIO()
46
+ soundfile.write(out_wav_path, audio, svc_model.target_sample, format=wav_format)
47
+ out_wav_path.seek(0)
48
+ return send_file(out_wav_path, download_name=f"temp.{wav_format}", as_attachment=True)
49
+
50
+
51
+ if __name__ == '__main__':
52
+ model_name = "logs/44k/G_60000.pth" # 模型地址
53
+ config_name = "configs/config.json" # config地址
54
+ svc_model = infer_tool.Svc(model_name, config_name)
55
+ app.run(port=1145, host="0.0.0.0", debug=False, threaded=False)
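
A hypothetical client for /wav2wav. Note that the handler opens audio_path on the server's local filesystem, so the file must already exist on the machine running this app (raw/song.wav and my_speaker are placeholders):

    import requests

    r = requests.post(
        'http://127.0.0.1:1145/wav2wav',
        data={'audio_path': 'raw/song.wav', 'tran': 0, 'spk': 'my_speaker', 'wav_format': 'wav'},
    )
    with open('out.wav', 'wb') as f:
        f.write(r.content)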
inference/infer_tool.py CHANGED
@@ -1,15 +1,16 @@
 
1
  import hashlib
2
  import io
3
  import json
4
  import logging
5
  import os
 
6
  import time
7
  from pathlib import Path
8
- from inference import slicer
9
- import gc
10
 
11
  import librosa
12
  import numpy as np
 
13
  # import onnxruntime
14
  import soundfile
15
  import torch
@@ -17,10 +18,9 @@ import torchaudio
17
 
18
  import cluster
19
  import utils
20
- from models import SynthesizerTrn
21
-
22
  from diffusion.unit2mel import load_model_vocoder
23
- import yaml
 
24
 
25
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
26
 
@@ -122,26 +122,27 @@ class Svc(object):
122
  diffusion_config_path="configs/diffusion.yaml",
123
  shallow_diffusion = False,
124
  only_diffusion = False,
 
 
125
  ):
126
  self.net_g_path = net_g_path
127
  self.only_diffusion = only_diffusion
128
  self.shallow_diffusion = shallow_diffusion
 
129
  if device is None:
130
  self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
131
- # self.dev = torch.device("cpu")
132
  else:
133
  self.dev = torch.device(device)
134
  self.net_g_ms = None
135
  if not self.only_diffusion:
136
- self.hps_ms = utils.get_hparams_from_file(config_path)
137
  self.target_sample = self.hps_ms.data.sampling_rate
138
  self.hop_size = self.hps_ms.data.hop_length
139
  self.spk2id = self.hps_ms.spk
140
- try:
141
- self.speech_encoder = self.hps_ms.model.speech_encoder
142
- except Exception as e:
143
- self.speech_encoder = 'vec768l12'
144
-
145
  self.nsf_hifigan_enhance = nsf_hifigan_enhance
146
  if self.shallow_diffusion or self.only_diffusion:
147
  if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_model_path):
@@ -151,13 +152,16 @@ class Svc(object):
151
  self.hop_size = self.diffusion_args.data.block_size
152
  self.spk2id = self.diffusion_args.spk
153
  self.speech_encoder = self.diffusion_args.data.encoder
 
 
 
154
  else:
155
  print("No diffusion model or config found. Shallow diffusion mode will False")
156
  self.shallow_diffusion = self.only_diffusion = False
157
 
158
  # load hubert and model
159
  if not self.only_diffusion:
160
- self.load_model()
161
  self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
162
  self.volume_extractor = utils.Volume_Extractor(self.hop_size)
163
  else:
@@ -165,13 +169,23 @@ class Svc(object):
165
  self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
166
 
167
  if os.path.exists(cluster_model_path):
168
- self.cluster_model = cluster.get_cluster_model(cluster_model_path)
169
- if self.shallow_diffusion : self.nsf_hifigan_enhance = False
170
  if self.nsf_hifigan_enhance:
171
  from modules.enhancer import Enhancer
172
  self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
173
 
174
- def load_model(self):
175
  # get model configuration
176
  self.net_g_ms = SynthesizerTrn(
177
  self.hps_ms.data.filter_length // 2 + 1,
@@ -182,8 +196,8 @@ class Svc(object):
182
  _ = self.net_g_ms.half().eval().to(self.dev)
183
  else:
184
  _ = self.net_g_ms.eval().to(self.dev)
185
-
186
-
187
 
188
  def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
189
 
@@ -202,12 +216,33 @@ class Svc(object):
202
  wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
203
  wav16k = torch.from_numpy(wav16k).to(self.dev)
204
  c = self.hubert_model.encoder(wav16k)
205
- c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
206
 
207
  if cluster_infer_ratio !=0:
208
- cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
209
- cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
210
- c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
211
 
212
  c = c.unsqueeze(0)
213
  return c, f0, uv
@@ -220,30 +255,47 @@ class Svc(object):
220
  f0_predictor='pm',
221
  enhancer_adaptive_key = 0,
222
  cr_threshold = 0.05,
223
- k_step = 100
 
 
 
 
224
  ):
225
-
226
- speaker_id = self.spk2id.get(speaker)
227
- if not speaker_id and type(speaker) is int:
228
- if len(self.spk2id.__dict__) >= speaker:
229
- speaker_id = speaker
230
- sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
231
  wav, sr = librosa.load(raw_path, sr=self.target_sample)
232
- c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
  if "half" in self.net_g_path and torch.cuda.is_available():
234
  c = c.half()
235
  with torch.no_grad():
236
  start = time.time()
 
237
  if not self.only_diffusion:
238
- audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)
 
239
  audio = audio[0,0].data.float()
240
- if self.shallow_diffusion:
241
- audio_mel = self.vocoder.extract(audio[None,:],self.target_sample)
242
  else:
243
  audio = torch.FloatTensor(wav).to(self.dev)
244
  audio_mel = None
245
  if self.only_diffusion or self.shallow_diffusion:
246
- vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev)
247
  f0 = f0[:,:,None]
248
  c = c.transpose(-1,-2)
249
  audio_mel = self.diffusion_model(
@@ -265,9 +317,11 @@ class Svc(object):
265
  f0[:,:,None],
266
  self.hps_ms.data.hop_length,
267
  adaptive_key = enhancer_adaptive_key)
 
 
268
  use_time = time.time() - start
269
  print("vits use time:{}".format(use_time))
270
- return audio, audio.shape[-1]
271
 
272
  def clear_empty(self):
273
  # clean up vram
@@ -298,8 +352,15 @@ class Svc(object):
298
  f0_predictor='pm',
299
  enhancer_adaptive_key = 0,
300
  cr_threshold = 0.05,
301
- k_step = 100
302
  ):
303
  wav_path = Path(raw_audio_path).with_suffix('.wav')
304
  chunks = slicer.cut(wav_path, db_thresh=slice_db)
305
  audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
@@ -309,7 +370,62 @@ class Svc(object):
309
  lg_size_c_l = (lg_size-lg_size_r)//2
310
  lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
311
  lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
312
-
313
  audio = []
314
  for (slice_tag, data) in audio_data:
315
  print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
@@ -319,6 +435,7 @@ class Svc(object):
319
  print('jump empty segment')
320
  _audio = np.zeros(length)
321
  audio.extend(list(pad_array(_audio, length)))
 
322
  continue
323
  if per_size != 0:
324
  datas = split_list_by_n(data, per_size,lg_size)
@@ -326,22 +443,28 @@ class Svc(object):
326
  datas = [data]
327
  for k,dat in enumerate(datas):
328
  per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
329
- if clip_seconds!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
 
330
  # pad
331
  pad_len = int(audio_sr * pad_seconds)
332
  dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
333
  raw_path = io.BytesIO()
334
  soundfile.write(raw_path, dat, audio_sr, format="wav")
335
  raw_path.seek(0)
336
- out_audio, out_sr = self.infer(spk, tran, raw_path,
337
  cluster_infer_ratio=cluster_infer_ratio,
338
  auto_predict_f0=auto_predict_f0,
339
  noice_scale=noice_scale,
340
  f0_predictor = f0_predictor,
341
  enhancer_adaptive_key = enhancer_adaptive_key,
342
  cr_threshold = cr_threshold,
343
- k_step = k_step
344
  )
 
345
  _audio = out_audio.cpu().numpy()
346
  pad_len = int(self.target_sample * pad_seconds)
347
  _audio = _audio[pad_len:-pad_len]
@@ -404,4 +527,4 @@ class RealTimeVC:
404
  self.last_chunk = audio[-self.pre_len:]
405
  self.last_o = audio
406
  return ret[self.chunk_len:2 * self.chunk_len]
407
-
 
1
+ import gc
2
  import hashlib
3
  import io
4
  import json
5
  import logging
6
  import os
7
+ import pickle
8
  import time
9
  from pathlib import Path
 
 
10
 
11
  import librosa
12
  import numpy as np
13
+
14
  # import onnxruntime
15
  import soundfile
16
  import torch
 
18
 
19
  import cluster
20
  import utils
 
 
21
  from diffusion.unit2mel import load_model_vocoder
22
+ from inference import slicer
23
+ from models import SynthesizerTrn
24
 
25
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
26
 
 
122
  diffusion_config_path="configs/diffusion.yaml",
123
  shallow_diffusion = False,
124
  only_diffusion = False,
125
+ spk_mix_enable = False,
126
+ feature_retrieval = False
127
  ):
128
  self.net_g_path = net_g_path
129
  self.only_diffusion = only_diffusion
130
  self.shallow_diffusion = shallow_diffusion
131
+ self.feature_retrieval = feature_retrieval
132
  if device is None:
133
  self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
134
  else:
135
  self.dev = torch.device(device)
136
  self.net_g_ms = None
137
  if not self.only_diffusion:
138
+ self.hps_ms = utils.get_hparams_from_file(config_path,True)
139
  self.target_sample = self.hps_ms.data.sampling_rate
140
  self.hop_size = self.hps_ms.data.hop_length
141
  self.spk2id = self.hps_ms.spk
142
+ self.unit_interpolate_mode = self.hps_ms.data.unit_interpolate_mode if self.hps_ms.data.unit_interpolate_mode is not None else 'left'
143
+ self.vol_embedding = self.hps_ms.model.vol_embedding if self.hps_ms.model.vol_embedding is not None else False
144
+ self.speech_encoder = self.hps_ms.model.speech_encoder if self.hps_ms.model.speech_encoder is not None else 'vec768l12'
145
+
 
146
  self.nsf_hifigan_enhance = nsf_hifigan_enhance
147
  if self.shallow_diffusion or self.only_diffusion:
148
  if os.path.exists(diffusion_model_path) and os.path.exists(diffusion_config_path):
 
152
  self.hop_size = self.diffusion_args.data.block_size
153
  self.spk2id = self.diffusion_args.spk
154
  self.speech_encoder = self.diffusion_args.data.encoder
155
+ self.unit_interpolate_mode = self.diffusion_args.data.unit_interpolate_mode if self.diffusion_args.data.unit_interpolate_mode is not None else 'left'
156
+ if spk_mix_enable:
157
+ self.diffusion_model.init_spkmix(len(self.spk2id))
158
  else:
159
  print("No diffusion model or config found. Shallow diffusion mode will False")
160
  self.shallow_diffusion = self.only_diffusion = False
161
 
162
  # load hubert and model
163
  if not self.only_diffusion:
164
+ self.load_model(spk_mix_enable)
165
  self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
166
  self.volume_extractor = utils.Volume_Extractor(self.hop_size)
167
  else:
 
169
  self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
170
 
171
  if os.path.exists(cluster_model_path):
172
+ if self.feature_retrieval:
173
+ with open(cluster_model_path,"rb") as f:
174
+ self.cluster_model = pickle.load(f)
175
+ self.big_npy = None
176
+ self.now_spk_id = -1
177
+ else:
178
+ self.cluster_model = cluster.get_cluster_model(cluster_model_path)
179
+ else:
180
+ self.feature_retrieval=False
181
+
182
+ if self.shallow_diffusion :
183
+ self.nsf_hifigan_enhance = False
184
  if self.nsf_hifigan_enhance:
185
  from modules.enhancer import Enhancer
186
  self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
187
 
188
+ def load_model(self, spk_mix_enable=False):
189
  # get model configuration
190
  self.net_g_ms = SynthesizerTrn(
191
  self.hps_ms.data.filter_length // 2 + 1,
 
196
  _ = self.net_g_ms.half().eval().to(self.dev)
197
  else:
198
  _ = self.net_g_ms.eval().to(self.dev)
199
+ if spk_mix_enable:
200
+ self.net_g_ms.EnableCharacterMix(len(self.spk2id), self.dev)
201
 
202
  def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):
203
 
 
216
  wav16k = librosa.resample(wav, orig_sr=self.target_sample, target_sr=16000)
217
  wav16k = torch.from_numpy(wav16k).to(self.dev)
218
  c = self.hubert_model.encoder(wav16k)
219
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
220
 
221
  if cluster_infer_ratio !=0:
222
+ if self.feature_retrieval:
223
+ speaker_id = self.spk2id.get(speaker)
224
+ if not speaker_id and type(speaker) is int:
225
+ if len(self.spk2id.__dict__) >= speaker:
226
+ speaker_id = speaker
227
+ if speaker_id is None:
228
+ raise RuntimeError("The name you entered is not in the speaker list!")
229
+ feature_index = self.cluster_model[speaker_id]
230
+ feat_np = c.transpose(0,1).cpu().numpy()
231
+ if self.big_npy is None or self.now_spk_id != speaker_id:
232
+ self.big_npy = feature_index.reconstruct_n(0, feature_index.ntotal)
233
+ self.now_spk_id = speaker_id
234
+ print("starting feature retrieval...")
235
+ score, ix = feature_index.search(feat_np, k=8)
236
+ weight = np.square(1 / score)
237
+ weight /= weight.sum(axis=1, keepdims=True)
238
+ npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
239
+ c = cluster_infer_ratio * npy + (1 - cluster_infer_ratio) * feat_np
240
+ c = torch.FloatTensor(c).to(self.dev).transpose(0,1)
241
+ print("end feature retrieval...")
242
+ else:
243
+ cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker).T
244
+ cluster_c = torch.FloatTensor(cluster_c).to(self.dev)
245
+ c = cluster_infer_ratio * cluster_c + (1 - cluster_infer_ratio) * c
246
 
247
  c = c.unsqueeze(0)
248
  return c, f0, uv
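The feature-retrieval branch above replaces each speech-encoder frame with an inverse-square-distance weighted mean of its 8 nearest neighbours in the per-speaker faiss index, then blends that back into the original features with cluster_infer_ratio. A minimal numpy sketch of the same weighting, with a brute-force neighbour search standing in for faiss's index.search (all names here are illustrative):

    import numpy as np

    def retrieval_blend(feats, codebook, ratio, k=8):
        # squared L2 distances, as faiss's IndexFlatL2 search would return
        dist = ((feats[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)   # [T, N]
        ix = np.argsort(dist, axis=1)[:, :k]                               # [T, k]
        score = np.take_along_axis(dist, ix, axis=1)
        weight = np.square(1 / (score + 1e-9))      # closer neighbours dominate
        weight /= weight.sum(axis=1, keepdims=True)
        retrieved = (codebook[ix] * weight[..., None]).sum(axis=1)         # [T, D]
        return ratio * retrieved + (1 - ratio) * feats

    feats = np.random.randn(10, 256).astype(np.float32)      # stand-in for feat_np
    codebook = np.random.randn(512, 256).astype(np.float32)  # stand-in for big_npy
    print(retrieval_blend(feats, codebook, ratio=0.5).shape)  # (10, 256)

The 1e-9 guard is an addition for the exact-match (zero-distance) case; the diff above divides by the raw faiss score.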
 
255
  f0_predictor='pm',
256
  enhancer_adaptive_key = 0,
257
  cr_threshold = 0.05,
258
+ k_step = 100,
259
+ frame = 0,
260
+ spk_mix = False,
261
+ second_encoding = False,
262
+ loudness_envelope_adjustment = 1
263
  ):
264
  wav, sr = librosa.load(raw_path, sr=self.target_sample)
265
+ if spk_mix:
266
+ c, f0, uv = self.get_unit_f0(wav, tran, 0, None, f0_filter,f0_predictor,cr_threshold=cr_threshold)
267
+ n_frames = f0.size(1)
268
+ sid = speaker[:, frame:frame+n_frames].transpose(0,1)
269
+ else:
270
+ speaker_id = self.spk2id.get(speaker)
271
+ if not speaker_id and type(speaker) is int:
272
+ if len(self.spk2id.__dict__) >= speaker:
273
+ speaker_id = speaker
274
+ if speaker_id is None:
275
+ raise RuntimeError("The name you entered is not in the speaker list!")
276
+ sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
277
+ c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter,f0_predictor,cr_threshold=cr_threshold)
278
+ n_frames = f0.size(1)
279
  if "half" in self.net_g_path and torch.cuda.is_available():
280
  c = c.half()
281
  with torch.no_grad():
282
  start = time.time()
283
+ vol = None
284
  if not self.only_diffusion:
285
+ vol = self.volume_extractor.extract(torch.FloatTensor(wav).to(self.dev)[None,:])[None,:].to(self.dev) if self.vol_embedding else None
286
+ audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale,vol=vol)
287
  audio = audio[0,0].data.float()
288
+ audio_mel = self.vocoder.extract(audio[None,:],self.target_sample) if self.shallow_diffusion else None
 
289
  else:
290
  audio = torch.FloatTensor(wav).to(self.dev)
291
  audio_mel = None
292
  if self.only_diffusion or self.shallow_diffusion:
293
+ vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev) if vol is None else vol[:,:,None]
294
+ if self.shallow_diffusion and second_encoding:
295
+ audio16k = librosa.resample(audio.detach().cpu().numpy(), orig_sr=self.target_sample, target_sr=16000)
296
+ audio16k = torch.from_numpy(audio16k).to(self.dev)
297
+ c = self.hubert_model.encoder(audio16k)
298
+ c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1],self.unit_interpolate_mode)
299
  f0 = f0[:,:,None]
300
  c = c.transpose(-1,-2)
301
  audio_mel = self.diffusion_model(
 
317
  f0[:,:,None],
318
  self.hps_ms.data.hop_length,
319
  adaptive_key = enhancer_adaptive_key)
320
+ if loudness_envelope_adjustment != 1:
321
+ audio = utils.change_rms(wav,self.target_sample,audio,self.target_sample,loudness_envelope_adjustment)
322
  use_time = time.time() - start
323
  print("vits use time:{}".format(use_time))
324
+ return audio, audio.shape[-1], n_frames
325
 
326
  def clear_empty(self):
327
  # clean up vram
 
352
  f0_predictor='pm',
353
  enhancer_adaptive_key = 0,
354
  cr_threshold = 0.05,
355
+ k_step = 100,
356
+ use_spk_mix = False,
357
+ second_encoding = False,
358
+ loudness_envelope_adjustment = 1
359
  ):
360
+ if use_spk_mix:
361
+ if len(self.spk2id) == 1:
362
+ spk = list(self.spk2id.keys())[0]
363
+ use_spk_mix = False
364
  wav_path = Path(raw_audio_path).with_suffix('.wav')
365
  chunks = slicer.cut(wav_path, db_thresh=slice_db)
366
  audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
 
370
  lg_size_c_l = (lg_size-lg_size_r)//2
371
  lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
372
  lg = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
373
+
374
+ if use_spk_mix:
375
+ assert len(self.spk2id) == len(spk)
376
+ audio_length = 0
377
+ for (slice_tag, data) in audio_data:
378
+ aud_length = int(np.ceil(len(data) / audio_sr * self.target_sample))
379
+ if slice_tag:
380
+ audio_length += aud_length // self.hop_size
381
+ continue
382
+ if per_size != 0:
383
+ datas = split_list_by_n(data, per_size,lg_size)
384
+ else:
385
+ datas = [data]
386
+ for k,dat in enumerate(datas):
387
+ pad_len = int(audio_sr * pad_seconds)
388
+ per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample))
389
+ a_length = per_length + 2 * pad_len
390
+ audio_length += a_length // self.hop_size
391
+ audio_length += len(audio_data)
392
+ spk_mix_tensor = torch.zeros(size=(len(spk), audio_length)).to(self.dev)
393
+ for i in range(len(spk)):
394
+ last_end = None
395
+ for mix in spk[i]:
396
+ if mix[3]<0. or mix[2]<0.:
397
+ raise RuntimeError("mix value must higer Than zero!")
398
+ begin = int(audio_length * mix[0])
399
+ end = int(audio_length * mix[1])
400
+ length = end - begin
401
+ if length<=0:
402
+ raise RuntimeError("begin Must lower Than end!")
403
+ step = (mix[3] - mix[2])/length
404
+ if last_end is not None:
405
+ if last_end != begin:
406
+ raise RuntimeError("[i]EndTime Must Equal [i+1]BeginTime!")
407
+ last_end = end
408
+ if step == 0.:
409
+ spk_mix_data = torch.zeros(length).to(self.dev) + mix[2]
410
+ else:
411
+ spk_mix_data = torch.arange(mix[2],mix[3],step).to(self.dev)
412
+ if(len(spk_mix_data)<length):
413
+ num_pad = length - len(spk_mix_data)
414
+ spk_mix_data = torch.nn.functional.pad(spk_mix_data, [0, num_pad], mode="reflect").to(self.dev)
415
+ spk_mix_tensor[i][begin:end] = spk_mix_data[:length]
416
+
417
+ spk_mix_ten = torch.sum(spk_mix_tensor,dim=0).unsqueeze(0).to(self.dev)
418
+ # spk_mix_tensor[0][spk_mix_ten<0.001] = 1.0
419
+ for i, x in enumerate(spk_mix_ten[0]):
420
+ if x == 0.0:
421
+ spk_mix_ten[0][i] = 1.0
422
+ spk_mix_tensor[:,i] = 1.0 / len(spk)
423
+ spk_mix_tensor = spk_mix_tensor / spk_mix_ten
424
+ if not (torch.abs(torch.sum(spk_mix_tensor,dim=0) - 1.) < 0.0001).all():
425
+ raise RuntimeError("sum(spk_mix_tensor) does not equal 1")
426
+ spk = spk_mix_tensor
427
+
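For readers of the block above: each entry of spk (one mix track per speaker) is expected to be a list of [begin, end, start_value, end_value] segments, where begin/end are fractions of the total frame count and consecutive segments must share a boundary; the final division by spk_mix_ten renormalizes so the per-frame weights across speakers sum to 1. A hypothetical two-speaker layout in that shape (values purely illustrative; the spk_mix_map imported by inference_main.py further below follows this format):

    spk_mix_map = {
        0: [[0.0, 0.5, 1.0, 0.5], [0.5, 1.0, 0.5, 1.0]],  # speaker 0 dips mid-song
        1: [[0.0, 0.5, 0.0, 0.5], [0.5, 1.0, 0.5, 0.0]],  # speaker 1 peaks mid-song
    }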
428
+ global_frame = 0
429
  audio = []
430
  for (slice_tag, data) in audio_data:
431
  print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
 
435
  print('jump empty segment')
436
  _audio = np.zeros(length)
437
  audio.extend(list(pad_array(_audio, length)))
438
+ global_frame += length // self.hop_size
439
  continue
440
  if per_size != 0:
441
  datas = split_list_by_n(data, per_size,lg_size)
 
443
  datas = [data]
444
  for k,dat in enumerate(datas):
445
  per_length = int(np.ceil(len(dat) / audio_sr * self.target_sample)) if clip_seconds!=0 else length
446
+ if clip_seconds!=0:
447
+ print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
448
  # pad
449
  pad_len = int(audio_sr * pad_seconds)
450
  dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
451
  raw_path = io.BytesIO()
452
  soundfile.write(raw_path, dat, audio_sr, format="wav")
453
  raw_path.seek(0)
454
+ out_audio, out_sr, out_frame = self.infer(spk, tran, raw_path,
455
  cluster_infer_ratio=cluster_infer_ratio,
456
  auto_predict_f0=auto_predict_f0,
457
  noice_scale=noice_scale,
458
  f0_predictor = f0_predictor,
459
  enhancer_adaptive_key = enhancer_adaptive_key,
460
  cr_threshold = cr_threshold,
461
+ k_step = k_step,
462
+ frame = global_frame,
463
+ spk_mix = use_spk_mix,
464
+ second_encoding = second_encoding,
465
+ loudness_envelope_adjustment = loudness_envelope_adjustment
466
  )
467
+ global_frame += out_frame
468
  _audio = out_audio.cpu().numpy()
469
  pad_len = int(self.target_sample * pad_seconds)
470
  _audio = _audio[pad_len:-pad_len]
 
527
  self.last_chunk = audio[-self.pre_len:]
528
  self.last_o = audio
529
  return ret[self.chunk_len:2 * self.chunk_len]
530
+
inference/infer_tool_grad.py CHANGED
@@ -1,26 +1,21 @@
1
- import hashlib
2
- import json
3
  import logging
4
  import os
5
- import time
6
- from pathlib import Path
7
- import io
8
  import librosa
9
- import maad
10
  import numpy as np
11
- from inference import slicer
12
  import parselmouth
13
  import soundfile
14
  import torch
15
  import torchaudio
16
 
17
- # from hubert import hubert_model
18
  import utils
 
19
  from models import SynthesizerTrn
 
20
  logging.getLogger('numba').setLevel(logging.WARNING)
21
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
 
23
-
24
  def resize2d_f0(x, target_len):
25
  source = np.array(x)
26
  source[source < 0.001] = np.nan
@@ -29,8 +24,7 @@ def resize2d_f0(x, target_len):
29
  res = np.nan_to_num(target)
30
  return res
31
 
32
-
33
- def get_f0(x, p_len, f0_up_key=0):
34
 
35
  time_step = 160 / 16000 * 1000
36
  f0_min = 50
@@ -42,21 +36,18 @@ def get_f0(x, p_len, f0_up_key=0):
42
  time_step=time_step / 1000, voicing_threshold=0.6,
43
  pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
44
 
45
- pad_size = (p_len - len(f0) + 1) // 2
46
- if(pad_size > 0 or p_len - len(f0) - pad_size > 0):
47
- f0 = np.pad(
48
- f0, [[pad_size, p_len - len(f0) - pad_size]], mode='constant')
49
 
50
  f0 *= pow(2, f0_up_key / 12)
51
  f0_mel = 1127 * np.log(1 + f0 / 700)
52
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * \
53
- 254 / (f0_mel_max - f0_mel_min) + 1
54
  f0_mel[f0_mel <= 1] = 1
55
  f0_mel[f0_mel > 255] = 255
56
  f0_coarse = np.rint(f0_mel).astype(np.int)
57
  return f0_coarse, f0
58
 
59
-
60
  def clean_pitch(input_pitch):
61
  num_nan = np.sum(input_pitch == 1)
62
  if num_nan / len(input_pitch) > 0.9:
@@ -89,8 +80,7 @@ def mkdir(paths: list):
89
 
90
  class VitsSvc(object):
91
  def __init__(self):
92
- self.device = torch.device(
93
- "cuda" if torch.cuda.is_available() else "cpu")
94
  self.SVCVITS = None
95
  self.hps = None
96
  self.speakers = None
@@ -99,18 +89,16 @@ class VitsSvc(object):
99
  def set_device(self, device):
100
  self.device = torch.device(device)
101
  self.hubert_soft.to(self.device)
102
- if self.SVCVITS != None:
103
  self.SVCVITS.to(self.device)
104
 
105
  def loadCheckpoint(self, path):
106
- self.hps = utils.get_hparams_from_file(
107
- f"checkpoints/{path}/config.json")
108
  self.SVCVITS = SynthesizerTrn(
109
  self.hps.data.filter_length // 2 + 1,
110
  self.hps.train.segment_size // self.hps.data.hop_length,
111
  **self.hps.model)
112
- _ = utils.load_checkpoint(
113
- f"checkpoints/{path}/model.pth", self.SVCVITS, None)
114
  _ = self.SVCVITS.eval().to(self.device)
115
  self.speakers = self.hps.spk
116
 
@@ -120,6 +108,7 @@ class VitsSvc(object):
120
  units = self.hubert_soft.units(source)
121
  return units
122
 
 
123
  def get_unit_pitch(self, in_path, tran):
124
  source, sr = torchaudio.load(in_path)
125
  source = torchaudio.functional.resample(source, sr, 16000)
@@ -137,27 +126,23 @@ class VitsSvc(object):
137
  stn_tst = torch.FloatTensor(soft)
138
  with torch.no_grad():
139
  x_tst = stn_tst.unsqueeze(0).to(self.device)
140
- x_tst = torch.repeat_interleave(
141
- x_tst, repeats=2, dim=1).transpose(1, 2)
142
- audio, _ = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[
143
- 0, 0].data.float()
144
  return audio, audio.shape[-1]
145
 
146
- def inference(self, srcaudio, chara, tran, slice_db):
147
  sampling_rate, audio = srcaudio
148
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
149
  if len(audio.shape) > 1:
150
  audio = librosa.to_mono(audio.transpose(1, 0))
151
  if sampling_rate != 16000:
152
- audio = librosa.resample(
153
- audio, orig_sr=sampling_rate, target_sr=16000)
154
  soundfile.write("tmpwav.wav", audio, 16000, format="wav")
155
  chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
156
  audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
157
  audio = []
158
  for (slice_tag, data) in audio_data:
159
- length = int(np.ceil(len(data) / audio_sr *
160
- self.hps.data.sampling_rate))
161
  raw_path = io.BytesIO()
162
  soundfile.write(raw_path, data, audio_sr, format="wav")
163
  raw_path.seek(0)
@@ -168,4 +153,4 @@ class VitsSvc(object):
168
  _audio = out_audio.cpu().numpy()
169
  audio.extend(list(_audio))
170
  audio = (np.array(audio) * 32768.0).astype('int16')
171
- return (self.hps.data.sampling_rate, audio)
 
1
+ import io
 
2
  import logging
3
  import os
4
+
 
 
5
  import librosa
 
6
  import numpy as np
 
7
  import parselmouth
8
  import soundfile
9
  import torch
10
  import torchaudio
11
 
 
12
  import utils
13
+ from inference import slicer
14
  from models import SynthesizerTrn
15
+
16
  logging.getLogger('numba').setLevel(logging.WARNING)
17
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
18
 
 
19
  def resize2d_f0(x, target_len):
20
  source = np.array(x)
21
  source[source < 0.001] = np.nan
 
24
  res = np.nan_to_num(target)
25
  return res
26
 
27
+ def get_f0(x, p_len,f0_up_key=0):
 
28
 
29
  time_step = 160 / 16000 * 1000
30
  f0_min = 50
 
36
  time_step=time_step / 1000, voicing_threshold=0.6,
37
  pitch_floor=f0_min, pitch_ceiling=f0_max).selected_array['frequency']
38
 
39
+ pad_size=(p_len - len(f0) + 1) // 2
40
+ if(pad_size>0 or p_len - len(f0) - pad_size>0):
41
+ f0 = np.pad(f0,[[pad_size,p_len - len(f0) - pad_size]], mode='constant')
 
42
 
43
  f0 *= pow(2, f0_up_key / 12)
44
  f0_mel = 1127 * np.log(1 + f0 / 700)
45
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
 
46
  f0_mel[f0_mel <= 1] = 1
47
  f0_mel[f0_mel > 255] = 255
48
  f0_coarse = np.rint(f0_mel).astype(int)
49
  return f0_coarse, f0
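get_f0 above quantizes pitch onto a 255-bin mel scale: f0_mel = 1127·ln(1 + f0/700), then an affine map sends [f0_mel_min, f0_mel_max] to bins 1..255, with bin 1 doubling as the unvoiced bin. A standalone check of that mapping, assuming the 50/1100 Hz bounds used in this file:

    import numpy as np

    f0_min, f0_max = 50, 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = np.array([0.0, 100.0, 440.0, 1100.0])
    f0_mel = 1127 * np.log(1 + f0 / 700)
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (f0_mel_max - f0_mel_min) + 1
    f0_mel[f0_mel <= 1] = 1      # unvoiced frames collapse to bin 1
    f0_mel[f0_mel > 255] = 255
    print(np.rint(f0_mel).astype(int))  # [  1  20 122 255]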
50
 
 
51
  def clean_pitch(input_pitch):
52
  num_nan = np.sum(input_pitch == 1)
53
  if num_nan / len(input_pitch) > 0.9:
 
80
 
81
  class VitsSvc(object):
82
  def __init__(self):
83
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
84
  self.SVCVITS = None
85
  self.hps = None
86
  self.speakers = None
 
89
  def set_device(self, device):
90
  self.device = torch.device(device)
91
  self.hubert_soft.to(self.device)
92
+ if self.SVCVITS is not None:
93
  self.SVCVITS.to(self.device)
94
 
95
  def loadCheckpoint(self, path):
96
+ self.hps = utils.get_hparams_from_file(f"checkpoints/{path}/config.json")
 
97
  self.SVCVITS = SynthesizerTrn(
98
  self.hps.data.filter_length // 2 + 1,
99
  self.hps.train.segment_size // self.hps.data.hop_length,
100
  **self.hps.model)
101
+ _ = utils.load_checkpoint(f"checkpoints/{path}/model.pth", self.SVCVITS, None)
 
102
  _ = self.SVCVITS.eval().to(self.device)
103
  self.speakers = self.hps.spk
104
 
 
108
  units = self.hubert_soft.units(source)
109
  return units
110
 
111
+
112
  def get_unit_pitch(self, in_path, tran):
113
  source, sr = torchaudio.load(in_path)
114
  source = torchaudio.functional.resample(source, sr, 16000)
 
126
  stn_tst = torch.FloatTensor(soft)
127
  with torch.no_grad():
128
  x_tst = stn_tst.unsqueeze(0).to(self.device)
129
+ x_tst = torch.repeat_interleave(x_tst, repeats=2, dim=1).transpose(1, 2)
130
+ audio,_ = self.SVCVITS.infer(x_tst, f0=f0, g=sid)[0,0].data.float()
 
 
131
  return audio, audio.shape[-1]
132
 
133
+ def inference(self,srcaudio,chara,tran,slice_db):
134
  sampling_rate, audio = srcaudio
135
  audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
136
  if len(audio.shape) > 1:
137
  audio = librosa.to_mono(audio.transpose(1, 0))
138
  if sampling_rate != 16000:
139
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
 
140
  soundfile.write("tmpwav.wav", audio, 16000, format="wav")
141
  chunks = slicer.cut("tmpwav.wav", db_thresh=slice_db)
142
  audio_data, audio_sr = slicer.chunks2audio("tmpwav.wav", chunks)
143
  audio = []
144
  for (slice_tag, data) in audio_data:
145
+ length = int(np.ceil(len(data) / audio_sr * self.hps.data.sampling_rate))
 
146
  raw_path = io.BytesIO()
147
  soundfile.write(raw_path, data, audio_sr, format="wav")
148
  raw_path.seek(0)
 
153
  _audio = out_audio.cpu().numpy()
154
  audio.extend(list(_audio))
155
  audio = (np.array(audio) * 32768.0).astype('int16')
156
+ return (self.hps.data.sampling_rate,audio)
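VitsSvc.inference consumes the (sampling_rate, integer PCM) tuple that a Gradio Audio component yields, normalizes it to float32 mono at 16 kHz, slices on silence, and converts slice by slice. A hedged driver sketch (checkpoint name and speaker are hypothetical; loadCheckpoint reads checkpoints/<name>/config.json and checkpoints/<name>/model.pth as shown above):

    import soundfile

    svc = VitsSvc()
    svc.loadCheckpoint("my_model")  # hypothetical dir under checkpoints/
    # assumes the hubert encoder (svc.hubert_soft) was loaded elsewhere in the class
    pcm, sr = soundfile.read("input.wav", dtype="int16")
    out_sr, out = svc.inference((sr, pcm), chara="speaker0", tran=0, slice_db=-40)
    soundfile.write("out.wav", out, out_sr)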
inference_main.py ADDED
@@ -0,0 +1,155 @@
1
+ import logging
2
+
3
+ import soundfile
4
+
5
+ from inference import infer_tool
6
+ from inference.infer_tool import Svc
7
+ from spkmix import spk_mix_map
8
+
9
+ logging.getLogger('numba').setLevel(logging.WARNING)
10
+ chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
11
+
12
+
13
+
14
+ def main():
15
+ import argparse
16
+
17
+ parser = argparse.ArgumentParser(description='sovits4 inference')
18
+
19
+ # Required settings
20
+ parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='path to the model')
21
+ parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='path to the config file')
22
+ parser.add_argument('-cl', '--clip', type=float, default=0, help='force-slice the audio every N seconds; 0 (default) means automatic slicing')
23
+ parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='list of wav file names, placed under the raw folder')
24
+ parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='pitch shift in semitones, positive or negative')
25
+ parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='target speaker name(s) for synthesis')
26
+
27
+ # Optional settings
28
+ parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='automatically predict pitch for speech conversion; do not enable this for singing voice or it will go badly off-key')
29
+ parser.add_argument('-cm', '--cluster_model_path', type=str, default="", help='path to the cluster model or feature-retrieval index; if empty, the default path of the chosen scheme is used; fill in anything if neither was trained')
30
+ parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='ratio of clustering or feature retrieval, range 0-1; keep the default 0 if neither was trained')
31
+ parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='crossfade length between two audio slices, in seconds; raise it if forced slicing makes the vocals discontinuous, otherwise keep the default 0')
32
+ parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor: one of crepe, pm, dio, harvest; default pm (note: crepe applies a mean filter to the raw F0)')
33
+ parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='whether to use the NSF_HIFIGAN enhancer; it can improve quality for models trained on little data but hurts well-trained models; off by default')
34
+ parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='whether to use shallow diffusion, which can fix some metallic artifacts; off by default, and enabling it disables the NSF_HIFIGAN enhancer')
35
+ parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='whether to use speaker mixing')
36
+ parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1, help='blend ratio for replacing the output loudness envelope with the input one; the closer to 1, the more the output envelope is used')
37
+ parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False, help='whether to use feature retrieval; when enabled the cluster model is disabled, and cm and cr become the retrieval index path and blend ratio')
38
+
39
+ # Shallow diffusion settings
40
+ parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='path to the diffusion model')
41
+ parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='path to the diffusion model config file')
42
+ parser.add_argument('-ks', '--k_step', type=int, default=100, help='number of diffusion steps; the larger, the closer to the pure diffusion result; default 100')
43
+ parser.add_argument('-se', '--second_encoding', action='store_true', default=False, help='second encoding: re-encode the raw audio before shallow diffusion; a black-box option that sometimes helps and sometimes hurts')
44
+ parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='diffusion-only mode: do not load the sovits model, infer with the diffusion model alone')
45
+
46
+
47
+ # Settings you normally do not need to touch
48
+ parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='default -40; -30 for noisy audio, -50 to keep breaths in dry vocals')
49
+ parser.add_argument('-d', '--device', type=str, default=None, help='inference device; None selects cpu or gpu automatically')
50
+ parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='noise scale; affects articulation and audio quality, somewhat arcane')
51
+ parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='seconds of silence to pad around each segment; for unknown reasons artifacts appear at the start and end, and a short silent pad removes them')
52
+ parser.add_argument('-wf', '--wav_format', type=str, default='flac', help='audio output format')
53
+ parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='after automatic slicing the head and tail of each slice are discarded; proportion of the crossfade length to keep, range (0, 1]')
54
+ parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='adapt the enhancer to a higher pitch range, in semitones; default 0')
55
+ parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0 filter threshold, only effective with crepe; range 0-1; lowering it reduces off-key notes but increases voiceless frames')
56
+
57
+
58
+ args = parser.parse_args()
59
+
60
+ clean_names = args.clean_names
61
+ trans = args.trans
62
+ spk_list = args.spk_list
63
+ slice_db = args.slice_db
64
+ wav_format = args.wav_format
65
+ auto_predict_f0 = args.auto_predict_f0
66
+ cluster_infer_ratio = args.cluster_infer_ratio
67
+ noice_scale = args.noice_scale
68
+ pad_seconds = args.pad_seconds
69
+ clip = args.clip
70
+ lg = args.linear_gradient
71
+ lgr = args.linear_gradient_retain
72
+ f0p = args.f0_predictor
73
+ enhance = args.enhance
74
+ enhancer_adaptive_key = args.enhancer_adaptive_key
75
+ cr_threshold = args.f0_filter_threshold
76
+ diffusion_model_path = args.diffusion_model_path
77
+ diffusion_config_path = args.diffusion_config_path
78
+ k_step = args.k_step
79
+ only_diffusion = args.only_diffusion
80
+ shallow_diffusion = args.shallow_diffusion
81
+ use_spk_mix = args.use_spk_mix
82
+ second_encoding = args.second_encoding
83
+ loudness_envelope_adjustment = args.loudness_envelope_adjustment
84
+
85
+ if cluster_infer_ratio != 0:
86
+ if args.cluster_model_path == "":
87
+ if args.feature_retrieval: # a ratio was given but no model path, so pick the default path based on whether feature retrieval is used
88
+ args.cluster_model_path = "logs/44k/feature_and_index.pkl"
89
+ else:
90
+ args.cluster_model_path = "logs/44k/kmeans_10000.pt"
91
+ else: # no ratio was given, so clear the path regardless, to avoid loading the model later
92
+ args.cluster_model_path = ""
93
+
94
+ svc_model = Svc(args.model_path,
95
+ args.config_path,
96
+ args.device,
97
+ args.cluster_model_path,
98
+ enhance,
99
+ diffusion_model_path,
100
+ diffusion_config_path,
101
+ shallow_diffusion,
102
+ only_diffusion,
103
+ use_spk_mix,
104
+ args.feature_retrieval)
105
+
106
+ infer_tool.mkdir(["raw", "results"])
107
+
108
+ if len(spk_mix_map)<=1:
109
+ use_spk_mix = False
110
+ if use_spk_mix:
111
+ spk_list = [spk_mix_map]
112
+
113
+ infer_tool.fill_a_to_b(trans, clean_names)
114
+ for clean_name, tran in zip(clean_names, trans):
115
+ raw_audio_path = f"raw/{clean_name}"
116
+ if "." not in raw_audio_path:
117
+ raw_audio_path += ".wav"
118
+ infer_tool.format_wav(raw_audio_path)
119
+ for spk in spk_list:
120
+ kwarg = {
121
+ "raw_audio_path" : raw_audio_path,
122
+ "spk" : spk,
123
+ "tran" : tran,
124
+ "slice_db" : slice_db,
125
+ "cluster_infer_ratio" : cluster_infer_ratio,
126
+ "auto_predict_f0" : auto_predict_f0,
127
+ "noice_scale" : noice_scale,
128
+ "pad_seconds" : pad_seconds,
129
+ "clip_seconds" : clip,
130
+ "lg_num": lg,
131
+ "lgr_num" : lgr,
132
+ "f0_predictor" : f0p,
133
+ "enhancer_adaptive_key" : enhancer_adaptive_key,
134
+ "cr_threshold" : cr_threshold,
135
+ "k_step":k_step,
136
+ "use_spk_mix":use_spk_mix,
137
+ "second_encoding":second_encoding,
138
+ "loudness_envelope_adjustment":loudness_envelope_adjustment
139
+ }
140
+ audio = svc_model.slice_inference(**kwarg)
141
+ key = "auto" if auto_predict_f0 else f"{tran}key"
142
+ cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
143
+ isdiffusion = "sovits"
144
+ if shallow_diffusion :
145
+ isdiffusion = "sovdiff"
146
+ if only_diffusion :
147
+ isdiffusion = "diff"
148
+ if use_spk_mix:
149
+ spk = "spk_mix"
150
+ res_path = f'results/{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}_{f0p}.{wav_format}'
151
+ soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
152
+ svc_model.clear_empty()
153
+
154
+ if __name__ == '__main__':
155
+ main()
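A typical invocation of the script above, relying on its own defaults for everything else (file and speaker names below are illustrative placeholders):

    python inference_main.py -m logs/44k/G_37600.pth -c logs/44k/config.json -n song-src.wav -t 0 -s buyizi -f0p dio

The wav is read from raw/song-src.wav, and the result lands in results/ with the key, speaker, diffusion mode, and F0 predictor encoded in the file name.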
modules/DSConv.py ADDED
@@ -0,0 +1,76 @@
1
+ import torch.nn as nn
2
+ from torch.nn.utils import remove_weight_norm, weight_norm
3
+
4
+
5
+ class Depthwise_Separable_Conv1D(nn.Module):
6
+ def __init__(
7
+ self,
8
+ in_channels,
9
+ out_channels,
10
+ kernel_size,
11
+ stride = 1,
12
+ padding = 0,
13
+ dilation = 1,
14
+ bias = True,
15
+ padding_mode = 'zeros', # TODO: refine this type
16
+ device=None,
17
+ dtype=None
18
+ ):
19
+ super().__init__()
20
+ self.depth_conv = nn.Conv1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype)
21
+ self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype)
22
+
23
+ def forward(self, input):
24
+ return self.point_conv(self.depth_conv(input))
25
+
26
+ def weight_norm(self):
27
+ self.depth_conv = weight_norm(self.depth_conv, name = 'weight')
28
+ self.point_conv = weight_norm(self.point_conv, name = 'weight')
29
+
30
+ def remove_weight_norm(self):
31
+ self.depth_conv = remove_weight_norm(self.depth_conv, name = 'weight')
32
+ self.point_conv = remove_weight_norm(self.point_conv, name = 'weight')
33
+
34
+ class Depthwise_Separable_TransposeConv1D(nn.Module):
35
+ def __init__(
36
+ self,
37
+ in_channels,
38
+ out_channels,
39
+ kernel_size,
40
+ stride = 1,
41
+ padding = 0,
42
+ output_padding = 0,
43
+ bias = True,
44
+ dilation = 1,
45
+ padding_mode = 'zeros', # TODO: refine this type
46
+ device=None,
47
+ dtype=None
48
+ ):
49
+ super().__init__()
50
+ self.depth_conv = nn.ConvTranspose1d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, groups=in_channels,stride = stride,output_padding=output_padding,padding=padding,dilation=dilation,bias=bias,padding_mode=padding_mode,device=device,dtype=dtype)
51
+ self.point_conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=1, bias=bias, device=device,dtype=dtype)
52
+
53
+ def forward(self, input):
54
+ return self.point_conv(self.depth_conv(input))
55
+
56
+ def weight_norm(self):
57
+ self.depth_conv = weight_norm(self.depth_conv, name = 'weight')
58
+ self.point_conv = weight_norm(self.point_conv, name = 'weight')
59
+
60
+ def remove_weight_norm(self):
61
+ remove_weight_norm(self.depth_conv, name = 'weight')
62
+ remove_weight_norm(self.point_conv, name = 'weight')
63
+
64
+
65
+ def weight_norm_modules(module, name = 'weight', dim = 0):
66
+ if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D):
67
+ module.weight_norm()
68
+ return module
69
+ else:
70
+ return weight_norm(module,name,dim)
71
+
72
+ def remove_weight_norm_modules(module, name = 'weight'):
73
+ if isinstance(module,Depthwise_Separable_Conv1D) or isinstance(module,Depthwise_Separable_TransposeConv1D):
74
+ module.remove_weight_norm()
75
+ else:
76
+ remove_weight_norm(module,name)
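A quick sanity check of the module above: a depthwise-separable 1-D convolution factors a dense conv into a per-channel depth conv plus a 1x1 point conv, trading a little expressiveness for a much smaller parameter count at the same output shape. Sketch (channel and length values are illustrative):

    import torch
    import torch.nn as nn
    from modules.DSConv import Depthwise_Separable_Conv1D

    dense = nn.Conv1d(192, 192, kernel_size=5, padding=2)
    separable = Depthwise_Separable_Conv1D(192, 192, kernel_size=5, padding=2)

    count = lambda m: sum(p.numel() for p in m.parameters())
    print(count(dense), count(separable))  # 184512 vs 38208

    x = torch.randn(1, 192, 100)
    assert dense(x).shape == separable(x).shape == (1, 192, 100)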
modules/F0Predictor/CrepeF0Predictor.py CHANGED
@@ -1,7 +1,9 @@
1
- from modules.F0Predictor.F0Predictor import F0Predictor
2
- from modules.F0Predictor.crepe import CrepePitchExtractor
3
  import torch
4
5
  class CrepeF0Predictor(F0Predictor):
6
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"):
7
  self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model)
 
 
 
1
  import torch
2
 
3
+ from modules.F0Predictor.crepe import CrepePitchExtractor
4
+ from modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
  class CrepeF0Predictor(F0Predictor):
8
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,device=None,sampling_rate=44100,threshold=0.05,model="full"):
9
  self.F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=threshold,model=model)
modules/F0Predictor/DioF0Predictor.py CHANGED
@@ -1,6 +1,8 @@
1
- from modules.F0Predictor.F0Predictor import F0Predictor
2
- import pyworld
3
  import numpy as np
4
 
5
  class DioF0Predictor(F0Predictor):
6
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
@@ -13,39 +15,25 @@ class DioF0Predictor(F0Predictor):
13
  '''
14
  Interpolate the F0
15
  '''
16
 
17
- data = np.reshape(f0, (f0.size, 1))
18
-
19
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
20
- vuv_vector[data > 0.0] = 1.0
21
- vuv_vector[data <= 0.0] = 0.0
22
-
23
- ip_data = data
24
-
25
- frame_number = data.size
26
- last_value = 0.0
27
- for i in range(frame_number):
28
- if data[i] <= 0.0:
29
- j = i + 1
30
- for j in range(i + 1, frame_number):
31
- if data[j] > 0.0:
32
- break
33
- if j < frame_number - 1:
34
- if last_value > 0.0:
35
- step = (data[j] - data[i - 1]) / float(j - i)
36
- for k in range(i, j):
37
- ip_data[k] = data[i - 1] + step * (k - i + 1)
38
- else:
39
- for k in range(i, j):
40
- ip_data[k] = data[j]
41
- else:
42
- for k in range(i, frame_number):
43
- ip_data[k] = last_value
44
- else:
45
- ip_data[i] = data[i]  # possibly an unnecessary copy here
46
- last_value = data[i]
47
-
48
- return ip_data[:,0], vuv_vector[:,0]
49
 
50
  def resize_f0(self,x, target_len):
51
  source = np.array(x)
 
 
 
1
  import numpy as np
2
+ import pyworld
3
+
4
+ from modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
 
7
  class DioF0Predictor(F0Predictor):
8
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
 
15
  '''
16
  Interpolate the F0
17
  '''
18
+ vuv_vector = np.zeros_like(f0, dtype=np.float32)
19
+ vuv_vector[f0 > 0.0] = 1.0
20
+ vuv_vector[f0 <= 0.0] = 0.0
21
 
22
+ nzindex = np.nonzero(f0)[0]
23
+ data = f0[nzindex]
24
+ nzindex = nzindex.astype(np.float32)
25
+ time_org = self.hop_length / self.sampling_rate * nzindex
26
+ time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
27
+
28
+ if data.shape[0] <= 0:
29
+ return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector
30
+
31
+ if data.shape[0] == 1:
32
+ return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector
33
+
34
+ f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
35
+
36
+ return f0,vuv_vector
37
 
38
  def resize_f0(self,x, target_len):
39
  source = np.array(x)
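The rewritten interpolation above (shared by the Dio, Harvest, and PM predictors in this commit) replaces the old per-frame Python loop: it records a voiced/unvoiced mask, maps the indices of voiced frames to timestamps, and lets np.interp fill the unvoiced gaps linearly while holding the edge values. A standalone check of that gap filling (hop_length/sampling_rate only scale the time axis, so plain frame indices suffice here):

    import numpy as np

    f0 = np.array([0., 0., 100., 0., 0., 200., 0.], dtype=np.float32)
    vuv = (f0 > 0).astype(np.float32)   # [0. 0. 1. 0. 0. 1. 0.]

    nz = np.nonzero(f0)[0]              # voiced frames: [2 5]
    filled = np.interp(np.arange(len(f0)), nz, f0[nz], left=f0[nz][0], right=f0[nz][-1])
    print(filled)  # approximately [100. 100. 100. 133.33 166.67 200. 200.]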
modules/F0Predictor/HarvestF0Predictor.py CHANGED
@@ -1,6 +1,8 @@
1
- from modules.F0Predictor.F0Predictor import F0Predictor
2
- import pyworld
3
  import numpy as np
4
 
5
  class HarvestF0Predictor(F0Predictor):
6
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
@@ -13,40 +15,25 @@ class HarvestF0Predictor(F0Predictor):
13
  '''
14
  Interpolate the F0
15
  '''
16
 
17
- data = np.reshape(f0, (f0.size, 1))
18
-
19
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
20
- vuv_vector[data > 0.0] = 1.0
21
- vuv_vector[data <= 0.0] = 0.0
22
-
23
- ip_data = data
24
-
25
- frame_number = data.size
26
- last_value = 0.0
27
- for i in range(frame_number):
28
- if data[i] <= 0.0:
29
- j = i + 1
30
- for j in range(i + 1, frame_number):
31
- if data[j] > 0.0:
32
- break
33
- if j < frame_number - 1:
34
- if last_value > 0.0:
35
- step = (data[j] - data[i - 1]) / float(j - i)
36
- for k in range(i, j):
37
- ip_data[k] = data[i - 1] + step * (k - i + 1)
38
- else:
39
- for k in range(i, j):
40
- ip_data[k] = data[j]
41
- else:
42
- for k in range(i, frame_number):
43
- ip_data[k] = last_value
44
- else:
45
- ip_data[i] = data[i]  # possibly an unnecessary copy here
46
- last_value = data[i]
47
-
48
- return ip_data[:,0], vuv_vector[:,0]
49
50
  def resize_f0(self,x, target_len):
51
  source = np.array(x)
52
  source[source<0.001] = np.nan
 
 
 
1
  import numpy as np
2
+ import pyworld
3
+
4
+ from modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
 
7
  class HarvestF0Predictor(F0Predictor):
8
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
 
15
  '''
16
  Interpolate the F0
17
  '''
18
+ vuv_vector = np.zeros_like(f0, dtype=np.float32)
19
+ vuv_vector[f0 > 0.0] = 1.0
20
+ vuv_vector[f0 <= 0.0] = 0.0
21
 
22
+ nzindex = np.nonzero(f0)[0]
23
+ data = f0[nzindex]
24
+ nzindex = nzindex.astype(np.float32)
25
+ time_org = self.hop_length / self.sampling_rate * nzindex
26
+ time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
27
 
28
+ if data.shape[0] <= 0:
29
+ return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector
30
+
31
+ if data.shape[0] == 1:
32
+ return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector
33
+
34
+ f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
35
+
36
+ return f0,vuv_vector
37
  def resize_f0(self,x, target_len):
38
  source = np.array(x)
39
  source[source<0.001] = np.nan
modules/F0Predictor/PMF0Predictor.py CHANGED
@@ -1,6 +1,8 @@
1
- from modules.F0Predictor.F0Predictor import F0Predictor
2
- import parselmouth
3
  import numpy as np
4
 
5
  class PMF0Predictor(F0Predictor):
6
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
@@ -14,39 +16,26 @@ class PMF0Predictor(F0Predictor):
14
  '''
15
  Interpolate the F0
16
  '''
17
 
18
- data = np.reshape(f0, (f0.size, 1))
19
-
20
- vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
21
- vuv_vector[data > 0.0] = 1.0
22
- vuv_vector[data <= 0.0] = 0.0
23
-
24
- ip_data = data
25
-
26
- frame_number = data.size
27
- last_value = 0.0
28
- for i in range(frame_number):
29
- if data[i] <= 0.0:
30
- j = i + 1
31
- for j in range(i + 1, frame_number):
32
- if data[j] > 0.0:
33
- break
34
- if j < frame_number - 1:
35
- if last_value > 0.0:
36
- step = (data[j] - data[i - 1]) / float(j - i)
37
- for k in range(i, j):
38
- ip_data[k] = data[i - 1] + step * (k - i + 1)
39
- else:
40
- for k in range(i, j):
41
- ip_data[k] = data[j]
42
- else:
43
- for k in range(i, frame_number):
44
- ip_data[k] = last_value
45
- else:
46
- ip_data[i] = data[i]  # possibly an unnecessary copy here
47
- last_value = data[i]
48
 
49
- return ip_data[:,0], vuv_vector[:,0]
50
 
51
  def compute_f0(self,wav,p_len=None):
52
  x = wav
 
 
 
1
  import numpy as np
2
+ import parselmouth
3
+
4
+ from modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
 
7
  class PMF0Predictor(F0Predictor):
8
  def __init__(self,hop_length=512,f0_min=50,f0_max=1100,sampling_rate=44100):
 
16
  '''
17
  Interpolate the F0
18
  '''
19
+ vuv_vector = np.zeros_like(f0, dtype=np.float32)
20
+ vuv_vector[f0 > 0.0] = 1.0
21
+ vuv_vector[f0 <= 0.0] = 0.0
22
 
23
+ nzindex = np.nonzero(f0)[0]
24
+ data = f0[nzindex]
25
+ nzindex = nzindex.astype(np.float32)
26
+ time_org = self.hop_length / self.sampling_rate * nzindex
27
+ time_frame = np.arange(f0.shape[0]) * self.hop_length / self.sampling_rate
28
+
29
+ if data.shape[0] <= 0:
30
+ return np.zeros(f0.shape[0], dtype=np.float32),vuv_vector
31
+
32
+ if data.shape[0] == 1:
33
+ return np.ones(f0.shape[0], dtype=np.float32) * f0[0],vuv_vector
34
+
35
+ f0 = np.interp(time_frame, time_org, data, left=data[0], right=data[-1])
36
+
37
+ return f0,vuv_vector
38
 
 
39
 
40
  def compute_f0(self,wav,p_len=None):
41
  x = wav
modules/F0Predictor/crepe.py CHANGED
@@ -1,14 +1,14 @@
1
- from typing import Optional,Union
 
2
  try:
3
  from typing import Literal
4
- except Exception as e:
5
  from typing_extensions import Literal
6
  import numpy as np
7
  import torch
8
  import torchcrepe
9
  from torch import nn
10
  from torch.nn import functional as F
11
- import scipy
12
 
13
  #from:https://github.com/fishaudio/fish-diffusion
14
 
@@ -97,19 +97,19 @@ class BasePitchExtractor:
97
  f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
98
  time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
99
  time_frame = np.arange(pad_to) * self.hop_length / sampling_rate
 
 
100
 
101
  if f0.shape[0] <= 0:
102
- return torch.zeros(pad_to, dtype=torch.float, device=x.device),torch.zeros(pad_to, dtype=torch.float, device=x.device)
103
-
104
  if f0.shape[0] == 1:
105
- return torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0],torch.ones(pad_to, dtype=torch.float, device=x.device)
106
 
107
  # could probably be rewritten in torch?
108
  f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
109
- vuv_vector = vuv_vector.cpu().numpy()
110
- vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))
111
 
112
- return f0,vuv_vector
113
 
114
 
115
  class MaskedAvgPool1d(nn.Module):
@@ -323,7 +323,7 @@ class CrepePitchExtractor(BasePitchExtractor):
323
  else:
324
  pd = torchcrepe.filter.median(pd, 3)
325
 
326
- pd = torchcrepe.threshold.Silence(-60.0)(pd, x, sampling_rate, 512)
327
  f0 = torchcrepe.threshold.At(self.threshold)(f0, pd)
328
 
329
  if self.use_fast_filters:
@@ -334,7 +334,7 @@ class CrepePitchExtractor(BasePitchExtractor):
334
  f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]
335
 
336
  if torch.all(f0 == 0):
337
- rtn = f0.cpu().numpy() if pad_to==None else np.zeros(pad_to)
338
  return rtn,rtn
339
 
340
  return self.post_process(x, sampling_rate, f0, pad_to)
 
1
+ from typing import Optional, Union
2
+
3
  try:
4
  from typing import Literal
5
+ except Exception:
6
  from typing_extensions import Literal
7
  import numpy as np
8
  import torch
9
  import torchcrepe
10
  from torch import nn
11
  from torch.nn import functional as F
 
12
 
13
  #from:https://github.com/fishaudio/fish-diffusion
14
 
 
97
  f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
98
  time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
99
  time_frame = np.arange(pad_to) * self.hop_length / sampling_rate
100
+
101
+ vuv_vector = F.interpolate(vuv_vector[None,None,:],size=pad_to)[0][0]
102
 
103
  if f0.shape[0] <= 0:
104
+ return torch.zeros(pad_to, dtype=torch.float, device=x.device),vuv_vector.cpu().numpy()
 
105
  if f0.shape[0] == 1:
106
+ return torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0],vuv_vector.cpu().numpy()
107
 
108
  # could probably be rewritten in torch?
109
  f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
110
+ #vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector,pad_to/len(vuv_vector),order = 0))
 
111
 
112
+ return f0,vuv_vector.cpu().numpy()
113
 
114
 
115
  class MaskedAvgPool1d(nn.Module):
 
323
  else:
324
  pd = torchcrepe.filter.median(pd, 3)
325
 
326
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, x, sampling_rate, self.hop_length)
327
  f0 = torchcrepe.threshold.At(self.threshold)(f0, pd)
328
 
329
  if self.use_fast_filters:
 
334
  f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]
335
 
336
  if torch.all(f0 == 0):
337
+ rtn = f0.cpu().numpy() if pad_to is None else np.zeros(pad_to)
338
  return rtn,rtn
339
 
340
  return self.post_process(x, sampling_rate, f0, pad_to)
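The post_process change above swaps scipy.ndimage.zoom for F.interpolate when stretching the voiced/unvoiced mask to pad_to frames; with the default nearest mode that amounts to a pure repeat, which a two-line check confirms:

    import torch
    import torch.nn.functional as F

    vuv = torch.tensor([0., 1., 1., 0.])
    print(F.interpolate(vuv[None, None, :], size=8)[0][0])
    # tensor([0., 0., 1., 1., 1., 1., 0., 0.])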
modules/attentions.py CHANGED
@@ -1,12 +1,10 @@
1
- import copy
2
  import math
3
- import numpy as np
4
  import torch
5
  from torch import nn
6
  from torch.nn import functional as F
7
 
8
  import modules.commons as commons
9
- import modules.modules as modules
10
  from modules.modules import LayerNorm
11
 
12
 
@@ -243,7 +241,7 @@ class MultiHeadAttention(nn.Module):
243
  return ret
244
 
245
  def _get_relative_embeddings(self, relative_embeddings, length):
246
- max_relative_position = 2 * self.window_size + 1
247
  # Pad first before slice to avoid using cond ops.
248
  pad_length = max(length - (self.window_size + 1), 0)
249
  slice_start_position = max((self.window_size + 1) - length, 0)
 
 
1
  import math
2
+
3
  import torch
4
  from torch import nn
5
  from torch.nn import functional as F
6
 
7
  import modules.commons as commons
 
8
  from modules.modules import LayerNorm
9
 
10
 
 
241
  return ret
242
 
243
  def _get_relative_embeddings(self, relative_embeddings, length):
244
+ 2 * self.window_size + 1
245
  # Pad first before slice to avoid using cond ops.
246
  pad_length = max(length - (self.window_size + 1), 0)
247
  slice_start_position = max((self.window_size + 1) - length, 0)
modules/commons.py CHANGED
@@ -1,9 +1,9 @@
1
  import math
2
- import numpy as np
3
  import torch
4
- from torch import nn
5
  from torch.nn import functional as F
6
 
 
7
  def slice_pitch_segments(x, ids_str, segment_size=4):
8
  ret = torch.zeros_like(x[:, :segment_size])
9
  for i in range(x.size(0)):
@@ -24,10 +24,12 @@ def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
24
 
25
  def init_weights(m, mean=0.0, std=0.01):
26
  classname = m.__class__.__name__
27
- if classname.find("Conv") != -1:
28
  m.weight.data.normal_(mean, std)
29
 
30
-
31
  def get_padding(kernel_size, dilation=1):
32
  return int((kernel_size*dilation - dilation)/2)
33
 
@@ -134,12 +136,6 @@ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
134
  return acts
135
 
136
 
137
- def convert_pad_shape(pad_shape):
138
- l = pad_shape[::-1]
139
- pad_shape = [item for sublist in l for item in sublist]
140
- return pad_shape
141
-
142
-
143
  def shift_1d(x):
144
  x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
145
  return x
@@ -157,7 +153,6 @@ def generate_path(duration, mask):
157
  duration: [b, 1, t_x]
158
  mask: [b, 1, t_y, t_x]
159
  """
160
- device = duration.device
161
 
162
  b, _, t_y, t_x = mask.shape
163
  cum_duration = torch.cumsum(duration, -1)
 
1
  import math
2
+
3
  import torch
 
4
  from torch.nn import functional as F
5
 
6
+
7
  def slice_pitch_segments(x, ids_str, segment_size=4):
8
  ret = torch.zeros_like(x[:, :segment_size])
9
  for i in range(x.size(0)):
 
24
 
25
  def init_weights(m, mean=0.0, std=0.01):
26
  classname = m.__class__.__name__
27
+ if "Depthwise_Separable" in classname:
28
+ m.depth_conv.weight.data.normal_(mean, std)
29
+ m.point_conv.weight.data.normal_(mean, std)
30
+ elif classname.find("Conv") != -1:
31
  m.weight.data.normal_(mean, std)
32
 
 
33
  def get_padding(kernel_size, dilation=1):
34
  return int((kernel_size*dilation - dilation)/2)
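The new Depthwise_Separable branch in init_weights above is needed because those wrapper modules keep their weights on .depth_conv/.point_conv rather than a top-level .weight; the function is meant to be swept over a module tree with Module.apply. A small sketch of that pattern (channel sizes are illustrative):

    import torch.nn as nn
    from modules.commons import init_weights
    from modules.DSConv import Depthwise_Separable_Conv1D

    net = nn.Sequential(
        nn.Conv1d(8, 8, 3, padding=1),
        Depthwise_Separable_Conv1D(8, 8, 3, padding=1),
    )
    net.apply(init_weights)  # normal_(0.0, 0.01) on every conv weight found

Containers like nn.Sequential match neither branch and are left untouched.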
35
 
 
136
  return acts
137
 
138
139
  def shift_1d(x):
140
  x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
141
  return x
 
153
  duration: [b, 1, t_x]
154
  mask: [b, 1, t_y, t_x]
155
  """
 
156
 
157
  b, _, t_y, t_x = mask.shape
158
  cum_duration = torch.cumsum(duration, -1)
modules/enhancer.py CHANGED
@@ -1,10 +1,12 @@
1
  import numpy as np
2
  import torch
3
  import torch.nn.functional as F
4
- from vdecoder.nsf_hifigan.nvSTFT import STFT
5
- from vdecoder.nsf_hifigan.models import load_model
6
  from torchaudio.transforms import Resample
7
8
  class Enhancer:
9
  def __init__(self, enhancer_type, enhancer_ckpt, device=None):
10
  if device is None:
 
1
  import numpy as np
2
  import torch
3
  import torch.nn.functional as F
 
 
4
  from torchaudio.transforms import Resample
5
 
6
+ from vdecoder.nsf_hifigan.models import load_model
7
+ from vdecoder.nsf_hifigan.nvSTFT import STFT
8
+
9
+
10
  class Enhancer:
11
  def __init__(self, enhancer_type, enhancer_ckpt, device=None):
12
  if device is None:
modules/losses.py CHANGED
@@ -1,7 +1,4 @@
1
- import torch
2
- from torch.nn import functional as F
3
-
4
- import modules.commons as commons
5
 
6
 
7
  def feature_loss(fmap_r, fmap_g):
 
1
+ import torch
2
 
3
 
4
  def feature_loss(fmap_r, fmap_g):