Spaces commit: init

Files changed (this view is limited to 50 files because the commit contains too many changes; see the raw diff for the full list):
- .gitignore +19 -0
- Dockerfile +38 -0
- Dockerfile_GPU +38 -0
- LICENSE +661 -0
- README.md +5 -10
- README_zh.md +584 -0
- api_test.py +575 -0
- app.py +74 -0
- bert_vits2/LICENSE +674 -0
- bert_vits2/README.md +5 -0
- bert_vits2/__init__.py +2 -0
- bert_vits2/attentions.py +352 -0
- bert_vits2/bert_vits2.py +403 -0
- bert_vits2/clap_wrapper.py +17 -0
- bert_vits2/commons.py +158 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/__init__.py +5 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw.py +121 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/__init__.py +0 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/bopomofo_to_pinyin_wo_tune_dict.json +1 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_bopomofo_dict.json +0 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_convert.py +44 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/dataset.py +181 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/onnx_api.py +273 -0
- bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/utils.py +144 -0
- bert_vits2/get_emo.py +92 -0
- bert_vits2/models.py +799 -0
- bert_vits2/models_ja_extra.py +1016 -0
- bert_vits2/models_v230.py +1019 -0
- bert_vits2/modules.py +459 -0
- bert_vits2/requirements.txt +15 -0
- bert_vits2/text/__init__.py +25 -0
- bert_vits2/text/chinese.py +198 -0
- bert_vits2/text/chinese_bert.py +59 -0
- bert_vits2/text/chinese_bert_extra.py +60 -0
- bert_vits2/text/chinese_v100.py +197 -0
- bert_vits2/text/chinese_v240.py +211 -0
- bert_vits2/text/cleaner.py +53 -0
- bert_vits2/text/cmudict.rep +0 -0
- bert_vits2/text/cmudict_cache.pickle +3 -0
- bert_vits2/text/english.py +449 -0
- bert_vits2/text/english_bert_mock.py +36 -0
- bert_vits2/text/english_bert_mock_v200.py +22 -0
- bert_vits2/text/english_v200.py +360 -0
- bert_vits2/text/english_v230.py +493 -0
- bert_vits2/text/japanese.py +428 -0
- bert_vits2/text/japanese_bert.py +43 -0
- bert_vits2/text/japanese_bert_extra.py +42 -0
- bert_vits2/text/japanese_bert_v111.py +22 -0
- bert_vits2/text/japanese_bert_v200.py +39 -0
- bert_vits2/text/japanese_extra.py +524 -0
.gitignore
ADDED
@@ -0,0 +1,19 @@
+**/__pycache__
+/Model/
+/logs/
+/cache/
+/upload/
+**/pytorch_model.bin
+**/spm.model
+/**/*.pt
+/**/*.onnx
+phrases_dict.txt
+/config.yml
+/config.yaml
+/data/emotional/dimensional_emotion_model/model.onnx
+/data/hubert_soft/hubert-soft-0d54a1f4.pt
+/data/emotional/dimensional_emotion_npy/
+/data/bert/vits_chinese_bert/prosody_model.pt
+/data/emotional/dimensional_emotion_npy/
+/data/models/
+/vits/text/chinese_dialect_lexicons
Dockerfile
ADDED
@@ -0,0 +1,38 @@
+FROM artrajz/pytorch:2.2.1-cpu-py3.10.11-ubuntu22.04
+
+RUN mkdir -p /app
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+
+RUN apt-get update && \
+    apt-get install -yq build-essential espeak-ng cmake wget ca-certificates tzdata && \
+    update-ca-certificates && \
+    apt-get clean && \
+    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install jemalloc
+RUN wget https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
+    tar -xvf jemalloc-5.3.0.tar.bz2 && \
+    cd jemalloc-5.3.0 && \
+    ./configure && \
+    make -j$(nproc) && \
+    make install && \
+    cd .. && \
+    rm -rf jemalloc-5.3.0* && \
+    ldconfig
+
+ENV LD_PRELOAD=/usr/local/lib/libjemalloc.so
+
+COPY requirements.txt /app/
+RUN pip install gunicorn --no-cache-dir && \
+    pip install -r requirements.txt --no-cache-dir && \
+    rm -rf /root/.cache/pip/*
+
+COPY . /app
+
+EXPOSE 23456
+
+CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
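The CPU image above serves the Flask app with gunicorn on port 23456 and preloads jemalloc to curb allocator fragmentation during long-running inference. A minimal usage sketch, assuming you build from the repository root (the image tag and the mounted model directory are illustrative, not part of the diff):

```bash
# Build the CPU image (tag is an assumption).
docker build -t vits-simple-api:cpu .

# Run it, publishing the gunicorn port and mounting a local
# model/config directory over the container's /app/data.
docker run -d \
  -p 23456:23456 \
  -v "$(pwd)/data:/app/data" \
  vits-simple-api:cpu
```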
Dockerfile_GPU
ADDED
@@ -0,0 +1,38 @@
+FROM artrajz/pytorch:2.2.1-cu118-py3.10.11-ubuntu22.04
+
+RUN mkdir -p /app
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -yq build-essential espeak-ng cmake wget ca-certificates tzdata && \
+    update-ca-certificates && \
+    apt-get clean && \
+    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
+    rm -rf /var/lib/apt/lists/*
+
+
+# Install jemalloc
+RUN wget https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2 && \
+    tar -xvf jemalloc-5.3.0.tar.bz2 && \
+    cd jemalloc-5.3.0 && \
+    ./configure && \
+    make -j$(nproc) && \
+    make install && \
+    cd .. && \
+    rm -rf jemalloc-5.3.0* && \
+    ldconfig
+
+ENV LD_PRELOAD=/usr/local/lib/libjemalloc.so
+
+COPY requirements.txt /app/
+RUN pip install gunicorn --no-cache-dir && \
+    pip install -r requirements.txt --no-cache-dir && \
+    rm -rf /root/.cache/pip/*
+
+COPY . /app
+
+EXPOSE 23456
+
+CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
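Dockerfile_GPU is identical except for its base image, which swaps the CPU PyTorch build for the CUDA 11.8 one. A hedged sketch of building and running it, assuming the host has NVIDIA drivers and the NVIDIA Container Toolkit installed (the tag is illustrative):

```bash
# Build from the GPU Dockerfile and expose all host GPUs to the container.
docker build -f Dockerfile_GPU -t vits-simple-api:gpu .
docker run -d --gpus all -p 23456:23456 vits-simple-api:gpu
```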
LICENSE
ADDED
@@ -0,0 +1,661 @@
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+  When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate. Many developers of free software are heartened and
+encouraged by the resulting cooperation. However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community. It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server. Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals. This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU Affero General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7. This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy. This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged. This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source. This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge. You need not require recipients to copy the
+    Corresponding Source along with the object code. If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source. Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Remote Network Interaction; Use with the GNU General Public License.
+
+  Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software. This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time. Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published
+    by the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source. For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code. There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.
README.md
CHANGED
@@ -1,12 +1,7 @@
----
-title:
-emoji: 🐨
-colorFrom: yellow
-colorTo: blue
+license: mit
+title: vits-simple-api-gsv
 sdk: gradio
-
+pinned: true
+python_version: 3.10.11
+emoji: 👀
 app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
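Hugging Face Spaces reads this metadata from a YAML front-matter block delimited by `---` lines at the top of README.md. For reference, the fields added above assemble to roughly the following header (a sketch; the `---` delimiters are an assumption, since the diff does not show them):

```yaml
---
license: mit
title: vits-simple-api-gsv
sdk: gradio
pinned: true
python_version: 3.10.11
emoji: 👀
app_file: app.py
---
```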
README_zh.md
ADDED
@@ -0,0 +1,584 @@
+<div class="title" align=center>
+    <h1>vits-simple-api</h1>
+    <div>Simply call the vits api</div>
+    <br/>
+    <br/>
+    <p>
+    <img src="https://img.shields.io/github/license/Artrajz/vits-simple-api">
+    <img src="https://img.shields.io/badge/python-3.10-green">
+    <a href="https://hub.docker.com/r/artrajz/vits-simple-api">
+        <img src="https://img.shields.io/docker/pulls/artrajz/vits-simple-api"></a>
+    </p>
+    <a href="https://github.com/Artrajz/vits-simple-api/blob/main/README.md">English</a>|<a href="https://github.com/Artrajz/vits-simple-api/blob/main/README_zh.md">中文文档</a>
+    <br/>
+</div>
+
+# Features
+
+- [x] VITS text-to-speech and voice conversion
+- [x] HuBert-soft VITS models
+- [x] W2V2 VITS / [emotional-vits](https://github.com/innnky/emotional-vits) dimensional emotion models
+- [x] [vits_chinese](https://github.com/PlayVoice/vits_chinese)
+- [x] [Bert-VITS2](https://github.com/Stardust-minus/Bert-VITS2)
+- [x] [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
+- [x] Loading multiple models
+- [x] Automatic language recognition and processing; the recognized language set follows each model's cleaner and can be customized
+- [x] Customizable default parameters
+- [x] Batch processing for long text
+- [x] GPU-accelerated inference
+- [x] SSML speech synthesis markup language (work in progress)
+
+## Online demo
+
+[![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Artrajz/vits-simple-api) Thanks to Hugging Face!
+
+Note that different ids may support different languages. [speakers](https://artrajz-vits-simple-api.hf.space/voice/speakers)
+
+- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
+- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=我觉得1%2B1≠3&id=164&lang=zh` (some characters in GET requests need to be escaped, or they will be filtered out)
+- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
+- Excited: `https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
+- Whispering: `https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
+
+https://user-images.githubusercontent.com/73542220/237995061-c1f25b4e-dd86-438a-9363-4bb1fe65b425.mov
+
+# Deployment
+
+There are two deployment options. Whichever you choose, you will need to import models after deployment before the API can be used.
+
+## Docker deployment (recommended on Linux)
+
+### Step 1: Pull the image
+
+Run the following command to pull the Docker image; follow the script's prompts to choose the necessary files to download and which image to pull:
+
+```bash
+bash -c "$(wget -O- https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/vits-simple-api-installer-latest.sh)"
+```
+
+The default path for the project configuration files and the model folder is `/usr/local/vits-simple-api/`.
+
+### Step 2: Start
+
+Run the following command to start the container:
+
+```bash
+docker-compose up -d
+```
+
+### Updating the image
+
+Run the following command to update the image:
+
+```bash
+docker-compose pull
+```
+
+Then restart the container:
+
+```bash
+docker-compose up -d
+```
+
+## Virtual environment deployment
+
+### Step 1: Clone the project
+
+Clone the repository with the following command:
+
+```bash
+git clone https://github.com/Artrajz/vits-simple-api.git
+```
+
+### Step 2: Install the Python dependencies
+
+A virtual environment with Python 3.10 is recommended. Run the following command to install the project's Python dependencies:
+
+If some dependencies fail to install, see the FAQ below.
+
+```bash
+pip install -r requirements.txt
+```
+
+### Step 3: Start
+
+Run the following command to start the program:
+
+```bash
+python app.py
+```
+
+## Windows quick-deploy package
+
+### Step 1: Download and unzip the package
+
+Go to the [releases page](https://github.com/Artrajz/vits-simple-api/releases), download the latest package, and unzip it.
+
+### Step 2: Start
+
+Run start.bat to launch the program.
+
+## Loading models
+
+### Step 1: Download VITS models
+
+Download the VITS model files and put them in the `data/models` folder.
+
+### Step 2: Load the models
+
+#### Automatic model loading
+
+Since v0.6.6, all models under the `data/models` folder are **loaded automatically** by default, which is convenient for beginners.
+
+#### Manual model loading
+
+A config.yaml file is generated after the first startup; set `tts_config.auto_load` to `false` to enable manual loading mode.
+
+You can either edit `tts_config.models` in the configuration file or make the changes from the admin backend in your browser.
+
+**Note: the model loading path changed in v0.6.6, so please reconfigure your model paths following the steps below!**
+
+Paths may be absolute or relative; relative paths are resolved from the `data/models` folder in the project root.
+
+For example, if the `data/models` folder contains the following files:
+
+```
+├─model1
+│ │─G_1000.pth
+│ └─config.json
+└─model2
+  │─G_1000.pth
+  └─config.json
+```
+
+fill in:
+
+```yaml
+tts_config:
+  auto_load: false
+  models:
+    - config_path: model1/config.json
+      model_path: model1/G_1000.pth
+    - config_path: model2/config.json
+      model_path: model2/G_1000.pth
+    # For GPT-SoVITS, use:
+    - sovits_path: gpt_sovits1/model1_e8_s11536.pth
+      gpt_path: gpt_sovits1/model1-e15.ckpt
+    - sovits_path: gpt_sovits2/model2_e8_s11536.pth
+      gpt_path: gpt_sovits2/model2-e15.ckpt
+
+```
+
+Loading models from the admin backend is more convenient, but to load models outside the `data/models` folder you must edit the config.yaml file directly and fill in absolute paths.
+
+With absolute paths:
+
+```yaml
+tts_config:
+  auto_load: false
+  models:
+    - config_path: D://model3/config.json
+      model_path: D://model3/G_1000.pth
+```
+
+- models_path: the model folder relative to the data directory, `models` by default; when auto_load is true, all models under the models_path directory will be loaded.
+
+#### Other models
+
+After downloading, put bert models in the `data/bert` folder and emotion models in the `data/emotional` folder, matching them to the correspondingly named subfolders.
+
+# GPU acceleration
+
+## Windows
+
+### Install CUDA
+
+Check the highest CUDA version supported by your GPU:
+
+```
+nvidia-smi
+```
+
+Taking CUDA 11.7 as an example: [official download page](https://developer.nvidia.com/cuda-11-7-0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exe_local)
+
+### Install the GPU build of PyTorch
+
+Install the PyTorch build matching CUDA 11.7 with the command below; 1.13.1+cu117 is recommended, since other versions may have memory-stability problems.
+
+```
+pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
+```
+
+## Linux
+
+The installation process is similar; refer to guides available online. You can also directly use the GPU variant in the Docker deployment script.
+
+# WebUI
+
+## Inference frontend
+
+http://127.0.0.1:23456
+
+*With 23456 as the default port; the port can be changed.
+
+## Admin backend
+
+Defaults to http://127.0.0.1:23456/admin
+
+**After the first startup, the initial username and password can be found by searching for admin in config.yaml.**
+
+# Feature options
+
+## Disabling the admin backend
+
+Since the admin backend can load and unload models, and even though it is protected by login verification, for absolute safety you can disable it in `config.yaml` when the service is exposed to the public internet:
+
+```yaml
+'IS_ADMIN_ENABLED': !!bool 'false'
+```
+
+## Configuring the languages/Bert models used by Bert-VITS2
+
+Since Bert-VITS2 v2.0, a model needs to load three Bert models for different languages. If you only need one or two of the languages, you can add a `lang` parameter to the `data` section of the model's config.json. A value of `['zh']` means the model uses Chinese only and will load only the Chinese Bert model; `['zh','ja']` means Chinese and Japanese only, loading only the Chinese and Japanese Bert models, and so on.
+
+Example:
+
+```json
+"data": {
+  "lang": ["zh","ja"],
+  "training_files": "filelists/train.list",
+  "validation_files": "filelists/val.list",
+  "max_wav_value": 32768.0,
+  ...
+```
+
+## Custom Chinese polyphone dictionary
+
+If a polyphonic character is pronounced incorrectly, you can try fixing it this way.
+
+Create and open phrases_dict.txt in the data directory and add the polyphonic words:
+
+```python
+{
+    "一骑当千": [["yí"], ["jì"], ["dāng"], ["qiān"]],
+}
+```
+
+## GPT-SoVITS reference-audio presets
+
+Find the gpt_sovits configuration in config.yaml and add presets under presets. Multiple presets are allowed, keyed by preset name; below are two default presets, default and default2:
+
+```
+gpt_sovits_config:
+  hz: 50
+  is_half: false
+  id: 0
+  lang: auto
+  format: wav
+  segment_size: 50
+  presets:
+    default:
+      refer_wav_path: null
+      prompt_text: null
+      prompt_lang: auto
+    default2:
+      refer_wav_path: null
+      prompt_text: null
+      prompt_lang: auto
+```
+
+## Reading API
+
+Tested with [Legado (开源阅读)](https://gedoor.github.io/).
+
+Multiple model types can be used for reading aloud, including VITS, Bert-VITS2 and GPT-SoVITS. Parameters starting with `in` configure the speaker for dialogue (text inside quotation marks), and parameters starting with `nr` configure the narrator.
+
+To use GPT-SoVITS, configure the reference audio under `presets` in `config.yaml` in advance, and change the preset in the url below.
+
+The IP in the url can be found after the API starts; usually use the LAN IP starting with 192.168.
+
+After editing, choose Read-aloud engine - Add read-aloud engine - Paste source, and enable the engine.
+
+```js
+{
+  "concurrentRate": "1",
+  "contentType": "audio/wav",
+  "enabledCookieJar": false,
|
313 |
+
"header": "",
|
314 |
+
"id": 1709643305070,
|
315 |
+
"lastUpdateTime": 1709821070082,
|
316 |
+
"loginCheckJs": "",
|
317 |
+
"loginUi": "",
|
318 |
+
"loginUrl": "",
|
319 |
+
"name": "vits-simple-api",
|
320 |
+
"url": "http://192.168.xxx.xxx:23456/voice/reading?text={{java.encodeURI(speakText)}}&in_model_type=GPT-SOVITS&in_id=0&in_preset=default&nr_model_type=BERT-VITS2&nr_id=0&nr_preset=default&format=wav&lang=zh"
|
321 |
+
}
|
322 |
+
```
|
323 |
+
|
324 |
+
|
325 |
+
|
326 |
+
# 常见问题
|
327 |
+
|
328 |
+
## fasttext依赖安装问题
|
329 |
+
|
330 |
+
windows下可能安装不了fasttext,可以用以下命令安装,附[wheels下载地址](https://www.lfd.uci.edu/~gohlke/pythonlibs/#fasttext)
|
331 |
+
|
332 |
+
```
|
333 |
+
# python3.10 win_amd64
|
334 |
+
pip install https://github.com/Artrajz/archived/raw/main/fasttext/fasttext-0.9.2-cp310-cp310-win_amd64.whl
|
335 |
+
```
|
336 |
+
|
337 |
+
或者
|
338 |
+
|
339 |
+
```
|
340 |
+
pip install fasttext -i https://pypi.artrajz.cn/simple
|
341 |
+
```
|
342 |
+
|
343 |
+
## pyopenjtalk依赖安装问题
|
344 |
+
|
345 |
+
由于pypi.org没有pyopenjtalk的whl文件,通常需要从源代��来安装,这一过程对于一些人来说可能比较麻烦,所以你也可以使用我构建的whl来安装。
|
346 |
+
|
347 |
+
```
|
348 |
+
pip install pyopenjtalk -i https://pypi.artrajz.cn/simple
|
349 |
+
```
|
350 |
+
|
351 |
+
## Bert-VITS2版本兼容
|
352 |
+
|
353 |
+
修改Bert-VITS2模型的config.json,加入版本号参数`"version": "x.x.x"`,比如模型版本为1.0.1时,配置文件应该写成:
|
354 |
+
|
355 |
+
```
|
356 |
+
{
|
357 |
+
"version": "1.0.1",
|
358 |
+
"train": {
|
359 |
+
"log_interval": 10,
|
360 |
+
"eval_interval": 100,
|
361 |
+
"seed": 52,
|
362 |
+
...
|
363 |
+
```
|
364 |
+
|
365 |
+
需要注意的是,中文特化版的版本号应改为`extra`或`zh-clap`,特化修复版的版本号为`2.4`或`extra-fix`。
|
366 |
+
|
367 |
+
# API
|
368 |
+
|
369 |
+
## GET
|
370 |
+
|
371 |
+
#### speakers list
|
372 |
+
|
373 |
+
- GET http://127.0.0.1:23456/voice/speakers
|
374 |
+
|
375 |
+
返回id对应角色的映射表
|
376 |
+
|
377 |
+
#### voice vits
|
378 |
+
|
379 |
+
- GET http://127.0.0.1:23456/voice/vits?text=text
|
380 |
+
|
381 |
+
其他参数不指定时均为默认值
|
382 |
+
|
383 |
+
- GET http://127.0.0.1:23456/voice/vits?text=[ZH]text[ZH][JA]text[JA]&lang=mix
|
384 |
+
|
385 |
+
lang=mix时文本要标注
|
386 |
+
|
387 |
+
- GET http://127.0.0.1:23456/voice/vits?text=text&id=142&format=wav&lang=zh&length=1.4
|
388 |
+
|
389 |
+
文本为text,角色id为142,音频格式为wav,文本语言为zh,语音长度为1.4,其余参数默认
|
390 |
+
|
391 |
+
#### check
|
392 |
+
|
393 |
+
- GET http://127.0.0.1:23456/voice/check?id=0&model=vits
|
394 |
+
|
395 |
+
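The GET endpoints above can also be called from Python. A minimal sketch, assuming the service is running locally on the default port and a VITS model with speaker id 0 is loaded:

```python
import requests

base_url = "http://127.0.0.1:23456"

# Fetch the id-to-speaker mapping table.
speakers = requests.get(f"{base_url}/voice/speakers").json()
print(speakers)

# Synthesize speech and save the returned audio.
res = requests.get(f"{base_url}/voice/vits",
                   params={"text": "你好", "id": 0, "lang": "zh"})
with open("output.wav", "wb") as f:
    f.write(res.content)
```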
## POST

- See `api_test.py`

## API KEY

Set `api_key_enabled: true` in config.yaml to enable API keys, and set the key itself with `api_key: api-key`.

Once enabled, GET requests must carry an additional api_key parameter, and POST requests must carry the key in the `X-API-KEY` header.
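A minimal sketch of both authenticated request styles, assuming the key configured in config.yaml is `api-key` (the POST below sends multipart form fields, matching the style used in `api_test.py`):

```python
import requests

base_url = "http://127.0.0.1:23456"
api_key = "api-key"  # must match api_key in config.yaml

# GET: pass the key as the api_key query parameter.
res = requests.get(f"{base_url}/voice/vits",
                   params={"text": "你好", "api_key": api_key})

# POST: pass the key in the X-API-KEY header.
fields = {"text": "你好", "id": "0"}
res = requests.post(f"{base_url}/voice/vits",
                    files={k: (None, v) for k, v in fields.items()},
                    headers={"X-API-KEY": api_key})
```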
# Parameter

## VITS Speech Synthesis

| Name | Parameter | Is must | Default | Type | Instruction |
| ------------- | ------------ | ------- | -------------------- | ----- | ------------------------------------------------------------ |
| Synthesized text | text | true | | str | Text to synthesize. |
| Speaker id | id | false | From `config.yaml` | int | The speaker id. |
| Audio format | format | false | From `config.yaml` | str | Supports wav, ogg, silk, mp3, flac |
| Text language | lang | false | From `config.yaml` | str | auto is automatic language detection and the default mode. When lang=mix, the text must be wrapped in [ZH] or [JA]. Dialects cannot be detected automatically. |
| Audio length / speed | length | false | From `config.yaml` | float | Adjusts the speech length, which is equivalent to adjusting the speed; the larger the value, the slower the speech. |
| Noise | noise | false | From `config.yaml` | float | Sample noise, controlling the randomness of the synthesis. |
| SDP noise | noisew | false | From `config.yaml` | float | Stochastic duration predictor noise, controlling phoneme duration. |
| Segment size | segment_size | false | From `config.yaml` | int | Splits the text by punctuation and merges segments until the combined length exceeds segment_size. segment_size<=0 disables segmentation. |
| Streaming | streaming | false | false | bool | Streamed synthesis with a faster first-packet response. |

## VITS Voice Conversion

| Name | Parameter | Is must | Default | Type | Instruction |
| ---------- | ----------- | ------- | ------- | ---- | ---------------------- |
| Uploaded audio | upload | true | | file | wav or ogg |
| Source speaker id | original_id | true | | int | Speaker id of the uploaded audio. |
| Target speaker id | target_id | true | | int | Speaker id to convert to. |

## HuBert-VITS Voice Conversion

| Name | Parameter | Is must | Default | Type | Instruction |
| ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
| Uploaded audio | upload | true | | file | Audio file whose speaker is to be converted. |
| Target speaker id | id | true | | int | Target speaker id. |
| Audio format | format | true | | str | wav, ogg, silk |
| Audio length / speed | length | true | | float | Adjusts the speech length / speed; the larger the value, the slower the speech. |
| Noise | noise | true | | float | Sample noise, controlling the randomness of the synthesis. |
| SDP noise | noisew | true | | float | Stochastic duration predictor noise, controlling phoneme duration. |

## W2V2-VITS

| Name | Parameter | Is must | Default | Type | Instruction |
| ------------- | ------------ | ------- | -------------------- | ----- | ------------------------------------------------------------ |
| Synthesized text | text | true | | str | Text to synthesize. |
| Speaker id | id | false | From `config.yaml` | int | The speaker id. |
| Audio format | format | false | From `config.yaml` | str | Supports wav, ogg, silk, mp3, flac |
| Text language | lang | false | From `config.yaml` | str | auto is automatic language detection and the default mode. When lang=mix, the text must be wrapped in [ZH] or [JA]. Dialects cannot be detected automatically. |
| Audio length / speed | length | false | From `config.yaml` | float | Adjusts the speech length / speed; the larger the value, the slower the speech. |
| Noise | noise | false | From `config.yaml` | float | Sample noise, controlling the randomness of the synthesis. |
| SDP noise | noisew | false | From `config.yaml` | float | Stochastic duration predictor noise, controlling phoneme duration. |
| Segment size | segment_size | false | From `config.yaml` | int | Splits the text by punctuation and merges segments until the combined length exceeds segment_size. segment_size<=0 disables segmentation. |
| Dimensional emotion | emotion | false | 0 | int | The range depends on the npy emotion reference file; for example, the all_emotions.npy model from [innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main) covers 0-5457. |

## Dimensional emotion

| Name | Parameter | Is must | Default | Type | Instruction |
| -------- | --------- | ------- | ------- | ---- | ----------------------------- |
| Uploaded audio | upload | true | | file | Returns an npy file storing the dimensional emotion vector. |

## Bert-VITS2 Speech Synthesis

| Name | Parameter | Is must | Default | Type | Instruction |
| -------------- | --------------- | ------- | -------------------- | ----- | ------------------------------------------------------------ |
| Synthesized text | text | true | | str | Text to synthesize. |
| Speaker id | id | false | From `config.yaml` | int | The speaker id. |
| Audio format | format | false | From `config.yaml` | str | Supports wav, ogg, silk, mp3, flac |
| Text language | lang | false | From `config.yaml` | str | auto is automatic language detection and the default mode; currently it only detects the language of the whole text, not per sentence. The other options are zh and ja. |
| Audio length / speed | length | false | From `config.yaml` | float | Adjusts the speech length / speed; the larger the value, the slower the speech. |
| Noise | noise | false | From `config.yaml` | float | Sample noise, controlling the randomness of the synthesis. |
| SDP noise | noisew | false | From `config.yaml` | float | Stochastic duration predictor noise, controlling phoneme duration. |
| Segment size | segment_size | false | From `config.yaml` | int | Splits the text by punctuation and merges segments until the combined length exceeds segment_size. segment_size<=0 disables segmentation. |
| SDP/DP mix ratio | sdp_ratio | false | From `config.yaml` | int | Proportion of SDP during synthesis; in theory, the higher the ratio, the larger the intonation variance of the synthesized speech. |
| Emotion control | emotion | false | From `config.yaml` | int | Bert-VITS2 v2.1 only; range 0-9 |
| Emotion reference audio | reference_audio | false | None | | Bert-VITS2 v2.1: uses a reference audio to control the emotion of the synthesized audio |
| Text prompt | text_prompt | false | From `config.yaml` | str | Bert-VITS2 v2.2 text prompt, used to control emotion |
| Style text | style_text | false | From `config.yaml` | str | Bert-VITS2 v2.3 text prompt, used to control emotion |
| Style text weight | style_weight | false | From `config.yaml` | float | Bert-VITS2 v2.3: weight of the style text prompt |
| Streaming | streaming | false | false | bool | Streamed synthesis with a faster first-packet response. |
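As a concrete example, the sketch below posts multipart form fields to the bert-vits2 route, mirroring what `voice_bert_vits2` in `api_test.py` does; the speaker id and parameter values are placeholders for your own deployment:

```python
import requests

fields = {
    "text": "你好",
    "id": "0",
    "lang": "zh",
    "sdp_ratio": "0.4",  # higher ratio -> larger intonation variance
    "length": "1.2",     # larger -> slower speech
}
# A (None, value) tuple in files= sends a plain multipart form field.
res = requests.post("http://127.0.0.1:23456/voice/bert-vits2",
                    files={k: (None, v) for k, v in fields.items()})
with open("bert_vits2.wav", "wb") as f:
    f.write(res.content)
```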
## GPT-SoVITS Speech Synthesis

| Name | Parameter | Is must | Default | Type | Instruction |
| ------------ | --------------- | ------- | --------------------- | ----- | ------------------------------------------------------------ |
| Synthesized text | text | true | | str | Text to synthesize. |
| Speaker id | id | false | From `config.yaml` | int | In GPT-SoVITS, each model acts as one speaker id; the voice is switched through reference audio presets. |
| Audio format | format | false | From `config.yaml` | str | Supports wav, ogg, silk, mp3, flac |
| Text language | lang | false | From `config.yaml` | str | auto is automatic language detection and the default mode; currently it only detects the language of the whole text, not per sentence. |
| Reference audio | reference_audio | false | None | | A reference audio is required, but it can be replaced by a preset. |
| Reference audio text | prompt_text | false | From `config.yaml` | str | Must match the actual text of the reference audio. |
| Reference audio language | prompt_lang | false | From `config.yaml` | str | auto by default, detecting the text language automatically. If detection fails, fill it in manually: zh for Chinese, ja for Japanese, en for English. |
| Reference audio preset | preset | false | default | str | Replaces the reference audio with a preconfigured preset; multiple presets can be configured. |


## SSML Speech Synthesis Markup Language

Currently supported elements and attributes:

`speak` element

| Attribute | Description | Is must |
| ------------ | ------------------------------------------------------------ | ------- |
| id | Default value is read from `config.yaml` | false |
| lang | Default value is read from `config.yaml` | false |
| length | Default value is read from `config.yaml` | false |
| noise | Default value is read from `config.yaml` | false |
| noisew | Default value is read from `config.yaml` | false |
| segment_size | Splits the text by punctuation and merges segments until the combined length exceeds segment_size. segment_size<=0 disables segmentation; the default here is 0. | false |
| model_type | VITS by default; W2V2-VITS and BERT-VITS2 are also available | false |
| emotion | `emotion` only takes effect with W2V2-VITS; the range depends on the npy emotion reference file | false |
| sdp_ratio | `sdp_ratio` only takes effect with BERT-VITS2 | false |

`voice` element

Takes precedence over `speak`.

| Attribute | Description | Is must |
| ------------ | ------------------------------------------------------------ | ------- |
| id | Default value is read from `config.yaml` | false |
| lang | Default value is read from `config.yaml` | false |
| length | Default value is read from `config.yaml` | false |
| noise | Default value is read from `config.yaml` | false |
| noisew | Default value is read from `config.yaml` | false |
| segment_size | Splits the text by punctuation and merges segments until the combined length exceeds segment_size. segment_size<=0 disables segmentation; the default here is 0. | false |
| model_type | VITS by default; W2V2-VITS and BERT-VITS2 are also available | false |
| emotion | `emotion` only takes effect with W2V2-VITS; the range depends on the npy emotion reference file | false |
| sdp_ratio | `sdp_ratio` only takes effect with BERT-VITS2 | false |

`break` element

| Attribute | Description | Is must |
| --------- | ------------------------------------------------------------ | ------- |
| strength | x-weak, weak, medium (default), strong, x-strong | false |
| time | The absolute duration of the pause, in seconds (e.g. `2s`) or milliseconds (e.g. `500ms`). Valid values range from 0 to 5000 milliseconds. If you set a value greater than the supported maximum, the service uses `5000ms`. If the `time` attribute is set, the `strength` attribute is ignored. | false |

| Strength | Relative Duration |
| :------- | :---------------- |
| x-weak | 250 ms |
| weak | 500 ms |
| medium | 750 ms |
| strong | 1000 ms |
| x-strong | 1250 ms |
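Putting these elements together, here is a short SSML sketch posted to the ssml endpoint; the speaker ids and model types are placeholders and must exist in your own deployment (`api_test.py` sends the same `ssml` field via multipart, which is what the form-style upload below produces):

```python
import requests

ssml = """
<speak lang="zh" format="wav" length="1.2">
    <voice id="0" model_type="BERT-VITS2">这几天心里颇不宁静。</voice>
    <break time="2s"/>
    <voice id="142">月亮渐渐地升高了。</voice>
</speak>
"""

res = requests.post("http://127.0.0.1:23456/voice/ssml",
                    files={"ssml": (None, ssml)})
with open("ssml_output.wav", "wb") as f:
    f.write(res.content)
```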
## Reading

| Name | Parameter | Is must | Default | Type | Instruction |
| -------------------- | ------------- | ------- | --------------------- | ---- | ------------------------------------------------------------ |
| Synthesized text | text | true | | str | Text to synthesize. |
| Dialogue model type | in_model_type | false | From `config.yaml` | str | |
| Dialogue speaker id | in_id | false | From `config.yaml` | int | |
| Dialogue reference audio preset | in_preset | false | default | str | Replaces the reference audio with a preconfigured preset; multiple presets can be configured. |
| Narration model type | nr_model_type | false | From `config.yaml` | str | |
| Narration speaker id | nr_id | false | From `config.yaml` | int | |
| Narration reference audio preset | nr_preset | false | default | str | Replaces the reference audio with a preconfigured preset; multiple presets can be configured. |
| Audio format | format | false | From `config.yaml` | str | Supports wav, ogg, silk, mp3, flac |
| Text language | lang | false | From `config.yaml` | str | auto is automatic language detection and the default mode; currently it only detects the language of the whole text, not per sentence. |

The other model parameters use the default values of the corresponding model in config.yaml.

## Examples

See `api_test.py`.

# Communication

Currently there is only a [QQ group](https://qm.qq.com/cgi-bin/qm/qr?k=-1GknIe4uXrkmbDKBGKa1aAUteq40qs_&jump_from=webapi&authKey=x5YYt6Dggs1ZqWxvZqvj3fV8VUnxRyXm5S5Kzntc78+Nv3iXOIawplGip9LWuNR/).

# Acknowledgements

- vits: https://github.com/jaywalnut310/vits
- MoeGoe: https://github.com/CjangCjengh/MoeGoe
- emotional-vits: https://github.com/innnky/emotional-vits
- vits-uma-genshin-honkai: https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
- vits_chinese: https://github.com/PlayVoice/vits_chinese
- Bert_VITS2: https://github.com/fishaudio/Bert-VITS2
- GPT-SoVITS: https://github.com/RVC-Boss/GPT-SoVITS

# Thanks to All Contributors

<a href="https://github.com/artrajz/vits-simple-api/graphs/contributors" target="_blank">
  <img src="https://contrib.rocks/image?repo=artrajz/vits-simple-api"/></a>
api_test.py
ADDED
@@ -0,0 +1,575 @@
import json
import re
import requests
import os
import time
import random
import string
from requests_toolbelt.multipart.encoder import MultipartEncoder

absolute_path = os.path.dirname(__file__)
base_url = "http://127.0.0.1:23456"


# Speaker mapping table
def voice_speakers():
    url = f"{base_url}/voice/speakers"

    res = requests.post(url=url)
    json = res.json()
    for i in json:
        print(i)
        for j in json[i]:
            print(j)
    return json


# Speech synthesis: voice vits
def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
               save_audio=True,
               save_path=None):
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size)
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/vits"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Streaming synthesis, saving the whole response as one file
def voice_vits_streaming(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
                         save_audio=True, save_path=None):
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size),
        "streaming": 'True'
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Streaming synthesis, parsing the chunked response into individual wav
# segments and saving each one as a separate file
def voice_vits_streaming_chunked(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8,
                                 segment_size=50, save_path=None):
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size),
        "streaming": 'True'
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice"

    res = requests.post(url=url, data=m, headers=headers, stream=True)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    # Read the wav file size from the RIFF header: bytes 4-7 hold the chunk
    # size, which excludes the first 8 header bytes.
    def get_file_size_from_bytes(byte_data):
        file_size_offset = 4
        file_size_length = 4

        try:
            file_size_bytes = byte_data[file_size_offset:file_size_offset + file_size_length]
            file_size = int.from_bytes(file_size_bytes, byteorder='little')
            return file_size + 8
        except IndexError:
            return None

    audio = None
    p = 0
    audio_size = None
    audios = []

    for chunk in res.iter_content(chunk_size=1024):
        if audio is None:
            audio = chunk
        else:
            audio += chunk

        p += len(chunk)
        if audio_size is not None:
            if p >= audio_size:
                p = p - audio_size
                audios.append(audio[:audio_size])
                audio = audio[audio_size:]
                audio_size = get_file_size_from_bytes(audio)
        else:
            audio_size = get_file_size_from_bytes(audio)
    for i, audio in enumerate(audios):
        with open(f"{path[:-4]}-{i}.wav", "wb") as f:
            f.write(audio)

        print(f"{path[:-4]}-{i}.wav")
    return path


# Voice conversion: hubert-vits
def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8, save_audio=True,
                      save_path=None):
    upload_name = os.path.basename(upload_path)
    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav,ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
            "id": str(id),
            "format": format,
            "length": str(length),
            "noise": str(noise),
            "noisew": str(noisew),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

        m = MultipartEncoder(fields=fields, boundary=boundary)
        headers = {"Content-Type": m.content_type}
        url = f"{base_url}/voice/hubert-vits"

        res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Dimensional emotion model: w2v2-vits
def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
                    emotion=0,
                    save_audio=True, save_path=None):
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size),
        "emotion": str(emotion)
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/w2v2-vits"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Voice conversion between speakers of the same VITS model
def voice_conversion(upload_path, original_id, target_id, save_audio=True, save_path=None):
    upload_name = os.path.basename(upload_path)
    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav,ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
            "original_id": str(original_id),
            "target_id": str(target_id),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
        m = MultipartEncoder(fields=fields, boundary=boundary)

        headers = {"Content-Type": m.content_type}
        url = f"{base_url}/voice/conversion"

        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


def voice_ssml(ssml, save_audio=True, save_path=None):
    fields = {
        "ssml": ssml,
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/ssml"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


def voice_dimensional_emotion(upload_path, save_audio=True,
                              save_path=None):
    upload_name = os.path.basename(upload_path)
    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav,ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

        m = MultipartEncoder(fields=fields, boundary=boundary)
        headers = {"Content-Type": m.content_type}
        url = f"{base_url}/voice/dimension-emotion"

        res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Synthesis via a JSON POST body
def vits_json(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
              save_audio=True, save_path=None):
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size)
    }
    f = json.dumps(fields)
    url = f"{base_url}/voice"
    header = {"Content-Type": 'application/json'}
    res = requests.post(url=url, data=f, headers=header)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Bert_vits2
def voice_bert_vits2(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, segment_size=50,
                     sdp_ratio=0.2, save_audio=True, save_path=None):
    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "segment_size": str(segment_size),
        "sdp_ratio": str(sdp_ratio)
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/bert-vits2"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# gpt_sovits
def voice_gpt_sovits(text, id=0, format="wav", lang="auto", preset=None, prompt_text=None, prompt_lang="auto",
                     segment_size=50, reference_audio=None, save_audio=True, save_path=None):
    upload_name, upload_type, upload_file = None, None, None
    if reference_audio is not None:
        upload_name = os.path.basename(reference_audio)
        upload_type = f'audio/{upload_name.split(".")[1]}'
        with open(reference_audio, 'rb') as f:
            upload_file = f.read()

    fields = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "segment_size": str(segment_size),
        "preset": preset,
        "reference_audio": (upload_name, upload_file, upload_type) if reference_audio else None,
        "prompt_text": prompt_text,
        "prompt_lang": prompt_lang
    }
    # MultipartEncoder cannot serialize None values, so drop unset fields.
    fields = {k: v for k, v in fields.items() if v is not None}
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/gpt-sovits"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Reading (GET)
def voice_reading_get(text, in_model_type, in_id, nr_model_type, nr_id, format="wav", lang="auto", preset=None,
                      save_audio=True, save_path=None):
    res = requests.get(
        url=f"{base_url}/voice/reading?text={text}&in_model_type={in_model_type}&in_id={in_id}&preset={preset}&nr_model_type={nr_model_type}&nr_id={nr_id}&lang={lang}&format={format}")

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Reading (JSON POST)
def voice_reading_json(text, in_model_type, in_id, nr_model_type, nr_id, format="wav", lang="auto", preset=None,
                       save_audio=True, save_path=None):
    fields = {
        "text": text,
        "in_model_type": in_model_type,
        "in_id": str(in_id),
        "nr_model_type": nr_model_type,
        "nr_id": str(nr_id),
        "format": format,
        "lang": lang,
    }
    f = json.dumps(fields)
    url = f"{base_url}/voice/reading"
    header = {"Content-Type": 'application/json'}
    res = requests.post(url=url, data=f, headers=header)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)

    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


# Reading (multipart POST)
def voice_reading(text, in_model_type, in_id, nr_model_type, nr_id, format="wav", lang="auto", preset=None,
                  save_audio=True, save_path=None):
    fields = {
        "text": text,
        "in_model_type": in_model_type,
        "in_id": str(in_id),
        "nr_model_type": nr_model_type,
        "nr_id": str(nr_id),
        "format": format,
        "lang": lang,
    }
    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

    m = MultipartEncoder(fields=fields, boundary=boundary)
    headers = {"Content-Type": m.content_type}
    url = f"{base_url}/voice/reading"

    res = requests.post(url=url, data=m, headers=headers)
    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    if save_path is not None:
        path = os.path.join(save_path, fname)
    else:
        path = os.path.join(absolute_path, fname)
    if save_audio:
        with open(path, "wb") as f:
            f.write(res.content)
        print(path)
        return path
    return None


def test_interface(text):
    error_num = 0
    for i in range(100):
        try:
            time.sleep(1)
            t1 = time.time()
            voice_vits(text, format="wav", lang="zh", save_audio=False)
            t2 = time.time()
            print(f"{i}: len:{len(text)} elapsed: {t2 - t1}")
        except Exception as e:
            error_num += 1
            print(e)
    print(f"error_num={error_num}")


if __name__ == '__main__':
    cache_path = os.path.join(os.path.curdir, "cache")

    text = "你好,こんにちは"

    ssml = """
    <speak lang="zh" format="mp3" length="1.2">
        <voice id="0" model_type="GPT-SOVITS" preset="default">这几天心里颇不宁静。</voice>
        <voice id="0" model_type="Bert-VITS2">今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
        <voice id="142">月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
        <voice id="0" model_type="Bert-VITS2">妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
        <voice id="120">我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
        <voice id="121">沿着荷塘,是一条曲折的小煤屑路。</voice>
        <voice id="122">这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
        <voice id="123">荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
        <voice id="124">路的一旁,是些杨柳,和一些不知道名字的树。</voice>
        <voice id="125">没有月光的晚上,这路上阴森森的,有些怕人。</voice>
        <voice id="126">今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
        <voice id="127">路上只我一个人,背着手踱着。</voice>
        <voice id="128">这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
        <voice id="129">我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
        <voice id="130">像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
        <voice id="131">白天里一定要做的事,一定要说的话,现在都可不理。</voice>
        <voice id="132">这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
    </speak>
    """

    # path = voice_vits(text, save_path=cache_path)
    # path = voice_vits_streaming(text, save_path=cache_path)
    # path = voice_w2v2_vits(text, save_path=cache_path)
    # path = voice_conversion(path, 1, 3, save_path=cache_path)
    # path = voice_hubert_vits(path, 0, save_path=cache_path)
    # path = voice_dimensional_emotion(path, save_path=cache_path)
    # path = voice_ssml(ssml, save_path=cache_path)
    # path = voice_bert_vits2("你好", lang="zh", save_path=cache_path)
    # path = voice_bert_vits2("こんにちは", lang="ja", save_path=cache_path)
    # path = voice_gpt_sovits(text=text, id=2, preset="wz")
    # path = voice_gpt_sovits(text=text, id=2, reference_audio=r"H:\git\vits-simple-api\data\reference_audio\wz_10068.wav", prompt_text="……嗯……大概、快上课的时候开始的。到这个程度的话,……半个小时吧?")

    # os.system(path)

    # text = "你好“你的修炼速度有些出乎我的意料”"
    # path = voice_reading_json(text=text, in_model_type="GPT-SOVITS", preset="wz", in_id=2, nr_model_type="BERT-VITS2",
    #                           nr_id=0)

    # os.system(path)
app.py
ADDED
@@ -0,0 +1,74 @@
import os.path

from flask import Flask
from flask_apscheduler import APScheduler
from flask_login import LoginManager
from flask_wtf import CSRFProtect

from utils.data_utils import clean_folder
from utils.phrases_dict import phrases_dict_init
from tts_app.frontend.views import frontend
from tts_app.voice_api.views import voice_api
from tts_app.auth.views import auth
from tts_app.admin.views import admin

from contants import config

app = Flask(__name__, template_folder=os.path.join(os.path.dirname(__file__), 'tts_app', 'templates'),
            static_folder=os.path.join(os.path.dirname(__file__), 'tts_app', 'static'))

app.config.from_pyfile("config.py")
# app.config.update(config)

phrases_dict_init()

csrf = CSRFProtect(app)
# Disable CSRF protection for tts api requests
csrf.exempt(voice_api)

if config.system.is_admin_enabled:
    login_manager = LoginManager()
    login_manager.init_app(app)
    login_manager.login_view = 'auth.login'


    @login_manager.user_loader
    def load_user(user_id):
        admin = config.admin
        if admin.get_id() == user_id:
            return admin
        return None

# Initialize scheduler
scheduler = APScheduler()
scheduler.init_app(app)
if config.system.clean_interval_seconds > 0:
    scheduler.start()

app.register_blueprint(frontend, url_prefix='/')
app.register_blueprint(voice_api, url_prefix='/voice')
if config.system.is_admin_enabled:
    app.register_blueprint(auth, url_prefix=config.system.admin_route)
    app.register_blueprint(admin, url_prefix=config.system.admin_route)


def create_folders(paths):
    for path in paths:
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)


create_folders([os.path.join(config.abs_path, config.system.upload_folder),
                os.path.join(config.abs_path, config.system.cache_path), ])


# regular cleaning
@scheduler.task('interval', id='clean_task', seconds=config.system.clean_interval_seconds,
                misfire_grace_time=900)
def clean_task():
    clean_folder(os.path.join(config.abs_path, config.system.upload_folder))
    clean_folder(os.path.join(config.abs_path, config.system.cache_path))


if __name__ == '__main__':
    app.run(host=config.http_service.host, port=config.http_service.port, debug=config.http_service.debug)
bert_vits2/LICENSE
ADDED
@@ -0,0 +1,674 @@
                    GNU GENERAL PUBLIC LICENSE
                       Version 3, 29 June 2007

 Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

                            Preamble

  The GNU General Public License is a free, copyleft license for
software and other kinds of works.

  The licenses for most software and other practical works are designed
to take away your freedom to share and change the works.  By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.  We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors.  You can apply it to
your programs, too.

  When we speak of free software, we are referring to freedom, not
price.  Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.

  To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights.  Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.

  For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received.  You must make sure that they, too, receive
or can get the source code.  And you must show them these terms so they
know their rights.

  Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.

  For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software.  For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.

  Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so.  This is fundamentally incompatible with the aim of
protecting users' freedom to change the software.  The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable.  Therefore, we
have designed this version of the GPL to prohibit the practice for those
products.  If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.

  Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary.  To prevent this, the GPL assures that
patents cannot be used to render the program non-free.

  The precise terms and conditions for copying, distribution and
modification follow.

                       TERMS AND CONDITIONS

  0. Definitions.

  "This License" refers to version 3 of the GNU General Public License.

  "Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.

  "The Program" refers to any copyrightable work licensed under this
License.  Each licensee is addressed as "you".  "Licensees" and
"recipients" may be individuals or organizations.

  To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy.  The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.

  A "covered work" means either the unmodified Program or a work based
on the Program.

  To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy.  Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.

  To "convey" a work means any kind of propagation that enables other
parties to make or receive copies.  Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.

  An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License.  If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.

  1. Source Code.

  The "source code" for a work means the preferred form of the work
for making modifications to it.  "Object code" means any non-source
form of a work.

  A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.

  The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form.  A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.

  The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities.  However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work.  For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.

  The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.

  The Corresponding Source for a work in source code form is that
same work.

  2. Basic Permissions.

  All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met.  This License explicitly affirms your unlimited
permission to run the unmodified Program.  The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work.  This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.

  You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force.  You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright.  Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.

  Conveying under any other circumstances is permitted solely under
the conditions stated below.  Sublicensing is not allowed; section 10
makes it unnecessary.

  3. Protecting Users' Legal Rights From Anti-Circumvention Law.

  No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.

  When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.

  4. Conveying Verbatim Copies.

  You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.

  You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.

  5. Conveying Modified Source Versions.

  You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:

    a) The work must carry prominent notices stating that you modified
    it, and giving a relevant date.

    b) The work must carry prominent notices stating that it is
    released under this License and any conditions added under section
    7.  This requirement modifies the requirement in section 4 to
    "keep intact all notices".

    c) You must license the entire work, as a whole, under this
    License to anyone who comes into possession of a copy.  This
    License will therefore apply, along with any applicable section 7
    additional terms, to the whole of the work, and all its parts,
    regardless of how they are packaged.  This License gives no
    permission to license the work in any other way, but it does not
    invalidate such permission if you have separately received it.

    d) If the work has interactive user interfaces, each must display
    Appropriate Legal Notices; however, if the Program has interactive
    interfaces that do not display Appropriate Legal Notices, your
    work need not make them do so.

  A compilation of a covered work with other separate and independent
|
236 |
+
works, which are not by their nature extensions of the covered work,
|
237 |
+
and which are not combined with it such as to form a larger program,
|
238 |
+
in or on a volume of a storage or distribution medium, is called an
|
239 |
+
"aggregate" if the compilation and its resulting copyright are not
|
240 |
+
used to limit the access or legal rights of the compilation's users
|
241 |
+
beyond what the individual works permit. Inclusion of a covered work
|
242 |
+
in an aggregate does not cause this License to apply to the other
|
243 |
+
parts of the aggregate.
|
244 |
+
|
245 |
+
6. Conveying Non-Source Forms.
|
246 |
+
|
247 |
+
You may convey a covered work in object code form under the terms
|
248 |
+
of sections 4 and 5, provided that you also convey the
|
249 |
+
machine-readable Corresponding Source under the terms of this License,
|
250 |
+
in one of these ways:
|
251 |
+
|
252 |
+
a) Convey the object code in, or embodied in, a physical product
|
253 |
+
(including a physical distribution medium), accompanied by the
|
254 |
+
Corresponding Source fixed on a durable physical medium
|
255 |
+
customarily used for software interchange.
|
256 |
+
|
257 |
+
b) Convey the object code in, or embodied in, a physical product
|
258 |
+
(including a physical distribution medium), accompanied by a
|
259 |
+
written offer, valid for at least three years and valid for as
|
260 |
+
long as you offer spare parts or customer support for that product
|
261 |
+
model, to give anyone who possesses the object code either (1) a
|
262 |
+
copy of the Corresponding Source for all the software in the
|
263 |
+
product that is covered by this License, on a durable physical
|
264 |
+
medium customarily used for software interchange, for a price no
|
265 |
+
more than your reasonable cost of physically performing this
|
266 |
+
conveying of source, or (2) access to copy the
|
267 |
+
Corresponding Source from a network server at no charge.
|
268 |
+
|
269 |
+
c) Convey individual copies of the object code with a copy of the
|
270 |
+
written offer to provide the Corresponding Source. This
|
271 |
+
alternative is allowed only occasionally and noncommercially, and
|
272 |
+
only if you received the object code with such an offer, in accord
|
273 |
+
with subsection 6b.
|
274 |
+
|
275 |
+
d) Convey the object code by offering access from a designated
|
276 |
+
place (gratis or for a charge), and offer equivalent access to the
|
277 |
+
Corresponding Source in the same way through the same place at no
|
278 |
+
further charge. You need not require recipients to copy the
|
279 |
+
Corresponding Source along with the object code. If the place to
|
280 |
+
copy the object code is a network server, the Corresponding Source
|
281 |
+
may be on a different server (operated by you or a third party)
|
282 |
+
that supports equivalent copying facilities, provided you maintain
|
283 |
+
clear directions next to the object code saying where to find the
|
284 |
+
Corresponding Source. Regardless of what server hosts the
|
285 |
+
Corresponding Source, you remain obligated to ensure that it is
|
286 |
+
available for as long as needed to satisfy these requirements.
|
287 |
+
|
288 |
+
e) Convey the object code using peer-to-peer transmission, provided
|
289 |
+
you inform other peers where the object code and Corresponding
|
290 |
+
Source of the work are being offered to the general public at no
|
291 |
+
charge under subsection 6d.
|
292 |
+
|
293 |
+
A separable portion of the object code, whose source code is excluded
|
294 |
+
from the Corresponding Source as a System Library, need not be
|
295 |
+
included in conveying the object code work.
|
296 |
+
|
297 |
+
A "User Product" is either (1) a "consumer product", which means any
|
298 |
+
tangible personal property which is normally used for personal, family,
|
299 |
+
or household purposes, or (2) anything designed or sold for incorporation
|
300 |
+
into a dwelling. In determining whether a product is a consumer product,
|
301 |
+
doubtful cases shall be resolved in favor of coverage. For a particular
|
302 |
+
product received by a particular user, "normally used" refers to a
|
303 |
+
typical or common use of that class of product, regardless of the status
|
304 |
+
of the particular user or of the way in which the particular user
|
305 |
+
actually uses, or expects or is expected to use, the product. A product
|
306 |
+
is a consumer product regardless of whether the product has substantial
|
307 |
+
commercial, industrial or non-consumer uses, unless such uses represent
|
308 |
+
the only significant mode of use of the product.
|
309 |
+
|
310 |
+
"Installation Information" for a User Product means any methods,
|
311 |
+
procedures, authorization keys, or other information required to install
|
312 |
+
and execute modified versions of a covered work in that User Product from
|
313 |
+
a modified version of its Corresponding Source. The information must
|
314 |
+
suffice to ensure that the continued functioning of the modified object
|
315 |
+
code is in no case prevented or interfered with solely because
|
316 |
+
modification has been made.
|
317 |
+
|
318 |
+
If you convey an object code work under this section in, or with, or
|
319 |
+
specifically for use in, a User Product, and the conveying occurs as
|
320 |
+
part of a transaction in which the right of possession and use of the
|
321 |
+
User Product is transferred to the recipient in perpetuity or for a
|
322 |
+
fixed term (regardless of how the transaction is characterized), the
|
323 |
+
Corresponding Source conveyed under this section must be accompanied
|
324 |
+
by the Installation Information. But this requirement does not apply
|
325 |
+
if neither you nor any third party retains the ability to install
|
326 |
+
modified object code on the User Product (for example, the work has
|
327 |
+
been installed in ROM).
|
328 |
+
|
329 |
+
The requirement to provide Installation Information does not include a
|
330 |
+
requirement to continue to provide support service, warranty, or updates
|
331 |
+
for a work that has been modified or installed by the recipient, or for
|
332 |
+
the User Product in which it has been modified or installed. Access to a
|
333 |
+
network may be denied when the modification itself materially and
|
334 |
+
adversely affects the operation of the network or violates the rules and
|
335 |
+
protocols for communication across the network.
|
336 |
+
|
337 |
+
Corresponding Source conveyed, and Installation Information provided,
|
338 |
+
in accord with this section must be in a format that is publicly
|
339 |
+
documented (and with an implementation available to the public in
|
340 |
+
source code form), and must require no special password or key for
|
341 |
+
unpacking, reading or copying.
|
342 |
+
|
343 |
+
7. Additional Terms.
|
344 |
+
|
345 |
+
"Additional permissions" are terms that supplement the terms of this
|
346 |
+
License by making exceptions from one or more of its conditions.
|
347 |
+
Additional permissions that are applicable to the entire Program shall
|
348 |
+
be treated as though they were included in this License, to the extent
|
349 |
+
that they are valid under applicable law. If additional permissions
|
350 |
+
apply only to part of the Program, that part may be used separately
|
351 |
+
under those permissions, but the entire Program remains governed by
|
352 |
+
this License without regard to the additional permissions.
|
353 |
+
|
354 |
+
When you convey a copy of a covered work, you may at your option
|
355 |
+
remove any additional permissions from that copy, or from any part of
|
356 |
+
it. (Additional permissions may be written to require their own
|
357 |
+
removal in certain cases when you modify the work.) You may place
|
358 |
+
additional permissions on material, added by you to a covered work,
|
359 |
+
for which you have or can give appropriate copyright permission.
|
360 |
+
|
361 |
+
Notwithstanding any other provision of this License, for material you
|
362 |
+
add to a covered work, you may (if authorized by the copyright holders of
|
363 |
+
that material) supplement the terms of this License with terms:
|
364 |
+
|
365 |
+
a) Disclaiming warranty or limiting liability differently from the
|
366 |
+
terms of sections 15 and 16 of this License; or
|
367 |
+
|
368 |
+
b) Requiring preservation of specified reasonable legal notices or
|
369 |
+
author attributions in that material or in the Appropriate Legal
|
370 |
+
Notices displayed by works containing it; or
|
371 |
+
|
372 |
+
c) Prohibiting misrepresentation of the origin of that material, or
|
373 |
+
requiring that modified versions of such material be marked in
|
374 |
+
reasonable ways as different from the original version; or
|
375 |
+
|
376 |
+
d) Limiting the use for publicity purposes of names of licensors or
|
377 |
+
authors of the material; or
|
378 |
+
|
379 |
+
e) Declining to grant rights under trademark law for use of some
|
380 |
+
trade names, trademarks, or service marks; or
|
381 |
+
|
382 |
+
f) Requiring indemnification of licensors and authors of that
|
383 |
+
material by anyone who conveys the material (or modified versions of
|
384 |
+
it) with contractual assumptions of liability to the recipient, for
|
385 |
+
any liability that these contractual assumptions directly impose on
|
386 |
+
those licensors and authors.
|
387 |
+
|
388 |
+
All other non-permissive additional terms are considered "further
|
389 |
+
restrictions" within the meaning of section 10. If the Program as you
|
390 |
+
received it, or any part of it, contains a notice stating that it is
|
391 |
+
governed by this License along with a term that is a further
|
392 |
+
restriction, you may remove that term. If a license document contains
|
393 |
+
a further restriction but permits relicensing or conveying under this
|
394 |
+
License, you may add to a covered work material governed by the terms
|
395 |
+
of that license document, provided that the further restriction does
|
396 |
+
not survive such relicensing or conveying.
|
397 |
+
|
398 |
+
If you add terms to a covered work in accord with this section, you
|
399 |
+
must place, in the relevant source files, a statement of the
|
400 |
+
additional terms that apply to those files, or a notice indicating
|
401 |
+
where to find the applicable terms.
|
402 |
+
|
403 |
+
Additional terms, permissive or non-permissive, may be stated in the
|
404 |
+
form of a separately written license, or stated as exceptions;
|
405 |
+
the above requirements apply either way.
|
406 |
+
|
407 |
+
8. Termination.
|
408 |
+
|
409 |
+
You may not propagate or modify a covered work except as expressly
|
410 |
+
provided under this License. Any attempt otherwise to propagate or
|
411 |
+
modify it is void, and will automatically terminate your rights under
|
412 |
+
this License (including any patent licenses granted under the third
|
413 |
+
paragraph of section 11).
|
414 |
+
|
415 |
+
However, if you cease all violation of this License, then your
|
416 |
+
license from a particular copyright holder is reinstated (a)
|
417 |
+
provisionally, unless and until the copyright holder explicitly and
|
418 |
+
finally terminates your license, and (b) permanently, if the copyright
|
419 |
+
holder fails to notify you of the violation by some reasonable means
|
420 |
+
prior to 60 days after the cessation.
|
421 |
+
|
422 |
+
Moreover, your license from a particular copyright holder is
|
423 |
+
reinstated permanently if the copyright holder notifies you of the
|
424 |
+
violation by some reasonable means, this is the first time you have
|
425 |
+
received notice of violation of this License (for any work) from that
|
426 |
+
copyright holder, and you cure the violation prior to 30 days after
|
427 |
+
your receipt of the notice.
|
428 |
+
|
429 |
+
Termination of your rights under this section does not terminate the
|
430 |
+
licenses of parties who have received copies or rights from you under
|
431 |
+
this License. If your rights have been terminated and not permanently
|
432 |
+
reinstated, you do not qualify to receive new licenses for the same
|
433 |
+
material under section 10.
|
434 |
+
|
435 |
+
9. Acceptance Not Required for Having Copies.
|
436 |
+
|
437 |
+
You are not required to accept this License in order to receive or
|
438 |
+
run a copy of the Program. Ancillary propagation of a covered work
|
439 |
+
occurring solely as a consequence of using peer-to-peer transmission
|
440 |
+
to receive a copy likewise does not require acceptance. However,
|
441 |
+
nothing other than this License grants you permission to propagate or
|
442 |
+
modify any covered work. These actions infringe copyright if you do
|
443 |
+
not accept this License. Therefore, by modifying or propagating a
|
444 |
+
covered work, you indicate your acceptance of this License to do so.
|
445 |
+
|
446 |
+
10. Automatic Licensing of Downstream Recipients.
|
447 |
+
|
448 |
+
Each time you convey a covered work, the recipient automatically
|
449 |
+
receives a license from the original licensors, to run, modify and
|
450 |
+
propagate that work, subject to this License. You are not responsible
|
451 |
+
for enforcing compliance by third parties with this License.
|
452 |
+
|
453 |
+
An "entity transaction" is a transaction transferring control of an
|
454 |
+
organization, or substantially all assets of one, or subdividing an
|
455 |
+
organization, or merging organizations. If propagation of a covered
|
456 |
+
work results from an entity transaction, each party to that
|
457 |
+
transaction who receives a copy of the work also receives whatever
|
458 |
+
licenses to the work the party's predecessor in interest had or could
|
459 |
+
give under the previous paragraph, plus a right to possession of the
|
460 |
+
Corresponding Source of the work from the predecessor in interest, if
|
461 |
+
the predecessor has it or can get it with reasonable efforts.
|
462 |
+
|
463 |
+
You may not impose any further restrictions on the exercise of the
|
464 |
+
rights granted or affirmed under this License. For example, you may
|
465 |
+
not impose a license fee, royalty, or other charge for exercise of
|
466 |
+
rights granted under this License, and you may not initiate litigation
|
467 |
+
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
468 |
+
any patent claim is infringed by making, using, selling, offering for
|
469 |
+
sale, or importing the Program or any portion of it.
|
470 |
+
|
471 |
+
11. Patents.
|
472 |
+
|
473 |
+
A "contributor" is a copyright holder who authorizes use under this
|
474 |
+
License of the Program or a work on which the Program is based. The
|
475 |
+
work thus licensed is called the contributor's "contributor version".
|
476 |
+
|
477 |
+
A contributor's "essential patent claims" are all patent claims
|
478 |
+
owned or controlled by the contributor, whether already acquired or
|
479 |
+
hereafter acquired, that would be infringed by some manner, permitted
|
480 |
+
by this License, of making, using, or selling its contributor version,
|
481 |
+
but do not include claims that would be infringed only as a
|
482 |
+
consequence of further modification of the contributor version. For
|
483 |
+
purposes of this definition, "control" includes the right to grant
|
484 |
+
patent sublicenses in a manner consistent with the requirements of
|
485 |
+
this License.
|
486 |
+
|
487 |
+
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
488 |
+
patent license under the contributor's essential patent claims, to
|
489 |
+
make, use, sell, offer for sale, import and otherwise run, modify and
|
490 |
+
propagate the contents of its contributor version.
|
491 |
+
|
492 |
+
In the following three paragraphs, a "patent license" is any express
|
493 |
+
agreement or commitment, however denominated, not to enforce a patent
|
494 |
+
(such as an express permission to practice a patent or covenant not to
|
495 |
+
sue for patent infringement). To "grant" such a patent license to a
|
496 |
+
party means to make such an agreement or commitment not to enforce a
|
497 |
+
patent against the party.
|
498 |
+
|
499 |
+
If you convey a covered work, knowingly relying on a patent license,
|
500 |
+
and the Corresponding Source of the work is not available for anyone
|
501 |
+
to copy, free of charge and under the terms of this License, through a
|
502 |
+
publicly available network server or other readily accessible means,
|
503 |
+
then you must either (1) cause the Corresponding Source to be so
|
504 |
+
available, or (2) arrange to deprive yourself of the benefit of the
|
505 |
+
patent license for this particular work, or (3) arrange, in a manner
|
506 |
+
consistent with the requirements of this License, to extend the patent
|
507 |
+
license to downstream recipients. "Knowingly relying" means you have
|
508 |
+
actual knowledge that, but for the patent license, your conveying the
|
509 |
+
covered work in a country, or your recipient's use of the covered work
|
510 |
+
in a country, would infringe one or more identifiable patents in that
|
511 |
+
country that you have reason to believe are valid.
|
512 |
+
|
513 |
+
If, pursuant to or in connection with a single transaction or
|
514 |
+
arrangement, you convey, or propagate by procuring conveyance of, a
|
515 |
+
covered work, and grant a patent license to some of the parties
|
516 |
+
receiving the covered work authorizing them to use, propagate, modify
|
517 |
+
or convey a specific copy of the covered work, then the patent license
|
518 |
+
you grant is automatically extended to all recipients of the covered
|
519 |
+
work and works based on it.
|
520 |
+
|
521 |
+
A patent license is "discriminatory" if it does not include within
|
522 |
+
the scope of its coverage, prohibits the exercise of, or is
|
523 |
+
conditioned on the non-exercise of one or more of the rights that are
|
524 |
+
specifically granted under this License. You may not convey a covered
|
525 |
+
work if you are a party to an arrangement with a third party that is
|
526 |
+
in the business of distributing software, under which you make payment
|
527 |
+
to the third party based on the extent of your activity of conveying
|
528 |
+
the work, and under which the third party grants, to any of the
|
529 |
+
parties who would receive the covered work from you, a discriminatory
|
530 |
+
patent license (a) in connection with copies of the covered work
|
531 |
+
conveyed by you (or copies made from those copies), or (b) primarily
|
532 |
+
for and in connection with specific products or compilations that
|
533 |
+
contain the covered work, unless you entered into that arrangement,
|
534 |
+
or that patent license was granted, prior to 28 March 2007.
|
535 |
+
|
536 |
+
Nothing in this License shall be construed as excluding or limiting
|
537 |
+
any implied license or other defenses to infringement that may
|
538 |
+
otherwise be available to you under applicable patent law.
|
539 |
+
|
540 |
+
12. No Surrender of Others' Freedom.
|
541 |
+
|
542 |
+
If conditions are imposed on you (whether by court order, agreement or
|
543 |
+
otherwise) that contradict the conditions of this License, they do not
|
544 |
+
excuse you from the conditions of this License. If you cannot convey a
|
545 |
+
covered work so as to satisfy simultaneously your obligations under this
|
546 |
+
License and any other pertinent obligations, then as a consequence you may
|
547 |
+
not convey it at all. For example, if you agree to terms that obligate you
|
548 |
+
to collect a royalty for further conveying from those to whom you convey
|
549 |
+
the Program, the only way you could satisfy both those terms and this
|
550 |
+
License would be to refrain entirely from conveying the Program.
|
551 |
+
|
552 |
+
13. Use with the GNU Affero General Public License.
|
553 |
+
|
554 |
+
Notwithstanding any other provision of this License, you have
|
555 |
+
permission to link or combine any covered work with a work licensed
|
556 |
+
under version 3 of the GNU Affero General Public License into a single
|
557 |
+
combined work, and to convey the resulting work. The terms of this
|
558 |
+
License will continue to apply to the part which is the covered work,
|
559 |
+
but the special requirements of the GNU Affero General Public License,
|
560 |
+
section 13, concerning interaction through a network will apply to the
|
561 |
+
combination as such.
|
562 |
+
|
563 |
+
14. Revised Versions of this License.
|
564 |
+
|
565 |
+
The Free Software Foundation may publish revised and/or new versions of
|
566 |
+
the GNU General Public License from time to time. Such new versions will
|
567 |
+
be similar in spirit to the present version, but may differ in detail to
|
568 |
+
address new problems or concerns.
|
569 |
+
|
570 |
+
Each version is given a distinguishing version number. If the
|
571 |
+
Program specifies that a certain numbered version of the GNU General
|
572 |
+
Public License "or any later version" applies to it, you have the
|
573 |
+
option of following the terms and conditions either of that numbered
|
574 |
+
version or of any later version published by the Free Software
|
575 |
+
Foundation. If the Program does not specify a version number of the
|
576 |
+
GNU General Public License, you may choose any version ever published
|
577 |
+
by the Free Software Foundation.
|
578 |
+
|
579 |
+
If the Program specifies that a proxy can decide which future
|
580 |
+
versions of the GNU General Public License can be used, that proxy's
|
581 |
+
public statement of acceptance of a version permanently authorizes you
|
582 |
+
to choose that version for the Program.
|
583 |
+
|
584 |
+
Later license versions may give you additional or different
|
585 |
+
permissions. However, no additional obligations are imposed on any
|
586 |
+
author or copyright holder as a result of your choosing to follow a
|
587 |
+
later version.
|
588 |
+
|
589 |
+
15. Disclaimer of Warranty.
|
590 |
+
|
591 |
+
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
592 |
+
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
593 |
+
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
594 |
+
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
595 |
+
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
596 |
+
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
597 |
+
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
598 |
+
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
599 |
+
|
600 |
+
16. Limitation of Liability.
|
601 |
+
|
602 |
+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
603 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
604 |
+
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
605 |
+
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
606 |
+
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
607 |
+
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
608 |
+
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
609 |
+
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
610 |
+
SUCH DAMAGES.
|
611 |
+
|
612 |
+
17. Interpretation of Sections 15 and 16.
|
613 |
+
|
614 |
+
If the disclaimer of warranty and limitation of liability provided
|
615 |
+
above cannot be given local legal effect according to their terms,
|
616 |
+
reviewing courts shall apply local law that most closely approximates
|
617 |
+
an absolute waiver of all civil liability in connection with the
|
618 |
+
Program, unless a warranty or assumption of liability accompanies a
|
619 |
+
copy of the Program in return for a fee.
|
620 |
+
|
621 |
+
END OF TERMS AND CONDITIONS
|
622 |
+
|
623 |
+
How to Apply These Terms to Your New Programs
|
624 |
+
|
625 |
+
If you develop a new program, and you want it to be of the greatest
|
626 |
+
possible use to the public, the best way to achieve this is to make it
|
627 |
+
free software which everyone can redistribute and change under these terms.
|
628 |
+
|
629 |
+
To do so, attach the following notices to the program. It is safest
|
630 |
+
to attach them to the start of each source file to most effectively
|
631 |
+
state the exclusion of warranty; and each file should have at least
|
632 |
+
the "copyright" line and a pointer to where the full notice is found.
|
633 |
+
|
634 |
+
<one line to give the program's name and a brief idea of what it does.>
|
635 |
+
Copyright (C) <year> <name of author>
|
636 |
+
|
637 |
+
This program is free software: you can redistribute it and/or modify
|
638 |
+
it under the terms of the GNU General Public License as published by
|
639 |
+
the Free Software Foundation, either version 3 of the License, or
|
640 |
+
(at your option) any later version.
|
641 |
+
|
642 |
+
This program is distributed in the hope that it will be useful,
|
643 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
644 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
645 |
+
GNU General Public License for more details.
|
646 |
+
|
647 |
+
You should have received a copy of the GNU General Public License
|
648 |
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
649 |
+
|
650 |
+
Also add information on how to contact you by electronic and paper mail.
|
651 |
+
|
652 |
+
If the program does terminal interaction, make it output a short
|
653 |
+
notice like this when it starts in an interactive mode:
|
654 |
+
|
655 |
+
<program> Copyright (C) <year> <name of author>
|
656 |
+
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
657 |
+
This is free software, and you are welcome to redistribute it
|
658 |
+
under certain conditions; type `show c' for details.
|
659 |
+
|
660 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
661 |
+
parts of the General Public License. Of course, your program's commands
|
662 |
+
might be different; for a GUI interface, you would use an "about box".
|
663 |
+
|
664 |
+
You should also get your employer (if you work as a programmer) or school,
|
665 |
+
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
666 |
+
For more information on this, and how to apply and follow the GNU GPL, see
|
667 |
+
<https://www.gnu.org/licenses/>.
|
668 |
+
|
669 |
+
The GNU General Public License does not permit incorporating your program
|
670 |
+
into proprietary programs. If your program is a subroutine library, you
|
671 |
+
may consider it more useful to permit linking proprietary applications with
|
672 |
+
the library. If this is what you want to do, use the GNU Lesser General
|
673 |
+
Public License instead of this License. But first, please read
|
674 |
+
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
bert_vits2/README.md
ADDED
@@ -0,0 +1,5 @@
# Bert-VITS2

VITS2 Backbone with bert
## Mature Travelers/Trailblazers/Captains/Doctors/sensei/Witchers/喵喵露/V should read the code and learn how to train on their own.
### Using this project for any purpose that violates the Constitution of the People's Republic of China, the Criminal Law of the People's Republic of China, the Public Security Administration Punishments Law of the People's Republic of China, or the Civil Code of the People's Republic of China is strictly prohibited.
bert_vits2/__init__.py
ADDED
@@ -0,0 +1,2 @@
from bert_vits2.bert_vits2 import Bert_VITS2
from bert_vits2 import text
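
These two re-exports are the package's public entry point. A minimal usage sketch (the checkpoint and config paths here are hypothetical, and full synthesis additionally requires calling load_model() with the project's model handler, as defined in bert_vits2/bert_vits2.py below):

import torch
from bert_vits2 import Bert_VITS2  # re-exported by this __init__.py

# Hypothetical paths, for illustration only; the Space resolves real
# model locations from its own configuration.
model = Bert_VITS2("data/models/G_latest.pth", "data/models/config.json",
                   device=torch.device("cpu"))
print(model.get_speakers())  # speaker names, ordered by their spk2id index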
bert_vits2/attentions.py
ADDED
@@ -0,0 +1,352 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from bert_vits2 import commons
from torch.nn.utils import weight_norm, remove_weight_norm


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


class Encoder(nn.Module):
    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4,
                 isflow=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size
        # if isflow:
        #     cond_layer = torch.nn.Conv1d(256, 2 * hidden_channels * n_layers, 1)
        #     self.cond_pre = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, 1)
        #     self.cond_layer = weight_norm(cond_layer, name='weight')
        #     self.gin_channels = 256
        self.cond_layer_idx = self.n_layers
        if 'gin_channels' in kwargs:
            self.gin_channels = kwargs['gin_channels']
            if self.gin_channels != 0:
                self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
                # vits2 says 3rd block, so idx is 2 by default
                self.cond_layer_idx = kwargs['cond_layer_idx'] if 'cond_layer_idx' in kwargs else 2
                # print(self.gin_channels, self.cond_layer_idx)
                assert self.cond_layer_idx < self.n_layers, 'cond_layer_idx should be less than n_layers'
        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for i in range(self.n_layers):
            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
                                                       window_size=window_size))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, g=None):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            if i == self.cond_layer_idx and g is not None:
                g = self.spk_emb_linear(g.transpose(1, 2))
                g = g.transpose(1, 2)
                x = x + g
                x = x * x_mask
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class Decoder(nn.Module):
    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.,
                 proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for i in range(self.n_layers):
            self.self_attn_layers.append(
                MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout,
                                   proximal_bias=proximal_bias, proximal_init=proximal_init))
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.encdec_attn_layers.append(
                MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(
                FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.self_attn_layers[i](x, x, self_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_0[i](x + y)

            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True,
                 block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels ** -0.5
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length ** 2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


class FFN(nn.Module):
    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None,
                 causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = self.kernel_size - 1
        pad_r = 0
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x
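
A quick smoke test for the Encoder above. The hyperparameters here (192 hidden channels, 2 heads, 4 layers, kernel size 3) are illustrative assumptions in the typical VITS range, not values mandated by this file; the shapes follow the [batch, channels, time] convention used throughout it:

import torch
from bert_vits2.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=4, kernel_size=3, p_dropout=0.1)
x = torch.randn(1, 192, 50)    # dummy hidden sequence: [batch, hidden_channels, time]
x_mask = torch.ones(1, 1, 50)  # 1 = real frame, 0 = padded frame
y = enc(x, x_mask)             # shape preserved: [1, 192, 50]
print(y.shape)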
bert_vits2/bert_vits2.py
ADDED
@@ -0,0 +1,403 @@
import logging

import numpy as np
import torch

from bert_vits2 import commons
from bert_vits2 import utils as bert_vits2_utils
from bert_vits2.clap_wrapper import get_clap_audio_feature, get_clap_text_feature
from bert_vits2.get_emo import get_emo
from bert_vits2.models import SynthesizerTrn
from bert_vits2.models_v230 import SynthesizerTrn as SynthesizerTrn_v230
from bert_vits2.models_ja_extra import SynthesizerTrn as SynthesizerTrn_ja_extra
from bert_vits2.text import *
from bert_vits2.text.cleaner import clean_text
from bert_vits2.utils import process_legacy_versions
from contants import config
from utils import get_hparams_from_file
from utils.sentence import split_languages


class Bert_VITS2:
    def __init__(self, model_path, config, device=torch.device("cpu"), **kwargs):
        self.model_path = model_path
        self.hps_ms = get_hparams_from_file(config) if isinstance(config, str) else config
        self.n_speakers = getattr(self.hps_ms.data, 'n_speakers', 0)
        self.speakers = [item[0] for item in
                         sorted(list(getattr(self.hps_ms.data, 'spk2id', {'0': 0}).items()), key=lambda x: x[1])]
        self.symbols = symbols
        self.sampling_rate = self.hps_ms.data.sampling_rate

        self.bert_model_names = {}
        self.zh_bert_extra = False
        self.ja_bert_extra = False
        self.ja_bert_dim = 1024
        self.num_tones = num_tones
        self.pinyinPlus = None

        # Compatible with legacy versions
        self.version = process_legacy_versions(self.hps_ms).lower().replace("-", "_")
        self.text_extra_str_map = {"zh": "", "ja": "", "en": ""}
        self.bert_extra_str_map = {"zh": "", "ja": "", "en": ""}
        self.hps_ms.model.emotion_embedding = None
        if self.version in ["1.0", "1.0.0", "1.0.1"]:
            """
            chinese-roberta-wwm-ext-large
            """
            self.version = "1.0"
            self.symbols = symbols_legacy
            self.hps_ms.model.n_layers_trans_flow = 3
            self.lang = getattr(self.hps_ms.data, "lang", ["zh"])
            self.ja_bert_dim = 768
            self.num_tones = num_tones_v111
            self.text_extra_str_map.update({"zh": "_v100"})

        elif self.version in ["1.1.0-transition"]:
            """
            chinese-roberta-wwm-ext-large
            """
            self.version = "1.1.0-transition"
            self.hps_ms.model.n_layers_trans_flow = 3
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja"])
            self.ja_bert_dim = 768
            self.num_tones = num_tones_v111
            if "ja" in self.lang: self.bert_model_names.update({"ja": "BERT_BASE_JAPANESE_V3"})
            self.text_extra_str_map.update({"zh": "_v100", "ja": "_v111"})
            self.bert_extra_str_map.update({"ja": "_v111"})

        elif self.version in ["1.1", "1.1.0", "1.1.1"]:
            """
            chinese-roberta-wwm-ext-large
            bert-base-japanese-v3
            """
            self.version = "1.1"
            self.hps_ms.model.n_layers_trans_flow = 6
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja"])
            self.ja_bert_dim = 768
            self.num_tones = num_tones_v111
            if "ja" in self.lang: self.bert_model_names.update({"ja": "BERT_BASE_JAPANESE_V3"})
            self.text_extra_str_map.update({"zh": "_v100", "ja": "_v111"})
            self.bert_extra_str_map.update({"ja": "_v111"})

        elif self.version in ["2.0", "2.0.0", "2.0.1", "2.0.2"]:
            """
            chinese-roberta-wwm-ext-large
            deberta-v2-large-japanese
            deberta-v3-large
            """
            self.version = "2.0"
            self.hps_ms.model.n_layers_trans_flow = 4
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            if "ja" in self.lang: self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE"})
            if "en" in self.lang: self.bert_model_names.update({"en": "DEBERTA_V3_LARGE"})
            self.text_extra_str_map.update({"zh": "_v100", "ja": "_v200", "en": "_v200"})
            self.bert_extra_str_map.update({"ja": "_v200", "en": "_v200"})

        elif self.version in ["2.1", "2.1.0"]:
            """
            chinese-roberta-wwm-ext-large
            deberta-v2-large-japanese-char-wwm
            deberta-v3-large
            wav2vec2-large-robust-12-ft-emotion-msp-dim
            """
            self.version = "2.1"
            self.hps_ms.model.n_layers_trans_flow = 4
            self.hps_ms.model.emotion_embedding = 1
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            if "ja" in self.lang: self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM"})
            if "en" in self.lang: self.bert_model_names.update({"en": "DEBERTA_V3_LARGE"})

        elif self.version in ["2.2", "2.2.0"]:
            """
            chinese-roberta-wwm-ext-large
            deberta-v2-large-japanese-char-wwm
            deberta-v3-large
            clap-htsat-fused
            """
            self.version = "2.2"
            self.hps_ms.model.n_layers_trans_flow = 4
            self.hps_ms.model.emotion_embedding = 2
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            if "ja" in self.lang: self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM"})
            if "en" in self.lang: self.bert_model_names.update({"en": "DEBERTA_V3_LARGE"})

        elif self.version in ["2.3", "2.3.0"]:
            """
            chinese-roberta-wwm-ext-large
            deberta-v2-large-japanese-char-wwm
            deberta-v3-large
            """
            self.version = "2.3"
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            self.text_extra_str_map.update({"en": "_v230"})
            if "ja" in self.lang: self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM"})
            if "en" in self.lang: self.bert_model_names.update({"en": "DEBERTA_V3_LARGE"})

        elif self.version is not None and self.version in ["extra", "zh_clap"]:
            """
            Erlangshen-MegatronBert-1.3B-Chinese
            clap-htsat-fused
            """
            self.version = "extra"
            self.hps_ms.model.emotion_embedding = 2
            self.hps_ms.model.n_layers_trans_flow = 6
            self.lang = ["zh"]
            self.num_tones = num_tones
            self.zh_bert_extra = True
            self.bert_model_names.update({"zh": "Erlangshen_MegatronBert_1.3B_Chinese"})
            self.bert_extra_str_map.update({"zh": "_extra"})

        elif self.version is not None and self.version in ["extra_fix", "2.4", "2.4.0"]:
            """
            Erlangshen-MegatronBert-1.3B-Chinese
            clap-htsat-fused
            """
            self.version = "2.4"
            self.hps_ms.model.emotion_embedding = 2
            self.hps_ms.model.n_layers_trans_flow = 6
            self.lang = ["zh"]
            self.num_tones = num_tones
            self.zh_bert_extra = True
            self.bert_model_names.update({"zh": "Erlangshen_MegatronBert_1.3B_Chinese"})
            self.bert_extra_str_map.update({"zh": "_extra"})
            self.text_extra_str_map.update({"zh": "_v240"})

        elif self.version is not None and self.version in ["ja_extra"]:
            """
            deberta-v2-large-japanese-char-wwm
            """
            self.version = "ja_extra"
            self.hps_ms.model.emotion_embedding = 2
            self.hps_ms.model.n_layers_trans_flow = 6
            self.lang = ["ja"]
            self.num_tones = num_tones
            self.ja_bert_extra = True
            self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM"})
            self.bert_extra_str_map.update({"ja": "_extra"})
            self.text_extra_str_map.update({"ja": "_extra"})

        else:
            logging.debug("Version information not found. Loaded as the newest version: v2.3.")
            self.version = "2.3"
            self.lang = getattr(self.hps_ms.data, "lang", ["zh", "ja", "en"])
            self.num_tones = num_tones
            self.text_extra_str_map.update({"en": "_v230"})
            if "ja" in self.lang: self.bert_model_names.update({"ja": "DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM"})
            if "en" in self.lang: self.bert_model_names.update({"en": "DEBERTA_V3_LARGE"})

        if "zh" in self.lang and "zh" not in self.bert_model_names.keys():
            self.bert_model_names.update({"zh": "CHINESE_ROBERTA_WWM_EXT_LARGE"})

        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}

        self.device = device

    def load_model(self, model_handler):
        self.model_handler = model_handler

        if self.version in ["2.3", "extra", "2.4"]:
            Synthesizer = SynthesizerTrn_v230
        elif self.version == "ja_extra":
            Synthesizer = SynthesizerTrn_ja_extra
        else:
            Synthesizer = SynthesizerTrn

        if self.version == "2.4":
            self.pinyinPlus = self.model_handler.get_pinyinPlus()
        self.net_g = Synthesizer(
            len(self.symbols),
            self.hps_ms.data.filter_length // 2 + 1,
            self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
            n_speakers=self.hps_ms.data.n_speakers,
            symbols=self.symbols,
            ja_bert_dim=self.ja_bert_dim,
            num_tones=self.num_tones,
            zh_bert_extra=self.zh_bert_extra,
            **self.hps_ms.model).to(self.device)
        _ = self.net_g.eval()
        bert_vits2_utils.load_checkpoint(self.model_path, self.net_g, None, skip_optimizer=True, version=self.version)

    def get_speakers(self):
        return self.speakers

    def get_text(self, text, language_str, hps, style_text=None, style_weight=0.7):
        clean_text_lang_str = language_str + self.text_extra_str_map.get(language_str, "")
        bert_feature_lang_str = language_str + self.bert_extra_str_map.get(language_str, "")

        tokenizer, _ = self.model_handler.get_bert_model(self.bert_model_names[language_str])

        norm_text, phone, tone, word2ph = clean_text(text, clean_text_lang_str, tokenizer, self.pinyinPlus)
|
234 |
+
|
235 |
+
phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, self._symbol_to_id)
|
236 |
+
|
237 |
+
if hps.data.add_blank:
|
238 |
+
phone = commons.intersperse(phone, 0)
|
239 |
+
tone = commons.intersperse(tone, 0)
|
240 |
+
language = commons.intersperse(language, 0)
|
241 |
+
for i in range(len(word2ph)):
|
242 |
+
word2ph[i] = word2ph[i] * 2
|
243 |
+
word2ph[0] += 1
|
244 |
+
|
245 |
+
if style_text == "" or self.zh_bert_extra:
|
246 |
+
style_text = None
|
247 |
+
|
248 |
+
bert = self.model_handler.get_bert_feature(norm_text, word2ph, bert_feature_lang_str,
|
249 |
+
self.bert_model_names[language_str], style_text, style_weight)
|
250 |
+
del word2ph
|
251 |
+
assert bert.shape[-1] == len(phone), phone
|
252 |
+
|
253 |
+
if self.zh_bert_extra:
|
254 |
+
zh_bert = bert
|
255 |
+
ja_bert, en_bert = None, None
|
256 |
+
elif self.ja_bert_extra:
|
257 |
+
ja_bert = bert
|
258 |
+
zh_bert, en_bert = None, None
|
259 |
+
elif language_str == "zh":
|
260 |
+
zh_bert = bert
|
261 |
+
ja_bert = torch.zeros(self.ja_bert_dim, len(phone))
|
262 |
+
en_bert = torch.zeros(1024, len(phone))
|
263 |
+
elif language_str == "ja":
|
264 |
+
zh_bert = torch.zeros(1024, len(phone))
|
265 |
+
ja_bert = bert
|
266 |
+
en_bert = torch.zeros(1024, len(phone))
|
267 |
+
elif language_str == "en":
|
268 |
+
zh_bert = torch.zeros(1024, len(phone))
|
269 |
+
ja_bert = torch.zeros(self.ja_bert_dim, len(phone))
|
270 |
+
en_bert = bert
|
271 |
+
else:
|
272 |
+
zh_bert = torch.zeros(1024, len(phone))
|
273 |
+
ja_bert = torch.zeros(self.ja_bert_dim, len(phone))
|
274 |
+
en_bert = torch.zeros(1024, len(phone))
|
275 |
+
assert bert.shape[-1] == len(
|
276 |
+
phone
|
277 |
+
), f"Bert seq len {bert.shape[-1]} != {len(phone)}"
|
278 |
+
phone = torch.LongTensor(phone)
|
279 |
+
tone = torch.LongTensor(tone)
|
280 |
+
language = torch.LongTensor(language)
|
281 |
+
return zh_bert, ja_bert, en_bert, phone, tone, language
|
282 |
+
|
283 |
+
def _get_emo(self, reference_audio, emotion):
|
284 |
+
if reference_audio:
|
285 |
+
emo = torch.from_numpy(
|
286 |
+
get_emo(reference_audio, self.model_handler.emotion_model,
|
287 |
+
self.model_handler.emotion_processor))
|
288 |
+
else:
|
289 |
+
if emotion is None: emotion = 0
|
290 |
+
emo = torch.Tensor([emotion])
|
291 |
+
|
292 |
+
return emo
|
293 |
+
|
294 |
+
def _get_clap(self, reference_audio, text_prompt):
|
295 |
+
if isinstance(reference_audio, np.ndarray):
|
296 |
+
emo = get_clap_audio_feature(reference_audio, self.model_handler.clap_model,
|
297 |
+
self.model_handler.clap_processor, self.device)
|
298 |
+
else:
|
299 |
+
if text_prompt is None: text_prompt = config.bert_vits2_config.text_prompt
|
300 |
+
emo = get_clap_text_feature(text_prompt, self.model_handler.clap_model,
|
301 |
+
self.model_handler.clap_processor, self.device)
|
302 |
+
emo = torch.squeeze(emo, dim=1).unsqueeze(0)
|
303 |
+
return emo
|
304 |
+
|
305 |
+
def _infer(self, id, phones, tones, lang_ids, zh_bert, ja_bert, en_bert, sdp_ratio, noise, noisew, length,
|
306 |
+
emo=None):
|
307 |
+
with torch.no_grad():
|
308 |
+
x_tst = phones.to(self.device).unsqueeze(0)
|
309 |
+
tones = tones.to(self.device).unsqueeze(0)
|
310 |
+
lang_ids = lang_ids.to(self.device).unsqueeze(0)
|
311 |
+
if self.zh_bert_extra:
|
312 |
+
zh_bert = zh_bert.to(self.device).unsqueeze(0)
|
313 |
+
elif self.ja_bert_extra:
|
314 |
+
ja_bert = ja_bert.to(self.device).unsqueeze(0)
|
315 |
+
else:
|
316 |
+
zh_bert = zh_bert.to(self.device).unsqueeze(0)
|
317 |
+
ja_bert = ja_bert.to(self.device).unsqueeze(0)
|
318 |
+
en_bert = en_bert.to(self.device).unsqueeze(0)
|
319 |
+
x_tst_lengths = torch.LongTensor([phones.size(0)]).to(self.device)
|
320 |
+
speakers = torch.LongTensor([int(id)]).to(self.device)
|
321 |
+
audio = self.net_g.infer(x_tst,
|
322 |
+
x_tst_lengths,
|
323 |
+
speakers,
|
324 |
+
tones,
|
325 |
+
lang_ids,
|
326 |
+
zh_bert=zh_bert,
|
327 |
+
ja_bert=ja_bert,
|
328 |
+
en_bert=en_bert,
|
329 |
+
sdp_ratio=sdp_ratio,
|
330 |
+
noise_scale=noise,
|
331 |
+
noise_scale_w=noisew,
|
332 |
+
length_scale=length,
|
333 |
+
emo=emo
|
334 |
+
)[0][0, 0].data.cpu().float().numpy()
|
335 |
+
|
336 |
+
torch.cuda.empty_cache()
|
337 |
+
return audio
|
338 |
+
|
339 |
+
def infer(self, text, id, lang, sdp_ratio, noise, noisew, length, reference_audio=None, emotion=None,
|
340 |
+
text_prompt=None, style_text=None, style_weigth=0.7, **kwargs):
|
341 |
+
zh_bert, ja_bert, en_bert, phones, tones, lang_ids = self.get_text(text, lang, self.hps_ms, style_text,
|
342 |
+
style_weigth)
|
343 |
+
|
344 |
+
emo = None
|
345 |
+
if self.hps_ms.model.emotion_embedding == 1:
|
346 |
+
emo = self._get_emo(reference_audio, emotion).to(self.device).unsqueeze(0)
|
347 |
+
elif self.hps_ms.model.emotion_embedding == 2:
|
348 |
+
emo = self._get_clap(reference_audio, text_prompt)
|
349 |
+
|
350 |
+
return self._infer(id, phones, tones, lang_ids, zh_bert, ja_bert, en_bert, sdp_ratio, noise, noisew, length,
|
351 |
+
emo)
|
352 |
+
|
353 |
+
def infer_multilang(self, text, id, lang, sdp_ratio, noise, noisew, length, reference_audio=None, emotion=None,
|
354 |
+
text_prompt=None, style_text=None, style_weigth=0.7, **kwargs):
|
355 |
+
sentences_list = split_languages(text, self.lang, expand_abbreviations=True, expand_hyphens=True)
|
356 |
+
|
357 |
+
emo = None
|
358 |
+
if self.hps_ms.model.emotion_embedding == 1:
|
359 |
+
emo = self._get_emo(reference_audio, emotion).to(self.device).unsqueeze(0)
|
360 |
+
elif self.hps_ms.model.emotion_embedding == 2:
|
361 |
+
emo = self._get_clap(reference_audio, text_prompt)
|
362 |
+
|
363 |
+
phones, tones, lang_ids, zh_bert, ja_bert, en_bert = [], [], [], [], [], []
|
364 |
+
|
365 |
+
for idx, (_text, lang) in enumerate(sentences_list):
|
366 |
+
skip_start = idx != 0
|
367 |
+
skip_end = idx != len(sentences_list) - 1
|
368 |
+
_zh_bert, _ja_bert, _en_bert, _phones, _tones, _lang_ids = self.get_text(_text, lang, self.hps_ms,
|
369 |
+
style_text, style_weigth)
|
370 |
+
|
371 |
+
if skip_start:
|
372 |
+
_phones = _phones[3:]
|
373 |
+
_tones = _tones[3:]
|
374 |
+
_lang_ids = _lang_ids[3:]
|
375 |
+
_zh_bert = _zh_bert[:, 3:]
|
376 |
+
_ja_bert = _ja_bert[:, 3:]
|
377 |
+
_en_bert = _en_bert[:, 3:]
|
378 |
+
if skip_end:
|
379 |
+
_phones = _phones[:-2]
|
380 |
+
_tones = _tones[:-2]
|
381 |
+
_lang_ids = _lang_ids[:-2]
|
382 |
+
_zh_bert = _zh_bert[:, :-2]
|
383 |
+
_ja_bert = _ja_bert[:, :-2]
|
384 |
+
_en_bert = _en_bert[:, :-2]
|
385 |
+
|
386 |
+
phones.append(_phones)
|
387 |
+
tones.append(_tones)
|
388 |
+
lang_ids.append(_lang_ids)
|
389 |
+
zh_bert.append(_zh_bert)
|
390 |
+
ja_bert.append(_ja_bert)
|
391 |
+
en_bert.append(_en_bert)
|
392 |
+
|
393 |
+
zh_bert = torch.cat(zh_bert, dim=1)
|
394 |
+
ja_bert = torch.cat(ja_bert, dim=1)
|
395 |
+
en_bert = torch.cat(en_bert, dim=1)
|
396 |
+
phones = torch.cat(phones, dim=0)
|
397 |
+
tones = torch.cat(tones, dim=0)
|
398 |
+
lang_ids = torch.cat(lang_ids, dim=0)
|
399 |
+
|
400 |
+
audio = self._infer(id, phones, tones, lang_ids, zh_bert, ja_bert, en_bert, sdp_ratio, noise,
|
401 |
+
noisew, length, emo)
|
402 |
+
|
403 |
+
return audio
|
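For orientation: `infer` resolves the optional emotion conditioning from `hps_ms.model.emotion_embedding` (1 routes through the wav2vec2 emotion model, 2 through CLAP), builds the phoneme/tone/BERT tensors with `get_text`, and delegates synthesis to `_infer`; `infer_multilang` does the same per language chunk, trims the boundary tokens of interior chunks, and concatenates the sequences. A minimal caller sketch, assuming an already constructed `Bert_VITS2` instance `model` and a prepared model handler `handler` (both names, the speaker id, and the use of `soundfile` are illustrative, not defined in this file):

# Hypothetical driver; `model` and `handler` are assumed to exist.
import soundfile as sf  # assumed available for writing the waveform

model.load_model(handler)  # selects the Synthesizer class by version and loads the checkpoint
audio = model.infer(
    text="你好，世界。", id=0, lang="zh",
    sdp_ratio=0.2, noise=0.6, noisew=0.8, length=1.0,
    text_prompt="Happy",  # consumed only when emotion_embedding == 2 (CLAP)
)
sf.write("out.wav", audio, model.hps_ms.data.sampling_rate)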
bert_vits2/clap_wrapper.py
ADDED
@@ -0,0 +1,17 @@
import torch


def get_clap_audio_feature(audio_data, clap_model, processor, device):
    with torch.no_grad():
        inputs = processor(
            audios=audio_data, return_tensors="pt", sampling_rate=48000
        ).to(device)
        emb = clap_model.get_audio_features(**inputs).float()
    return emb.T


def get_clap_text_feature(text, clap_model, processor, device):
    with torch.no_grad():
        inputs = processor(text=text, return_tensors="pt").to(device)
        emb = clap_model.get_text_features(**inputs).float()
    return emb.T
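Both helpers return the CLAP embedding transposed (`emb.T`), so text and audio prompts come back in the same `[dim, batch]` layout that `_get_clap` above then squeezes and re-batches. A usage sketch with Hugging Face `transformers` CLAP classes; `laion/clap-htsat-fused` matches the checkpoint named in the version table above, but the loading details here are an assumption of this sketch:

from transformers import ClapModel, ClapProcessor

device = "cpu"
clap_model = ClapModel.from_pretrained("laion/clap-htsat-fused").to(device)
processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

emo = get_clap_text_feature("a calm, gentle voice", clap_model, processor, device)
print(emo.shape)  # [projection_dim, 1], e.g. torch.Size([512, 1])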
bert_vits2/commons.py
ADDED
@@ -0,0 +1,158 @@
import math
import torch
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    layer = pad_shape[::-1]
    pad_shape = [item for sublist in layer for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    kl = (logs_q - logs_p) - 0.5
    kl += (
        0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
    )
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    gather_indices = ids_str.view(x.size(0), 1, 1).repeat(
        1, x.size(1), 1
    ) + torch.arange(segment_size, device=x.device)
    return torch.gather(x, 2, gather_indices)


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = torch.clamp(x_lengths - segment_size + 1, min=0)
    ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
        num_timescales - 1
    )
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
    )
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def convert_pad_shape(pad_shape):
    layer = pad_shape[::-1]
    pad_shape = [item for sublist in layer for item in sublist]
    return pad_shape


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1.0 / norm_type)
    return total_norm
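Two of these helpers are load-bearing for inference above: `intersperse` implements `add_blank` (which is why `get_text` doubles every `word2ph` entry), and `generate_path` expands per-phoneme durations into a monotonic phoneme-to-frame alignment. A small self-check of their behaviour:

import torch

print(intersperse([1, 2, 3], 0))
# [0, 1, 0, 2, 0, 3, 0] -- length 2*n + 1, blanks woven between tokens

duration = torch.tensor([[[2.0, 3.0]]])  # [b=1, 1, t_x=2 phonemes]
mask = torch.ones(1, 1, 5, 2)            # [b, 1, t_y=5 frames, t_x=2]
print(generate_path(duration, mask)[0, 0])
# frames 0-1 attend phoneme 0, frames 2-4 attend phoneme 1:
# tensor([[1., 0.], [1., 0.], [0., 1.], [0., 1.], [0., 1.]])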
bert_vits2/g2pW/pypinyin_G2pW_bv2/__init__.py
ADDED
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-

from .g2pw import G2PWPinyin

__all__ = ["G2PWPinyin"]
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw.py
ADDED
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-

from pypinyin.constants import RE_HANS
from pypinyin.core import Pinyin, Style
from pypinyin.seg.simpleseg import simple_seg
from pypinyin.converter import UltimateConverter
from pypinyin.contrib.tone_convert import to_tone
from .g2pw1.onnx_api import G2PWOnnxConverter


class G2PWPinyin(Pinyin):
    def __init__(
        self,
        model_dir="G2PWModel/",
        model_source=None,
        num_workers=None,
        batch_size=None,
        turnoff_tqdm=True,
        enable_non_tradional_chinese=True,
        v_to_u=False,
        neutral_tone_with_five=False,
        tone_sandhi=False,
        **kwargs
    ):
        self._g2pw = G2PWOnnxConverter(
            model_dir=model_dir,
            style="pinyin",
            model_source=model_source,
            enable_non_tradional_chinese=enable_non_tradional_chinese,
        )
        self._converter = Converter(
            self._g2pw,
            v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi,
        )

    def get_seg(self, **kwargs):
        return simple_seg


class Converter(UltimateConverter):
    def __init__(
        self,
        g2pw_instance,
        v_to_u=False,
        neutral_tone_with_five=False,
        tone_sandhi=False,
        **kwargs
    ):
        super(Converter, self).__init__(
            v_to_u=v_to_u,
            neutral_tone_with_five=neutral_tone_with_five,
            tone_sandhi=tone_sandhi,
            **kwargs
        )

        self._g2pw = g2pw_instance

    def convert(self, words, style, heteronym, errors, strict, **kwargs):
        pys = []
        if RE_HANS.match(words):
            pys = self._to_pinyin(
                words, style=style, heteronym=heteronym, errors=errors, strict=strict
            )
            post_data = self.post_pinyin(words, heteronym, pys)
            if post_data is not None:
                pys = post_data

            pys = self.convert_styles(pys, words, style, heteronym, errors, strict)

        else:
            py = self.handle_nopinyin(
                words, style=style, errors=errors, heteronym=heteronym, strict=strict
            )
            if py:
                pys.extend(py)

        return _remove_dup_and_empty(pys)

    def _to_pinyin(self, han, style, heteronym, errors, strict, **kwargs):
        g2pw_pinyin = self._g2pw(han)

        if not g2pw_pinyin:  # characters not supported by g2pw fall back to the original pypinyin logic
            return super(Converter, self).convert(
                han, Style.TONE, heteronym, errors, strict, **kwargs
            )

        pinyins = []

        for i, item in enumerate(g2pw_pinyin[0]):
            if item is None:  # characters not supported by g2pw fall back to the original pypinyin logic
                py = super(Converter, self).convert(
                    han[i], Style.TONE, heteronym, errors, strict, **kwargs
                )
                pinyins.extend(py)
            else:
                pinyins.append([to_tone(item)])

        return pinyins


def _remove_dup_items(lst, remove_empty=False):
    new_lst = []
    for item in lst:
        if remove_empty and not item:
            continue
        if item not in new_lst:
            new_lst.append(item)
    return new_lst


def _remove_dup_and_empty(lst_list):
    new_lst_list = []
    for lst in lst_list:
        lst = _remove_dup_items(lst, remove_empty=True)
        if lst:
            new_lst_list.append(lst)
        else:
            new_lst_list.append([""])

    return new_lst_list
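`G2PWPinyin` is a drop-in replacement for pypinyin's `Pinyin`: `Converter.convert` sends whole-Han spans through the ONNX g2pW model and falls back to stock pypinyin wherever g2pW returns no prediction. A usage sketch; the `model_dir` path and the example sentence are assumptions, and the exact output depends on the downloaded G2PWModel:

from pypinyin import Style

g2pw = G2PWPinyin(model_dir="G2PWModel/", neutral_tone_with_five=True)
print(g2pw.lazy_pinyin("银行行长", style=Style.TONE3))
# expected: ['yin2', 'hang2', 'hang2', 'zhang3'] -- the polyphone 行 disambiguated in context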
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/__init__.py
ADDED
File without changes
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/bopomofo_to_pinyin_wo_tune_dict.json
ADDED
@@ -0,0 +1 @@
{"ㄌㄧㄥ": "ling", "ㄩㄢ": "yuan", "ㄒㄧㄥ": "xing", "ㄑㄧㄡ": "qiu", "ㄊㄧㄢ": "tian", "ㄎㄨㄚ": "kua", "ㄨ": "wu", "ㄧㄣ": "yin", "ㄧ": "yi", "ㄒㄧㄝ": "xie", "ㄔㄡ": "chou", "ㄋㄨㄛ": "nuo", "ㄉㄢ": "dan", "ㄒㄩ": "xu", "ㄒㄩㄥ": "xiong", "ㄌㄧㄡ": "liu", "ㄌㄧㄣ": "lin", "ㄒㄧㄤ": "xiang", "ㄩㄥ": "yong", "ㄒㄧㄣ": "xin", "ㄓㄣ": "zhen", "ㄉㄞ": "dai", "ㄆㄢ": "pan", "ㄖㄨ": "ru", "ㄇㄚ": "ma", "ㄑㄧㄢ": "qian", "ㄘ": "ci", "ㄓㄨㄥ": "zhong", "ㄋㄟ": "nei", "ㄔㄥ": "cheng", "ㄈㄥ": "feng", "ㄓㄨㄛ": "zhuo", "ㄈㄤ": "fang", "ㄠ": "ao", "ㄗㄨㄛ": "zuo", "ㄓㄡ": "zhou", "ㄉㄨㄥ": "dong", "ㄙㄨ": "su", "ㄑㄩㄥ": "qiong", "ㄎㄨㄤ": "kuang", "ㄨㄤ": "wang", "ㄌㄟ": "lei", "ㄋㄠ": "nao", "ㄓㄨ": "zhu", "ㄕㄨ": "shu", "ㄕㄣ": "shen", "ㄐㄧㄝ": "jie", "ㄉㄧㄝ": "die", "ㄔ": "chi", "ㄌㄨㄥ": "long", "ㄧㄥ": "ying", "ㄅㄥ": "beng", "ㄌㄢ": "lan", "ㄇㄧㄠ": "miao", "ㄌㄧ": "li", "ㄐㄧ": "ji", "ㄩ": "yu", "ㄌㄨㄛ": "luo", "ㄔㄞ": "chai", "ㄏㄨㄣ": "hun", "ㄏㄨㄟ": "hui", "ㄖㄠ": "rao", "ㄏㄢ": "han", "ㄒㄧ": "xi", "ㄊㄞ": "tai", "ㄧㄠ": "yao", "ㄐㄩㄣ": "jun", "ㄌㄩㄝ": "lve", "ㄊㄤ": "tang", "ㄓㄠ": "zhao", "ㄓㄞ": "zhai", "ㄓㄚ": "zha", "ㄦ": "er", "ㄖㄢ": "ran", "ㄑㄧ": "qi", "ㄙㄜ": "se", "ㄙ": "si", "ㄙㄚ": "sa", "ㄎㄨㄟ": "kui", "ㄆㄨ": "pu", "ㄊㄚ": "ta", "ㄉㄨ": "du", "ㄊㄨ": "tu", "ㄧㄤ": "yang", "ㄡ": "ou", "ㄇㄧㄢ": "mian", "ㄨㄣ": "wen", "ㄉㄧㄠ": "diao", "ㄇㄧㄝ": "mie", "ㄨㄚ": "wa", "ㄋㄧㄠ": "niao", "ㄧㄡ": "you", "ㄔㄜ": "che", "ㄑㄩㄢ": "quan", "ㄘㄞ": "cai", "ㄌㄧㄤ": "liang", "ㄍㄨ": "gu", "ㄇㄠ": "mao", "ㄍㄨㄚ": "gua", "ㄙㄨㄟ": "sui", "ㄇㄢ": "man", "ㄕ": "shi", "ㄎㄡ": "kou", "ㄊㄧㄥ": "ting", "ㄅㄧㄥ": "bing", "ㄏㄨㄛ": "huo", "ㄍㄨㄥ": "gong", "ㄑㄧㄣ": "qin", "ㄐㄩㄥ": "jiong", "ㄌㄨ": "lu", "ㄋㄢ": "nan", "ㄅㄧ": "bi", "ㄑㄧㄚ": "qia", "ㄆㄧ": "pi", "ㄉㄧㄢ": "dian", "ㄈㄨ": "fu", "ㄍㄜ": "ge", "ㄅㄞ": "bai", "ㄍㄢ": "gan", "ㄒㄩㄢ": "xuan", "ㄌㄤ": "lang", "ㄕㄜ": "she", "ㄏㄨㄚ": "hua", "ㄊㄡ": "tou", "ㄆㄧㄢ": "pian", "ㄉㄧ": "di", "ㄖㄨㄢ": "ruan", "ㄜ": "e", "ㄑㄧㄝ": "qie", "ㄉㄡ": "dou", "ㄖㄨㄟ": "rui", "ㄘㄨㄟ": "cui", "ㄐㄧㄢ": "jian", "ㄔㄨㄥ": "chong", "ㄉㄥ": "deng", "ㄐㄩㄝ": "jue", "ㄒㄩㄝ": "xue", "ㄒㄧㄠ": "xiao", "ㄗㄢ": "zan", "ㄓㄢ": "zhan", "ㄗㄡ": "zou", "ㄘㄡ": "cou", "ㄔㄨㄚ": "chua", "ㄈㄟ": "fei", "ㄅㄟ": "bei", "ㄔㄨ": "chu", "ㄅㄚ": "ba", "ㄎㄨㄞ": "kuai", "ㄒㄧㄚ": "xia", "ㄏㄜ": "he", "ㄅㄧㄝ": "bie", "ㄌㄩ": "lv", "ㄙㄨㄢ": "suan", "ㄏㄥ": "heng", "ㄍㄨㄟ": "gui", "ㄌㄡ": "lou", "ㄊㄧ": "ti", "ㄌㄜ": "le", "ㄙㄨㄣ": "sun", "ㄒㄧㄢ": "xian", "ㄑㄩㄝ": "que", "ㄓ": "zhi", "ㄐㄧㄚ": "jia", "ㄏㄨ": "hu", "ㄌㄚ": "la", "ㄎㄜ": "ke", "ㄞ": "ai", "ㄨㄟ": "wei", "ㄏㄨㄢ": "huan", "ㄕㄨㄚ": "shua", "ㄕㄨㄤ": "shuang", "ㄍㄞ": "gai", "ㄏㄞ": "hai", "ㄧㄢ": "yan", "ㄈㄢ": "fan", "ㄆㄤ": "pang", "ㄙㄨㄥ": "song", "ㄋㄜ": "ne", "ㄔㄣ": "chen", "ㄍㄨㄛ": "guo", "ㄣ": "en", "ㄋㄍ": "ng", "ㄆㄚ": "pa", "ㄈㄚ": "fa", "ㄆㄡ": "pou", "ㄏㄡ": "hou", "ㄑㄩ": "qu", "ㄒㄩㄣ": "xun", "ㄋㄧㄝ": "nie", "ㄏㄨㄥ": "hong", "ㄊㄨㄣ": "tun", "ㄨㄞ": "wai", "ㄕㄡ": "shou", "ㄧㄝ": "ye", "ㄐㄩ": "ju", "ㄙㄡ": "sou", "ㄌㄨㄣ": "lun", "ㄋㄧㄚ": "nia", "ㄆㄣ": "pen", "ㄈㄣ": "fen", "ㄔㄨㄣ": "chun", "ㄋㄧㄡ": "niu", "ㄖㄡ": "rou", "ㄉㄨㄛ": "duo", "ㄗㄜ": "ze", "ㄕㄥ": "sheng", "ㄎㄨ": "ku", "ㄧㄚ": "ya", "ㄓㄨㄟ": "zhui", "ㄍㄡ": "gou", "ㄅㄛ": "bo", "ㄋㄚ": "na", "ㄒㄧㄡ": "xiu", "ㄘㄨ": "cu", "ㄎㄨㄛ": "kuo", "ㄌㄠ": "lao", "ㄘㄨㄥ": "cong", "ㄉㄚ": "da", "ㄆㄛ": "po", "ㄙㄞ": "sai", "ㄌㄥ": "leng", "ㄖㄨㄥ": "rong", "ㄋㄧ": "ni", "ㄆㄠ": "pao", "ㄎㄢ": "kan", "ㄨㄥ": "weng", "ㄨㄢ": "wan", "ㄏㄠ": "hao", "ㄐㄧㄥ": "jing", "ㄊㄢ": "tan", "ㄅㄨ": "bu", "ㄗㄤ": "zang", "ㄐㄧㄡ": "jiu", "ㄇㄟ": "mei", "ㄇㄨ": "mu", "ㄉㄨㄟ": "dui", "ㄅㄤ": "bang", "ㄅㄠ": "bao", "ㄔㄤ": "chang", "ㄓㄤ": "zhang", "ㄗㄨㄥ": "zong", "ㄍㄨㄣ": "gun", "ㄌㄧㄠ": "liao", "ㄔㄢ": "chan", "ㄓㄜ": "zhe", "ㄇㄥ": "meng", "ㄑㄧㄠ": "qiao", "ㄋㄤ": "nang", "ㄩㄣ": "yun", "ㄎㄞ": "kai", "ㄍㄠ": "gao", "ㄊㄠ": "tao", "ㄕㄢ": "shan", "ㄌㄞ": "lai", "ㄅㄢ": "ban", "ㄎㄨㄥ": "kong", "ㄔㄨㄛ": "chuo", "ㄋㄨ": "nu", "ㄆㄟ": "pei", "ㄆㄥ": "peng", "ㄘㄢ": "can", "ㄙㄨㄛ": "suo", "ㄊㄨㄥ": "tong", "ㄑㄧㄤ": 
"qiang", "ㄙㄠ": "sao", "ㄓㄨㄢ": "zhuan", "ㄢ": "an", "ㄔㄚ": "cha", "ㄕㄚ": "sha", "ㄌㄧㄢ": "lian", "ㄇㄧ": "mi", "ㄋㄡ": "nou", "ㄘㄠ": "cao", "ㄙㄣ": "sen", "ㄋㄣ": "nen", "ㄋㄧㄢ": "nian", "ㄇㄞ": "mai", "ㄩㄝ": "yue", "ㄋㄞ": "nai", "ㄏㄨㄞ": "huai", "ㄗ": "zi", "ㄌㄨㄢ": "luan", "ㄉ��ㄥ": "ding", "ㄇㄤ": "mang", "ㄋㄧㄥ": "ning", "ㄇㄧㄥ": "ming", "ㄗㄨㄟ": "zui", "ㄎㄤ": "kang", "ㄉㄜ": "de", "ㄅㄧㄢ": "bian", "ㄐㄧㄣ": "jin", "ㄔㄨㄟ": "chui", "ㄊㄨㄟ": "tui", "ㄗㄚ": "za", "ㄘㄣ": "cen", "ㄇㄧㄣ": "min", "ㄏㄨㄤ": "huang", "ㄗㄨ": "zu", "ㄘㄨㄛ": "cuo", "ㄊㄨㄛ": "tuo", "ㄑㄩㄣ": "qun", "ㄅㄧㄣ": "bin", "ㄊㄧㄠ": "tiao", "ㄍㄤ": "gang", "ㄉㄨㄢ": "duan", "ㄅㄧㄠ": "biao", "ㄉㄠ": "dao", "ㄖㄨㄣ": "run", "ㄐㄧㄠ": "jiao", "ㄨㄛ": "wo", "ㄘㄨㄢ": "cuan", "ㄖㄣ": "ren", "ㄇㄣ": "men", "ㄓㄨㄣ": "zhun", "ㄎㄨㄣ": "kun", "ㄔㄨㄤ": "chuang", "ㄗㄠ": "zao", "ㄓㄥ": "zheng", "ㄆㄧㄣ": "pin", "ㄅㄣ": "ben", "ㄐㄧㄤ": "jiang", "ㄐㄩㄢ": "juan", "ㄘㄥ": "ceng", "ㄏㄤ": "hang", "ㄋㄧㄣ": "nin", "ㄌㄧㄝ": "lie", "ㄍㄨㄤ": "guang", "ㄙㄢ": "san", "ㄊㄜ": "te", "ㄕㄨㄣ": "shun", "ㄕㄨㄟ": "shui", "ㄔㄠ": "chao", "ㄘㄜ": "ce", "ㄍㄨㄞ": "guai", "ㄎㄥ": "keng", "ㄕㄞ": "shai", "ㄉㄣ": "den", "ㄊㄨㄢ": "tuan", "ㄆㄧㄠ": "piao", "ㄑㄧㄥ": "qing", "ㄍㄥ": "geng", "ㄔㄨㄞ": "chuai", "ㄕㄠ": "shao", "ㄍㄣ": "gen", "ㄋㄨㄢ": "nuan", "ㄖㄥ": "reng", "ㄇㄡ": "mou", "ㄆㄞ": "pai", "ㄤ": "ang", "ㄎㄚ": "ka", "ㄍㄨㄢ": "guan", "ㄕㄨㄛ": "shuo", "ㄏㄣ": "hen", "ㄔㄨㄢ": "chuan", "ㄎㄨㄢ": "kuan", "ㄏㄟ": "hei", "ㄇㄛ": "mo", "ㄗㄞ": "zai", "ㄋㄥ": "neng", "ㄕㄨㄞ": "shuai", "ㄖㄜ": "re", "ㄋㄩ": "nv", "ㄆㄧㄥ": "ping", "ㄘㄤ": "cang", "ㄋㄨㄥ": "nong", "ㄎㄠ": "kao", "ㄗㄨㄢ": "zuan", "ㄎㄣ": "ken", "ㄍㄚ": "ga", "ㄗㄣ": "zen", "ㄉㄤ": "dang", "ㄗㄥ": "zeng", "ㄉㄨㄣ": "dun", "ㄘㄚ": "ca", "ㄖㄤ": "rang", "ㄘㄨㄣ": "cun", "ㄖㄨㄛ": "ruo", "ㄊㄧㄝ": "tie", "ㄊㄥ": "teng", "ㄙㄥ": "seng", "ㄖ": "ri", "ㄗㄨㄣ": "zun", "ㄋㄧㄤ": "niang", "ㄋㄩㄝ": "nve", "ㄙㄤ": "sang", "ㄓㄨㄤ": "zhuang", "ㄕㄤ": "shang", "ㄆㄧㄝ": "pie", "ㄕㄨㄢ": "shuan", "ㄈㄡ": "fou", "ㄉㄧㄡ": "diu", "ㄇㄜ": "me", "ㄈㄛ": "fo", "ㄌㄧㄚ": "lia", "ㄎㄟ": "kei", "ㄏㄚ": "ha", "ㄚ": "a", "ㄌㄛ": "lo", "ㄧㄛ": "yo", "ㄛ": "o", "ㄏㄋㄍ": "hng", "ㄋ": "n", "ㄌㄣ": "len", "ㄉㄧㄚ": "dia", "ㄇㄧㄡ": "miu", "ㄉㄟ": "dei", "ㄏㄇ": "hm", "ㄋㄨㄣ": "nun", "ㄓㄨㄞ": "zhuai", "ㄊㄟ": "tei", "ㄗㄟ": "zei", "ㄓㄨㄚ": "zhua", "ㄖㄨㄚ": "rua", "ê": "ê", "ㄟ": "ei", "ㄍㄟ": "gei", "ㄈㄧㄠ": "fiao", "ㄕㄟ": "shei", "ㄓㄟ": "zhei", "ㄥ": "eng", "ㄘㄟ": "cei", "ㄉㄧㄣ": "din", "ㄅㄧㄤ": "biang", "ㄧㄞ": "yai"}
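This table maps toneless Zhuyin (bopomofo) syllables to toneless pinyin; tones are reattached later via `to_tone` in `g2pw.py`. A lookup sketch, assuming the JSON above has been saved to disk and loaded:

import json

with open("bopomofo_to_pinyin_wo_tune_dict.json", encoding="utf-8") as f:
    bopomofo2pinyin = json.load(f)

print(bopomofo2pinyin["ㄓㄨㄥ"])  # 'zhong'
print(bopomofo2pinyin["ㄨㄣ"])    # 'wen'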
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_bopomofo_dict.json
ADDED
The diff for this file is too large to render.
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/char_convert.py
ADDED
@@ -0,0 +1,44 @@
# coding=utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Traditional and simplified Chinese conversion, a simplified character may correspond to multiple traditional characters.
"""
simplified_charcters = "制咖片型超声盘鉴定仔点他命书歌粉巾字帐恤手指记忆棒形转弯沟光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞以㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮涧㵪㶸㷖㷭㹢㹴犬㺢狓㺵碗㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓射䥯䦉䯝鲃鱼䲔䳗鹅䵹鼄䶑一对应映射丁不识下儿子做二休世丘之貉并中台原则串为甚谓干净了百事无成八变五十些人得道鸡升天代如并来去个国政策劲幽灵在欧洲游荡接样萝卜坑侧化传价元论醇共再准刀两断切分耕耘收获钱货物向看旧就绪险刻千金动劳永逸匙零夜半卡通回复返影踪反常态口咬气句话同吐快吹周味呼诺呜品红锅哄而散起唱和问三知生熟团漆黑火糟堆场空块面塌糊涂尘染壁厢夔已足多情露水大早到晚夫妻当关万莫开失古恨套所料既往孔见提师要家主审寸阴难买斗牛小撮部阵局展身层巴掌帆风顺席地带过年计于春头载四季期被蛇怕井绳度愿式份弹顷深前律径心意念差愁孤行俱全房厅交遮打技长把抓死拿眼泪鼻涕钥锁折段抿拍即合扫排掬挥拨拥上入击洞掷揽改故辙败文值名斑方面旁族日秋餐隔雅里终父旦时晌会霎间晃暴寒曝更月望垠际朝夕本正经利杯羹东西板枝独秀根筋杆进条龙服务概模次函数又性程总付步脚印趋登毛拔呵氧氮碳决雌雄波未平派谎言流清楚白准溜烟潭有获闻是处降琴鹤甲病发可拾沙目然了直以相眨穿睹瞥瞬矢的解石鸟神教秉虔诚秘种窝蜂穷窍笑置笔苟勾销抹杀煞等奖箍节吃箭仇双雕诗筹箩筐系列纸级士官统丝毫挂维网尽线微吭响股脑胎脉承腔臂力致效资源址器举功投般说讲规贸易叶障着慎满皆输号木电池衣倾钟高低视仁觉醒览遗角银币触溃九鼎蔽抄出驷马追重语破贫洗贯走路安蹴至几蹶振跃役胆汗较辈轮辞赞退六连遍递边针血锤音错门思闪真倒项栽雾类保护川先惊乍体哄鳞爪鸣滴泡邻域党专鼓作齐炒丑烯亥克内酯冬加奴卯肝炎基尺梁街裤镐客宠庭巳汝昌烷玲磊糖肇酉醛啷青县韪良香骨鲷丂七集河市弦喜嘴张舌堵区工业姊妹星架构巧彩扭歪拼凑余热曜武州爷浮屠美乡老阶树荤素碎落能魄鳃鳗珠丄丅丆万俟丈尚摸母娘量管群亚虎必我堂令申件装伏位博侠义界表女墟台戏臭皮匠胜诸葛亮赛顶倍催请运算包立叉戟离疫苗土史志演围揭瓦晒夷姑婆帝村宝烂尖杉碱屉桌山岔岛由纪峡坝库镇废从德后拗汤治旬食明昧曹朋友框栏极权幂曲归依猫民氟硼氯磷铁江侗自旅法司洋浦梅园温暖湾焦班幸用田略番叠皇炮捶硝苯酸腺苷棱草镜穗跳远索锦纲聚氰胺联店胚膲爱色堇紫罗兰芝茶饭菱云虫藏藩乱叛苏亲债凳学座恐恋柱测肌腹衩锥系貂企乌跪叩军车农题迭都甘油屯奏键短阿姨陪姐只顾茅庐槽驾魂鲜鹿页其菜单乘任供势午齿汉组织吊调泻唇坡城报坟外夸将尉建筑岸岗公床扬新剑升杭林栗校楼标款汽社浣海商馆剧院钢华港机械广媒环球融第医科证券综财乐育游涨犹岭疏瘾睑确兵领导缴肢膛船艾瑟尔苍蔡虞效衫覆访诉课谕议轨述野钩限敌鞋颌颔颚饶首龈站例修凡划垂届属崽颏厨拜挫摆放旋削棋榻槛礼沉注滑营狱画确仪聘花葬诏员跌辖周达酒锚闸陷陆雨雪飞威丌于丹久乏予理评产亢卑亦乎舞己悲矩圆词害志但住佞佳便俗信票案幅翁倦伦假偏倚斜亏鬼敲停备伤脾胃仅此像俭匮免宜穴焉戴兼容许冻伯仲负彼昼皂轩轾实刊划颠卫战哥比省非好黄饰别拘束掩奶睬选择摇扰烦苦枚写协厌及格受欢迎约只估侵犯割状告或缺抗拒挽撤救药喻磨灭端倪少逆逾越避靠适吉誉吝玉含延咎歹听啻渊善谋均匀堪忍够太惹妙妥妨孕症孝术室完纳推冠积宣疑辩栗碴称屈挠屑干涉衡待很忙恶忿怎么怠急耻恭息悦惑惜惟想愉愧怍慌愤启懂懈怀材才紧招认扣抵拉舍也罢插揣冒搭撞南墙扩核支攻敢雷攀敬里吗需景智暇曾罪遇朽枉止况竞争辱求愈渝溶济左右袒困补爽特寂寞示弱找谢畏强疾徐痛痒冤符眠睦瞅董何厚云措活疲羞者轻玻璃祥兆禁���稂莠稳佛换答简结果盟绝缕途给谈否羁翼耐肖胫毋宁兴舒若菲莱痕迹窠臼虚衰脸兔撒鹰棺范该详讳抬泰让须眉象众赀账费灰赖奇虑训辍辨菽麦辛近送透逞徒速续逮捕遂遑违逊斧钺艰醉锈随观弃显饱脂肪使丏丐帮丒且慢末丕替桃宗王尊凉爵各图屋脊粮署录坛吾禄职胄袭君厦丗北壑桐疹损逢陵鹬丙寅戌氨腈唑纶辰酮脱氢酶醚丞丢现掉纱帽弄扯炮碗丠両丣坐存激肩臻蒂莲悖序驱丨丩丫挺杈髻鬟细介俄伊犁京尼布订普渡央委监察检查剂圈设警队斯督剩震境航舶革防托播促质版蝾螈锋研艺历残消频谱精密制造陲邮候埔坚压坜凹汇执府究邦俘摄寮彬狼岳肺肿庸英讯诊埋粒胞括控码韩暑枪枢砥澳哇牟寿甸钻探篇签缀缝继耳肯照妇埃悬璧轴柜台辣搁浅邪跑纤阮阳私囊魔丮丰姿采丱烧丳丵丶丷丸参寨朗桂瑞砂衷霞貌凤仆舰因嫌宰峰干络牌持旨祭祷簿编罚宾办丼丿乀乂乃乄仰慕盛旷留考验阔乆乇么丑麽乊湖燃乑乒乓乕乖僻忤戾离谬迕乗危肥劫除隙浪婿乙炔肠酰吡咯盐乚乛乜嘢卿玄宫尾狐龟塔嶷兄弟泉章霄钉耙乞扎哀怜恕讨乢乣乤乥乧乨乩童乪乫乭乳晕汁液瑶浆牙癌突窦罩腐胶猪酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉哕嚎坤妈尸垒旱枯涸俐渴潮涩煸豆燥爹瘦瘪癣瞪袋脆姜贝隆馏乿亀亁叫咕攘扔搞男砸窜蓬麻亃亄亅却亇迟典今临繁累卵奉婚聪躬巨与迁添裂副宿岁怪恶尕仑愣杆硅硫钛铀锰芑杂异钠砷胂磺琥珀舱棍簧胡茬盗浩盆贩郎腿亍洪亐互欠助勉惠操斥诿系户译亓墓碑刑铃卅渠缤纷斗米旗宪钒灯徽瘟祖拳福谷丰脏腑绑肉腌苓蕴桥铺霸颜闹判喷冈底蛙陉矿亖亘亜罕们娜桑那努哈喀弗烈曼松森杜氏杯奥琛敦戊穆圣裔汇薛孙亟亡佚虏羊牢奋释卷卸契媾感额睫缠谊趾塞挤纽阻还配驰庄亨洛祚亪享津沪畿郊慈菴枇杷膏亭阁锃丽亳亶亹诛初责翻疯偶杰丛稠妖拖寰居吸授慧蜗吞壮魅狗矛盾益渣患忧稀描猿梦暂涯畜祸缘沸搜引擎臣横纭谁混援蒸兽狮税剖亻亼亽亡什献刹邡么仂仃仄仆富怨仈仉毕昔晨壳绍仍仏仒仕宦仗欺恃腰叹叹炬梓讫施仙后琼逝仚仝仞仟悔仡佬偿填泊拓扑簇羔购顿钦佩发棻阃驭养亿儆尤借帧赈凌叙帖李柔刚沃眦睚戒讹取飨读仨仫仮著泳卧躺韶夏裁仳仵唯贤凭钓诞仿似宋佛讽伀硕盼鹅伄儅伈伉俪柯始娃迈戈坦堡帕茨萨庙玛莉莎藤霍姆伋伍奢胥廷芳豪伎俩侍汛勒希羲雏伐憩整谟闲闲伕伙伴颐伜伝伢叔恒兹恩翰伱伲侣伶俜悧鼬伸懒缩喇叭伹伺伻伽倻辐伾似佃伫布乔妮墨佉卢佌贷劣廉昂档浓矮伞洼缓耗胸谷迷挡率龋宅沫舍疗佐贰佑占优据铧尝呢须鲁晓佗佘余坪寺瓜铳僧蒙芒陀龛哼呕坊奸孽弊揖祟茧缚誓贼佝偻瞀佟你夺赶佡佢佣佤佧贾佪佫佯佰佱洁绩酿肴佴卷佶佷佸佹佺佻佼佽佾具唤窘坏娱怒慨硬习惯聋膨胀蔓骇贵痹侀侁侂侃侄侅鸿燕侇侈糜靡侉侌妾侏儒仓鼠侐侑侔仑侘侚链侜偎傍钴循柳葫芦附価侮骂蔑侯岩截蚀局贴壶嬛宴捷携桶笺酌俣狭膝狄俅俉俊俏俎俑俓俔谚俚俛黎健呈固墒增守康箱湿祐镖镳杠盒靖膜龄俞豹猎噪孚封札筒托衍鸽剪撰稿炼厂禊练缮葺俯瞰撑冲效俳俴俵俶俷俺备俾伥倂倅储卒惶敷猝逃颉蓄崇隐倌倏忽刺蜡烛噍嚼坍扁抽毙葱楣灌灶粪背薮卖赔闭霉腾倓倔幸倘倜傥倝借箸挹浇阅倡狂倢倣値倥偬倨傲倩匡嗣冲柝珍倬倭寇猩倮倶倷倹勤赞偁偃充伪吏嗓寐惺扮拱芫茜藉虢钞偈伟晶偌宕距析滤殿疼瘫注颇偓偕鸭歇滞偝偟偢忘怡旺偨偩逼偫偭偯偰偱偲侦缉蹄偷减惰漏窥窃偸偺迹傀儡傅傈僳骂篱傎奎琳迪叟芭傒傔傕伧悉荒傜傞傢傣芽逼佣婢傮睨寄檄诵谣颂伛担辜弓惨蒿悼疤傺傻屄臆巢泄箧羡盖轧颓傿㑩僄僇佥僊働僎侨僔僖僚僝伪僣僤侥僦猴偾僩僬僭僮僯僰雇僵殖签静僾僿征陇儁侬儃儇侩朴薄儊儋儌儍傧儓俦侪拟尽儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹傩俨儽兀臬臲鹫允勋勋宙宵帅憝彝谐嫂阋畅沛溢盈饥赫凶悍狠猛顽愚妣斩秦遣鞭耀敏荣槃泽爆碟磁秃缆辉霁卤朵娄孜烽酱勃汀箕裘钳耶蒙蕾彻兑软遭黜兎児韵媳爸兕觥兖兙兛兜售鍪肚兝兞兟兡兢兣樽殓涅睡禀籍赘泌啡肽奸幕涵涝熵疚眷稃衬讧赴焕椒歼植跏没试误猜栖窗肋袖颊兪卦撇胡岐廓轿疸枫茴珑厕秩募勺吨寓斤历亩迫筷厘最淫螺韬兮宽匪筛襄赢轭复兲诈刃堰戎痞蚁饷它冀铸冂冃円冇冉册嫁厉砺竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑诬冥冫烘菇蛰冷凝坨橇淇淋炭饼砖碛窖醋雕雹霜冱冶炉艳嘲峻滩淡漠煖飕饮冼冽凃凄怆梗凅凇净凊凋敝蒙凔凛遵汞脢凞几凢処凰凯凵凶焰凸折刷纹预丧喽奔巡榜殡芙蓉租笼辑鞘萃凼锯镬刁蛮刂娩崩批拆摊掰蘖骤歧颗秒袂赃勿嘱忌磋琢肤刈羽刎讼戮舂桨艇刓刖霹雳刜创犊刡恙墅帜筵致劫劫刨昏默攸尿欲熏润薰圭删刮痧铲刱刲刳刴刵踏磅戳柏槐绣芹苋猬舟铭鹄鹜劫剁剃辫刭锉履铅克剌姻咽哨廊掠桅沿召瞻翅赵卜渺茫郭剒剔剕沥剚愎毅讷才剜剥啄采剞剟剡剣剤䌽剐肾驶黏剰袍剀紊铲剸剺剽剿劁劂札劈啪柴扳啦刘奭姥夼昫涓熙禅禹锡翔雁鹗刽刿弩柄蜻蛉劒劓劖劘劙澜篑赏矶釜晋甜薪逐劦熔纣虐赤囚劬劭労劵效劻劼劾峭艮勅勇励勍勐腊脖庞漫饲荡粥辄勖勗勘骄馁碌泮雇捐竹骑殊阱绩朴恳谨剿勧勩勯勰劢勋勷劝惩慰诫谏勹芡践阑匁庇拯粟扎袱裹饺匆遽匈匉匊匋匍匐茎匏匕妆痰脓蛹斋苑烤蹈塘羌熊阀螳螂疆碚竿纬荷茵邙魏匚匜匝匟扶稷匣匦拢匸匹耦匽匾匿卂叮疮禧轸堤棚迢钧炼卄卆遐卉瓷盲瓶当胱腱裸卋卌卍卐怯污贱鄙龌龊陋卓溪唐梯渔陈枣泥漳浔涧梨芬谯赡辕迦郑単驴弈洽鳌卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫袄玺绶钮蚤惧殆笃耸卲帘帙绕恤卼卽厂厎厓厔厖厗奚厘厍厜厝谅厕厤厥厪腻孢厮厰厳厣厹厺粕垢芜菁厼厾叁悟茸薯叄吵笄悌哺讥坫
垄弧芯杠潜婴刍袁诘贪谍煽馈驳収岳缔灾贿骗叚叡吻拦蘑蜜诀燧玩砚筝椎蔺铜逗骊另觅叨唠谒杵姓喊嚷嚣咚咛塑寻恼憎擦只泣渗蝠叱吒咄咤喝籀黛舵舷叵叶铎懿昭穰苴辽叻叼吁堑嫖赌瞧爬众抒吅吆夥卺橡涤抱纵摩郡唁坠扇篮膀袜颈吋忾谘酬哭妓媛暗表缰迩妃羿絮蕃浑拐葵暮隅吔吖啶嗪戚吜啬噬咽吟哦咏吠吧唧嗒咐吪隽咀征燐苞茹钙哧吮吰吱嘎吲哚吴栋娇窟孟箫忠晗淞阖闾趼宇呐睛嘘拂捧疵熄竽笛糠吼吽呀吕韦蒙呃呆笨呇贡呉罄呋喃呎呏呔呠呡痴呣呤呦呧瑛眩扒晬淑姬瑜璇鹃呪呫哔嚅嗫呬呯呰呱呲咧噌钝呴呶呷呸呺呻哱咻啸噜吁坎坷逻呿咁咂咆哮咇咈咋蟹煦珅蔼咍咑咒诅咔哒嚓咾哝哩喱咗咠咡咢咣咥咦咨嗟询咩咪咫啮啮咭咮咱咲咳呛嗽咴啕咸咹咺呙喉咿婉恸悯赋矜绿茗蓝哂抢瞒哆嗦啰噻啾滨彗哋哌哎唷哟哏哐哞哢哤哪里哫啼喘哰哲萎蚌哳咩哽哿呗唅唆唈唉唎唏哗尧棣殇璜睿肃唔睇唕吣唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鹦鹉啅埠栈榷祺铺鞅飙啊啍啎啐啓啕啖啗啜哑祈啢衔啤啥啫啱啲啵啺饥啽噶昆沁喁喂喆裙喈咙喋喌喎喑喒喓喔粗喙幛庆滋鹊喟喣喤喥喦喧骚喨喩梆吃葡萄喭驼挑吓碰枞瓣纯疱藻趟铬喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔诟嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨唢嗬嗯嗰嗲嗵叽嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾荫啀嘌嘏嘐嘒啯啧嘚唛嘞嘟囔嘣嘥嘦嘧嘬嘭这谑严敞馋松哓嘶嗥呒虾嘹嘻啴嘿噀噂噅噇噉噎噏噔噗噘噙噚咝噞噢噤蝉皿噩噫噭嗳噱哙噳嚏涌洒欲巫霏噷噼嚃嚄嚆抖哜尝嚔苏嚚嚜嚞嚟呖嚬嚭嚮嚯亸喾饬按竣苛嚵嘤啭冁呓膪谦囍囒囓囗囘萧酚飘溅谛囝溯眸纥銮鹘囟殉囡団囤囥囧囨囱囫囵囬囮囯囲図囶囷囸囹圄圉拟囻囿圀圂圃圊粹蠹赦圌垦圏滚鲱凿枘圕圛圜圞坯埂壤骸炕祠窑豚绅魠鲮鳖圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆垫墩椅坒坓坩埚坭坰坱坳坴坵坻坼杨挣涎帘垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜垭埤埦埧埭埯埰埲埳埴埵埶绋埸培怖桩础辅埼埽堀诃侄庑堃堄摧磐贞韧砌堈堉垩堋堌堍堎垴堙堞堠礁堧堨舆堭堮蜓摘堲堳堽堿塁塄塈煤茔棵塍垲埘塓绸塕鸦沽虱塙冢塝缪塡坞埙塥塩塬塱场螨塼塽塾塿墀墁墈墉墐夯増毁墝墠墦渍钵墫墬堕墰墺墙橱壅壆壊壌壎壒榨蒜壔壕壖圹垆壜壝垅壡壬壭壱売壴壹壻壸寝壿夂夅夆変夊夌漱邑夓腕泄甥御骼夗夘夙衮瑙妊娠醣枭珊莺鹭戗幻魇夤蹀秘擂鸫姚宛闺屿庾挞拇賛蛤裨菠氅漓捞湄蚊霆鲨箐篆篷荆肆舅荔鲆巷惭骰辟邱镕镰阪漂烩鲵鲽鳄鸨胪鹏妒峨谭枰晏玑癸祝秤竺牡籁恢罡蝼蝎赐绒御梭夬夭砣榆怙枕夶夹馅奄崛葩谲奈贺祀赠奌奂奓奕䜣詝奘奜奠奡奣陶奨奁魁奫奬奰娲孩贬隶酥宄狡猾她姹嫣妁毡荼皋膻蝇嫔妄妍嫉媚娆妗趣妚妞妤碍妬娅妯娌妲妳妵妺姁姅姉姗姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀诱慑胁娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥溪孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮妫媲媵媸媺媻媪眯媿嫄嫈袅嫏嫕妪嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰妩嫺娴嫽嫿妫嬃嬅嬉耍婵痴艳嬔嬖嬗嫱袅嫒嬢嬷嬦嬬嬭幼嬲嬴婶嬹嬾嬿孀娘孅娈孏曰癫屏孑孓雀孖斟篓谜摺孛矻鸠崮轲祜鸾孥邈毓棠膑孬孭孰孱孳孵泛罔衔孻孪宀宁冗拙株薇掣抚琪瓿榴谧弥宊濂祁瑕宍宏碁宓邸谳実潢町宥宧宨宬徵崎骏掖阙臊煮禽蚕宸豫寀寁寥寃檐庶寎暄碜寔寖寘寙寛寠苫寤肘洱滥蒗陕核寪弘绰螽宝擅疙瘩晷対檐専尃尅赎绌缭畴衅尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚觑蔻脏躁尔尓锐尗尙尜尟尢��尨尪尬尭尰擒尲尶尴尸尹潽蠖蛾尻扣梢蚴鳍脬蹲屇屌蚵屐屃挪屖屘屙屛屝屡屣峦嶂岩舄屧屦屩屪屃屮戍驻钾崖嵛巅旮旯楂榄榉芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭巩岒岝岢岚岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峨峰峱岘峹峿崀崁崆祯崋崌崃岖昆崒崔嵬巍萤颢崚崞崟崠峥巆崤崦崧殂岽崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓岁嵙嵞嵡嵩嵫嵯嵴嵼嵾嵝崭崭晴嶋嶌嶒嶓嵚崂嶙嶝嶞峤嶡嶢峄嶨嶭嶮嶰嶲岙嵘巂巃巇巉岿巌巓巘巛滇芎巟巠弋回巣巤炊擘蜥蟒蛊觋巰蜀彦淖杏茂甫楞巻巽帼巿帛斐鲫蕊帑帔帗帚琉汶帟帡帣帨裙帯帰帷帹暆帏幄帮幋幌幏帻幙帮幞幠幡幢幦幨幩幪帱幭幯幰遥蹉跎馀庚鉴幵幷稚邃庀庁広庄庈庉笠庋跋庖牺庠庤庥鲸庬庱庳庴庵馨衢庹庿廃厩廆廋廌廎廏廐廑廒荫廖廛厮搏锣廞弛袤廥廧廨廪廱绵踵髓廸迫瓯邺廻廼廾廿躔弁皱弇弌弍弎弐弑吊诡憾荐弝弢弣弤弨弭弮弰弪霖繇焘斌旭溥骞弶弸弼弾彀彄别累纠强彔彖彘彟彟陌彤贻彧绘虹彪炳雕蔚鸥彰瘅彲彳彴仿彷徉徨彸彽踩敛旆徂徇徊渭畲铉裼従筌徘徙徜徕膳苏萌渐徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸颤扉犀澎湃砰恍惚绞隘忉惮挨饿忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懑怏遏怔怗怚怛怞怼黍讶怫怭懦怱怲恍怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱凄恻德悴怅惘闷悻悾惄愫钟蒐惆惇惌惎惏惓惔惙惛耄惝疟浊恿惦德恽惴蠢惸拈愀愃愆愈愊愍愐愑愒愓愔愕恪氓蠢騃昵惬赧悫愬愮愯恺愼慁恿慅慆慇霭慉慊愠慝慥怄怂慬慱悭慴慵慷戚焚憀灼郁憃惫憋憍眺捏轼愦憔憖憙憧憬憨憪憭怃憯憷憸憹憺懃懅懆邀懊懋怿懔懐懞懠懤懥恹懫懮懰懱毖懵遁梁雍忏懽戁戄戆戉戋戕戛戝戛戠戡戢戣戤戥戦戬戭戯轰戱披菊牖戸戹戺戻卯戽锹扂楔扃扆扈扊杖牵绢铐镯赉扐搂搅烊盹瞌跟趸镲靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄绥鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔缳缢擞抜拗択抨摔歉蹿牾抶抻搐泵菸拃拄拊髀抛拌脯拎拏拑擢秧沓曳挛迂拚拝拠拡拫拭拮踢拴拶拷攒拽掇芥橐簪摹疔挈瓢骥捺蹻挌挍挎挐拣挓挖掘浚挙揍聩挲挶挟挿捂捃捄捅捆捉捋胳膊揎捌捍捎躯蛛捗捘捙捜捥捩扪捭据捱捻捼捽掀掂抡臀膘掊掎掏掐笙掔掗掞棉芍掤搪阐掫掮掯揉掱掲掽掾揃揅揆搓揌诨揕揗揘揜揝揞揠揥揩揪揫橥遒麈揰揲揵揶揸背揺搆搉搊搋搌搎搔搕撼橹捣搘搠搡搢搣搤搥搦搧搨搬楦裢讪赸掏搰搲搳搴揾搷搽搾搿摀摁摂摃摎掴摒摓跤摙摛掼摞摠摦喉羯摭摮挚摰摲抠摴抟摷掺摽撂撃撅稻撊撋挦锏泼撕撙撚㧑挢撢掸撦撅撩撬撱朔揿蚍蜉挝捡擀掳闯擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭摈拧撷擸撸擽擿攃摅撵攉攥攐攓撄搀撺每攩攫辔澄攮攰攲攴轶攷砭讦攽碘敁敃敇敉叙敎筏敔敕敖闰诲敜煌敧敪敳敹敺敻敿斁衽斄牒绉诌斉斎斓鹑谰驳鳢斒筲斛斝斞斠斡斢斨斫斮晾沂潟颖绛邵斲斸釳於琅斾斿旀旗旃旄涡旌旎旐旒旓旖旛旝旟旡旣浴旰獭魃旴时旻旼旽昀昃昄昇昉晰躲澈熹皎皓矾昑昕昜昝昞昡昤晖笋昦昨是昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘莹顗晿暁暋暌暍暐暔暕煅旸暝暠暡曚暦暨暪朦胧昵暲殄冯暵暸暹暻暾曀晔昙曈曌曏曐暧曘曙曛叠昽曩骆曱甴肱曷牍禺锟曽沧耽朁朅朆杪栓夸竟粘绦朊膺朏朐朓朕朘朙瞄觐溘饔飧朠朢朣栅椆淀虱朩朮朰朱炆璋钰炽鹮朳槿朵朾朿杅杇杌陧欣钊湛漼楷瀍煜玟缨翱肇舜贽适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦颦缅莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翘纾逋枙狸桠枟槁枲枳枴枵枷枸橼枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞栎柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟柏栩栫栭栱栲栳栴檀栵栻桀骜桁镁桄桉桋桎梏椹葚桓桔桕桜桟桫椤桭杯桯桲桴桷桹湘溟梃梊梍梐潼栀枧梜梠梡梣梧梩梱梲梳梴梵梹棁棃樱棐棑棕榈簑绷蓑枨棘棜棨棩棪棫棬棯棰棱棳棸棹椁棼碗椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀匾楅篪楋楍楎楗楘楙楛楝楟楠楢楥桢楩楪楫楬楮楯楰梅楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽搒笞榠榡榤榥榦榧杩榭榰榱梿霰榼榾桤槊闩槎槑槔槖様槜槢槥椠槪槭椮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢狲桦樻罍樾樿橁橄橆桡笥龠橕橚橛辆椭橤橧竖膈跨橾橿檩檃檇柽檍檎檑檖檗桧槚檠樯檨檫檬梼槟檴檵柠棹櫆櫌栉櫜椟櫡槠栌枥榇栊櫹棂茄櫽欀欂欃欐欑栾欙棂溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊莳蝶歓歕歘歙歛歜欤歠蹦诠镶蹒跚升陟歩歮歯歰歳歴璞歺瞑歾殁夭殈殍殑殗殜殙殛殒殢殣殥殪殚僵殰殳荃殷殸殹蛟殻肴谤殴毈毉喂毎���蕈毗毘毚茛邓毧毬毳毷毹毽毾毵牦氄氆靴氉氊氇氍氐聊氕氖気氘氙氚氛氜氝氡汹焊痉氤氲氥氦铝锌氪烃氩铵痤汪浒漉痘盂碾菖蒲蕹蛭螅氵冰氹氺氽烫氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蓠沼秽蔑汧汨汩汭汲汳汴堤汾沄沅沆瀣沇沈葆浸沦湎溺痼疴沌沍沏沐沔沕沘浜畹砾沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆涌肓泐泑泒泓泔泖泙泚泜泝泠漩馍涛粼泞藓鳅泩泫泭泯铢泱泲洇洊泾琵琶荽蓟箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙赣渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鲤浃浼浽溦涂涊涐涑涒涔滂莅涘涙涪涫涬涮涴涶涷涿淄淅淆淊凄黯淓淙涟淜淝淟淠淢淤渌淦淩猥藿亵淬淮淯淰淳诣涞纺淸淹炖癯绮渇済渉渋渓渕涣渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝浈湟湢湣湩湫湮麟湱湲湴涅満沩溍溎溏溛舐漭溠溤溧驯溮溱溲溳溵溷溻溼溽溾滁滃滉滊荥滏稽滕滘汇滝滫滮羼耷卤滹浐煎漈漊漎绎漕漖漘漙沤漜漪漾漥漦漯漰溆漶漷濞潀颍潎潏潕潗潚潝潞潠潦祉疡潲潵滗潸潺潾涠澁澂澃澉澌澍澐澒澔澙渑澣澦澧澨澫澬浍澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觞浚濮盥潍濲泺瀁滢渎渖瀌浏瀒瀔濒泸瀛潇潆瀡潴泷濑瀬弥潋瀳瀵瀹瀺瀼沣滠灉灋灒漓灖灏灞灠滦灥灨滟灪蜴灮烬獴灴灸灺炁炅鱿炗炘炙炤炫疽烙钎炯炰炱炲炴炷毁炻烀烋瘴鲳烓烔焙烜烝烳饪烺
焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐炜煕暖熏硷霾煚煝煟煠茕矸煨琐炀萁煳煺煻熀熅熇熉罴荧穹炝熘熛熜稔谙烁熤熨熯熰眶蚂颎熳熸熿燀烨燂燄盏燊燋燏燔隼燖焖燠燡灿燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰为爻丬爿牀牁牂牄牋窗牏牓窗釉牚腩蒡虻牠虽蛎牣牤牮牯牲牳牴牷牸牼绊牿靬犂犄犆犇犉犍犎犒荦犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狈蜘猁猇猈猊猋猓猖獗猗猘狰狞犸猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒毙獙獚獜獝獞獠獢獣獧鼇蹊狯猃獬豸狝獯鬻獳犷猕猡玁菟玅玆玈珉糁禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦珏瑰玭玳瑁玶玷玹玼珂珇珈瑚珌馐馔珔珖珙珛珞珡珣珥珧珩珪佩珶珷珺珽琀琁陨玡琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲琅琴珐珲瑀瑂瑄瑉玮瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈琏璊璐璘璚璝璟璠璡璥瑷璩璪璫璯璲玙璸璺璿瓀璎瓖瓘瓒瓛脐瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔瓮甖甗饴蔗甙诧钜粱盎锈团甡褥産甪甬甭甮宁铠甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃叠疋疍疎箪疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀痖瘃瘈瘉瘊瘌瘏瘐痪瘕瘖瘙瘚瘛疭瘜瘝瘗瘠瘥瘨瘭瘆瘯瘰疬瘳疠瘵瘸瘺瘘瘼癃痨痫癈癎癐癔癙癜癠疖症癞蟆癪瘿痈発踔绀蔫酵皙砬砒翎翳蔹钨镴皑鹎驹暨粤褶皀皁荚皃镈皈皌皋皒朱皕皖皘皜皝皞皤皦皨皪皫皭糙绽皴皲皻皽盅盋碗盍盚盝踞盦盩秋千盬盭眦睁瞤盯盱眙裰盵盻睐眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎困睒睖睙睟睠睢睥睪睾睯睽睾眯瞈瞋瞍逛瞏瞕瞖眍䁖瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽阇瞿眬矉矍铄矔矗矙瞩矞矟矠矣矧矬矫矰矱硪碇磙罅舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硖砗磲茚钡硭硻硾碃碉碏碣碓碔碞碡碪碫碬砀碯碲砜碻礴磈磉磎硙磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻硗礀硚礅礌礐礚礜礞礤礧礮砻礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼饵脔锢禂禇禋祦禔祎隋禖禘禚禜禝禠祃禢禤禥禨禫祢禴禸秆秈秊闱飒秋秏秕笈蘵赁秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬秸稲稹稼颡稿穂穄穇穈穉穋稣贮穏穜穟秾穑穣穤穧穨穭穮穵穸窿阒窀窂窅窆窈窕窊窋窌窒窗窔窞窣窬黩蹙窑窳窴窵窭窸窗竁竃竈竑竜并竦竖篦篾笆鲛竾笉笊笎笏笐靥笓笤箓笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦笕筒筭箸筰筱筳筴宴筸箂个箊箎箑箒箘箙箛箜篌箝箠箬镞箯箴箾篁筼筜篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲筚篴篶篹篼箦簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊藤籒籓籔签籚篯箨籣籥籧笾簖籫籯芾麴籵籸籹籼粁秕粋粑粔粝粛粞粢粧粨粲粳稗粻粽辟粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬粽糯糱籴粜糸糺紃蹼鲣霉纡纨绔纫闽襻紑纰纮锭鸢鹞纴紞紟扎紩紬绂绁纻紽紾绐絁絃絅経絍绗絏缡褵絓絖絘絜绚絣螯絪絫聒絰絵绝絺絻絿綀绡綅绠绨绣綌綍綎捆綖綘継続缎绻綦綪线綮綯绾罟蝽綷縩绺绫緁绲緅緆缁绯緌緎総緑绱緖缃缄缂绵缗緤褓缌纂緪緰缑缈缏缇縁縃縄萦缙缒縏缣縕缞縚缜缟缛縠縡縢縦绦縯縰骋缧縳纤缦絷缥縻衙縿繄缫繈繊繋繐缯繖繘繙繠缋繣繨缰缲繸繻缱纁纆纇缬缵纩纑纕缵纙纚纛缾罃罆坛罋罂罎罏罖罘罛罝罠罣罥罦罨罫罭锾罳罶罹罻罽罿羂羃羇芈蕉51鸵羑羖羌羜羝羢羣羟羧羭羮羰羱羵羶羸藜鲐翀翃翅翊翌翏翕翛翟翡翣翥翦跹翪翫翚翮翯翱翽翾翿板饕鸹锨耋耇耎耏专耒耜耔耞耡耤耨耩耪耧耰鬓耵聍聃聆聎聝聡聦聱聴聂聼阈聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠铨胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臜腍腒腓胨腜腠脶腥腧腬腯踝蹬镣腴腶蠕诽膂腽嗉膇膋膔腘膗膙膟黐膣膦膫膰膴膵膷脍臃臄臇臈臌臐臑臓膘臖臙臛臝臞臧蓐诩臽臾臿舀舁鳑鲏舋舎舔舗馆舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣舣艨艩舻艬艭荏艴艳艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鲢芴芷芸荛豢芼芿苄苒苘苙苜蓿苠苡苣荬苤苎苪镑苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鸮荍荑荘豆荵荸荠莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔芲菘菝菡菢菣菥蓂菧菫毂蓥菶菷菹醢菺菻菼菾萅萆苌萋萏萐萑萜萩萱萴莴扁萻葇葍葎葑荭葖葙葠葥苇葧葭药葳葴葶葸葹葽蒄蒎莼茏薹莅蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽荪蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫荜跣藕苁蓰蓱莼蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蒌蔟锷蒋雯茑蔯蔳麻蔵蔸蔾荨蒇蕋蕍荞蕐蕑芸莸蕖蕗蕝蕞蕠蕡蒉蕣蕤蕨蕳蓣蕸蕺蕻薀薁薃薅薆荟薉芗薏薐蔷薖薘剃谔钗薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋荩藐藙藚藟藦藳藴苈藷藾蘀蘁蕲苹蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虬虰蛵蛇虷鳟虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛱蜕螫蜅蚬蜈蝣蜋蜍蜎蜑蠊蜛饯蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鲼蝡蝤蝥猿蝰虻蝲蝴蝻螃蠏蛳螉螋螒螓螗螘螙螚蟥螟螣螥螬螭䗖螾螀蟀蟅蝈蟊蟋蟑蟓蟛蟜蟟蟢虮蟨蟪蟭蛲蟳蛏蟷蟺蟿蠁蠂蠃虿蠋蛴蠓蚝蠗蠙蠚蠛蠜蠧蟏蠩蜂蠮蠰蠲蠵蠸蠼蠽衁衄衄衇衈衉衋衎衒同衖胡衞裳钩衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉袅裋夹裍裎裒裛裯裱裲裴裾褀褂褉褊裈褎褐褒褓褔褕袆褚褡褢褦褧褪褫袅褯褰褱裆褛褽褾襁褒襆裥襉襋襌襏襚襛襜裣襞襡襢褴襦襫襬襭襮襕襶襼襽襾覂覃覅霸覉覊覌覗觇覚覜觍觎覧覩觊觏覰観觌觔觕觖觜觽觝觡酲觩觫觭觱觳觯觷觼觾觿言赅讣訇訏訑訒诂讬訧訬訳訹证訾詀詅诋毁詈詊讵詑诒诐詗诎察詨诜詶詸詹詻诙诖誂誃诔锄诓誋诳诶悖誙诮诰誧説読誯谇訚谄谆諆諌诤诹诼諕谂谀諝谝諟喧谥諴諵谌谖誊謆謇歌謍謏謑谡谥謡謦謪谪讴謷謼谩哗譅譆譈譊讹譒撰谮鑫譞噪譩谵譬譱譲谴譸譹谫讅讆詟䜩雠讐谗谶讙谠讟谽豁豉豇岂豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆狸猊貔貘䝙貜貤餍贳餸贶贲赂賏赊赇赒賝赓赕賨赍斗賮賵賸赚赙赜赟贉赆赑贕赝赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趱趴趵趷趹趺趿跁跂跅跆踬跄跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒踖踘踜踟躇蹰踠踡踣踤踥踦踧跷踫踮逾踱踊踶踹踺踼踽躞蹁蹂躏蹎蹐蹓蹔跸蹚蹜蹝迹蹠蹡蹢跶蹧蹩蹪蹯鞠蹽躃躄躅踌跻躐踯跞躘躙躗躝躠蹑躜躧躩躭躰躬躶軃軆辊軏轫軘軜軝腭転軥軨軭軱轱辘軷轵轺軽軿輀輂辇辂辁輈挽輗辄辎辋輠輤輬輭輮辏輴輵輶輹輼辗辒轇轏轑轒辚轕轖轗轘轙轝轞轹轳罪辣辞辵辶辺込辿迅迋迍麿迓迣迤逦迥迨迮迸迺迻迿逄逅逌逍逑逓迳逖逡逭逯逴逶逹遄遅侦遘遛遝遢遨遫遯遰遴绕遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯郸邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郏郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄郓鄇鄈鄋鄍鄎鄏鄐鄑邹邬鄕郧鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱郐鄷鄹邝鄻鄾鄿酃酅酆酇郦酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝酝醡醤醨醪醭醯醰酦醲醴醵醸醹醼醽醾釂酾酽釆釈鲈镏阊钆钇钌钯钋鼢鼹钐钏釪釬釭釱钍釸钕钫鈃钭鈆鈇钚鈊鈌钤钣鈒鈤钬钪鈬铌铈钶铛钹铍钸钿鉄鉆铊铇鉌铋鉏铂钷铆钵鉥钲鉨钼钽鉱鉲鉶铰铒鉼铪銍銎铣銕镂铫铦铑铷銤铱铟銧铥铕铯銭銰焊銶锑锉汞鋂锒鋆鋈鋊铤鋍铗鋐鋑鋕鋘鋙锊锓锔锇铓鋭铖锆锂铽鋳鋹鋺鉴镚钎錀锞锖锫锩錍铔锕錔锱铮锛錞锬锜錤錩錬録铼錼锝钔锴鍉镀鍏鍐铡鍚锻锽锸锲锘鍫鍭鍱鍴锶鍹锗针锺锿镅鎉鎋鎌鎍鎏鎒鎓鎗镉鎚鎞镃鎤铩锼鎭鎯镒镍鎴镓��鎹镎镟鏊镆镠镝鏖铿锵鏚镗镘镛鏠鏦錾镤鏸镪鏻鏽鏾铙鐄鐇鐏铹镦镡鐗馗镫镢镨鐡锎镄鐩镌鐬鐱镭鐶鐻鐽镱鑀鑅镔鑐鑕鑚鑛鑢鑤镥鑪镧鑯鑱鑴鑵镊镢钃镻闫闬闶闳閒闵閗閟阂関合閤哄阆閲阉閺阎阏阍阌暗闉阕阗闑闒闿闘闚阚闟闠闤闼阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬骘陴険陼陾阴隃隈隒隗隞隠隣隤隩隮隰颧隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驿霂霅霈霊沾霒霓霙霝霢霣霤霨霩霪霫霮靁叇叆靑靓靣腼靪靮靰靳靷靸靺靼靿鞀鞃鞄鞍鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾鞑韅鞯驮韍韎韔韖韘韝韫韡韣韭韭韱韹韺頀刮頄顸顼頍颀颃颁頖頞頠頫頬颅頯頲颕頼悴顋顑颙颛颜顕顚顜颟顣颥颞飐飑台飓颸飏飖颽颾颿飀飂飚飌翻飡飣饲飥饨饫飮飧飶餀餂饸饹餇餈饽哺馂餖餗餚馄馃餟餠餤餧餩餪餫糊餮糇餲饧馎糕饩馈馊馌馒饇馑馓膳饎饐饘饟馕馘馥馝馡馣骝骡馵馹駃駄駅駆駉駋驽駓驵駗骀驸駜骂骈駪駬骃駴骎駹駽駾騂騄骓騆騉騋骒骐麟騑騒験騕骛騠騢騣騤騧骧騵驺骟騺蓦骖骠骢驆驈骅驌骁驎骣驒驔驖驙驦驩驫骺鲠骫骭肮骱骴骶骷髅骾髁髂髄髆膀髇髑髌髋髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣斗鬫鬬阄鬯鬰鬲鬵鬷魆魈魊魋魍魉魑魖鳔魛魟魣魦魨魬鲂魵魸鮀鲅鮆鲧鲇鲍鲋鮓鲒鲕鮟鱇鮠鮦鮨鲔鲑鮶鮸鮿鲧鯄鯆鲩鯈鲻鯕鲭鲞鯙鯠鲲鯥鲰鲶鳀鯸鳊鲗䲠鹣鳇鰋鳄鳆鰕鰛鰜鲥鰤鳏鰦鳎鳐鳁鳓鰶鲦鲡鰼鰽鱀鱄鳙鱆鳕鱎鱐鳝鳝鳜鲟鲎鱠鳣鱨鲚鱮鱲鱵鱻鲅鳦凫鳯鳲鳷鳻鴂鴃鴄鸩鴈鴎鸰鴔鴗鸳鸯鸲鹆鸱鴠鴢鸪鴥鸸鹋鴳鸻鴷鴽鵀鵁鸺鹁鵖鵙鹈鹕鹅鵟鵩鹌鵫鵵鵷鵻鹍鶂鶊鶏鶒鹙鶗鶡鶤鶦鶬鶱鹟鶵鶸鶹鹡鶿鹚鷁鷃鷄鷇䴘䴘鷊鷏鹧鷕鹥鸷鷞鷟鸶鹪鹩鷩鷫鷭鹇鹇鸴鷾䴙鸂鸇䴙鸏鸑鸒鸓鸬鹳鸜鹂鹸咸鹾麀麂
麃麄麇麋麌麐麑麒麚麛麝麤麸面麫麮麯麰麺麾黁黈黉黢黒黓黕黙黝黟黥黦黧黮黰黱黪黶黹黻黼黾鼋鼂鼃鼅鼈鼍鼏鼐鼒冬鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌赍齑龀齕齗龅齚龇齞龃龉龆齢出齧齩齮齯齰齱齵齾厐龑龒龚龖龘龝龡龢龤"
traditional_characters = "制咖片型超聲盤鑒定仔點他命書歌粉巾字帳恤手指記憶棒形轉彎溝光○〇㐄㐅㐆㐌㐖毒㐜㐡㐤㐰㐺㑇㑳㒳㒸㔾㗂㗎㝵㞎㞙㞞㠯㢲㢴㤅㥁㥯㨗㫺㬎㮎㮚㮸㲋㲱㲾㳮㵎㵪㶸㷖㷭㹢㹴犬㺢狓㺵㼝㽮㿝䍃䔢䖟䖸䗈䗥䗪䝓䠶䥯䦉䯝䰾魚䲔䳗䳘䵹鼄䶑一對應映射丁不識下兒子做二休世丘之貉並中台原則串為甚謂乾淨了百事無成八變五十些人得道雞升天代如併來去個國政策勁幽靈在歐洲遊蕩接樣蘿蔔坑側化傳價元論醇共再准刀兩斷切分耕耘收穫錢貨物向看舊就緒險刻千金動勞永逸匙零夜半卡通回復返影蹤反常態口咬氣句話同吐快吹周味呼諾嗚品紅鍋哄而散起唱和問三知生熟團漆黑火糟堆場空塊麵塌糊塗塵染壁廂夔已足多情露水大早到晚夫妻當關萬莫開失古恨套所料既往孔見提師要家主審寸陰難買鬥牛小撮部陣局展身層巴掌帆風順席地帶過年計於春頭載四季期被蛇怕井繩度願式份彈頃深前律徑心意念差愁孤行俱全房廳交遮打技長把抓死拿眼淚鼻涕鑰鎖折段抿拍即合掃排掬揮撥擁上入擊洞擲攬改故轍敗文值名斑方面旁族日秋餐隔雅里終父旦時晌會霎間晃暴寒曝更月望垠際朝夕本正經利杯羹東西板枝獨秀根筋桿進條龍服務概模次函數又性程總付步腳印趨登毛拔呵氧氮碳決雌雄波未平派謊言流清楚白準溜煙潭有獲聞是處降琴鶴甲病發可拾沙目然瞭直以相眨穿睹瞥瞬矢的解石鳥神教秉虔誠秘種窩蜂窮竅笑置筆苟勾銷抹殺煞等獎箍節吃箭仇雙鵰詩籌籮筐系列紙級士官統絲毫掛維網盡線微吭響股腦胎脈承腔臂力致效資源址器舉功投般說講規貿易葉障著慎滿皆輸號木電池衣傾鐘高低視仁覺醒覽遺角銀幣觸潰九鼎蔽抄出駟馬追重語破貧洗貫走路安蹴至幾蹶振躍役膽汗較輩輪辭贊退六連遍遞邊針血錘音錯門思閃真倒項栽霧類保護川先驚乍體鬨鱗爪鳴滴泡鄰域黨專鼓作齊炒丑烯亥克內酯冬加奴卯肝炎基尺梁街褲鎬客寵庭巳汝昌烷玲磊糖肇酉醛啷青縣韙良香骨鯛丂七集河市弦喜嘴張舌堵區工業姊妹星架構巧彩扭歪拼湊餘熱曜武州爺浮屠美鄉老階樹葷素碎落能魄鰓鰻珠丄丅丆万俟丈尚摸母娘量管群亞虎必我堂令申件裝伏位博俠義界表女墟臺戲臭皮匠勝諸葛亮賽頂倍催請運算包立叉戟離疫苗土史志演圍揭瓦曬夷姑婆帝村寶爛尖杉鹼屜桌山岔島由紀峽壩庫鎮廢從德後拗湯治旬食明昧曹朋友框欄極權冪曲歸依貓民氟硼氯磷鐵江侗自旅法司洋浦梅園溫暖灣焦班幸用田略番疊皇炮捶硝苯酸腺苷稜草鏡穗跳遠索錦綱聚氰胺聯店胚膲愛色堇紫羅蘭芝茶飯菱雲蟲藏藩亂叛蘇親債凳學座恐戀柱測肌腹衩錐係貂企烏跪叩軍車農題迭都甘油屯奏鍵短阿姨陪姐隻顧茅廬槽駕魂鮮鹿頁其菜單乘任供勢午齒漢組織吊調瀉唇坡城報墳外夸將尉建築岸崗公床揚新劍昇杭林栗校樓標款汽社浣海商館劇院鋼華港機械廣媒環球融第醫科證券綜財樂育游漲猶嶺疏癮瞼確兵領導繳肢膛船艾瑟爾蒼蔡虞傚衫覆訪訴課諭議軌述野鉤限敵鞋頜頷顎饒首齦站例修凡劃垂屆屬崽頦廚拜挫擺放旋削棋榻檻禮沉注滑營獄畫确儀聘花葬詔員跌轄週達酒錨閘陷陸雨雪飛威丌于丹久乏予理評產亢卑亦乎舞己悲矩圓詞害誌但住佞佳便俗信票案幅翁倦倫假偏倚斜虧鬼敲停備傷脾胃僅此像儉匱免宜穴焉戴兼容許凍伯仲負彼晝皂軒輊實刊划顛衛戰哥比省非好黃飾別拘束掩奶睬選擇搖擾煩苦枚寫協厭及格受歡迎約只估侵犯割狀告或缺抗拒挽撤救藥喻磨滅端倪少逆逾越避靠適吉譽吝玉含延咎歹聽啻淵善謀均勻堪忍夠太惹妙妥妨孕症孝術室完納推冠積宣疑辯慄碴稱屈撓屑干涉衡待很忙惡忿怎麼怠急恥恭息悅惑惜惟想愉愧怍慌憤啟懂懈懷材才緊招認扣抵拉捨也罷插揣冒搭撞南牆擴核支攻敢雷攀敬裡嗎需景智暇曾罪遇朽枉止況競爭辱求癒渝溶濟左右袒困補爽特寂寞示弱找謝畏強疾徐痛癢冤符眠睦瞅董何厚云措活疲羞者輕玻璃祥兆禁移稂莠穩佛換答簡結果盟絕縷途給談否羈翼耐肖脛毋寧興舒若菲萊痕跡窠臼虛衰臉兔撒鷹棺範該詳諱抬泰讓鬚眉象眾貲賬費灰賴奇慮訓輟辨菽麥辛近送透逞徒速續逮捕遂遑違遜斧鉞艱醉鏽隨觀棄顯飽脂肪使丏丐幫丒且慢末丕替桃宗王尊涼爵各圖屋脊糧署錄壇吾祿職胄襲君廈丗北壑桐疹損逢陵鷸丙寅戌氨腈唑綸辰酮脫氫酶醚丞丟現掉紗帽弄扯砲碗丠両丣坐存激肩臻蒂蓮悖序驅丨丩丫挺杈髻鬟細介俄伊犁京尼布訂普渡央委監察檢查劑圈設警隊斯督剩震境航舶革防托播促質版蠑螈鋒研藝歷殘消頻譜精密製造陲郵候埔堅壓壢凹匯執府究邦俘攝寮彬狼嶽肺腫庸英訊診埋粒胞括控碼韓暑槍樞砥澳哇牟壽甸鑽探篇簽綴縫繼耳肯照婦埃懸璧軸櫃檯辣擱淺邪跑纖阮陽私囊魔丮丰姿采丱燒丳丵丶丷丸參寨朗桂瑞砂衷霞貌鳳僕艦因嫌宰峰幹絡牌持旨祭禱簿編罰賓辦丼丿乀乂乃乄仰慕盛曠留考驗闊乆乇么醜麼乊湖燃乑乒乓乕乖僻忤戾离謬迕乗危肥劫除隙浪婿乙炔腸酰吡咯鹽乚乛乜嘢卿玄宮尾狐龜塔嶷兄弟泉章霄釘耙乞扎哀憐恕討乢乣乤乥乧乨乩童乪乫乭乳暈汁液瑤漿牙癌突竇罩腐膠豬酪蛋糕菌瘤乴乵乶乷乸乹乺乼乾俸冰嘉噦嚎坤媽屍壘旱枯涸俐渴潮澀煸豆燥爹瘦癟癬瞪袋脆薑貝隆餾乿亀亁叫咕攘扔搞男砸竄蓬麻亃亄亅卻亇遲典今臨繁累卵奉婚聰躬巨與遷添裂副宿歲怪噁尕崙愣杆硅硫鈦鈾錳芑雜異鈉砷胂磺琥珀艙棍簧胡茬盜浩盆販郎腿亍洪亐互欠助勉惠操斥諉繫戶譯亓墓碑刑鈴卅渠繽紛斗米旗憲釩燈徽瘟祖拳福穀豐臟腑綁肉醃苓蘊橋鋪霸顏鬧判噴岡底蛙陘礦亖亙亜罕們娜桑那努哈喀弗烈曼松森杜氏盃奧琛敦戊穆聖裔彙薛孫亟亡佚虜羊牢奮釋卷卸契媾感額睫纏誼趾塞擠紐阻還配馳莊亨洛祚亪享津滬畿郊慈菴枇杷膏亭閣鋥麗亳亶亹誅初責翻瘋偶傑叢稠妖拖寰居吸授慧蝸吞壯魅狗矛盾益渣患憂稀描猿夢暫涯畜禍緣沸搜引擎臣橫紜誰混援蒸獸獅稅剖亻亼亽亾什獻剎邡麽仂仃仄仆富怨仈仉畢昔晨殼紹仍仏仒仕宦仗欺恃腰嘆歎炬梓訖施仙后瓊逝仚仝仞仟悔仡佬償填泊拓撲簇羔購頓欽佩髮棻閫馭養億儆尤藉幀賑凌敘帖李柔剛沃眥睚戒訛取饗讀仨仫仮著泳臥躺韶夏裁仳仵唯賢憑釣誕仿似宋彿諷伀碩盼鵝伄儅伈伉儷柯始娃邁戈坦堡帕茨薩廟瑪莉莎藤霍姆伋伍奢胥廷芳豪伎倆侍汛勒希羲雛伐憩整謨閑閒伕伙伴頤伜伝伢叔恆茲恩翰伱伲侶伶俜悧鼬伸懶縮喇叭伹伺伻伽倻輻伾佀佃佇佈喬妮墨佉盧佌貸劣廉昂檔濃矮傘窪緩耗胸谷迷擋率齲宅沫舍療佐貳佑佔優據鏵嘗呢須魯曉佗佘余坪寺瓜銃僧蒙芒陀龕哼嘔坊姦孽弊揖祟繭縛誓賊佝僂瞀佟你奪趕佡佢佣佤佧賈佪佫佯佰佱潔績釀餚佴捲佶佷佸佹佺佻佼佽佾具喚窘壞娛怒慨硬習慣聾膨脹蔓駭貴痺侀侁侂侃侄侅鴻燕侇侈糜靡侉侌妾侏儒倉鼠侐侑侔侖侘侚鏈侜偎傍鈷循柳葫蘆附価侮罵蔑侯岩截蝕侷貼壺嬛宴捷攜桶箋酌俁狹膝狄俅俉俊俏俎俑俓俔諺俚俛黎健呈固墒增守康箱濕祐鏢鑣槓盒靖膜齡俞豹獵噪孚封札筒託衍鴿剪撰稿煉廠禊練繕葺俯瞰撐衝俲俳俴俵俶俷俺俻俾倀倂倅儲卒惶敷猝逃頡蓄崇隱倌倏忽刺蠟燭噍嚼坍扁抽斃蔥楣灌灶糞背藪賣賠閉霉騰倓倔倖倘倜儻倝借箸挹澆閱倡狂倢倣値倥傯倨��倩匡嗣沖柝珍倬倭寇猩倮倶倷倹勤讚偁偃充偽吏嗓寐惺扮拱芫茜藉虢鈔偈偉晶偌宕距析濾殿疼癱註頗偓偕鴨歇滯偝偟偢忘怡旺偨偩偪偫偭偯偰偱偲偵緝蹄偷減惰漏窺竊偸偺迹傀儡傅傈僳傌籬傎奎琳迪叟芭傒傔傕傖悉荒傜傞傢傣芽逼傭婢傮睨寄檄誦謠頌傴擔辜弓慘蒿悼疤傺傻屄臆巢洩篋羨蓋軋頹傿儸僄僇僉僊働僎僑僔僖僚僝僞僣僤僥僦猴僨僩僬僭僮僯僰僱僵殖籤靜僾僿征隴儁儂儃儇儈朴薄儊儋儌儍儐儓儔儕儗儘儜儞儤儦儩汰哉寡渥裕酷儭儱罐儳儵儹儺儼儽兀臬臲鷲允勛勳宙宵帥憝彞諧嫂鬩暢沛溢盈飢赫兇悍狠猛頑愚妣斬秦遣鞭耀敏榮槃澤爆碟磁禿纜輝霽鹵朵婁孜烽醬勃汀箕裘鉗耶懞蕾徹兌軟遭黜兎児韻媳爸兕觥兗兙兛兜售鍪肚兝兞兟兡兢兣樽殮涅睡稟籍贅泌啡肽奸幕涵澇熵疚眷稃襯訌赴煥椒殲植跏沒試誤猜棲窗肋袖頰兪卦撇鬍岐廓轎疸楓茴瓏廁秩募勺噸寓斤曆畝迫筷釐最淫螺韜兮寬匪篩襄贏軛複兲詐刃堰戎痞蟻餉它冀鑄冂冃円冇冉冊嫁厲礪竭醮冏牧冑冓冔冕冖冗冘冞冢窄抑誣冥冫烘菇蟄冷凝坨橇淇淋炭餅磚磧窖醋雕雹霜冱冶爐艷嘲峻灘淡漠煖颼飲冼冽凃凄愴梗凅凇凈凊凋敝濛凔凜遵汞脢凞几凢処凰凱凵凶焰凸摺刷紋預喪嘍奔巡榜殯芙蓉租籠輯鞘萃凼鋸鑊刁蠻刂娩崩批拆攤掰櫱驟歧顆秒袂贓勿囑忌磋琢膚刈羽刎訟戮舂槳艇刓刖霹靂刜創犢刡恙墅幟筵緻刦刧刨昏默攸尿慾薰潤薰圭刪刮痧鏟刱刲刳刴刵踏磅戳柏槐繡芹莧蝟舟銘鵠鶩刼剁剃辮剄剉履鉛剋剌姻咽哨廊掠桅沿召瞻翅趙卜渺茫郭剒剔剕瀝剚愎毅訥纔剜剝啄採剞剟剡剣剤綵剮腎駛黏剰袍剴紊剷剸剺剽剿劁劂劄劈啪柴扳啦劉奭姥夼昫涓熙禪禹錫翔雁鶚劊劌弩柄蜻蛉劒劓劖劘劙瀾簣賞磯釜晉甜薪逐劦熔紂虐赤囚劬劭労劵効劻劼劾峭艮勅勇勵勍勐臘脖龐漫飼盪粥輒勖勗勘驕餒碌泮雇捐竹騎殊阱勣樸懇謹勦勧勩勯勰勱勲勷勸懲慰誡諫勹芡踐闌匁庇拯粟紮袱裹餃匆遽匈匉匊匋匍匐莖匏匕妝痰膿蛹齋苑烤蹈塘羌熊閥螳螂疆碚竿緯荷茵邙魏匚匜匝匟扶稷匣匭攏匸匹耦匽匾匿卂叮瘡禧軫堤棚迢鈞鍊卄卆遐卉瓷盲瓶噹胱腱裸卋卌卍卐怯污賤鄙齷齪陋卓溪唐梯漁陳棗泥漳潯澗梨芬譙贍轅迦鄭単驢弈洽鰲卛占筮卝卞卟吩啉屎翠厄卣卨卪卬卮榫襖璽綬鈕蚤懼殆篤聳卲帘帙繞卹卼卽厂厎厓厔厖厗奚厘厙厜厝諒厠厤厥厪膩孢厮厰厳厴厹厺粕垢蕪菁厼厾叁悟茸薯叄吵笄悌哺譏
坫壟弧芯杠潛嬰芻袁詰貪諜煽饋駁収岳締災賄騙叚叡吻攔蘑蜜訣燧玩硯箏椎藺銅逗驪另覓叨嘮謁杵姓喊嚷囂咚嚀塑尋惱憎擦祇泣滲蝠叱吒咄咤喝籀黛舵舷叵叶鐸懿昭穰苴遼叻叼吁塹嫖賭瞧爬衆抒吅吆夥巹橡滌抱縱摩郡唁墜扇籃膀襪頸吋愾諮酬哭妓媛暗錶韁邇妃羿絮蕃渾拐葵暮隅吔吖啶嗪戚吜嗇噬嚥吟哦詠吠吧唧嗒咐吪雋咀徵燐苞茹鈣哧吮吰吱嘎吲哚吳棟嬌窟孟簫忠晗淞闔閭趼宇吶睛噓拂捧疵熄竽笛糠吼吽呀呂韋矇呃呆笨呇貢呉罄呋喃呎呏呔呠呡癡呣呤呦呧瑛眩扒晬淑姬瑜璇鵑呪呫嗶嚅囁呬呯呰呱呲咧噌鈍呴呶呷呸呺呻哱咻嘯嚕籲坎坷邏呿咁咂咆哮咇咈咋蟹煦珅藹咍咑咒詛咔噠嚓咾噥哩喱咗咠咡咢咣咥咦咨嗟詢咩咪咫嚙齧咭咮咱咲咳嗆嗽咴咷咸咹咺咼喉咿婉慟憫賦矜綠茗藍哂搶瞞哆嗦囉噻啾濱彗哋哌哎唷喲哏哐哞哢哤哪裏哫啼喘哰哲萎蚌哳哶哽哿唄唅唆唈唉唎唏嘩堯棣殤璜睿肅唔睇唕唚唞唣喳唪唬唰喏唲唳唵嘛唶唸唹唻唼唾唿啁啃鸚鵡啅埠棧榷祺舖鞅飆啊啍啎啐啓啕啖啗啜啞祈啢啣啤啥啫啱啲啵啺饑啽噶崑沁喁喂喆裙喈嚨喋喌喎喑喒喓喔粗喙幛慶滋鵲喟喣喤喥喦喧騷喨喩梆喫葡萄喭駝挑嚇碰樅瓣純皰藻趟鉻喵営喹喺喼喿嗀嗃嗄嗅嗈嗉嗊嗍嗐嗑嗔詬嗕嗖嗙嗛嗜痂癖嗝嗡嗤嗥嗨嗩嗬嗯嗰嗲嗵嘰嗷嗹嗾嗿嘀嘁嘂嘅惋嘈峪禾蔭嘊嘌嘏嘐嘒嘓嘖嘚嘜嘞嘟囔嘣嘥嘦嘧嘬嘭這謔嚴敞饞鬆嘵嘶嘷嘸蝦嘹嘻嘽嘿噀噂噅噇噉噎噏噔噗噘噙噚噝噞噢噤蟬皿噩噫噭噯噱噲噳嚏涌灑欲巫霏噷噼嚃嚄嚆抖嚌嚐嚔囌嚚嚜嚞嚟嚦嚬嚭嚮嚯嚲嚳飭按竣苛嚵嚶囀囅囈膪謙囍囒囓囗囘蕭酚飄濺諦囝溯眸紇鑾鶻囟殉囡団囤囥囧囨囪囫圇囬囮囯囲図囶囷囸囹圄圉擬囻囿圀圂圃圊粹蠹赦圌墾圏滾鯡鑿枘圕圛圜圞坯埂壤骸炕祠窯豚紳魠鯪鱉圧握圩圪垯圬圮圯炸岬幔毯祇窨菩溉圳圴圻圾坂坆沾坋坌舛壈昆墊墩椅坒坓坩堝坭坰坱坳坴坵坻坼楊掙涎簾垃垈垌垍垓垔垕垗垚垛垝垣垞垟垤垧垮垵垺垾垿埀畔埄埆埇埈埌殃隍埏埒埕埗埜埡埤埦埧埭埯埰埲埳埴埵埶紼埸培怖樁礎輔埼埽堀訶姪廡堃堄摧磐貞韌砌堈堉堊堋堌堍堎堖堙堞堠礁堧堨輿堭堮蜓摘堲堳堽堿塁塄塈煤塋棵塍塏塒塓綢���鴉沽虱塙塚塝繆塡塢塤塥塩塬塱塲蟎塼塽塾塿墀墁墈墉墐夯増毀墝墠墦漬缽墫墬墮墰墺墻櫥壅壆壊壌壎壒榨蒜壔壕壖壙壚壜壝壠壡壬壭壱売壴壹壻壼寢壿夂夅夆変夊夌漱邑夓腕泄甥禦骼夗夘夙袞瑙妊娠醣梟珊鶯鷺戧幻魘夤蹀祕擂鶇姚宛閨嶼庾撻拇賛蛤裨菠氅漓撈湄蚊霆鯊箐篆篷荊肆舅荔鮃巷慚骰辟邱鎔鐮阪漂燴鯢鰈鱷鴇臚鵬妒峨譚枰晏璣癸祝秤竺牡籟恢罡螻蠍賜絨御梭夬夭砣榆怙枕夶夾餡奄崛葩譎奈賀祀贈奌奐奓奕訢詝奘奜奠奡奣陶奨奩魁奫奬奰媧孩貶隸酥宄狡猾她奼嫣妁氈荼皋膻蠅嬪妄妍嫉媚嬈妗趣妚妞妤礙妬婭妯娌妲妳妵妺姁姅姉姍姒姘姙姜姝姞姣姤姧姫姮娥姱姸姺姽婀娀誘懾脅娉婷娑娓娟娣娭娯娵娶娸娼婊婐婕婞婤婥谿孺婧婪婬婹婺婼婽媁媄媊媕媞媟媠媢媬媮媯媲媵媸媺媻媼眯媿嫄嫈嫋嫏嫕嫗嫘嫚嫜嫠嫡嫦嫩嫪毐嫫嫬嫰嫵嫺嫻嫽嫿嬀嬃嬅嬉耍嬋痴豔嬔嬖嬗嬙嬝嬡嬢嬤嬦嬬嬭幼嬲嬴嬸嬹嬾嬿孀孃孅孌孏曰癲屏孑孓雀孖斟簍謎摺孛矻鳩崮軻祜鸞孥邈毓棠臏孬孭孰孱孳孵泛罔銜孻孿宀宁宂拙株薇掣撫琪瓿榴謐彌宊濂祁瑕宍宏碁宓邸讞実潢町宥宧宨宬徵崎駿掖闕臊煮禽蠶宸豫寀寁寥寃簷庶寎暄磣寔寖寘寙寛寠苫寤肘洱濫蒗陝覈寪弘綽螽寳擅疙瘩晷対檐専尃尅贖絀繚疇釁尌峙醌襟痲碧屁昊槌淘恵瀑牝畑莓缸羚覷蔻髒躁尒尓銳尗尙尜尟尢尥尨尪尬尭尰擒尲尶尷尸尹潽蠖蛾尻釦梢蚴鰭脬蹲屇屌蚵屐屓挪屖屘屙屛屝屢屣巒嶂巖舄屧屨屩屪屭屮戍駐鉀崖嵛巔旮旯楂欖櫸芋茱萸靛麓屴屹屺屼岀岊岌岍阜岑彭鞏岒岝岢嵐岣岧岨岫岱岵岷峁峇峋峒峓峞峠嵋峩峯峱峴峹峿崀崁崆禎崋崌崍嶇崐崒崔嵬巍螢顥崚崞崟崠崢巆崤崦崧殂崬崱崳崴崶崿嵂嵇嵊泗嵌嵎嵒嵓嵗嵙嵞嵡嵩嵫嵯嵴嵼嵾嶁嶃嶄晴嶋嶌嶒嶓嶔嶗嶙嶝嶞嶠嶡嶢嶧嶨嶭嶮嶰嶲嶴嶸巂巃巇巉巋巌巓巘巛滇芎巟巠弋迴巣巤炊擘蜥蟒蠱覡巰蜀彥淖杏茂甫楞巻巽幗巿帛斐鯽蕊帑帔帗帚琉汶帟帡帣帨帬帯帰帷帹暆幃幄幇幋幌幏幘幙幚幞幠幡幢幦幨幩幪幬幭幯幰遙蹉跎餘庚鑑幵幷稚邃庀庁広庄庈庉笠庋跋庖犧庠庤庥鯨庬庱庳庴庵馨衢庹庿廃廄廆廋廌廎廏廐廑廒廕廖廛廝搏鑼廞弛袤廥廧廨廩廱綿踵髓廸廹甌鄴廻廼廾廿躔弁皺弇弌弍弎弐弒弔詭憾薦弝弢弣弤弨弭弮弰弳霖繇燾斌旭溥騫弶弸弼弾彀彄彆纍糾彊彔彖彘彟彠陌彤貽彧繪虹彪炳彫蔚鷗彰癉彲彳彴彷彷徉徨彸彽踩斂旆徂徇徊渭畬鉉裼従筌徘徙徜徠膳甦萌漸徬徭醺徯徳徴潘徻徼忀瘁胖燎怦悸顫扉犀澎湃砰恍惚絞隘忉憚挨餓忐忑忒忖応忝忞耿忡忪忭忮忱忸怩忻悠懣怏遏怔怗怚怛怞懟黍訝怫怭懦怱怲怳怵惕怸怹恁恂恇恉恌恏恒恓恔恘恚恛恝恞恟恠恣恧眄恪恫恬澹恰恿悀悁悃悄悆悊悐悒晦悚悛悜悝悤您悩悪悮悰悱悽惻悳悴悵惘悶悻悾惄愫鍾蒐惆惇惌惎惏惓惔惙惛耄惝瘧濁惥惦惪惲惴惷惸拈愀愃愆愈愊愍愐愑愒愓愔愕愙氓蠢騃昵愜赧愨愬愮愯愷愼慁慂慅慆慇靄慉慊慍慝慥慪慫慬慱慳慴慵慷慼焚憀灼鬱憃憊憋憍眺捏軾憒憔憖憙憧憬憨憪憭憮憯憷憸憹憺懃懅懆邀懊懋懌懍懐懞懠懤懥懨懫懮懰懱毖懵遁樑雍懺懽戁戄戇戉戔戕戛戝戞戠戡戢戣戤戥戦戩戭戯轟戱披菊牖戸戹戺戻戼戽鍬扂楔扃扆扈扊杖牽絹銬鐲賚扐摟攪烊盹瞌跟躉鑔靶鼾払扗玫腮扛扞扠扡扢盔押扤扦扱罾揄綏鞍郤窾扻扼扽抃抆抈抉抌抏瞎抔繯縊擻抜抝択抨摔歉躥牾抶抻搐泵菸拃拄拊髀拋拌脯拎拏拑擢秧沓曳攣迂拚拝拠拡拫拭拮踢拴拶拷攢拽掇芥橐簪摹疔挈瓢驥捺蹻挌挍挎挐揀挓挖掘浚挙揍聵挲挶挾挿捂捃捄捅捆捉捋胳膊揎捌捍捎軀蛛捗捘捙捜捥捩捫捭据捱捻捼捽掀掂掄臀膘掊掎掏掐笙掔掗掞棉芍掤搪闡掫掮掯揉掱掲掽掾揃揅揆搓揌諢揕揗揘揜揝揞揠揥揩揪揫櫫遒麈揰揲揵揶揸揹揺搆搉搊搋搌搎搔搕撼櫓搗搘搠搡搢搣搤搥搦搧搨搬楦褳訕赸搯搰搲搳搴搵搷搽搾搿摀摁摂摃摎摑摒摓跤摙摛摜摞摠摦睺羯摭摮摯摰摲摳摴摶摷摻摽撂撃撅稻撊撋撏鐧潑撕撙撚撝撟撢撣撦撧撩撬撱朔撳蚍蜉撾撿擀擄闖擉缶觚擐擕擖擗擡擣擤澡腚擧擨擩擫擭擯擰擷擸擼擽擿攃攄攆攉攥攐攓攖攙攛每攩攫轡澄攮攰攲攴軼攷砭訐攽碘敁敃敇敉敍敎筏敔敕敖閏誨敜煌敧敪敱敹敺敻敿斁衽斄牒縐謅斉斎斕鶉讕駮鱧斒筲斛斝斞斠斡斢斨斫斮晾沂潟穎絳邵斲斸釳於琅斾斿旀旂旃旄渦旌旎旐旒旓旖旛旝旟旡旣浴旰獺魃旴旹旻旼旽昀昃昄昇昉晰躲澈熹皎皓礬昑昕昜昝昞昡昤暉筍昦昨昰昱昳昴昶昺昻晁蹇隧蔬髦晄晅晒晛晜晞晟晡晢晤晥曦晩萘瑩顗晿暁暋暌暍暐暔暕煅暘暝暠暡曚暦暨暪朦朧暱暲殄馮暵暸暹暻暾曀曄曇曈曌曏曐曖曘曙曛曡曨曩駱曱甴肱曷牘禺錕曽滄耽朁朅朆杪栓誇竟粘絛朊膺朏朐朓朕朘朙瞄覲溘饔飧朠朢朣柵椆澱蝨朩朮朰朱炆璋鈺熾鹮朳槿朶朾朿杅杇杌隉欣釗湛漼楷瀍煜玟纓翱肈舜贄适逵杓杕杗杙荀蘅杝杞脩珓筊杰榔狍閦顰緬莞杲杳眇杴杶杸杻杼枋枌枒枓衾葄翹紓逋枙狸椏枟槁枲枳枴枵枷枸櫞枹枻柁柂柃柅柈柊柎某柑橘柒柘柙柚柜柞櫟柟柢柣柤柩柬柮柰柲橙柶柷柸柺査柿栃栄栒栔栘栝栟栢栩栫栭栱栲栳栴檀栵栻桀驁桁鎂桄桉桋桎梏椹葚桓桔桕桜桟桫欏桭桮桯桲桴桷桹湘溟梃梊梍梐潼梔梘梜梠梡梣梧梩梱梲梳梴梵梹棁棃櫻棐棑棕櫚簑繃蓑棖棘棜棨棩棪棫棬棯棰棱棳棸棹槨棼椀椄苕椈椊椋椌椐椑椓椗検椤椪椰椳椴椵椷椸椽椿楀楄楅篪楋楍楎楗楘楙楛楝楟楠楢楥楨楩楪楫楬楮楯楰楳楸楹楻楽榀榃榊榎槺榕榖榘榛狉莽榜笞榠榡榤榥榦榧榪榭榰榱槤霰榼榾榿槊閂槎槑槔槖様槜槢槥槧槪槭槮槱槲槻槼槾樆樊樏樑樕樗樘樛樟樠樧樨権樲樴樵猢猻樺樻罍樾樿橁橄橆橈笥龠橕橚橛輛橢橤橧豎膈跨橾橿檁檃檇檉檍檎檑檖檗檜檟檠檣檨檫檬檮檳檴檵檸櫂櫆櫌櫛櫜櫝櫡櫧櫨櫪櫬櫳櫹櫺茄櫽欀欂欃欐欑欒欙欞溴欨欬欱欵欶欷歔欸欹欻欼欿歁歃歆艎歈歊蒔蝶歓歕歘歙歛歜歟歠蹦詮鑲蹣跚陞陟歩歮歯歰歳歴璞歺瞑歾歿殀殈殍殑殗殜殙殛殞殢殣殥殪殫殭殰殳荃殷殸殹蛟殻殽謗毆毈毉餵毎毑蕈毗毘毚茛鄧毧毬毳毷毹毽毾毿氂氄氆靴氉氊氌氍氐聊氕氖気氘氙氚氛氜氝氡洶焊痙氤氳氥氦鋁鋅氪烴氬銨痤汪滸漉痘盂碾菖蒲蕹蛭螅氵氷氹氺氽燙氾氿渚汆汊汋汍汎汏汐汔汕褟汙汚汜蘺沼穢衊汧汨汩汭汲汳汴隄汾沄沅沆瀣沇沈葆浸淪湎溺痼痾沌沍沏沐沔沕沘浜畹礫沚沢沬沭沮沰沱灢沴沷籽沺烹濡洄泂肛泅泆湧肓泐泑泒泓泔泖泙泚泜泝泠漩饃濤粼濘蘚鰍泩泫泭泯銖泱泲洇洊涇琵琶荽薊箔洌洎洏洑潄濯洙洚洟洢洣洧洨洩痢滔洫洮洳洴洵洸洹洺洼洿淌蜚浄浉浙贛渫浠浡浤浥淼瀚浬浭翩萍浯浰蜃淀苔蛞蝓蜇螵蛸煲鯉浹浼浽溦涂涊涐涑涒涔滂涖涘涙涪涫涬涮涴涶涷涿淄淅淆淊淒黯淓淙漣淜淝淟淠淢淤淥淦淩猥藿褻淬淮淯淰淳詣淶紡淸淹燉癯綺渇済渉渋渓渕渙渟渢滓渤澥渧渨渮渰渲渶渼湅湉湋湍湑湓湔黔湜湝湞湟湢湣湩湫湮麟湱湲湴湼満溈溍溎溏溛舐漭溠溤溧馴溮溱溲溳溵溷溻溼溽溾滁滃滉滊滎滏稽滕滘滙滝滫滮羼耷滷滹滻煎漈漊漎繹漕漖漘漙漚漜漪漾漥漦漯漰漵漶漷濞潀潁潎潏潕潗潚潝潞潠潦祉瘍潲潵潷潸潺潾潿澁澂澃澉澌澍澐澒澔澙澠澣澦澧澨澫澬澮澰澴澶澼熏郁濆濇濈濉濊貊濔疣濜濠濩觴濬濮盥濰濲濼瀁瀅瀆瀋瀌瀏瀒瀔瀕瀘瀛瀟瀠瀡瀦瀧瀨瀬瀰瀲瀳瀵瀹瀺瀼灃灄灉灋灒灕灖灝灞灠灤灥灨灩灪蜴灮燼獴灴灸灺炁炅魷炗炘炙炤炫疽烙釺炯炰炱炲炴炷燬炻烀烋瘴鯧烓烔焙烜烝烳飪烺
焃焄耆焌焐焓焗焜焞焠焢焮焯焱焼煁煃煆煇煊熠煍熬煐煒煕煗燻礆霾煚煝煟煠煢矸煨瑣煬萁煳煺煻熀熅熇熉羆熒穹熗熘熛熜稔諳爍熤熨熯熰眶螞熲熳熸熿燀燁燂燄盞燊燋燏燔隼燖燜燠燡燦燨燮燹燻燽燿爇爊爓爚爝爟爨蟾爯爰爲爻爿爿牀牁牂牄牋牎牏牓牕釉牚腩蒡虻牠雖蠣牣牤牮牯牲牳牴牷牸牼絆牿靬犂犄犆犇犉犍犎犒犖犗犛犟犠犨犩犪犮犰狳犴犵犺狁甩狃狆狎狒獾狘狙黠狨狩狫狴狷狺狻豕狽蜘猁猇猈猊猋猓猖獗猗猘猙獰獁猞猟獕猭猱猲猳猷猸猹猺玃獀獃獉獍獏獐獒獘獙獚獜獝獞獠獢獣獧鼇蹊獪獫獬豸獮獯鬻獳獷獼玀玁菟玅玆玈珉糝禛郅玍玎玓瓅玔玕玖玗玘玞玠玡玢玤玥玦玨瑰玭玳瑁玶玷玹玼珂珇珈瑚珌饈饌珔珖珙珛珞珡珣珥珧珩珪珮珶珷珺珽琀琁隕琊琇琖琚琠琤琦琨琫琬琭琮琯琰琱琲瑯琹琺琿瑀瑂瑄瑉瑋瑑瑔瑗瑢瑭瑱瑲瑳瑽瑾瑿璀璨璁璅璆璈璉璊璐璘璚璝璟璠璡璥璦璩璪璫璯璲璵璸璺璿瓀瓔瓖瓘瓚瓛臍瓞瓠瓤瓧瓩瓮瓰瓱瓴瓸瓻瓼甀甁甃甄甇甋甍甎甏甑甒甓甔甕甖甗飴蔗甙詫鉅粱盎銹糰甡褥産甪甬甭甮甯鎧甹甽甾甿畀畁畇畈畊畋畎畓畚畛畟鄂畤畦畧荻畯畳畵畷畸畽畾疃疉疋疍疎簞疐疒疕疘疝疢疥疧疳疶疿痁痄痊痌痍痏痐痒痔痗瘢痚痠痡痣痦痩痭痯痱痳痵痻痿瘀瘂瘃瘈瘉瘊瘌瘏瘐瘓瘕瘖瘙瘚瘛瘲瘜瘝瘞瘠瘥瘨瘭瘮瘯瘰癧瘳癘瘵瘸瘺瘻瘼癃癆癇癈癎癐癔癙癜癠癤癥癩蟆癪癭癰発踔紺蔫酵皙砬砒翎翳蘞鎢鑞皚鵯駒鱀粵褶皀皁莢皃鎛皈皌皐皒硃皕皖皘皜皝皞皤皦皨皪皫皭糙綻皴皸皻皽盅盋盌盍盚盝踞盦盩鞦韆盬盭眦睜瞤盯盱眙裰盵盻睞眂眅眈眊県眑眕眚眛眞眢眣眭眳眴眵眹瞓眽郛睃睅睆睊睍睎睏睒睖睙睟睠睢睥睪睪睯睽睾瞇瞈瞋瞍逛瞏瞕瞖瞘瞜瞟瞠瞢瞫瞭瞳瞵瞷瞹瞽闍瞿矓矉矍鑠矔矗矙矚矞矟矠矣矧矬矯矰矱硪碇磙��舫阡、矼矽礓砃砅砆砉砍砑砕砝砟砠砢砦砧砩砫砮砳艏砵砹砼硇硌硍硎硏硐硒硜硤硨磲茚鋇硭硻硾碃碉碏碣碓碔碞碡碪碫碬碭碯碲碸碻礡磈磉磎磑磔磕磖磛磟磠磡磤磥蹭磪磬磴磵磹磻磽礀礄礅礌礐礚礜礞礤礧礮礱礲礵礽礿祂祄祅祆禳祊祍祏祓祔祕祗祘祛祧祫祲祻祼餌臠錮禂禇禋禑禔禕隋禖禘禚禜禝禠禡禢禤禥禨禫禰禴禸稈秈秊闈颯秌秏秕笈蘵賃秠秣秪秫秬秭秷秸稊稌稍稑稗稙稛稞稬稭稲稹稼顙稾穂穄穇穈穉穋穌貯穏穜穟穠穡穣穤穧穨穭穮穵穸窿闃窀窂窅窆窈窕窊窋窌窒窓窔窞窣窬黷蹙窰窳窴窵窶窸窻竁竃竈竑竜竝竦竪篦篾笆鮫竾笉笊笎笏笐靨笓笤籙笪笫笭笮笰笱笲笳笵笸笻筀筅筇筈筎筑筘筠筤筥筦筧筩筭筯筰筱筳筴讌筸箂箇箊箎箑箒箘箙箛箜篌箝箠箬鏃箯箴箾篁篔簹篘篙篚篛篜篝篟篠篡篢篥篧篨篭篰篲篳篴篶篹篼簀簁簃簆簉簋簌簏簜簟簠簥簦簨簬簰簸簻籊籐籒籓籔籖籚籛籜籣籥籧籩籪籫籯芾麴籵籸籹籼粁粃粋粑粔糲粛粞粢粧粨粲粳粺粻粽闢粿糅糆糈糌糍糒糔萼糗蛆蹋糢糨糬糭糯糱糴糶糸糺紃蹼鰹黴紆紈絝紉閩襻紑紕紘錠鳶鷂紝紞紟紥紩紬紱紲紵紽紾紿絁絃絅経絍絎絏縭褵絓絖絘絜絢絣螯絪絫聒絰絵絶絺絻絿綀綃綅綆綈綉綌綍綎綑綖綘継続緞綣綦綪綫綮綯綰罟蝽綷縩綹綾緁緄緅緆緇緋緌緎総緑緔緖緗緘緙緜緡緤緥緦纂緪緰緱緲緶緹縁縃縄縈縉縋縏縑縕縗縚縝縞縟縠縡縢縦縧縯縰騁縲縳縴縵縶縹縻衙縿繄繅繈繊繋繐繒繖繘繙繠繢繣繨繮繰繸繻繾纁纆纇纈纉纊纑纕纘纙纚纛缾罃罆罈罋罌罎罏罖罘罛罝罠罣罥罦罨罫罭鍰罳罶罹罻罽罿羂羃羇羋蕉51鴕羑羖羗羜羝羢羣羥羧羭羮羰羱羵羶羸藜鮐翀翃翄翊翌翏翕翛翟翡翣翥翦躚翪翫翬翮翯翺翽翾翿闆饕鴰鍁耋耇耎耏耑耒耜耔耞耡耤耨耩耪耬耰鬢耵聹聃聆聎聝聡聦聱聴聶聼閾聿肄肏肐肕腋肙肜肟肧胛肫肬肭肰肴肵肸肼胊胍胏胑胔胗胙胝胠銓胤胦胩胬胭胯胰胲胴胹胻胼胾脇脘脝脞脡脣脤脥脧脰脲脳腆腊腌臢腍腒腓腖腜腠腡腥腧腬腯踝蹬鐐腴腶蠕誹膂膃膆膇膋膔膕膗膙膟黐膣膦膫膰膴膵膷膾臃臄臇臈臌臐臑臓臕臖臙臛臝臞臧蓐詡臽臾臿舀舁鰟鮍舋舎舔舗舘舝舠舡舢舨舭舲舳舴舸舺艁艄艅艉艋艑艕艖艗艘艚艜艟艣艤艨艩艫艬艭荏艴艶艸艹艻艿芃芄芊萰陂藭芏芔芘芚蕙芟芣芤茉芧芨芩芪芮芰鰱芴芷芸蕘豢芼芿苄苒苘苙苜蓿苠苡苣蕒苤苧苪鎊苶苹苺苻苾茀茁范蠡萣茆茇茈茌茍茖茞茠茢茥茦菰茭茯茳藨茷藘茼荁荄荅荇荈菅蜢鴞荍荑荘荳荵荸薺莆莒莔莕莘莙莚莛莜莝莦莨菪莩莪莭莰莿菀菆菉菎菏菐菑菓菔菕菘菝菡菢菣菥蓂菧菫轂鎣菶菷菹醢菺菻菼菾萅萆萇萋萏萐萑萜萩萱萴萵萹萻葇葍葎葑葒葖葙葠葥葦葧葭葯葳葴葶葸葹葽蒄蒎蒓蘢薹蒞蒟蒻蒢蒦蒨蒭藁蒯蒱鉾蒴蒹蒺蒽蓀蓁蓆蓇蓊蓌蓍蓏蓓蓖蓧蓪蓫蓽跣藕蓯蓰蓱蓴蓷蓺蓼蔀蔂蔃蔆蔇蔉蔊蔋蔌蔎蔕蔘蔙蔞蔟鍔蔣雯蔦蔯蔳蔴蔵蔸蔾蕁蕆蕋蕍蕎蕐蕑蕓蕕蕖蕗蕝蕞蕠蕡蕢蕣蕤蕨蕳蕷蕸蕺蕻薀薁薃薅薆薈薉薌薏薐薔薖薘薙諤釵薜薠薢薤薧薨薫薬薳薶薷薸薽薾薿藄藇藋藎藐藙藚藟藦藳藴藶藷藾蘀蘁蘄蘋蘗蘘蘝蘤蘧蘩蘸蘼虀虆虍蟠虒虓虖虡虣虥虩虯虰蛵虵虷鱒虺虼蚆蚈蚋蚓蚔蚖蚘蚜蚡蚣蚧蚨蚩蚪蚯蚰蜒蚱蚳蚶蚹蚺蚻蚿蛀蛁蛄蛅蝮蛌蛍蛐蟮蛑蛓蛔蛘蛚蛜蛡蛣蜊蛩蛺蛻螫蜅蜆蜈蝣蜋蜍蜎蜑蠊蜛餞蜞蜣蜨蜩蜮蜱蜷蜺蜾蜿蝀蝃蝋蝌蝍蝎蝏蝗蝘蝙蝝鱝蝡蝤蝥蝯蝰蝱蝲蝴蝻螃蠏螄螉螋螒螓螗螘螙螚蟥螟螣螥螬螭螮螾螿蟀蟅蟈蟊蟋蟑蟓蟛蟜蟟蟢蟣蟨蟪蟭蟯蟳蟶蟷蟺蟿蠁蠂蠃蠆蠋蠐蠓蠔蠗蠙蠚蠛蠜蠧蠨蠩蠭蠮蠰蠲蠵蠸蠼蠽衁衂衄衇衈衉衋衎衒衕衖衚衞裳鈎衭衲衵衹衺衿袈裟袗袚袟袢袪袮袲袴袷袺袼褙袽裀裉裊裋裌裍裎裒裛裯裱裲裴裾褀褂褉褊褌褎褐褒褓褔褕褘褚褡褢褦褧褪褫褭褯褰褱襠褸褽褾襁襃襆襇襉襋襌襏襚襛襜襝襞襡襢襤襦襫襬襭襮襴襶襼襽襾覂覃覅覇覉覊覌覗覘覚覜覥覦覧覩覬覯覰観覿觔觕觖觜觽觝觡酲觩觫觭觱觳觶觷觼觾觿言賅訃訇訏訑訒詁託訧訬訳訹証訾詀詅詆譭詈詊詎詑詒詖詗詘詧詨詵詶詸詹詻詼詿誂誃誄鋤誆誋誑誒誖誙誚誥誧説読誯誶誾諂諄諆諌諍諏諑諕諗諛諝諞諟諠諡諴諵諶諼謄謆謇謌謍謏謑謖謚謡謦謪謫謳謷謼謾譁譅譆譈譊譌譒譔譖鑫譞譟譩譫譬譱譲譴譸譹譾讅讆讋讌讎讐讒讖讙讜讟谽豁豉豇豈豊豋豌豏豔豞豖豗豜豝豣豦豨豭豱豳豵豶豷豺豻貅貆貍貎貔貘貙貜貤饜貰餸貺賁賂賏賒賕賙賝賡賧賨賫鬭賮賵賸賺賻賾贇贉贐贔贕贗赬赭赱赳迄趁趂趄趐趑趒趔趡趦趫趮趯趲趴趵趷趹趺趿跁跂跅跆躓蹌跐跕跖跗跙跛跦跧跩跫跬跮跱跲跴跺跼跽踅踆踈踉踊踒���踘踜踟躇躕踠踡踣踤踥踦踧蹺踫踮踰踱踴踶踹踺踼踽躞蹁蹂躪蹎蹐蹓蹔蹕蹚蹜蹝蹟蹠蹡蹢躂蹧蹩蹪蹯鞠蹽躃躄躅躊躋躐躑躒躘躙躛躝躠躡躦躧躩躭躰躳躶軃軆輥軏軔軘軜軝齶転軥軨軭軱軲轆軷軹軺軽軿輀輂輦輅輇輈輓輗輙輜輞輠輤輬輭輮輳輴輵輶輹輼輾轀轇轏轑轒轔轕轖轗轘轙轝轞轢轤辠辢辤辵辶辺込辿迅迋迍麿迓迣迤邐迥迨迮迸迺迻迿逄逅逌逍逑逓逕逖逡逭逯逴逶逹遄遅遉遘遛遝遢遨遫遯遰遴遶遹遻邂邅邉邋邎邕邗邘邛邠邢邧邨邯鄲邰邲邳邴邶邷邽邾邿郃郄郇郈郔郕郗郙郚郜郝郞郟郠郢郪郫郯郰郲郳郴郷郹郾郿鄀鄄鄆鄇鄈鄋鄍鄎鄏鄐鄑鄒鄔鄕鄖鄗鄘鄚鄜鄞鄠鄢鄣鄤鄦鄩鄫鄬鄮鄯鄱鄶鄷鄹鄺鄻鄾鄿酃酅酆酇酈酊酋酎酏酐酣酔酕醄酖酗酞酡酢酤酩酴酹酺醁醅醆醊醍醐醑醓醖醝醞醡醤醨醪醭醯醰醱醲醴醵醸醹醼醽醾釂釃釅釆釈鱸鎦閶釓釔釕鈀釙鼢鼴釤釧釪釬釭釱釷釸釹鈁鈃鈄鈆鈇鈈鈊鈌鈐鈑鈒鈤鈥鈧鈬鈮鈰鈳鐺鈸鈹鈽鈿鉄鉆鉈鉋鉌鉍鉏鉑鉕鉚鉢鉥鉦鉨鉬鉭鉱鉲鉶鉸鉺鉼鉿銍銎銑銕鏤銚銛銠銣銤銥銦銧銩銪銫銭銰銲銶銻銼銾鋂鋃鋆鋈鋊鋌鋍鋏鋐鋑鋕鋘鋙鋝鋟鋦鋨鋩鋭鋮鋯鋰鋱鋳鋹鋺鋻鏰鐱錀錁錆錇錈錍錏錒錔錙錚錛錞錟錡錤錩錬録錸錼鍀鍆鍇鍉鍍鍏鍐鍘鍚鍛鍠鍤鍥鍩鍫鍭鍱鍴鍶鍹鍺鍼鍾鎄鎇鎉鎋鎌鎍鎏鎒鎓鎗鎘鎚鎞鎡鎤鎩鎪鎭鎯鎰鎳鎴鎵鎸鎹鎿鏇鏊鏌鏐鏑鏖鏗鏘鏚鏜鏝鏞鏠鏦鏨鏷鏸鏹鏻鏽鏾鐃鐄鐇鐏鐒鐓鐔鐗馗鐙鐝鐠鐡鐦鐨鐩鐫鐬鐱鐳鐶鐻鐽鐿鑀鑅鑌鑐鑕鑚鑛鑢鑤鑥鑪鑭鑯鑱鑴鑵鑷钁钃镻閆閈閌閎閒閔閗閟閡関閤閤閧閬閲閹閺閻閼閽閿闇闉闋闐闑闒闓闘闚闞闟闠闤闥阞阢阤阨阬阯阹阼阽陁陑陔陛陜陡陥陬騭陴険陼陾隂隃隈隒隗隞隠隣隤隩隮隰顴隳隷隹雂雈雉雊雎雑雒雗雘雚雝雟雩雰雱驛霂霅霈霊霑霒霓霙霝霢霣霤霨霩霪霫霮靁靆靉靑靚靣靦靪靮靰靳靷靸靺靼靿鞀鞃鞄鞌鞗鞙鞚鞝鞞鞡鞣鞨鞫鞬鞮鞶鞹鞾韃韅韉馱韍韎韔韖韘韝韞韡韣韭韮韱韹韺頀颳頄頇頊頍頎頏頒頖頞頠頫頬顱頯頲頴頼顇顋顑顒顓顔顕顚顜顢顣顬顳颭颮颱颶颸颺颻颽颾颿飀飂飈飌飜飡飣飤飥飩飫飮飱飶餀餂餄餎餇餈餑餔餕餖餗餚餛餜餟餠餤餧餩餪餫餬餮餱餲餳餺餻餼餽餿饁饅饇饉饊饍饎饐饘饟饢馘馥馝馡馣騮騾馵馹駃駄駅駆駉駋駑駓駔駗駘駙駜駡駢駪駬駰駴駸駹駽駾騂騄騅騆騉騋騍騏驎騑騒験騕騖騠騢騣騤騧驤騵騶騸騺驀驂驃驄驆驈驊驌驍驎驏驒驔驖驙驦驩驫骺鯁骫骭骯骱骴骶骷髏骾髁髂髄髆髈髐髑髕髖髙髝髞髟髡髣髧髪髫髭髯髲髳髹髺髽髾鬁鬃鬅鬈鬋鬎鬏鬐鬑鬒鬖鬗鬘鬙鬠鬣鬪鬫鬬鬮鬯鬰鬲鬵鬷魆魈魊魋魍魎魑魖鰾魛魟魣魦魨魬魴魵魸鮀鮁鮆鮌鮎鮑鮒鮓鮚鮞鮟鱇鮠鮦鮨鮪鮭鮶鮸鮿鯀鯄鯆鯇鯈鯔鯕鯖鯗鯙鯠鯤鯥鯫鯰鯷鯸鯿鰂鰆鶼鰉鰋鰐鰒鰕鰛鰜鰣鰤鰥鰦鰨鰩鰮鰳鰶鰷鱺鰼鰽鱀鱄鱅鱆鱈鱎鱐鱓鱔鱖鱘鱟鱠鱣鱨鱭鱮鱲鱵鱻鲅鳦鳧鳯鳲鳷鳻鴂鴃鴄鴆鴈鴎鴒鴔鴗鴛鴦鴝鵒鴟鴠鴢鴣鴥鴯鶓鴳鴴鴷鴽鵀鵁鵂鵓鵖鵙鵜鶘鵞鵟鵩鵪鵫鵵鵷鵻鵾鶂鶊鶏鶒鶖鶗鶡鶤鶦鶬鶱鶲鶵鶸鶹鶺鶿鷀鷁鷃鷄鷇鷈鷉鷊鷏鷓鷕鷖鷙鷞鷟鷥鷦鷯鷩鷫鷭鷳鷴鷽鷾鷿鸂鸇鸊鸏鸑鸒鸓鸕鸛鸜鸝鹸鹹鹺
麀麂麃麄麇麋麌麐麑麒麚麛麝麤麩麪麫麮麯麰麺麾黁黈黌黢黒黓黕黙黝黟黥黦黧黮黰黱黲黶黹黻黼黽黿鼂鼃鼅鼈鼉鼏鼐鼒鼕鼖鼙鼚鼛鼡鼩鼱鼪鼫鼯鼷鼽齁齆齇齈齉齌齎齏齔齕齗齙齚齜齞齟齬齠齢齣齧齩齮齯齰齱齵齾龎龑龒龔龖龘龝龡龢龤"

assert len(simplified_charcters) == len(traditional_characters)

s2t_dict = {}
t2s_dict = {}
for i, item in enumerate(simplified_charcters):
    s2t_dict[item] = traditional_characters[i]
    t2s_dict[traditional_characters[i]] = item


def tranditional_to_simplified(text: str) -> str:
    return "".join([t2s_dict[item] if item in t2s_dict else item for item in text])


def simplified_to_traditional(text: str) -> str:
    return "".join([s2t_dict[item] if item in s2t_dict else item for item in text])


if __name__ == "__main__":
    text = "一般是指存取一個應用程式啟動時始終顯示在網站或網頁瀏覽器中的一個或多個初始網頁等畫面存在的站點"
    print(text)
    text_simple = tranditional_to_simplified(text)
    print(text_simple)
    text_traditional = simplified_to_traditional(text_simple)
    print(text_traditional)
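Because the two strings are aligned index-by-index, conversion is a per-character table lookup (the original tautological assert is corrected above to compare the two strings' lengths); anything outside the table passes through unchanged, and when several traditional characters share one simplified form, `s2t_dict` keeps whichever occurrence comes last, so round trips are not guaranteed. A quick check of the pass-through behaviour:

# ASCII and punctuation are absent from the tables, so they survive untouched.
print(tranditional_to_simplified("VITS-2023, 測試"))
# -> 'VITS-2023, 测试' (assuming both Han characters appear in the table)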
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/dataset.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
    This code is modified from https://github.com/GitYCC/g2pW
"""
from typing import Dict
from typing import List
from typing import Tuple

import numpy as np

from .utils import tokenize_and_map

ANCHOR_CHAR = "▁"


def prepare_onnx_input(
    tokenizer,
    labels: List[str],
    char2phonemes: Dict[str, List[int]],
    chars: List[str],
    texts: List[str],
    query_ids: List[int],
    use_mask: bool = False,
    window_size: int = None,
    max_len: int = 512,
) -> Dict[str, np.ndarray]:
    if window_size is not None:
        truncated_texts, truncated_query_ids = _truncate_texts(
            window_size=window_size, texts=texts, query_ids=query_ids
        )
    input_ids = []
    token_type_ids = []
    attention_masks = []
    phoneme_masks = []
    char_ids = []
    position_ids = []

    for idx in range(len(texts)):
        text = (truncated_texts if window_size else texts)[idx].lower()
        query_id = (truncated_query_ids if window_size else query_ids)[idx]

        try:
            tokens, text2token, token2text = tokenize_and_map(
                tokenizer=tokenizer, text=text
            )
        except Exception:
            print(f'warning: text "{text}" is invalid')
            return {}

        text, query_id, tokens, text2token, token2text = _truncate(
            max_len=max_len,
            text=text,
            query_id=query_id,
            tokens=tokens,
            text2token=text2token,
            token2text=token2text,
        )

        processed_tokens = ["[CLS]"] + tokens + ["[SEP]"]

        input_id = list(np.array(tokenizer.convert_tokens_to_ids(processed_tokens)))
        token_type_id = list(np.zeros((len(processed_tokens),), dtype=int))
        attention_mask = list(np.ones((len(processed_tokens),), dtype=int))

        query_char = text[query_id]
        phoneme_mask = (
            [1 if i in char2phonemes[query_char] else 0 for i in range(len(labels))]
            if use_mask
            else [1] * len(labels)
        )
        char_id = chars.index(query_char)
        position_id = text2token[query_id] + 1  # shift by one: the [CLS] token occupies the first position

        input_ids.append(input_id)
        token_type_ids.append(token_type_id)
        attention_masks.append(attention_mask)
        phoneme_masks.append(phoneme_mask)
        char_ids.append(char_id)
        position_ids.append(position_id)

    outputs = {
        "input_ids": np.array(input_ids).astype(np.int64),
        "token_type_ids": np.array(token_type_ids).astype(np.int64),
        "attention_masks": np.array(attention_masks).astype(np.int64),
        "phoneme_masks": np.array(phoneme_masks).astype(np.float32),
        "char_ids": np.array(char_ids).astype(np.int64),
        "position_ids": np.array(position_ids).astype(np.int64),
    }
    return outputs


def _truncate_texts(
    window_size: int, texts: List[str], query_ids: List[int]
) -> Tuple[List[str], List[int]]:
    truncated_texts = []
    truncated_query_ids = []
    for text, query_id in zip(texts, query_ids):
        start = max(0, query_id - window_size // 2)
        end = min(len(text), query_id + window_size // 2)
        truncated_text = text[start:end]
        truncated_texts.append(truncated_text)

        truncated_query_id = query_id - start
        truncated_query_ids.append(truncated_query_id)
    return truncated_texts, truncated_query_ids


def _truncate(
    max_len: int,
    text: str,
    query_id: int,
    tokens: List[str],
    text2token: List[int],
    token2text: List[Tuple[int, int]],
):
    truncate_len = max_len - 2
    if len(tokens) <= truncate_len:
        return (text, query_id, tokens, text2token, token2text)

    token_position = text2token[query_id]

    token_start = token_position - truncate_len // 2
    token_end = token_start + truncate_len
    font_exceed_dist = -token_start
    back_exceed_dist = token_end - len(tokens)
    if font_exceed_dist > 0:
        token_start += font_exceed_dist
        token_end += font_exceed_dist
    elif back_exceed_dist > 0:
        token_start -= back_exceed_dist
        token_end -= back_exceed_dist

    start = token2text[token_start][0]
    end = token2text[token_end - 1][1]

    return (
        text[start:end],
        query_id - start,
        tokens[token_start:token_end],
        [i - token_start if i is not None else None for i in text2token[start:end]],
        [(s - start, e - start) for s, e in token2text[token_start:token_end]],
    )


def get_phoneme_labels(
    polyphonic_chars: List[List[str]],
) -> Tuple[List[str], Dict[str, List[int]]]:
    labels = sorted(list(set([phoneme for char, phoneme in polyphonic_chars])))
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        if char not in char2phonemes:
            char2phonemes[char] = []
        char2phonemes[char].append(labels.index(phoneme))
    return labels, char2phonemes


def get_char_phoneme_labels(
    polyphonic_chars: List[List[str]],
) -> Tuple[List[str], Dict[str, List[int]]]:
    labels = sorted(
        list(set([f"{char} {phoneme}" for char, phoneme in polyphonic_chars]))
    )
    char2phonemes = {}
    for char, phoneme in polyphonic_chars:
        if char not in char2phonemes:
            char2phonemes[char] = []
        char2phonemes[char].append(labels.index(f"{char} {phoneme}"))
    return labels, char2phonemes
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/onnx_api.py
ADDED
@@ -0,0 +1,273 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
    This code is modified from https://github.com/GitYCC/g2pW
"""
import json
import os
from typing import Any
from typing import Dict
from typing import List
from typing import Tuple

import numpy as np
import onnxruntime
from opencc import OpenCC
from transformers import BertTokenizer
from pypinyin import pinyin
from pypinyin import Style

from .dataset import get_char_phoneme_labels
from .dataset import get_phoneme_labels
from .dataset import prepare_onnx_input
from .utils import load_config
from .char_convert import tranditional_to_simplified

model_version = "1.1"


def predict(
    session, onnx_input: Dict[str, Any], labels: List[str]
) -> Tuple[List[str], List[float]]:
    all_preds = []
    all_confidences = []
    probs = session.run(
        [],
        {
            "input_ids": onnx_input["input_ids"],
            "token_type_ids": onnx_input["token_type_ids"],
            "attention_mask": onnx_input["attention_masks"],
            "phoneme_mask": onnx_input["phoneme_masks"],
            "char_ids": onnx_input["char_ids"],
            "position_ids": onnx_input["position_ids"],
        },
    )[0]

    preds = np.argmax(probs, axis=1).tolist()
    max_probs = []
    for index, arr in zip(preds, probs.tolist()):
        max_probs.append(arr[index])
    all_preds += [labels[pred] for pred in preds]
    all_confidences += max_probs

    return all_preds, all_confidences


class G2PWOnnxConverter:
    def __init__(
        self,
        model_dir: str = None,
        model_source=None,
        style: str = "bopomofo",
        enable_non_tradional_chinese: bool = False,
    ):
        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = (
            onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
        )
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
        sess_options.intra_op_num_threads = os.cpu_count() - 1
        try:
            self.session_g2pw = onnxruntime.InferenceSession(
                os.path.join(model_dir, "g2pW.onnx"),
                sess_options=sess_options,
                providers=["CUDAExecutionProvider"],
            )
        except Exception:
            # fall back to the default (CPU) provider if CUDA is unavailable
            self.session_g2pw = onnxruntime.InferenceSession(
                os.path.join(model_dir, "g2pW.onnx"), sess_options=sess_options
            )
        self.config = load_config(
            os.path.join(model_dir, "config.py"), use_default=True
        )

        self.model_source = (
            os.path.join(os.path.abspath(os.curdir), model_source)
            if model_source
            else os.path.join(os.path.abspath(os.curdir), self.config.model_source)
        )
        self.enable_opencc = enable_non_tradional_chinese

        self.tokenizer = (
            BertTokenizer.from_pretrained(self.model_source)
            if model_source
            else BertTokenizer.from_pretrained(self.config.model_source)
        )
        polyphonic_chars_path = os.path.join(model_dir, "POLYPHONIC_CHARS.txt")
        monophonic_chars_path = os.path.join(model_dir, "MONOPHONIC_CHARS.txt")

        self.polyphonic_chars = [
            line.split("\t")
            for line in open(polyphonic_chars_path, encoding="utf-8")
            .read()
            .strip()
            .split("\n")
        ]
        self.non_polyphonic = {
            "一",
            "不",
            "和",
            "咋",
            "嗲",
            "剖",
            "差",
            "攢",
            "倒",
            "難",
            "奔",
            "勁",
            "拗",
            "肖",
            "瘙",
            "誒",
            "泊",
            "听",
            "噢",
        }
        self.non_monophonic = {"似", "攢"}
        self.monophonic_chars = [
            line.split("\t")
            for line in open(monophonic_chars_path, encoding="utf-8")
            .read()
            .strip()
            .split("\n")
        ]
        self.labels, self.char2phonemes = (
            get_char_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
            if self.config.use_char_phoneme
            else get_phoneme_labels(polyphonic_chars=self.polyphonic_chars)
        )

        self.chars = sorted(list(self.char2phonemes.keys()))

        self.polyphonic_chars_new = set(self.chars)
        for char in self.non_polyphonic:
            if char in self.polyphonic_chars_new:
                self.polyphonic_chars_new.remove(char)

        self.monophonic_chars_dict = {
            char: phoneme for char, phoneme in self.monophonic_chars
        }
        for char in self.non_monophonic:
            if char in self.monophonic_chars_dict:
                self.monophonic_chars_dict.pop(char)

        self.pos_tags = ["UNK", "A", "C", "D", "I", "N", "P", "T", "V", "DE", "SHI"]

        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "bopomofo_to_pinyin_wo_tune_dict.json",
            ),
            "r",
            encoding="utf-8",
        ) as fr:
            self.bopomofo_convert_dict = json.load(fr)
        self.style_convert_func = {
            "bopomofo": lambda x: x,
            "pinyin": self._convert_bopomofo_to_pinyin,
        }[style]

        with open(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "char_bopomofo_dict.json"
            ),
            "r",
            encoding="utf-8",
        ) as fr:
            self.char_bopomofo_dict = json.load(fr)

        if self.enable_opencc:
            self.cc = OpenCC("s2tw")

    def _convert_bopomofo_to_pinyin(self, bopomofo: str) -> str:
        tone = bopomofo[-1]
        assert tone in "12345"
        component = self.bopomofo_convert_dict.get(bopomofo[:-1])
        if component:
            return component + tone
        else:
            print(f'Warning: "{bopomofo}" cannot be converted to pinyin')
            return None

    def __call__(self, sentences: List[str]) -> List[List[str]]:
        if isinstance(sentences, str):
            sentences = [sentences]

        if self.enable_opencc:
            translated_sentences = []
            for sent in sentences:
                translated_sent = self.cc.convert(sent)
                assert len(translated_sent) == len(sent)
                translated_sentences.append(translated_sent)
            sentences = translated_sentences

        texts, query_ids, sent_ids, partial_results = self._prepare_data(
            sentences=sentences
        )
        if len(texts) == 0:
            # the sentences contain no polyphonic characters
            return partial_results

        onnx_input = prepare_onnx_input(
            tokenizer=self.tokenizer,
            labels=self.labels,
            char2phonemes=self.char2phonemes,
            chars=self.chars,
            texts=texts,
            query_ids=query_ids,
            use_mask=self.config.use_mask,
            window_size=None,
        )

        preds, confidences = predict(
            session=self.session_g2pw, onnx_input=onnx_input, labels=self.labels
        )
        if self.config.use_char_phoneme:
            preds = [pred.split(" ")[1] for pred in preds]

        results = partial_results
        for sent_id, query_id, pred in zip(sent_ids, query_ids, preds):
            results[sent_id][query_id] = self.style_convert_func(pred)

        return results

    def _prepare_data(
        self, sentences: List[str]
    ) -> Tuple[List[str], List[int], List[int], List[List[str]]]:
        texts, query_ids, sent_ids, partial_results = [], [], [], []
        for sent_id, sent in enumerate(sentences):
            # pypinyin works better on Simplified Chinese than on Traditional Chinese
            sent_s = tranditional_to_simplified(sent)
            pypinyin_result = pinyin(
                sent_s, neutral_tone_with_five=True, style=Style.TONE3
            )
            partial_result = [None] * len(sent)
            for i, char in enumerate(sent):
                if char in self.polyphonic_chars_new:
                    texts.append(sent)
                    query_ids.append(i)
                    sent_ids.append(sent_id)
                elif char in self.monophonic_chars_dict:
                    partial_result[i] = self.style_convert_func(
                        self.monophonic_chars_dict[char]
                    )
                elif char in self.char_bopomofo_dict:
                    partial_result[i] = pypinyin_result[i][0]
                    # partial_result[i] = self.style_convert_func(self.char_bopomofo_dict[char][0])
                else:
                    partial_result[i] = pypinyin_result[i][0]

            partial_results.append(partial_result)
        return texts, query_ids, sent_ids, partial_results
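
End-to-end, the converter is used roughly like this. This is a sketch, assuming the g2pW ONNX model and its dictionaries have already been placed in a local directory; "G2PWModel/" is an illustrative path, not one the repo guarantees:

converter = G2PWOnnxConverter(model_dir="G2PWModel/", style="pinyin")
results = converter(["上海的行情怎么样"])
print(results[0])  # one pinyin string per character; polyphones resolved by the ONNX model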
bert_vits2/g2pW/pypinyin_G2pW_bv2/g2pw1/utils.py
ADDED
@@ -0,0 +1,144 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Credits
    This code is modified from https://github.com/GitYCC/g2pW
"""
import os
import re


def wordize_and_map(text: str):
    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []
    while len(text) > 0:
        match_space = re.match(r"^ +", text)
        if match_space:
            space_str = match_space.group(0)
            index_map_from_text_to_word += [None] * len(space_str)
            text = text[len(space_str):]
            continue

        match_en = re.match(r"^[a-zA-Z0-9]+", text)
        if match_en:
            en_word = match_en.group(0)

            word_start_pos = len(index_map_from_text_to_word)
            word_end_pos = word_start_pos + len(en_word)
            index_map_from_word_to_text.append((word_start_pos, word_end_pos))

            index_map_from_text_to_word += [len(words)] * len(en_word)

            words.append(en_word)
            text = text[len(en_word):]
        else:
            word_start_pos = len(index_map_from_text_to_word)
            word_end_pos = word_start_pos + 1
            index_map_from_word_to_text.append((word_start_pos, word_end_pos))

            index_map_from_text_to_word += [len(words)]

            words.append(text[0])
            text = text[1:]
    return words, index_map_from_text_to_word, index_map_from_word_to_text


def tokenize_and_map(tokenizer, text: str):
    words, text2word, word2text = wordize_and_map(text=text)

    tokens = []
    index_map_from_token_to_text = []
    for word, (word_start, word_end) in zip(words, word2text):
        word_tokens = tokenizer.tokenize(word)

        if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
            index_map_from_token_to_text.append((word_start, word_end))
            tokens.append("[UNK]")
        else:
            current_word_start = word_start
            for word_token in word_tokens:
                word_token_len = len(re.sub(r"^##", "", word_token))
                index_map_from_token_to_text.append(
                    (current_word_start, current_word_start + word_token_len)
                )
                current_word_start = current_word_start + word_token_len
                tokens.append(word_token)

    index_map_from_text_to_token = text2word
    for i, (token_start, token_end) in enumerate(index_map_from_token_to_text):
        for token_pos in range(token_start, token_end):
            index_map_from_text_to_token[token_pos] = i

    return tokens, index_map_from_text_to_token, index_map_from_token_to_text


def _load_config(config_path: os.PathLike):
    import importlib.util

    spec = importlib.util.spec_from_file_location("__init__", config_path)
    config = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config)
    return config


default_config_dict = {
    "manual_seed": 1313,
    "model_source": "bert-base-chinese",
    "window_size": 32,
    "num_workers": 2,
    "use_mask": True,
    "use_char_phoneme": False,
    "use_conditional": True,
    "param_conditional": {
        "affect_location": "softmax",
        "bias": True,
        "char-linear": True,
        "pos-linear": False,
        "char+pos-second": True,
        "char+pos-second_lowrank": False,
        "lowrank_size": 0,
        "char+pos-second_fm": False,
        "fm_size": 0,
        "fix_mode": None,
        "count_json": "train.count.json",
    },
    "lr": 5e-5,
    "val_interval": 200,
    "num_iter": 10000,
    "use_focal": False,
    "param_focal": {"alpha": 0.0, "gamma": 0.7},
    "use_pos": True,
    "param_pos ": {
        "weight": 0.1,
        "pos_joint_training": True,
        "train_pos_path": "train.pos",
        "valid_pos_path": "dev.pos",
        "test_pos_path": "test.pos",
    },
}


def load_config(config_path: os.PathLike, use_default: bool = False):
    config = _load_config(config_path)
    if use_default:
        for attr, val in default_config_dict.items():
            if not hasattr(config, attr):
                setattr(config, attr, val)
            elif isinstance(val, dict):
                d = getattr(config, attr)
                for dict_k, dict_v in val.items():
                    if dict_k not in d:
                        d[dict_k] = dict_v
    return config
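
wordize_and_map splits ASCII runs into whole words and every other character into its own unit, keeping bidirectional index maps between text positions and word positions. A small runnable check:

words, text2word, word2text = wordize_and_map("abc 你好")
print(words)      # ['abc', '你', '好']
print(text2word)  # [0, 0, 0, None, 1, 2]  (the space maps to no word)
print(word2text)  # [(0, 3), (4, 5), (5, 6)]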
bert_vits2/get_emo.py
ADDED
@@ -0,0 +1,92 @@
import librosa
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

from contants import config


class RegressionHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config):
        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = RegressionHead(config)
        self.init_weights()

    def forward(
        self,
        input_values,
    ):
        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)

        return hidden_states, logits


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    model: EmotionModel,
    processor: Wav2Vec2Processor,
    device: str,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict emotions or extract embeddings from a raw audio signal."""
    model = model.to(device)
    y = processor(x, sampling_rate=sampling_rate)
    y = y["input_values"][0]
    y = torch.from_numpy(y).unsqueeze(0).to(device)

    # run through the model
    with torch.no_grad():
        y = model(y)[0 if embeddings else 1]

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y


def get_emo(audio, emotion_model, processor):
    # resample to 16 kHz; newer librosa requires sr as a keyword argument
    wav, sr = librosa.load(audio, sr=16000)
    device = config.system.device
    return process_func(
        # np.float was removed from NumPy; the builtin float (float64) is equivalent
        np.expand_dims(wav, 0).astype(float),
        sr,
        emotion_model,
        processor,
        device,
        embeddings=True,
    ).squeeze(0)
bert_vits2/models.py
ADDED
@@ -0,0 +1,799 @@
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
|
6 |
+
from bert_vits2 import commons
|
7 |
+
from bert_vits2 import modules
|
8 |
+
from bert_vits2 import attentions
|
9 |
+
|
10 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
11 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
12 |
+
from vector_quantize_pytorch import VectorQuantize
|
13 |
+
|
14 |
+
from bert_vits2.commons import init_weights, get_padding
|
15 |
+
from bert_vits2.text import num_languages
|
16 |
+
|
17 |
+
|
18 |
+
class DurationDiscriminator(nn.Module): # vits2
|
19 |
+
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
|
20 |
+
super().__init__()
|
21 |
+
|
22 |
+
self.in_channels = in_channels
|
23 |
+
self.filter_channels = filter_channels
|
24 |
+
self.kernel_size = kernel_size
|
25 |
+
self.p_dropout = p_dropout
|
26 |
+
self.gin_channels = gin_channels
|
27 |
+
|
28 |
+
self.drop = nn.Dropout(p_dropout)
|
29 |
+
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
30 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
31 |
+
self.conv_2 = nn.Conv1d(
|
32 |
+
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
33 |
+
)
|
34 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
35 |
+
self.dur_proj = nn.Conv1d(1, filter_channels, 1)
|
36 |
+
|
37 |
+
self.pre_out_conv_1 = nn.Conv1d(2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
38 |
+
self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
|
39 |
+
self.pre_out_conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
40 |
+
self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
|
41 |
+
|
42 |
+
if gin_channels != 0:
|
43 |
+
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
|
44 |
+
|
45 |
+
self.output_layer = nn.Sequential(
|
46 |
+
nn.Linear(filter_channels, 1),
|
47 |
+
nn.Sigmoid()
|
48 |
+
)
|
49 |
+
|
50 |
+
def forward_probability(self, x, x_mask, dur, g=None):
|
51 |
+
dur = self.dur_proj(dur)
|
52 |
+
x = torch.cat([x, dur], dim=1)
|
53 |
+
x = self.pre_out_conv_1(x * x_mask)
|
54 |
+
x = torch.relu(x)
|
55 |
+
x = self.pre_out_norm_1(x)
|
56 |
+
x = self.drop(x)
|
57 |
+
x = self.pre_out_conv_2(x * x_mask)
|
58 |
+
x = torch.relu(x)
|
59 |
+
x = self.pre_out_norm_2(x)
|
60 |
+
x = self.drop(x)
|
61 |
+
x = x * x_mask
|
62 |
+
x = x.transpose(1, 2)
|
63 |
+
output_prob = self.output_layer(x)
|
64 |
+
return output_prob
|
65 |
+
|
66 |
+
def forward(self, x, x_mask, dur_r, dur_hat, g=None):
|
67 |
+
x = torch.detach(x)
|
68 |
+
if g is not None:
|
69 |
+
g = torch.detach(g)
|
70 |
+
x = x + self.cond(g)
|
71 |
+
x = self.conv_1(x * x_mask)
|
72 |
+
x = torch.relu(x)
|
73 |
+
x = self.norm_1(x)
|
74 |
+
x = self.drop(x)
|
75 |
+
x = self.conv_2(x * x_mask)
|
76 |
+
x = torch.relu(x)
|
77 |
+
x = self.norm_2(x)
|
78 |
+
x = self.drop(x)
|
79 |
+
|
80 |
+
output_probs = []
|
81 |
+
for dur in [dur_r, dur_hat]:
|
82 |
+
output_prob = self.forward_probability(x, x_mask, dur, g)
|
83 |
+
output_probs.append(output_prob)
|
84 |
+
|
85 |
+
return output_probs
|
86 |
+
|
87 |
+
|
88 |
+
class Block(nn.Module):
|
89 |
+
def __init__(self, in_dim, hidden_dim) -> None:
|
90 |
+
super().__init__()
|
91 |
+
self.norm = nn.LayerNorm(in_dim)
|
92 |
+
self.mlp = MLP(in_dim, hidden_dim)
|
93 |
+
|
94 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
95 |
+
x = x + self.mlp(self.norm(x))
|
96 |
+
return x
|
97 |
+
|
98 |
+
|
99 |
+
class MLP(nn.Module):
|
100 |
+
def __init__(self, in_dim, hidden_dim):
|
101 |
+
super().__init__()
|
102 |
+
self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
|
103 |
+
self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
|
104 |
+
self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
|
105 |
+
|
106 |
+
def forward(self, x: torch.Tensor):
|
107 |
+
x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
|
108 |
+
x = self.c_proj(x)
|
109 |
+
return x
|
110 |
+
|
111 |
+
|
112 |
+
class TransformerCouplingBlock(nn.Module):
|
113 |
+
def __init__(self,
|
114 |
+
channels,
|
115 |
+
hidden_channels,
|
116 |
+
filter_channels,
|
117 |
+
n_heads,
|
118 |
+
n_layers,
|
119 |
+
kernel_size,
|
120 |
+
p_dropout,
|
121 |
+
n_flows=4,
|
122 |
+
gin_channels=0,
|
123 |
+
share_parameter=False
|
124 |
+
):
|
125 |
+
|
126 |
+
super().__init__()
|
127 |
+
self.channels = channels
|
128 |
+
self.hidden_channels = hidden_channels
|
129 |
+
self.kernel_size = kernel_size
|
130 |
+
self.n_layers = n_layers
|
131 |
+
self.n_flows = n_flows
|
132 |
+
self.gin_channels = gin_channels
|
133 |
+
|
134 |
+
self.flows = nn.ModuleList()
|
135 |
+
|
136 |
+
self.wn = attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout,
|
137 |
+
isflow=True, gin_channels=self.gin_channels) if share_parameter else None
|
138 |
+
|
139 |
+
for i in range(n_flows):
|
140 |
+
self.flows.append(
|
141 |
+
modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers, n_heads, p_dropout,
|
142 |
+
filter_channels, mean_only=True, wn_sharing_parameter=self.wn,
|
143 |
+
gin_channels=self.gin_channels))
|
144 |
+
self.flows.append(modules.Flip())
|
145 |
+
|
146 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
147 |
+
if not reverse:
|
148 |
+
for flow in self.flows:
|
149 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
150 |
+
else:
|
151 |
+
for flow in reversed(self.flows):
|
152 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
153 |
+
return x
|
154 |
+
|
155 |
+
|
156 |
+
class StochasticDurationPredictor(nn.Module):
|
157 |
+
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
|
158 |
+
super().__init__()
|
159 |
+
filter_channels = in_channels # it needs to be removed from future version.
|
160 |
+
self.in_channels = in_channels
|
161 |
+
self.filter_channels = filter_channels
|
162 |
+
self.kernel_size = kernel_size
|
163 |
+
self.p_dropout = p_dropout
|
164 |
+
self.n_flows = n_flows
|
165 |
+
self.gin_channels = gin_channels
|
166 |
+
|
167 |
+
self.log_flow = modules.Log()
|
168 |
+
self.flows = nn.ModuleList()
|
169 |
+
self.flows.append(modules.ElementwiseAffine(2))
|
170 |
+
for i in range(n_flows):
|
171 |
+
self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
|
172 |
+
self.flows.append(modules.Flip())
|
173 |
+
|
174 |
+
self.post_pre = nn.Conv1d(1, filter_channels, 1)
|
175 |
+
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
176 |
+
self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
|
177 |
+
self.post_flows = nn.ModuleList()
|
178 |
+
self.post_flows.append(modules.ElementwiseAffine(2))
|
179 |
+
for i in range(4):
|
180 |
+
self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
|
181 |
+
self.post_flows.append(modules.Flip())
|
182 |
+
|
183 |
+
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
|
184 |
+
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
185 |
+
self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
|
186 |
+
if gin_channels != 0:
|
187 |
+
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
|
188 |
+
|
189 |
+
def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
|
190 |
+
x = torch.detach(x)
|
191 |
+
x = self.pre(x)
|
192 |
+
if g is not None:
|
193 |
+
g = torch.detach(g)
|
194 |
+
x = x + self.cond(g)
|
195 |
+
x = self.convs(x, x_mask)
|
196 |
+
x = self.proj(x) * x_mask
|
197 |
+
|
198 |
+
if not reverse:
|
199 |
+
flows = self.flows
|
200 |
+
assert w is not None
|
201 |
+
|
202 |
+
logdet_tot_q = 0
|
203 |
+
h_w = self.post_pre(w)
|
204 |
+
h_w = self.post_convs(h_w, x_mask)
|
205 |
+
h_w = self.post_proj(h_w) * x_mask
|
206 |
+
e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
|
207 |
+
z_q = e_q
|
208 |
+
for flow in self.post_flows:
|
209 |
+
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
|
210 |
+
logdet_tot_q += logdet_q
|
211 |
+
z_u, z1 = torch.split(z_q, [1, 1], 1)
|
212 |
+
u = torch.sigmoid(z_u) * x_mask
|
213 |
+
z0 = (w - u) * x_mask
|
214 |
+
logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
|
215 |
+
logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q
|
216 |
+
|
217 |
+
logdet_tot = 0
|
218 |
+
z0, logdet = self.log_flow(z0, x_mask)
|
219 |
+
logdet_tot += logdet
|
220 |
+
z = torch.cat([z0, z1], 1)
|
221 |
+
for flow in flows:
|
222 |
+
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
|
223 |
+
logdet_tot = logdet_tot + logdet
|
224 |
+
nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
|
225 |
+
return nll + logq # [b]
|
226 |
+
else:
|
227 |
+
flows = list(reversed(self.flows))
|
228 |
+
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
|
229 |
+
z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
|
230 |
+
for flow in flows:
|
231 |
+
z = flow(z, x_mask, g=x, reverse=reverse)
|
232 |
+
z0, z1 = torch.split(z, [1, 1], 1)
|
233 |
+
logw = z0
|
234 |
+
return logw
|
235 |
+
|
236 |
+
|
237 |
+
class DurationPredictor(nn.Module):
|
238 |
+
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
|
239 |
+
super().__init__()
|
240 |
+
|
241 |
+
self.in_channels = in_channels
|
242 |
+
self.filter_channels = filter_channels
|
243 |
+
self.kernel_size = kernel_size
|
244 |
+
self.p_dropout = p_dropout
|
245 |
+
self.gin_channels = gin_channels
|
246 |
+
|
247 |
+
self.drop = nn.Dropout(p_dropout)
|
248 |
+
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
249 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
250 |
+
self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
251 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
252 |
+
self.proj = nn.Conv1d(filter_channels, 1, 1)
|
253 |
+
|
254 |
+
if gin_channels != 0:
|
255 |
+
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
|
256 |
+
|
257 |
+
def forward(self, x, x_mask, g=None):
|
258 |
+
x = torch.detach(x)
|
259 |
+
if g is not None:
|
260 |
+
g = torch.detach(g)
|
261 |
+
x = x + self.cond(g)
|
262 |
+
x = self.conv_1(x * x_mask)
|
263 |
+
x = torch.relu(x)
|
264 |
+
x = self.norm_1(x)
|
265 |
+
x = self.drop(x)
|
266 |
+
x = self.conv_2(x * x_mask)
|
267 |
+
x = torch.relu(x)
|
268 |
+
x = self.norm_2(x)
|
269 |
+
x = self.drop(x)
|
270 |
+
x = self.proj(x * x_mask)
|
271 |
+
return x * x_mask
|
272 |
+
|
273 |
+
|
274 |
+
class TextEncoder(nn.Module):
|
275 |
+
def __init__(self,
|
276 |
+
n_vocab,
|
277 |
+
out_channels,
|
278 |
+
hidden_channels,
|
279 |
+
filter_channels,
|
280 |
+
n_heads,
|
281 |
+
n_layers,
|
282 |
+
kernel_size,
|
283 |
+
p_dropout,
|
284 |
+
n_speakers,
|
285 |
+
gin_channels=0,
|
286 |
+
symbols=None,
|
287 |
+
ja_bert_dim=1024,
|
288 |
+
num_tones=None,
|
289 |
+
emotion_embedding=1,
|
290 |
+
zh_bert_extra=False,
|
291 |
+
):
|
292 |
+
super().__init__()
|
293 |
+
self.n_vocab = n_vocab
|
294 |
+
self.out_channels = out_channels
|
295 |
+
self.hidden_channels = hidden_channels
|
296 |
+
self.filter_channels = filter_channels
|
297 |
+
self.n_heads = n_heads
|
298 |
+
self.n_layers = n_layers
|
299 |
+
self.kernel_size = kernel_size
|
300 |
+
self.p_dropout = p_dropout
|
301 |
+
self.gin_channels = gin_channels
|
302 |
+
self.emb = nn.Embedding(len(symbols), hidden_channels)
|
303 |
+
nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
|
304 |
+
self.tone_emb = nn.Embedding(num_tones, hidden_channels)
|
305 |
+
nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels ** -0.5)
|
306 |
+
self.language_emb = nn.Embedding(num_languages, hidden_channels)
|
307 |
+
nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels ** -0.5)
|
308 |
+
self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
309 |
+
self.zh_bert_extra = zh_bert_extra
|
310 |
+
if self.zh_bert_extra:
|
311 |
+
self.bert_pre_proj = nn.Conv1d(2048, 1024, 1)
|
312 |
+
self.ja_bert_proj = nn.Conv1d(ja_bert_dim, hidden_channels, 1)
|
313 |
+
self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
|
314 |
+
self.emotion_embedding = emotion_embedding
|
315 |
+
|
316 |
+
if self.emotion_embedding == 1:
|
317 |
+
self.emo_proj = nn.Linear(1024, 1024)
|
318 |
+
self.emo_quantizer = VectorQuantize(
|
319 |
+
dim=1024,
|
320 |
+
codebook_size=10,
|
321 |
+
decay=0.8,
|
322 |
+
commitment_weight=1.0,
|
323 |
+
learnable_codebook=True,
|
324 |
+
ema_update=False,
|
325 |
+
)
|
326 |
+
self.emo_q_proj = nn.Linear(1024, hidden_channels)
|
327 |
+
elif self.emotion_embedding == 2:
|
328 |
+
self.in_feature_net = nn.Sequential(
|
329 |
+
# input is assumed to an already normalized embedding
|
330 |
+
nn.Linear(512, 1028, bias=False),
|
331 |
+
nn.GELU(),
|
332 |
+
nn.LayerNorm(1028),
|
333 |
+
*[Block(1028, 512) for _ in range(1)],
|
334 |
+
nn.Linear(1028, 512, bias=False),
|
335 |
+
# normalize before passing to VQ?
|
336 |
+
# nn.GELU(),
|
337 |
+
# nn.LayerNorm(512),
|
338 |
+
)
|
339 |
+
self.emo_vq = VectorQuantize(
|
340 |
+
dim=512,
|
341 |
+
codebook_size=64,
|
342 |
+
codebook_dim=32,
|
343 |
+
commitment_weight=0.1,
|
344 |
+
decay=0.85,
|
345 |
+
heads=32,
|
346 |
+
kmeans_iters=20,
|
347 |
+
separate_codebook_per_head=True,
|
348 |
+
stochastic_sample_codes=True,
|
349 |
+
threshold_ema_dead_code=2,
|
350 |
+
)
|
351 |
+
self.out_feature_net = nn.Linear(512, hidden_channels)
|
352 |
+
|
353 |
+
self.encoder = attentions.Encoder(
|
354 |
+
hidden_channels,
|
355 |
+
filter_channels,
|
356 |
+
n_heads,
|
357 |
+
n_layers,
|
358 |
+
kernel_size,
|
359 |
+
p_dropout,
|
360 |
+
gin_channels=self.gin_channels)
|
361 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
362 |
+
|
363 |
+
def forward(self, x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo=None, sid=None, g=None):
|
364 |
+
x = self.emb(x) + self.tone_emb(tone) + self.language_emb(language)
|
365 |
+
|
366 |
+
if self.zh_bert_extra:
|
367 |
+
zh_bert = self.bert_pre_proj(zh_bert)
|
368 |
+
x += self.bert_proj(zh_bert).transpose(1, 2)
|
369 |
+
x += self.ja_bert_proj(ja_bert).transpose(1, 2)
|
370 |
+
x += self.en_bert_proj(en_bert).transpose(1, 2)
|
371 |
+
|
372 |
+
x *= math.sqrt(self.hidden_channels) # [b, t, h]
|
373 |
+
if self.emotion_embedding == 1:
|
374 |
+
# emo = emo.to(zh_bert_emb.device)
|
375 |
+
if emo.size(-1) == 1024:
|
376 |
+
emo_emb = self.emo_proj(emo.unsqueeze(1))
|
377 |
+
emo_emb_ = []
|
378 |
+
for i in range(emo_emb.size(0)):
|
379 |
+
temp_emo_emb, _, _ = self.emo_quantizer(
|
380 |
+
emo_emb[i].unsqueeze(0).to(emo.device)
|
381 |
+
)
|
382 |
+
emo_emb_.append(temp_emo_emb)
|
383 |
+
emo_emb = torch.cat(emo_emb_, dim=0).to(emo_emb.device)
|
384 |
+
else:
|
385 |
+
emo_emb = (
|
386 |
+
self.emo_quantizer.get_output_from_indices(emo.to(torch.long))
|
387 |
+
.unsqueeze(0)
|
388 |
+
.to(emo.device)
|
389 |
+
)
|
390 |
+
|
391 |
+
x += self.emo_q_proj(emo_emb)
|
392 |
+
elif self.emotion_embedding == 2:
|
393 |
+
emo_emb = self.in_feature_net(emo)
|
394 |
+
emo_emb, _, _ = self.emo_vq(emo_emb.unsqueeze(1))
|
395 |
+
emo_emb = self.out_feature_net(emo_emb)
|
396 |
+
x += emo_emb
|
397 |
+
|
398 |
+
x *= math.sqrt(self.hidden_channels) # [b, t, h]
|
399 |
+
x = torch.transpose(x, 1, -1) # [b, h, t]
|
400 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
401 |
+
|
402 |
+
x = self.encoder(x * x_mask, x_mask, g=g)
|
403 |
+
stats = self.proj(x) * x_mask
|
404 |
+
|
405 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
406 |
+
return x, m, logs, x_mask
|
407 |
+
|
408 |
+
|
409 |
+
class ResidualCouplingBlock(nn.Module):
|
410 |
+
def __init__(self,
|
411 |
+
channels,
|
412 |
+
hidden_channels,
|
413 |
+
kernel_size,
|
414 |
+
dilation_rate,
|
415 |
+
n_layers,
|
416 |
+
n_flows=4,
|
417 |
+
gin_channels=0):
|
418 |
+
super().__init__()
|
419 |
+
self.channels = channels
|
420 |
+
self.hidden_channels = hidden_channels
|
421 |
+
self.kernel_size = kernel_size
|
422 |
+
self.dilation_rate = dilation_rate
|
423 |
+
self.n_layers = n_layers
|
424 |
+
self.n_flows = n_flows
|
425 |
+
self.gin_channels = gin_channels
|
426 |
+
|
427 |
+
self.flows = nn.ModuleList()
|
428 |
+
for i in range(n_flows):
|
429 |
+
self.flows.append(
|
430 |
+
modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers,
|
431 |
+
gin_channels=gin_channels, mean_only=True))
|
432 |
+
self.flows.append(modules.Flip())
|
433 |
+
|
434 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
435 |
+
if not reverse:
|
436 |
+
for flow in self.flows:
|
437 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
438 |
+
else:
|
439 |
+
for flow in reversed(self.flows):
|
440 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
441 |
+
return x
|
442 |
+
|
443 |
+
|
444 |
+
class PosteriorEncoder(nn.Module):
|
445 |
+
def __init__(self,
|
446 |
+
in_channels,
|
447 |
+
out_channels,
|
448 |
+
hidden_channels,
|
449 |
+
kernel_size,
|
450 |
+
dilation_rate,
|
451 |
+
n_layers,
|
452 |
+
gin_channels=0):
|
453 |
+
super().__init__()
|
454 |
+
self.in_channels = in_channels
|
455 |
+
self.out_channels = out_channels
|
456 |
+
self.hidden_channels = hidden_channels
|
457 |
+
self.kernel_size = kernel_size
|
458 |
+
self.dilation_rate = dilation_rate
|
459 |
+
self.n_layers = n_layers
|
460 |
+
self.gin_channels = gin_channels
|
461 |
+
|
462 |
+
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
463 |
+
self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
|
464 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
465 |
+
|
466 |
+
def forward(self, x, x_lengths, g=None):
|
467 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
468 |
+
x = self.pre(x) * x_mask
|
469 |
+
x = self.enc(x, x_mask, g=g)
|
470 |
+
stats = self.proj(x) * x_mask
|
471 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
472 |
+
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
|
473 |
+
return z, m, logs, x_mask
|
474 |
+
|
475 |
+
|
476 |
+
class Generator(torch.nn.Module):
|
477 |
+
def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
|
478 |
+
upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
|
479 |
+
super(Generator, self).__init__()
|
480 |
+
self.num_kernels = len(resblock_kernel_sizes)
|
481 |
+
self.num_upsamples = len(upsample_rates)
|
482 |
+
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
|
483 |
+
resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
|
484 |
+
|
485 |
+
self.ups = nn.ModuleList()
|
486 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
487 |
+
self.ups.append(weight_norm(
|
488 |
+
ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
|
489 |
+
k, u, padding=(k - u) // 2)))
|
490 |
+
|
491 |
+
self.resblocks = nn.ModuleList()
|
492 |
+
for i in range(len(self.ups)):
|
493 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
494 |
+
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
|
495 |
+
self.resblocks.append(resblock(ch, k, d))
|
496 |
+
|
497 |
+
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
498 |
+
self.ups.apply(init_weights)
|
499 |
+
|
500 |
+
if gin_channels != 0:
|
501 |
+
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
|
502 |
+
|
503 |
+
def forward(self, x, g=None):
|
504 |
+
x = self.conv_pre(x)
|
505 |
+
if g is not None:
|
506 |
+
x = x + self.cond(g)
|
507 |
+
|
508 |
+
for i in range(self.num_upsamples):
|
509 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
510 |
+
x = self.ups[i](x)
|
511 |
+
xs = None
|
512 |
+
for j in range(self.num_kernels):
|
513 |
+
if xs is None:
|
514 |
+
xs = self.resblocks[i * self.num_kernels + j](x)
|
515 |
+
else:
|
516 |
+
xs += self.resblocks[i * self.num_kernels + j](x)
|
517 |
+
x = xs / self.num_kernels
|
518 |
+
x = F.leaky_relu(x)
|
519 |
+
x = self.conv_post(x)
|
520 |
+
x = torch.tanh(x)
|
521 |
+
|
522 |
+
return x
|
523 |
+
|
524 |
+
def remove_weight_norm(self):
|
525 |
+
print('Removing weight norm...')
|
526 |
+
for l in self.ups:
|
527 |
+
remove_weight_norm(l)
|
528 |
+
for l in self.resblocks:
|
529 |
+
l.remove_weight_norm()
|
530 |
+
|
531 |
+
|
532 |
+
class DiscriminatorP(torch.nn.Module):
|
533 |
+
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
534 |
+
super(DiscriminatorP, self).__init__()
|
535 |
+
self.period = period
|
536 |
+
self.use_spectral_norm = use_spectral_norm
|
537 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
538 |
+
self.convs = nn.ModuleList([
|
539 |
+
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
540 |
+
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
541 |
+
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
542 |
+
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
|
543 |
+
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
|
544 |
+
])
|
545 |
+
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
546 |
+
|
547 |
+
def forward(self, x):
|
548 |
+
fmap = []
|
549 |
+
|
550 |
+
# 1d to 2d
|
551 |
+
b, c, t = x.shape
|
552 |
+
if t % self.period != 0: # pad first
|
553 |
+
n_pad = self.period - (t % self.period)
|
554 |
+
x = F.pad(x, (0, n_pad), "reflect")
|
555 |
+
t = t + n_pad
|
556 |
+
x = x.view(b, c, t // self.period, self.period)
|
557 |
+
|
558 |
+
for l in self.convs:
|
559 |
+
x = l(x)
|
560 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
561 |
+
fmap.append(x)
|
562 |
+
x = self.conv_post(x)
|
563 |
+
fmap.append(x)
|
564 |
+
x = torch.flatten(x, 1, -1)
|
565 |
+
|
566 |
+
return x, fmap
|
567 |
+
|
568 |
+
|
569 |
+
class DiscriminatorS(torch.nn.Module):
|
570 |
+
def __init__(self, use_spectral_norm=False):
|
571 |
+
super(DiscriminatorS, self).__init__()
|
572 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
573 |
+
self.convs = nn.ModuleList([
|
574 |
+
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
|
575 |
+
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
|
576 |
+
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
|
577 |
+
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
|
578 |
+
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
|
579 |
+
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
580 |
+
])
|
581 |
+
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
582 |
+
|
583 |
+
def forward(self, x):
|
584 |
+
fmap = []
|
585 |
+
|
586 |
+
for l in self.convs:
|
587 |
+
x = l(x)
|
588 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
589 |
+
fmap.append(x)
|
590 |
+
x = self.conv_post(x)
|
591 |
+
fmap.append(x)
|
592 |
+
x = torch.flatten(x, 1, -1)
|
593 |
+
|
594 |
+
return x, fmap
|
595 |
+
|
596 |
+
|
597 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
598 |
+
def __init__(self, use_spectral_norm=False):
|
599 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
600 |
+
periods = [2, 3, 5, 7, 11]
|
601 |
+
|
602 |
+
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
603 |
+
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
|
604 |
+
self.discriminators = nn.ModuleList(discs)
|
605 |
+
|
606 |
+
def forward(self, y, y_hat):
|
607 |
+
y_d_rs = []
|
608 |
+
y_d_gs = []
|
609 |
+
fmap_rs = []
|
610 |
+
fmap_gs = []
|
611 |
+
for i, d in enumerate(self.discriminators):
|
612 |
+
y_d_r, fmap_r = d(y)
|
613 |
+
y_d_g, fmap_g = d(y_hat)
|
614 |
+
y_d_rs.append(y_d_r)
|
615 |
+
y_d_gs.append(y_d_g)
|
616 |
+
fmap_rs.append(fmap_r)
|
617 |
+
fmap_gs.append(fmap_g)
|
618 |
+
|
619 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
620 |
+
|
621 |
+
|
622 |
+
class ReferenceEncoder(nn.Module):
|
623 |
+
'''
|
624 |
+
inputs --- [N, Ty/r, n_mels*r] mels
|
625 |
+
outputs --- [N, ref_enc_gru_size]
|
626 |
+
'''
|
627 |
+
|
628 |
+
def __init__(self, spec_channels, gin_channels=0):
|
629 |
+
|
630 |
+
super().__init__()
|
631 |
+
self.spec_channels = spec_channels
|
632 |
+
ref_enc_filters = [32, 32, 64, 64, 128, 128]
|
633 |
+
K = len(ref_enc_filters)
|
634 |
+
filters = [1] + ref_enc_filters
|
635 |
+
convs = [weight_norm(nn.Conv2d(in_channels=filters[i],
|
636 |
+
out_channels=filters[i + 1],
|
637 |
+
kernel_size=(3, 3),
|
638 |
+
stride=(2, 2),
|
639 |
+
padding=(1, 1))) for i in range(K)]
|
640 |
+
self.convs = nn.ModuleList(convs)
|
641 |
+
# self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)])
|
642 |
+
|
643 |
+
out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
|
644 |
+
self.gru = nn.GRU(input_size=ref_enc_filters[-1] * out_channels,
|
645 |
+
hidden_size=256 // 2,
|
646 |
+
batch_first=True)
|
647 |
+
self.proj = nn.Linear(128, gin_channels)
|
648 |
+
|
649 |
+
def forward(self, inputs, mask=None):
|
650 |
+
N = inputs.size(0)
|
651 |
+
out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
|
652 |
+
for conv in self.convs:
|
653 |
+
out = conv(out)
|
654 |
+
# out = wn(out)
|
655 |
+
out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
|
656 |
+
|
657 |
+
out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
|
658 |
+
T = out.size(1)
|
659 |
+
N = out.size(0)
|
660 |
+
out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
|
661 |
+
|
662 |
+
self.gru.flatten_parameters()
|
663 |
+
memory, out = self.gru(out) # out --- [1, N, 128]
|
664 |
+
|
665 |
+
return self.proj(out.squeeze(0))
|
666 |
+
|
667 |
+
def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
|
668 |
+
for i in range(n_convs):
|
669 |
+
L = (L - kernel_size + 2 * pad) // stride + 1
|
670 |
+
return L
|
671 |
+
|
672 |
+
|
673 |
+
class SynthesizerTrn(nn.Module):
|
674 |
+
"""
|
675 |
+
Synthesizer for Training
|
676 |
+
"""
|
677 |
+
|
678 |
+
def __init__(self,
|
679 |
+
n_vocab,
|
680 |
+
spec_channels,
|
681 |
+
segment_size,
|
682 |
+
inter_channels,
|
683 |
+
hidden_channels,
|
684 |
+
filter_channels,
|
685 |
+
n_heads,
|
686 |
+
n_layers,
|
687 |
+
kernel_size,
|
688 |
+
p_dropout,
|
689 |
+
resblock,
|
690 |
+
resblock_kernel_sizes,
|
691 |
+
resblock_dilation_sizes,
|
692 |
+
upsample_rates,
|
693 |
+
upsample_initial_channel,
|
694 |
+
upsample_kernel_sizes,
|
695 |
+
n_speakers=256,
|
696 |
+
gin_channels=256,
|
697 |
+
use_sdp=True,
|
698 |
+
n_flow_layer=4,
|
699 |
+
                 n_layers_trans_flow=6,
                 flow_share_parameter=False,
                 use_transformer_flow=True,
                 symbols=None,
                 ja_bert_dim=1024,
                 num_tones=None,
                 emotion_embedding=False,
                 zh_bert_extra=False,
                 **kwargs):

        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.n_layers_trans_flow = n_layers_trans_flow
        self.use_spk_conditioned_encoder = kwargs.get("use_spk_conditioned_encoder", True)
        self.use_sdp = use_sdp
        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
        self.current_mas_noise_scale = self.mas_noise_scale_initial
        if self.use_spk_conditioned_encoder and gin_channels > 0:
            self.enc_gin_channels = gin_channels
        self.emotion_embedding = emotion_embedding
        self.enc_p = TextEncoder(n_vocab,
                                 inter_channels,
                                 hidden_channels,
                                 filter_channels,
                                 n_heads,
                                 n_layers,
                                 kernel_size,
                                 p_dropout,
                                 self.n_speakers,
                                 gin_channels=self.enc_gin_channels,
                                 symbols=symbols,
                                 ja_bert_dim=ja_bert_dim,
                                 num_tones=num_tones,
                                 emotion_embedding=self.emotion_embedding,
                                 zh_bert_extra=zh_bert_extra,
                                 )
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
                             upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
                             gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
                                      gin_channels=gin_channels)
        if use_transformer_flow:
            self.flow = TransformerCouplingBlock(inter_channels, hidden_channels, filter_channels, n_heads,
                                                 n_layers_trans_flow, 5, p_dropout, n_flow_layer,
                                                 gin_channels=gin_channels, share_parameter=flow_share_parameter)
        else:
            self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,
                                              gin_channels=gin_channels)
        self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
        self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

        if self.n_speakers > 0:
            self.emb_g = nn.Embedding(self.n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)

    def infer(self, x, x_lengths, sid, tone, language, zh_bert, ja_bert, en_bert, noise_scale=0.667,
              length_scale=1, noise_scale_w=0.8, max_len=None, sdp_ratio=0, y=None, emo=None):
        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, bert)
        # g = self.gst(y)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo, sid, g=g)
        logw = (self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * sdp_ratio
                + self.dp(x, x_mask, g=g) * (1 - sdp_ratio))
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)
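Note on the inference path above: infer() first predicts a log-duration per phone (blending the stochastic and deterministic duration predictors via sdp_ratio), then ceils the durations and expands the per-phone prior statistics to frame rate through a hard monotonic attention path. Below is a minimal, self-contained sketch of that expansion step; sequence_mask and generate_path here are illustrative re-creations of the helpers in bert_vits2/commons.py, and all shapes and values are invented for the demo.

import torch
from torch.nn import functional as F

def sequence_mask(lengths, max_length=None):
    # [b] -> [b, max_length] boolean mask, True on valid positions
    if max_length is None:
        max_length = int(lengths.max())
    pos = torch.arange(max_length, device=lengths.device)
    return pos.unsqueeze(0) < lengths.unsqueeze(1)

def generate_path(duration, attn_mask):
    # duration: [b, 1, t_x]; attn_mask: [b, 1, t_y, t_x]
    b, _, t_y, t_x = attn_mask.shape
    cum = torch.cumsum(duration, -1).view(b * t_x)            # cumulative frame counts
    path = sequence_mask(cum, t_y).to(attn_mask.dtype).view(b, t_x, t_y)
    # difference of shifted cumulative masks -> one run of ones per phone
    path = path - F.pad(path, (0, 0, 1, 0))[:, :-1]
    return path.unsqueeze(1).transpose(2, 3) * attn_mask      # [b, 1, t_y, t_x]

w_ceil = torch.tensor([[[2.0, 1.0, 3.0]]])                    # ceiled durations for 3 phones
x_mask = torch.ones(1, 1, 3)
y_lengths = w_ceil.sum([1, 2]).long()                         # 6 output frames
y_mask = sequence_mask(y_lengths).unsqueeze(1).float()
attn_mask = x_mask.unsqueeze(2) * y_mask.unsqueeze(-1)        # [1, 1, 6, 3]
attn = generate_path(w_ceil, attn_mask)
m_p = torch.randn(1, 4, 3)                                    # per-phone prior means [b, d, t_x]
m_p_frames = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
print(m_p_frames.shape)                                       # torch.Size([1, 4, 6])

Each output frame attends to exactly one phone, so the expanded means are a stepwise copy of the per-phone statistics; infer() then samples z_p around them, runs the flow in reverse, and hands the result to the HiFi-GAN style Generator.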
bert_vits2/models_ja_extra.py
ADDED
@@ -0,0 +1,1016 @@
import math
import torch
from torch import nn
from torch.nn import functional as F

from bert_vits2 import commons
from bert_vits2 import modules
from bert_vits2 import attentions

from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm

from bert_vits2.commons import init_weights, get_padding
from bert_vits2.text import symbols, num_tones, num_languages

from vector_quantize_pytorch import VectorQuantize


class DurationDiscriminator(nn.Module):  # vits2
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)

        self.LSTM = nn.LSTM(2 * filter_channels, filter_channels, batch_first=True, bidirectional=True)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

        self.output_layer = nn.Sequential(nn.Linear(2 * filter_channels, 1), nn.Sigmoid())

    def forward_probability(self, x, dur):
        dur = self.dur_proj(dur)
        x = torch.cat([x, dur], dim=1)
        x = x.transpose(1, 2)
        x, _ = self.LSTM(x)
        output_prob = self.output_layer(x)
        return output_prob

    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)

        output_probs = []
        for dur in [dur_r, dur_hat]:
            output_prob = self.forward_probability(x, dur)
            output_probs.append(output_prob)

        return output_probs


class TransformerCouplingBlock(nn.Module):
    def __init__(self, channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size,
                 p_dropout, n_flows=4, gin_channels=0, share_parameter=False):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()

        self.wn = (
            attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size,
                           p_dropout, isflow=True, gin_channels=self.gin_channels)
            if share_parameter
            else None
        )

        for i in range(n_flows):
            self.flows.append(
                modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers,
                                                 n_heads, p_dropout, filter_channels, mean_only=True,
                                                 wn_sharing_parameter=self.wn,
                                                 gin_channels=self.gin_channels)
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class StochasticDurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.flows.append(modules.Flip())

        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.post_flows.append(modules.Flip())

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
            logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
            return nll + logq  # [b]
        else:
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw


class DurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)
        x = self.proj(x * x_mask)
        return x * x_mask


class Bottleneck(nn.Sequential):
    def __init__(self, in_dim, hidden_dim):
        c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        super().__init__(*[c_fc1, c_fc2])


class Block(nn.Module):
    def __init__(self, in_dim, hidden_dim) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)
        self.mlp = MLP(in_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.mlp(self.norm(x))
        return x


class MLP(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)

    def forward(self, x: torch.Tensor):
        x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
        x = self.c_proj(x)
        return x


class TextEncoder(nn.Module):
    def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers,
                 kernel_size, p_dropout, gin_channels=0):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels
        self.emb = nn.Embedding(len(symbols), hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
        nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels ** -0.5)
        self.language_emb = nn.Embedding(num_languages, hidden_channels)
        nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels ** -0.5)
        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        # self.bert_pre_proj = nn.Conv1d(2048, 1024, 1)
        # self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.in_feature_net = nn.Sequential(
            # input is assumed to an already normalized embedding
            nn.Linear(512, 1028, bias=False),
            nn.GELU(),
            nn.LayerNorm(1028),
            *[Block(1028, 512) for _ in range(1)],
            nn.Linear(1028, 512, bias=False),
            # normalize before passing to VQ?
            # nn.GELU(),
            # nn.LayerNorm(512),
        )
        self.emo_vq = VectorQuantize(
            dim=512,
            # codebook_size=128,
            codebook_size=256,
            codebook_dim=16,
            # codebook_dim=32,
            commitment_weight=0.1,
            decay=0.99,
            heads=32,
            kmeans_iters=20,
            separate_codebook_per_head=True,
            stochastic_sample_codes=True,
            threshold_ema_dead_code=2,
            use_cosine_sim=True,
        )
        self.out_feature_net = nn.Linear(512, hidden_channels)

        self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers,
                                          kernel_size, p_dropout, gin_channels=self.gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, tone, language, bert, emo, g=None):
        bert_emb = self.bert_proj(bert).transpose(1, 2)
        # en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
        emo_emb = self.in_feature_net(emo)
        emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
        loss_commit = loss_commit.mean()
        emo_emb = self.out_feature_net(emo_emb)
        x = (
            self.emb(x)
            + self.tone_emb(tone)
            + self.language_emb(language)
            + bert_emb
            # + en_bert_emb
            + emo_emb
        ) * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask, g=g)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask, loss_commit


class ResidualCouplingBlock(nn.Module):
    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate,
                                              n_layers, gin_channels=gin_channels, mean_only=True)
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class PosteriorEncoder(nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate,
                 n_layers, gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers,
                              gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask


class Generator(torch.nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes,
                 upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(
                weight_norm(
                    ConvTranspose1d(upsample_initial_channel // (2 ** i),
                                    upsample_initial_channel // (2 ** (i + 1)),
                                    k, u, padding=(k - u) // 2)
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            x = x + self.cond(g)

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print("Removing weight norm...")
        for layer in self.ups:
            remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
                norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
                norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
                norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class WavLMDiscriminator(nn.Module):
    """docstring for Discriminator."""

    def __init__(self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False):
        super(WavLMDiscriminator, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.pre = norm_f(Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0))

        self.convs = nn.ModuleList(
            [
                norm_f(nn.Conv1d(initial_channel, initial_channel * 2, kernel_size=5, padding=2)),
                norm_f(nn.Conv1d(initial_channel * 2, initial_channel * 4, kernel_size=5, padding=2)),
                norm_f(nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2)),
            ]
        )

        self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1))

    def forward(self, x):
        x = self.pre(x)

        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        x = torch.flatten(x, 1, -1)

        return x


class ReferenceEncoder(nn.Module):
    """
    inputs --- [N, Ty/r, n_mels*r]  mels
    outputs --- [N, ref_enc_gru_size]
    """

    def __init__(self, spec_channels, gin_channels=0):
        super().__init__()
        self.spec_channels = spec_channels
        ref_enc_filters = [32, 32, 64, 64, 128, 128]
        K = len(ref_enc_filters)
        filters = [1] + ref_enc_filters
        convs = [
            weight_norm(
                nn.Conv2d(in_channels=filters[i], out_channels=filters[i + 1],
                          kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
            )
            for i in range(K)
        ]
        self.convs = nn.ModuleList(convs)
        # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)])  # noqa: E501

        out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
        self.gru = nn.GRU(input_size=ref_enc_filters[-1] * out_channels, hidden_size=256 // 2,
                          batch_first=True)
        self.proj = nn.Linear(128, gin_channels)

    def forward(self, inputs, mask=None):
        N = inputs.size(0)
        out = inputs.view(N, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
        for conv in self.convs:
            out = conv(out)
            # out = wn(out)
            out = F.relu(out)  # [N, 128, Ty//2^K, n_mels//2^K]

        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
        T = out.size(1)
        N = out.size(0)
        out = out.contiguous().view(N, T, -1)  # [N, Ty//2^K, 128*n_mels//2^K]

        self.gru.flatten_parameters()
        memory, out = self.gru(out)  # out --- [1, N, 128]

        return self.proj(out.squeeze(0))

    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
        for i in range(n_convs):
            L = (L - kernel_size + 2 * pad) // stride + 1
        return L


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels,
                 filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock,
                 resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
                 upsample_initial_channel, upsample_kernel_sizes, n_speakers=256, gin_channels=256,
                 use_sdp=True, n_flow_layer=4, n_layers_trans_flow=6, flow_share_parameter=False,
                 use_transformer_flow=True, **kwargs):
        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.n_layers_trans_flow = n_layers_trans_flow
        self.use_spk_conditioned_encoder = kwargs.get("use_spk_conditioned_encoder", True)
        self.use_sdp = use_sdp
        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
        self.current_mas_noise_scale = self.mas_noise_scale_initial
        if self.use_spk_conditioned_encoder and gin_channels > 0:
            self.enc_gin_channels = gin_channels
        self.enc_p = TextEncoder(n_vocab, inter_channels, hidden_channels, filter_channels,
                                 n_heads, n_layers, kernel_size, p_dropout,
                                 gin_channels=self.enc_gin_channels)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes,
                             resblock_dilation_sizes, upsample_rates, upsample_initial_channel,
                             upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16,
                                      gin_channels=gin_channels)
        if use_transformer_flow:
            self.flow = TransformerCouplingBlock(inter_channels, hidden_channels, filter_channels,
                                                 n_heads, n_layers_trans_flow, 5, p_dropout,
                                                 n_flow_layer, gin_channels=gin_channels,
                                                 share_parameter=flow_share_parameter)
        else:
            self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, n_flow_layer,
                                              gin_channels=gin_channels)
        self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
        self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

        if n_speakers >= 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
        else:
            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)

    def infer(self, x, x_lengths, sid, tone, language, ja_bert, emo, noise_scale=0.667,
              length_scale=1, noise_scale_w=0.8, max_len=None, sdp_ratio=0, y=None, **kwargs):
        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, ja_bert)
        # g = self.gst(y)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
        x, m_p, logs_p, x_mask, _ = self.enc_p(x, x_lengths, tone, language, ja_bert, emo, g=g)
        logw = (self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * sdp_ratio
                + self.dp(x, x_mask, g=g) * (1 - sdp_ratio))
        w = torch.exp(logw) * x_mask * length_scale
        w_ceil = torch.ceil(w)
        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
        attn = commons.generate_path(w_ceil, attn_mask)

        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']

        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)
        o = self.dec((z * y_mask)[:, :, :max_len], g=g)
        return o, attn, y_mask, (z, z_p, m_p, logs_p)
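What is new in models_ja_extra.py relative to the base model is the emotion branch of TextEncoder: a 512-dim utterance-level emotion vector is refined by in_feature_net, discretized by a multi-head VectorQuantize bottleneck, projected to hidden_channels, and added to every token embedding, with the commitment loss surfaced alongside the encoder outputs. A minimal sketch of just that branch follows, assuming torch and vector-quantize-pytorch (already in the repo's requirements) are installed; the batch values are invented and the single nn.Linear stands in for the deeper in_feature_net, but the VectorQuantize hyperparameters mirror the ones above.

import torch
from torch import nn
from vector_quantize_pytorch import VectorQuantize

hidden_channels = 192                        # typical Bert-VITS2 width (assumption)
emo = torch.randn(2, 512)                    # hypothetical utterance-level emotion vectors

in_net = nn.Linear(512, 512, bias=False)     # stand-in for in_feature_net above
emo_vq = VectorQuantize(
    dim=512, codebook_size=256, codebook_dim=16, heads=32,
    separate_codebook_per_head=True, commitment_weight=0.1, decay=0.99,
    kmeans_iters=20, stochastic_sample_codes=True,
    threshold_ema_dead_code=2, use_cosine_sim=True,
)
out_net = nn.Linear(512, hidden_channels)

h = in_net(emo).unsqueeze(1)                 # [b, 1, 512]; the VQ layer expects a sequence axis
quantized, indices, loss_commit = emo_vq(h)  # snap to the nearest codebook entries
emo_emb = out_net(quantized)                 # [b, 1, hidden_channels]
tokens = torch.randn(2, 7, hidden_channels)  # symbol + tone + language embeddings [b, t, h]
x = tokens + emo_emb                         # broadcast the utterance emotion over all tokens
print(x.shape, loss_commit.mean().item())

The bottleneck restricts the emotion signal to a small discrete code space shared across utterances, which is presumably why infer() only needs a single emo vector per utterance rather than a per-frame conditioning stream.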
bert_vits2/models_v230.py
ADDED
@@ -0,0 +1,1019 @@
import math
import torch
from torch import nn
from torch.nn import functional as F
from vector_quantize_pytorch import VectorQuantize

from bert_vits2 import commons
from bert_vits2 import modules
from bert_vits2 import attentions

from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm

from bert_vits2.commons import init_weights, get_padding
from bert_vits2.text import symbols, num_tones, num_languages


class DurationDiscriminator(nn.Module):  # vits2
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.dur_proj = nn.Conv1d(1, filter_channels, 1)

        self.LSTM = nn.LSTM(2 * filter_channels, filter_channels, batch_first=True, bidirectional=True)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

        self.output_layer = nn.Sequential(nn.Linear(2 * filter_channels, 1), nn.Sigmoid())

    def forward_probability(self, x, dur):
        dur = self.dur_proj(dur)
        x = torch.cat([x, dur], dim=1)
        x = x.transpose(1, 2)
        x, _ = self.LSTM(x)
        output_prob = self.output_layer(x)
        return output_prob

    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)

        output_probs = []
        for dur in [dur_r, dur_hat]:
            output_prob = self.forward_probability(x, dur)
            output_probs.append(output_prob)

        return output_probs


class TransformerCouplingBlock(nn.Module):
    def __init__(self, channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size,
                 p_dropout, n_flows=4, gin_channels=0, share_parameter=False):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()

        self.wn = (
            attentions.FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size,
                           p_dropout, isflow=True, gin_channels=self.gin_channels)
            if share_parameter
            else None
        )

        for i in range(n_flows):
            self.flows.append(
                modules.TransformerCouplingLayer(channels, hidden_channels, kernel_size, n_layers,
                                                 n_heads, p_dropout, filter_channels, mean_only=True,
                                                 wn_sharing_parameter=self.wn,
                                                 gin_channels=self.gin_channels)
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class StochasticDurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
        super().__init__()
        filter_channels = in_channels  # it needs to be removed from future version.
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.log_flow = modules.Log()
        self.flows = nn.ModuleList()
        self.flows.append(modules.ElementwiseAffine(2))
        for i in range(n_flows):
            self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.flows.append(modules.Flip())

        self.post_pre = nn.Conv1d(1, filter_channels, 1)
        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        self.post_flows = nn.ModuleList()
        self.post_flows.append(modules.ElementwiseAffine(2))
        for i in range(4):
            self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
            self.post_flows.append(modules.Flip())

        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
        self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)

    def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
        x = torch.detach(x)
        x = self.pre(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.convs(x, x_mask)
        x = self.proj(x) * x_mask

        if not reverse:
            flows = self.flows
            assert w is not None

            logdet_tot_q = 0
            h_w = self.post_pre(w)
            h_w = self.post_convs(h_w, x_mask)
            h_w = self.post_proj(h_w) * x_mask
            e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
            z_q = e_q
            for flow in self.post_flows:
                z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
                logdet_tot_q += logdet_q
            z_u, z1 = torch.split(z_q, [1, 1], 1)
            u = torch.sigmoid(z_u) * x_mask
            z0 = (w - u) * x_mask
            logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1, 2])
            logq = torch.sum(-0.5 * (math.log(2 * math.pi) + (e_q ** 2)) * x_mask, [1, 2]) - logdet_tot_q

            logdet_tot = 0
            z0, logdet = self.log_flow(z0, x_mask)
            logdet_tot += logdet
            z = torch.cat([z0, z1], 1)
            for flow in flows:
                z, logdet = flow(z, x_mask, g=x, reverse=reverse)
                logdet_tot = logdet_tot + logdet
            nll = torch.sum(0.5 * (math.log(2 * math.pi) + (z ** 2)) * x_mask, [1, 2]) - logdet_tot
            return nll + logq  # [b]
        else:
            flows = list(reversed(self.flows))
            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
            z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
            for flow in flows:
                z = flow(z, x_mask, g=x, reverse=reverse)
            z0, z1 = torch.split(z, [1, 1], 1)
            logw = z0
            return logw


class DurationPredictor(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0):
        super().__init__()

        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels

        self.drop = nn.Dropout(p_dropout)
        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_1 = modules.LayerNorm(filter_channels)
        self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
        self.norm_2 = modules.LayerNorm(filter_channels)
        self.proj = nn.Conv1d(filter_channels, 1, 1)

        if gin_channels != 0:
            self.cond = nn.Conv1d(gin_channels, in_channels, 1)

    def forward(self, x, x_mask, g=None):
        x = torch.detach(x)
        if g is not None:
            g = torch.detach(g)
            x = x + self.cond(g)
        x = self.conv_1(x * x_mask)
        x = torch.relu(x)
        x = self.norm_1(x)
        x = self.drop(x)
        x = self.conv_2(x * x_mask)
        x = torch.relu(x)
        x = self.norm_2(x)
        x = self.drop(x)
        x = self.proj(x * x_mask)
        return x * x_mask


class Bottleneck(nn.Sequential):
    def __init__(self, in_dim, hidden_dim):
        c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        super().__init__(*[c_fc1, c_fc2])


class Block(nn.Module):
    def __init__(self, in_dim, hidden_dim) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(in_dim)
        self.mlp = MLP(in_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.mlp(self.norm(x))
        return x


class MLP(nn.Module):
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
        self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)

    def forward(self, x: torch.Tensor):
        x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
        x = self.c_proj(x)
        return x


class TextEncoder(nn.Module):
    def __init__(self, n_vocab, out_channels, hidden_channels, filter_channels, n_heads, n_layers,
                 kernel_size, p_dropout, gin_channels=0, zh_bert_extra=False):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.gin_channels = gin_channels
        self.emb = nn.Embedding(len(symbols), hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels ** -0.5)
        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
        nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels ** -0.5)
        self.language_emb = nn.Embedding(num_languages, hidden_channels)
        nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels ** -0.5)
        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
        self.zh_bert_extra = zh_bert_extra
        if self.zh_bert_extra:
            self.bert_pre_proj = nn.Conv1d(2048, 1024, 1)
            self.in_feature_net = nn.Sequential(
                # input is assumed to an already normalized embedding
                nn.Linear(512, 1028, bias=False),
                nn.GELU(),
                nn.LayerNorm(1028),
                *[Block(1028, 512) for _ in range(1)],
                nn.Linear(1028, 512, bias=False),
                # normalize before passing to VQ?
                # nn.GELU(),
                # nn.LayerNorm(512),
            )
            self.emo_vq = VectorQuantize(
                dim=512,
                codebook_size=64,
                codebook_dim=32,
                commitment_weight=0.1,
                decay=0.85,
                heads=32,
                kmeans_iters=20,
                separate_codebook_per_head=True,
                stochastic_sample_codes=True,
                threshold_ema_dead_code=2,
            )
            self.out_feature_net = nn.Linear(512, hidden_channels)
        else:
            self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
            self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)

        self.encoder = attentions.Encoder(hidden_channels, filter_channels, n_heads, n_layers,
                                          kernel_size, p_dropout, gin_channels=self.gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo=None, g=None):
        x = self.emb(x) + self.tone_emb(tone) + self.language_emb(language)

        if self.zh_bert_extra:
            zh_bert = self.bert_pre_proj(zh_bert)
            emo_emb = self.in_feature_net(emo)
            emo_emb, _, _ = self.emo_vq(emo_emb.unsqueeze(1))
            emo_emb = self.out_feature_net(emo_emb)
            x += emo_emb
        x += self.bert_proj(zh_bert).transpose(1, 2)
        if not self.zh_bert_extra:
            x += self.ja_bert_proj(ja_bert).transpose(1, 2)
            x += self.en_bert_proj(en_bert).transpose(1, 2)

        x *= math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask, g=g)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask


class ResidualCouplingBlock(nn.Module):
    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(
                modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate,
                                              n_layers, gin_channels=gin_channels, mean_only=True)
            )
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class PosteriorEncoder(nn.Module):
    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate,
                 n_layers, gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
499 |
+
self.enc = modules.WN(
|
500 |
+
hidden_channels,
|
501 |
+
kernel_size,
|
502 |
+
dilation_rate,
|
503 |
+
n_layers,
|
504 |
+
gin_channels=gin_channels,
|
505 |
+
)
|
506 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
507 |
+
|
508 |
+
def forward(self, x, x_lengths, g=None):
|
509 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
|
510 |
+
x.dtype
|
511 |
+
)
|
512 |
+
x = self.pre(x) * x_mask
|
513 |
+
x = self.enc(x, x_mask, g=g)
|
514 |
+
stats = self.proj(x) * x_mask
|
515 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
516 |
+
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
|
517 |
+
return z, m, logs, x_mask
|
518 |
+
|
519 |
+
|
520 |
+
class Generator(torch.nn.Module):
|
521 |
+
def __init__(
|
522 |
+
self,
|
523 |
+
initial_channel,
|
524 |
+
resblock,
|
525 |
+
resblock_kernel_sizes,
|
526 |
+
resblock_dilation_sizes,
|
527 |
+
upsample_rates,
|
528 |
+
upsample_initial_channel,
|
529 |
+
upsample_kernel_sizes,
|
530 |
+
gin_channels=0,
|
531 |
+
):
|
532 |
+
super(Generator, self).__init__()
|
533 |
+
self.num_kernels = len(resblock_kernel_sizes)
|
534 |
+
self.num_upsamples = len(upsample_rates)
|
535 |
+
self.conv_pre = Conv1d(
|
536 |
+
initial_channel, upsample_initial_channel, 7, 1, padding=3
|
537 |
+
)
|
538 |
+
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
|
539 |
+
|
540 |
+
self.ups = nn.ModuleList()
|
541 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
542 |
+
self.ups.append(
|
543 |
+
weight_norm(
|
544 |
+
ConvTranspose1d(
|
545 |
+
upsample_initial_channel // (2 ** i),
|
546 |
+
upsample_initial_channel // (2 ** (i + 1)),
|
547 |
+
k,
|
548 |
+
u,
|
549 |
+
padding=(k - u) // 2,
|
550 |
+
)
|
551 |
+
)
|
552 |
+
)
|
553 |
+
|
554 |
+
self.resblocks = nn.ModuleList()
|
555 |
+
for i in range(len(self.ups)):
|
556 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
557 |
+
for j, (k, d) in enumerate(
|
558 |
+
zip(resblock_kernel_sizes, resblock_dilation_sizes)
|
559 |
+
):
|
560 |
+
self.resblocks.append(resblock(ch, k, d))
|
561 |
+
|
562 |
+
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
563 |
+
self.ups.apply(init_weights)
|
564 |
+
|
565 |
+
if gin_channels != 0:
|
566 |
+
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
|
567 |
+
|
568 |
+
def forward(self, x, g=None):
|
569 |
+
x = self.conv_pre(x)
|
570 |
+
if g is not None:
|
571 |
+
x = x + self.cond(g)
|
572 |
+
|
573 |
+
for i in range(self.num_upsamples):
|
574 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
575 |
+
x = self.ups[i](x)
|
576 |
+
xs = None
|
577 |
+
for j in range(self.num_kernels):
|
578 |
+
if xs is None:
|
579 |
+
xs = self.resblocks[i * self.num_kernels + j](x)
|
580 |
+
else:
|
581 |
+
xs += self.resblocks[i * self.num_kernels + j](x)
|
582 |
+
x = xs / self.num_kernels
|
583 |
+
x = F.leaky_relu(x)
|
584 |
+
x = self.conv_post(x)
|
585 |
+
x = torch.tanh(x)
|
586 |
+
|
587 |
+
return x
|
588 |
+
|
589 |
+
def remove_weight_norm(self):
|
590 |
+
print("Removing weight norm...")
|
591 |
+
for layer in self.ups:
|
592 |
+
remove_weight_norm(layer)
|
593 |
+
for layer in self.resblocks:
|
594 |
+
layer.remove_weight_norm()
|
595 |
+
|
596 |
+
|
597 |
+
class DiscriminatorP(torch.nn.Module):
|
598 |
+
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
|
599 |
+
super(DiscriminatorP, self).__init__()
|
600 |
+
self.period = period
|
601 |
+
self.use_spectral_norm = use_spectral_norm
|
602 |
+
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
603 |
+
self.convs = nn.ModuleList(
|
604 |
+
[
|
605 |
+
norm_f(
|
606 |
+
Conv2d(
|
607 |
+
1,
|
608 |
+
32,
|
609 |
+
(kernel_size, 1),
|
610 |
+
(stride, 1),
|
611 |
+
padding=(get_padding(kernel_size, 1), 0),
|
612 |
+
)
|
613 |
+
),
|
614 |
+
norm_f(
|
615 |
+
Conv2d(
|
616 |
+
32,
|
617 |
+
128,
|
618 |
+
(kernel_size, 1),
|
619 |
+
(stride, 1),
|
620 |
+
padding=(get_padding(kernel_size, 1), 0),
|
621 |
+
)
|
622 |
+
),
|
623 |
+
norm_f(
|
624 |
+
Conv2d(
|
625 |
+
128,
|
626 |
+
512,
|
627 |
+
(kernel_size, 1),
|
628 |
+
(stride, 1),
|
629 |
+
padding=(get_padding(kernel_size, 1), 0),
|
630 |
+
)
|
631 |
+
),
|
632 |
+
norm_f(
|
633 |
+
Conv2d(
|
634 |
+
512,
|
635 |
+
1024,
|
636 |
+
(kernel_size, 1),
|
637 |
+
(stride, 1),
|
638 |
+
padding=(get_padding(kernel_size, 1), 0),
|
639 |
+
)
|
640 |
+
),
|
641 |
+
norm_f(
|
642 |
+
Conv2d(
|
643 |
+
1024,
|
644 |
+
1024,
|
645 |
+
(kernel_size, 1),
|
646 |
+
1,
|
647 |
+
padding=(get_padding(kernel_size, 1), 0),
|
648 |
+
)
|
649 |
+
),
|
650 |
+
]
|
651 |
+
)
|
652 |
+
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
|
653 |
+
|
654 |
+
def forward(self, x):
|
655 |
+
fmap = []
|
656 |
+
|
657 |
+
# 1d to 2d
|
658 |
+
b, c, t = x.shape
|
659 |
+
if t % self.period != 0: # pad first
|
660 |
+
n_pad = self.period - (t % self.period)
|
661 |
+
x = F.pad(x, (0, n_pad), "reflect")
|
662 |
+
t = t + n_pad
|
663 |
+
x = x.view(b, c, t // self.period, self.period)
|
664 |
+
|
665 |
+
for layer in self.convs:
|
666 |
+
x = layer(x)
|
667 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
668 |
+
fmap.append(x)
|
669 |
+
x = self.conv_post(x)
|
670 |
+
fmap.append(x)
|
671 |
+
x = torch.flatten(x, 1, -1)
|
672 |
+
|
673 |
+
return x, fmap
|
674 |
+
|
675 |
+
|
676 |
+
class DiscriminatorS(torch.nn.Module):
|
677 |
+
def __init__(self, use_spectral_norm=False):
|
678 |
+
super(DiscriminatorS, self).__init__()
|
679 |
+
norm_f = weight_norm if use_spectral_norm is False else spectral_norm
|
680 |
+
self.convs = nn.ModuleList(
|
681 |
+
[
|
682 |
+
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
|
683 |
+
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
|
684 |
+
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
|
685 |
+
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
|
686 |
+
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
|
687 |
+
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
|
688 |
+
]
|
689 |
+
)
|
690 |
+
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
|
691 |
+
|
692 |
+
def forward(self, x):
|
693 |
+
fmap = []
|
694 |
+
|
695 |
+
for layer in self.convs:
|
696 |
+
x = layer(x)
|
697 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
698 |
+
fmap.append(x)
|
699 |
+
x = self.conv_post(x)
|
700 |
+
fmap.append(x)
|
701 |
+
x = torch.flatten(x, 1, -1)
|
702 |
+
|
703 |
+
return x, fmap
|
704 |
+
|
705 |
+
|
706 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
|
707 |
+
def __init__(self, use_spectral_norm=False):
|
708 |
+
super(MultiPeriodDiscriminator, self).__init__()
|
709 |
+
periods = [2, 3, 5, 7, 11]
|
710 |
+
|
711 |
+
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
|
712 |
+
discs = discs + [
|
713 |
+
DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
|
714 |
+
]
|
715 |
+
self.discriminators = nn.ModuleList(discs)
|
716 |
+
|
717 |
+
def forward(self, y, y_hat):
|
718 |
+
y_d_rs = []
|
719 |
+
y_d_gs = []
|
720 |
+
fmap_rs = []
|
721 |
+
fmap_gs = []
|
722 |
+
for i, d in enumerate(self.discriminators):
|
723 |
+
y_d_r, fmap_r = d(y)
|
724 |
+
y_d_g, fmap_g = d(y_hat)
|
725 |
+
y_d_rs.append(y_d_r)
|
726 |
+
y_d_gs.append(y_d_g)
|
727 |
+
fmap_rs.append(fmap_r)
|
728 |
+
fmap_gs.append(fmap_g)
|
729 |
+
|
730 |
+
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
731 |
+
|
732 |
+
|
733 |
+
class WavLMDiscriminator(nn.Module):
|
734 |
+
"""docstring for Discriminator."""
|
735 |
+
|
736 |
+
def __init__(
|
737 |
+
self, slm_hidden=768, slm_layers=13, initial_channel=64, use_spectral_norm=False
|
738 |
+
):
|
739 |
+
super(WavLMDiscriminator, self).__init__()
|
740 |
+
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
|
741 |
+
self.pre = norm_f(
|
742 |
+
Conv1d(slm_hidden * slm_layers, initial_channel, 1, 1, padding=0)
|
743 |
+
)
|
744 |
+
|
745 |
+
self.convs = nn.ModuleList(
|
746 |
+
[
|
747 |
+
norm_f(
|
748 |
+
nn.Conv1d(
|
749 |
+
initial_channel, initial_channel * 2, kernel_size=5, padding=2
|
750 |
+
)
|
751 |
+
),
|
752 |
+
norm_f(
|
753 |
+
nn.Conv1d(
|
754 |
+
initial_channel * 2,
|
755 |
+
initial_channel * 4,
|
756 |
+
kernel_size=5,
|
757 |
+
padding=2,
|
758 |
+
)
|
759 |
+
),
|
760 |
+
norm_f(
|
761 |
+
nn.Conv1d(initial_channel * 4, initial_channel * 4, 5, 1, padding=2)
|
762 |
+
),
|
763 |
+
]
|
764 |
+
)
|
765 |
+
|
766 |
+
self.conv_post = norm_f(Conv1d(initial_channel * 4, 1, 3, 1, padding=1))
|
767 |
+
|
768 |
+
def forward(self, x):
|
769 |
+
x = self.pre(x)
|
770 |
+
|
771 |
+
fmap = []
|
772 |
+
for l in self.convs:
|
773 |
+
x = l(x)
|
774 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
775 |
+
fmap.append(x)
|
776 |
+
x = self.conv_post(x)
|
777 |
+
x = torch.flatten(x, 1, -1)
|
778 |
+
|
779 |
+
return x
|
780 |
+
|
781 |
+
|
782 |
+
class ReferenceEncoder(nn.Module):
|
783 |
+
"""
|
784 |
+
inputs --- [N, Ty/r, n_mels*r] mels
|
785 |
+
outputs --- [N, ref_enc_gru_size]
|
786 |
+
"""
|
787 |
+
|
788 |
+
def __init__(self, spec_channels, gin_channels=0):
|
789 |
+
super().__init__()
|
790 |
+
self.spec_channels = spec_channels
|
791 |
+
ref_enc_filters = [32, 32, 64, 64, 128, 128]
|
792 |
+
K = len(ref_enc_filters)
|
793 |
+
filters = [1] + ref_enc_filters
|
794 |
+
convs = [
|
795 |
+
weight_norm(
|
796 |
+
nn.Conv2d(
|
797 |
+
in_channels=filters[i],
|
798 |
+
out_channels=filters[i + 1],
|
799 |
+
kernel_size=(3, 3),
|
800 |
+
stride=(2, 2),
|
801 |
+
padding=(1, 1),
|
802 |
+
)
|
803 |
+
)
|
804 |
+
for i in range(K)
|
805 |
+
]
|
806 |
+
self.convs = nn.ModuleList(convs)
|
807 |
+
# self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
|
808 |
+
|
809 |
+
out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
|
810 |
+
self.gru = nn.GRU(
|
811 |
+
input_size=ref_enc_filters[-1] * out_channels,
|
812 |
+
hidden_size=256 // 2,
|
813 |
+
batch_first=True,
|
814 |
+
)
|
815 |
+
self.proj = nn.Linear(128, gin_channels)
|
816 |
+
|
817 |
+
def forward(self, inputs, mask=None):
|
818 |
+
N = inputs.size(0)
|
819 |
+
out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
|
820 |
+
for conv in self.convs:
|
821 |
+
out = conv(out)
|
822 |
+
# out = wn(out)
|
823 |
+
out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
|
824 |
+
|
825 |
+
out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
|
826 |
+
T = out.size(1)
|
827 |
+
N = out.size(0)
|
828 |
+
out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
|
829 |
+
|
830 |
+
self.gru.flatten_parameters()
|
831 |
+
memory, out = self.gru(out) # out --- [1, N, 128]
|
832 |
+
|
833 |
+
return self.proj(out.squeeze(0))
|
834 |
+
|
835 |
+
def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
|
836 |
+
for i in range(n_convs):
|
837 |
+
L = (L - kernel_size + 2 * pad) // stride + 1
|
838 |
+
return L
|
839 |
+
|
840 |
+
|
841 |
+
class SynthesizerTrn(nn.Module):
|
842 |
+
"""
|
843 |
+
Synthesizer for Training
|
844 |
+
"""
|
845 |
+
|
846 |
+
def __init__(
|
847 |
+
self,
|
848 |
+
n_vocab,
|
849 |
+
spec_channels,
|
850 |
+
segment_size,
|
851 |
+
inter_channels,
|
852 |
+
hidden_channels,
|
853 |
+
filter_channels,
|
854 |
+
n_heads,
|
855 |
+
n_layers,
|
856 |
+
kernel_size,
|
857 |
+
p_dropout,
|
858 |
+
resblock,
|
859 |
+
resblock_kernel_sizes,
|
860 |
+
resblock_dilation_sizes,
|
861 |
+
upsample_rates,
|
862 |
+
upsample_initial_channel,
|
863 |
+
upsample_kernel_sizes,
|
864 |
+
n_speakers=256,
|
865 |
+
gin_channels=256,
|
866 |
+
use_sdp=True,
|
867 |
+
n_flow_layer=4,
|
868 |
+
n_layers_trans_flow=4,
|
869 |
+
flow_share_parameter=False,
|
870 |
+
use_transformer_flow=True,
|
871 |
+
zh_bert_extra=False,
|
872 |
+
**kwargs
|
873 |
+
):
|
874 |
+
super().__init__()
|
875 |
+
self.n_vocab = n_vocab
|
876 |
+
self.spec_channels = spec_channels
|
877 |
+
self.inter_channels = inter_channels
|
878 |
+
self.hidden_channels = hidden_channels
|
879 |
+
self.filter_channels = filter_channels
|
880 |
+
self.n_heads = n_heads
|
881 |
+
self.n_layers = n_layers
|
882 |
+
self.kernel_size = kernel_size
|
883 |
+
self.p_dropout = p_dropout
|
884 |
+
self.resblock = resblock
|
885 |
+
self.resblock_kernel_sizes = resblock_kernel_sizes
|
886 |
+
self.resblock_dilation_sizes = resblock_dilation_sizes
|
887 |
+
self.upsample_rates = upsample_rates
|
888 |
+
self.upsample_initial_channel = upsample_initial_channel
|
889 |
+
self.upsample_kernel_sizes = upsample_kernel_sizes
|
890 |
+
self.segment_size = segment_size
|
891 |
+
self.n_speakers = n_speakers
|
892 |
+
self.gin_channels = gin_channels
|
893 |
+
self.n_layers_trans_flow = n_layers_trans_flow
|
894 |
+
self.use_spk_conditioned_encoder = kwargs.get(
|
895 |
+
"use_spk_conditioned_encoder", True
|
896 |
+
)
|
897 |
+
self.use_sdp = use_sdp
|
898 |
+
self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
|
899 |
+
self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
|
900 |
+
self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
|
901 |
+
self.current_mas_noise_scale = self.mas_noise_scale_initial
|
902 |
+
if self.use_spk_conditioned_encoder and gin_channels > 0:
|
903 |
+
self.enc_gin_channels = gin_channels
|
904 |
+
self.enc_p = TextEncoder(
|
905 |
+
n_vocab,
|
906 |
+
inter_channels,
|
907 |
+
hidden_channels,
|
908 |
+
filter_channels,
|
909 |
+
n_heads,
|
910 |
+
n_layers,
|
911 |
+
kernel_size,
|
912 |
+
p_dropout,
|
913 |
+
gin_channels=self.enc_gin_channels,
|
914 |
+
zh_bert_extra=zh_bert_extra,
|
915 |
+
)
|
916 |
+
self.dec = Generator(
|
917 |
+
inter_channels,
|
918 |
+
resblock,
|
919 |
+
resblock_kernel_sizes,
|
920 |
+
resblock_dilation_sizes,
|
921 |
+
upsample_rates,
|
922 |
+
upsample_initial_channel,
|
923 |
+
upsample_kernel_sizes,
|
924 |
+
gin_channels=gin_channels,
|
925 |
+
)
|
926 |
+
self.enc_q = PosteriorEncoder(
|
927 |
+
spec_channels,
|
928 |
+
inter_channels,
|
929 |
+
hidden_channels,
|
930 |
+
5,
|
931 |
+
1,
|
932 |
+
16,
|
933 |
+
gin_channels=gin_channels,
|
934 |
+
)
|
935 |
+
if use_transformer_flow:
|
936 |
+
self.flow = TransformerCouplingBlock(
|
937 |
+
inter_channels,
|
938 |
+
hidden_channels,
|
939 |
+
filter_channels,
|
940 |
+
n_heads,
|
941 |
+
n_layers_trans_flow,
|
942 |
+
5,
|
943 |
+
p_dropout,
|
944 |
+
n_flow_layer,
|
945 |
+
gin_channels=gin_channels,
|
946 |
+
share_parameter=flow_share_parameter,
|
947 |
+
)
|
948 |
+
else:
|
949 |
+
self.flow = ResidualCouplingBlock(
|
950 |
+
inter_channels,
|
951 |
+
hidden_channels,
|
952 |
+
5,
|
953 |
+
1,
|
954 |
+
n_flow_layer,
|
955 |
+
gin_channels=gin_channels,
|
956 |
+
)
|
957 |
+
self.sdp = StochasticDurationPredictor(
|
958 |
+
hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
|
959 |
+
)
|
960 |
+
self.dp = DurationPredictor(
|
961 |
+
hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
|
962 |
+
)
|
963 |
+
|
964 |
+
if n_speakers >= 1:
|
965 |
+
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
966 |
+
else:
|
967 |
+
self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
|
968 |
+
|
969 |
+
def infer(
|
970 |
+
self,
|
971 |
+
x,
|
972 |
+
x_lengths,
|
973 |
+
sid,
|
974 |
+
tone,
|
975 |
+
language,
|
976 |
+
zh_bert,
|
977 |
+
ja_bert,
|
978 |
+
en_bert,
|
979 |
+
emo=None,
|
980 |
+
noise_scale=0.667,
|
981 |
+
length_scale=1,
|
982 |
+
noise_scale_w=0.8,
|
983 |
+
max_len=None,
|
984 |
+
sdp_ratio=0,
|
985 |
+
y=None,
|
986 |
+
**kwargs,
|
987 |
+
):
|
988 |
+
# x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths, tone, language, bert)
|
989 |
+
# g = self.gst(y)
|
990 |
+
if self.n_speakers > 0:
|
991 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
992 |
+
else:
|
993 |
+
g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
|
994 |
+
x, m_p, logs_p, x_mask = self.enc_p(
|
995 |
+
x, x_lengths, tone, language, zh_bert, ja_bert, en_bert, emo=emo, g=g
|
996 |
+
)
|
997 |
+
logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
|
998 |
+
sdp_ratio
|
999 |
+
) + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
|
1000 |
+
w = torch.exp(logw) * x_mask * length_scale
|
1001 |
+
w_ceil = torch.ceil(w)
|
1002 |
+
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
|
1003 |
+
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
|
1004 |
+
x_mask.dtype
|
1005 |
+
)
|
1006 |
+
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
|
1007 |
+
attn = commons.generate_path(w_ceil, attn_mask)
|
1008 |
+
|
1009 |
+
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
|
1010 |
+
1, 2
|
1011 |
+
) # [b, t', t], [b, t, d] -> [b, d, t']
|
1012 |
+
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
|
1013 |
+
1, 2
|
1014 |
+
) # [b, t', t], [b, t, d] -> [b, d, t']
|
1015 |
+
|
1016 |
+
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
1017 |
+
z = self.flow(z_p, y_mask, g=g, reverse=True)
|
1018 |
+
o = self.dec((z * y_mask)[:, :, :max_len], g=g)
|
1019 |
+
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
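Before the next file, a minimal smoke-test sketch of the inference entry point defined above. Every size and hyperparameter below is an illustrative assumption, not a value read from this repository's configs; it also assumes the classes above are importable as bert_vits2.models.

# Hypothetical smoke test for SynthesizerTrn.infer; all sizes are assumptions.
import torch
from bert_vits2.models import SynthesizerTrn

net_g = SynthesizerTrn(
    n_vocab=0,  # unused for sizing: the embedding size comes from len(symbols)
    spec_channels=1025, segment_size=32,
    inter_channels=192, hidden_channels=192, filter_channels=768,
    n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
    resblock="1",
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2],
    upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=1, gin_channels=256,
).eval()

t = 20                                  # phoneme sequence length
x = torch.randint(0, 10, (1, t))        # phone ids (must stay below len(symbols))
x_lengths = torch.LongTensor([t])
sid = torch.LongTensor([0])             # speaker id
tone = torch.zeros(1, t, dtype=torch.long)
language = torch.zeros(1, t, dtype=torch.long)
zh_bert = torch.zeros(1, 1024, t)       # per-phone BERT features
ja_bert = torch.zeros(1, 1024, t)
en_bert = torch.zeros(1, 1024, t)

with torch.no_grad():
    audio, attn, y_mask, _ = net_g.infer(
        x, x_lengths, sid, tone, language, zh_bert, ja_bert, en_bert,
        noise_scale=0.667, length_scale=1.0, noise_scale_w=0.8, sdp_ratio=0.0,
    )
print(audio.shape)  # [1, 1, n_samples]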
bert_vits2/modules.py
ADDED
@@ -0,0 +1,459 @@
import copy
import math
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import functional as F

from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm

from bert_vits2 import commons
from bert_vits2.commons import init_weights, get_padding
from bert_vits2.transforms import piecewise_rational_quadratic_transform
from bert_vits2.attentions import Encoder

LRELU_SLOPE = 0.1


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


class ConvReluNorm(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask


class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                                            groups=channels, dilation=dilation, padding=padding
                                            ))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y
        return x * x_mask


class WN(torch.nn.Module):
    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert (kernel_size % 2 == 1)
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)


class ResBlock1(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Log(nn.Module):
    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ElementwiseAffine(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class ResidualCouplingLayer(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout,
                      gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x


class ConvFlow(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
        self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]

        unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = h[..., self.num_bins:2 * self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_derivatives = h[..., 2 * self.num_bins:]

        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
                                                               unnormalized_widths,
                                                               unnormalized_heights,
                                                               unnormalized_derivatives,
                                                               inverse=reverse,
                                                               tails='linear',
                                                               tail_bound=self.tail_bound
                                                               )

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if not reverse:
            return x, logdet
        else:
            return x


class TransformerCouplingLayer(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 n_layers,
                 n_heads,
                 p_dropout=0,
                 filter_channels=0,
                 mean_only=False,
                 wn_sharing_parameter=None,
                 gin_channels=0
                 ):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = Encoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, isflow=True,
                           gin_channels=gin_channels) if wn_sharing_parameter is None else wn_sharing_parameter
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x
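The coupling layers above are exact normalizing-flow steps: the forward direction computes x1' = m + x1 * exp(logs) on half the channels, and the reverse direction undoes it with x1 = (x1' - m) * exp(-logs), so forward followed by reverse must reproduce the input. A minimal sketch checking that round trip, with the import path assumed from this diff's layout:

import torch
from bert_vits2.modules import ResidualCouplingLayer  # path assumed from this diff's layout

torch.manual_seed(0)
layer = ResidualCouplingLayer(
    channels=4, hidden_channels=8, kernel_size=5,
    dilation_rate=1, n_layers=2, mean_only=True,
).eval()
# self.post is zero-initialized above, so perturb it to make the check non-trivial
layer.post.weight.data.normal_(0, 0.1)

x = torch.randn(1, 4, 10)
x_mask = torch.ones(1, 1, 10)

with torch.no_grad():
    y, logdet = layer(x, x_mask)            # forward pass of the flow
    x_rec = layer(y, x_mask, reverse=True)  # inverse pass
print(torch.allclose(x, x_rec, atol=1e-5))  # True: the coupling is exactly invertible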
bert_vits2/requirements.txt
ADDED
@@ -0,0 +1,15 @@
Cython
librosa==0.9.1
matplotlib==3.3.1
numpy
phonemizer
scipy
tensorboard
torch
torchvision
Unidecode
amfm_decompy
jieba
transformers
pypinyin
cn2an
bert_vits2/text/__init__.py
ADDED
@@ -0,0 +1,25 @@
from bert_vits2.text.symbols import *


def cleaned_text_to_sequence_v111(cleaned_text, tones, language, _symbol_to_id):
    """version <= 1.1.1"""
    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
    tone_start = language_tone_start_map_v111[language]
    tones = [i + tone_start for i in tones]
    lang_id = language_id_map[language]
    lang_ids = [lang_id for i in phones]
    return phones, tones, lang_ids


def cleaned_text_to_sequence(cleaned_text, tones, language, _symbol_to_id):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        cleaned_text: string of cleaned symbols to convert to a sequence
    Returns:
        List of integers corresponding to the symbols in the text
    """
    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
    tone_start = language_tone_start_map[language]
    tones = [i + tone_start for i in tones]
    lang_id = language_id_map[language]
    lang_ids = [lang_id for i in phones]
    return phones, tones, lang_ids
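A hedged example of driving cleaned_text_to_sequence above. The phone list and tone values are invented for illustration, and the symbol table and language maps are assumed to be provided by bert_vits2.text.symbols via the star import, as the code suggests:

# Illustrative only: phones/tones below are made up; maps come from the symbols module.
from bert_vits2.text.symbols import symbols
from bert_vits2.text import cleaned_text_to_sequence

_symbol_to_id = {s: i for i, s in enumerate(symbols)}
phones, tones, lang_ids = cleaned_text_to_sequence(
    ["_", "n", "i", "h", "ao", "_"],  # cleaned phoneme symbols (assumed to exist in symbols)
    [0, 3, 3, 3, 3, 0],               # per-phone tone indices
    "ZH",                             # language tag used by the maps
    _symbol_to_id,
)
# phones/tones/lang_ids are parallel integer lists ready for the TextEncoder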
bert_vits2/text/chinese.py
ADDED
@@ -0,0 +1,198 @@
import os
import re

import cn2an
from pypinyin import lazy_pinyin, Style

from bert_vits2.text.symbols import punctuation
from bert_vits2.text.tone_sandhi import ToneSandhi

current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in
                        open(os.path.join(current_file_path, 'opencpop-strict.txt')).readlines()}

import jieba.posseg as psg
from jieba import lcut

lcut("预加载")

rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}

tone_modifier = ToneSandhi()


def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile('|'.join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(r'[^\u4e00-\u9fa5' + "".join(punctuation) + r']+', '', replaced_text)

    return replaced_text


def g2p(text, **kwargs):
    pattern = r'(?<=[{0}])\s*'.format(''.join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != '']
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes it will crash; you can add a try-catch here.
    phones = ['_'] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph


def _get_initials_finals(word):
    initials = []
    finals = []
    orig_initials = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    for c, v in zip(orig_initials, orig_finals):
        initials.append(c)
        finals.append(v)
    return initials, finals


def _g2p(segments, **kwargs):
    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        pinyins = []
        # Replace all English words in the sentence
        seg = re.sub('[a-zA-Z]+', '', seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == 'eng':
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos,
                                                     sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)

            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                assert c in punctuation
                phone = [c]
                tone = '0'
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in '12345'

                if c:
                    # syllables with an initial
                    v_rep_map = {
                        "uei": 'ui',
                        'iou': 'iu',
                        'uen': 'un',
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # bare-final syllables
                    pinyin_rep_map = {
                        'ing': 'ying',
                        'i': 'yi',
                        'in': 'yin',
                        'u': 'wu',
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            'v': 'yu',
                            'e': 'e',
                            'i': 'y',
                            'u': 'w',
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(' ')
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph


def text_normalize(text):
    # numbers = re.findall(r'\d+(?:\.?\d+)?', text)
    # for number in numbers:
    #     text = text.replace(number, cn2an.an2cn(number), 1)
    text = cn2an.transform(text, "an2cn")
    text = replace_punctuation(text)
    return text


def get_bert_feature(text, word2ph):
    from bert_vits2.text import chinese_bert
    return chinese_bert.get_bert_feature(text, word2ph)


if __name__ == '__main__':
    text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏。"
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)

    print(phones, tones, word2ph)
    bert = get_bert_feature(text, word2ph)

    print(bert.shape)

    # # example usage
    # text = "这是一个示例文本:,你好!这是一个测试...."
    # print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试
bert_vits2/text/chinese_bert.py
ADDED
@@ -0,0 +1,59 @@
import torch

from contants import config


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt')
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res['hidden_states'][-3:-2], -1)[0].float().cpu()
        if style_text:
            style_inputs = tokenizer(style_text, return_tensors="pt")
            for i in style_inputs:
                style_inputs[i] = style_inputs[i].to(device)
            style_res = model(**style_inputs, output_hidden_states=True)
            style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
            style_res_mean = style_res.mean(0)

    assert len(word2ph) == len(text) + 2
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        if style_text:
            repeat_feature = (
                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
                + style_res_mean.repeat(word2phone[i], 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T


if __name__ == '__main__':
    word_level_feature = torch.rand(38, 1024)  # 38 tokens, a 1024-dim feature per token
    word2phone = [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2,
                  2, 2, 2, 1]

    # total number of frames
    total_frames = sum(word2phone)
    print(word_level_feature.shape)
    print(word2phone)
    phone_level_feature = []
    for i in range(len(word2phone)):
        print(word_level_feature[i].shape)

        # repeat each token's feature word2phone[i] times
        repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    print(phone_level_feature.shape)  # torch.Size([sum(word2phone), 1024])
bert_vits2/text/chinese_bert_extra.py
ADDED
@@ -0,0 +1,60 @@
import torch

from contants import config


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt')
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.nn.functional.normalize(torch.cat(res["hidden_states"][-3:-2], -1)[0], dim=0).float().cpu()
        if style_text:
            style_inputs = tokenizer(style_text, return_tensors="pt")
            for i in style_inputs:
                style_inputs[i] = style_inputs[i].to(device)
            style_res = model(**style_inputs, output_hidden_states=True)
            style_res = torch.nn.functional.normalize(
                torch.cat(style_res["hidden_states"][-3:-2], -1)[0], dim=0
            ).float().cpu()
            style_res_mean = style_res.mean(0)
    assert len(word2ph) == len(text) + 2
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        if style_text:
            repeat_feature = (
                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
                + style_res_mean.repeat(word2phone[i], 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T


if __name__ == '__main__':
    word_level_feature = torch.rand(38, 2048)  # 38 tokens, a 2048-dim feature per token
    word2phone = [1, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2,
                  2, 2, 2, 1]

    # total number of frames
    total_frames = sum(word2phone)
    print(word_level_feature.shape)
    print(word2phone)
    phone_level_feature = []
    for i in range(len(word2phone)):
        print(word_level_feature[i].shape)

        # repeat each token's feature word2phone[i] times
        repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)
    print(phone_level_feature.shape)  # torch.Size([sum(word2phone), 2048])
bert_vits2/text/chinese_v100.py
ADDED
@@ -0,0 +1,197 @@
import os
import re

import cn2an
from pypinyin import lazy_pinyin, Style

from bert_vits2.text.symbols import punctuation
from bert_vits2.text.tone_sandhi import ToneSandhi

current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {line.split("\t")[0]: line.strip().split("\t")[1] for line in
                        open(os.path.join(current_file_path, 'opencpop-strict.txt')).readlines()}

import jieba.posseg as psg
from jieba import lcut

lcut("预加载")  # warm up jieba's dictionary (the literal means "preload")

rep_map = {
    ':': ',', ';': ',', ',': ',', '。': '.', '!': '!', '?': '?', '\n': '.',
    '·': ',', '、': ',', '...': '…', '$': '.', '“': "'", '”': "'", '‘': "'",
    '’': "'", '(': "'", ')': "'", '(': "'", ')': "'", '《': "'", '》': "'",
    '【': "'", '】': "'", '[': "'", ']': "'", '—': '-', '~': '-', '~': '-',
    '「': "'", '」': "'",
}

tone_modifier = ToneSandhi()


def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile('|'.join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(r'[^\u4e00-\u9fa5' + "".join(punctuation) + r']+', '', replaced_text)

    return replaced_text


def g2p(text, **kwargs):
    pattern = r'(?<=[{0}])\s*'.format(''.join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != '']
    phones, tones, word2ph = _g2p(sentences)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes this crashes; you can add a try-catch.
    phones = ['_'] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph


def _get_initials_finals(word):
    initials = []
    finals = []
    orig_initials = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.INITIALS)
    orig_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    for c, v in zip(orig_initials, orig_finals):
        initials.append(c)
        finals.append(v)
    return initials, finals


def _g2p(segments):
    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        pinyins = []
        # Replace all English words in the sentence
        seg = re.sub('[a-zA-Z]+', '', seg)
        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        for word, pos in seg_cut:
            if pos == 'eng':
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos,
                                                     sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)

            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])
        #
        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                assert c in punctuation
                phone = [c]
                tone = '0'
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in '12345'

                if c:
                    # syllable with an initial consonant
                    v_rep_map = {
                        "uei": 'ui',
                        'iou': 'iu',
                        'uen': 'un',
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # bare final (no initial)
                    pinyin_rep_map = {
                        'ing': 'ying',
                        'i': 'yi',
                        'in': 'yin',
                        'u': 'wu',
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            'v': 'yu',
                            'e': 'e',
                            'i': 'y',
                            'u': 'w',
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(' ')
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph


def text_normalize(text):
    # numbers = re.findall(r'\d+(?:\.?\d+)?', text)
    # for number in numbers:
    #     text = text.replace(number, cn2an.an2cn(number), 1)
    text = cn2an.transform(text, "an2cn")
    text = replace_punctuation(text)
    return text


def get_bert_feature(text, word2ph):
    from bert_vits2.text import chinese_bert
    return chinese_bert.get_bert_feature(text, word2ph)


if __name__ == '__main__':
    text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏。"
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)

    print(phones, tones, word2ph)
    bert = get_bert_feature(text, word2ph)

    print(bert.shape)

    # # Example usage
    # text = "这是一个示例文本:,你好!这是一个测试...."
    # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
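The two asserts in g2p encode the alignment contract the rest of the pipeline relies on. A minimal illustration with hand-written values (not calling the module itself, which needs jieba and the opencpop table; the phone labels are hypothetical):

phones = ['_', 'n', 'i', 'h', 'ao', '_']   # padded phone sequence for "你好"
tones = [0, 3, 3, 3, 3, 0]                 # one tone per phone
word2ph = [1, 2, 2, 1]                     # one entry per character plus the two '_' pads

assert sum(word2ph) == len(phones) == len(tones)  # every phone is owned by one character
assert len(word2ph) == len("你好") + 2             # matches the normalized text length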
bert_vits2/text/chinese_v240.py
ADDED
@@ -0,0 +1,211 @@
import os
import re

from pypinyin import Style
from bert_vits2.text.symbols import punctuation
from bert_vits2.text.tone_sandhi import ToneSandhi

import cn2an

normalizer = lambda x: cn2an.transform(x, "an2cn")

current_file_path = os.path.dirname(__file__)
pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
}

import jieba.posseg as psg

rep_map = {
    ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".",
    "·": ",", "、": ",", "...": "…", "$": ".", "“": "'", "”": "'", '"': "'",
    "‘": "'", "’": "'", "(": "'", ")": "'", "(": "'", ")": "'", "《": "'",
    "》": "'", "【": "'", "】": "'", "[": "'", "]": "'", "—": "-", "~": "-",
    "~": "-", "「": "'", "」": "'",
}

tone_modifier = ToneSandhi()


def replace_punctuation(text):
    text = text.replace("嗯", "恩").replace("呣", "母")
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
    )

    return replaced_text


def g2p(text, pinyinPlus=None, **kwargs):
    pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
    sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
    phones, tones, word2ph = _g2p(sentences, pinyinPlus)
    assert sum(word2ph) == len(phones)
    assert len(word2ph) == len(text)  # Sometimes this crashes; you can add a try-catch.
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    return phones, tones, word2ph


def _get_initials_finalsV2(word, orig_initials, orig_finals):
    initials = []
    finals = []
    for c, v in zip(orig_initials, orig_finals):
        initials.append(c)
        finals.append(v)
    return initials, finals


def _g2p(segments, pinyinPlus, **kwargs):
    phones_list = []
    tones_list = []
    word2ph = []
    for seg in segments:
        # Replace all English words in the sentence

        seg = re.sub("[a-zA-Z]+", "", seg)

        seg_cut = psg.lcut(seg)
        initials = []
        finals = []
        seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
        allWords = ""
        for word, pos in seg_cut:
            allWords = allWords + word

        orig_initials = pinyinPlus.lazy_pinyin(
            allWords, neutral_tone_with_five=True, style=Style.INITIALS
        )
        orig_finals = pinyinPlus.lazy_pinyin(
            allWords, neutral_tone_with_five=True, style=Style.FINALS_TONE3
        )
        currentIndex = 0
        for word, pos in seg_cut:
            curr_orig_initials = orig_initials[currentIndex: currentIndex + len(word)]
            curr_orig_finalss = orig_finals[currentIndex: currentIndex + len(word)]
            currentIndex = currentIndex + len(word)
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finalsV2(
                word, curr_orig_initials, curr_orig_finalss
            )
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            initials.append(sub_initials)
            finals.append(sub_finals)

            # assert len(sub_initials) == len(sub_finals) == len(word)
        initials = sum(initials, [])
        finals = sum(finals, [])
        #
        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                assert c in punctuation
                phone = [c]
                tone = "0"
                word2ph.append(1)
            else:
                v_without_tone = v[:-1]
                tone = v[-1]

                pinyin = c + v_without_tone
                assert tone in "12345"

                if c:
                    # syllable with an initial consonant
                    v_rep_map = {
                        "uei": "ui",
                        "iou": "iu",
                        "uen": "un",
                    }
                    if v_without_tone in v_rep_map.keys():
                        pinyin = c + v_rep_map[v_without_tone]
                else:
                    # bare final (no initial)
                    pinyin_rep_map = {
                        "ing": "ying",
                        "i": "yi",
                        "in": "yin",
                        "u": "wu",
                    }
                    if pinyin in pinyin_rep_map.keys():
                        pinyin = pinyin_rep_map[pinyin]
                    else:
                        single_rep_map = {
                            "v": "yu",
                            "e": "e",
                            "i": "y",
                            "u": "w",
                        }
                        if pinyin[0] in single_rep_map.keys():
                            pinyin = single_rep_map[pinyin[0]] + pinyin[1:]

                assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
                phone = pinyin_to_symbol_map[pinyin].split(" ")
                word2ph.append(len(phone))

            phones_list += phone
            tones_list += [int(tone)] * len(phone)
    return phones_list, tones_list, word2ph


def text_normalize(text):
    text = normalizer(text)
    text = replace_punctuation(text)
    return text


def get_bert_feature(text, word2ph):
    from bert_vits2.text import chinese_bert_extra as chinese_bert

    return chinese_bert.get_bert_feature(text, word2ph)


if __name__ == "__main__":
    from bert_vits2.text.chinese_bert import get_bert_feature

    # NOTE: running this demo requires passing a pinyinPlus instance to g2p.
    text = "欸,这个「勾玉」的形状,是不是和那边门上的凹槽很像?"
    text = text_normalize(text)
    print(text)
    phones, tones, word2ph = g2p(text)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)

    # # Example usage
    # text = "这是一个示例文本:,你好!这是一个测试...."
    # print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试
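The nested replacement tables above normalize pypinyin output into the opencpop-strict key space. The same rewrite logic extracted as a standalone helper for clarity (a sketch that mirrors, but does not import, the module):

def normalize_pinyin(c: str, v_without_tone: str) -> str:
    """Mirror the initial/final rewrites applied before the opencpop lookup."""
    if c:  # syllable has an initial consonant
        v_rep = {"uei": "ui", "iou": "iu", "uen": "un"}
        return c + v_rep.get(v_without_tone, v_without_tone)
    pinyin = v_without_tone  # bare final: rewrite to its standalone spelling
    full_rep = {"ing": "ying", "i": "yi", "in": "yin", "u": "wu"}
    if pinyin in full_rep:
        return full_rep[pinyin]
    single_rep = {"v": "yu", "e": "e", "i": "y", "u": "w"}
    if pinyin[0] in single_rep:
        return single_rep[pinyin[0]] + pinyin[1:]
    return pinyin

assert normalize_pinyin("h", "uei") == "hui"
assert normalize_pinyin("", "ing") == "ying"
assert normalize_pinyin("", "uan") == "wan"  # leading 'u' becomes 'w'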
bert_vits2/text/cleaner.py
ADDED
@@ -0,0 +1,53 @@
from bert_vits2.text import chinese, japanese, english, cleaned_text_to_sequence, japanese_v111, chinese_v100, \
    japanese_v200, english_v200, english_v230, chinese_v240, japanese_extra

language_module_map = {
    'zh': chinese,
    'ja': japanese,
    'en': english,
    'ja_v111': japanese_v111,
    'zh_v100': chinese_v100,
    'ja_v200': japanese_v200,
    'en_v200': english_v200,
    'en_v230': english_v230,
    'zh_v240': chinese_v240,
    'ja_extra': japanese_extra,
}


# _loaded_modules = {}
#
#
# def get_language_module(language):
#     if language not in _loaded_modules:
#         module_path = language_module_map.get(language)
#         if not module_path:
#             raise ValueError(f"Unsupported language: {language}")
#
#         _loaded_modules[language] = importlib.import_module(module_path)
#
#     return _loaded_modules[language]


def clean_text(text, language, tokenizer, pinyinPlus=None):
    language_module = language_module_map[language]
    norm_text = language_module.text_normalize(text)
    phones, tones, word2ph = language_module.g2p(norm_text, tokenizer=tokenizer, pinyinPlus=pinyinPlus)
    return norm_text, phones, tones, word2ph


# def clean_text_bert(text, language, tokenizer):
#     language_module = language_module_map[language]
#     norm_text = language_module.text_normalize(text)
#     phones, tones, word2ph = language_module.g2p(norm_text, tokenizer)
#     bert = language_module.get_bert_feature(norm_text, word2ph)
#     return phones, tones, bert


def text_to_sequence(text, language, tokenizer):
    norm_text, phones, tones, word2ph = clean_text(text, language, tokenizer)
    return cleaned_text_to_sequence(phones, tones, language)


if __name__ == '__main__':
    pass
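A hedged usage sketch of this dispatch layer (the tokenizer must match the BERT model configured elsewhere in the repo; None works for g2p implementations that ignore the argument, such as 'zh_v100' here):

from bert_vits2.text.cleaner import clean_text

norm_text, phones, tones, word2ph = clean_text("今天天气真好", "zh_v100", tokenizer=None)
print(norm_text)
print(phones[:6], tones[:6], word2ph[:6])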
bert_vits2/text/cmudict.rep
ADDED
The diff for this file is too large to render; see the raw diff.
bert_vits2/text/cmudict_cache.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
size 6212655
bert_vits2/text/english.py
ADDED
@@ -0,0 +1,449 @@
import pickle
import os
import re

import inflect
from g2p_en import G2p

from bert_vits2.text import symbols

current_file_path = os.path.dirname(__file__)
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
_g2p = G2p()

arpa = {
    "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B", "G", "AY0",
    "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2", "DH", "IY0", "EY1", "IH0",
    "K", "N", "W", "IY2", "T", "AA1", "ER1", "EH2", "OY0", "UH2", "UW1", "Z",
    "AW2", "AW1", "V", "UW2", "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH",
    "AE0", "IH2", "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
    "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L", "SH",
}


def post_replace_ph(ph):
    rep_map = {
        ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".",
        "·": ",", "、": ",", "…": "...", "···": "...", "・・・": "...", "v": "V",
    }
    if ph in rep_map.keys():
        ph = rep_map[ph]
    if ph in symbols:
        return ph
    if ph not in symbols:
        ph = "UNK"
    return ph


rep_map = {
    ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".",
    ".": ".", "…": "...", "···": "...", "・・・": "...", "·": ",", "・": ",",
    "、": ",", "$": ".", "“": "'", "”": "'", '"': "'", "‘": "'", "’": "'",
    "(": "'", ")": "'", "(": "'", ")": "'", "《": "'", "》": "'", "【": "'",
    "】": "'", "[": "'", "]": "'", "—": "-", "−": "-", "~": "-", "~": "-",
    "「": "'", "」": "'",
}


def replace_punctuation(text):
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    # replaced_text = re.sub(
    #     r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
    #     + "".join(punctuation)
    #     + r"]+",
    #     "",
    #     replaced_text,
    # )

    return replaced_text


def read_dict():
    g2p_dict = {}
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= start_line:
                line = line.strip()
                word_split = line.split("  ")  # entries are "WORD<two spaces>SYLLABLES"
                word = word_split[0]

                syllable_split = word_split[1].split(" - ")
                g2p_dict[word] = []
                for syllable in syllable_split:
                    phone_split = syllable.split(" ")
                    g2p_dict[word].append(phone_split)

            line_index = line_index + 1
            line = f.readline()

    return g2p_dict


def cache_dict(g2p_dict, file_path):
    with open(file_path, "wb") as pickle_file:
        pickle.dump(g2p_dict, pickle_file)


def get_dict():
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH, "rb") as pickle_file:
            g2p_dict = pickle.load(pickle_file)
    else:
        g2p_dict = read_dict()
        cache_dict(g2p_dict, CACHE_PATH)

    return g2p_dict


eng_dict = get_dict()


def refine_ph(phn):
    tone = 0
    if re.search(r"\d$", phn):
        tone = int(phn[-1]) + 1
        phn = phn[:-1]
    return phn.lower(), tone


def refine_syllables(syllables):
    tones = []
    phonemes = []
    for phn_list in syllables:
        for i in range(len(phn_list)):
            phn = phn_list[i]
            phn, tone = refine_ph(phn)
            phonemes.append(phn)
            tones.append(tone)
    return phonemes, tones


_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [
    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
    for x in [
        ("mrs", "misess"), ("mr", "mister"), ("dr", "doctor"), ("st", "saint"),
        ("co", "company"), ("jr", "junior"), ("maj", "major"), ("gen", "general"),
        ("drs", "doctors"), ("rev", "reverend"), ("lt", "lieutenant"),
        ("hon", "honorable"), ("sgt", "sergeant"), ("capt", "captain"),
        ("esq", "esquire"), ("ltd", "limited"), ("col", "colonel"), ("ft", "fort"),
    ]
]

# List of (ipa, lazy ipa) pairs:
_lazy_ipa = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("r", "ɹ"), ("æ", "e"), ("ɑ", "a"), ("ɔ", "o"), ("ð", "z"), ("θ", "s"),
        ("ɛ", "e"), ("ɪ", "i"), ("ʊ", "u"), ("ʒ", "ʥ"), ("ʤ", "ʥ"), ("ˈ", "↓"),
    ]
]

# List of (ipa, lazy ipa2) pairs:
_lazy_ipa2 = [
    (re.compile("%s" % x[0]), x[1])
    for x in [("r", "ɹ"), ("ð", "z"), ("θ", "s"), ("ʒ", "ʑ"), ("ʤ", "dʑ"), ("ˈ", "↓")]
]

# List of (ipa, ipa2) pairs
_ipa_to_ipa2 = [
    (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
]


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(
                num, andword="", zero="oh", group=2
            ).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text


def text_normalize(text):
    text = normalize_numbers(text)
    text = replace_punctuation(text)
    text = re.sub(r"([,;.\?\!])([\w])", r"\1 \2", text)
    return text


def distribute_phone(n_phone, n_word):
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word


def sep_text(text):
    words = re.split(r"([,;.\?\!\s+])", text)
    words = [word for word in words if word.strip() != ""]
    return words


def g2p(text, tokenizer, **kwargs):
    phones = []
    tones = []
    # word2ph = []
    words = sep_text(text)
    tokens = [tokenizer.tokenize(i) for i in words]
    for word in words:
        if word.upper() in eng_dict:
            phns, tns = refine_syllables(eng_dict[word.upper()])
            phones.append([post_replace_ph(i) for i in phns])
            tones.append(tns)
            # word2ph.append(len(phns))
        else:
            phone_list = list(filter(lambda p: p != " ", _g2p(word)))
            phns = []
            tns = []
            for ph in phone_list:
                if ph in arpa:
                    ph, tn = refine_ph(ph)
                    phns.append(ph)
                    tns.append(tn)
                else:
                    phns.append(ph)
                    tns.append(0)
            phones.append([post_replace_ph(i) for i in phns])
            tones.append(tns)
            # word2ph.append(len(phns))
    # phones = [post_replace_ph(i) for i in phones]

    word2ph = []
    for token, phoneme in zip(tokens, phones):
        phone_len = len(phoneme)
        word_len = len(token)

        aaa = distribute_phone(phone_len, word_len)
        word2ph += aaa

    phones = ["_"] + [j for i in phones for j in i] + ["_"]
    tones = [0] + [j for i in tones for j in i] + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph


def get_bert_feature(text, word2ph):
    from bert_vits2.text import english_bert_mock

    return english_bert_mock.get_bert_feature(text, word2ph)


if __name__ == "__main__":
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    # NOTE: this demo predates the `tokenizer` argument and will not run as-is.
    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
    # all_phones = set()
    # for k, syllables in eng_dict.items():
    #     for group in syllables:
    #         for ph in group:
    #             all_phones.add(ph)
    # print(all_phones)
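distribute_phone balances a word's phones across its sub-word tokens greedily, always topping up the currently least-loaded token. Reproduced standalone here so the behavior is easy to verify:

def distribute_phone(n_phone, n_word):
    # Greedy balancing: each phone goes to the token with the fewest so far.
    phones_per_word = [0] * n_word
    for _ in range(n_phone):
        min_index = phones_per_word.index(min(phones_per_word))
        phones_per_word[min_index] += 1
    return phones_per_word

print(distribute_phone(7, 3))  # [3, 2, 2]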
bert_vits2/text/english_bert_mock.py
ADDED
@@ -0,0 +1,36 @@
import torch

from contants import config


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
        if style_text:
            style_inputs = tokenizer(style_text, return_tensors="pt")
            for i in style_inputs:
                style_inputs[i] = style_inputs[i].to(device)
            style_res = model(**style_inputs, output_hidden_states=True)
            style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
            style_res_mean = style_res.mean(0)
    assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        if style_text:
            repeat_feature = (
                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
                + style_res_mean.repeat(word2phone[i], 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T
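With style_text set, each phone vector is a convex combination of the sentence's own hidden state and the mean style embedding. The arithmetic in isolation, with toy tensors:

import torch

style_weight = 0.7
res_i = torch.ones(4)            # hidden state of one token (toy 4-dim)
style_res_mean = torch.zeros(4)  # mean hidden state of the style prompt

blended = res_i * (1 - style_weight) + style_res_mean * style_weight
print(blended)  # tensor([0.3000, 0.3000, 0.3000, 0.3000])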
bert_vits2/text/english_bert_mock_v200.py
ADDED
@@ -0,0 +1,22 @@
import torch

from contants import config


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, **kwargs):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
    # assert len(word2ph) == len(text)+2
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T
bert_vits2/text/english_v200.py
ADDED
@@ -0,0 +1,360 @@
import pickle
import os
import re

import inflect
from g2p_en import G2p

from bert_vits2.text import symbols

current_file_path = os.path.dirname(__file__)
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
_g2p = G2p()

arpa = {
    "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B", "G", "AY0",
    "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2", "DH", "IY0", "EY1", "IH0",
    "K", "N", "W", "IY2", "T", "AA1", "ER1", "EH2", "OY0", "UH2", "UW1", "Z",
    "AW2", "AW1", "V", "UW2", "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH",
    "AE0", "IH2", "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
    "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L", "SH",
}


def post_replace_ph(ph):
    rep_map = {
        ":": ",", ";": ",", ",": ",", "。": ".", "!": "!", "?": "?", "\n": ".",
        "·": ",", "、": ",", "...": "…", "v": "V",
    }
    if ph in rep_map.keys():
        ph = rep_map[ph]
    if ph in symbols:
        return ph
    if ph not in symbols:
        ph = "UNK"
    return ph


def read_dict():
    g2p_dict = {}
    start_line = 49
    with open(CMU_DICT_PATH) as f:
        line = f.readline()
        line_index = 1
        while line:
            if line_index >= start_line:
                line = line.strip()
                word_split = line.split("  ")  # entries are "WORD<two spaces>SYLLABLES"
                word = word_split[0]

                syllable_split = word_split[1].split(" - ")
                g2p_dict[word] = []
                for syllable in syllable_split:
                    phone_split = syllable.split(" ")
                    g2p_dict[word].append(phone_split)

            line_index = line_index + 1
            line = f.readline()

    return g2p_dict


def cache_dict(g2p_dict, file_path):
    with open(file_path, "wb") as pickle_file:
        pickle.dump(g2p_dict, pickle_file)


def get_dict():
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH, "rb") as pickle_file:
            g2p_dict = pickle.load(pickle_file)
    else:
        g2p_dict = read_dict()
        cache_dict(g2p_dict, CACHE_PATH)

    return g2p_dict


eng_dict = get_dict()


def refine_ph(phn):
    tone = 0
    if re.search(r"\d$", phn):
        tone = int(phn[-1]) + 1
        phn = phn[:-1]
    return phn.lower(), tone


def refine_syllables(syllables):
    tones = []
    phonemes = []
    for phn_list in syllables:
        for i in range(len(phn_list)):
            phn = phn_list[i]
            phn, tone = refine_ph(phn)
            phonemes.append(phn)
            tones.append(tone)
    return phonemes, tones


_inflect = inflect.engine()
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
_number_re = re.compile(r"[0-9]+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [
    (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
    for x in [
        ("mrs", "misess"), ("mr", "mister"), ("dr", "doctor"), ("st", "saint"),
        ("co", "company"), ("jr", "junior"), ("maj", "major"), ("gen", "general"),
        ("drs", "doctors"), ("rev", "reverend"), ("lt", "lieutenant"),
        ("hon", "honorable"), ("sgt", "sergeant"), ("capt", "captain"),
        ("esq", "esquire"), ("ltd", "limited"), ("col", "colonel"), ("ft", "fort"),
    ]
]

# List of (ipa, lazy ipa) pairs:
_lazy_ipa = [
    (re.compile("%s" % x[0]), x[1])
    for x in [
        ("r", "ɹ"), ("æ", "e"), ("ɑ", "a"), ("ɔ", "o"), ("ð", "z"), ("θ", "s"),
        ("ɛ", "e"), ("ɪ", "i"), ("ʊ", "u"), ("ʒ", "ʥ"), ("ʤ", "ʥ"), ("ˈ", "↓"),
    ]
]

# List of (ipa, lazy ipa2) pairs:
_lazy_ipa2 = [
    (re.compile("%s" % x[0]), x[1])
    for x in [("r", "ɹ"), ("ð", "z"), ("θ", "s"), ("ʒ", "ʑ"), ("ʤ", "dʑ"), ("ˈ", "↓")]
]

# List of (ipa, ipa2) pairs
_ipa_to_ipa2 = [
    (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
]


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split(".")
    if len(parts) > 2:
        return match + " dollars"  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = "dollar" if dollars == 1 else "dollars"
        return "%s %s" % (dollars, dollar_unit)
    elif cents:
        cent_unit = "cent" if cents == 1 else "cents"
        return "%s %s" % (cents, cent_unit)
    else:
        return "zero dollars"


def _remove_commas(m):
    return m.group(1).replace(",", "")


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return "two thousand"
        elif num > 2000 and num < 2010:
            return "two thousand " + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + " hundred"
        else:
            return _inflect.number_to_words(
                num, andword="", zero="oh", group=2
            ).replace(", ", " ")
    else:
        return _inflect.number_to_words(num, andword="")


def _expand_decimal_point(m):
    return m.group(1).replace(".", " point ")


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r"\1 pounds", text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text


def text_normalize(text):
    text = normalize_numbers(text)
    return text


def g2p(text, **kwargs):
    phones = []
    tones = []
    word2ph = []
    words = re.split(r"([,;.\-\?\!\s+])", text)
    words = [word for word in words if word.strip() != ""]
    for word in words:
        if word.upper() in eng_dict:
            phns, tns = refine_syllables(eng_dict[word.upper()])
            phones += phns
            tones += tns
            word2ph.append(len(phns))
        else:
            phone_list = list(filter(lambda p: p != " ", _g2p(word)))
            for ph in phone_list:
                if ph in arpa:
                    ph, tn = refine_ph(ph)
                    phones.append(ph)
                    tones.append(tn)
                else:
                    phones.append(ph)
                    tones.append(0)
            word2ph.append(len(phone_list))

    phones = [post_replace_ph(i) for i in phones]

    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]

    return phones, tones, word2ph


def get_bert_feature(text, word2ph):
    from bert_vits2.text import english_bert_mock

    return english_bert_mock.get_bert_feature(text, word2ph)


if __name__ == "__main__":
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
    # all_phones = set()
    # for k, syllables in eng_dict.items():
    #     for group in syllables:
    #         for ph in group:
    #             all_phones.add(ph)
    # print(all_phones)
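For reference, the expected behavior of the inflect-based normalize_numbers defined above (doctest-style; amounts are expanded before the bare-digit rule runs, so currency comes out fully in words):

# >>> normalize_numbers("I have $3.50")
# 'I have three dollars, fifty cents'
# >>> normalize_numbers("in 2009")
# 'in two thousand nine'
# >>> normalize_numbers("the 3rd try")
# 'the third try'
# >>> normalize_numbers("1,000 items")
# 'one thousand items'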
bert_vits2/text/english_v230.py
ADDED
@@ -0,0 +1,493 @@
1 |
+
import pickle
|
2 |
+
import os
|
3 |
+
from g2p_en import G2p
|
4 |
+
from transformers import DebertaV2Tokenizer
|
5 |
+
|
6 |
+
from bert_vits2.text import symbols
|
7 |
+
from bert_vits2.text.symbols import punctuation
|
8 |
+
|
9 |
+
current_file_path = os.path.dirname(__file__)
|
10 |
+
CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
|
11 |
+
CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
|
12 |
+
_g2p = G2p()
|
13 |
+
LOCAL_PATH = "./bert/deberta-v3-large"
|
14 |
+
# tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
|
15 |
+
|
16 |
+
arpa = {
|
17 |
+
"AH0",
|
18 |
+
"S",
|
19 |
+
"AH1",
|
20 |
+
"EY2",
|
21 |
+
"AE2",
|
22 |
+
"EH0",
|
23 |
+
"OW2",
|
24 |
+
"UH0",
|
25 |
+
"NG",
|
26 |
+
"B",
|
27 |
+
"G",
|
28 |
+
"AY0",
|
29 |
+
"M",
|
30 |
+
"AA0",
|
31 |
+
"F",
|
32 |
+
"AO0",
|
33 |
+
"ER2",
|
34 |
+
"UH1",
|
35 |
+
"IY1",
|
36 |
+
"AH2",
|
37 |
+
"DH",
|
38 |
+
"IY0",
|
39 |
+
"EY1",
|
40 |
+
"IH0",
|
41 |
+
"K",
|
42 |
+
"N",
|
43 |
+
"W",
|
44 |
+
"IY2",
|
45 |
+
"T",
|
46 |
+
"AA1",
|
47 |
+
"ER1",
|
48 |
+
"EH2",
|
49 |
+
"OY0",
|
50 |
+
"UH2",
|
51 |
+
"UW1",
|
52 |
+
"Z",
|
53 |
+
"AW2",
|
54 |
+
"AW1",
|
55 |
+
"V",
|
56 |
+
"UW2",
|
57 |
+
"AA2",
|
58 |
+
"ER",
|
59 |
+
"AW0",
|
60 |
+
"UW0",
|
61 |
+
"R",
|
62 |
+
"OW1",
|
63 |
+
"EH1",
|
64 |
+
"ZH",
|
65 |
+
"AE0",
|
66 |
+
"IH2",
|
67 |
+
"IH",
|
68 |
+
"Y",
|
69 |
+
"JH",
|
70 |
+
"P",
|
71 |
+
"AY1",
|
72 |
+
"EY0",
|
73 |
+
"OY2",
|
74 |
+
"TH",
|
75 |
+
"HH",
|
76 |
+
"D",
|
77 |
+
"ER0",
|
78 |
+
"CH",
|
79 |
+
"AO1",
|
80 |
+
"AE1",
|
81 |
+
"AO2",
|
82 |
+
"OY1",
|
83 |
+
"AY2",
|
84 |
+
"IH1",
|
85 |
+
"OW0",
|
86 |
+
"L",
|
87 |
+
"SH",
|
88 |
+
}
|
89 |
+
|
90 |
+
|
91 |
+
def post_replace_ph(ph):
|
92 |
+
rep_map = {
|
93 |
+
":": ",",
|
94 |
+
";": ",",
|
95 |
+
",": ",",
|
96 |
+
"。": ".",
|
97 |
+
"!": "!",
|
98 |
+
"?": "?",
|
99 |
+
"\n": ".",
|
100 |
+
"·": ",",
|
101 |
+
"、": ",",
|
102 |
+
"…": "...",
|
103 |
+
"···": "...",
|
104 |
+
"・・・": "...",
|
105 |
+
"v": "V",
|
106 |
+
}
|
107 |
+
if ph in rep_map.keys():
|
108 |
+
ph = rep_map[ph]
|
109 |
+
if ph in symbols:
|
110 |
+
return ph
|
111 |
+
if ph not in symbols:
|
112 |
+
ph = "UNK"
|
113 |
+
return ph
|
114 |
+
|
115 |
+
|
116 |
+
rep_map = {
|
117 |
+
":": ",",
|
118 |
+
";": ",",
|
119 |
+
",": ",",
|
120 |
+
"。": ".",
|
121 |
+
"!": "!",
|
122 |
+
"?": "?",
|
123 |
+
"\n": ".",
|
124 |
+
".": ".",
|
125 |
+
"…": "...",
|
126 |
+
"···": "...",
|
127 |
+
"・・・": "...",
|
128 |
+
"·": ",",
|
129 |
+
"・": ",",
|
130 |
+
"、": ",",
|
131 |
+
"$": ".",
|
132 |
+
"“": "'",
|
133 |
+
"”": "'",
|
134 |
+
'"': "'",
|
135 |
+
"‘": "'",
|
136 |
+
"’": "'",
|
137 |
+
"(": "'",
|
138 |
+
")": "'",
|
139 |
+
"(": "'",
|
140 |
+
")": "'",
|
141 |
+
"《": "'",
|
142 |
+
"》": "'",
|
143 |
+
"【": "'",
|
144 |
+
"】": "'",
|
145 |
+
"[": "'",
|
146 |
+
"]": "'",
|
147 |
+
"—": "-",
|
148 |
+
"−": "-",
|
149 |
+
"~": "-",
|
150 |
+
"~": "-",
|
151 |
+
"「": "'",
|
152 |
+
"」": "'",
|
153 |
+
}
|
154 |
+
|
155 |
+
|
156 |
+
def replace_punctuation(text):
|
157 |
+
pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
|
158 |
+
|
159 |
+
replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
|
160 |
+
|
161 |
+
# replaced_text = re.sub(
|
162 |
+
# r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
|
163 |
+
# + "".join(punctuation)
|
164 |
+
# + r"]+",
|
165 |
+
# "",
|
166 |
+
# replaced_text,
|
167 |
+
# )
|
168 |
+
|
169 |
+
return replaced_text
|
170 |
+
|
171 |
+
|
172 |
+
def read_dict():
|
173 |
+
g2p_dict = {}
|
174 |
+
start_line = 49
|
175 |
+
with open(CMU_DICT_PATH) as f:
|
176 |
+
line = f.readline()
|
177 |
+
line_index = 1
|
178 |
+
while line:
|
179 |
+
if line_index >= start_line:
|
180 |
+
line = line.strip()
|
181 |
+
word_split = line.split(" ")
|
182 |
+
word = word_split[0]
|
183 |
+
|
184 |
+
syllable_split = word_split[1].split(" - ")
|
185 |
+
g2p_dict[word] = []
|
186 |
+
for syllable in syllable_split:
|
187 |
+
phone_split = syllable.split(" ")
|
188 |
+
g2p_dict[word].append(phone_split)
|
189 |
+
|
190 |
+
line_index = line_index + 1
|
191 |
+
line = f.readline()
|
192 |
+
|
193 |
+
return g2p_dict
|
194 |
+
|
195 |
+
|
196 |
+
def cache_dict(g2p_dict, file_path):
|
197 |
+
with open(file_path, "wb") as pickle_file:
|
198 |
+
pickle.dump(g2p_dict, pickle_file)
|
199 |
+
|
200 |
+
|
201 |
+
def get_dict():
|
202 |
+
if os.path.exists(CACHE_PATH):
|
203 |
+
with open(CACHE_PATH, "rb") as pickle_file:
|
204 |
+
g2p_dict = pickle.load(pickle_file)
|
205 |
+
else:
|
206 |
+
g2p_dict = read_dict()
|
207 |
+
cache_dict(g2p_dict, CACHE_PATH)
|
208 |
+
|
209 |
+
return g2p_dict
|
210 |
+
|
211 |
+
|
212 |
+
eng_dict = get_dict()
|
213 |
+
|
214 |
+
|
215 |
+
def refine_ph(phn):
|
216 |
+
tone = 0
|
217 |
+
if re.search(r"\d$", phn):
|
218 |
+
tone = int(phn[-1]) + 1
|
219 |
+
phn = phn[:-1]
|
220 |
+
else:
|
221 |
+
tone = 3
|
222 |
+
return phn.lower(), tone
|
223 |
+
|
224 |
+
|
225 |
+
def refine_syllables(syllables):
|
226 |
+
tones = []
|
227 |
+
phonemes = []
|
228 |
+
for phn_list in syllables:
|
229 |
+
for i in range(len(phn_list)):
|
230 |
+
phn = phn_list[i]
|
231 |
+
phn, tone = refine_ph(phn)
|
232 |
+
phonemes.append(phn)
|
233 |
+
tones.append(tone)
|
234 |
+
return phonemes, tones
|
235 |
+
|
236 |
+
|
237 |
+
import re
|
238 |
+
import inflect
|
239 |
+
|
240 |
+
_inflect = inflect.engine()
|
241 |
+
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
|
242 |
+
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
|
243 |
+
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
|
244 |
+
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
|
245 |
+
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
|
246 |
+
_number_re = re.compile(r"[0-9]+")
|
247 |
+
|
248 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
249 |
+
_abbreviations = [
|
250 |
+
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
251 |
+
for x in [
|
252 |
+
("mrs", "misess"),
|
253 |
+
("mr", "mister"),
|
254 |
+
("dr", "doctor"),
|
255 |
+
("st", "saint"),
|
256 |
+
("co", "company"),
|
257 |
+
("jr", "junior"),
|
258 |
+
("maj", "major"),
|
259 |
+
("gen", "general"),
|
260 |
+
("drs", "doctors"),
|
261 |
+
("rev", "reverend"),
|
262 |
+
("lt", "lieutenant"),
|
263 |
+
("hon", "honorable"),
|
264 |
+
("sgt", "sergeant"),
|
265 |
+
("capt", "captain"),
|
266 |
+
("esq", "esquire"),
|
267 |
+
("ltd", "limited"),
|
268 |
+
("col", "colonel"),
|
269 |
+
("ft", "fort"),
|
270 |
+
]
|
271 |
+
]
|
272 |
+
|
273 |
+
# List of (ipa, lazy ipa) pairs:
|
274 |
+
_lazy_ipa = [
|
275 |
+
(re.compile("%s" % x[0]), x[1])
|
276 |
+
for x in [
|
277 |
+
("r", "ɹ"),
|
278 |
+
("æ", "e"),
|
279 |
+
("ɑ", "a"),
|
280 |
+
("ɔ", "o"),
|
281 |
+
("ð", "z"),
|
282 |
+
("θ", "s"),
|
283 |
+
("ɛ", "e"),
|
284 |
+
("ɪ", "i"),
|
285 |
+
("ʊ", "u"),
|
286 |
+
("ʒ", "ʥ"),
|
287 |
+
("ʤ", "ʥ"),
|
288 |
+
("ˈ", "↓"),
|
289 |
+
]
|
290 |
+
]
|
291 |
+
|
292 |
+
# List of (ipa, lazy ipa2) pairs:
|
293 |
+
_lazy_ipa2 = [
|
294 |
+
(re.compile("%s" % x[0]), x[1])
|
295 |
+
for x in [
|
296 |
+
("r", "ɹ"),
|
297 |
+
("ð", "z"),
|
298 |
+
("θ", "s"),
|
299 |
+
("ʒ", "ʑ"),
|
300 |
+
("ʤ", "dʑ"),
|
301 |
+
("ˈ", "↓"),
|
302 |
+
]
|
303 |
+
]
|
304 |
+
|
305 |
+
# List of (ipa, ipa2) pairs
|
306 |
+
_ipa_to_ipa2 = [
|
307 |
+
(re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
|
308 |
+
]
|
309 |
+
|
310 |
+
|
311 |
+
def _expand_dollars(m):
|
312 |
+
match = m.group(1)
|
313 |
+
parts = match.split(".")
|
314 |
+
if len(parts) > 2:
|
315 |
+
return match + " dollars" # Unexpected format
|
316 |
+
dollars = int(parts[0]) if parts[0] else 0
|
317 |
+
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
318 |
+
if dollars and cents:
|
319 |
+
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
320 |
+
cent_unit = "cent" if cents == 1 else "cents"
|
321 |
+
return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
|
322 |
+
elif dollars:
|
323 |
+
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
324 |
+
return "%s %s" % (dollars, dollar_unit)
|
325 |
+
elif cents:
|
326 |
+
cent_unit = "cent" if cents == 1 else "cents"
|
327 |
+
return "%s %s" % (cents, cent_unit)
|
328 |
+
else:
|
329 |
+
return "zero dollars"
|
330 |
+
|
331 |
+
|
332 |
+
def _remove_commas(m):
|
333 |
+
return m.group(1).replace(",", "")
|
334 |
+
|
335 |
+
|
336 |
+
def _expand_ordinal(m):
|
337 |
+
return _inflect.number_to_words(m.group(0))
|
338 |
+
|
339 |
+
|
340 |
+
def _expand_number(m):
|
341 |
+
num = int(m.group(0))
|
342 |
+
if num > 1000 and num < 3000:
|
343 |
+
if num == 2000:
|
344 |
+
return "two thousand"
|
345 |
+
elif num > 2000 and num < 2010:
|
346 |
+
return "two thousand " + _inflect.number_to_words(num % 100)
|
347 |
+
elif num % 100 == 0:
|
348 |
+
return _inflect.number_to_words(num // 100) + " hundred"
|
349 |
+
else:
|
350 |
+
return _inflect.number_to_words(
|
351 |
+
num, andword="", zero="oh", group=2
|
352 |
+
).replace(", ", " ")
|
353 |
+
else:
|
354 |
+
return _inflect.number_to_words(num, andword="")
|
355 |
+
|
356 |
+
|
357 |
+
def _expand_decimal_point(m):
|
358 |
+
return m.group(1).replace(".", " point ")
|
359 |
+
|
360 |
+
|
361 |
+
def normalize_numbers(text):
|
362 |
+
text = re.sub(_comma_number_re, _remove_commas, text)
|
363 |
+
text = re.sub(_pounds_re, r"\1 pounds", text)
|
364 |
+
text = re.sub(_dollars_re, _expand_dollars, text)
|
365 |
+
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
366 |
+
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
367 |
+
text = re.sub(_number_re, _expand_number, text)
|
368 |
+
return text
|
369 |
+
|
370 |
+
|
371 |
+
def text_normalize(text):
|
372 |
+
text = normalize_numbers(text)
|
373 |
+
text = replace_punctuation(text)
|
374 |
+
text = re.sub(r"([,;.\?\!])([\w])", r"\1 \2", text)
|
375 |
+
return text
|
376 |
+
|
377 |
+
|
378 |
+
def distribute_phone(n_phone, n_word):
|
379 |
+
phones_per_word = [0] * n_word
|
380 |
+
for task in range(n_phone):
|
381 |
+
min_tasks = min(phones_per_word)
|
382 |
+
min_index = phones_per_word.index(min_tasks)
|
383 |
+
phones_per_word[min_index] += 1
|
384 |
+
return phones_per_word
|


def sep_text(text):
    words = re.split(r"([,;.\?\!\s+])", text)
    words = [word for word in words if word.strip() != ""]
    return words


def text_to_words(text, tokenizer):
    tokens = tokenizer.tokenize(text)
    words = []
    for idx, t in enumerate(tokens):
        if t.startswith("▁"):
            words.append([t[1:]])
        else:
            if t in punctuation:
                if idx == len(tokens) - 1:
                    words.append([f"{t}"])
                else:
                    if (
                        not tokens[idx + 1].startswith("▁")
                        and tokens[idx + 1] not in punctuation
                    ):
                        if idx == 0:
                            words.append([])
                        words[-1].append(f"{t}")
                    else:
                        words.append([f"{t}"])
            else:
                if idx == 0:
                    words.append([])
                words[-1].append(f"{t}")
    return words


def g2p(text, tokenizer, **kwargs):
    phones = []
    tones = []
    phone_len = []
    # words = sep_text(text)
    # tokens = [tokenizer.tokenize(i) for i in words]
    words = text_to_words(text, tokenizer)

    for word in words:
        temp_phones, temp_tones = [], []
        if len(word) > 1:
            if "'" in word:
                word = ["".join(word)]
        for w in word:
            if w in punctuation:
                temp_phones.append(w)
                temp_tones.append(0)
                continue
            if w.upper() in eng_dict:
                phns, tns = refine_syllables(eng_dict[w.upper()])
                temp_phones += [post_replace_ph(i) for i in phns]
                temp_tones += tns
                # w2ph.append(len(phns))
            else:
                phone_list = list(filter(lambda p: p != " ", _g2p(w)))
                phns = []
                tns = []
                for ph in phone_list:
                    if ph in arpa:
                        ph, tn = refine_ph(ph)
                        phns.append(ph)
                        tns.append(tn)
                    else:
                        phns.append(ph)
                        tns.append(0)
                temp_phones += [post_replace_ph(i) for i in phns]
                temp_tones += tns
        phones += temp_phones
        tones += temp_tones
        phone_len.append(len(temp_phones))
    # phones = [post_replace_ph(i) for i in phones]

    word2ph = []
    for token, pl in zip(words, phone_len):
        word_len = len(token)

        aaa = distribute_phone(pl, word_len)
        word2ph += aaa

    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph


def get_bert_feature(text, word2ph):
    from bert_vits2.text import english_bert_mock

    return english_bert_mock.get_bert_feature(text, word2ph)


if __name__ == "__main__":
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    # NOTE: g2p() above also expects a tokenizer argument; this demo call
    # would need one to actually run.
    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
    # all_phones = set()
    # for k, syllables in eng_dict.items():
    #     for group in syllables:
    #         for ph in group:
    #             all_phones.add(ph)
    # print(all_phones)
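# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# The invariants asserted at the end of g2p() above, on a hand-built example:
# with "_" padding on both ends, word2ph must sum to len(phones) and the tone
# list must be phone-aligned.
def _g2p_shape_demo():
    phones = ["_", "HH", "AH", "L", "OW", "_"]  # "_" | "hello" | "_"
    tones = [0, 0, 1, 0, 0, 0]
    word2ph = [1, 4, 1]
    assert len(phones) == len(tones)
    assert len(phones) == sum(word2ph)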
bert_vits2/text/japanese.py
ADDED
@@ -0,0 +1,428 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata

from bert_vits2.text import punctuation, symbols

from num2words import num2words

import pyopenjtalk
import jaconv


def kata2phoneme(text: str) -> str:
    """Convert katakana text to phonemes."""
    text = text.strip()
    if text == "ー":
        return ["ー"]
    elif text.startswith("ー"):
        return ["ー"] + kata2phoneme(text[1:])
    res = []
    prev = None
    while text:
        if re.match(_MARKS, text):
            res.append(text)
            text = text[1:]
            continue
        if text.startswith("ー"):
            if prev:
                res.append(prev[-1])
            text = text[1:]
            continue
        res += pyopenjtalk.g2p(text).lower().replace("cl", "q").split(" ")
        break
    # res = _COLON_RX.sub(":", res)
    return res


def hira2kata(text: str) -> str:
    return jaconv.hira2kata(text)


_SYMBOL_TOKENS = set(list("・、。?!"))
_NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
_MARKS = re.compile(
    r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
)


def text2kata(text: str) -> str:
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    for parts in parsed:
        word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
            "’", ""
        )
        if yomi:
            if re.match(_MARKS, yomi):
                if len(word) > 1:
                    word = [replace_punctuation(i) for i in list(word)]
                    yomi = word
                    res += yomi
                    continue
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
    return hira2kata("".join(res))


def text2sep_kata(text: str) -> (list, list):
    parsed = pyopenjtalk.run_frontend(text)

    res = []
    sep = []
    for parts in parsed:
        word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
            "’", ""
        )
        if yomi:
            if re.match(_MARKS, yomi):
                if len(word) > 1:
                    word = [replace_punctuation(i) for i in list(word)]
                    yomi = word
                    res += yomi
                    sep += word
                    continue
                elif word not in rep_map.keys() and word not in rep_map.values():
                    word = ","
                yomi = word
            res.append(yomi)
        else:
            if word in _SYMBOL_TOKENS:
                res.append(word)
            elif word in ("っ", "ッ"):
                res.append("ッ")
            elif word in _NO_YOMI_TOKENS:
                pass
            else:
                res.append(word)
        sep.append(word)
    return sep, [hira2kata(i) for i in res], get_accent(parsed)


def get_accent(parsed):
    labels = pyopenjtalk.make_label(parsed)

    phonemes = []
    accents = []
    for n, label in enumerate(labels):
        phoneme = re.search(r"\-([^\+]*)\+", label).group(1)
        if phoneme not in ["sil", "pau"]:
            phonemes.append(phoneme.replace("cl", "q").lower())
        else:
            continue
        a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
        a2 = int(re.search(r"\+(\d+)\+", label).group(1))
        if re.search(r"\-([^\+]*)\+", labels[n + 1]).group(1) in ["sil", "pau"]:
            a2_next = -1
        else:
            a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
        # Falling
        if a1 == 0 and a2_next == a2 + 1:
            accents.append(-1)
        # Rising
        elif a2 == 1 and a2_next == 2:
            accents.append(1)
        else:
            accents.append(0)
    return list(zip(phonemes, accents))


_ALPHASYMBOL_YOMI = {
    "#": "シャープ",
    "%": "パーセント",
    "&": "アンド",
    "+": "プラス",
    "-": "マイナス",
    ":": "コロン",
    ";": "セミコロン",
    "<": "小なり",
    "=": "イコール",
    ">": "大なり",
    "@": "アット",
    "a": "エー",
    "b": "ビー",
    "c": "シー",
    "d": "ディー",
    "e": "イー",
    "f": "エフ",
    "g": "ジー",
    "h": "エイチ",
    "i": "アイ",
    "j": "ジェー",
    "k": "ケー",
    "l": "エル",
    "m": "エム",
    "n": "エヌ",
    "o": "オー",
    "p": "ピー",
    "q": "キュー",
    "r": "アール",
    "s": "エス",
    "t": "ティー",
    "u": "ユー",
    "v": "ブイ",
    "w": "ダブリュー",
    "x": "エックス",
    "y": "ワイ",
    "z": "ゼット",
    "α": "アルファ",
    "β": "ベータ",
    "γ": "ガンマ",
    "δ": "デルタ",
    "ε": "イプシロン",
    "ζ": "ゼータ",
    "η": "イータ",
    "θ": "シータ",
    "ι": "イオタ",
    "κ": "カッパ",
    "λ": "ラムダ",
    "μ": "ミュー",
    "ν": "ニュー",
    "ξ": "クサイ",
    "ο": "オミクロン",
    "π": "パイ",
    "ρ": "ロー",
    "σ": "シグマ",
    "τ": "タウ",
    "υ": "ウプシロン",
    "φ": "ファイ",
    "χ": "カイ",
    "ψ": "プサイ",
    "ω": "オメガ",
}

_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")


def japanese_convert_numbers_to_words(text: str) -> str:
    res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
    res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
    res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
    return res


def japanese_convert_alpha_symbols_to_words(text: str) -> str:
    return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])


def japanese_text_to_phonemes(text: str) -> str:
    """Convert Japanese text to phonemes."""
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    # res = japanese_convert_alpha_symbols_to_words(res)
    res = text2kata(res)
    res = kata2phoneme(res)
    return res


def is_japanese_character(char):
    # Unicode ranges of the Japanese writing systems
    japanese_ranges = [
        (0x3040, 0x309F),  # hiragana
        (0x30A0, 0x30FF),  # katakana
        (0x4E00, 0x9FFF),  # kanji (CJK Unified Ideographs)
        (0x3400, 0x4DBF),  # CJK Extension A
        (0x20000, 0x2A6DF),  # CJK Extension B
        # further CJK extension ranges can be added as needed
    ]

    # Convert the character's Unicode code point to an integer
    char_code = ord(char)

    # Check whether the character falls within any of the Japanese ranges
    for start, end in japanese_ranges:
        if start <= char_code <= end:
            return True

    return False


rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    ".": ".",
    "…": "...",
    "···": "...",
    "・・・": "...",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "−": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}


def replace_punctuation(text):
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        + "".join(punctuation)
        + r"]+",
        "",
        replaced_text,
    )

    return replaced_text


def text_normalize(text):
    res = unicodedata.normalize("NFKC", text)
    res = japanese_convert_numbers_to_words(res)
    # res = "".join([i for i in res if is_japanese_character(i)])
    res = replace_punctuation(res)
    res = res.replace("゙", "")
    return res


def distribute_phone(n_phone, n_word):
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word


def handle_long(sep_phonemes):
    for i in range(len(sep_phonemes)):
        if sep_phonemes[i][0] == "ー":
            sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
        if "ー" in sep_phonemes[i]:
            for j in range(len(sep_phonemes[i])):
                if sep_phonemes[i][j] == "ー":
                    sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
    return sep_phonemes


def align_tones(phones, tones):
    res = []
    for pho in phones:
        temp = [0] * len(pho)
        for idx, p in enumerate(pho):
            if len(tones) == 0:
                break
            if p == tones[0][0]:
                temp[idx] = tones[0][1]
                if idx > 0:
                    temp[idx] += temp[idx - 1]
                tones.pop(0)
        temp = [0] + temp
        temp = temp[:-1]
        if -1 in temp:
            temp = [i + 1 for i in temp]
        res.append(temp)
    res = [i for j in res for i in j]
    assert not any([i < 0 for i in res]) and not any([i > 1 for i in res])
    return res


def rearrange_tones(tones, phones):
    res = [0] * len(tones)
    for i in range(len(tones)):
        if i == 0:
            if tones[i] not in punctuation:
                res[i] = 1
        elif tones[i] == prev:
            if phones[i] in punctuation:
                res[i] = 0
            else:
                res[i] = 1
        elif tones[i] > prev:
            res[i] = 2
        elif tones[i] < prev:
            res[i - 1] = 3
            res[i] = 1
        prev = tones[i]
    return res


def g2p(text, tokenizer, **kwargs):
    sep_text, sep_kata, acc = text2sep_kata(text)
    sep_tokenized = []
    for i in sep_text:
        if i not in punctuation:
            sep_tokenized.append(tokenizer.tokenize(i))
        else:
            sep_tokenized.append([i])

    sep_phonemes = handle_long([kata2phoneme(i) for i in sep_kata])
    # Error handling: words MeCab does not recognize propagate all the way down
    # to here and blow up. So far only extremely rare, obscure words trigger this.
    for i in sep_phonemes:
        for j in i:
            assert j in symbols, (sep_text, sep_kata, sep_phonemes)
    tones = align_tones(sep_phonemes, acc)

    word2ph = []
    for token, phoneme in zip(sep_tokenized, sep_phonemes):
        phone_len = len(phoneme)
        word_len = len(token)

        aaa = distribute_phone(phone_len, word_len)
        word2ph += aaa
    phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
    # tones = [0] + rearrange_tones(tones, phones[1:-1]) + [0]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]
    assert len(phones) == len(tones)
    return phones, tones, word2ph


if __name__ == "__main__":
    from manager import model_handler

    tokenizer, _ = model_handler.get_bert_model("DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
    text = "hello,こんにちは、世界ー!……"
    from bert_vits2.text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)

    phones, tones, word2ph = g2p(text, tokenizer)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)
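# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# The accent rules inside get_accent() above, applied to hand-written
# (a1, a2, a2_next) feature triples instead of real pyopenjtalk labels.
def _classify_accent_demo(a1, a2, a2_next):
    if a1 == 0 and a2_next == a2 + 1:
        return -1  # falling
    elif a2 == 1 and a2_next == 2:
        return 1  # rising
    return 0  # flat

assert _classify_accent_demo(0, 3, 4) == -1
assert _classify_accent_demo(2, 1, 2) == 1
assert _classify_accent_demo(2, 3, -1) == 0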
bert_vits2/text/japanese_bert.py
ADDED
@@ -0,0 +1,43 @@
import torch

from contants import config
from bert_vits2.text.japanese import text2sep_kata

LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    text = "".join(text2sep_kata(text)[0])
    if style_text:
        style_text = "".join(text2sep_kata(style_text)[0])
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
        if style_text:
            style_inputs = tokenizer(style_text, return_tensors="pt")
            for i in style_inputs:
                style_inputs[i] = style_inputs[i].to(device)
            style_res = model(**style_inputs, output_hidden_states=True)
            style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
            style_res_mean = style_res.mean(0)

    assert len(word2ph) == len(text) + 2
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        if style_text:
            repeat_feature = (
                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
                + style_res_mean.repeat(word2phone[i], 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T
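# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# The style mixing in get_bert_feature() above is a per-phone linear
# interpolation between the text feature and the mean style feature.
# A torch-free sketch with made-up 2-dim rows:
def _style_mix_demo(feature_row, style_mean, style_weight=0.75):
    return [f * (1 - style_weight) + s * style_weight
            for f, s in zip(feature_row, style_mean)]

assert _style_mix_demo([1.0, 0.0], [0.0, 1.0]) == [0.25, 0.75]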
bert_vits2/text/japanese_bert_extra.py
ADDED
@@ -0,0 +1,42 @@
import torch

from contants import config
from bert_vits2.text.japanese import text2sep_kata


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, style_text=None, style_weight=0.7,
                     **kwargs):
    text = "".join(text2sep_kata(text)[0])
    if style_text:
        style_text = "".join(text2sep_kata(style_text)[0])

    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
        if style_text:
            style_inputs = tokenizer(style_text, return_tensors="pt")
            for i in style_inputs:
                style_inputs[i] = style_inputs[i].to(device)
            style_res = model(**style_inputs, output_hidden_states=True)
            style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
            style_res_mean = style_res.mean(0)

    assert len(word2ph) == len(text) + 2
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        if style_text:
            repeat_feature = (
                res[i].repeat(word2phone[i], 1) * (1 - style_weight)
                + style_res_mean.repeat(word2phone[i], 1) * style_weight
            )
        else:
            repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T
bert_vits2/text/japanese_bert_v111.py
ADDED
@@ -0,0 +1,22 @@
import torch

from contants import config


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, **kwargs):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt")
        for i in inputs:
            inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
    assert inputs["input_ids"].shape[-1] == len(word2ph)
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T
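# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# The core expansion in get_bert_feature() above: each token-level feature row
# is repeated word2ph[i] times so the features become phone-aligned. Torch-free:
def _repeat_expand_demo():
    token_feats = [[0.1], [0.2], [0.3]]
    word2ph = [1, 2, 1]
    phone_feats = [row for row, n in zip(token_feats, word2ph) for _ in range(n)]
    assert phone_feats == [[0.1], [0.2], [0.2], [0.3]]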
bert_vits2/text/japanese_bert_v200.py
ADDED
@@ -0,0 +1,39 @@
import torch

from contants import config
from bert_vits2.text.japanese_v200 import text2sep_kata


def get_bert_feature(text, word2ph, tokenizer, model, device=config.system.device, **kwargs):
    sep_text, _, _ = text2sep_kata(text)
    sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
    sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
    sep_ids = [2] + [item for sublist in sep_ids for item in sublist] + [3]
    return get_bert_feature_with_token(sep_ids, word2ph, tokenizer, model, device)


def get_bert_feature_with_token(tokens, word2ph, tokenizer, model, device=config.system.device):
    with torch.no_grad():
        inputs = torch.tensor(tokens).to(device).unsqueeze(0)
        token_type_ids = torch.zeros_like(inputs).to(device)
        attention_mask = torch.ones_like(inputs).to(device)
        inputs = {
            "input_ids": inputs,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask,
        }

        # for i in inputs:
        #     inputs[i] = inputs[i].to(device)
        res = model(**inputs, output_hidden_states=True)
        res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
    assert inputs["input_ids"].shape[-1] == len(word2ph)
    word2phone = word2ph
    phone_level_feature = []
    for i in range(len(word2phone)):
        repeat_feature = res[i].repeat(word2phone[i], 1)
        phone_level_feature.append(repeat_feature)

    phone_level_feature = torch.cat(phone_level_feature, dim=0)

    return phone_level_feature.T
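# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# get_bert_feature() above flattens per-word token ids and wraps them with the
# ids 2 and 3 (evidently this tokenizer's CLS/SEP ids; that mapping is an
# assumption here). Pure-Python sketch of the assembly:
def _wrap_ids_demo():
    sep_ids = [[10, 11], [12], [13, 14, 15]]  # made-up per-word token ids
    flat = [2] + [tid for word in sep_ids for tid in word] + [3]
    assert flat == [2, 10, 11, 12, 13, 14, 15, 3]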
bert_vits2/text/japanese_extra.py
ADDED
@@ -0,0 +1,524 @@
# Convert Japanese text to phonemes which is
# compatible with Julius https://github.com/julius-speech/segmentation-kit
import re
import unicodedata

import pyopenjtalk
from num2words import num2words

from bert_vits2.text import punctuation
from bert_vits2.text.japanese_mora_list import (
    mora_kata_to_mora_phonemes,
)

# Set of consonants
COSONANTS = set(
    [
        cosonant
        for cosonant, _ in mora_kata_to_mora_phonemes.values()
        if cosonant is not None
    ]
)

# Set of vowels
VOWELS = {"a", "i", "u", "e", "o"}

# Mapping used by normalization to convert symbols
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    ".": ".",
    "…": "...",
    "···": "...",
    "・・・": "...",
    "·": ",",
    "・": ",",
    "、": ",",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "−": "-",
    # "~": "-",  # now treated as the long-vowel mark "ー" instead
    # "~": "-",  # now treated as the long-vowel mark "ー" instead
    "「": "'",
    "」": "'",
}


def text_normalize(text):
    """
    Normalize Japanese text.
    The result consists of exactly the following characters:
    - hiragana
    - katakana (including the full-width long-vowel mark "ー"!)
    - kanji
    - half-width alphabet (upper and lower case)
    - Greek letters
    - `.` (from the period `。`, parts of `…`, newlines, etc.)
    - `,` (from the comma `、`, `:`, etc.)
    - `?` (from the question mark `?`)
    - `!` (from the exclamation mark `!`)
    - `'` (from `「`, `」`, etc.)
    - `-` (from `―` (a dash, not the long-vowel mark), `-`, etc.)

    Notes:
    - The ellipsis `…` becomes `...` (`なるほど…。` → `なるほど....`)
    - Numbers are converted to kanji (`1,100円` → `千百円`, `52.34` → `五十二点三四`)
    - The position and count of commas, question marks, etc. are preserved (`??あ、、!!!` → `??あ,,!!!`)
    """
    # print(f"Before normalization: {text}")
    # This makes the alphabet half-width and turns the ellipsis into `...`
    res = unicodedata.normalize("NFKC", text)

    res = japanese_convert_numbers_to_words(res)  # "100円" → "百円" etc.

    # Treat "~" and "~" as the long-vowel mark, too
    res = res.replace("~", "ー")
    res = res.replace("~", "ー")

    res = replace_punctuation(res)  # normalize punctuation, delete unreadable characters

    # Remove combining dakuten/handakuten marks
    # Normal characters like "ば" are kept; "あ゛" became "あ゙" above and becomes "あ" here
    res = res.replace("\u3099", "")  # remove combining dakuten, る゙ → る
    res = res.replace("\u309A", "")  # remove combining handakuten, な゚ → な
    return res


def replace_punctuation(text: str) -> str:
    """Normalize punctuation to ".", ",", "!", "?", "'", "-" and keep only characters
    OpenJTalk can read: kanji, hiragana, katakana, the alphabet, and Greek letters.
    """
    pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))

    # Replace punctuation using the mapping
    replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)

    replaced_text = re.sub(
        # ↓ hiragana, katakana, kanji
        r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
        # ↓ half-width alphabet (upper and lower case)
        + r"\u0041-\u005A\u0061-\u007A"
        # ↓ full-width alphabet (upper and lower case)
        + r"\uFF21-\uFF3A\uFF41-\uFF5A"
        # ↓ Greek letters
        + r"\u0370-\u03FF\u1F00-\u1FFF"
        # ↓ "!", "?", "…", ",", ".", "'", "-"; note "…" has already become "..."
        + "".join(punctuation) + r"]+",
        # Delete everything not listed above
        "",
        replaced_text,
    )

    return replaced_text


_NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
_CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
_CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
_NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")


def japanese_convert_numbers_to_words(text: str) -> str:
    res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
    res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
    res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
    return res


def g2p(norm_text: str, tokenizer, **kwargs) -> tuple[list[str], list[int], list[int]]:
    """
    The main function used elsewhere. Takes `norm_text` normalized by
    `text_normalize()` and returns a tuple of:
    - phones: the list of phonemes (may still contain punctuation such as `!`, `,`, `.`)
    - tones: the list of accents, 0 (low) and 1 (high), same length as phones
    - word2ph: how many phonemes are assigned to each character of the original text
    `phones` and `tones` start and end with `_`, and `word2ph` gets a 1 prepended
    and appended accordingly.
    """
    # Extracting accents from pyopenjtalk's full-context labels loses the positions
    # of punctuation: "こんにちは、世界。", "こんにちは!世界。" and "こんにちは!!!???世界……。"
    # would all come out identical. So first build the phoneme/accent list without
    # punctuation, and separately use the phoneme list from pyopenjtalk.run_frontend()
    # (which preserves punctuation) to re-assign accents onto a phoneme list that
    # keeps the punctuation.

    # List of (phoneme, accent) tuples with all punctuation removed
    phone_tone_list_wo_punct = g2phone_tone_wo_punct(norm_text)

    # sep_text: list of words
    # sep_kata: list of katakana readings, one per word
    sep_text, sep_kata = text2sep_kata(norm_text)

    # sep_phonemes: list of phoneme lists, one per word
    sep_phonemes = handle_long([kata2phoneme_list(i) for i in sep_kata])

    # phone_w_punct: sep_phonemes joined, keeping punctuation as-is
    phone_w_punct: list[str] = []
    for i in sep_phonemes:
        phone_w_punct += i

    # Use the accent info without punctuation to build accent info that includes it
    phone_tone_list = align_tones(phone_w_punct, phone_tone_list_wo_punct)
    # word2ph has no exact answer (jukujikun readings such as "今日" or "眼鏡" exist),
    # so Bert-VITS2 uses the word split to spread phonemes roughly evenly over the
    # characters of each word

    # Split each word in sep_text character by character into lists of characters
    sep_tokenized: list[list[str]] = []
    for i in sep_text:
        if i not in punctuation:
            sep_tokenized.append(tokenizer.tokenize(i))  # presumably splits `i` into characters
        else:
            sep_tokenized.append([i])

    # For each word, compare phoneme and character counts and distribute roughly evenly
    word2ph = []
    for token, phoneme in zip(sep_tokenized, sep_phonemes):
        phone_len = len(phoneme)
        word_len = len(token)
        word2ph += distribute_phone(phone_len, word_len)

    # Add the `_` symbol at both ends with accent 0 (low); extend word2ph to match
    phone_tone_list = [("_", 0)] + phone_tone_list + [("_", 0)]
    word2ph = [1] + word2ph + [1]

    phones = [phone for phone, _ in phone_tone_list]
    tones = [tone for _, tone in phone_tone_list]

    assert len(phones) == sum(word2ph), f"{len(phones)} != {sum(word2ph)}"

    return phones, tones, word2ph
+
|
209 |
+
|
210 |
+
def g2phone_tone_wo_punct(text: str) -> list[tuple[str, int]]:
|
211 |
+
"""
|
212 |
+
テキストに対して、音素とアクセント(0か1)のペアのリストを返す。
|
213 |
+
ただし「!」「.」「?」等の非音素記号(punctuation)は全て消える(ポーズ記号も残さない)。
|
214 |
+
非音素記号を含める処理は`align_tones()`で行われる。
|
215 |
+
また「っ」は「cl」でなく「q」に変換される(「ん」は「N」のまま)。
|
216 |
+
例: "こんにちは、世界ー。。元気?!" →
|
217 |
+
[('k', 0), ('o', 0), ('N', 1), ('n', 1), ('i', 1), ('ch', 1), ('i', 1), ('w', 1), ('a', 1), ('s', 1), ('e', 1), ('k', 0), ('a', 0), ('i', 0), ('i', 0), ('g', 1), ('e', 1), ('N', 0), ('k', 0), ('i', 0)]
|
218 |
+
"""
|
219 |
+
prosodies = pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True)
|
220 |
+
result: list[tuple[str, int]] = []
|
221 |
+
current_phrase: list[tuple[str, int]] = []
|
222 |
+
current_tone = 0
|
223 |
+
for i, letter in enumerate(prosodies):
|
224 |
+
# 特殊記号の処理
|
225 |
+
|
226 |
+
# 文頭記号、無視する
|
227 |
+
if letter == "^":
|
228 |
+
assert i == 0, "Unexpected ^"
|
229 |
+
# アクセント句の終わりに来る記号
|
230 |
+
elif letter in ("$", "?", "_", "#"):
|
231 |
+
# 保持しているフレーズを、アクセント数値を0-1に修正し結果に追加
|
232 |
+
result.extend(fix_phone_tone(current_phrase))
|
233 |
+
# 末尾に来る終了記号、無視(文中の疑問文は`_`になる)
|
234 |
+
if letter in ("$", "?"):
|
235 |
+
assert i == len(prosodies) - 1, f"Unexpected {letter}"
|
236 |
+
# あとは"_"���ポーズ)と"#"(アクセント句の境界)のみ
|
237 |
+
# これらは残さず、次のアクセント句に備える。
|
238 |
+
current_phrase = []
|
239 |
+
# 0を基準点にしてそこから上昇・下降する(負の場合は上の`fix_phone_tone`で直る)
|
240 |
+
current_tone = 0
|
241 |
+
# アクセント上昇記号
|
242 |
+
elif letter == "[":
|
243 |
+
current_tone = current_tone + 1
|
244 |
+
# アクセント下降記号
|
245 |
+
elif letter == "]":
|
246 |
+
current_tone = current_tone - 1
|
247 |
+
# それ以外は通常の音素
|
248 |
+
else:
|
249 |
+
if letter == "cl": # 「っ」の処理
|
250 |
+
letter = "q"
|
251 |
+
current_phrase.append((letter, current_tone))
|
252 |
+
return result
|
253 |
+
|
254 |
+
|
255 |
+
def text2sep_kata(norm_text: str) -> tuple[list[str], list[str]]:
|
256 |
+
"""
|
257 |
+
`text_normalize`で正規化済みの`norm_text`を受け取り、それを単語分割し、
|
258 |
+
分割された単語リストとその読み(カタカナor記号1文字)のリストのタプルを返す。
|
259 |
+
単語分割結果は、`g2p()`の`word2ph`で1文字あたりに割り振る音素記号の数を決めるために使う。
|
260 |
+
例:
|
261 |
+
`私はそう思う!って感じ?` →
|
262 |
+
["私", "は", "そう", "思う", "!", "って", "感じ", "?"], ["ワタシ", "ワ", "ソー", "オモウ", "!", "ッテ", "カンジ", "?"]
|
263 |
+
"""
|
264 |
+
# parsed: OpenJTalkの解析結果
|
265 |
+
parsed = pyopenjtalk.run_frontend(norm_text)
|
266 |
+
sep_text: list[str] = []
|
267 |
+
sep_kata: list[str] = []
|
268 |
+
for parts in parsed:
|
269 |
+
# word: 実際の単語の文字列
|
270 |
+
# yomi: その読み、但し無声化サインの`’`は除去
|
271 |
+
word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
|
272 |
+
"’", ""
|
273 |
+
)
|
274 |
+
"""
|
275 |
+
ここで`yomi`の取りうる値は以下の通りのはず。
|
276 |
+
- `word`が通常単語 → 通常の読み(カタカナ)
|
277 |
+
(カタカナからなり、長音記号も含みうる、`アー` 等)
|
278 |
+
- `word`が`ー` から始まる → `ーラー` や `ーーー` など
|
279 |
+
- `word`が句読点や空白等 → `、`
|
280 |
+
- `word`が`?` → `?`(全角になる)
|
281 |
+
他にも`word`が読めないキリル文字アラビア文字等が来ると`、`になるが、正規化でこの場合は起きないはず。
|
282 |
+
また元のコードでは`yomi`が空白の場合の処理があったが、これは起きないはず。
|
283 |
+
処理すべきは`yomi`が`、`の場合のみのはず。
|
284 |
+
"""
|
285 |
+
assert yomi != "", f"Empty yomi: {word}"
|
286 |
+
if yomi == "、":
|
287 |
+
# wordは正規化されているので、`.`, `,`, `!`, `'`, `-`のいずれか
|
288 |
+
if word not in (
|
289 |
+
".",
|
290 |
+
",",
|
291 |
+
"!",
|
292 |
+
"'",
|
293 |
+
"-",
|
294 |
+
):
|
295 |
+
# ここはpyopenjtalkが読めない文字等のときに起こる
|
296 |
+
raise ValueError(f"Cannot read: {word} in:\n{norm_text}")
|
297 |
+
# yomiは元の記号のままに変更
|
298 |
+
yomi = word
|
299 |
+
elif yomi == "?":
|
300 |
+
assert word == "?", f"yomi `?` comes from: {word}"
|
301 |
+
yomi = "?"
|
302 |
+
sep_text.append(word)
|
303 |
+
sep_kata.append(yomi)
|
304 |
+
return sep_text, sep_kata
|
305 |
+
|
306 |
+
|
307 |
+
# ESPnetの実装から引用、変更点無し
|
308 |
+
# https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
|
309 |
+
def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> list[str]:
|
310 |
+
"""Extract phoneme + prosoody symbol sequence from input full-context labels.
|
311 |
+
|
312 |
+
The algorithm is based on `Prosodic features control by symbols as input of
|
313 |
+
sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
|
314 |
+
|
315 |
+
Args:
|
316 |
+
text (str): Input text.
|
317 |
+
drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
|
318 |
+
|
319 |
+
Returns:
|
320 |
+
List[str]: List of phoneme + prosody symbols.
|
321 |
+
|
322 |
+
Examples:
|
323 |
+
#>>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
|
324 |
+
#>>> pyopenjtalk_g2p_prosody("こんにちは。")
|
325 |
+
['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
|
326 |
+
|
327 |
+
.. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
|
328 |
+
modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
|
329 |
+
|
330 |
+
"""
|
331 |
+
labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
|
332 |
+
N = len(labels)
|
333 |
+
|
334 |
+
phones = []
|
335 |
+
for n in range(N):
|
336 |
+
lab_curr = labels[n]
|
337 |
+
|
338 |
+
# current phoneme
|
339 |
+
p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
|
340 |
+
# deal unvoiced vowels as normal vowels
|
341 |
+
if drop_unvoiced_vowels and p3 in "AEIOU":
|
342 |
+
p3 = p3.lower()
|
343 |
+
|
344 |
+
# deal with sil at the beginning and the end of text
|
345 |
+
if p3 == "sil":
|
346 |
+
assert n == 0 or n == N - 1
|
347 |
+
if n == 0:
|
348 |
+
phones.append("^")
|
349 |
+
elif n == N - 1:
|
350 |
+
# check question form or not
|
351 |
+
e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
|
352 |
+
if e3 == 0:
|
353 |
+
phones.append("$")
|
354 |
+
elif e3 == 1:
|
355 |
+
phones.append("?")
|
356 |
+
continue
|
357 |
+
elif p3 == "pau":
|
358 |
+
phones.append("_")
|
359 |
+
continue
|
360 |
+
else:
|
361 |
+
phones.append(p3)
|
362 |
+
|
363 |
+
# accent type and position info (forward or backward)
|
364 |
+
a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
|
365 |
+
a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
|
366 |
+
a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
|
367 |
+
|
368 |
+
# number of mora in accent phrase
|
369 |
+
f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
|
370 |
+
|
371 |
+
a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
|
372 |
+
# accent phrase border
|
373 |
+
if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
|
374 |
+
phones.append("#")
|
375 |
+
# pitch falling
|
376 |
+
elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
|
377 |
+
phones.append("]")
|
378 |
+
# pitch rising
|
379 |
+
elif a2 == 1 and a2_next == 2:
|
380 |
+
phones.append("[")
|
381 |
+
|
382 |
+
return phones
|
383 |
+
|
384 |
+
|
385 |
+
def _numeric_feature_by_regex(regex, s):
|
386 |
+
match = re.search(regex, s)
|
387 |
+
if match is None:
|
388 |
+
return -50
|
389 |
+
return int(match.group(1))
|
390 |
+
|
391 |
+
|
392 |
+
def fix_phone_tone(phone_tone_list: list[tuple[str, int]]) -> list[tuple[str, int]]:
|
393 |
+
"""
|
394 |
+
`phone_tone_list`のtone(アクセントの値)を0か1の範囲に修正する。
|
395 |
+
例: [(a, 0), (i, -1), (u, -1)] → [(a, 1), (i, 0), (u, 0)]
|
396 |
+
"""
|
397 |
+
tone_values = set(tone for _, tone in phone_tone_list)
|
398 |
+
if len(tone_values) == 1:
|
399 |
+
assert tone_values == {0}, tone_values
|
400 |
+
return phone_tone_list
|
401 |
+
elif len(tone_values) == 2:
|
402 |
+
if tone_values == {0, 1}:
|
403 |
+
return phone_tone_list
|
404 |
+
elif tone_values == {-1, 0}:
|
405 |
+
return [
|
406 |
+
(letter, 0 if tone == -1 else 1) for letter, tone in phone_tone_list
|
407 |
+
]
|
408 |
+
else:
|
409 |
+
raise ValueError(f"Unexpected tone values: {tone_values}")
|
410 |
+
else:
|
411 |
+
raise ValueError(f"Unexpected tone values: {tone_values}")
|
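# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# fix_phone_tone() above rescales tones into {0, 1}; a {-1, 0} phrase is
# shifted up by one, matching the docstring example:
assert fix_phone_tone([("a", 0), ("i", -1), ("u", -1)]) == [
    ("a", 1), ("i", 0), ("u", 0)
]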


def distribute_phone(n_phone: int, n_word: int) -> list[int]:
    """
    Distribute `n_phone` phonemes over `n_word` words: assign one at a time
    from left to right, then go back to the left and add one more, and so on.
    """
    phones_per_word = [0] * n_word
    for _ in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word


def handle_long(sep_phonemes: list[list[str]]) -> list[list[str]]:
    for i in range(len(sep_phonemes)):
        if sep_phonemes[i][0] == "ー":
            sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
        if "ー" in sep_phonemes[i]:
            for j in range(len(sep_phonemes[i])):
                if sep_phonemes[i][j] == "ー":
                    sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
    return sep_phonemes


def align_tones(
    phones_with_punct: list[str], phone_tone_list: list[tuple[str, int]]
) -> list[tuple[str, int]]:
    """
    Example:
    …私は、、そう思う。
    phones_with_punct:
    [".", ".", ".", "w", "a", "t", "a", "sh", "i", "w", "a", ",", ",", "s", "o", "o", "o", "m", "o", "u", "."]
    phone_tone_list:
    [("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0))]
    Return:
    [(".", 0), (".", 0), (".", 0), ("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), (",", 0), (",", 0), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0), (".", 0)]
    """
    result: list[tuple[str, int]] = []
    tone_index = 0
    for phone in phones_with_punct:
        if tone_index >= len(phone_tone_list):
            # Leftover punctuation → append (punctuation, 0)
            result.append((phone, 0))
        elif phone == phone_tone_list[tone_index][0]:
            # Matches the current phoneme in phone_tone_list → take its tone, append (phone, tone)
            result.append((phone, phone_tone_list[tone_index][1]))
            # Advance the search index by one
            tone_index += 1
        elif phone in punctuation:
            # phone is punctuation → append (phone, 0)
            result.append((phone, 0))
        else:
            print(f"phones: {phones_with_punct}")
            print(f"phone_tone_list: {phone_tone_list}")
            print(f"result: {result}")
            print(f"tone_index: {tone_index}")
            print(f"phone: {phone}")
            raise ValueError(f"Unexpected phone: {phone}")
    return result


def kata2phoneme_list(text: str) -> list[str]:
    """
    Take `text`, in principle katakana, and convert it as-is into a list of
    phoneme symbols.
    Notes:
    - If punctuation arrives (a single punctuation character is possible),
      return it unprocessed as a one-element list
    - Leading "ー" characters are kept as "ー" (handled later by `handle_long()`)
    - A "ー" inside the text becomes the last phoneme of the preceding symbol.
    Examples:
    `ーーソーナノカーー` → ["ー", "ー", "s", "o", "o", "n", "a", "n", "o", "k", "a", "a", "a"]
    `?` → ["?"]
    """
    if text in punctuation:
        return [text]
    # Check that `text` consists only of katakana (including "ー")
    if re.fullmatch(r"[\u30A0-\u30FF]+", text) is None:
        raise ValueError(f"Input must be katakana only: {text}")
    sorted_keys = sorted(mora_kata_to_mora_phonemes.keys(), key=len, reverse=True)
    pattern = "|".join(map(re.escape, sorted_keys))

    def mora2phonemes(mora: str) -> str:
        cosonant, vowel = mora_kata_to_mora_phonemes[mora]
        if cosonant is None:
            return f" {vowel}"
        return f" {cosonant} {vowel}"

    spaced_phonemes = re.sub(pattern, lambda m: mora2phonemes(m.group()), text)

    # Handle the long-vowel mark "ー"
    long_pattern = r"(\w)(ー*)"
    long_replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2))
    spaced_phonemes = re.sub(long_pattern, long_replacement, spaced_phonemes)
    return spaced_phonemes.strip().split(" ")
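# --- Illustrative sketch (editor's addition, not part of the original diff) ---
# The long-vowel rule inside kata2phoneme_list() above: each "ー" repeats the
# phoneme it follows. The same regex, applied to a hand-built string:
def _long_vowel_demo():
    demo = re.sub(
        r"(\w)(ー*)",
        lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2)),
        "k aーー",
    )
    assert demo == "k a a a"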


if __name__ == "__main__":
    from manager import model_handler

    tokenizer, _ = model_handler.get_bert_model("DEBERTA_V2_LARGE_JAPANESE_CHAR_WWM")
    text = "hello,こんにちは、世界ー~!……"

    from bert_vits2.text.japanese_bert import get_bert_feature

    text = text_normalize(text)
    print(text)

    phones, tones, word2ph = g2p(text, tokenizer)  # g2p() above requires the tokenizer
    print(phones, tones, word2ph)
    bert = get_bert_feature(text, word2ph)

    print(phones, tones, word2ph, bert.shape)