Spaces:
Runtime error
Runtime error
Upload 44 files
Browse files- Dockerfile +19 -4
- LICENSE +1 -1
- LICENSE-MoeGoe +21 -0
- app.py +336 -71
- docker-compose.yaml +3 -2
- models.py +1 -1
- optimizer_removal.py +16 -0
- request.py +265 -0
- test.py +11 -0
- text/__pycache__/__init__.cpython-310.pyc +0 -0
- text/__pycache__/cantonese.cpython-310.pyc +0 -0
- text/__pycache__/cleaners.cpython-310.pyc +0 -0
- text/__pycache__/english.cpython-310.pyc +0 -0
- text/__pycache__/japanese.cpython-310.pyc +0 -0
- text/__pycache__/korean.cpython-310.pyc +0 -0
- text/__pycache__/mandarin.cpython-310.pyc +0 -0
- text/__pycache__/ngu_dialect.cpython-310.pyc +0 -0
- text/__pycache__/shanghainese.cpython-310.pyc +0 -0
- text/cantonese.py +15 -4
- text/cleaners.py +140 -36
- text/mandarin.py +15 -3
- text/shanghainese.py +16 -5
- utils/__pycache__/merge.cpython-310.pyc +0 -0
- utils/__pycache__/nlp.cpython-310.pyc +0 -0
- utils/__pycache__/utils.cpython-310.pyc +0 -0
- utils/merge.py +161 -0
- utils/nlp.py +82 -0
- utils/utils.py +112 -0
- vits-simple-api-installer-latest.sh +27 -0
- voice.py +408 -153
Dockerfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
FROM python:3.
|
2 |
|
3 |
RUN mkdir -p /app
|
4 |
WORKDIR /app
|
@@ -7,16 +7,31 @@ ENV DEBIAN_FRONTEND=noninteractive
|
|
7 |
|
8 |
RUN apt-get update && \
|
9 |
apt install build-essential -yq && \
|
|
|
|
|
|
|
10 |
apt-get clean && \
|
11 |
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
12 |
rm -rf /var/lib/apt/lists/*
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
COPY requirements.txt /app
|
15 |
RUN pip install -r requirements.txt
|
16 |
|
17 |
COPY . /app
|
18 |
|
19 |
-
EXPOSE
|
20 |
-
|
21 |
-
CMD ["python", "/app/app.py"]
|
22 |
|
|
|
|
1 |
+
FROM python:3.10.11-slim-bullseye
|
2 |
|
3 |
RUN mkdir -p /app
|
4 |
WORKDIR /app
|
|
|
7 |
|
8 |
RUN apt-get update && \
|
9 |
apt install build-essential -yq && \
|
10 |
+
apt install espeak-ng -yq && \
|
11 |
+
apt install cmake -yq && \
|
12 |
+
apt install -y wget -yq && \
|
13 |
apt-get clean && \
|
14 |
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
15 |
rm -rf /var/lib/apt/lists/*
|
16 |
|
17 |
+
RUN pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0
|
18 |
+
|
19 |
+
RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
|
20 |
+
tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
|
21 |
+
cd openjtalk-0.3.0.dev2 && \
|
22 |
+
rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
|
23 |
+
python setup.py install && \
|
24 |
+
cd ../ && \
|
25 |
+
rm -f openjtalk-0.3.0.dev2.tar.gz && \
|
26 |
+
rm -rf openjtalk-0.3.0.dev2
|
27 |
+
|
28 |
+
RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
|
29 |
+
|
30 |
COPY requirements.txt /app
|
31 |
RUN pip install -r requirements.txt
|
32 |
|
33 |
COPY . /app
|
34 |
|
35 |
+
EXPOSE 23456
|
|
|
|
|
36 |
|
37 |
+
CMD ["python", "/app/app.py"]
|
LICENSE
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
MIT License
|
2 |
|
3 |
-
Copyright (c)
|
4 |
|
5 |
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
of this software and associated documentation files (the "Software"), to deal
|
|
|
1 |
MIT License
|
2 |
|
3 |
+
Copyright (c) 2023 Artrajz
|
4 |
|
5 |
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
of this software and associated documentation files (the "Software"), to deal
|
LICENSE-MoeGoe
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 CjangCjengh
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
app.py
CHANGED
@@ -1,13 +1,15 @@
|
|
1 |
import os
|
2 |
-
import gradio as gr
|
3 |
import logging
|
|
|
|
|
4 |
import uuid
|
5 |
-
|
6 |
-
from flask import Flask, request, send_file, jsonify
|
7 |
from werkzeug.utils import secure_filename
|
8 |
from flask_apscheduler import APScheduler
|
9 |
-
|
10 |
-
from utils import clean_folder,
|
|
|
|
|
11 |
|
12 |
app = Flask(__name__)
|
13 |
app.config.from_pyfile("config.py")
|
@@ -16,104 +18,367 @@ scheduler = APScheduler()
|
|
16 |
scheduler.init_app(app)
|
17 |
scheduler.start()
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
CUSTOM_PATH = "/gradio"
|
24 |
|
25 |
if not os.path.exists(app.config['UPLOAD_FOLDER']):
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
|
31 |
|
32 |
-
|
33 |
-
@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def index():
|
35 |
-
return "
|
36 |
|
37 |
|
38 |
@app.route('/voice/speakers', methods=["GET", "POST"])
|
39 |
def voice_speakers_api():
|
40 |
-
|
41 |
-
return jsonify(speakers_list)
|
42 |
|
43 |
|
44 |
@app.route('/voice', methods=["GET", "POST"])
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
71 |
|
72 |
|
73 |
-
@app.route('/voice/
|
74 |
-
|
75 |
-
|
76 |
-
return jsonify("method should be POST")
|
77 |
if request.method == "POST":
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
-
|
84 |
|
85 |
-
|
|
|
86 |
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
if voice_obj[original_id][2] != voice_obj[target_id][2]:
|
96 |
-
form["status"] = "error"
|
97 |
-
form["message"] = "speaker IDs are in diffrent Model!"
|
98 |
-
return form
|
99 |
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
file_type = f"audio/{format}"
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
105 |
-
# return output
|
106 |
|
107 |
|
108 |
-
|
109 |
-
@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
def clean_task():
|
111 |
clean_folder(app.config["UPLOAD_FOLDER"])
|
112 |
-
clean_folder(app.config["
|
113 |
|
114 |
|
115 |
if __name__ == '__main__':
|
116 |
-
|
117 |
-
app =
|
118 |
-
# app.run(host='0.0.0.0', port=app.config["PORT"]) # 如果对外开放用这个,docker部署也用这个
|
119 |
-
# app.run(host='127.0.0.1', port=app.config["PORT"], debug=True) # 本地运行、调试
|
|
|
1 |
import os
|
|
|
2 |
import logging
|
3 |
+
import time
|
4 |
+
import logzero
|
5 |
import uuid
|
6 |
+
from flask import Flask, request, send_file, jsonify, make_response
|
|
|
7 |
from werkzeug.utils import secure_filename
|
8 |
from flask_apscheduler import APScheduler
|
9 |
+
from functools import wraps
|
10 |
+
from utils.utils import clean_folder, check_is_none
|
11 |
+
from utils.merge import merge_model
|
12 |
+
from io import BytesIO
|
13 |
|
14 |
app = Flask(__name__)
|
15 |
app.config.from_pyfile("config.py")
|
|
|
18 |
scheduler.init_app(app)
|
19 |
scheduler.start()
|
20 |
|
21 |
+
logzero.loglevel(logging.WARNING)
|
22 |
+
logger = logging.getLogger("vits-simple-api")
|
23 |
+
level = app.config.get("LOGGING_LEVEL", "DEBUG")
|
24 |
+
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
|
25 |
+
'CRITICAL': logging.CRITICAL}
|
26 |
+
logging.basicConfig(level=level_dict[level])
|
27 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
28 |
|
29 |
+
tts = merge_model(app.config["MODEL_LIST"])
|
|
|
|
|
30 |
|
31 |
if not os.path.exists(app.config['UPLOAD_FOLDER']):
|
32 |
+
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
33 |
+
|
34 |
+
if not os.path.exists(app.config['CACHE_PATH']):
|
35 |
+
os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
|
36 |
|
37 |
|
38 |
+
def require_api_key(func):
|
39 |
+
@wraps(func)
|
40 |
+
def check_api_key(*args, **kwargs):
|
41 |
+
if not app.config.get('API_KEY_ENABLED', False):
|
42 |
+
return func(*args, **kwargs)
|
43 |
+
else:
|
44 |
+
api_key = request.args.get('api_key') or request.headers.get('X-API-KEY')
|
45 |
+
if api_key and api_key == app.config['API_KEY']:
|
46 |
+
return func(*args, **kwargs)
|
47 |
+
else:
|
48 |
+
return make_response(jsonify({"status": "error", "message": "Invalid API Key"}), 401)
|
49 |
+
|
50 |
+
return check_api_key
|
51 |
+
|
52 |
+
|
53 |
+
@app.route('/', methods=["GET", "POST"])
|
54 |
def index():
|
55 |
+
return "vits-simple-api"
|
56 |
|
57 |
|
58 |
@app.route('/voice/speakers', methods=["GET", "POST"])
|
59 |
def voice_speakers_api():
|
60 |
+
return jsonify(tts.voice_speakers)
|
|
|
61 |
|
62 |
|
63 |
@app.route('/voice', methods=["GET", "POST"])
|
64 |
+
@app.route('/voice/vits', methods=["GET", "POST"])
|
65 |
+
@require_api_key
|
66 |
+
def voice_vits_api():
|
67 |
+
try:
|
68 |
+
if request.method == "GET":
|
69 |
+
text = request.args.get("text", "")
|
70 |
+
id = int(request.args.get("id", app.config.get("ID", 0)))
|
71 |
+
format = request.args.get("format", app.config.get("FORMAT", "wav"))
|
72 |
+
lang = request.args.get("lang", app.config.get("LANG", "auto"))
|
73 |
+
length = float(request.args.get("length", app.config.get("LENGTH", 1)))
|
74 |
+
noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
|
75 |
+
noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
|
76 |
+
max = int(request.args.get("max", app.config.get("MAX", 50)))
|
77 |
+
elif request.method == "POST":
|
78 |
+
text = request.form.get("text", "")
|
79 |
+
id = int(request.form.get("id", app.config.get("ID", 0)))
|
80 |
+
format = request.form.get("format", app.config.get("FORMAT", "wav"))
|
81 |
+
lang = request.form.get("lang", app.config.get("LANG", "auto"))
|
82 |
+
length = float(request.form.get("length", app.config.get("LENGTH", 1)))
|
83 |
+
noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
|
84 |
+
noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
|
85 |
+
max = int(request.form.get("max", app.config.get("MAX", 50)))
|
86 |
+
except Exception as e:
|
87 |
+
logger.error(f"[VITS] {e}")
|
88 |
+
return make_response("parameter error", 400)
|
89 |
+
|
90 |
+
logger.info(f"[VITS] id:{id} format:{format} lang:{lang} length:{length} noise:{noise} noisew:{noisew}")
|
91 |
+
logger.info(f"[VITS] len:{len(text)} text:{text}")
|
92 |
+
|
93 |
+
if check_is_none(text):
|
94 |
+
logger.info(f"[VITS] text is empty")
|
95 |
+
return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
|
96 |
+
|
97 |
+
if check_is_none(id):
|
98 |
+
logger.info(f"[VITS] speaker id is empty")
|
99 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
100 |
+
|
101 |
+
if id < 0 or id >= tts.vits_speakers_count:
|
102 |
+
logger.info(f"[VITS] speaker id {id} does not exist")
|
103 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
104 |
+
|
105 |
+
speaker_lang = tts.voice_speakers["VITS"][id].get('lang')
|
106 |
+
if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
|
107 |
+
logger.info(f"[VITS] lang \"{lang}\" is not in {speaker_lang}")
|
108 |
+
return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
|
109 |
+
|
110 |
+
if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
|
111 |
+
speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
|
112 |
+
|
113 |
+
fname = f"{str(uuid.uuid1())}.{format}"
|
114 |
+
file_type = f"audio/{format}"
|
115 |
+
|
116 |
+
t1 = time.time()
|
117 |
+
output = tts.vits_infer({"text": text,
|
118 |
+
"id": id,
|
119 |
+
"format": format,
|
120 |
+
"length": length,
|
121 |
+
"noise": noise,
|
122 |
+
"noisew": noisew,
|
123 |
+
"max": max,
|
124 |
+
"lang": lang,
|
125 |
+
"speaker_lang": speaker_lang})
|
126 |
+
t2 = time.time()
|
127 |
+
logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
|
128 |
|
129 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
130 |
|
131 |
|
132 |
+
@app.route('/voice/hubert-vits', methods=["POST"])
|
133 |
+
@require_api_key
|
134 |
+
def voice_hubert_api():
|
|
|
135 |
if request.method == "POST":
|
136 |
+
try:
|
137 |
+
voice = request.files['upload']
|
138 |
+
id = int(request.form.get("id"))
|
139 |
+
format = request.form.get("format", app.config.get("LANG", "auto"))
|
140 |
+
length = float(request.form.get("length", app.config.get("LENGTH", 1)))
|
141 |
+
noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
|
142 |
+
noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
|
143 |
+
except Exception as e:
|
144 |
+
logger.error(f"[hubert] {e}")
|
145 |
+
return make_response("parameter error", 400)
|
146 |
|
147 |
+
logger.info(f"[hubert] id:{id} format:{format} length:{length} noise:{noise} noisew:{noisew}")
|
148 |
|
149 |
+
fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
|
150 |
+
voice.save(os.path.join(app.config['UPLOAD_FOLDER'], fname))
|
151 |
|
152 |
+
if check_is_none(id):
|
153 |
+
logger.info(f"[hubert] speaker id is empty")
|
154 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
155 |
+
|
156 |
+
if id < 0 or id >= tts.hubert_speakers_count:
|
157 |
+
logger.info(f"[hubert] speaker id {id} does not exist")
|
158 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
159 |
+
|
160 |
+
file_type = f"audio/{format}"
|
161 |
+
|
162 |
+
t1 = time.time()
|
163 |
+
output = tts.hubert_vits_infer({"id": id,
|
164 |
+
"format": format,
|
165 |
+
"length": length,
|
166 |
+
"noise": noise,
|
167 |
+
"noisew": noisew,
|
168 |
+
"audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
|
169 |
+
t2 = time.time()
|
170 |
+
logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
|
171 |
+
|
172 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
173 |
+
|
174 |
+
|
175 |
+
@app.route('/voice/w2v2-vits', methods=["GET", "POST"])
|
176 |
+
@require_api_key
|
177 |
+
def voice_w2v2_api():
|
178 |
+
try:
|
179 |
+
if request.method == "GET":
|
180 |
+
text = request.args.get("text", "")
|
181 |
+
id = int(request.args.get("id", app.config.get("ID", 0)))
|
182 |
+
format = request.args.get("format", app.config.get("FORMAT", "wav"))
|
183 |
+
lang = request.args.get("lang", app.config.get("LANG", "auto"))
|
184 |
+
length = float(request.args.get("length", app.config.get("LENGTH", 1)))
|
185 |
+
noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
|
186 |
+
noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
|
187 |
+
max = int(request.args.get("max", app.config.get("MAX", 50)))
|
188 |
+
emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
|
189 |
+
elif request.method == "POST":
|
190 |
+
text = request.form.get("text", "")
|
191 |
+
id = int(request.form.get("id", app.config.get("ID", 0)))
|
192 |
+
format = request.form.get("format", app.config.get("FORMAT", "wav"))
|
193 |
+
lang = request.form.get("lang", app.config.get("LANG", "auto"))
|
194 |
+
length = float(request.form.get("length"))
|
195 |
+
noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
|
196 |
+
noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
|
197 |
+
max = int(request.form.get("max", app.config.get("MAX", 50)))
|
198 |
+
emotion = int(request.form.get("emotion", app.config.get("EMOTION", 0)))
|
199 |
+
except Exception as e:
|
200 |
+
logger.error(f"[w2v2] {e}")
|
201 |
+
return make_response(f"parameter error", 400)
|
202 |
+
|
203 |
+
logger.info(f"[w2v2] id:{id} format:{format} lang:{lang} "
|
204 |
+
f"length:{length} noise:{noise} noisew:{noisew} emotion:{emotion}")
|
205 |
+
logger.info(f"[w2v2] len:{len(text)} text:{text}")
|
206 |
+
|
207 |
+
if check_is_none(text):
|
208 |
+
logger.info(f"[w2v2] text is empty")
|
209 |
+
return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
|
210 |
+
|
211 |
+
if check_is_none(id):
|
212 |
+
logger.info(f"[w2v2] speaker id is empty")
|
213 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
214 |
+
|
215 |
+
if id < 0 or id >= tts.w2v2_speakers_count:
|
216 |
+
logger.info(f"[w2v2] speaker id {id} does not exist")
|
217 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
218 |
+
|
219 |
+
speaker_lang = tts.voice_speakers["W2V2-VITS"][id].get('lang')
|
220 |
+
if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
|
221 |
+
logger.info(f"[w2v2] lang \"{lang}\" is not in {speaker_lang}")
|
222 |
+
return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
|
223 |
+
|
224 |
+
if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
|
225 |
+
speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
|
226 |
+
|
227 |
+
fname = f"{str(uuid.uuid1())}.{format}"
|
228 |
+
file_type = f"audio/{format}"
|
229 |
|
230 |
+
t1 = time.time()
|
231 |
+
output = tts.w2v2_vits_infer({"text": text,
|
232 |
+
"id": id,
|
233 |
+
"format": format,
|
234 |
+
"length": length,
|
235 |
+
"noise": noise,
|
236 |
+
"noisew": noisew,
|
237 |
+
"max": max,
|
238 |
+
"lang": lang,
|
239 |
+
"emotion": emotion,
|
240 |
+
"speaker_lang": speaker_lang})
|
241 |
+
t2 = time.time()
|
242 |
+
logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
|
243 |
+
|
244 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
245 |
|
|
|
|
|
|
|
|
|
246 |
|
247 |
+
@app.route('/voice/conversion', methods=["POST"])
|
248 |
+
@app.route('/voice/vits/conversion', methods=["POST"])
|
249 |
+
@require_api_key
|
250 |
+
def vits_voice_conversion_api():
|
251 |
+
if request.method == "POST":
|
252 |
+
try:
|
253 |
+
voice = request.files['upload']
|
254 |
+
original_id = int(request.form["original_id"])
|
255 |
+
target_id = int(request.form["target_id"])
|
256 |
+
format = request.form.get("format", voice.filename.split(".")[1])
|
257 |
+
except Exception as e:
|
258 |
+
logger.error(f"[vits_voice_convertsion] {e}")
|
259 |
+
return make_response("parameter error", 400)
|
260 |
+
|
261 |
+
fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
|
262 |
+
audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
|
263 |
+
voice.save(audio_path)
|
264 |
file_type = f"audio/{format}"
|
265 |
|
266 |
+
logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
|
267 |
+
t1 = time.time()
|
268 |
+
try:
|
269 |
+
output = tts.vits_voice_conversion({"audio_path": audio_path,
|
270 |
+
"original_id": original_id,
|
271 |
+
"target_id": target_id,
|
272 |
+
"format": format})
|
273 |
+
except Exception as e:
|
274 |
+
logger.info(f"[vits_voice_convertsion] {e}")
|
275 |
+
return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
|
276 |
+
t2 = time.time()
|
277 |
+
logger.info(f"finish in {(t2 - t1):.2f}s")
|
278 |
+
|
279 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
|
|
280 |
|
281 |
|
282 |
+
@app.route('/voice/ssml', methods=["POST"])
|
283 |
+
@require_api_key
|
284 |
+
def ssml():
|
285 |
+
try:
|
286 |
+
ssml = request.form["ssml"]
|
287 |
+
except Exception as e:
|
288 |
+
logger.info(f"[ssml] {e}")
|
289 |
+
return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
|
290 |
+
|
291 |
+
logger.debug(ssml)
|
292 |
+
|
293 |
+
t1 = time.time()
|
294 |
+
try:
|
295 |
+
output, format = tts.create_ssml_infer_task(ssml)
|
296 |
+
except Exception as e:
|
297 |
+
logger.info(f"[ssml] {e}")
|
298 |
+
return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
|
299 |
+
t2 = time.time()
|
300 |
+
|
301 |
+
fname = f"{str(uuid.uuid1())}.{format}"
|
302 |
+
file_type = f"audio/{format}"
|
303 |
+
|
304 |
+
logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
|
305 |
+
|
306 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
307 |
+
|
308 |
+
|
309 |
+
@app.route('/voice/dimension-emotion', methods=["POST"])
|
310 |
+
def dimensional_emotion():
|
311 |
+
if request.method == "POST":
|
312 |
+
try:
|
313 |
+
audio = request.files['upload']
|
314 |
+
except Exception as e:
|
315 |
+
logger.error(f"[dimensional_emotion] {e}")
|
316 |
+
return make_response("parameter error", 400)
|
317 |
+
|
318 |
+
content = BytesIO(audio.read())
|
319 |
+
|
320 |
+
file_type = "application/octet-stream; charset=ascii"
|
321 |
+
fname = os.path.splitext(audio.filename)[0] + ".npy"
|
322 |
+
output = tts.get_dimensional_emotion_npy(content)
|
323 |
+
|
324 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
325 |
+
|
326 |
+
|
327 |
+
@app.route('/voice/check', methods=["GET", "POST"])
|
328 |
+
def check():
|
329 |
+
try:
|
330 |
+
if request.method == "GET":
|
331 |
+
model = request.args.get("model")
|
332 |
+
id = int(request.args.get("id"))
|
333 |
+
elif request.method == "POST":
|
334 |
+
model = request.form["model"]
|
335 |
+
id = int(request.form["id"])
|
336 |
+
except Exception as e:
|
337 |
+
logger.info(f"[check] {e}")
|
338 |
+
return make_response(jsonify({"status": "error", "message": "parameter error"}), 400)
|
339 |
+
|
340 |
+
if check_is_none(model):
|
341 |
+
logger.info(f"[check] model {model} is empty")
|
342 |
+
return make_response(jsonify({"status": "error", "message": "model is empty"}), 400)
|
343 |
+
|
344 |
+
if model.upper() not in ("VITS", "HUBERT", "W2V2"):
|
345 |
+
res = make_response(jsonify({"status": "error", "message": f"model {model} does not exist"}))
|
346 |
+
res.status = 404
|
347 |
+
logger.info(f"[check] speaker id {id} error")
|
348 |
+
return res
|
349 |
+
|
350 |
+
if check_is_none(id):
|
351 |
+
logger.info(f"[check] speaker id is empty")
|
352 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
353 |
+
|
354 |
+
if model.upper() == "VITS":
|
355 |
+
speaker_list = tts.voice_speakers["VITS"]
|
356 |
+
elif model.upper() == "HUBERT":
|
357 |
+
speaker_list = tts.voice_speakers["HUBERT-VITS"]
|
358 |
+
elif model.upper() == "W2V2":
|
359 |
+
speaker_list = tts.voice_speakers["W2V2-VITS"]
|
360 |
+
|
361 |
+
if len(speaker_list) == 0:
|
362 |
+
logger.info(f"[check] {model} not loaded")
|
363 |
+
return make_response(jsonify({"status": "error", "message": f"{model} not loaded"}), 400)
|
364 |
+
|
365 |
+
if id < 0 or id >= len(speaker_list):
|
366 |
+
logger.info(f"[check] speaker id {id} does not exist")
|
367 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
368 |
+
name = str(speaker_list[id]["name"])
|
369 |
+
lang = speaker_list[id]["lang"]
|
370 |
+
logger.info(f"[check] check id:{id} name:{name} lang:{lang}")
|
371 |
+
|
372 |
+
return make_response(jsonify({"status": "success", "id": id, "name": name, "lang": lang}), 200)
|
373 |
+
|
374 |
+
|
375 |
+
# regular cleaning
|
376 |
+
@scheduler.task('interval', id='clean_task', seconds=3600, misfire_grace_time=900)
|
377 |
def clean_task():
|
378 |
clean_folder(app.config["UPLOAD_FOLDER"])
|
379 |
+
clean_folder(app.config["CACHE_PATH"])
|
380 |
|
381 |
|
382 |
if __name__ == '__main__':
|
383 |
+
app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False)) # 对外开放
|
384 |
+
# app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True) # 本地运行、调试
|
|
|
|
docker-compose.yaml
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
version: '3.4'
|
2 |
services:
|
3 |
-
|
4 |
-
image: artrajz/
|
5 |
restart: always
|
6 |
ports:
|
7 |
- 23456:23456
|
8 |
environment:
|
9 |
LANG: 'C.UTF-8'
|
|
|
10 |
volumes:
|
11 |
- ./Model:/app/Model # 挂载模型文件夹
|
12 |
- ./config.py:/app/config.py # 挂载配置文件
|
|
|
1 |
version: '3.4'
|
2 |
services:
|
3 |
+
vits:
|
4 |
+
image: artrajz/vits-simple-api:latest
|
5 |
restart: always
|
6 |
ports:
|
7 |
- 23456:23456
|
8 |
environment:
|
9 |
LANG: 'C.UTF-8'
|
10 |
+
TZ: Asia/Shanghai #timezone
|
11 |
volumes:
|
12 |
- ./Model:/app/Model # 挂载模型文件夹
|
13 |
- ./config.py:/app/config.py # 挂载配置文件
|
models.py
CHANGED
@@ -363,7 +363,7 @@ class SynthesizerTrn(nn.Module):
|
|
363 |
else:
|
364 |
self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
|
365 |
|
366 |
-
if n_speakers
|
367 |
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
368 |
|
369 |
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
|
|
|
363 |
else:
|
364 |
self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
|
365 |
|
366 |
+
if n_speakers >= 1:
|
367 |
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
368 |
|
369 |
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
|
optimizer_removal.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import load, save
|
2 |
+
|
3 |
+
if __name__ == '__main__':
|
4 |
+
print("优化器通常不会被用于推理阶段,如果只用于推理可以去除优化器以减小模型体积\n")
|
5 |
+
input_path = input("请输入模型的路径:")
|
6 |
+
output_path = f"{input_path.split('.')[0]}_inference.pth"
|
7 |
+
checkpoint_dict = load(input_path, map_location='cpu')
|
8 |
+
checkpoint_dict_new = {}
|
9 |
+
for k, v in checkpoint_dict.items():
|
10 |
+
if k == "optimizer":
|
11 |
+
print(f"remove optimizer")
|
12 |
+
continue
|
13 |
+
checkpoint_dict_new[k] = v
|
14 |
+
save(checkpoint_dict_new, output_path)
|
15 |
+
print("finish")
|
16 |
+
print(output_path)
|
request.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import requests
|
3 |
+
import os
|
4 |
+
import random
|
5 |
+
import string
|
6 |
+
from requests_toolbelt.multipart.encoder import MultipartEncoder
|
7 |
+
|
8 |
+
abs_path = os.path.dirname(__file__)
|
9 |
+
base = "http://127.0.0.1:23456"
|
10 |
+
|
11 |
+
|
12 |
+
# 映射表
|
13 |
+
def voice_speakers():
|
14 |
+
url = f"{base}/voice/speakers"
|
15 |
+
|
16 |
+
res = requests.post(url=url)
|
17 |
+
json = res.json()
|
18 |
+
for i in json:
|
19 |
+
print(i)
|
20 |
+
for j in json[i]:
|
21 |
+
print(j)
|
22 |
+
return json
|
23 |
+
|
24 |
+
|
25 |
+
# 语音合成 voice vits
|
26 |
+
def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
|
27 |
+
fields = {
|
28 |
+
"text": text,
|
29 |
+
"id": str(id),
|
30 |
+
"format": format,
|
31 |
+
"lang": lang,
|
32 |
+
"length": str(length),
|
33 |
+
"noise": str(noise),
|
34 |
+
"noisew": str(noisew),
|
35 |
+
"max": str(max)
|
36 |
+
}
|
37 |
+
boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
|
38 |
+
|
39 |
+
m = MultipartEncoder(fields=fields, boundary=boundary)
|
40 |
+
headers = {"Content-Type": m.content_type}
|
41 |
+
url = f"{base}/voice"
|
42 |
+
|
43 |
+
res = requests.post(url=url, data=m, headers=headers)
|
44 |
+
fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
|
45 |
+
path = f"{abs_path}/{fname}"
|
46 |
+
|
47 |
+
with open(path, "wb") as f:
|
48 |
+
f.write(res.content)
|
49 |
+
print(path)
|
50 |
+
return path
|
51 |
+
|
52 |
+
|
53 |
+
# 语音转换 hubert-vits
|
54 |
+
def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
|
55 |
+
upload_name = os.path.basename(upload_path)
|
56 |
+
upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
|
57 |
+
|
58 |
+
with open(upload_path, 'rb') as upload_file:
|
59 |
+
fields = {
|
60 |
+
"upload": (upload_name, upload_file, upload_type),
|
61 |
+
"id": str(id),
|
62 |
+
"format": format,
|
63 |
+
"length": str(length),
|
64 |
+
"noise": str(noise),
|
65 |
+
"noisew": str(noisew),
|
66 |
+
}
|
67 |
+
boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
|
68 |
+
|
69 |
+
m = MultipartEncoder(fields=fields, boundary=boundary)
|
70 |
+
headers = {"Content-Type": m.content_type}
|
71 |
+
url = f"{base}/voice/hubert-vits"
|
72 |
+
|
73 |
+
res = requests.post(url=url, data=m, headers=headers)
|
74 |
+
fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
|
75 |
+
path = f"{abs_path}/{fname}"
|
76 |
+
|
77 |
+
with open(path, "wb") as f:
|
78 |
+
f.write(res.content)
|
79 |
+
print(path)
|
80 |
+
return path
|
81 |
+
|
82 |
+
|
83 |
+
# Dimensional-emotion model (w2v2-vits): synthesize `text` with an emotion
# reference index and save the returned audio next to this script.
def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
    """POST text to /voice/w2v2-vits and return the local path of the
    downloaded audio file."""
    form = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "max": str(max),
        "emotion": str(emotion),
    }

    random_tail = ''.join(random.sample(string.ascii_letters + string.digits, 16))
    boundary = '----VoiceConversionFormBoundary' + random_tail

    encoder = MultipartEncoder(fields=form, boundary=boundary)
    response = requests.post(
        url=f"{base}/voice/w2v2-vits",
        data=encoder,
        headers={"Content-Type": encoder.content_type},
    )

    # The generated file name arrives in the Content-Disposition header.
    fname = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    saved_path = f"{abs_path}/{fname}"

    with open(saved_path, "wb") as out:
        out.write(response.content)
    print(saved_path)
    return saved_path
|
110 |
+
|
111 |
+
|
112 |
+
# Voice conversion between two speakers of the same VITS model.
def voice_conversion(upload_path, original_id, target_id):
    """POST an audio file to /voice/conversion, converting the voice of
    speaker `original_id` into that of `target_id`.

    :param upload_path: path of the source audio file (wav/ogg)
    :param original_id: speaker id of the uploaded audio
    :param target_id: speaker id to convert the audio to
    :return: local path of the downloaded converted audio
    """
    upload_name = os.path.basename(upload_path)
    # Fix: derive the MIME subtype from the actual extension instead of
    # upload_name.split(".")[1], which mishandled names with several dots
    # and raised IndexError for extensionless names.
    upload_type = f'audio/{os.path.splitext(upload_name)[1].lstrip(".")}'  # audio/wav, audio/ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
            "original_id": str(original_id),
            "target_id": str(target_id),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
        m = MultipartEncoder(fields=fields, boundary=boundary)

        headers = {"Content-Type": m.content_type}
        url = f"{base}/voice/conversion"

        # POST while the uploaded file is still open for streaming.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    path = f"{abs_path}/{fname}"

    with open(path, "wb") as f:
        f.write(res.content)
    print(path)
    return path
|
138 |
+
|
139 |
+
|
140 |
+
def voice_ssml(ssml):
    """POST an SSML document to /voice/ssml and save the synthesized audio.

    Returns the local path of the downloaded file.
    """
    form = {"ssml": ssml}

    random_tail = ''.join(random.sample(string.ascii_letters + string.digits, 16))
    boundary = '----VoiceConversionFormBoundary' + random_tail

    encoder = MultipartEncoder(fields=form, boundary=boundary)
    response = requests.post(
        url=f"{base}/voice/ssml",
        data=encoder,
        headers={"Content-Type": encoder.content_type},
    )

    # File name of the generated audio comes back in Content-Disposition.
    fname = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    saved_path = f"{abs_path}/{fname}"

    with open(saved_path, "wb") as out:
        out.write(response.content)
    print(saved_path)
    return saved_path
|
158 |
+
|
159 |
+
|
160 |
+
def voice_dimensional_emotion(upload_path):
    """Upload an audio file to /voice/dimension-emotion and save the
    extracted emotion-embedding file the server returns.

    :param upload_path: path of the reference audio file (wav/ogg)
    :return: local path of the downloaded embedding file
    """
    upload_name = os.path.basename(upload_path)
    # Fix: derive the MIME subtype from the actual extension instead of
    # upload_name.split(".")[1], which broke on names with multiple dots
    # and raised IndexError for names without an extension.
    upload_type = f'audio/{os.path.splitext(upload_name)[1].lstrip(".")}'  # audio/wav, audio/ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

        m = MultipartEncoder(fields=fields, boundary=boundary)
        headers = {"Content-Type": m.content_type}
        url = f"{base}/voice/dimension-emotion"

        # POST while the uploaded file is still open for streaming.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    path = f"{abs_path}/{fname}"

    with open(path, "wb") as f:
        f.write(res.content)
    print(path)
    return path
|
182 |
+
|
183 |
+
|
184 |
+
import time

# Manual smoke-test / benchmark section. Uncomment one of the calls below to
# exercise a different endpoint:
# voice_conversion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 91, 93)
# voice_hubert_vits("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 0)
# voice_vits(text, format="wav", lang="zh")
# voice_w2v2_vits(text, emotion=111)
# os.system(voice_ssml(ssml))
# voice_dimensional_emotion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav")

# Sample SSML payload for the /voice/ssml endpoint.
ssml = """
<speak lang="auto">
<voice>这几天心里颇不宁静。</voice>
<voice>今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
<voice>月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
<voice>妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
<voice>我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
<voice>沿着荷塘,是一条曲折的小煤屑路。</voice>
<voice>这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
<voice>荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
<voice>路的一旁,是些杨柳,和一些不知道名字的树。</voice>
<voice>没有月光的晚上,这路上阴森森的,有些怕人。</voice>
<voice>今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
<voice>路上只我一个人,背着手踱着。</voice>
<voice>这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
<voice>我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
<voice>像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
<voice>白天里一定要做的事,一定要说的话,现在都可不理。</voice>
<voice>这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
</speak>
"""

# Sample plain-text payload.
text = """猫咪是爱撒娇、爱玩耍的小家伙,通常有着柔软的绒毛和温柔的眼神,是许多人都喜欢的宠物哦~它们特别喜欢舔自己的毛发,用柔顺的小脑袋搓人的脚丫子,还能给人带来很多欢乐和温馨。
"""

# Time a single /voice request and report text length vs. elapsed seconds.
t1 = time.time()
os.system(voice_vits(text, id=0, format="wav", max=0))
t2 = time.time()
print(f"len:{len(text)}耗时:{t2 - t1}")
|
test.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
from io import BytesIO

# Round-trip a small array through an in-memory .npy buffer.
array = np.array([1, 2, 3])

npy = BytesIO()
np.save(npy, array)
npy.seek(0)
# Fix: load from the in-memory buffer that was just written and rewound.
# The previous code loaded a hard-coded absolute path
# ("H:\git/vits-simple-api\Model/npy/...npy") that exists only on the
# original author's machine, leaving the BytesIO buffer as dead code.
tmp = np.load(npy)
print(tmp)
|
11 |
+
|
text/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-310.pyc and b/text/__pycache__/__init__.cpython-310.pyc differ
|
|
text/__pycache__/cantonese.cpython-310.pyc
ADDED
Binary file (2.34 kB). View file
|
|
text/__pycache__/cleaners.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/cleaners.cpython-310.pyc and b/text/__pycache__/cleaners.cpython-310.pyc differ
|
|
text/__pycache__/english.cpython-310.pyc
ADDED
Binary file (4.69 kB). View file
|
|
text/__pycache__/japanese.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-310.pyc and b/text/__pycache__/japanese.cpython-310.pyc differ
|
|
text/__pycache__/korean.cpython-310.pyc
ADDED
Binary file (5.58 kB). View file
|
|
text/__pycache__/mandarin.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/mandarin.cpython-310.pyc and b/text/__pycache__/mandarin.cpython-310.pyc differ
|
|
text/__pycache__/ngu_dialect.cpython-310.pyc
ADDED
Binary file (1.17 kB). View file
|
|
text/__pycache__/shanghainese.cpython-310.pyc
ADDED
Binary file (2.51 kB). View file
|
|
text/cantonese.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
|
|
4 |
|
5 |
-
|
6 |
-
converter = opencc.OpenCC('jyutjyu')
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
@@ -35,6 +35,16 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
35 |
('Z', 'iː˨sɛːt̚˥')
|
36 |
]]
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def number_to_cantonese(text):
|
40 |
return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
|
@@ -47,9 +57,10 @@ def latin_to_ipa(text):
|
|
47 |
|
48 |
|
49 |
def cantonese_to_ipa(text):
|
|
|
50 |
text = number_to_cantonese(text.upper())
|
51 |
-
text = converter.convert(text).replace('-','').replace('$',' ')
|
52 |
-
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
|
53 |
text = re.sub(r'[、;:]', ',', text)
|
54 |
text = re.sub(r'\s*,\s*', ', ', text)
|
55 |
text = re.sub(r'\s*。\s*', '. ', text)
|
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
4 |
+
import config
|
5 |
|
6 |
+
converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/jyutjyu_2')
|
|
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
|
35 |
('Z', 'iː˨sɛːt̚˥')
|
36 |
]]
|
37 |
|
38 |
+
# (pattern, replacement) pairs that spell out symbol notations in Chinese.
# Currently only percentages: "85%" -> "百分之85" (digits are converted to
# Chinese characters by a later stage).
_symbols_to_chinese = [(re.compile(f'{pattern}'), repl) for pattern, repl in [
    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
]]


def symbols_to_chinese(text):
    """Rewrite symbol expressions (e.g. percentages) as spoken Chinese."""
    for pattern, replacement in _symbols_to_chinese:
        text = re.sub(pattern, replacement, text)
    return text
|
47 |
+
|
48 |
|
49 |
def number_to_cantonese(text):
|
50 |
return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
|
|
|
57 |
|
58 |
|
59 |
def cantonese_to_ipa(text):
|
60 |
+
text = symbols_to_chinese(text)
|
61 |
text = number_to_cantonese(text.upper())
|
62 |
+
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
63 |
+
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
|
64 |
text = re.sub(r'[、;:]', ',', text)
|
65 |
text = re.sub(r'\s*,\s*', ', ', text)
|
66 |
text = re.sub(r'\s*。\s*', '. ', text)
|
text/cleaners.py
CHANGED
@@ -1,10 +1,77 @@
|
|
1 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
|
4 |
def japanese_cleaners(text):
|
5 |
from text.japanese import japanese_to_romaji_with_accent
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
8 |
return text
|
9 |
|
10 |
|
@@ -15,20 +82,31 @@ def japanese_cleaners2(text):
|
|
15 |
def korean_cleaners(text):
|
16 |
'''Pipeline for Korean text'''
|
17 |
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
return text
|
23 |
|
24 |
|
25 |
def chinese_cleaners(text):
|
26 |
'''Pipeline for Chinese text'''
|
27 |
-
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
return text
|
33 |
|
34 |
|
@@ -36,9 +114,9 @@ def zh_ja_mixture_cleaners(text):
|
|
36 |
from text.mandarin import chinese_to_romaji
|
37 |
from text.japanese import japanese_to_romaji_with_accent
|
38 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
39 |
-
lambda x: chinese_to_romaji(x.group(1))+' ', text)
|
40 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
|
41 |
-
x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
|
42 |
text = re.sub(r'\s+$', '', text)
|
43 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
44 |
return text
|
@@ -57,15 +135,15 @@ def cjks_cleaners(text):
|
|
57 |
from text.sanskrit import devanagari_to_ipa
|
58 |
from text.english import english_to_lazy_ipa
|
59 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
60 |
-
lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
|
61 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
62 |
-
lambda x: japanese_to_ipa(x.group(1))+' ', text)
|
63 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
64 |
-
lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
|
65 |
text = re.sub(r'\[SA\](.*?)\[SA\]',
|
66 |
-
lambda x: devanagari_to_ipa(x.group(1))+' ', text)
|
67 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
68 |
-
lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
|
69 |
text = re.sub(r'\s+$', '', text)
|
70 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
71 |
return text
|
@@ -77,13 +155,13 @@ def cjke_cleaners(text):
|
|
77 |
from text.korean import korean_to_ipa
|
78 |
from text.english import english_to_ipa2
|
79 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
80 |
-
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
|
81 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
82 |
-
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
|
83 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
84 |
-
lambda x: korean_to_ipa(x.group(1))+' ', text)
|
85 |
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
86 |
-
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
|
87 |
text = re.sub(r'\s+$', '', text)
|
88 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
89 |
return text
|
@@ -95,13 +173,28 @@ def cjke_cleaners2(text):
|
|
95 |
from text.korean import korean_to_ipa
|
96 |
from text.english import english_to_ipa2
|
97 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
98 |
-
lambda x: chinese_to_ipa(x.group(1))+' ', text)
|
99 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
100 |
-
lambda x: japanese_to_ipa2(x.group(1))+' ', text)
|
101 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
102 |
-
lambda x: korean_to_ipa(x.group(1))+' ', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
104 |
-
lambda x: english_to_ipa2(x.group(1))+' ', text)
|
105 |
text = re.sub(r'\s+$', '', text)
|
106 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
107 |
return text
|
@@ -109,15 +202,25 @@ def cjke_cleaners2(text):
|
|
109 |
|
110 |
def thai_cleaners(text):
|
111 |
from text.thai import num_to_thai, latin_to_thai
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
114 |
return text
|
115 |
|
116 |
|
117 |
def shanghainese_cleaners(text):
|
118 |
from text.shanghainese import shanghainese_to_ipa
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
121 |
return text
|
122 |
|
123 |
|
@@ -129,17 +232,18 @@ def chinese_dialect_cleaners(text):
|
|
129 |
from text.english import english_to_lazy_ipa2
|
130 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
131 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
132 |
-
lambda x: chinese_to_ipa2(x.group(1))+' ', text)
|
133 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
134 |
-
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
|
135 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
136 |
-
|
|
|
137 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
138 |
-
lambda x: cantonese_to_ipa(x.group(1))+' ', text)
|
139 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
140 |
-
lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
|
141 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
142 |
-
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
|
143 |
text = re.sub(r'\s+$', '', text)
|
144 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
145 |
return text
|
|
|
1 |
import re
|
2 |
+
import config
|
3 |
+
from unidecode import unidecode
|
4 |
+
from phonemizer import phonemize
|
5 |
+
from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
6 |
+
|
7 |
+
ESPEAK_LIBRARY = getattr(config, "ESPEAK_LIBRARY", "")
|
8 |
+
if ESPEAK_LIBRARY != "":
|
9 |
+
EspeakWrapper.set_library(ESPEAK_LIBRARY)
|
10 |
+
|
11 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
12 |
+
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
|
13 |
+
('mrs', 'misess'),
|
14 |
+
('mr', 'mister'),
|
15 |
+
('dr', 'doctor'),
|
16 |
+
('st', 'saint'),
|
17 |
+
('co', 'company'),
|
18 |
+
('jr', 'junior'),
|
19 |
+
('maj', 'major'),
|
20 |
+
('gen', 'general'),
|
21 |
+
('drs', 'doctors'),
|
22 |
+
('rev', 'reverend'),
|
23 |
+
('lt', 'lieutenant'),
|
24 |
+
('hon', 'honorable'),
|
25 |
+
('sgt', 'sergeant'),
|
26 |
+
('capt', 'captain'),
|
27 |
+
('esq', 'esquire'),
|
28 |
+
('ltd', 'limited'),
|
29 |
+
('col', 'colonel'),
|
30 |
+
('ft', 'fort'),
|
31 |
+
]]
|
32 |
+
|
33 |
+
|
34 |
+
def expand_abbreviations(text):
|
35 |
+
for regex, replacement in _abbreviations:
|
36 |
+
text = re.sub(regex, replacement, text)
|
37 |
+
return text
|
38 |
+
|
39 |
+
|
40 |
+
def transliteration_cleaners(text):
|
41 |
+
'''Pipeline for non-English text that transliterates to ASCII.'''
|
42 |
+
text = unidecode(text)
|
43 |
+
text = text.lower()
|
44 |
+
text = re.sub(r'\s+', ' ', text)
|
45 |
+
text = expand_abbreviations(text)
|
46 |
+
return text
|
47 |
+
|
48 |
+
|
49 |
+
# for English text
|
50 |
+
def english_cleaners(text):
|
51 |
+
'''Pipeline for English text, including abbreviation expansion.'''
|
52 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
|
53 |
+
phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
|
54 |
+
return phonemes
|
55 |
+
|
56 |
+
|
57 |
+
# for non-English text that can be transliterated to ASCII
|
58 |
+
def english_cleaners2(text):
|
59 |
+
'''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
|
60 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
|
61 |
+
phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True,
|
62 |
+
with_stress=True)
|
63 |
+
return phonemes
|
64 |
|
65 |
|
66 |
def japanese_cleaners(text):
|
67 |
from text.japanese import japanese_to_romaji_with_accent
|
68 |
+
|
69 |
+
def clean(text):
|
70 |
+
text = japanese_to_romaji_with_accent(text)
|
71 |
+
text = re.sub(r'([A-Za-z])$', r'\1.', text)
|
72 |
+
return text
|
73 |
+
|
74 |
+
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: clean(x.group(1)) + ' ', text)
|
75 |
return text
|
76 |
|
77 |
|
|
|
82 |
def korean_cleaners(text):
|
83 |
'''Pipeline for Korean text'''
|
84 |
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
|
85 |
+
|
86 |
+
def clean(text):
|
87 |
+
text = latin_to_hangul(text)
|
88 |
+
text = number_to_hangul(text)
|
89 |
+
text = divide_hangul(text)
|
90 |
+
text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
|
91 |
+
return text
|
92 |
+
|
93 |
+
text = re.sub(r'\[KO\](.*?)\[KO\]', lambda x: clean(x.group(1)) + ' ', text)
|
94 |
return text
|
95 |
|
96 |
|
97 |
def chinese_cleaners(text):
|
98 |
'''Pipeline for Chinese text'''
|
99 |
+
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, symbols_to_chinese
|
100 |
+
|
101 |
+
def clean(text):
|
102 |
+
text = symbols_to_chinese(text)
|
103 |
+
text = number_to_chinese(text)
|
104 |
+
text = chinese_to_bopomofo(text)
|
105 |
+
text = latin_to_bopomofo(text)
|
106 |
+
text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
|
107 |
+
return text
|
108 |
+
|
109 |
+
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: clean(x.group(1)) + ' ', text)
|
110 |
return text
|
111 |
|
112 |
|
|
|
114 |
from text.mandarin import chinese_to_romaji
|
115 |
from text.japanese import japanese_to_romaji_with_accent
|
116 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
117 |
+
lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
|
118 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
|
119 |
+
x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
|
120 |
text = re.sub(r'\s+$', '', text)
|
121 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
122 |
return text
|
|
|
135 |
from text.sanskrit import devanagari_to_ipa
|
136 |
from text.english import english_to_lazy_ipa
|
137 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
138 |
+
lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
|
139 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
140 |
+
lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
|
141 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
142 |
+
lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
|
143 |
text = re.sub(r'\[SA\](.*?)\[SA\]',
|
144 |
+
lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
|
145 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
146 |
+
lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
|
147 |
text = re.sub(r'\s+$', '', text)
|
148 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
149 |
return text
|
|
|
155 |
from text.korean import korean_to_ipa
|
156 |
from text.english import english_to_ipa2
|
157 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
158 |
+
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
|
159 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
160 |
+
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
|
161 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
162 |
+
lambda x: korean_to_ipa(x.group(1)) + ' ', text)
|
163 |
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
164 |
+
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
|
165 |
text = re.sub(r'\s+$', '', text)
|
166 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
167 |
return text
|
|
|
173 |
from text.korean import korean_to_ipa
|
174 |
from text.english import english_to_ipa2
|
175 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
176 |
+
lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
|
177 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
178 |
+
lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
|
179 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
180 |
+
lambda x: korean_to_ipa(x.group(1)) + ' ', text)
|
181 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
182 |
+
lambda x: english_to_ipa2(x.group(1)) + ' ', text)
|
183 |
+
text = re.sub(r'\s+$', '', text)
|
184 |
+
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
185 |
+
return text
|
186 |
+
|
187 |
+
|
188 |
+
def cje_cleaners(text):
|
189 |
+
from text.mandarin import chinese_to_ipa
|
190 |
+
from text.japanese import japanese_to_ipa2
|
191 |
+
from text.english import english_to_ipa2
|
192 |
+
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
193 |
+
lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
|
194 |
+
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
195 |
+
lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
|
196 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
197 |
+
lambda x: english_to_ipa2(x.group(1)) + ' ', text)
|
198 |
text = re.sub(r'\s+$', '', text)
|
199 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
200 |
return text
|
|
|
202 |
|
203 |
def thai_cleaners(text):
|
204 |
from text.thai import num_to_thai, latin_to_thai
|
205 |
+
|
206 |
+
def clean(text):
|
207 |
+
text = num_to_thai(text)
|
208 |
+
text = latin_to_thai(text)
|
209 |
+
return text
|
210 |
+
|
211 |
+
text = re.sub(r'\[TH\](.*?)\[TH\]', lambda x: clean(x.group(1)) + ' ', text)
|
212 |
return text
|
213 |
|
214 |
|
215 |
def shanghainese_cleaners(text):
|
216 |
from text.shanghainese import shanghainese_to_ipa
|
217 |
+
|
218 |
+
def clean(text):
|
219 |
+
text = shanghainese_to_ipa(text)
|
220 |
+
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
221 |
+
return text
|
222 |
+
|
223 |
+
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: clean(x.group(1)) + ' ', text)
|
224 |
return text
|
225 |
|
226 |
|
|
|
232 |
from text.english import english_to_lazy_ipa2
|
233 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
234 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
235 |
+
lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
|
236 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
237 |
+
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
|
238 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
239 |
+
'˧˧˦').replace(
|
240 |
+
'6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
|
241 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
242 |
+
lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
|
243 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
244 |
+
lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
|
245 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
246 |
+
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
|
247 |
text = re.sub(r'\s+$', '', text)
|
248 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
249 |
return text
|
text/mandarin.py
CHANGED
@@ -7,10 +7,9 @@ import cn2an
|
|
7 |
import logging
|
8 |
|
9 |
logging.getLogger('jieba').setLevel(logging.WARNING)
|
10 |
-
jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0]))+'/jieba/dict.txt')
|
11 |
jieba.initialize()
|
12 |
|
13 |
-
|
14 |
# List of (Latin alphabet, bopomofo) pairs:
|
15 |
_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
16 |
('a', 'ㄟˉ'),
|
@@ -236,9 +235,19 @@ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
236 |
('—', '-')
|
237 |
]]
|
238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
def number_to_chinese(text):
|
241 |
-
numbers = re.findall(r'
|
242 |
for number in numbers:
|
243 |
text = text.replace(number, cn2an.an2cn(number), 1)
|
244 |
return text
|
@@ -286,6 +295,7 @@ def bopomofo_to_ipa2(text):
|
|
286 |
|
287 |
|
288 |
def chinese_to_romaji(text):
|
|
|
289 |
text = number_to_chinese(text)
|
290 |
text = chinese_to_bopomofo(text)
|
291 |
text = latin_to_bopomofo(text)
|
@@ -306,6 +316,7 @@ def chinese_to_lazy_ipa(text):
|
|
306 |
|
307 |
|
308 |
def chinese_to_ipa(text):
|
|
|
309 |
text = number_to_chinese(text)
|
310 |
text = chinese_to_bopomofo(text)
|
311 |
text = latin_to_bopomofo(text)
|
@@ -319,6 +330,7 @@ def chinese_to_ipa(text):
|
|
319 |
|
320 |
|
321 |
def chinese_to_ipa2(text):
|
|
|
322 |
text = number_to_chinese(text)
|
323 |
text = chinese_to_bopomofo(text)
|
324 |
text = latin_to_bopomofo(text)
|
|
|
7 |
import logging
|
8 |
|
9 |
logging.getLogger('jieba').setLevel(logging.WARNING)
|
10 |
+
jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0])) + '/jieba/dict.txt')
|
11 |
jieba.initialize()
|
12 |
|
|
|
13 |
# List of (Latin alphabet, bopomofo) pairs:
|
14 |
_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
15 |
('a', 'ㄟˉ'),
|
|
|
235 |
('—', '-')
|
236 |
]]
|
237 |
|
238 |
+
_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
|
239 |
+
('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
|
240 |
+
]]
|
241 |
+
|
242 |
+
|
243 |
+
def symbols_to_chinese(text):
|
244 |
+
for regex, replacement in _symbols_to_chinese:
|
245 |
+
text = re.sub(regex, replacement, text)
|
246 |
+
return text
|
247 |
+
|
248 |
|
249 |
def number_to_chinese(text):
|
250 |
+
numbers = re.findall(r'[0-9]+(?:\.?[0-9]+)?', text)
|
251 |
for number in numbers:
|
252 |
text = text.replace(number, cn2an.an2cn(number), 1)
|
253 |
return text
|
|
|
295 |
|
296 |
|
297 |
def chinese_to_romaji(text):
|
298 |
+
text = symbols_to_chinese(text)
|
299 |
text = number_to_chinese(text)
|
300 |
text = chinese_to_bopomofo(text)
|
301 |
text = latin_to_bopomofo(text)
|
|
|
316 |
|
317 |
|
318 |
def chinese_to_ipa(text):
|
319 |
+
text = symbols_to_chinese(text)
|
320 |
text = number_to_chinese(text)
|
321 |
text = chinese_to_bopomofo(text)
|
322 |
text = latin_to_bopomofo(text)
|
|
|
330 |
|
331 |
|
332 |
def chinese_to_ipa2(text):
|
333 |
+
text = symbols_to_chinese(text)
|
334 |
text = number_to_chinese(text)
|
335 |
text = chinese_to_bopomofo(text)
|
336 |
text = latin_to_bopomofo(text)
|
text/shanghainese.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
|
|
4 |
|
5 |
-
|
6 |
-
converter = opencc.OpenCC('zaonhe')
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
@@ -35,9 +35,19 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
35 |
('Z', 'zᴇ')
|
36 |
]]
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def _number_to_shanghainese(num):
|
40 |
-
num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
|
41 |
return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
|
42 |
|
43 |
|
@@ -52,9 +62,10 @@ def latin_to_ipa(text):
|
|
52 |
|
53 |
|
54 |
def shanghainese_to_ipa(text):
|
|
|
55 |
text = number_to_shanghainese(text.upper())
|
56 |
-
text = converter.convert(text).replace('-','').replace('$',' ')
|
57 |
-
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
|
58 |
text = re.sub(r'[、;:]', ',', text)
|
59 |
text = re.sub(r'\s*,\s*', ', ', text)
|
60 |
text = re.sub(r'\s*。\s*', '. ', text)
|
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
4 |
+
import config
|
5 |
|
6 |
+
converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/zaonhe')
|
|
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
|
35 |
('Z', 'zᴇ')
|
36 |
]]
|
37 |
|
38 |
+
_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
|
39 |
+
('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
|
40 |
+
]]
|
41 |
+
|
42 |
+
|
43 |
+
def symbols_to_chinese(text):
|
44 |
+
for regex, replacement in _symbols_to_chinese:
|
45 |
+
text = re.sub(regex, replacement, text)
|
46 |
+
return text
|
47 |
+
|
48 |
|
49 |
def _number_to_shanghainese(num):
|
50 |
+
num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
|
51 |
return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
|
52 |
|
53 |
|
|
|
62 |
|
63 |
|
64 |
def shanghainese_to_ipa(text):
|
65 |
+
text = symbols_to_chinese(text)
|
66 |
text = number_to_shanghainese(text.upper())
|
67 |
+
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
68 |
+
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
|
69 |
text = re.sub(r'[、;:]', ',', text)
|
70 |
text = re.sub(r'\s*,\s*', ', ', text)
|
71 |
text = re.sub(r'\s*。\s*', '. ', text)
|
utils/__pycache__/merge.cpython-310.pyc
ADDED
Binary file (3.95 kB). View file
|
|
utils/__pycache__/nlp.cpython-310.pyc
ADDED
Binary file (2.41 kB). View file
|
|
utils/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (4.02 kB). View file
|
|
utils/merge.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import config
|
5 |
+
import numpy as np
|
6 |
+
from utils.utils import check_is_none
|
7 |
+
from voice import vits, TTS
|
8 |
+
|
9 |
+
# Map each text cleaner name (from a model config's data.text_cleaners) to
# the language codes that cleaner supports; used to tag speakers with their
# languages when models are merged.
lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
}
|
28 |
+
|
29 |
+
|
30 |
+
def analysis(model_config_json):
    """Classify a model from its open config-JSON file object.

    A config that carries a symbol table is a text model: plain "vits", or
    "w2v2" when data.emotion_embedding is truthy. A config without symbols
    is assumed to be a hubert-soft voice-conversion model.

    :param model_config_json: open file-like object containing the config JSON
    :return: one of "vits", "w2v2", "hubert-soft"
    """
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    # Default to {} so a config without a "data" section doesn't crash.
    emotion_embedding = model_config.get("data", {}).get("emotion_embedding", False)
    if symbols is not None:
        mode_type = "vits" if not emotion_embedding else "w2v2"
    else:
        # NOTE(review): callers that compare the result against "hubert"
        # will never match this value — confirm whether "hubert-soft" vs
        # "hubert" is intended.
        mode_type = "hubert-soft"
    return mode_type
|
42 |
+
|
43 |
+
|
44 |
+
def load_npy(model_):
    """Load one or more .npy dimensional-emotion files into a single array.

    Accepts a list of .npy paths, a directory (searched recursively; non-.npy
    files are skipped), or a single .npy file. Each file is reshaped to
    (-1, 1024) and all rows are stacked; a single file is returned as loaded.

    :param model_: list of paths, a directory path, or a single file path
    :return: numpy array of emotion reference vectors
    :raises ValueError: on a non-.npy path or an unusable *model_* argument
    """
    if isinstance(model_, list):
        # Every entry must be a .npy file.
        for path in model_:
            ext = os.path.splitext(path)[1]
            if ext != ".npy":
                raise ValueError(f"Unsupported model type: {ext}")

        # Collect then concatenate once — avoids the O(n^2) cost of
        # np.append inside the loop.
        chunks = [np.load(path).reshape(-1, 1024) for path in model_]
        emotion_reference = np.concatenate(chunks, axis=0) if chunks else np.empty((0, 1024))

    elif os.path.isdir(model_):
        chunks = []
        for root, dirs, files in os.walk(model_):
            for file_name in files:
                # Silently skip anything that is not a .npy file.
                if os.path.splitext(file_name)[1] != ".npy":
                    continue
                file_path = os.path.join(root, file_name)
                chunks.append(np.load(file_path).reshape(-1, 1024))
        emotion_reference = np.concatenate(chunks, axis=0) if chunks else np.empty((0, 1024))

    elif os.path.isfile(model_):
        ext = os.path.splitext(model_)[1]
        if ext != ".npy":
            raise ValueError(f"Unsupported model type: {ext}")

        emotion_reference = np.load(model_)

    else:
        # Previously this fell through and raised an opaque NameError.
        raise ValueError(f"Cannot load emotion reference from {model_!r}: "
                         f"not a list, directory, or file")

    logging.info(f"Loaded emotional dimention npy range:{len(emotion_reference)}")
    return emotion_reference
|
81 |
+
|
82 |
+
|
83 |
+
def merge_model(merging_model):
    """Build a TTS object from a list of (model_path, config_path) pairs.

    Each config is inspected with analysis() to classify the model as
    vits / hubert / w2v2, the checkpoints are loaded, and the speakers of
    every model are flattened into one consecutively numbered speaker list
    per model type.

    :param merging_model: iterable of (model_path, config_path) pairs
    :return: a TTS instance wrapping all loaded models and their speakers
    :raises ValueError: when a required config.py entry is missing or a
        support model fails to load
    """
    vits_obj = []
    vits_speakers = []
    hubert_vits_obj = []
    hubert_vits_speakers = []
    w2v2_vits_obj = []
    w2v2_vits_speakers = []

    # model list
    vits_list = []
    hubert_vits_list = []
    w2v2_vits_list = []

    # Classify every (model, config) pair by reading its config JSON.
    # NOTE(review): analysis() returns "hubert-soft", not "hubert", so the
    # "hubert" branch below may never match — confirm the intended value.
    for l in merging_model:
        with open(l[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
            if model_type == "vits":
                vits_list.append(l)
            elif model_type == "hubert":
                hubert_vits_list.append(l)
            elif model_type == "w2v2":
                w2v2_vits_list.append(l)

    # merge vits: assign each speaker of each model a new global id.
    new_id = 0
    for obj_id, i in enumerate(vits_list):
        obj = vits(model=i[0], config=i[1], model_type="vits")
        # Map the model's text cleaner to its supported language codes.
        lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

        for id, name in enumerate(obj.return_speakers()):
            # [local speaker id, model object, model index] for dispatch.
            vits_obj.append([int(id), obj, obj_id])
            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1

    # merge hubert-vits: requires a shared hubert-soft encoder from config.
    if len(hubert_vits_list) != 0:
        if getattr(config, "HUBERT_SOFT_MODEL", None) == None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError(f"Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            from hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}")

        new_id = 0
        for obj_id, i in enumerate(hubert_vits_list):
            obj = vits(model=i[0], config=i[1], model_=hubert, model_type="hubert")
            lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

            for id, name in enumerate(obj.return_speakers()):
                hubert_vits_obj.append([int(id), obj, obj_id])
                hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    # merge w2v2-vits: requires shared dimensional-emotion references.
    if len(w2v2_vits_list) != 0:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}")

        new_id = 0
        for obj_id, i in enumerate(w2v2_vits_list):
            obj = vits(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2")
            lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

            for id, name in enumerate(obj.return_speakers()):
                w2v2_vits_obj.append([int(id), obj, obj_id])
                w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
    voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}

    tts = TTS(voice_obj, voice_speakers)

    return tts
|
utils/nlp.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import regex as re
|
2 |
+
import logging
|
3 |
+
import config
|
4 |
+
from fastlid import fastlid
|
5 |
+
from .utils import check_is_none
|
6 |
+
|
7 |
+
# Module-level logger; level is taken from config.LOGGING_LEVEL (default DEBUG).
logger = logging.getLogger("vits-simple-api")
level = getattr(config, "LOGGING_LEVEL", "DEBUG")
# Map the textual level name to the logging module's numeric constant.
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
              'CRITICAL': logging.CRITICAL}
logger.setLevel(level_dict[level])
|
12 |
+
|
13 |
+
|
14 |
+
def clasify_lang(text):
    """Insert [LANG] markers around runs of same-language words in *text*.

    The text is split on punctuation, each fragment's language is detected
    with fastlid, and an opening tag is inserted before the first fragment
    and at every language change; a closing tag is appended at the end.
    The original punctuation is preserved because tags are spliced into the
    original string rather than rebuilt from the fragments.
    """
    pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
              r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
              r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
    words = re.split(pattern, text)

    pre = ""  # language of the previous fragment ("" before the first one)
    p = 0     # index into text up to which tags have already been inserted
    for word in words:

        if check_is_none(word): continue
        # fastlid returns (language_code, ...); keep only the code.
        lang = fastlid(word)[0]
        if pre == "":
            # First detected fragment: open the first language tag before it.
            text = text[:p] + text[p:].replace(word, f'[{lang.upper()}]' + word, 1)
            p += len(f'[{lang.upper()}]')
        elif pre != lang:
            # Language switch: close the previous tag, open the new one.
            text = text[:p] + text[p:].replace(word, f'[{pre.upper()}][{lang.upper()}]' + word, 1)
            p += len(f'[{pre.upper()}][{lang.upper()}]')
        pre = lang
        # Advance p past this word so later replacements only touch the tail.
        p += text[p:].index(word) + len(word)
    text += f"[{pre.upper()}]"

    return text
|
37 |
+
|
38 |
+
|
39 |
+
def cut(text, max):
    """Split *text* into chunks of at least *max* characters, cutting only
    at punctuation boundaries; a short trailing remainder becomes the last
    chunk."""
    delimiters = r'[\!\(\)\,\-\.\/\:\;\?\?\。\,\、\;\:]+'
    pieces = re.split(delimiters, text)
    chunks = []
    accumulated = 0  # running character count (each piece counts its length + 1)
    start = 0        # slice start of the next chunk within the original text
    for piece in pieces:
        accumulated += len(piece) + 1
        if accumulated >= max:
            chunks.append(text[start:start + accumulated])
            start += accumulated
            accumulated = 0
    if start < len(text):
        chunks.append(text[start:])
    return chunks
|
54 |
+
|
55 |
+
|
56 |
+
def sentence_split(text, max=50, lang="auto", speaker_lang=None):
    """Split *text* into chunks and wrap each in [LANG]...[LANG] markers.

    lang="auto" runs per-fragment language detection via clasify_lang();
    lang="mix" passes the text through untouched (caller supplies markers);
    any other value tags the whole text with that language. max<=0 disables
    length-based splitting.
    """
    # If this speaker only supports a single language, force that language.
    if speaker_lang is not None and len(speaker_lang) == 1:
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logger.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
            lang = speaker_lang[0]
    else:
        # Restrict fastlid detection to the speaker's supported languages.
        fastlid.set_languages = speaker_lang

    sentence_list = []
    if lang.upper() != "MIX":
        if max <= 0:
            # No length limit: handle the whole text as one chunk.
            sentence_list.append(
                clasify_lang(text) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
        else:
            for i in cut(text, max):
                if check_is_none(i): continue
                sentence_list.append(
                    clasify_lang(i) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
    else:
        sentence_list.append(text)

    for i in sentence_list:
        logger.debug(i)

    return sentence_list
|
utils/utils.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
from json import loads
|
4 |
+
import av
|
5 |
+
from torch import load, FloatTensor
|
6 |
+
from numpy import float32
|
7 |
+
import librosa
|
8 |
+
|
9 |
+
|
10 |
+
class HParams():
    """Dict-like hyper-parameter container with attribute access.

    Nested plain dicts are converted recursively, so config["data"]["x"]
    becomes hps.data.x. Supports len(), "key in hps", hps[key] get/set,
    and the keys()/items()/values() mapping views.
    """

    def __init__(self, **kwargs):
        for name, raw in kwargs.items():
            # Recurse into plain dicts so nesting is attribute-addressable.
            self[name] = HParams(**raw) if type(raw) == dict else raw

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return repr(self.__dict__)
|
40 |
+
|
41 |
+
|
42 |
+
def load_checkpoint(checkpoint_path, model):
    """Load the weights stored at *checkpoint_path* into *model*.

    Handles DataParallel-wrapped models (via the ``module`` attribute).
    Parameters missing from the checkpoint keep the model's current values
    and are reported via logging.

    :param checkpoint_path: path or file-like object readable by torch.load
    :param model: torch module (optionally wrapped) to receive the weights
    """
    checkpoint_dict = load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            # Narrowed from a bare except: only a missing key should fall
            # back to the model's existing parameter.
            logging.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logging.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return
|
64 |
+
|
65 |
+
|
66 |
+
def get_hparams_from_file(config_path):
    """Parse the JSON model config at *config_path* into a nested HParams object."""
    with open(config_path, 'r', encoding='utf-8') as f:
        parsed = loads(f.read())

    return HParams(**parsed)
|
73 |
+
|
74 |
+
|
75 |
+
def load_audio_to_torch(full_path, target_sampling_rate):
    """Load the audio file at *full_path* as a mono FloatTensor resampled to *target_sampling_rate*."""
    samples, _sr = librosa.load(full_path, sr=target_sampling_rate, mono=True)
    return FloatTensor(samples.astype(float32))
|
78 |
+
|
79 |
+
|
80 |
+
def wav2ogg(input, output):
    """Transcode a WAV file/stream (*input*) to Ogg Vorbis (*output*) using PyAV."""
    with av.open(input, 'rb') as src, av.open(output, 'wb', format='ogg') as dst:
        encoder = dst.add_stream('libvorbis')
        for frame in src.decode(audio=0):
            for packet in encoder.encode(frame):
                dst.mux(packet)

        # Flush any packets still buffered in the encoder.
        for packet in encoder.encode(None):
            dst.mux(packet)
|
90 |
+
|
91 |
+
def wav2mp3(input, output):
    """Transcode a WAV file/stream (*input*) to MP3 (*output*) using PyAV."""
    with av.open(input, 'rb') as src, av.open(output, 'wb', format='mp3') as dst:
        encoder = dst.add_stream('mp3')
        for frame in src.decode(audio=0):
            for packet in encoder.encode(frame):
                dst.mux(packet)

        # Flush any packets still buffered in the encoder.
        for packet in encoder.encode(None):
            dst.mux(packet)
|
101 |
+
|
102 |
+
def clean_folder(folder_path):
    """Delete every regular file directly inside *folder_path*; subdirectories are left untouched."""
    with os.scandir(folder_path) as entries:
        for entry in entries:
            if entry.is_file():
                os.remove(entry.path)
|
108 |
+
|
109 |
+
|
110 |
+
def check_is_none(s):
    """Return True when *s* is None, empty, or whitespace-only; otherwise False."""
    if s is None:
        return True
    if isinstance(s, str) and s.isspace():
        return True
    return str(s) == ""
|
vits-simple-api-installer-latest.sh
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Install or upgrade vits-simple-api into INSTALL_DIR via Docker Compose.
INSTALL_DIR=/usr/local/vits-simple-api

# ANSI color escape codes for highlighted terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
PLAIN='\033[0m'

mkdir -p $INSTALL_DIR
cd $INSTALL_DIR
# Fetch the default config only when none exists, so user edits survive upgrades.
if [ ! -f config.py ]; then
echo -e "${YELLOW}download config.py\n${PLAIN}"
wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
fi

# Always refresh the compose file to the latest published version.
wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml

echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"

# Pull the latest image and (re)start the service in the background.
docker compose pull
docker compose up -d

echo -e "\nThe upgrade or installation has been completed."
echo -e "The configuration file directory is $(realpath $INSTALL_DIR)"
echo -e "${YELLOW}If the vits model is not imported, it cannot be used. Import the model in the configuration file directory.${PLAIN}"
echo -e "After modifying the configuration file, restart the docker container for the modification to take effect."
echo -e "${YELLOW}If you have any questions, please put them in the issues.${PLAIN}"
echo -e "https://github.com/Artrajz/vits-simple-api"
|
voice.py
CHANGED
@@ -1,32 +1,30 @@
|
|
1 |
import os
|
2 |
-
|
3 |
import librosa
|
4 |
-
from scipy.io.wavfile import write
|
5 |
-
from mel_processing import spectrogram_torch
|
6 |
-
from text import text_to_sequence, _clean_text
|
7 |
-
from models import SynthesizerTrn
|
8 |
-
import utils
|
9 |
import commons
|
10 |
import sys
|
11 |
import re
|
12 |
import numpy as np
|
13 |
-
|
14 |
-
|
|
|
|
|
15 |
from torch import no_grad, LongTensor, inference_mode, FloatTensor
|
16 |
-
import audonnx
|
17 |
-
import uuid
|
18 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
|
|
|
|
20 |
|
21 |
-
class Voice:
|
22 |
-
def __init__(self, model, config, out_path=None):
|
23 |
-
self.out_path = out_path
|
24 |
-
if not os.path.exists(self.out_path):
|
25 |
-
try:
|
26 |
-
os.mkdir(self.out_path)
|
27 |
-
except:
|
28 |
-
pass
|
29 |
|
|
|
|
|
|
|
30 |
self.hps_ms = utils.get_hparams_from_file(config)
|
31 |
self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
|
32 |
self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
|
@@ -42,9 +40,19 @@ class Voice:
|
|
42 |
emotion_embedding=self.emotion_embedding,
|
43 |
**self.hps_ms.model)
|
44 |
_ = self.net_g_ms.eval()
|
|
|
|
|
|
|
|
|
|
|
45 |
utils.load_checkpoint(model, self.net_g_ms)
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
def
|
48 |
if cleaned:
|
49 |
text_norm = text_to_sequence(text, hps.symbols, [])
|
50 |
else:
|
@@ -54,7 +62,7 @@ class Voice:
|
|
54 |
text_norm = LongTensor(text_norm)
|
55 |
return text_norm
|
56 |
|
57 |
-
def get_label_value(self,
|
58 |
value = re.search(rf'\[{label}=(.+?)\]', text)
|
59 |
if value:
|
60 |
try:
|
@@ -65,16 +73,10 @@ class Voice:
|
|
65 |
sys.exit(1)
|
66 |
else:
|
67 |
value = default
|
68 |
-
|
69 |
-
|
70 |
-
def ex_return(self, text, escape=False):
|
71 |
-
if escape:
|
72 |
-
return text.encode('unicode_escape').decode()
|
73 |
else:
|
74 |
-
return text
|
75 |
-
|
76 |
-
def return_speakers(self, escape=False):
|
77 |
-
return self.speakers
|
78 |
|
79 |
def get_label(self, text, label):
|
80 |
if f'[{label}]' in text:
|
@@ -82,132 +84,152 @@ class Voice:
|
|
82 |
else:
|
83 |
return False, text
|
84 |
|
85 |
-
def
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
98 |
-
sid = LongTensor([speaker_id])
|
99 |
-
audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
|
100 |
-
noise_scale=noise_scale,
|
101 |
-
noise_scale_w=noise_scale_w,
|
102 |
-
length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
# else:
|
105 |
-
#
|
106 |
-
#
|
107 |
-
#
|
108 |
-
#
|
109 |
-
#
|
110 |
-
#
|
111 |
-
#
|
112 |
-
#
|
113 |
-
|
114 |
-
# text, 'NOISE', 0.667, 'noise scale')
|
115 |
-
# noise_scale_w, text = self.get_label_value(
|
116 |
-
# text, 'NOISEW', 0.8, 'deviation of noise')
|
117 |
-
# cleaned, text = self.get_label(text, 'CLEANED')
|
118 |
-
#
|
119 |
-
# stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
|
120 |
-
#
|
121 |
-
# emotion_reference = input('Path of an emotion reference: ')
|
122 |
-
# if emotion_reference.endswith('.npy'):
|
123 |
-
# emotion = np.load(emotion_reference)
|
124 |
-
# emotion = FloatTensor(emotion).unsqueeze(0)
|
125 |
-
# else:
|
126 |
-
# audio16000, sampling_rate = librosa.load(
|
127 |
-
# emotion_reference, sr=16000, mono=True)
|
128 |
-
# emotion = w2v2_model(audio16000, sampling_rate)[
|
129 |
-
# 'hidden_states']
|
130 |
-
# emotion_reference = re.sub(
|
131 |
-
# r'\..*$', '', emotion_reference)
|
132 |
-
# np.save(emotion_reference, emotion.squeeze(0))
|
133 |
-
# emotion = FloatTensor(emotion)
|
134 |
-
#
|
135 |
-
#
|
136 |
-
# with no_grad():
|
137 |
-
# x_tst = stn_tst.unsqueeze(0)
|
138 |
-
# x_tst_lengths = LongTensor([stn_tst.size(0)])
|
139 |
-
# sid = LongTensor([speaker_id])
|
140 |
-
# audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
141 |
-
# noise_scale_w=noise_scale_w,
|
142 |
-
# length_scale=length_scale, emotion_embedding=emotion)[0][
|
143 |
-
# 0, 0].data.cpu().float().numpy()
|
144 |
-
|
145 |
-
# else:
|
146 |
-
# model = input('Path of a hubert-soft Model: ')
|
147 |
-
# from hubert_model import hubert_soft
|
148 |
-
# hubert = hubert_soft(model)
|
149 |
-
|
150 |
-
# if audio_path != '[VC]':
|
151 |
-
# if self.use_f0:
|
152 |
-
# audio, sampling_rate = librosa.load(
|
153 |
-
# audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
|
154 |
-
# audio16000 = librosa.resample(
|
155 |
-
# audio, orig_sr=sampling_rate, target_sr=16000)
|
156 |
-
# else:
|
157 |
-
# audio16000, sampling_rate = librosa.load(
|
158 |
-
# audio_path, sr=16000, mono=True)
|
159 |
-
#
|
160 |
-
# out_path = "H:/git/MoeGoe-Simple-API/upload/hubert.wav"
|
161 |
-
# length_scale, out_path = self.get_label_value(
|
162 |
-
# out_path, 'LENGTH', 1, 'length scale')
|
163 |
-
# noise_scale, out_path = self.get_label_value(
|
164 |
-
# out_path, 'NOISE', 0.1, 'noise scale')
|
165 |
-
# noise_scale_w, out_path = self.get_label_value(
|
166 |
-
# out_path, 'NOISEW', 0.1, 'deviation of noise')
|
167 |
-
#
|
168 |
-
# with inference_mode():
|
169 |
-
# units = hubert.units(FloatTensor(audio16000).unsqueeze(
|
170 |
-
# 0).unsqueeze(0)).squeeze(0).numpy()
|
171 |
-
# if self.use_f0:
|
172 |
-
# f0_scale, out_path = self.get_label_value(
|
173 |
-
# out_path, 'F0', 1, 'f0 scale')
|
174 |
-
# f0 = librosa.pyin(audio, sr=sampling_rate,
|
175 |
-
# fmin=librosa.note_to_hz('C0'),
|
176 |
-
# fmax=librosa.note_to_hz('C7'),
|
177 |
-
# frame_length=1780)[0]
|
178 |
-
# target_length = len(units[:, 0])
|
179 |
-
# f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
|
180 |
-
# np.arange(0, len(f0)), f0)) * f0_scale
|
181 |
-
# units[:, 0] = f0 / 10
|
182 |
-
#
|
183 |
-
# stn_tst = FloatTensor(units)
|
184 |
-
# with no_grad():
|
185 |
-
# x_tst = stn_tst.unsqueeze(0)
|
186 |
-
# x_tst_lengths = LongTensor([stn_tst.size(0)])
|
187 |
-
# sid = LongTensor([target_id])
|
188 |
-
# audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
189 |
-
# noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
|
190 |
-
# 0, 0].data.float().numpy()
|
191 |
|
192 |
-
with BytesIO() as f:
|
193 |
-
fname = str(uuid.uuid1())
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
file_path = self.out_path + "/" + fname + ".wav"
|
202 |
-
write(file_path, 24000, audio)
|
203 |
-
silk_path = utils.convert_to_silk(file_path)
|
204 |
-
os.remove(file_path)
|
205 |
-
return silk_path, "audio/silk", fname + ".silk"
|
206 |
else:
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
audio = utils.load_audio_to_torch(
|
213 |
audio_path, self.hps_ms.data.sampling_rate)
|
@@ -223,9 +245,242 @@ class Voice:
|
|
223 |
|
224 |
with no_grad():
|
225 |
sid_tgt = LongTensor([target_id])
|
226 |
-
audio = self.net_g_ms.voice_conversion(spec,
|
227 |
-
|
|
|
|
|
|
|
|
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
with BytesIO() as f:
|
230 |
-
write(f,
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
|
|
2 |
import librosa
|
|
|
|
|
|
|
|
|
|
|
3 |
import commons
|
4 |
import sys
|
5 |
import re
|
6 |
import numpy as np
|
7 |
+
import torch
|
8 |
+
import xml.etree.ElementTree as ET
|
9 |
+
import config
|
10 |
+
import logging
|
11 |
from torch import no_grad, LongTensor, inference_mode, FloatTensor
|
|
|
|
|
12 |
from io import BytesIO
|
13 |
+
from graiax import silkcoder
|
14 |
+
from utils.nlp import cut, sentence_split
|
15 |
+
from scipy.io.wavfile import write
|
16 |
+
from mel_processing import spectrogram_torch
|
17 |
+
from text import text_to_sequence, _clean_text
|
18 |
+
from models import SynthesizerTrn
|
19 |
+
from utils import utils
|
20 |
|
21 |
+
# torch.set_num_threads(1) # 设置torch线程为1
|
22 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
class vits:
|
26 |
+
def __init__(self, model, config, model_=None, model_type=None):
|
27 |
+
self.model_type = model_type
|
28 |
self.hps_ms = utils.get_hparams_from_file(config)
|
29 |
self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
|
30 |
self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
|
|
|
40 |
emotion_embedding=self.emotion_embedding,
|
41 |
**self.hps_ms.model)
|
42 |
_ = self.net_g_ms.eval()
|
43 |
+
|
44 |
+
# load model
|
45 |
+
self.load_model(model, model_)
|
46 |
+
|
47 |
+
def load_model(self, model, model_=None):
|
48 |
utils.load_checkpoint(model, self.net_g_ms)
|
49 |
+
self.net_g_ms.to(device)
|
50 |
+
if self.model_type == "hubert":
|
51 |
+
self.hubert = model_
|
52 |
+
elif self.model_type == "w2v2":
|
53 |
+
self.emotion_reference = model_
|
54 |
|
55 |
+
def get_cleaned_text(self, text, hps, cleaned=False):
|
56 |
if cleaned:
|
57 |
text_norm = text_to_sequence(text, hps.symbols, [])
|
58 |
else:
|
|
|
62 |
text_norm = LongTensor(text_norm)
|
63 |
return text_norm
|
64 |
|
65 |
+
def get_label_value(self, label, default, warning_name='value', text=""):
|
66 |
value = re.search(rf'\[{label}=(.+?)\]', text)
|
67 |
if value:
|
68 |
try:
|
|
|
73 |
sys.exit(1)
|
74 |
else:
|
75 |
value = default
|
76 |
+
if text == "":
|
77 |
+
return value
|
|
|
|
|
|
|
78 |
else:
|
79 |
+
return value, text
|
|
|
|
|
|
|
80 |
|
81 |
def get_label(self, text, label):
|
82 |
if f'[{label}]' in text:
|
|
|
84 |
else:
|
85 |
return False, text
|
86 |
|
87 |
+
def get_cleaner(self):
|
88 |
+
return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
|
89 |
+
|
90 |
+
def return_speakers(self, escape=False):
|
91 |
+
return self.speakers
|
92 |
+
|
93 |
+
def infer(self, params):
|
94 |
+
emotion = params.get("emotion", None)
|
95 |
+
|
96 |
+
with no_grad():
|
97 |
+
x_tst = params.get("stn_tst").unsqueeze(0)
|
98 |
+
x_tst_lengths = LongTensor([params.get("stn_tst").size(0)])
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
+
audio = self.net_g_ms.infer(x_tst.to(device), x_tst_lengths.to(device), sid=params.get("sid").to(device),
|
101 |
+
noise_scale=params.get("noise_scale"),
|
102 |
+
noise_scale_w=params.get("noise_scale_w"),
|
103 |
+
length_scale=params.get("length_scale"),
|
104 |
+
emotion_embedding=emotion.to(device) if emotion != None else None)[0][
|
105 |
+
0, 0].data.float().cpu().numpy()
|
106 |
+
|
107 |
+
torch.cuda.empty_cache()
|
108 |
+
return audio
|
109 |
+
|
110 |
+
def get_infer_param(self, length, noise, noisew, text=None, speaker_id=None, audio_path=None,
|
111 |
+
emotion=None):
|
112 |
+
emo = None
|
113 |
+
if self.model_type != "hubert":
|
114 |
+
length_scale, text = self.get_label_value('LENGTH', length, 'length scale', text)
|
115 |
+
noise_scale, text = self.get_label_value('NOISE', noise, 'noise scale', text)
|
116 |
+
noise_scale_w, text = self.get_label_value('NOISEW', noisew, 'deviation of noise', text)
|
117 |
+
cleaned, text = self.get_label(text, 'CLEANED')
|
118 |
+
|
119 |
+
stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
|
120 |
+
sid = LongTensor([speaker_id])
|
121 |
+
|
122 |
+
if self.model_type == "w2v2":
|
123 |
+
# if emotion_reference.endswith('.npy'):
|
124 |
+
# emotion = np.load(emotion_reference)
|
125 |
+
# emotion = FloatTensor(emotion).unsqueeze(0)
|
126 |
# else:
|
127 |
+
# audio16000, sampling_rate = librosa.load(
|
128 |
+
# emotion_reference, sr=16000, mono=True)
|
129 |
+
# emotion = self.w2v2(audio16000, sampling_rate)[
|
130 |
+
# 'hidden_states']
|
131 |
+
# emotion_reference = re.sub(
|
132 |
+
# r'\..*$', '', emotion_reference)
|
133 |
+
# np.save(emotion_reference, emotion.squeeze(0))
|
134 |
+
# emotion = FloatTensor(emotion)
|
135 |
+
emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
|
|
|
|
137 |
|
138 |
+
elif self.model_type == "hubert":
|
139 |
+
if self.use_f0:
|
140 |
+
audio, sampling_rate = librosa.load(
|
141 |
+
audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
|
142 |
+
audio16000 = librosa.resample(
|
143 |
+
audio, orig_sr=sampling_rate, target_sr=16000)
|
|
|
|
|
|
|
|
|
|
|
144 |
else:
|
145 |
+
audio16000, sampling_rate = librosa.load(
|
146 |
+
audio_path, sr=16000, mono=True)
|
147 |
+
|
148 |
+
length_scale = self.get_label_value('LENGTH', length, 'length scale')
|
149 |
+
noise_scale = self.get_label_value('NOISE', noise, 'noise scale')
|
150 |
+
noise_scale_w = self.get_label_value('NOISEW', noisew, 'deviation of noise')
|
151 |
+
|
152 |
+
with inference_mode():
|
153 |
+
units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
|
154 |
+
if self.use_f0:
|
155 |
+
f0_scale = self.get_label_value('F0', 1, 'f0 scale')
|
156 |
+
f0 = librosa.pyin(audio,
|
157 |
+
sr=sampling_rate,
|
158 |
+
fmin=librosa.note_to_hz('C0'),
|
159 |
+
fmax=librosa.note_to_hz('C7'),
|
160 |
+
frame_length=1780)[0]
|
161 |
+
target_length = len(units[:, 0])
|
162 |
+
f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
|
163 |
+
np.arange(0, len(f0)), f0)) * f0_scale
|
164 |
+
units[:, 0] = f0 / 10
|
165 |
+
|
166 |
+
stn_tst = FloatTensor(units)
|
167 |
+
sid = LongTensor([speaker_id])
|
168 |
+
params = {"length_scale": length_scale, "noise_scale": noise_scale,
|
169 |
+
"noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
|
170 |
+
"sid": sid, "emotion": emo}
|
171 |
+
return params
|
172 |
+
|
173 |
+
def get_audio(self, voice, auto_break=False):
|
174 |
+
text = voice.get("text", None)
|
175 |
+
speaker_id = voice.get("id", 0)
|
176 |
+
length = voice.get("length", 1)
|
177 |
+
noise = voice.get("noise", 0.667)
|
178 |
+
noisew = voice.get("noisew", 0.8)
|
179 |
+
max = voice.get("max", 50)
|
180 |
+
lang = voice.get("lang", "auto")
|
181 |
+
speaker_lang = voice.get("speaker_lang", None)
|
182 |
+
audio_path = voice.get("audio_path", None)
|
183 |
+
emotion = voice.get("emotion", 0)
|
184 |
|
185 |
+
# 去除所有多余的空白字符
|
186 |
+
if text is not None: text = re.sub(r'\s+', ' ', text).strip()
|
187 |
+
|
188 |
+
# 停顿0.75s,避免语音分段合成再拼接后的连接突兀
|
189 |
+
brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
|
190 |
+
|
191 |
+
tasks = []
|
192 |
+
if self.model_type == "vits":
|
193 |
+
sentence_list = sentence_split(text, max, lang, speaker_lang)
|
194 |
+
for sentence in sentence_list:
|
195 |
+
tasks.append(
|
196 |
+
self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
|
197 |
+
noisew=noisew))
|
198 |
+
audios = []
|
199 |
+
|
200 |
+
for task in tasks:
|
201 |
+
audios.append(self.infer(task))
|
202 |
+
if auto_break:
|
203 |
+
audios.append(brk)
|
204 |
+
|
205 |
+
audio = np.concatenate(audios, axis=0)
|
206 |
+
|
207 |
+
elif self.model_type == "hubert":
|
208 |
+
params = self.get_infer_param(speaker_id=speaker_id, length=length, noise=noise, noisew=noisew,
|
209 |
+
audio_path=audio_path)
|
210 |
+
audio = self.infer(params)
|
211 |
+
|
212 |
+
elif self.model_type == "w2v2":
|
213 |
+
sentence_list = sentence_split(text, max, lang, speaker_lang)
|
214 |
+
for sentence in sentence_list:
|
215 |
+
tasks.append(
|
216 |
+
self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
|
217 |
+
noisew=noisew, emotion=emotion))
|
218 |
+
|
219 |
+
audios = []
|
220 |
+
for task in tasks:
|
221 |
+
audios.append(self.infer(task))
|
222 |
+
if auto_break:
|
223 |
+
audios.append(brk)
|
224 |
+
|
225 |
+
audio = np.concatenate(audios, axis=0)
|
226 |
+
|
227 |
+
return audio
|
228 |
+
|
229 |
+
def voice_conversion(self, voice):
|
230 |
+
audio_path = voice.get("audio_path")
|
231 |
+
original_id = voice.get("original_id")
|
232 |
+
target_id = voice.get("target_id")
|
233 |
|
234 |
audio = utils.load_audio_to_torch(
|
235 |
audio_path, self.hps_ms.data.sampling_rate)
|
|
|
245 |
|
246 |
with no_grad():
|
247 |
sid_tgt = LongTensor([target_id])
|
248 |
+
audio = self.net_g_ms.voice_conversion(spec.to(device),
|
249 |
+
spec_lengths.to(device),
|
250 |
+
sid_src=sid_src.to(device),
|
251 |
+
sid_tgt=sid_tgt.to(device))[0][0, 0].data.cpu().float().numpy()
|
252 |
+
|
253 |
+
torch.cuda.empty_cache()
|
254 |
|
255 |
+
return audio
|
256 |
+
|
257 |
+
|
258 |
+
class TTS:
    """Facade over all loaded voice models (VITS / HuBERT-VITS / W2V2-VITS).

    Args:
        voice_obj: mapping of model-type name to a list of
            (real_speaker_id, model_object, model_index) entries.
        voice_speakers: mapping of model-type name to its speaker list.
    """

    def __init__(self, voice_obj, voice_speakers):
        self._voice_obj = voice_obj
        self._voice_speakers = voice_speakers
        self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
        self._speakers_count = sum(len(self._voice_speakers[i]) for i in self._voice_speakers)
        self._vits_speakers_count = len(self._voice_speakers["VITS"])
        self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
        self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])

        # BUG FIX: the logger must be created before the dimensional-emotion
        # model is loaded below — the except branch logs through self.logger,
        # which previously was not assigned yet (AttributeError on failure).
        self.logger = logging.getLogger("vits-simple-api")

        # Optional dimensional-emotion model (audonnx); best-effort load.
        self.dem = None
        if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) is not None:
            try:
                import audonnx
                root = os.path.dirname(config.DIMENSIONAL_EMOTION_MODEL)
                model_file = config.DIMENSIONAL_EMOTION_MODEL
                self.dem = audonnx.load(root=root, model_file=model_file)
            except Exception as e:
                self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")

        # Initialization information
        self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
        self.logger.info(f'device:{device} device.type:{device.type}')
        if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
        if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
        if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
        self.logger.info(f"{self._speakers_count} speakers in total")
        if self._speakers_count == 0:
            self.logger.warning("No model was loaded")
@property
|
289 |
+
def voice_speakers(self):
|
290 |
+
return self._voice_speakers
|
291 |
+
|
292 |
+
@property
|
293 |
+
def speakers_count(self):
|
294 |
+
return self._speakers_count
|
295 |
+
|
296 |
+
@property
|
297 |
+
def vits_speakers_count(self):
|
298 |
+
return self._vits_speakers_count
|
299 |
+
|
300 |
+
@property
|
301 |
+
def hubert_speakers_count(self):
|
302 |
+
return self._hubert_speakers_count
|
303 |
+
|
304 |
+
@property
|
305 |
+
def w2v2_speakers_count(self):
|
306 |
+
return self._w2v2_speakers_count
|
307 |
+
|
308 |
+
def encode(self, sampling_rate, audio, format):
|
309 |
with BytesIO() as f:
|
310 |
+
write(f, sampling_rate, audio)
|
311 |
+
if format.upper() == 'OGG':
|
312 |
+
with BytesIO() as o:
|
313 |
+
utils.wav2ogg(f, o)
|
314 |
+
return BytesIO(o.getvalue())
|
315 |
+
elif format.upper() == 'SILK':
|
316 |
+
return BytesIO(silkcoder.encode(f))
|
317 |
+
elif format.upper() == 'MP3':
|
318 |
+
with BytesIO() as o:
|
319 |
+
utils.wav2mp3(f, o)
|
320 |
+
return BytesIO(o.getvalue())
|
321 |
+
elif format.upper() == 'WAV':
|
322 |
+
return BytesIO(f.getvalue())
|
323 |
+
|
324 |
+
def convert_time_string(self, time_string):
|
325 |
+
time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
|
326 |
+
time_unit = re.findall(r'[a-zA-Z]+', time_string)[0].lower()
|
327 |
+
|
328 |
+
if time_unit.upper() == 'MS':
|
329 |
+
return time_value / 1000
|
330 |
+
elif time_unit.upper() == 'S':
|
331 |
+
return time_value
|
332 |
+
elif time_unit.upper() == 'MIN':
|
333 |
+
return time_value * 60
|
334 |
+
elif time_unit.upper() == 'H':
|
335 |
+
return time_value * 3600
|
336 |
+
elif time_unit.upper() == 'D':
|
337 |
+
return time_value * 24 * 3600 # 不会有人真写D吧?
|
338 |
+
else:
|
339 |
+
raise ValueError("Unsupported time unit: {}".format(time_unit))
|
340 |
+
|
341 |
+
def parse_ssml(self, ssml):
|
342 |
+
root = ET.fromstring(ssml)
|
343 |
+
format = root.attrib.get("format", "wav")
|
344 |
+
voice_tasks = []
|
345 |
+
brk_count = 0
|
346 |
+
strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
|
347 |
+
|
348 |
+
for element in root.iter():
|
349 |
+
if element.tag == "voice":
|
350 |
+
id = int(element.attrib.get("id", root.attrib.get("id", config.ID)))
|
351 |
+
lang = element.attrib.get("lang", root.attrib.get("lang", config.LANG))
|
352 |
+
length = float(element.attrib.get("length", root.attrib.get("length", config.LENGTH)))
|
353 |
+
noise = float(element.attrib.get("noise", root.attrib.get("noise", config.NOISE)))
|
354 |
+
noisew = float(element.attrib.get("noisew", root.attrib.get("noisew", config.NOISEW)))
|
355 |
+
max = int(element.attrib.get("max", root.attrib.get("max", "0")))
|
356 |
+
# 不填写默认就是vits
|
357 |
+
model = element.attrib.get("model", root.attrib.get("model", "vits"))
|
358 |
+
# w2v2-vits/emotion-vits才有emotion
|
359 |
+
emotion = int(element.attrib.get("emotion", root.attrib.get("emotion", 0)))
|
360 |
+
|
361 |
+
voice_element = ET.tostring(element, encoding='unicode')
|
362 |
+
|
363 |
+
pattern_voice = r'<voice.*?>(.*?)</voice>'
|
364 |
+
pattern_break = r'<break\s*?(.*?)\s*?/>'
|
365 |
+
|
366 |
+
matches_voice = re.findall(pattern_voice, voice_element)[0]
|
367 |
+
matches_break = re.split(pattern_break, matches_voice)
|
368 |
+
for match in matches_break:
|
369 |
+
strength = re.search(r'\s*strength\s*=\s*[\'\"](.*?)[\'\"]', match)
|
370 |
+
time = re.search(r'\s*time\s*=\s*[\'\"](.*?)[\'\"]', match)
|
371 |
+
# break标签 strength属性
|
372 |
+
if strength:
|
373 |
+
brk = strength_dict[strength.group(1)]
|
374 |
+
voice_tasks.append({"break": brk})
|
375 |
+
brk_count += 1
|
376 |
+
# break标签 time属性
|
377 |
+
elif time:
|
378 |
+
brk = self.convert_time_string(time.group(1))
|
379 |
+
voice_tasks.append({"break": brk})
|
380 |
+
brk_count += 1
|
381 |
+
# break标签 为空说明只写了break,默认停顿0.75s
|
382 |
+
elif match == "":
|
383 |
+
voice_tasks.append({"break": 0.75})
|
384 |
+
brk_count += 1
|
385 |
+
# voice标签中除了break剩下的就是文本
|
386 |
+
else:
|
387 |
+
voice_tasks.append({"id": id,
|
388 |
+
"text": match,
|
389 |
+
"lang": lang,
|
390 |
+
"length": length,
|
391 |
+
"noise": noise,
|
392 |
+
"noisew": noisew,
|
393 |
+
"max": max,
|
394 |
+
"model": model,
|
395 |
+
"emotion": emotion
|
396 |
+
})
|
397 |
+
|
398 |
+
# 分段末尾停顿0.75s
|
399 |
+
voice_tasks.append({"break": 0.75})
|
400 |
+
elif element.tag == "break":
|
401 |
+
# brk_count大于0说明voice标签中有break
|
402 |
+
if brk_count > 0:
|
403 |
+
brk_count -= 1
|
404 |
+
continue
|
405 |
+
brk = strength_dict.get(element.attrib.get("strength"),
|
406 |
+
self.convert_time_string(element.attrib.get("time", "750ms")))
|
407 |
+
voice_tasks.append({"break": brk})
|
408 |
+
|
409 |
+
for i in voice_tasks:
|
410 |
+
self.logger.debug(i)
|
411 |
+
|
412 |
+
return voice_tasks, format
|
413 |
+
|
414 |
+
def create_ssml_infer_task(self, ssml):
|
415 |
+
voice_tasks, format = self.parse_ssml(ssml)
|
416 |
+
|
417 |
+
audios = []
|
418 |
+
for voice in voice_tasks:
|
419 |
+
if voice.get("break"):
|
420 |
+
audios.append(np.zeros(int(voice.get("break") * 22050), dtype=np.int16))
|
421 |
+
else:
|
422 |
+
model = voice.get("model").upper()
|
423 |
+
if model != "VITS" and model != "W2V2-VITS" and model != "EMOTION-VITS":
|
424 |
+
raise ValueError(f"Unsupported model: {voice.get('model')}")
|
425 |
+
voice_obj = self._voice_obj[model][voice.get("id")][1]
|
426 |
+
voice["id"] = self._voice_obj[model][voice.get("id")][0]
|
427 |
+
|
428 |
+
audios.append(voice_obj.get_audio(voice))
|
429 |
+
|
430 |
+
audio = np.concatenate(audios, axis=0)
|
431 |
+
|
432 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format), format
|
433 |
+
|
434 |
+
def vits_infer(self, voice):
|
435 |
+
format = voice.get("format", "wav")
|
436 |
+
voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
|
437 |
+
voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
|
438 |
+
audio = voice_obj.get_audio(voice, auto_break=True)
|
439 |
+
|
440 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
441 |
+
|
442 |
+
def hubert_vits_infer(self, voice):
|
443 |
+
format = voice.get("format", "wav")
|
444 |
+
voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
|
445 |
+
voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
|
446 |
+
audio = voice_obj.get_audio(voice)
|
447 |
+
|
448 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
449 |
+
|
450 |
+
def w2v2_vits_infer(self, voice):
|
451 |
+
format = voice.get("format", "wav")
|
452 |
+
voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
|
453 |
+
voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
|
454 |
+
audio = voice_obj.get_audio(voice, auto_break=True)
|
455 |
+
|
456 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
457 |
+
|
458 |
+
def vits_voice_conversion(self, voice):
|
459 |
+
original_id = voice.get("original_id")
|
460 |
+
target_id = voice.get("target_id")
|
461 |
+
format = voice.get("format")
|
462 |
+
|
463 |
+
original_id_obj = int(self._voice_obj["VITS"][original_id][2])
|
464 |
+
target_id_obj = int(self._voice_obj["VITS"][target_id][2])
|
465 |
+
|
466 |
+
if original_id_obj != target_id_obj:
|
467 |
+
raise ValueError(f"speakers are in diffrent VITS Model")
|
468 |
+
|
469 |
+
voice["original_id"] = int(self._voice_obj["VITS"][original_id][0])
|
470 |
+
voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
|
471 |
+
|
472 |
+
voice_obj = self._voice_obj["VITS"][original_id][1]
|
473 |
+
audio = voice_obj.voice_conversion(voice)
|
474 |
+
|
475 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
476 |
+
|
477 |
+
def get_dimensional_emotion_npy(self, audio):
|
478 |
+
if self.dem is None:
|
479 |
+
raise ValueError(f"Please configure DIMENSIONAL_EMOTION_MODEL path in config.py")
|
480 |
+
audio16000, sampling_rate = librosa.load(audio, sr=16000, mono=True)
|
481 |
+
emotion = self.dem(audio16000, sampling_rate)['hidden_states']
|
482 |
+
emotion_npy = BytesIO()
|
483 |
+
np.save(emotion_npy, emotion.squeeze(0))
|
484 |
+
emotion_npy.seek(0)
|
485 |
+
|
486 |
+
return emotion_npy
|