Artrajz committed
Commit f103c82
1 Parent(s): e6da826

Upload 44 files
Dockerfile CHANGED
@@ -1,4 +1,4 @@
-FROM python:3.9.16-slim-bullseye
+FROM python:3.10.11-slim-bullseye

 RUN mkdir -p /app
 WORKDIR /app
@@ -7,16 +7,31 @@ ENV DEBIAN_FRONTEND=noninteractive

 RUN apt-get update && \
     apt install build-essential -yq && \
+    apt install espeak-ng -yq && \
+    apt install cmake -yq && \
+    apt install -y wget -yq && \
     apt-get clean && \
     apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
     rm -rf /var/lib/apt/lists/*

+RUN pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0
+
+RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
+    tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
+    cd openjtalk-0.3.0.dev2 && \
+    rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
+    python setup.py install && \
+    cd ../ && \
+    rm -f openjtalk-0.3.0.dev2.tar.gz && \
+    rm -rf openjtalk-0.3.0.dev2
+
+RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
+
 COPY requirements.txt /app
 RUN pip install -r requirements.txt

 COPY . /app

-EXPOSE 7860
+EXPOSE 23456

 CMD ["python", "/app/app.py"]
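
The new image swaps the base to Python 3.10, installs espeak-ng (used by the phonemizer-backed English cleaners added in text/cleaners.py below), adds cmake and wget to build the pinned openjtalk package, pre-installs a CPU-only torch wheel, and exposes the API port 23456 instead of Gradio's 7860. A minimal sketch of verifying a running container, assuming it was started with `docker run -p 23456:23456 ...` (endpoint names come from app.py below):

import requests

# /voice/speakers requires no API key and lists every loaded model group
res = requests.get("http://127.0.0.1:23456/voice/speakers", timeout=10)
res.raise_for_status()
print(list(res.json()))  # e.g. ['VITS', 'HUBERT-VITS', 'W2V2-VITS']
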
LICENSE CHANGED
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2022 CjangCjengh
+Copyright (c) 2023 Artrajz

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
LICENSE-MoeGoe ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2022 CjangCjengh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
app.py CHANGED
@@ -1,13 +1,15 @@
 import os
-import gradio as gr
 import logging
+import time
+import logzero
 import uuid
-
-from flask import Flask, request, send_file, jsonify
+from flask import Flask, request, send_file, jsonify, make_response
 from werkzeug.utils import secure_filename
 from flask_apscheduler import APScheduler
-
-from utils import clean_folder, merge_model
+from functools import wraps
+from utils.utils import clean_folder, check_is_none
+from utils.merge import merge_model
+from io import BytesIO

 app = Flask(__name__)
 app.config.from_pyfile("config.py")
@@ -16,104 +18,367 @@ scheduler = APScheduler()
 scheduler.init_app(app)
 scheduler.start()

+logzero.loglevel(logging.WARNING)
+logger = logging.getLogger("vits-simple-api")
+level = app.config.get("LOGGING_LEVEL", "DEBUG")
+level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
+              'CRITICAL': logging.CRITICAL}
+logging.basicConfig(level=level_dict[level])
 logging.getLogger('numba').setLevel(logging.WARNING)

-voice_obj, voice_speakers = merge_model(app.config["MODEL_LIST"])
-
-CUSTOM_PATH = "/gradio"
+tts = merge_model(app.config["MODEL_LIST"])

 if not os.path.exists(app.config['UPLOAD_FOLDER']):
-    try:
-        os.mkdir(app.config['UPLOAD_FOLDER'])
-    except:
-        pass
+    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
+
+if not os.path.exists(app.config['CACHE_PATH']):
+    os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
+
+
+def require_api_key(func):
+    @wraps(func)
+    def check_api_key(*args, **kwargs):
+        if not app.config.get('API_KEY_ENABLED', False):
+            return func(*args, **kwargs)
+        else:
+            api_key = request.args.get('api_key') or request.headers.get('X-API-KEY')
+            if api_key and api_key == app.config['API_KEY']:
+                return func(*args, **kwargs)
+            else:
+                return make_response(jsonify({"status": "error", "message": "Invalid API Key"}), 401)
+
+    return check_api_key


-@app.route('/')
-@app.route('/voice/')
+@app.route('/', methods=["GET", "POST"])
 def index():
-    return "usage:https://github.com/Artrajz/MoeGoe-Simple-API#readme"
+    return "vits-simple-api"


 @app.route('/voice/speakers', methods=["GET", "POST"])
 def voice_speakers_api():
-    speakers_list = voice_speakers
-    return jsonify(speakers_list)
+    return jsonify(tts.voice_speakers)


 @app.route('/voice', methods=["GET", "POST"])
-def voice_api():
-    if request.method == "GET":
-        text = request.args.get("text")
-        speaker_id = int(request.args.get("id", 0))
-        format = request.args.get("format", "wav")
-        lang = request.args.get("lang", "mix")
-        speed = float(request.args.get("speed", 1.0))
-    elif request.method == "POST":
-        json_data = request.json
-        text = json_data["text"]
-        speaker_id = int(json_data["id"])
-        format = json_data["format"]
-        lang = json_data["lang"]
-        speed = float(json_data["speed"])
-
-    if lang.upper() == "ZH":
-        text = f"[ZH]{text}[ZH]"
-    elif lang.upper() == "JA":
-        text = f"[JA]{text}[JA]"
-
-    real_id = voice_obj[speaker_id][0]
-    real_obj = voice_obj[speaker_id][1]
-
-    output, file_type, fname = real_obj.generate(text, real_id, format, speed)
+@app.route('/voice/vits', methods=["GET", "POST"])
+@require_api_key
+def voice_vits_api():
+    try:
+        if request.method == "GET":
+            text = request.args.get("text", "")
+            id = int(request.args.get("id", app.config.get("ID", 0)))
+            format = request.args.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.args.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.args.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.args.get("max", app.config.get("MAX", 50)))
+        elif request.method == "POST":
+            text = request.form.get("text", "")
+            id = int(request.form.get("id", app.config.get("ID", 0)))
+            format = request.form.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.form.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.form.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.form.get("max", app.config.get("MAX", 50)))
+    except Exception as e:
+        logger.error(f"[VITS] {e}")
+        return make_response("parameter error", 400)
+
+    logger.info(f"[VITS] id:{id} format:{format} lang:{lang} length:{length} noise:{noise} noisew:{noisew}")
+    logger.info(f"[VITS] len:{len(text)} text:{text}")
+
+    if check_is_none(text):
+        logger.info(f"[VITS] text is empty")
+        return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
+
+    if check_is_none(id):
+        logger.info(f"[VITS] speaker id is empty")
+        return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+    if id < 0 or id >= tts.vits_speakers_count:
+        logger.info(f"[VITS] speaker id {id} does not exist")
+        return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+
+    speaker_lang = tts.voice_speakers["VITS"][id].get('lang')
+    if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
+        logger.info(f"[VITS] lang \"{lang}\" is not in {speaker_lang}")
+        return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
+
+    if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
+        speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
+
+    fname = f"{str(uuid.uuid1())}.{format}"
+    file_type = f"audio/{format}"
+
+    t1 = time.time()
+    output = tts.vits_infer({"text": text,
+                             "id": id,
+                             "format": format,
+                             "length": length,
+                             "noise": noise,
+                             "noisew": noisew,
+                             "max": max,
+                             "lang": lang,
+                             "speaker_lang": speaker_lang})
+    t2 = time.time()
+    logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")

     return send_file(path_or_file=output, mimetype=file_type, download_name=fname)


-@app.route('/voice/conversion', methods=["GET", "POST"])
-def voice_conversion_api():
-    if request.method == "GET":
-        return jsonify("method should be POST")
+@app.route('/voice/hubert-vits', methods=["POST"])
+@require_api_key
+def voice_hubert_api():
     if request.method == "POST":
-        # json_data = request.json
-        voice = request.files['upload']
-        original_id = int(request.form["original_id"])
-        target_id = int(request.form["target_id"])
+        try:
+            voice = request.files['upload']
+            id = int(request.form.get("id"))
+            format = request.form.get("format", app.config.get("LANG", "auto"))
+            length = float(request.form.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+        except Exception as e:
+            logger.error(f"[hubert] {e}")
+            return make_response("parameter error", 400)

-        form = {}
+        logger.info(f"[hubert] id:{id} format:{format} length:{length} noise:{noise} noisew:{noisew}")

-        format = voice.filename.split(".")[1]
-
-        fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
-        voice.save(os.path.join(app.config['UPLOAD_FOLDER'], fname))
+        fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
+        voice.save(os.path.join(app.config['UPLOAD_FOLDER'], fname))
+
+        if check_is_none(id):
+            logger.info(f"[hubert] speaker id is empty")
+            return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+        if id < 0 or id >= tts.hubert_speakers_count:
+            logger.info(f"[hubert] speaker id {id} does not exist")
+            return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+
+        file_type = f"audio/{format}"
+
+        t1 = time.time()
+        output = tts.hubert_vits_infer({"id": id,
+                                        "format": format,
+                                        "length": length,
+                                        "noise": noise,
+                                        "noisew": noisew,
+                                        "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
+        t2 = time.time()
+        logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
+
+        return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+
+
+@app.route('/voice/w2v2-vits', methods=["GET", "POST"])
+@require_api_key
+def voice_w2v2_api():
+    try:
+        if request.method == "GET":
+            text = request.args.get("text", "")
+            id = int(request.args.get("id", app.config.get("ID", 0)))
+            format = request.args.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.args.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.args.get("length", app.config.get("LENGTH", 1)))
+            noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.args.get("max", app.config.get("MAX", 50)))
+            emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
+        elif request.method == "POST":
+            text = request.form.get("text", "")
+            id = int(request.form.get("id", app.config.get("ID", 0)))
+            format = request.form.get("format", app.config.get("FORMAT", "wav"))
+            lang = request.form.get("lang", app.config.get("LANG", "auto"))
+            length = float(request.form.get("length"))
+            noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
+            noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+            max = int(request.form.get("max", app.config.get("MAX", 50)))
+            emotion = int(request.form.get("emotion", app.config.get("EMOTION", 0)))
+    except Exception as e:
+        logger.error(f"[w2v2] {e}")
+        return make_response(f"parameter error", 400)
+
+    logger.info(f"[w2v2] id:{id} format:{format} lang:{lang} "
+                f"length:{length} noise:{noise} noisew:{noisew} emotion:{emotion}")
+    logger.info(f"[w2v2] len:{len(text)} text:{text}")
+
+    if check_is_none(text):
+        logger.info(f"[w2v2] text is empty")
+        return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
+
+    if check_is_none(id):
+        logger.info(f"[w2v2] speaker id is empty")
+        return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+    if id < 0 or id >= tts.w2v2_speakers_count:
+        logger.info(f"[w2v2] speaker id {id} does not exist")
+        return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+
+    speaker_lang = tts.voice_speakers["W2V2-VITS"][id].get('lang')
+    if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
+        logger.info(f"[w2v2] lang \"{lang}\" is not in {speaker_lang}")
+        return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
+
+    if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
+        speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
+
+    fname = f"{str(uuid.uuid1())}.{format}"
+    file_type = f"audio/{format}"

-        real_original_id = int(voice_obj[original_id][0])
-        real_target_id = int(voice_obj[target_id][0])
-        real_obj = voice_obj[original_id][1]
-        real_target_obj = voice_obj[target_id][1]
+    t1 = time.time()
+    output = tts.w2v2_vits_infer({"text": text,
+                                  "id": id,
+                                  "format": format,
+                                  "length": length,
+                                  "noise": noise,
+                                  "noisew": noisew,
+                                  "max": max,
+                                  "lang": lang,
+                                  "emotion": emotion,
+                                  "speaker_lang": speaker_lang})
+    t2 = time.time()
+    logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
+
+    return send_file(path_or_file=output, mimetype=file_type, download_name=fname)

-        if voice_obj[original_id][2] != voice_obj[target_id][2]:
-            form["status"] = "error"
-            form["message"] = "speaker IDs are in diffrent Model!"
-            return form

-        output = real_obj.voice_conversion(os.path.join(app.config['UPLOAD_FOLDER'], fname),
-                                           real_original_id, real_target_id, format)
+@app.route('/voice/conversion', methods=["POST"])
+@app.route('/voice/vits/conversion', methods=["POST"])
+@require_api_key
+def vits_voice_conversion_api():
+    if request.method == "POST":
+        try:
+            voice = request.files['upload']
+            original_id = int(request.form["original_id"])
+            target_id = int(request.form["target_id"])
+            format = request.form.get("format", voice.filename.split(".")[1])
+        except Exception as e:
+            logger.error(f"[vits_voice_convertsion] {e}")
+            return make_response("parameter error", 400)
+
+        fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
+        audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
+        voice.save(audio_path)
         file_type = f"audio/{format}"

+        logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
+        t1 = time.time()
+        try:
+            output = tts.vits_voice_conversion({"audio_path": audio_path,
+                                                "original_id": original_id,
+                                                "target_id": target_id,
+                                                "format": format})
+        except Exception as e:
+            logger.info(f"[vits_voice_convertsion] {e}")
+            return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+        t2 = time.time()
+        logger.info(f"finish in {(t2 - t1):.2f}s")
+
         return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
-        # return output


-# Clean temporary files on a schedule, once per hour
-@scheduler.task('interval', id='随便写', seconds=3600, misfire_grace_time=900)
+@app.route('/voice/ssml', methods=["POST"])
+@require_api_key
+def ssml():
+    try:
+        ssml = request.form["ssml"]
+    except Exception as e:
+        logger.info(f"[ssml] {e}")
+        return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
+
+    logger.debug(ssml)
+
+    t1 = time.time()
+    try:
+        output, format = tts.create_ssml_infer_task(ssml)
+    except Exception as e:
+        logger.info(f"[ssml] {e}")
+        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+    t2 = time.time()
+
+    fname = f"{str(uuid.uuid1())}.{format}"
+    file_type = f"audio/{format}"
+
+    logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
+
+    return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+
+
+@app.route('/voice/dimension-emotion', methods=["POST"])
+def dimensional_emotion():
+    if request.method == "POST":
+        try:
+            audio = request.files['upload']
+        except Exception as e:
+            logger.error(f"[dimensional_emotion] {e}")
+            return make_response("parameter error", 400)
+
+    content = BytesIO(audio.read())
+
+    file_type = "application/octet-stream; charset=ascii"
+    fname = os.path.splitext(audio.filename)[0] + ".npy"
+    output = tts.get_dimensional_emotion_npy(content)
+
+    return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+
+
+@app.route('/voice/check', methods=["GET", "POST"])
+def check():
+    try:
+        if request.method == "GET":
+            model = request.args.get("model")
+            id = int(request.args.get("id"))
+        elif request.method == "POST":
+            model = request.form["model"]
+            id = int(request.form["id"])
+    except Exception as e:
+        logger.info(f"[check] {e}")
+        return make_response(jsonify({"status": "error", "message": "parameter error"}), 400)
+
+    if check_is_none(model):
+        logger.info(f"[check] model {model} is empty")
+        return make_response(jsonify({"status": "error", "message": "model is empty"}), 400)
+
+    if model.upper() not in ("VITS", "HUBERT", "W2V2"):
+        res = make_response(jsonify({"status": "error", "message": f"model {model} does not exist"}))
+        res.status = 404
+        logger.info(f"[check] speaker id {id} error")
+        return res
+
+    if check_is_none(id):
+        logger.info(f"[check] speaker id is empty")
+        return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
+
+    if model.upper() == "VITS":
+        speaker_list = tts.voice_speakers["VITS"]
+    elif model.upper() == "HUBERT":
+        speaker_list = tts.voice_speakers["HUBERT-VITS"]
+    elif model.upper() == "W2V2":
+        speaker_list = tts.voice_speakers["W2V2-VITS"]
+
+    if len(speaker_list) == 0:
+        logger.info(f"[check] {model} not loaded")
+        return make_response(jsonify({"status": "error", "message": f"{model} not loaded"}), 400)
+
+    if id < 0 or id >= len(speaker_list):
+        logger.info(f"[check] speaker id {id} does not exist")
+        return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
+    name = str(speaker_list[id]["name"])
+    lang = speaker_list[id]["lang"]
+    logger.info(f"[check] check id:{id} name:{name} lang:{lang}")
+
+    return make_response(jsonify({"status": "success", "id": id, "name": name, "lang": lang}), 200)
+
+
+# regular cleaning
+@scheduler.task('interval', id='clean_task', seconds=3600, misfire_grace_time=900)
 def clean_task():
     clean_folder(app.config["UPLOAD_FOLDER"])
-    clean_folder(app.config["SILK_OUT_PATH"])
+    clean_folder(app.config["CACHE_PATH"])


 if __name__ == '__main__':
-    io = gr.Interface(lambda x: "Hello, " + x + "!", "textbox", "textbox")
-    app = gr.mount_gradio_app(app, io, path=CUSTOM_PATH)
-    # app.run(host='0.0.0.0', port=app.config["PORT"])  # use this when open to external access; also for Docker deployment
-    # app.run(host='127.0.0.1', port=app.config["PORT"], debug=True)  # run locally / debug
+    app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False))  # open to external access
+    # app.run(host='127.0.0.1', port=app.config.get("PORT", 23456), debug=True)  # run locally / debug
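
The new require_api_key decorator looks for the key in the api_key query parameter or the X-API-KEY header, and only enforces it when API_KEY_ENABLED is true in config.py. A minimal client sketch against /voice/vits (the key value and output path are placeholders):

import requests

params = {"text": "你好", "id": 0, "format": "wav", "api_key": "YOUR-KEY"}  # placeholder key; headers={"X-API-KEY": ...} also works
res = requests.get("http://127.0.0.1:23456/voice/vits", params=params)
if res.status_code == 401:
    print(res.json())  # {"status": "error", "message": "Invalid API Key"}
else:
    with open("out.wav", "wb") as f:  # placeholder output path
        f.write(res.content)
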
 
 
docker-compose.yaml CHANGED
@@ -1,12 +1,13 @@
 version: '3.4'
 services:
-  moegoe:
-    image: artrajz/moegoe-simple-api:latest
+  vits:
+    image: artrajz/vits-simple-api:latest
     restart: always
     ports:
       - 23456:23456
     environment:
       LANG: 'C.UTF-8'
+      TZ: Asia/Shanghai  # timezone
     volumes:
       - ./Model:/app/Model  # mount the model folder
       - ./config.py:/app/config.py  # mount the config file
models.py CHANGED
@@ -363,7 +363,7 @@ class SynthesizerTrn(nn.Module):
         else:
             self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)

-        if n_speakers > 1:
+        if n_speakers >= 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

     def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
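
Relaxing the guard from n_speakers > 1 to n_speakers >= 1 means configs that declare exactly one speaker still get an emb_g table, so such checkpoints load without a missing-key error. A standalone sketch of the gate (dimensions are illustrative):

import torch
import torch.nn as nn

n_speakers, gin_channels = 1, 256  # illustrative values
if n_speakers >= 1:  # previously `> 1`, which skipped single-speaker models
    emb_g = nn.Embedding(n_speakers, gin_channels)
    g = emb_g(torch.tensor([0]))  # one gin_channels-dim conditioning vector per speaker id
    print(g.shape)  # torch.Size([1, 256])
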
optimizer_removal.py ADDED
@@ -0,0 +1,16 @@
+from torch import load, save
+
+if __name__ == '__main__':
+    print("The optimizer is normally not used at inference time; if the model is only used for inference, removing it shrinks the checkpoint.\n")
+    input_path = input("Enter the model path: ")
+    output_path = f"{input_path.split('.')[0]}_inference.pth"
+    checkpoint_dict = load(input_path, map_location='cpu')
+    checkpoint_dict_new = {}
+    for k, v in checkpoint_dict.items():
+        if k == "optimizer":
+            print(f"remove optimizer")
+            continue
+        checkpoint_dict_new[k] = v
+    save(checkpoint_dict_new, output_path)
+    print("finish")
+    print(output_path)
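
The script copies every top-level checkpoint key except "optimizer"; VITS training checkpoints typically also carry "model", "iteration" and "learning_rate", which are kept. A quick sketch of checking the result (the path is a placeholder):

from torch import load

ckpt = load("G_latest_inference.pth", map_location="cpu")  # placeholder path
print(list(ckpt.keys()))  # "optimizer" should be gone, e.g. ['model', 'iteration', 'learning_rate']
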
request.py ADDED
@@ -0,0 +1,265 @@
+import re
+import requests
+import os
+import random
+import string
+from requests_toolbelt.multipart.encoder import MultipartEncoder
+
+abs_path = os.path.dirname(__file__)
+base = "http://127.0.0.1:23456"
+
+
+# speaker mapping table
+def voice_speakers():
+    url = f"{base}/voice/speakers"
+
+    res = requests.post(url=url)
+    json = res.json()
+    for i in json:
+        print(i)
+        for j in json[i]:
+            print(j)
+    return json
+
+
+# speech synthesis: voice vits
+def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
+    fields = {
+        "text": text,
+        "id": str(id),
+        "format": format,
+        "lang": lang,
+        "length": str(length),
+        "noise": str(noise),
+        "noisew": str(noisew),
+        "max": str(max)
+    }
+    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+    m = MultipartEncoder(fields=fields, boundary=boundary)
+    headers = {"Content-Type": m.content_type}
+    url = f"{base}/voice"
+
+    res = requests.post(url=url, data=m, headers=headers)
+    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+    path = f"{abs_path}/{fname}"
+
+    with open(path, "wb") as f:
+        f.write(res.content)
+    print(path)
+    return path
+
+
+# voice conversion: hubert-vits
+def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
+    upload_name = os.path.basename(upload_path)
+    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav, ogg
+
+    with open(upload_path, 'rb') as upload_file:
+        fields = {
+            "upload": (upload_name, upload_file, upload_type),
+            "id": str(id),
+            "format": format,
+            "length": str(length),
+            "noise": str(noise),
+            "noisew": str(noisew),
+        }
+        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+        m = MultipartEncoder(fields=fields, boundary=boundary)
+        headers = {"Content-Type": m.content_type}
+        url = f"{base}/voice/hubert-vits"
+
+        res = requests.post(url=url, data=m, headers=headers)
+        fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+        path = f"{abs_path}/{fname}"
+
+        with open(path, "wb") as f:
+            f.write(res.content)
+        print(path)
+        return path
+
+
+# dimensional emotion model: w2v2-vits
+def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
+    fields = {
+        "text": text,
+        "id": str(id),
+        "format": format,
+        "lang": lang,
+        "length": str(length),
+        "noise": str(noise),
+        "noisew": str(noisew),
+        "max": str(max),
+        "emotion": str(emotion)
+    }
+    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+    m = MultipartEncoder(fields=fields, boundary=boundary)
+    headers = {"Content-Type": m.content_type}
+    url = f"{base}/voice/w2v2-vits"
+
+    res = requests.post(url=url, data=m, headers=headers)
+    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+    path = f"{abs_path}/{fname}"
+
+    with open(path, "wb") as f:
+        f.write(res.content)
+    print(path)
+    return path
+
+
+# voice conversion: timbre transfer between speakers within the same VITS model
+def voice_conversion(upload_path, original_id, target_id):
+    upload_name = os.path.basename(upload_path)
+    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav, ogg
+
+    with open(upload_path, 'rb') as upload_file:
+        fields = {
+            "upload": (upload_name, upload_file, upload_type),
+            "original_id": str(original_id),
+            "target_id": str(target_id),
+        }
+        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+        m = MultipartEncoder(fields=fields, boundary=boundary)
+
+        headers = {"Content-Type": m.content_type}
+        url = f"{base}/voice/conversion"
+
+        res = requests.post(url=url, data=m, headers=headers)
+
+        fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+        path = f"{abs_path}/{fname}"
+
+        with open(path, "wb") as f:
+            f.write(res.content)
+        print(path)
+        return path
+
+
+def voice_ssml(ssml):
+    fields = {
+        "ssml": ssml,
+    }
+    boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+    m = MultipartEncoder(fields=fields, boundary=boundary)
+    headers = {"Content-Type": m.content_type}
+    url = f"{base}/voice/ssml"
+
+    res = requests.post(url=url, data=m, headers=headers)
+    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+    path = f"{abs_path}/{fname}"
+
+    with open(path, "wb") as f:
+        f.write(res.content)
+    print(path)
+    return path
+
+
+def voice_dimensional_emotion(upload_path):
+    upload_name = os.path.basename(upload_path)
+    upload_type = f'audio/{upload_name.split(".")[1]}'  # wav, ogg
+
+    with open(upload_path, 'rb') as upload_file:
+        fields = {
+            "upload": (upload_name, upload_file, upload_type),
+        }
+        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
+
+        m = MultipartEncoder(fields=fields, boundary=boundary)
+        headers = {"Content-Type": m.content_type}
+        url = f"{base}/voice/dimension-emotion"
+
+        res = requests.post(url=url, data=m, headers=headers)
+        fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
+        path = f"{abs_path}/{fname}"
+
+        with open(path, "wb") as f:
+            f.write(res.content)
+        print(path)
+        return path
+
+
+import time
+
+# while 1:
+#     text = input()
+#     l = len(text)
+#     time1 = time.time()
+#     voice_vits(text)
+#     time2 = time.time()
+#     print(f"len:{l} elapsed:{time2 - time1}")
+
+# text = "你好"
+
+
+# ssml = """
+# <speak lang="zh" format="mp3" length="1.2">
+#     <voice id="92" >这几天心里颇不宁静。</voice>
+#     <voice id="125">今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
+#     <voice id="142">月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
+#     <voice id="98">妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
+#     <voice id="120">我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
+#     <voice id="121">沿着荷塘,是一条曲折的小煤屑路。</voice>
+#     <voice id="122">这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
+#     <voice id="123">荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
+#     <voice id="124">路的一旁,是些杨柳,和一些不知道名字的树。</voice>
+#     <voice id="125">没有月光的晚上,这路上阴森森的,有些怕人。</voice>
+#     <voice id="126">今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
+#     <voice id="127">路上只我一个人,背着手踱着。</voice>
+#     <voice id="128">这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
+#     <voice id="129">我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
+#     <voice id="130">像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
+#     <voice id="131">白天里一定要做的事,一定要说的话,现在都可不理。</voice>
+#     <voice id="132">这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
+# </speak>
+# """
+# ssml = """
+# <speak lang="zh">
+#     <voice id="92" length="1.4">这几天心里颇不宁静。今晚<break/>在院子里坐着乘凉,忽然想起<break/>日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
+#     <voice id="142" length="1.4">月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice><break time="2s"/>
+#     <voice id="0" length="1.4" model="w2v2-vits" lang="ja">こんにちは</voice>
+# </speak>
+# """
+# ssml = """
+# <speak lang="ja">
+#     <voice id="142" length="1.4">こんにちは</voice>
+#     <voice id="0" length="1.4" model="w2v2-vits" emotion="177">こんにちは</voice>
+#     <voice id="0" length="1.4" model="w2v2-vits">こんにちは</voice>
+# </speak>
+# """
+ssml = """
+<speak lang="auto">
+    <voice>这几天心里颇不宁静。</voice>
+    <voice>今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
+    <voice>月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
+    <voice>妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
+    <voice>我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
+    <voice>沿着荷塘,是一条曲折的小煤屑路。</voice>
+    <voice>这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
+    <voice>荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
+    <voice>路的一旁,是些杨柳,和一些不知道名字的树。</voice>
+    <voice>没有月光的晚上,这路上阴森森的,有些怕人。</voice>
+    <voice>今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
+    <voice>路上只我一个人,背着手踱着。</voice>
+    <voice>这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
+    <voice>我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
+    <voice>像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
+    <voice>白天里一定要做的事,一定要说的话,现在都可不理。</voice>
+    <voice>这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
+</speak>
+"""
+
+text = """猫咪是爱撒娇、爱玩耍的小家伙,通常有着柔软的绒毛和温柔的眼神,是许多人都喜欢的宠物哦~它们特别喜欢舔自己的毛发,用柔顺的小脑袋搓人的脚丫子,还能给人带来很多欢乐和温馨。
+"""
+t1 = time.time()
+# voice_conversion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 91, 93)
+# voice_hubert_vits("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 0)
+# voice_vits(text, format="wav", lang="zh")
+# voice_w2v2_vits(text, emotion=111)
+# os.system(voice_ssml(ssml))
+os.system(voice_vits(text, id=0, format="wav", max=0))
+# voice_dimensional_emotion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav")
+t2 = time.time()
+print(f"len:{len(text)} elapsed:{t2 - t1}")
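
request.py builds multipart POST bodies with MultipartEncoder for every endpoint; for the plain-text endpoints a GET with query parameters works just as well, since voice_vits_api also reads request.args. A smaller sketch of the same synthesis call (the output path is a placeholder):

import requests

res = requests.get("http://127.0.0.1:23456/voice/vits",
                   params={"text": "你好", "id": 0, "format": "wav", "lang": "zh"})
with open("out.wav", "wb") as f:  # placeholder output path
    f.write(res.content)
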
test.py ADDED
@@ -0,0 +1,11 @@
+import numpy as np
+from io import BytesIO
+
+array = np.array([1, 2, 3])
+
+npy = BytesIO()
+np.save(npy, array)
+npy.seek(0)
+tmp = np.load("H:\git/vits-simple-api\Model/npy/25ecb3f6-f968-11ed-b094-e0d4e84af078.npy")
+print(tmp)
+
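
test.py saves an array into a BytesIO but then loads an unrelated .npy from disk; the in-memory round trip it appears to be probing looks like this:

import numpy as np
from io import BytesIO

buf = BytesIO()
np.save(buf, np.array([1, 2, 3]))
buf.seek(0)            # rewind before reading back
print(np.load(buf))    # [1 2 3]
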
text/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/text/__pycache__/__init__.cpython-310.pyc and b/text/__pycache__/__init__.cpython-310.pyc differ
 
text/__pycache__/cantonese.cpython-310.pyc ADDED
Binary file (2.34 kB).
 
text/__pycache__/cleaners.cpython-310.pyc CHANGED
Binary files a/text/__pycache__/cleaners.cpython-310.pyc and b/text/__pycache__/cleaners.cpython-310.pyc differ
 
text/__pycache__/english.cpython-310.pyc ADDED
Binary file (4.69 kB).
 
text/__pycache__/japanese.cpython-310.pyc CHANGED
Binary files a/text/__pycache__/japanese.cpython-310.pyc and b/text/__pycache__/japanese.cpython-310.pyc differ
 
text/__pycache__/korean.cpython-310.pyc ADDED
Binary file (5.58 kB).
 
text/__pycache__/mandarin.cpython-310.pyc CHANGED
Binary files a/text/__pycache__/mandarin.cpython-310.pyc and b/text/__pycache__/mandarin.cpython-310.pyc differ
 
text/__pycache__/ngu_dialect.cpython-310.pyc ADDED
Binary file (1.17 kB).
 
text/__pycache__/shanghainese.cpython-310.pyc ADDED
Binary file (2.51 kB).
 
text/cantonese.py CHANGED
@@ -1,9 +1,9 @@
 import re
 import cn2an
 import opencc
+import config

-
-converter = opencc.OpenCC('jyutjyu')
+converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/jyutjyu_2')

 # List of (Latin alphabet, ipa) pairs:
 _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
@@ -35,6 +35,16 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
     ('Z', 'iː˨sɛːt̚˥')
 ]]

+_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
+    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+]]
+
+
+def symbols_to_chinese(text):
+    for regex, replacement in _symbols_to_chinese:
+        text = re.sub(regex, replacement, text)
+    return text
+

 def number_to_cantonese(text):
     return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
@@ -47,9 +57,10 @@ def latin_to_ipa(text):


 def cantonese_to_ipa(text):
+    text = symbols_to_chinese(text)
     text = number_to_cantonese(text.upper())
-    text = converter.convert(text).replace('-','').replace('$',' ')
-    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
+    text = converter.convert(text).replace('-', '').replace('$', ' ')
+    text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
     text = re.sub(r'[、;:]', ',', text)
     text = re.sub(r'\s*,\s*', ', ', text)
     text = re.sub(r'\s*。\s*', '. ', text)
text/cleaners.py CHANGED
@@ -1,10 +1,77 @@
 import re
+import config
+from unidecode import unidecode
+from phonemizer import phonemize
+from phonemizer.backend.espeak.wrapper import EspeakWrapper
+
+ESPEAK_LIBRARY = getattr(config, "ESPEAK_LIBRARY", "")
+if ESPEAK_LIBRARY != "":
+    EspeakWrapper.set_library(ESPEAK_LIBRARY)
+
+# List of (regular expression, replacement) pairs for abbreviations:
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('mrs', 'misess'),
+    ('mr', 'mister'),
+    ('dr', 'doctor'),
+    ('st', 'saint'),
+    ('co', 'company'),
+    ('jr', 'junior'),
+    ('maj', 'major'),
+    ('gen', 'general'),
+    ('drs', 'doctors'),
+    ('rev', 'reverend'),
+    ('lt', 'lieutenant'),
+    ('hon', 'honorable'),
+    ('sgt', 'sergeant'),
+    ('capt', 'captain'),
+    ('esq', 'esquire'),
+    ('ltd', 'limited'),
+    ('col', 'colonel'),
+    ('ft', 'fort'),
+]]
+
+
+def expand_abbreviations(text):
+    for regex, replacement in _abbreviations:
+        text = re.sub(regex, replacement, text)
+    return text
+
+
+def transliteration_cleaners(text):
+    '''Pipeline for non-English text that transliterates to ASCII.'''
+    text = unidecode(text)
+    text = text.lower()
+    text = re.sub(r'\s+', ' ', text)
+    text = expand_abbreviations(text)
+    return text
+
+
+# for English text
+def english_cleaners(text):
+    '''Pipeline for English text, including abbreviation expansion.'''
+    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
+    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
+    return phonemes
+
+
+# for non-English text that can be transliterated to ASCII
+def english_cleaners2(text):
+    '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
+    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
+    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True,
+                         with_stress=True)
+    return phonemes


 def japanese_cleaners(text):
     from text.japanese import japanese_to_romaji_with_accent
-    text = japanese_to_romaji_with_accent(text)
-    text = re.sub(r'([A-Za-z])$', r'\1.', text)
+
+    def clean(text):
+        text = japanese_to_romaji_with_accent(text)
+        text = re.sub(r'([A-Za-z])$', r'\1.', text)
+        return text
+
+    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
@@ -15,20 +82,31 @@ def japanese_cleaners2(text):
 def korean_cleaners(text):
     '''Pipeline for Korean text'''
     from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
-    text = latin_to_hangul(text)
-    text = number_to_hangul(text)
-    text = divide_hangul(text)
-    text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
+
+    def clean(text):
+        text = latin_to_hangul(text)
+        text = number_to_hangul(text)
+        text = divide_hangul(text)
+        text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
+        return text
+
+    text = re.sub(r'\[KO\](.*?)\[KO\]', lambda x: clean(x.group(1)) + ' ', text)
     return text


 def chinese_cleaners(text):
     '''Pipeline for Chinese text'''
-    from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
-    text = number_to_chinese(text)
-    text = chinese_to_bopomofo(text)
-    text = latin_to_bopomofo(text)
-    text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
+    from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, symbols_to_chinese
+
+    def clean(text):
+        text = symbols_to_chinese(text)
+        text = number_to_chinese(text)
+        text = chinese_to_bopomofo(text)
+        text = latin_to_bopomofo(text)
+        text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
+        return text
+
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
@@ -36,9 +114,9 @@ def zh_ja_mixture_cleaners(text):
     from text.mandarin import chinese_to_romaji
     from text.japanese import japanese_to_romaji_with_accent
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_romaji(x.group(1))+' ', text)
+                  lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
-        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
+        x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -57,15 +135,15 @@ def cjks_cleaners(text):
     from text.sanskrit import devanagari_to_ipa
     from text.english import english_to_lazy_ipa
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa(x.group(1))+' ', text)
+                  lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[SA\](.*?)\[SA\]',
-                  lambda x: devanagari_to_ipa(x.group(1))+' ', text)
+                  lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
+                  lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -77,13 +155,13 @@ def cjke_cleaners(text):
     from text.korean import korean_to_ipa
     from text.english import english_to_ipa2
     text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
-        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
+        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
-        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
+        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
-        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
+        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -95,13 +173,28 @@ def cjke_cleaners2(text):
     from text.korean import korean_to_ipa
     from text.english import english_to_ipa2
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
+                  lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
+                  lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[KO\](.*?)\[KO\]',
-                  lambda x: korean_to_ipa(x.group(1))+' ', text)
+                  lambda x: korean_to_ipa(x.group(1)) + ' ', text)
+    text = re.sub(r'\[EN\](.*?)\[EN\]',
+                  lambda x: english_to_ipa2(x.group(1)) + ' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
+    return text
+
+
+def cje_cleaners(text):
+    from text.mandarin import chinese_to_ipa
+    from text.japanese import japanese_to_ipa2
+    from text.english import english_to_ipa2
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
+                  lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]',
+                  lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_ipa2(x.group(1))+' ', text)
+                  lambda x: english_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
@@ -109,15 +202,25 @@ def cjke_cleaners2(text):

 def thai_cleaners(text):
     from text.thai import num_to_thai, latin_to_thai
-    text = num_to_thai(text)
-    text = latin_to_thai(text)
+
+    def clean(text):
+        text = num_to_thai(text)
+        text = latin_to_thai(text)
+        return text
+
+    text = re.sub(r'\[TH\](.*?)\[TH\]', lambda x: clean(x.group(1)) + ' ', text)
     return text


 def shanghainese_cleaners(text):
     from text.shanghainese import shanghainese_to_ipa
-    text = shanghainese_to_ipa(text)
-    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
+
+    def clean(text):
+        text = shanghainese_to_ipa(text)
+        text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
+        return text
+
+    text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: clean(x.group(1)) + ' ', text)
     return text
@@ -129,17 +232,18 @@ def chinese_dialect_cleaners(text):
     from text.english import english_to_lazy_ipa2
     from text.ngu_dialect import ngu_dialect_to_ipa
     text = re.sub(r'\[ZH\](.*?)\[ZH\]',
-                  lambda x: chinese_to_ipa2(x.group(1))+' ', text)
+                  lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[JA\](.*?)\[JA\]',
-                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
+                  lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
     text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
-        '˧˧˦').replace('6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e')+' ', text)
+        '˧˧˦').replace(
+        '6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
     text = re.sub(r'\[GD\](.*?)\[GD\]',
-                  lambda x: cantonese_to_ipa(x.group(1))+' ', text)
+                  lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
     text = re.sub(r'\[EN\](.*?)\[EN\]',
-                  lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
+                  lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
     text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
-        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
+        1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
     text = re.sub(r'\s+$', '', text)
     text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
     return text
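
Every cleaner now shares one dispatch idiom: a non-greedy regex extracts [XX]...[XX] spans and feeds only the inner text to that language's pipeline, so untagged text passes through untouched. A self-contained sketch of just the dispatch (the lambda stands in for the real converters):

import re

def clean_tagged(text, tag, convert):
    # non-greedy (.*?) keeps multiple [ZH]...[ZH] spans separate
    return re.sub(rf'\[{tag}\](.*?)\[{tag}\]', lambda m: convert(m.group(1)) + ' ', text)

print(clean_tagged("[ZH]你好[ZH][ZH]再见[ZH]", "ZH", lambda s: f"<{s}>"))  # <你好> <再见>
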
text/mandarin.py CHANGED
@@ -7,10 +7,9 @@ import cn2an
 import logging

 logging.getLogger('jieba').setLevel(logging.WARNING)
-jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0]))+'/jieba/dict.txt')
+jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0])) + '/jieba/dict.txt')
 jieba.initialize()

-
 # List of (Latin alphabet, bopomofo) pairs:
 _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
     ('a', 'ㄟˉ'),
@@ -236,9 +235,19 @@ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
     ('—', '-')
 ]]

+_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
+    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
+]]
+
+
+def symbols_to_chinese(text):
+    for regex, replacement in _symbols_to_chinese:
+        text = re.sub(regex, replacement, text)
+    return text
+

 def number_to_chinese(text):
-    numbers = re.findall(r'\d+(?:\.?\d+)?', text)
+    numbers = re.findall(r'[0-9]+(?:\.?[0-9]+)?', text)
     for number in numbers:
         text = text.replace(number, cn2an.an2cn(number), 1)
     return text
@@ -286,6 +295,7 @@ def bopomofo_to_ipa2(text):


 def chinese_to_romaji(text):
+    text = symbols_to_chinese(text)
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
@@ -306,6 +316,7 @@ def chinese_to_lazy_ipa(text):


 def chinese_to_ipa(text):
+    text = symbols_to_chinese(text)
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
@@ -319,6 +330,7 @@ def chinese_to_ipa(text):


 def chinese_to_ipa2(text):
+    text = symbols_to_chinese(text)
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
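
The new symbols_to_chinese pass runs before number conversion, so a percentage like 50% is rewritten to 百分之50 first and the digits are then converted by number_to_chinese; without it the bare % would survive into the bopomofo stage. The substitution in isolation:

import re

_symbols_to_chinese = [(re.compile(r'([0-9]+(?:\.?[0-9]+)?)%'), r'百分之\1')]

def symbols_to_chinese(text):
    for regex, replacement in _symbols_to_chinese:
        text = re.sub(regex, replacement, text)
    return text

print(symbols_to_chinese("涨了50%"))  # 涨了百分之50
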
text/shanghainese.py CHANGED
@@ -1,9 +1,9 @@
1
  import re
2
  import cn2an
3
  import opencc
 
4
 
5
-
6
- converter = opencc.OpenCC('zaonhe')
7
 
8
  # List of (Latin alphabet, ipa) pairs:
9
  _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
@@ -35,9 +35,19 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
35
  ('Z', 'zᴇ')
36
  ]]
37
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def _number_to_shanghainese(num):
40
- num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
41
  return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
42
 
43
 
@@ -52,9 +62,10 @@ def latin_to_ipa(text):
52
 
53
 
54
  def shanghainese_to_ipa(text):
 
55
  text = number_to_shanghainese(text.upper())
56
- text = converter.convert(text).replace('-','').replace('$',' ')
57
- text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
58
  text = re.sub(r'[、;:]', ',', text)
59
  text = re.sub(r'\s*,\s*', ', ', text)
60
  text = re.sub(r'\s*。\s*', '. ', text)
 
1
  import re
2
  import cn2an
3
  import opencc
4
+ import config
5
 
6
+ converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/zaonhe')
 
7
 
8
  # List of (Latin alphabet, ipa) pairs:
9
  _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
 
35
  ('Z', 'zᴇ')
36
  ]]
37
 
38
+ _symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
39
+ (r'([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
40
+ ]]
41
+
42
+
43
+ def symbols_to_chinese(text):
44
+ for regex, replacement in _symbols_to_chinese:
45
+ text = re.sub(regex, replacement, text)
46
+ return text
47
+
48
 
49
  def _number_to_shanghainese(num):
50
+ num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
51
  return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
52
 
53
 
 
62
 
63
 
64
  def shanghainese_to_ipa(text):
65
+ text = symbols_to_chinese(text)
66
  text = number_to_shanghainese(text.upper())
67
+ text = converter.convert(text).replace('-', '').replace('$', ' ')
68
+ text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
69
  text = re.sub(r'[、;:]', ',', text)
70
  text = re.sub(r'\s*,\s*', ', ', text)
71
  text = re.sub(r'\s*。\s*', '. ', text)
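
Editor's note: a worked trace of _number_to_shanghainese as reformatted above, showing why the final regex exists; the input is the editor's own example. The replace chain turns every 二 into 两, so a units digit has to be switched back:

import re
import cn2an

def _number_to_shanghainese(num):
    num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
    # A '两' directly after '十' or '廿' is a units digit and should read '二'.
    return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)

# '22' -> '二十二' (cn2an) -> '廿两' (string replacements) -> '廿二' (regex)
print(_number_to_shanghainese('22'))  # 廿二
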
utils/__pycache__/merge.cpython-310.pyc ADDED
Binary file (3.95 kB). View file
 
utils/__pycache__/nlp.cpython-310.pyc ADDED
Binary file (2.41 kB). View file
 
utils/__pycache__/utils.cpython-310.pyc ADDED
Binary file (4.02 kB). View file
 
utils/merge.py ADDED
@@ -0,0 +1,161 @@
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ import config
5
+ import numpy as np
6
+ from utils.utils import check_is_none
7
+ from voice import vits, TTS
8
+
9
+ lang_dict = {
10
+ "english_cleaners": ["en"],
11
+ "english_cleaners2": ["en"],
12
+ "japanese_cleaners": ["ja"],
13
+ "japanese_cleaners2": ["ja"],
14
+ "korean_cleaners": ["ko"],
15
+ "chinese_cleaners": ["zh"],
16
+ "zh_ja_mixture_cleaners": ["zh", "ja"],
17
+ "sanskrit_cleaners": ["sa"],
18
+ "cjks_cleaners": ["zh", "ja", "ko", "sa"],
19
+ "cjke_cleaners": ["zh", "ja", "ko", "en"],
20
+ "cjke_cleaners2": ["zh", "ja", "ko", "en"],
21
+ "cje_cleaners": ["zh", "ja", "en"],
22
+ "thai_cleaners": ["th"],
23
+ "shanghainese_cleaners": ["sh"],
24
+ "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
25
+ "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
26
+ "YB"],
27
+ }
28
+
29
+
30
+ def analysis(model_config_json):
31
+ model_config = json.load(model_config_json)
32
+ symbols = model_config.get("symbols", None)
33
+ emotion_embedding = model_config.get("data").get("emotion_embedding", False)
34
+ if symbols != None:
35
+ if not emotion_embedding:
36
+ model_type = "vits"
37
+ else:
38
+ model_type = "w2v2"
39
+ else:
40
+ model_type = "hubert"  # must match the "hubert" check in merge_model below
41
+ return model_type
42
+
43
+
44
+ def load_npy(model_):
45
+ if isinstance(model_, list):
46
+ # check if is .npy
47
+ for i in model_:
48
+ _model_extention = os.path.splitext(i)[1]
49
+ if _model_extention != ".npy":
50
+ raise ValueError(f"Unsupported model type: {_model_extention}")
51
+
52
+ # merge npy files
53
+ emotion_reference = np.empty((0, 1024))
54
+ for i in model_:
55
+ tmp = np.load(i).reshape(-1, 1024)
56
+ emotion_reference = np.append(emotion_reference, tmp, axis=0)
57
+
58
+ elif os.path.isdir(model_):
59
+ emotion_reference = np.empty((0, 1024))
60
+ for root, dirs, files in os.walk(model_):
61
+ for file_name in files:
62
+ # check if is .npy
63
+ _model_extention = os.path.splitext(file_name)[1]
64
+ if _model_extention != ".npy":
65
+ continue
66
+ file_path = os.path.join(root, file_name)
67
+
68
+ # merge npy files
69
+ tmp = np.load(file_path).reshape(-1, 1024)
70
+ emotion_reference = np.append(emotion_reference, tmp, axis=0)
71
+
72
+ elif os.path.isfile(model_):
73
+ # check if is .npy
74
+ _model_extention = os.path.splitext(model_)[1]
75
+ if _model_extention != ".npy":
76
+ raise ValueError(f"Unsupported model type: {_model_extention}")
77
+
78
+ emotion_reference = np.load(model_)
79
+ logging.info(f"Loaded emotional dimention npy range:{len(emotion_reference)}")
80
+ return emotion_reference
81
+
82
+
83
+ def merge_model(merging_model):
84
+ vits_obj = []
85
+ vits_speakers = []
86
+ hubert_vits_obj = []
87
+ hubert_vits_speakers = []
88
+ w2v2_vits_obj = []
89
+ w2v2_vits_speakers = []
90
+
91
+ # model list
92
+ vits_list = []
93
+ hubert_vits_list = []
94
+ w2v2_vits_list = []
95
+
96
+ for l in merging_model:
97
+ with open(l[1], 'r', encoding='utf-8') as model_config:
98
+ model_type = analysis(model_config)
99
+ if model_type == "vits":
100
+ vits_list.append(l)
101
+ elif model_type == "hubert":
102
+ hubert_vits_list.append(l)
103
+ elif model_type == "w2v2":
104
+ w2v2_vits_list.append(l)
105
+
106
+ # merge vits
107
+ new_id = 0
108
+ for obj_id, i in enumerate(vits_list):
109
+ obj = vits(model=i[0], config=i[1], model_type="vits")
110
+ lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())
111
+
112
+ for id, name in enumerate(obj.return_speakers()):
113
+ vits_obj.append([int(id), obj, obj_id])
114
+ vits_speakers.append({"id": new_id, "name": name, "lang": lang})
115
+ new_id += 1
116
+
117
+ # merge hubert-vits
118
+ if len(hubert_vits_list) != 0:
119
+ if getattr(config, "HUBERT_SOFT_MODEL", None) == None or check_is_none(config.HUBERT_SOFT_MODEL):
120
+ raise ValueError(f"Please configure HUBERT_SOFT_MODEL path in config.py")
121
+ try:
122
+ from hubert_model import hubert_soft
123
+ hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
124
+ except Exception as e:
125
+ raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}")
126
+
127
+ new_id = 0
128
+ for obj_id, i in enumerate(hubert_vits_list):
129
+ obj = vits(model=i[0], config=i[1], model_=hubert, model_type="hubert")
130
+ lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())
131
+
132
+ for id, name in enumerate(obj.return_speakers()):
133
+ hubert_vits_obj.append([int(id), obj, obj_id])
134
+ hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
135
+ new_id += 1
136
+
137
+ # merge w2v2-vits
138
+ if len(w2v2_vits_list) != 0:
139
+ if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
140
+ raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
141
+ try:
142
+ emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
143
+ except Exception as e:
144
+ raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}")
145
+
146
+ new_id = 0
147
+ for obj_id, i in enumerate(w2v2_vits_list):
148
+ obj = vits(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2")
149
+ lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())
150
+
151
+ for id, name in enumerate(obj.return_speakers()):
152
+ w2v2_vits_obj.append([int(id), obj, obj_id])
153
+ w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
154
+ new_id += 1
155
+
156
+ voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
157
+ voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
158
+
159
+ tts = TTS(voice_obj, voice_speakers)
160
+
161
+ return tts
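
Editor's note: judging from the l[0]/l[1] indexing above, merge_model is presumably driven with a list of [model_path, config_path] pairs; a hedged sketch with placeholder paths, not files from this commit:

from utils.merge import merge_model

# Hypothetical model list; each entry is [checkpoint, config].
tts = merge_model([
    ["Model/model_a/G_latest.pth", "Model/model_a/config.json"],
    ["Model/model_b/G_latest.pth", "Model/model_b/config.json"],
])
print(tts.speakers_count)  # total speakers across every merged model
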
utils/nlp.py ADDED
@@ -0,0 +1,82 @@
 
 
1
+ import regex as re
2
+ import logging
3
+ import config
4
+ from fastlid import fastlid
5
+ from .utils import check_is_none
6
+
7
+ logger = logging.getLogger("vits-simple-api")
8
+ level = getattr(config, "LOGGING_LEVEL", "DEBUG")
9
+ level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
10
+ 'CRITICAL': logging.CRITICAL}
11
+ logger.setLevel(level_dict[level])
12
+
13
+
14
+ def clasify_lang(text):
15
+ pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
16
+ r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
17
+ r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
18
+ words = re.split(pattern, text)
19
+
20
+ pre = ""
21
+ p = 0
22
+ for word in words:
23
+
24
+ if check_is_none(word): continue
25
+ lang = fastlid(word)[0]
26
+ if pre == "":
27
+ text = text[:p] + text[p:].replace(word, f'[{lang.upper()}]' + word, 1)
28
+ p += len(f'[{lang.upper()}]')
29
+ elif pre != lang:
30
+ text = text[:p] + text[p:].replace(word, f'[{pre.upper()}][{lang.upper()}]' + word, 1)
31
+ p += len(f'[{pre.upper()}][{lang.upper()}]')
32
+ pre = lang
33
+ p += text[p:].index(word) + len(word)
34
+ text += f"[{pre.upper()}]"
35
+
36
+ return text
37
+
38
+
39
+ def cut(text, max):
40
+ pattern = r'[\!\(\)\,\-\.\/\:\;\?\?\。\,\、\;\:]+'
41
+ sentences = re.split(pattern, text)
42
+ sentence_list = []
43
+ count = 0
44
+ p = 0
45
+ for sentence in sentences:
46
+ count += len(sentence) + 1
47
+ if count >= max:
48
+ sentence_list.append(text[p:p + count])
49
+ p += count
50
+ count = 0
51
+ if p < len(text):
52
+ sentence_list.append(text[p:])
53
+ return sentence_list
54
+
55
+
56
+ def sentence_split(text, max=50, lang="auto", speaker_lang=None):
57
+ # if this speaker supports only one language
58
+ if speaker_lang is not None and len(speaker_lang) == 1:
59
+ if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
60
+ logger.debug(
61
+ f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
62
+ lang = speaker_lang[0]
63
+ else:
64
+ fastlid.set_languages = speaker_lang
65
+
66
+ sentence_list = []
67
+ if lang.upper() != "MIX":
68
+ if max <= 0:
69
+ sentence_list.append(
70
+ clasify_lang(text) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
71
+ else:
72
+ for i in cut(text, max):
73
+ if check_is_none(i): continue
74
+ sentence_list.append(
75
+ clasify_lang(i) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
76
+ else:
77
+ sentence_list.append(text)
78
+
79
+ for i in sentence_list:
80
+ logger.debug(i)
81
+
82
+ return sentence_list
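
Editor's note: an indicative call of the new sentence_split, assuming fastlid is installed. With lang="auto" each detected run is wrapped in cleaner-style language tags; the exact output can vary because fastlid has to guess on short fragments:

from utils.nlp import sentence_split

print(sentence_split('你好,hello', max=50, lang='auto', speaker_lang=['zh', 'en']))
# e.g. ['[ZH]你好,[ZH][EN]hello[EN]']
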
utils/utils.py ADDED
@@ -0,0 +1,112 @@
 
 
1
+ import logging
2
+ import os
3
+ from json import loads
4
+ import av
5
+ from torch import load, FloatTensor
6
+ from numpy import float32
7
+ import librosa
8
+
9
+
10
+ class HParams():
11
+ def __init__(self, **kwargs):
12
+ for k, v in kwargs.items():
13
+ if type(v) == dict:
14
+ v = HParams(**v)
15
+ self[k] = v
16
+
17
+ def keys(self):
18
+ return self.__dict__.keys()
19
+
20
+ def items(self):
21
+ return self.__dict__.items()
22
+
23
+ def values(self):
24
+ return self.__dict__.values()
25
+
26
+ def __len__(self):
27
+ return len(self.__dict__)
28
+
29
+ def __getitem__(self, key):
30
+ return getattr(self, key)
31
+
32
+ def __setitem__(self, key, value):
33
+ return setattr(self, key, value)
34
+
35
+ def __contains__(self, key):
36
+ return key in self.__dict__
37
+
38
+ def __repr__(self):
39
+ return self.__dict__.__repr__()
40
+
41
+
42
+ def load_checkpoint(checkpoint_path, model):
43
+ checkpoint_dict = load(checkpoint_path, map_location='cpu')
44
+ iteration = checkpoint_dict['iteration']
45
+ saved_state_dict = checkpoint_dict['model']
46
+ if hasattr(model, 'module'):
47
+ state_dict = model.module.state_dict()
48
+ else:
49
+ state_dict = model.state_dict()
50
+ new_state_dict = {}
51
+ for k, v in state_dict.items():
52
+ try:
53
+ new_state_dict[k] = saved_state_dict[k]
54
+ except:
55
+ logging.info("%s is not in the checkpoint" % k)
56
+ new_state_dict[k] = v
57
+ if hasattr(model, 'module'):
58
+ model.module.load_state_dict(new_state_dict)
59
+ else:
60
+ model.load_state_dict(new_state_dict)
61
+ logging.info("Loaded checkpoint '{}' (iteration {})".format(
62
+ checkpoint_path, iteration))
63
+ return
64
+
65
+
66
+ def get_hparams_from_file(config_path):
67
+ with open(config_path, 'r', encoding='utf-8') as f:
68
+ data = f.read()
69
+ config = loads(data)
70
+
71
+ hparams = HParams(**config)
72
+ return hparams
73
+
74
+
75
+ def load_audio_to_torch(full_path, target_sampling_rate):
76
+ audio, sampling_rate = librosa.load(full_path, sr=target_sampling_rate, mono=True)
77
+ return FloatTensor(audio.astype(float32))
78
+
79
+
80
+ def wav2ogg(input, output):
81
+ with av.open(input, 'rb') as i:
82
+ with av.open(output, 'wb', format='ogg') as o:
83
+ out_stream = o.add_stream('libvorbis')
84
+ for frame in i.decode(audio=0):
85
+ for p in out_stream.encode(frame):
86
+ o.mux(p)
87
+
88
+ for p in out_stream.encode(None):
89
+ o.mux(p)
90
+
91
+ def wav2mp3(input, output):
92
+ with av.open(input, 'rb') as i:
93
+ with av.open(output, 'wb', format='mp3') as o:
94
+ out_stream = o.add_stream('mp3')
95
+ for frame in i.decode(audio=0):
96
+ for p in out_stream.encode(frame):
97
+ o.mux(p)
98
+
99
+ for p in out_stream.encode(None):
100
+ o.mux(p)
101
+
102
+ def clean_folder(folder_path):
103
+ for filename in os.listdir(folder_path):
104
+ file_path = os.path.join(folder_path, filename)
105
+ # if it is a regular file, delete it
106
+ if os.path.isfile(file_path):
107
+ os.remove(file_path)
108
+
109
+
110
+ # is none -> True, is not none -> False
111
+ def check_is_none(s):
112
+ return s is None or (isinstance(s, str) and str(s).isspace()) or str(s) == ""
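
Editor's note: the PyAV helpers above accept any file-like object, which is how voice.py drives them from memory; a small sketch with placeholder file names:

from io import BytesIO
from utils.utils import wav2ogg

# Transcode an in-memory WAV to OGG; 'in.wav' and 'out.ogg' are illustrative.
with open('in.wav', 'rb') as f:
    wav = BytesIO(f.read())
ogg = BytesIO()
wav2ogg(wav, ogg)
with open('out.ogg', 'wb') as f:
    f.write(ogg.getvalue())
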
vits-simple-api-installer-latest.sh ADDED
@@ -0,0 +1,27 @@
 
 
1
+ INSTALL_DIR=/usr/local/vits-simple-api
2
+
3
+ RED='\033[0;31m'
4
+ GREEN='\033[0;32m'
5
+ YELLOW='\033[0;33m'
6
+ PLAIN='\033[0m'
7
+
8
+ mkdir -p $INSTALL_DIR
9
+ cd $INSTALL_DIR
10
+ if [ ! -f config.py ]; then
11
+ echo -e "${YELLOW}download config.py\n${PLAIN}"
12
+ wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
13
+ fi
14
+
15
+ wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml
16
+
17
+ echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
18
+
19
+ docker compose pull
20
+ docker compose up -d
21
+
22
+ echo -e "\nThe upgrade or installation has been completed."
23
+ echo -e "The configuration file directory is $(realpath $INSTALL_DIR)"
24
+ echo -e "${YELLOW}If the vits model is not imported, it cannot be used. Import the model in the configuration file directory.${PLAIN}"
25
+ echo -e "After modifying the configuration file, restart the docker container for the modification to take effect."
26
+ echo -e "${YELLOW}If you have any questions, please put them in the issues.${PLAIN}"
27
+ echo -e "https://github.com/Artrajz/vits-simple-api"
voice.py CHANGED
@@ -1,32 +1,30 @@
1
  import os
2
-
3
  import librosa
4
- from scipy.io.wavfile import write
5
- from mel_processing import spectrogram_torch
6
- from text import text_to_sequence, _clean_text
7
- from models import SynthesizerTrn
8
- import utils
9
  import commons
10
  import sys
11
  import re
12
  import numpy as np
13
- # import torch
14
- # torch.set_num_threads(1)  # limit torch to one thread so concurrent inference does not crash the service; flask will still use multiple threads
 
 
15
  from torch import no_grad, LongTensor, inference_mode, FloatTensor
16
- import audonnx
17
- import uuid
18
  from io import BytesIO
 
 
19
 
 
 
20
 
21
- class Voice:
22
- def __init__(self, model, config, out_path=None):
23
- self.out_path = out_path
24
- if not os.path.exists(self.out_path):
25
- try:
26
- os.mkdir(self.out_path)
27
- except:
28
- pass
29
 
 
 
 
30
  self.hps_ms = utils.get_hparams_from_file(config)
31
  self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
32
  self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
@@ -42,9 +40,19 @@ class Voice:
42
  emotion_embedding=self.emotion_embedding,
43
  **self.hps_ms.model)
44
  _ = self.net_g_ms.eval()
 
 
 
 
 
45
  utils.load_checkpoint(model, self.net_g_ms)
 
 
 
 
 
46
 
47
- def get_text(self, text, hps, cleaned=False):
48
  if cleaned:
49
  text_norm = text_to_sequence(text, hps.symbols, [])
50
  else:
@@ -54,7 +62,7 @@ class Voice:
54
  text_norm = LongTensor(text_norm)
55
  return text_norm
56
 
57
- def get_label_value(self, text, label, default, warning_name='value'):
58
  value = re.search(rf'\[{label}=(.+?)\]', text)
59
  if value:
60
  try:
@@ -65,16 +73,10 @@ class Voice:
65
  sys.exit(1)
66
  else:
67
  value = default
68
- return value, text
69
-
70
- def ex_return(self, text, escape=False):
71
- if escape:
72
- return text.encode('unicode_escape').decode()
73
  else:
74
- return text
75
-
76
- def return_speakers(self, escape=False):
77
- return self.speakers
78
 
79
  def get_label(self, text, label):
80
  if f'[{label}]' in text:
@@ -82,132 +84,152 @@ class Voice:
82
  else:
83
  return False, text
84
 
85
- def generate(self, text=None, speaker_id=None, format=None, speed=1, audio_path=None, target_id=None, escape=False,
86
- option=None, w2v2_folder=None):
87
- if self.n_symbols != 0:
88
- if not self.emotion_embedding:
89
- length_scale, text = self.get_label_value(text, 'LENGTH', speed, 'length scale')
90
- noise_scale, text = self.get_label_value(text, 'NOISE', 0.667, 'noise scale')
91
- noise_scale_w, text = self.get_label_value(text, 'NOISEW', 0.8, 'deviation of noise')
92
- cleaned, text = self.get_label(text, 'CLEANED')
93
-
94
- stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
95
- with no_grad():
96
- x_tst = stn_tst.unsqueeze(0)
97
- x_tst_lengths = LongTensor([stn_tst.size(0)])
98
- sid = LongTensor([speaker_id])
99
- audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
100
- noise_scale=noise_scale,
101
- noise_scale_w=noise_scale_w,
102
- length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
103
 
 
 
104
  # else:
105
- # w2v2_model = audonnx.load(os.path.dirname(w2v2_folder))
106
- #
107
- # if option == 'clean':
108
- # self.ex_print(_clean_text(
109
- # text, self.hps_ms.data.text_cleaners), escape)
110
- #
111
- # length_scale, text = self.get_label_value(
112
- # text, 'LENGTH', 1, 'length scale')
113
- # noise_scale, text = self.get_label_value(
114
- # text, 'NOISE', 0.667, 'noise scale')
115
- # noise_scale_w, text = self.get_label_value(
116
- # text, 'NOISEW', 0.8, 'deviation of noise')
117
- # cleaned, text = self.get_label(text, 'CLEANED')
118
- #
119
- # stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
120
- #
121
- # emotion_reference = input('Path of an emotion reference: ')
122
- # if emotion_reference.endswith('.npy'):
123
- # emotion = np.load(emotion_reference)
124
- # emotion = FloatTensor(emotion).unsqueeze(0)
125
- # else:
126
- # audio16000, sampling_rate = librosa.load(
127
- # emotion_reference, sr=16000, mono=True)
128
- # emotion = w2v2_model(audio16000, sampling_rate)[
129
- # 'hidden_states']
130
- # emotion_reference = re.sub(
131
- # r'\..*$', '', emotion_reference)
132
- # np.save(emotion_reference, emotion.squeeze(0))
133
- # emotion = FloatTensor(emotion)
134
- #
135
- #
136
- # with no_grad():
137
- # x_tst = stn_tst.unsqueeze(0)
138
- # x_tst_lengths = LongTensor([stn_tst.size(0)])
139
- # sid = LongTensor([speaker_id])
140
- # audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
141
- # noise_scale_w=noise_scale_w,
142
- # length_scale=length_scale, emotion_embedding=emotion)[0][
143
- # 0, 0].data.cpu().float().numpy()
144
-
145
- # else:
146
- # model = input('Path of a hubert-soft Model: ')
147
- # from hubert_model import hubert_soft
148
- # hubert = hubert_soft(model)
149
-
150
- # if audio_path != '[VC]':
151
- # if self.use_f0:
152
- # audio, sampling_rate = librosa.load(
153
- # audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
154
- # audio16000 = librosa.resample(
155
- # audio, orig_sr=sampling_rate, target_sr=16000)
156
- # else:
157
- # audio16000, sampling_rate = librosa.load(
158
- # audio_path, sr=16000, mono=True)
159
- #
160
- # out_path = "H:/git/MoeGoe-Simple-API/upload/hubert.wav"
161
- # length_scale, out_path = self.get_label_value(
162
- # out_path, 'LENGTH', 1, 'length scale')
163
- # noise_scale, out_path = self.get_label_value(
164
- # out_path, 'NOISE', 0.1, 'noise scale')
165
- # noise_scale_w, out_path = self.get_label_value(
166
- # out_path, 'NOISEW', 0.1, 'deviation of noise')
167
- #
168
- # with inference_mode():
169
- # units = hubert.units(FloatTensor(audio16000).unsqueeze(
170
- # 0).unsqueeze(0)).squeeze(0).numpy()
171
- # if self.use_f0:
172
- # f0_scale, out_path = self.get_label_value(
173
- # out_path, 'F0', 1, 'f0 scale')
174
- # f0 = librosa.pyin(audio, sr=sampling_rate,
175
- # fmin=librosa.note_to_hz('C0'),
176
- # fmax=librosa.note_to_hz('C7'),
177
- # frame_length=1780)[0]
178
- # target_length = len(units[:, 0])
179
- # f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
180
- # np.arange(0, len(f0)), f0)) * f0_scale
181
- # units[:, 0] = f0 / 10
182
- #
183
- # stn_tst = FloatTensor(units)
184
- # with no_grad():
185
- # x_tst = stn_tst.unsqueeze(0)
186
- # x_tst_lengths = LongTensor([stn_tst.size(0)])
187
- # sid = LongTensor([target_id])
188
- # audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
189
- # noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
190
- # 0, 0].data.float().numpy()
191
 
192
- with BytesIO() as f:
193
- fname = str(uuid.uuid1())
194
 
195
- if format == 'ogg':
196
- write(f, self.hps_ms.data.sampling_rate, audio)
197
- with BytesIO() as o:
198
- utils.wav2ogg(f, o)
199
- return BytesIO(o.getvalue()), "audio/ogg", fname + ".ogg"
200
- elif format == 'silk':
201
- file_path = self.out_path + "/" + fname + ".wav"
202
- write(file_path, 24000, audio)
203
- silk_path = utils.convert_to_silk(file_path)
204
- os.remove(file_path)
205
- return silk_path, "audio/silk", fname + ".silk"
206
  else:
207
- write(f, self.hps_ms.data.sampling_rate, audio)
208
- return BytesIO(f.getvalue()), "audio/wav", fname + ".wav"
 
 
209
 
210
- def voice_conversion(self, audio_path, original_id, target_id):
 
 
211
 
212
  audio = utils.load_audio_to_torch(
213
  audio_path, self.hps_ms.data.sampling_rate)
@@ -223,9 +245,242 @@ class Voice:
223
 
224
  with no_grad():
225
  sid_tgt = LongTensor([target_id])
226
- audio = self.net_g_ms.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[
227
- 0][0, 0].data.cpu().float().numpy()
 
 
 
 
228
 
 
 
229
  with BytesIO() as f:
230
- write(f, self.hps_ms.data.sampling_rate, audio)
231
- return BytesIO(f.getvalue())
 
 
1
  import os
 
2
  import librosa
 
 
 
 
 
3
  import commons
4
  import sys
5
  import re
6
  import numpy as np
7
+ import torch
8
+ import xml.etree.ElementTree as ET
9
+ import config
10
+ import logging
11
  from torch import no_grad, LongTensor, inference_mode, FloatTensor
 
 
12
  from io import BytesIO
13
+ from graiax import silkcoder
14
+ from utils.nlp import cut, sentence_split
15
+ from scipy.io.wavfile import write
16
+ from mel_processing import spectrogram_torch
17
+ from text import text_to_sequence, _clean_text
18
+ from models import SynthesizerTrn
19
+ from utils import utils
20
 
21
+ # torch.set_num_threads(1)  # limit torch to a single thread
22
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
23
 
 
24
 
25
+ class vits:
26
+ def __init__(self, model, config, model_=None, model_type=None):
27
+ self.model_type = model_type
28
  self.hps_ms = utils.get_hparams_from_file(config)
29
  self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
30
  self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
 
40
  emotion_embedding=self.emotion_embedding,
41
  **self.hps_ms.model)
42
  _ = self.net_g_ms.eval()
43
+
44
+ # load model
45
+ self.load_model(model, model_)
46
+
47
+ def load_model(self, model, model_=None):
48
  utils.load_checkpoint(model, self.net_g_ms)
49
+ self.net_g_ms.to(device)
50
+ if self.model_type == "hubert":
51
+ self.hubert = model_
52
+ elif self.model_type == "w2v2":
53
+ self.emotion_reference = model_
54
 
55
+ def get_cleaned_text(self, text, hps, cleaned=False):
56
  if cleaned:
57
  text_norm = text_to_sequence(text, hps.symbols, [])
58
  else:
 
62
  text_norm = LongTensor(text_norm)
63
  return text_norm
64
 
65
+ def get_label_value(self, label, default, warning_name='value', text=""):
66
  value = re.search(rf'\[{label}=(.+?)\]', text)
67
  if value:
68
  try:
 
73
  sys.exit(1)
74
  else:
75
  value = default
76
+ if text == "":
77
+ return value
 
 
 
78
  else:
79
+ return value, text
 
 
 
80
 
81
  def get_label(self, text, label):
82
  if f'[{label}]' in text:
 
84
  else:
85
  return False, text
86
 
87
+ def get_cleaner(self):
88
+ return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
89
+
90
+ def return_speakers(self, escape=False):
91
+ return self.speakers
92
+
93
+ def infer(self, params):
94
+ emotion = params.get("emotion", None)
95
+
96
+ with no_grad():
97
+ x_tst = params.get("stn_tst").unsqueeze(0)
98
+ x_tst_lengths = LongTensor([params.get("stn_tst").size(0)])
 
 
 
 
 
 
99
 
100
+ audio = self.net_g_ms.infer(x_tst.to(device), x_tst_lengths.to(device), sid=params.get("sid").to(device),
101
+ noise_scale=params.get("noise_scale"),
102
+ noise_scale_w=params.get("noise_scale_w"),
103
+ length_scale=params.get("length_scale"),
104
+ emotion_embedding=emotion.to(device) if emotion != None else None)[0][
105
+ 0, 0].data.float().cpu().numpy()
106
+
107
+ torch.cuda.empty_cache()
108
+ return audio
109
+
110
+ def get_infer_param(self, length, noise, noisew, text=None, speaker_id=None, audio_path=None,
111
+ emotion=None):
112
+ emo = None
113
+ if self.model_type != "hubert":
114
+ length_scale, text = self.get_label_value('LENGTH', length, 'length scale', text)
115
+ noise_scale, text = self.get_label_value('NOISE', noise, 'noise scale', text)
116
+ noise_scale_w, text = self.get_label_value('NOISEW', noisew, 'deviation of noise', text)
117
+ cleaned, text = self.get_label(text, 'CLEANED')
118
+
119
+ stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
120
+ sid = LongTensor([speaker_id])
121
+
122
+ if self.model_type == "w2v2":
123
+ # if emotion_reference.endswith('.npy'):
124
+ # emotion = np.load(emotion_reference)
125
+ # emotion = FloatTensor(emotion).unsqueeze(0)
126
  # else:
127
+ # audio16000, sampling_rate = librosa.load(
128
+ # emotion_reference, sr=16000, mono=True)
129
+ # emotion = self.w2v2(audio16000, sampling_rate)[
130
+ # 'hidden_states']
131
+ # emotion_reference = re.sub(
132
+ # r'\..*$', '', emotion_reference)
133
+ # np.save(emotion_reference, emotion.squeeze(0))
134
+ # emotion = FloatTensor(emotion)
135
+ emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)
 
 
136
 
 
 
137
 
138
+ elif self.model_type == "hubert":
139
+ if self.use_f0:
140
+ audio, sampling_rate = librosa.load(
141
+ audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
142
+ audio16000 = librosa.resample(
143
+ audio, orig_sr=sampling_rate, target_sr=16000)
 
 
 
 
 
144
  else:
145
+ audio16000, sampling_rate = librosa.load(
146
+ audio_path, sr=16000, mono=True)
147
+
148
+ length_scale = self.get_label_value('LENGTH', length, 'length scale')
149
+ noise_scale = self.get_label_value('NOISE', noise, 'noise scale')
150
+ noise_scale_w = self.get_label_value('NOISEW', noisew, 'deviation of noise')
151
+
152
+ with inference_mode():
153
+ units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
154
+ if self.use_f0:
155
+ f0_scale = self.get_label_value('F0', 1, 'f0 scale')
156
+ f0 = librosa.pyin(audio,
157
+ sr=sampling_rate,
158
+ fmin=librosa.note_to_hz('C0'),
159
+ fmax=librosa.note_to_hz('C7'),
160
+ frame_length=1780)[0]
161
+ target_length = len(units[:, 0])
162
+ f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
163
+ np.arange(0, len(f0)), f0)) * f0_scale
164
+ units[:, 0] = f0 / 10
165
+
166
+ stn_tst = FloatTensor(units)
167
+ sid = LongTensor([speaker_id])
168
+ params = {"length_scale": length_scale, "noise_scale": noise_scale,
169
+ "noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
170
+ "sid": sid, "emotion": emo}
171
+ return params
172
+
173
+ def get_audio(self, voice, auto_break=False):
174
+ text = voice.get("text", None)
175
+ speaker_id = voice.get("id", 0)
176
+ length = voice.get("length", 1)
177
+ noise = voice.get("noise", 0.667)
178
+ noisew = voice.get("noisew", 0.8)
179
+ max = voice.get("max", 50)
180
+ lang = voice.get("lang", "auto")
181
+ speaker_lang = voice.get("speaker_lang", None)
182
+ audio_path = voice.get("audio_path", None)
183
+ emotion = voice.get("emotion", 0)
184
 
185
+ # strip all redundant whitespace
186
+ if text is not None: text = re.sub(r'\s+', ' ', text).strip()
187
+
188
+ # pause 0.75 s so that separately synthesized segments do not join abruptly
189
+ brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
190
+
191
+ tasks = []
192
+ if self.model_type == "vits":
193
+ sentence_list = sentence_split(text, max, lang, speaker_lang)
194
+ for sentence in sentence_list:
195
+ tasks.append(
196
+ self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
197
+ noisew=noisew))
198
+ audios = []
199
+
200
+ for task in tasks:
201
+ audios.append(self.infer(task))
202
+ if auto_break:
203
+ audios.append(brk)
204
+
205
+ audio = np.concatenate(audios, axis=0)
206
+
207
+ elif self.model_type == "hubert":
208
+ params = self.get_infer_param(speaker_id=speaker_id, length=length, noise=noise, noisew=noisew,
209
+ audio_path=audio_path)
210
+ audio = self.infer(params)
211
+
212
+ elif self.model_type == "w2v2":
213
+ sentence_list = sentence_split(text, max, lang, speaker_lang)
214
+ for sentence in sentence_list:
215
+ tasks.append(
216
+ self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
217
+ noisew=noisew, emotion=emotion))
218
+
219
+ audios = []
220
+ for task in tasks:
221
+ audios.append(self.infer(task))
222
+ if auto_break:
223
+ audios.append(brk)
224
+
225
+ audio = np.concatenate(audios, axis=0)
226
+
227
+ return audio
228
+
229
+ def voice_conversion(self, voice):
230
+ audio_path = voice.get("audio_path")
231
+ original_id = voice.get("original_id")
232
+ target_id = voice.get("target_id")
233
 
234
  audio = utils.load_audio_to_torch(
235
  audio_path, self.hps_ms.data.sampling_rate)
 
245
 
246
  with no_grad():
247
  sid_tgt = LongTensor([target_id])
248
+ audio = self.net_g_ms.voice_conversion(spec.to(device),
249
+ spec_lengths.to(device),
250
+ sid_src=sid_src.to(device),
251
+ sid_tgt=sid_tgt.to(device))[0][0, 0].data.cpu().float().numpy()
252
+
253
+ torch.cuda.empty_cache()
254
 
255
+ return audio
256
+
257
+
258
+ class TTS:
259
+ def __init__(self, voice_obj, voice_speakers):
260
+ self._voice_obj = voice_obj
261
+ self._voice_speakers = voice_speakers
262
+ self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
263
+ self._speakers_count = sum([len(self._voice_speakers[i]) for i in self._voice_speakers])
264
+ self._vits_speakers_count = len(self._voice_speakers["VITS"])
265
+ self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
266
+ self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
267
+ self.dem = None
268
+ if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) != None:
269
+ try:
270
+ import audonnx
271
+ root = os.path.dirname(config.DIMENSIONAL_EMOTION_MODEL)
272
+ model_file = config.DIMENSIONAL_EMOTION_MODEL
273
+ self.dem = audonnx.load(root=root, model_file=model_file)
274
+ except Exception as e:
275
+ self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")
276
+
277
+ # Initialization information
278
+ self.logger = logging.getLogger("vits-simple-api")
279
+ self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
280
+ self.logger.info(f'device:{device} device.type:{device.type}')
281
+ if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
282
+ if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
283
+ if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
284
+ self.logger.info(f"{self._speakers_count} speakers in total")
285
+ if self._speakers_count == 0:
286
+ self.logger.warning(f"No model was loaded")
287
+
288
+ @property
289
+ def voice_speakers(self):
290
+ return self._voice_speakers
291
+
292
+ @property
293
+ def speakers_count(self):
294
+ return self._speakers_count
295
+
296
+ @property
297
+ def vits_speakers_count(self):
298
+ return self._vits_speakers_count
299
+
300
+ @property
301
+ def hubert_speakers_count(self):
302
+ return self._hubert_speakers_count
303
+
304
+ @property
305
+ def w2v2_speakers_count(self):
306
+ return self._w2v2_speakers_count
307
+
308
+ def encode(self, sampling_rate, audio, format):
309
  with BytesIO() as f:
310
+ write(f, sampling_rate, audio)
311
+ if format.upper() == 'OGG':
312
+ with BytesIO() as o:
313
+ utils.wav2ogg(f, o)
314
+ return BytesIO(o.getvalue())
315
+ elif format.upper() == 'SILK':
316
+ return BytesIO(silkcoder.encode(f))
317
+ elif format.upper() == 'MP3':
318
+ with BytesIO() as o:
319
+ utils.wav2mp3(f, o)
320
+ return BytesIO(o.getvalue())
321
+ elif format.upper() == 'WAV':
322
+ return BytesIO(f.getvalue())
323
+
324
+ def convert_time_string(self, time_string):
325
+ time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
326
+ time_unit = re.findall(r'[a-zA-Z]+', time_string)[0].lower()
327
+
328
+ if time_unit.upper() == 'MS':
329
+ return time_value / 1000
330
+ elif time_unit.upper() == 'S':
331
+ return time_value
332
+ elif time_unit.upper() == 'MIN':
333
+ return time_value * 60
334
+ elif time_unit.upper() == 'H':
335
+ return time_value * 3600
336
+ elif time_unit.upper() == 'D':
337
+ return time_value * 24 * 3600  # surely nobody actually writes D here?
338
+ else:
339
+ raise ValueError("Unsupported time unit: {}".format(time_unit))
340
+
341
+ def parse_ssml(self, ssml):
342
+ root = ET.fromstring(ssml)
343
+ format = root.attrib.get("format", "wav")
344
+ voice_tasks = []
345
+ brk_count = 0
346
+ strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
347
+
348
+ for element in root.iter():
349
+ if element.tag == "voice":
350
+ id = int(element.attrib.get("id", root.attrib.get("id", config.ID)))
351
+ lang = element.attrib.get("lang", root.attrib.get("lang", config.LANG))
352
+ length = float(element.attrib.get("length", root.attrib.get("length", config.LENGTH)))
353
+ noise = float(element.attrib.get("noise", root.attrib.get("noise", config.NOISE)))
354
+ noisew = float(element.attrib.get("noisew", root.attrib.get("noisew", config.NOISEW)))
355
+ max = int(element.attrib.get("max", root.attrib.get("max", "0")))
356
+ # defaults to vits when not specified
357
+ model = element.attrib.get("model", root.attrib.get("model", "vits"))
358
+ # only w2v2-vits/emotion-vits take an emotion
359
+ emotion = int(element.attrib.get("emotion", root.attrib.get("emotion", 0)))
360
+
361
+ voice_element = ET.tostring(element, encoding='unicode')
362
+
363
+ pattern_voice = r'<voice.*?>(.*?)</voice>'
364
+ pattern_break = r'<break\s*?(.*?)\s*?/>'
365
+
366
+ matches_voice = re.findall(pattern_voice, voice_element)[0]
367
+ matches_break = re.split(pattern_break, matches_voice)
368
+ for match in matches_break:
369
+ strength = re.search(r'\s*strength\s*=\s*[\'\"](.*?)[\'\"]', match)
370
+ time = re.search(r'\s*time\s*=\s*[\'\"](.*?)[\'\"]', match)
371
+ # break tag, strength attribute
372
+ if strength:
373
+ brk = strength_dict[strength.group(1)]
374
+ voice_tasks.append({"break": brk})
375
+ brk_count += 1
376
+ # break tag, time attribute
377
+ elif time:
378
+ brk = self.convert_time_string(time.group(1))
379
+ voice_tasks.append({"break": brk})
380
+ brk_count += 1
381
+ # an empty match means a bare break tag; the default pause is 0.75 s
382
+ elif match == "":
383
+ voice_tasks.append({"break": 0.75})
384
+ brk_count += 1
385
+ # whatever is left in the voice tag besides break elements is text
386
+ else:
387
+ voice_tasks.append({"id": id,
388
+ "text": match,
389
+ "lang": lang,
390
+ "length": length,
391
+ "noise": noise,
392
+ "noisew": noisew,
393
+ "max": max,
394
+ "model": model,
395
+ "emotion": emotion
396
+ })
397
+
398
+ # pause 0.75 s at the end of each segment
399
+ voice_tasks.append({"break": 0.75})
400
+ elif element.tag == "break":
401
+ # brk_count > 0 means this break was already handled inside a voice tag
402
+ if brk_count > 0:
403
+ brk_count -= 1
404
+ continue
405
+ brk = strength_dict.get(element.attrib.get("strength"),
406
+ self.convert_time_string(element.attrib.get("time", "750ms")))
407
+ voice_tasks.append({"break": brk})
408
+
409
+ for i in voice_tasks:
410
+ self.logger.debug(i)
411
+
412
+ return voice_tasks, format
413
+
414
+ def create_ssml_infer_task(self, ssml):
415
+ voice_tasks, format = self.parse_ssml(ssml)
416
+
417
+ audios = []
418
+ for voice in voice_tasks:
419
+ if voice.get("break"):
420
+ audios.append(np.zeros(int(voice.get("break") * 22050), dtype=np.int16))
421
+ else:
422
+ model = voice.get("model").upper()
423
+ if model != "VITS" and model != "W2V2-VITS" and model != "EMOTION-VITS":
424
+ raise ValueError(f"Unsupported model: {voice.get('model')}")
425
+ voice_obj = self._voice_obj[model][voice.get("id")][1]
426
+ voice["id"] = self._voice_obj[model][voice.get("id")][0]
427
+
428
+ audios.append(voice_obj.get_audio(voice))
429
+
430
+ audio = np.concatenate(audios, axis=0)
431
+
432
+ return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format), format
433
+
434
+ def vits_infer(self, voice):
435
+ format = voice.get("format", "wav")
436
+ voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
437
+ voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
438
+ audio = voice_obj.get_audio(voice, auto_break=True)
439
+
440
+ return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
441
+
442
+ def hubert_vits_infer(self, voice):
443
+ format = voice.get("format", "wav")
444
+ voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
445
+ voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
446
+ audio = voice_obj.get_audio(voice)
447
+
448
+ return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
449
+
450
+ def w2v2_vits_infer(self, voice):
451
+ format = voice.get("format", "wav")
452
+ voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
453
+ voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
454
+ audio = voice_obj.get_audio(voice, auto_break=True)
455
+
456
+ return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
457
+
458
+ def vits_voice_conversion(self, voice):
459
+ original_id = voice.get("original_id")
460
+ target_id = voice.get("target_id")
461
+ format = voice.get("format")
462
+
463
+ original_id_obj = int(self._voice_obj["VITS"][original_id][2])
464
+ target_id_obj = int(self._voice_obj["VITS"][target_id][2])
465
+
466
+ if original_id_obj != target_id_obj:
467
+ raise ValueError(f"speakers are in diffrent VITS Model")
468
+
469
+ voice["original_id"] = int(self._voice_obj["VITS"][original_id][0])
470
+ voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
471
+
472
+ voice_obj = self._voice_obj["VITS"][original_id][1]
473
+ audio = voice_obj.voice_conversion(voice)
474
+
475
+ return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
476
+
477
+ def get_dimensional_emotion_npy(self, audio):
478
+ if self.dem is None:
479
+ raise ValueError(f"Please configure DIMENSIONAL_EMOTION_MODEL path in config.py")
480
+ audio16000, sampling_rate = librosa.load(audio, sr=16000, mono=True)
481
+ emotion = self.dem(audio16000, sampling_rate)['hidden_states']
482
+ emotion_npy = BytesIO()
483
+ np.save(emotion_npy, emotion.squeeze(0))
484
+ emotion_npy.seek(0)
485
+
486
+ return emotion_npy
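
Editor's note: pieced together from the attributes parse_ssml reads, a payload for create_ssml_infer_task presumably looks like the sketch below. The <speak> root tag name, the speaker ids, and every attribute value are the editor's assumptions, and tts stands for the object returned by utils.merge.merge_model:

# Assumed SSML shape; voice content is kept on one line because the
# <voice.*?>(.*?)</voice> regex above does not match across newlines.
ssml = """
<speak lang="zh" format="wav">
    <voice id="0" model="vits">今天天气真好。<break time="500ms"/>出去走走吧。</voice>
    <break strength="x-strong"/>
    <voice id="1" model="w2v2-vits" emotion="1">好主意!</voice>
</speak>
"""
audio, fmt = tts.create_ssml_infer_task(ssml)  # BytesIO plus the format string
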