update

- app.py +125 -65
- config.py +6 -0
- templates/index.html +258 -234
- utils/utils.py +4 -0
- voice.py +95 -46
app.py
CHANGED
@@ -16,7 +16,8 @@ app.config.from_pyfile("config.py")
 
 scheduler = APScheduler()
 scheduler.init_app(app)
-scheduler.start()
+if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
+    scheduler.start()
 
 logzero.loglevel(logging.WARNING)
 logger = logging.getLogger("vits-simple-api")
@@ -53,7 +54,8 @@ def require_api_key(func):
 @app.route('/', methods=["GET", "POST"])
 def index():
     kwargs = {
-        "speakers": tts.voice_speakers
+        "speakers": tts.voice_speakers,
+        "speakers_count": tts.speakers_count
     }
     return render_template("index.html", **kwargs)
@@ -77,6 +79,7 @@ def voice_vits_api():
             noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
             noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
             max = int(request.args.get("max", app.config.get("MAX", 50)))
+            use_streaming = request.args.get('streaming', False, type=bool)
         elif request.method == "POST":
            content_type = request.headers.get('Content-Type')
            if content_type == 'application/json':
@@ -91,6 +94,7 @@ def voice_vits_api():
                noise = float(data.get("noise", app.config.get("NOISE", 0.667)))
                noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
                max = int(data.get("max", app.config.get("MAX", 50)))
+                use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[VITS] {e}")
         return make_response("parameter error", 400)
@@ -120,23 +124,37 @@ def voice_vits_api():
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
+    if use_streaming and format.upper() != "MP3":
+        format = "mp3"
+        logger.warning("Streaming response only supports MP3 format.")
+
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
+    task = {"text": text,
+            "id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "max": max,
+            "lang": lang,
+            "speaker_lang": speaker_lang}
+
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[VITS] {fname}")
+
+    if use_streaming:
+        audio = tts.stream_vits_infer(task, fname)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        t1 = time.time()
+        audio = tts.vits_infer(task, fname)
+        t2 = time.time()
+        logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/hubert-vits', methods=["POST"])
@@ -150,6 +168,7 @@ def voice_hubert_api():
         length = float(request.form.get("length", app.config.get("LENGTH", 1)))
         noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
         noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+        use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[hubert] {e}")
         return make_response("parameter error", 400)
@@ -168,18 +187,27 @@ def voice_hubert_api():
         return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
 
     file_type = f"audio/{format}"
+    task = {"id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)}
 
     t1 = time.time()
-    output = tts.hubert_vits_infer({"id": id,
-                                    "format": format,
-                                    "length": length,
-                                    "noise": noise,
-                                    "noisew": noisew,
-                                    "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
+    audio = tts.hubert_vits_infer(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[hubert] {fname}")
     logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/w2v2-vits', methods=["GET", "POST"])
@@ -196,6 +224,7 @@ def voice_w2v2_api():
             noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
             max = int(request.args.get("max", app.config.get("MAX", 50)))
             emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
+            use_streaming = request.args.get('streaming', False, type=bool)
         elif request.method == "POST":
            content_type = request.headers.get('Content-Type')
            if content_type == 'application/json':
@@ -211,6 +240,7 @@ def voice_w2v2_api():
                noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
                max = int(data.get("max", app.config.get("MAX", 50)))
                emotion = int(data.get("emotion", app.config.get("EMOTION", 0)))
+                use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[w2v2] {e}")
         return make_response(f"parameter error", 400)
@@ -241,24 +271,37 @@ def voice_w2v2_api():
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
+    if use_streaming and format.upper() != "MP3":
+        format = "mp3"
+        logger.warning("Streaming response only supports MP3 format.")
+
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
+    task = {"text": text,
+            "id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "max": max,
+            "lang": lang,
+            "emotion": emotion,
+            "speaker_lang": speaker_lang}
+
     t1 = time.time()
-    output = tts.w2v2_vits_infer({"text": text,
-                                  "id": id,
-                                  "format": format,
-                                  "length": length,
-                                  "noise": noise,
-                                  "noisew": noisew,
-                                  "max": max,
-                                  "lang": lang,
-                                  "emotion": emotion,
-                                  "speaker_lang": speaker_lang})
+    audio = tts.w2v2_vits_infer(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[W2V2] {fname}")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/conversion', methods=["POST"])
@@ -271,29 +314,35 @@ def vits_voice_conversion_api():
         original_id = int(request.form["original_id"])
         target_id = int(request.form["target_id"])
         format = request.form.get("format", voice.filename.split(".")[1])
+        use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[vits_voice_conversion] {e}")
         return make_response("parameter error", 400)
 
+    logger.info(f"[vits_voice_conversion] original_id:{original_id} target_id:{target_id}")
     fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
     voice.save(audio_path)
     file_type = f"audio/{format}"
+    task = {"audio_path": audio_path,
+            "original_id": original_id,
+            "target_id": target_id,
+            "format": format}
 
-    logger.info(f"[vits_voice_conversion] original_id:{original_id} target_id:{target_id}")
     t1 = time.time()
-    try:
-        output = tts.vits_voice_conversion({"audio_path": audio_path,
-                                            "original_id": original_id,
-                                            "target_id": target_id,
-                                            "format": format})
-    except Exception as e:
-        logger.info(f"[vits_voice_conversion] {e}")
-        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+    audio = tts.vits_voice_conversion(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[Voice conversion] {fname}")
+    logger.info(f"[Voice conversion] finish in {(t2 - t1):.2f}s")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/ssml', methods=["POST"])
@@ -312,20 +361,24 @@ def ssml():
 
     logger.debug(ssml)
 
-    t1 = time.time()
-    try:
-        output, format = tts.create_ssml_infer_task(ssml)
-    except Exception as e:
-        logger.info(f"[ssml] {e}")
-        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
-    t2 = time.time()
-
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
 
+    t1 = time.time()
+    audio, format = tts.create_ssml_infer_task(ssml, fname)
+    t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[ssml] {fname}")
     logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
 
-    return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
+    if eval(ssml.get('streaming', False)):
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/dimension-emotion', methods=["POST"])
@@ -333,6 +386,7 @@ def dimensional_emotion():
     if request.method == "POST":
         try:
             audio = request.files['upload']
+            use_streaming = request.form.get('streaming', False, type=bool)
         except Exception as e:
             logger.error(f"[dimensional_emotion] {e}")
             return make_response("parameter error", 400)
@@ -341,9 +395,15 @@ def dimensional_emotion():
 
     file_type = "application/octet-stream; charset=ascii"
     fname = os.path.splitext(audio.filename)[0] + ".npy"
+    audio = tts.get_dimensional_emotion_npy(content)
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/check', methods=["GET", "POST"])
@@ -400,7 +460,8 @@ def check():
 
 
 # regular cleaning
-@scheduler.task('interval', id='clean_task', seconds=
+@scheduler.task('interval', id='clean_task', seconds=app.config.get("CLEAN_INTERVAL_SECONDS", 3600),
+                misfire_grace_time=900)
 def clean_task():
     clean_folder(app.config["UPLOAD_FOLDER"])
     clean_folder(app.config["CACHE_PATH"])
@@ -409,4 +470,3 @@ def clean_task():
 
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False))  # open to the network
     # app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True)  # run locally / debug
-
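A note on the new `streaming` flag: it is parsed with `type=bool`, and `bool()` of any non-empty string is `True`, so `streaming=false` still enables streaming; clients should omit the parameter to disable it. The JSON branch also appears to read the flag from `request.form` rather than the parsed JSON body. A minimal client sketch for the new streaming endpoint (host, port, and speaker id are illustrative, not from the commit):

# Client-side sketch: fetch /voice/vits with streaming enabled and write
# chunks to disk as they arrive. URL and id values are examples only.
import requests

url = "http://127.0.0.1:23456/voice/vits"
params = {"text": "你好,こんにちは", "id": 164, "streaming": "true"}

with requests.get(url, params=params, stream=True) as resp:
    resp.raise_for_status()
    with open("output.mp3", "wb") as f:
        for chunk in resp.iter_content(chunk_size=4096):  # mirrors the server's 4096-byte chunks
            f.write(chunk)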
config.py
CHANGED
@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
 # Cache path
 CACHE_PATH = ABS_PATH + "/cache"
 
+# If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
+CLEAN_INTERVAL_SECONDS = 3600
+
+# Save audio to CACHE_PATH
+SAVE_AUDIO = False
+
 # zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
 LANGUAGE_AUTOMATIC_DETECT = []

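How the two new settings are consumed, condensed from the app.py hunks above (a sketch, not additional committed code):

from flask import Flask
from flask_apscheduler import APScheduler

app = Flask(__name__)
app.config.from_pyfile("config.py")

scheduler = APScheduler()
scheduler.init_app(app)
# clean_task is still registered via @scheduler.task, but the scheduler only
# starts when the interval is positive, so CLEAN_INTERVAL_SECONDS <= 0
# disables periodic cleaning; SAVE_AUDIO gates the cache writes in voice.py.
if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
    scheduler.start()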
templates/index.html
CHANGED
@@ -1,237 +1,261 @@
 <!DOCTYPE html>
 <html lang="en">
+<head>
+    <meta charset="UTF-8"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+    <title>vits-simple-api</title>
+
+    <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
+</head>
+<body>
+<main style="margin: 0 auto; width: 1024px">
+    <h1>
+        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+           style="text-decoration: none; color: black"> vits-simple-api </a>
+    </h1>
+
+    <div>
+        <label>Docs:</label>
+        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+           style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
+    </div>
+    <div>
+        <label>Speakers list (JSON):</label>
+        <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
+           style="text-decoration: none; color: black">
+            https://artrajz-vits-simple-api.hf.space/voice/speakers
+        </a>
+    </div>
+    <div>
+        <label>Simple API call:</label>
+        <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
+           style="text-decoration: none; color: black">
+            https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
+        </a>
+    </div>
+
+    <!-- <div style="display: flex; justify-content: center; align-items: center"> -->
+    <div>
+        <form>
+            <div class="form-group">
+                <label>text</label>
+                <textarea class="form-control" id="inputText" rows="3" oninput="updateLink()">你好,こんにちは</textarea>
+            </div>
+            <div class="form-group">
+                <label>id</label>
+                <select class="form-control" id="inputId" oninput="updateLink()">
+                    {% for speaker in speakers["VITS"] %}
+                    {% if speaker["name"] == "雷电将军(雷神)" %}
+                    <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
+                        | {{ speaker["lang"] }}</option>
+                    {% else %}
+                    <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
+                        | {{ speaker["lang"] }}</option>
+                    {% endif %}
+                    {% endfor %}
+                </select>
+            </div>
+        </form>
+    </div>
+    <p>
+        <button class="btn btn-primary" type="button" data-toggle="collapse" data-target="#collapseExample"
+                aria-expanded="false" aria-controls="collapseExample">
+            Advanced
+        </button>
+        {% if speakers_count == 0 %}
+        <div style="color: red;">No models loaded</div>
+        {% endif %}
+    </p>
+    <div class="collapse" id="collapseExample">
+        <div class="card card-body">
+            <form>
+                <div class="form-group">
+                    <label>format</label>
+                    <select class="form-control" id="inputFormat" oninput="updateLink()">
+                        <option></option>
+                        <option>wav</option>
+                        <option>mp3</option>
+                        <option>ogg</option>
+                        <option>silk</option>
+                    </select>
+                </div>
+                <div class="form-group">
+                    <label>lang</label>
+                    <input type="text" class="form-control" id="inputLang" oninput="updateLink()" value=""
+                           placeholder="auto"/>
+                </div>
+                <div class="form-group">
+                    <label>length</label>
+                    <input type="text" class="form-control" id="inputLength" oninput="updateLink()" value=""
+                           placeholder="1"/>
+                </div>
+                <div class="form-group">
+                    <label>noise</label>
+                    <input type="text" class="form-control" id="inputNoise" oninput="updateLink()" value=""
+                           placeholder="0.33"/>
+                </div>
+                <div class="form-group">
+                    <label>noisew</label>
+                    <input type="text" class="form-control" id="inputNoisew" oninput="updateLink()" value=""
+                           placeholder="0.4"/>
+                </div>
+                <div class="form-group">
+                    <label>max</label>
+                    <input type="text" class="form-control" id="inputMax" oninput="updateLink()" value=""
+                           placeholder="50"/>
+                </div>
+            </form>
+        </div>
+    </div>
+
+    <div style="display: flex; justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
+        <button type="button" class="btn btn-outline-secondary" id="getAudio" style="margin-right: 10px">Generate and play</button>
+        <audio id="audioPlayer" controls>
+            <source src="" type="audio/mp3"/>
+            Your browser does not support the audio element.
+        </audio>
+        <div class="form-group form-check">
+            <input type="checkbox" id="streaming">
+            <label class="form-check-label">Streaming response</label>
+        </div>
+    </div>
+    <div>Automatic language detection: which languages can be detected depends on the speaker; dialects cannot be detected automatically.</div>
+    <div>Dialect models need the language specified manually, e.g. Cantonese requires lang=gd.</div>
+    <br/>
+
+    <h2>All models were collected from the internet. Thanks to the original model authors for their work!</h2>
+    <p>
+        Nene_Nanami_Rong_Tang:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        louise:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        Cantonese:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        shanghainese:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        w2v2-vits:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        vctk:
+        <a href="https://github.com/jaywalnut310/vits" rel="noreferrer" target="_blank">jaywalnut310/vits</a>
+    </p>
+    <p>
+        Bishojo Mangekyo:
+        <a href="https://github.com/Francis-Komizu/VITS" rel="noreferrer" target="_blank">Francis-Komizu/VITS</a>
+    </p>
+    <p>
+        genshin:
+        <a href="https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai" rel="noreferrer" target="_blank">zomehwh/vits-uma-genshin-honkai</a>
+    </p>
+    <p>
+        paimon:
+        <a href="https://github.com/zixiiu/Digital_Life_Server" rel="noreferrer" target="_blank">zixiiu/Digital_Life_Server</a>
+    </p>
+    <p>
+        vits_chinese:
+        <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
+    </p>
+
+</main>
+
+<script src="/static/js/jquery.slim.min.js"></script>
+<script src="/static/js/bootstrap.bundle.min.js"></script>
+
+<script>
+    function getProtocol() {
+        return 'https:' == location.protocol ? "https://" : "http://";
+    }
+
+    function getUrl() {
+        var url = window.location.host;
+        return url;
+    }
+
+    var baseUrl = getProtocol() + getUrl();
+
+    setBaseUrl();
+
+    function setBaseUrl() {
+        var text = document.getElementById("inputText").value;
+        var id = document.getElementById("inputId").value;
+
+        var vitsLink = document.getElementById("vitsLink");
+        var speakersLink = document.getElementById("speakersLink");
+
+        var vitsUrl = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        var speakersUrl = baseUrl + "/voice/speakers";
+
+        vitsLink.href = vitsUrl;
+        vitsLink.textContent = vitsUrl;
+
+        speakersLink.href = speakersUrl;
+        speakersLink.textContent = speakersUrl;
+    }
+
+    function getLink() {
+        var text = document.getElementById("inputText").value;
+        var id = document.getElementById("inputId").value;
+        var format = document.getElementById("inputFormat").value;
+        var lang = document.getElementById("inputLang").value;
+        var length = document.getElementById("inputLength").value;
+        var noise = document.getElementById("inputNoise").value;
+        var noisew = document.getElementById("inputNoisew").value;
+        var max = document.getElementById("inputMax").value;
+
+        var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        if (format != "") {
+            url += "&format=" + format;
+        }
+        if (lang != "") {
+            url += "&lang=" + lang;
+        }
+        if (length != "") {
+            url += "&length=" + length;
+        }
+        if (noise != "") {
+            url += "&noise=" + noise;
+        }
+        if (noisew != "") {
+            url += "&noisew=" + noisew;
+        }
+        if (max != "") {
+            url += "&max=" + max;
+        }
+        return url;
+    }
+
+    function updateLink() {
+        var url = getLink();
+        var link = document.getElementById("vitsLink");
+        link.href = url;
+        link.textContent = url;
+    }
+
+    function setAudioSource() {
+        var streaming = document.getElementById('streaming');
+        var url = getLink();
+        if (streaming.checked) {
+            url += '&streaming=true';
+        }
+
+        var audioPlayer = document.getElementById("audioPlayer");
+        audioPlayer.src = url;
+        audioPlayer.play();
+    }
+
+    var button = document.getElementById("getAudio");
+    button.addEventListener("click", function () {
+        setAudioSource();
+    });
+</script>
+</body>
 </html>

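The page assembles the /voice/vits URL by plain string concatenation and appends &streaming=true when the checkbox is ticked, feeding the result straight into the audio element. A hypothetical Python equivalent of getLink()/setAudioSource() (this helper is illustrative and, unlike the page, percent-encodes the query):

from urllib.parse import urlencode

def build_vits_url(base_url, text, id, streaming=False, **optional):
    # optional: format, lang, length, noise, noisew, max - only sent when set
    params = {"text": text, "id": id}
    params.update({k: v for k, v in optional.items() if v not in ("", None)})
    if streaming:
        params["streaming"] = "true"
    return f"{base_url}/voice/vits?{urlencode(params)}"

print(build_vits_url("https://artrajz-vits-simple-api.hf.space", "你好,こんにちは", 164, streaming=True))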
utils/utils.py
CHANGED
@@ -89,3 +89,7 @@ def clean_folder(folder_path):
 # is none -> True, is not none -> False
 def check_is_none(s):
     return s is None or (isinstance(s, str) and str(s).isspace()) or str(s) == ""
+
+
+def save_audio(audio, path):
+    with open(path, "wb") as f:
+        f.write(audio)

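save_audio writes raw bytes; the new voice.py methods feed it encoded_audio.getvalue() from a BytesIO. A small usage sketch (the byte string and path are stand-ins, not real data):

from io import BytesIO

encoded_audio = BytesIO(b"\xff\xfb\x90\x00")  # stand-in for the BytesIO returned by TTS.encode()
save_audio(encoded_audio.getvalue(), "cache/example.mp3")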
voice.py
CHANGED
@@ -1,7 +1,6 @@
 import os
 import librosa
 import commons
-import sys
 import re
 import numpy as np
 import torch
@@ -156,7 +155,7 @@ class vits:
 
         return params
 
-    def get_audio(self, voice, auto_break=False):
+    def get_tasks(self, voice):
         text = voice.get("text", None)
         speaker_id = voice.get("id", 0)
         length = voice.get("length", 1)
@@ -171,47 +170,57 @@ class vits:
         # strip all redundant whitespace
         if text is not None: text = re.sub(r'\s+', ' ', text).strip()
 
-        # pause 0.75 s between segments to avoid abrupt joins after concatenation
-        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
         tasks = []
         if self.model_type == "vits":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
+                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
+                                              noise_scale=noise, noise_scale_w=noisew)
+                tasks.append(params)
-            audios = []
-            for task in tasks:
-                audios.append(self.infer(task))
-                if auto_break:
-                    audios.append(brk)
-
-            audio = np.concatenate(audios, axis=0)
 
         elif self.model_type == "hubert":
             params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
                                           noise_scale_w=noisew, audio_path=audio_path)
+            tasks.append(params)
 
         elif self.model_type == "w2v2":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
+                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
+                                              noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
+                tasks.append(params)
-            for task in tasks:
-                audios.append(self.infer(task))
-                if auto_break:
-                    audios.append(brk)
 
+        return tasks
+
+    def get_audio(self, voice, auto_break=False):
+        tasks = self.get_tasks(voice)
+        # pause 0.75 s between segments to avoid abrupt joins after concatenation
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        audios = []
+        for task in tasks:
+            if auto_break:
+                chunk = np.concatenate((self.infer(task), brk), axis=0)
+            else:
+                chunk = self.infer(task)
+            audios.append(chunk)
 
+        audio = np.concatenate(audios, axis=0)
         return audio
 
+    def get_stream_audio(self, voice, auto_break=False):
+        tasks = self.get_tasks(voice)
+
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        for task in tasks:
+            if auto_break:
+                chunk = np.concatenate((self.infer(task), brk), axis=0)
+            else:
+                chunk = self.infer(task)
+
+            yield chunk
+
     def voice_conversion(self, voice):
         audio_path = voice.get("audio_path")
         original_id = voice.get("original_id")
@@ -330,6 +339,14 @@ class TTS:
         else:
             raise ValueError("Unsupported time unit: {}".format(time_unit))
 
+    def generate_audio_chunks(self, audio):
+        chunk_size = 4096
+        while True:
+            chunk = audio.read(chunk_size)
+            if not chunk:
+                break
+            yield chunk
+
     def parse_ssml(self, ssml):
         root = ET.fromstring(ssml)
         format = root.attrib.get("format", "wav")
@@ -403,7 +420,7 @@ class TTS:
 
         return voice_tasks, format
 
-    def create_ssml_infer_task(self, ssml):
+    def create_ssml_infer_task(self, ssml, fname):
         voice_tasks, format = self.parse_ssml(ssml)
 
         audios = []
@@ -420,38 +437,66 @@ class TTS:
         audios.append(audio)
 
         audio = np.concatenate(audios, axis=0)
+        encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio, format
 
-    def vits_infer(self, voice):
+    def vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice, auto_break=True)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def hubert_vits_infer(self, voice):
+    def stream_vits_infer(self, voice, fname):
+        format = voice.get("format", "wav")
+        voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
+        voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
+        generator = voice_obj.get_stream_audio(voice, auto_break=True)
+        audio = BytesIO()
+        for chunk in generator:
+            encoded_audio = self.encode(sampling_rate, chunk, format)
+            for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
+                yield encoded_audio_chunk
+            if config.SAVE_AUDIO:
+                audio.write(encoded_audio.getvalue())
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(audio.getvalue(), path)
+
+    def hubert_vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def w2v2_vits_infer(self, voice):
+    def w2v2_vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice, auto_break=True)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def vits_voice_conversion(self, voice):
+    def vits_voice_conversion(self, voice, fname):
         original_id = voice.get("original_id")
         target_id = voice.get("target_id")
         format = voice.get("format")
@@ -466,10 +511,14 @@ class TTS:
         voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
 
         voice_obj = self._voice_obj["VITS"][original_id][1]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
 
-        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        audio = voice_obj.voice_conversion(voice)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
     def get_dimensional_emotion_npy(self, audio):
        if self.dem is None:
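Two design points worth noting. First, stream_vits_infer encodes each sentence segment independently and concatenates the byte streams on the wire; this is why the endpoints force MP3 for streaming - MP3 frames can simply be appended, whereas formats like WAV carry a single header. Second, the whole feature rests on Flask accepting a generator as a response body. A standalone sketch of that pattern (names are examples, not from the commit):

from flask import Flask, make_response

app = Flask(__name__)

def chunk_generator():
    # Stand-in for stream_vits_infer: yield encoded chunks one at a time.
    for i in range(3):
        yield f"chunk {i}\n".encode()

@app.route("/stream-demo")
def stream_demo():
    # Flask turns a generator body into a streamed (chunked) response.
    response = make_response(chunk_generator())
    response.headers['Content-Type'] = "audio/mp3"
    response.headers['Content-Disposition'] = "attachment; filename=demo.mp3"
    return response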