Spaces: vits-simple-api — commit "update"

Files changed:
- app.py +125 -65
- config.py +6 -0
- templates/index.html +258 -234
- utils/utils.py +4 -0
- voice.py +95 -46
app.py
CHANGED

@@ -16,7 +16,8 @@ app.config.from_pyfile("config.py")
 
 scheduler = APScheduler()
 scheduler.init_app(app)
+if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
+    scheduler.start()
 
 logzero.loglevel(logging.WARNING)
 logger = logging.getLogger("vits-simple-api")

@@ -53,7 +54,8 @@ def require_api_key(func):
 @app.route('/', methods=["GET", "POST"])
 def index():
     kwargs = {
-        "speakers": tts.voice_speakers
+        "speakers": tts.voice_speakers,
+        "speakers_count": tts.speakers_count
     }
     return render_template("index.html", **kwargs)

@@ -77,6 +79,7 @@ def voice_vits_api():
             noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
             noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
             max = int(request.args.get("max", app.config.get("MAX", 50)))
+            use_streaming = request.args.get('streaming', False, type=bool)
         elif request.method == "POST":
             content_type = request.headers.get('Content-Type')
             if content_type == 'application/json':

@@ -91,6 +94,7 @@ def voice_vits_api():
                noise = float(data.get("noise", app.config.get("NOISE", 0.667)))
                noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
                max = int(data.get("max", app.config.get("MAX", 50)))
+                use_streaming = request.form.get('streaming', False, type=bool)
    except Exception as e:
        logger.error(f"[VITS] {e}")
        return make_response("parameter error", 400)

@@ -120,23 +124,37 @@ def voice_vits_api():
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
+    if use_streaming and format.upper() != "MP3":
+        format = "mp3"
+        logger.warning("Streaming response only supports MP3 format.")
+
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
+    task = {"text": text,
+            "id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "max": max,
+            "lang": lang,
+            "speaker_lang": speaker_lang}
+
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[VITS] {fname}")
+
+    if use_streaming:
+        audio = tts.stream_vits_infer(task, fname)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        t1 = time.time()
+        audio = tts.vits_infer(task, fname)
+        t2 = time.time()
+        logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
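
The streaming branch above hands a generator to make_response, so the client receives MP3 data while synthesis is still running. A minimal client sketch (not part of the commit; the host, the default port 23456, and the speaker id are illustrative):

    import requests

    # fetch /voice/vits with streaming=true and write chunks as they arrive
    url = "http://127.0.0.1:23456/voice/vits"
    params = {"text": "你好,こんにちは", "id": 164, "streaming": "true"}
    with requests.get(url, params=params, stream=True) as r:
        r.raise_for_status()
        with open("out.mp3", "wb") as f:  # streaming responses are forced to MP3
            for chunk in r.iter_content(chunk_size=4096):
                f.write(chunk)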
@@ -150,6 +168,7 @@ def voice_hubert_api():
         length = float(request.form.get("length", app.config.get("LENGTH", 1)))
         noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
         noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
+        use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[hubert] {e}")
         return make_response("parameter error", 400)

@@ -168,18 +187,27 @@ def voice_hubert_api():
         return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
 
     file_type = f"audio/{format}"
+    task = {"id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)}
 
     t1 = time.time()
-                                    "format": format,
-                                    "length": length,
-                                    "noise": noise,
-                                    "noisew": noisew,
-                                    "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
+    audio = tts.hubert_vits_infer(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[hubert] {fname}")
     logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)

@@ -196,6 +224,7 @@ def voice_w2v2_api():
             noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
             max = int(request.args.get("max", app.config.get("MAX", 50)))
             emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
+            use_streaming = request.args.get('streaming', False, type=bool)
         elif request.method == "POST":
             content_type = request.headers.get('Content-Type')
             if content_type == 'application/json':

@@ -211,6 +240,7 @@ def voice_w2v2_api():
                noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
                max = int(data.get("max", app.config.get("MAX", 50)))
                emotion = int(data.get("emotion", app.config.get("EMOTION", 0)))
+                use_streaming = request.form.get('streaming', False, type=bool)
    except Exception as e:
        logger.error(f"[w2v2] {e}")
        return make_response(f"parameter error", 400)

@@ -241,24 +271,37 @@ def voice_w2v2_api():
     if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
         speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
 
+    if use_streaming and format.upper() != "MP3":
+        format = "mp3"
+        logger.warning("Streaming response only supports MP3 format.")
+
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
+    task = {"text": text,
+            "id": id,
+            "format": format,
+            "length": length,
+            "noise": noise,
+            "noisew": noisew,
+            "max": max,
+            "lang": lang,
+            "emotion": emotion,
+            "speaker_lang": speaker_lang}
+
     t1 = time.time()
-                                 "id": id,
-                                 "format": format,
-                                 "length": length,
-                                 "noise": noise,
-                                 "noisew": noisew,
-                                 "max": max,
-                                 "lang": lang,
-                                 "emotion": emotion,
-                                 "speaker_lang": speaker_lang})
+    audio = tts.w2v2_vits_infer(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[W2V2] {fname}")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)

@@ -271,29 +314,35 @@ def vits_voice_conversion_api():
         original_id = int(request.form["original_id"])
         target_id = int(request.form["target_id"])
         format = request.form.get("format", voice.filename.split(".")[1])
+        use_streaming = request.form.get('streaming', False, type=bool)
     except Exception as e:
         logger.error(f"[vits_voice_convertsion] {e}")
         return make_response("parameter error", 400)
 
+    logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
     fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
     audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
     voice.save(audio_path)
     file_type = f"audio/{format}"
+    task = {"audio_path": audio_path,
+            "original_id": original_id,
+            "target_id": target_id,
+            "format": format}
 
-    logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
     t1 = time.time()
-        output = tts.vits_voice_conversion({"audio_path": audio_path,
-                                            "original_id": original_id,
-                                            "target_id": target_id,
-                                            "format": format})
-    except Exception as e:
-        logger.info(f"[vits_voice_convertsion] {e}")
-        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
+    audio = tts.vits_voice_conversion(task, fname)
     t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[Voice conversion] {fname}")
+    logger.info(f"[Voice conversion] finish in {(t2 - t1):.2f}s")
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)

@@ -312,20 +361,24 @@ def ssml():
 
     logger.debug(ssml)
 
-    t1 = time.time()
-    try:
-        output, format = tts.create_ssml_infer_task(ssml)
-    except Exception as e:
-        logger.info(f"[ssml] {e}")
-        return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
-    t2 = time.time()
-
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
 
+    t1 = time.time()
+    audio, format = tts.create_ssml_infer_task(ssml, fname)
+    t2 = time.time()
+    if app.config.get("SAVE_AUDIO", False):
+        logger.debug(f"[ssml] {fname}")
     logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
 
+    if eval(ssml.get('streaming', False)):
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)

@@ -333,6 +386,7 @@ def dimensional_emotion():
     if request.method == "POST":
         try:
             audio = request.files['upload']
+            use_streaming = request.form.get('streaming', False, type=bool)
         except Exception as e:
             logger.error(f"[dimensional_emotion] {e}")
             return make_response("parameter error", 400)

@@ -341,9 +395,15 @@ def dimensional_emotion():
 
     file_type = "application/octet-stream; charset=ascii"
     fname = os.path.splitext(audio.filename)[0] + ".npy"
+    audio = tts.get_dimensional_emotion_npy(content)
+    if use_streaming:
+        audio = tts.generate_audio_chunks(audio)
+        response = make_response(audio)
+        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
+        response.headers['Content-Type'] = file_type
+        return response
+    else:
+        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)

@@ -400,7 +460,8 @@ def check():
 
 
 # regular cleaning
-@scheduler.task('interval', id='clean_task', seconds=
+@scheduler.task('interval', id='clean_task', seconds=app.config.get("CLEAN_INTERVAL_SECONDS", 3600),
+                misfire_grace_time=900)
 def clean_task():
     clean_folder(app.config["UPLOAD_FOLDER"])
     clean_folder(app.config["CACHE_PATH"])

@@ -409,4 +470,3 @@ def clean_task():
 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False))  # externally accessible
     # app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True)  # run locally / debugging
-
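
The /voice/dimension-emotion handler above reads its reference audio from a multipart field named 'upload' and answers with .npy bytes. A hedged usage sketch (host and file names are illustrative, not part of the commit):

    import requests

    # upload a reference wav, save the returned dimensional-emotion embedding
    with open("reference.wav", "rb") as f:
        r = requests.post("http://127.0.0.1:23456/voice/dimension-emotion",
                          files={"upload": f})
    r.raise_for_status()
    with open("reference.npy", "wb") as f:
        f.write(r.content)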
config.py
CHANGED

@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
 # Cache path
 CACHE_PATH = ABS_PATH + "/cache"
 
+# If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
+CLEAN_INTERVAL_SECONDS = 3600
+
+# Save audio to CACHE_PATH
+SAVE_AUDIO = False
+
 # zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
 LANGUAGE_AUTOMATIC_DETECT = []
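
The two new keys work together with the scheduler change in app.py: scheduler.start() only runs when CLEAN_INTERVAL_SECONDS is positive. An illustrative override (values are examples, not the shipped defaults):

    # keep every synthesized file and never schedule clean_task
    CLEAN_INTERVAL_SECONDS = -1  # <= 0 also means scheduler.start() is skipped in app.py
    SAVE_AUDIO = True            # inference results are additionally written to CACHE_PATH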
templates/index.html
CHANGED

@@ -1,237 +1,261 @@
 <!DOCTYPE html>
 <html lang="en">
+<head>
+    <meta charset="UTF-8"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
+    <title>vits-simple-api</title>
+
+    <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
+</head>
+<body>
+<main style="margin: 0 auto; width: 1024px">
+    <h1>
+        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+           style="text-decoration: none; color: black"> vits-simple-api </a>
+    </h1>
+
+    <div>
+        <label>文档:</label>
+        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+           style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
+    </div>
+    <div>
+        <label>返回speakers(json):</label>
+        <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
+           style="text-decoration: none; color: black">
+            https://artrajz-vits-simple-api.hf.space/voice/speakers
+        </a>
+    </div>
+    <div>
+        <label>简单调用api:</label>
+        <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
+           style="text-decoration: none; color: black">
+            https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
+        </a>
+    </div>
+
+    <!-- <div style="display: flex; justify-content: center; align-items: center"> -->
+    <div>
+        <form>
+            <div class="form-group">
+                <label>text</label>
+                <textarea class="form-control" id="inputText" rows="3" oninput="updateLink()">你好,こんにちは</textarea>
+            </div>
+            <div class="form-group">
+                <label>id</label>
+                <select class="form-control" id="inputId" oninput="updateLink()">
+                    {% for speaker in speakers["VITS"] %}
+                        {% if speaker["name"] == "雷电将军(雷神)" %}
+                            <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
+                                | {{ speaker["lang"] }}</option>
+                        {% else %}
+                            <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
+                                | {{ speaker["lang"] }}</option>
+                        {% endif %}
+                    {% endfor %}
+                </select>
+            </div>
+        </form>
+    </div>
+    <p>
+        <button class="btn btn-primary" type="button" data-toggle="collapse" data-target="#collapseExample"
+                aria-expanded="false" aria-controls="collapseExample">
+            Advanced
+        </button>
+        {% if speakers_count == 0 %}
+            <div style="color: red;">未加载任何模型</div>
+        {% endif %}
+    </p>
+    <div class="collapse" id="collapseExample">
+        <div class="card card-body">
+            <form>
+                <div class="form-group">
+                    <label>format</label>
+                    <select class="form-control" id="inputFormat" oninput="updateLink()">
+                        <option></option>
+                        <option>wav</option>
+                        <option>mp3</option>
+                        <option>ogg</option>
+                        <option>silk</option>
+                    </select>
+                </div>
+                <div class="form-group">
+                    <label>lang</label>
+                    <input type="text" class="form-control" id="inputLang" oninput="updateLink()" value=""
+                           placeholder="auto"/>
+                </div>
+                <div class="form-group">
+                    <label>length</label>
+                    <input type="text" class="form-control" id="inputLength" oninput="updateLink()" value=""
+                           placeholder="1"/>
+                </div>
+                <div class="form-group">
+                    <label>noise</label>
+                    <input type="text" class="form-control" id="inputNoise" oninput="updateLink()" value=""
+                           placeholder="0.33"/>
+                </div>
+                <div class="form-group">
+                    <label>noisew</label>
+                    <input type="text" class="form-control" id="inputNoisew" oninput="updateLink()" value=""
+                           placeholder="0.4"/>
+                </div>
+                <div class="form-group">
+                    <label>max</label>
+                    <input type="text" class="form-control" id="inputMax" oninput="updateLink()" value=""
+                           placeholder="50"/>
+                </div>
+            </form>
+        </div>
+    </div>
+
+    <div style="display: flex; justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
+        <button type="button" class="btn btn-outline-secondary" id="getAudio" style="margin-right: 10px">播放器生成</button>
+        <audio id="audioPlayer" controls>
+            <source src="" type="audio/mp3"/>
+            Your browser does not support the audio element.
+        </audio>
+        <div class="form-group form-check">
+            <input type="checkbox" id="streaming">
+            <label class="form-check-label">流式响应</label>
+        </div>
+    </div>
+    <div>自动识别语言:可识别的语言根据不同speaker而不同,方言无法自动识别</div>
+    <div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
+    <br/>
+
+    <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
+    <p>
+        Nene_Nanami_Rong_Tang:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        louise:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        Cantonese:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        shanghainese:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        w2v2-vits:
+        <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
+    </p>
+    <p>
+        vctk:
+        <a href="https://github.com/jaywalnut310/vits" rel="noreferrer" target="_blank">jaywalnut310/vits</a>
+    </p>
+    <p>
+        Bishojo Mangekyo:
+        <a href="https://github.com/Francis-Komizu/VITS" rel="noreferrer" target="_blank">Francis-Komizu/VITS</a>
+    </p>
+    <p>
+        genshin:
+        <a href="https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai" rel="noreferrer" target="_blank">zomehwh/vits-uma-genshin-honkai</a>
+    </p>
+    <p>
+        paimon:
+        <a href="https://github.com/zixiiu/Digital_Life_Server" rel="noreferrer" target="_blank">zixiiu/Digital_Life_Server</a>
+    </p>
+    <p>
+        vits_chinese:
+        <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
+    </p>
+
+</main>
+
+<script src="/static/js/jquery.slim.min.js"></script>
+<script src="/static/js/bootstrap.bundle.min.js"></script>
+
+<script>
+    function getProtocol() {
+        return 'https:' == location.protocol ? "https://" : "http://";
+    }
+
+    function getUrl() {
+        var url = window.location.host;
+        return url;
+    }
+
+    var baseUrl = getProtocol() + getUrl();
+
+    setBaseUrl();
+
+    function setBaseUrl() {
+        var text = document.getElementById("inputText").value;
+        var id = document.getElementById("inputId").value;
+
+        var vitsLink = document.getElementById("vitsLink");
+        var speakersLink = document.getElementById("speakersLink");
+
+        var vitsUrl = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        var speakersUrl = baseUrl + "/voice/speakers";
+
+        vitsLink.href = vitsUrl;
+        vitsLink.textContent = vitsUrl;
+
+        speakersLink.href = speakersUrl;
+        speakersLink.textContent = speakersUrl;
+    }
+
+    function getLink() {
+        var text = document.getElementById("inputText").value;
+        var id = document.getElementById("inputId").value;
+        var format = document.getElementById("inputFormat").value;
+        var lang = document.getElementById("inputLang").value;
+        var length = document.getElementById("inputLength").value;
+        var noise = document.getElementById("inputNoise").value;
+        var noisew = document.getElementById("inputNoisew").value;
+        var max = document.getElementById("inputMax").value;
+
+        var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        if (format != "") {
+            url += "&format=" + format;
+        }
+        if (lang != "") {
+            url += "&lang=" + lang;
+        }
+        if (length != "") {
+            url += "&length=" + length;
+        }
+        if (noise != "") {
+            url += "&noise=" + noise;
+        }
+        if (noisew != "") {
+            url += "&noisew=" + noisew;
+        }
+        if (max != "") {
+            url += "&max=" + max;
+        }
+        return url;
+    }
+
+    function updateLink() {
+        var url = getLink();
+        var link = document.getElementById("vitsLink");
+        link.href = url;
+        link.textContent = url;
+    }
+
+    function setAudioSource() {
+        var streaming = document.getElementById('streaming');
+        var url = getLink();
+        if (streaming.checked) {
+            url += '&streaming=true';
+        }
+
+        var audioPlayer = document.getElementById("audioPlayer");
+        audioPlayer.src = url;
+        audioPlayer.play();
+    }
+
+    var button = document.getElementById("getAudio");
+    button.addEventListener("click", function () {
+        setAudioSource();
+    });
+</script>
+</body>
 </html>
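
The page's getLink() only concatenates non-empty form fields into a query string. For scripting against the same endpoint from Python, an equivalent sketch (this helper is illustrative, not part of the commit):

    from urllib.parse import urlencode

    def build_vits_url(base, text, id, **optional):
        # optional mirrors the form fields: format, lang, length, noise, noisew, max
        params = {"text": text, "id": id}
        params.update({k: v for k, v in optional.items() if v not in ("", None)})
        return f"{base}/voice/vits?{urlencode(params)}"

    print(build_vits_url("https://artrajz-vits-simple-api.hf.space",
                         "你好,こんにちは", 164, format="mp3"))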
utils/utils.py
CHANGED

@@ -89,3 +89,7 @@ def clean_folder(folder_path):
 # is none -> True, is not none -> False
 def check_is_none(s):
     return s is None or (isinstance(s, str) and str(s).isspace()) or str(s) == ""
+
+def save_audio(audio, path):
+    with open(path, "wb") as f:
+        f.write(audio)
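
save_audio writes raw bytes, which is why the callers in voice.py pass it BytesIO.getvalue() output. A usage sketch (paths and payload illustrative):

    from io import BytesIO
    from utils.utils import save_audio

    encoded_audio = BytesIO(b"RIFF...")  # stands in for the encoder output
    save_audio(encoded_audio.getvalue(), "cache/example.wav")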
voice.py
CHANGED

@@ -1,7 +1,6 @@
 import os
 import librosa
 import commons
-import sys
 import re
 import numpy as np
 import torch

@@ -156,7 +155,7 @@ class vits:
 
         return params
 
+    def get_tasks(self, voice):
         text = voice.get("text", None)
         speaker_id = voice.get("id", 0)
         length = voice.get("length", 1)

@@ -171,47 +170,57 @@ class vits:
         # strip all redundant whitespace
         if text is not None: text = re.sub(r'\s+', ' ', text).strip()
 
-        # pause 0.75s between segments so the joins after concatenation are less abrupt
-        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
-
         tasks = []
         if self.model_type == "vits":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
+                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
+                                              noise_scale=noise, noise_scale_w=noisew)
+                tasks.append(params)
-            audios = []
-            for task in tasks:
-                audios.append(self.infer(task))
-                if auto_break:
-                    audios.append(brk)
-
-            audio = np.concatenate(audios, axis=0)
 
         elif self.model_type == "hubert":
             params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
                                           noise_scale_w=noisew, audio_path=audio_path)
+            tasks.append(params)
 
         elif self.model_type == "w2v2":
             sentence_list = sentence_split(text, max, lang, speaker_lang)
             for sentence in sentence_list:
+                params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
+                                              noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
+                tasks.append(params)
 
-            for task in tasks:
-                audios.append(self.infer(task))
-                if auto_break:
-                    audios.append(brk)
-
+        return tasks
 
+    def get_audio(self, voice, auto_break=False):
+        tasks = self.get_tasks(voice)
+        # pause 0.75s between segments so the joins after concatenation are less abrupt
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        audios = []
+        for task in tasks:
+            if auto_break:
+                chunk = np.concatenate((self.infer(task), brk), axis=0)
+            else:
+                chunk = self.infer(task)
+            audios.append(chunk)
 
+        audio = np.concatenate(audios, axis=0)
         return audio
 
+    def get_stream_audio(self, voice, auto_break=False):
+        tasks = self.get_tasks(voice)
+
+        brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
+
+        for task in tasks:
+            if auto_break:
+                chunk = np.concatenate((self.infer(task), brk), axis=0)
+            else:
+                chunk = self.infer(task)
+
+            yield chunk
+
     def voice_conversion(self, voice):
         audio_path = voice.get("audio_path")
         original_id = voice.get("original_id")

@@ -330,6 +339,14 @@ class TTS:
         else:
             raise ValueError("Unsupported time unit: {}".format(time_unit))
 
+    def generate_audio_chunks(self, audio):
+        chunk_size = 4096
+        while True:
+            chunk = audio.read(chunk_size)
+            if not chunk:
+                break
+            yield chunk
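
generate_audio_chunks re-chunks any file-like object exposing read(). Its behavior in isolation, as a sketch (assuming tts is a constructed TTS instance):

    from io import BytesIO

    buf = BytesIO(b"x" * 10000)
    chunks = list(tts.generate_audio_chunks(buf))
    # 10000 bytes -> 4096 + 4096 + 1808
    assert [len(c) for c in chunks] == [4096, 4096, 1808]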
@@ -403,7 +420,7 @@ class TTS:
 
         return voice_tasks, format
 
-    def create_ssml_infer_task(self, ssml):
+    def create_ssml_infer_task(self, ssml, fname):
         voice_tasks, format = self.parse_ssml(ssml)
 
         audios = []

@@ -420,38 +437,66 @@ class TTS:
         audios.append(audio)
 
         audio = np.concatenate(audios, axis=0)
+        encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio, format
 
-    def vits_infer(self, voice):
+    def vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice, auto_break=True)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
+    def stream_vits_infer(self, voice, fname):
+        format = voice.get("format", "wav")
+        voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
+        voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
+        genertator = voice_obj.get_stream_audio(voice, auto_break=True)
+        audio = BytesIO()
+        for chunk in genertator:
+            encoded_audio = self.encode(sampling_rate, chunk, format)
+            for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
+                yield encoded_audio_chunk
+            if config.SAVE_AUDIO:
+                audio.write(encoded_audio.getvalue())
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(audio.getvalue(), path)
 
+    def hubert_vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def w2v2_vits_infer(self, voice):
+    def w2v2_vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
         voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
         voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice, auto_break=True)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
-    def vits_voice_conversion(self, voice):
+    def vits_voice_conversion(self, voice, fname):
         original_id = voice.get("original_id")
         target_id = voice.get("target_id")
         format = voice.get("format")

@@ -466,10 +511,14 @@ class TTS:
         voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
 
         voice_obj = self._voice_obj["VITS"][original_id][1]
+        sampling_rate = voice_obj.hps_ms.data.sampling_rate
 
-        output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
+        audio = voice_obj.voice_conversion(voice)
+        encoded_audio = self.encode(sampling_rate, audio, format)
+        if config.SAVE_AUDIO:
+            path = f"{config.CACHE_PATH}/{fname}"
+            utils.save_audio(encoded_audio.getvalue(), path)
+        return encoded_audio
 
     def get_dimensional_emotion_npy(self, audio):
         if self.dem is None:
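
Because stream_vits_infer is itself a generator, app.py can pass it straight to make_response and Flask emits the body as a chunked stream. A reduced, standalone sketch of that pattern (demo route, not part of the commit):

    from flask import Flask, make_response

    app = Flask(__name__)

    @app.route("/stream-demo")
    def stream_demo():
        def gen():
            for i in range(3):
                yield f"chunk {i}\n".encode()  # Flask streams an iterable body
        response = make_response(gen())
        response.headers["Content-Type"] = "application/octet-stream"
        return response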