Spaces:
Runtime error
Runtime error
Upload 44 files
Browse files- Dockerfile +19 -4
- LICENSE +1 -1
- LICENSE-MoeGoe +21 -0
- app.py +336 -71
- docker-compose.yaml +3 -2
- models.py +1 -1
- optimizer_removal.py +16 -0
- request.py +265 -0
- test.py +11 -0
- text/__pycache__/__init__.cpython-310.pyc +0 -0
- text/__pycache__/cantonese.cpython-310.pyc +0 -0
- text/__pycache__/cleaners.cpython-310.pyc +0 -0
- text/__pycache__/english.cpython-310.pyc +0 -0
- text/__pycache__/japanese.cpython-310.pyc +0 -0
- text/__pycache__/korean.cpython-310.pyc +0 -0
- text/__pycache__/mandarin.cpython-310.pyc +0 -0
- text/__pycache__/ngu_dialect.cpython-310.pyc +0 -0
- text/__pycache__/shanghainese.cpython-310.pyc +0 -0
- text/cantonese.py +15 -4
- text/cleaners.py +140 -36
- text/mandarin.py +15 -3
- text/shanghainese.py +16 -5
- utils/__pycache__/merge.cpython-310.pyc +0 -0
- utils/__pycache__/nlp.cpython-310.pyc +0 -0
- utils/__pycache__/utils.cpython-310.pyc +0 -0
- utils/merge.py +161 -0
- utils/nlp.py +82 -0
- utils/utils.py +112 -0
- vits-simple-api-installer-latest.sh +27 -0
- voice.py +408 -153
Dockerfile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
FROM python:3.
|
2 |
|
3 |
RUN mkdir -p /app
|
4 |
WORKDIR /app
|
@@ -7,16 +7,31 @@ ENV DEBIAN_FRONTEND=noninteractive
|
|
7 |
|
8 |
RUN apt-get update && \
|
9 |
apt install build-essential -yq && \
|
|
|
|
|
|
|
10 |
apt-get clean && \
|
11 |
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
12 |
rm -rf /var/lib/apt/lists/*
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
COPY requirements.txt /app
|
15 |
RUN pip install -r requirements.txt
|
16 |
|
17 |
COPY . /app
|
18 |
|
19 |
-
EXPOSE
|
20 |
-
|
21 |
-
CMD ["python", "/app/app.py"]
|
22 |
|
|
|
|
1 |
+
FROM python:3.10.11-slim-bullseye
|
2 |
|
3 |
RUN mkdir -p /app
|
4 |
WORKDIR /app
|
|
|
7 |
|
8 |
RUN apt-get update && \
|
9 |
apt install build-essential -yq && \
|
10 |
+
apt install espeak-ng -yq && \
|
11 |
+
apt install cmake -yq && \
|
12 |
+
apt install -y wget -yq && \
|
13 |
apt-get clean && \
|
14 |
apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
|
15 |
rm -rf /var/lib/apt/lists/*
|
16 |
|
17 |
+
RUN pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0
|
18 |
+
|
19 |
+
RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
|
20 |
+
tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
|
21 |
+
cd openjtalk-0.3.0.dev2 && \
|
22 |
+
rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
|
23 |
+
python setup.py install && \
|
24 |
+
cd ../ && \
|
25 |
+
rm -f openjtalk-0.3.0.dev2.tar.gz && \
|
26 |
+
rm -rf openjtalk-0.3.0.dev2
|
27 |
+
|
28 |
+
RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
|
29 |
+
|
30 |
COPY requirements.txt /app
|
31 |
RUN pip install -r requirements.txt
|
32 |
|
33 |
COPY . /app
|
34 |
|
35 |
+
EXPOSE 23456
|
|
|
|
|
36 |
|
37 |
+
CMD ["python", "/app/app.py"]
|
LICENSE
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
MIT License
|
2 |
|
3 |
-
Copyright (c)
|
4 |
|
5 |
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
of this software and associated documentation files (the "Software"), to deal
|
|
|
1 |
MIT License
|
2 |
|
3 |
+
Copyright (c) 2023 Artrajz
|
4 |
|
5 |
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
of this software and associated documentation files (the "Software"), to deal
|
LICENSE-MoeGoe
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2022 CjangCjengh
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
app.py
CHANGED
@@ -1,13 +1,15 @@
|
|
1 |
import os
|
2 |
-
import gradio as gr
|
3 |
import logging
|
|
|
|
|
4 |
import uuid
|
5 |
-
|
6 |
-
from flask import Flask, request, send_file, jsonify
|
7 |
from werkzeug.utils import secure_filename
|
8 |
from flask_apscheduler import APScheduler
|
9 |
-
|
10 |
-
from utils import clean_folder,
|
|
|
|
|
11 |
|
12 |
app = Flask(__name__)
|
13 |
app.config.from_pyfile("config.py")
|
@@ -16,104 +18,367 @@ scheduler = APScheduler()
|
|
16 |
scheduler.init_app(app)
|
17 |
scheduler.start()
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
CUSTOM_PATH = "/gradio"
|
24 |
|
25 |
if not os.path.exists(app.config['UPLOAD_FOLDER']):
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
|
31 |
|
32 |
-
|
33 |
-
@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
def index():
|
35 |
-
return "
|
36 |
|
37 |
|
38 |
@app.route('/voice/speakers', methods=["GET", "POST"])
|
39 |
def voice_speakers_api():
|
40 |
-
|
41 |
-
return jsonify(speakers_list)
|
42 |
|
43 |
|
44 |
@app.route('/voice', methods=["GET", "POST"])
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
71 |
|
72 |
|
73 |
-
@app.route('/voice/
|
74 |
-
|
75 |
-
|
76 |
-
return jsonify("method should be POST")
|
77 |
if request.method == "POST":
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
-
|
84 |
|
85 |
-
|
|
|
86 |
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
-
if voice_obj[original_id][2] != voice_obj[target_id][2]:
|
96 |
-
form["status"] = "error"
|
97 |
-
form["message"] = "speaker IDs are in diffrent Model!"
|
98 |
-
return form
|
99 |
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
file_type = f"audio/{format}"
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
105 |
-
# return output
|
106 |
|
107 |
|
108 |
-
|
109 |
-
@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
def clean_task():
|
111 |
clean_folder(app.config["UPLOAD_FOLDER"])
|
112 |
-
clean_folder(app.config["
|
113 |
|
114 |
|
115 |
if __name__ == '__main__':
|
116 |
-
|
117 |
-
app =
|
118 |
-
# app.run(host='0.0.0.0', port=app.config["PORT"]) # 如果对外开放用这个,docker部署也用这个
|
119 |
-
# app.run(host='127.0.0.1', port=app.config["PORT"], debug=True) # 本地运行、调试
|
|
|
1 |
import os
|
|
|
2 |
import logging
|
3 |
+
import time
|
4 |
+
import logzero
|
5 |
import uuid
|
6 |
+
from flask import Flask, request, send_file, jsonify, make_response
|
|
|
7 |
from werkzeug.utils import secure_filename
|
8 |
from flask_apscheduler import APScheduler
|
9 |
+
from functools import wraps
|
10 |
+
from utils.utils import clean_folder, check_is_none
|
11 |
+
from utils.merge import merge_model
|
12 |
+
from io import BytesIO
|
13 |
|
14 |
app = Flask(__name__)
|
15 |
app.config.from_pyfile("config.py")
|
|
|
18 |
scheduler.init_app(app)
|
19 |
scheduler.start()
|
20 |
|
21 |
+
logzero.loglevel(logging.WARNING)
|
22 |
+
logger = logging.getLogger("vits-simple-api")
|
23 |
+
level = app.config.get("LOGGING_LEVEL", "DEBUG")
|
24 |
+
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
|
25 |
+
'CRITICAL': logging.CRITICAL}
|
26 |
+
logging.basicConfig(level=level_dict[level])
|
27 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
28 |
|
29 |
+
tts = merge_model(app.config["MODEL_LIST"])
|
|
|
|
|
30 |
|
31 |
if not os.path.exists(app.config['UPLOAD_FOLDER']):
|
32 |
+
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
|
33 |
+
|
34 |
+
if not os.path.exists(app.config['CACHE_PATH']):
|
35 |
+
os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
|
36 |
|
37 |
|
38 |
+
def require_api_key(func):
|
39 |
+
@wraps(func)
|
40 |
+
def check_api_key(*args, **kwargs):
|
41 |
+
if not app.config.get('API_KEY_ENABLED', False):
|
42 |
+
return func(*args, **kwargs)
|
43 |
+
else:
|
44 |
+
api_key = request.args.get('api_key') or request.headers.get('X-API-KEY')
|
45 |
+
if api_key and api_key == app.config['API_KEY']:
|
46 |
+
return func(*args, **kwargs)
|
47 |
+
else:
|
48 |
+
return make_response(jsonify({"status": "error", "message": "Invalid API Key"}), 401)
|
49 |
+
|
50 |
+
return check_api_key
|
51 |
+
|
52 |
+
|
53 |
+
@app.route('/', methods=["GET", "POST"])
|
54 |
def index():
|
55 |
+
return "vits-simple-api"
|
56 |
|
57 |
|
58 |
@app.route('/voice/speakers', methods=["GET", "POST"])
|
59 |
def voice_speakers_api():
|
60 |
+
return jsonify(tts.voice_speakers)
|
|
|
61 |
|
62 |
|
63 |
@app.route('/voice', methods=["GET", "POST"])
|
64 |
+
@app.route('/voice/vits', methods=["GET", "POST"])
|
65 |
+
@require_api_key
|
66 |
+
def voice_vits_api():
|
67 |
+
try:
|
68 |
+
if request.method == "GET":
|
69 |
+
text = request.args.get("text", "")
|
70 |
+
id = int(request.args.get("id", app.config.get("ID", 0)))
|
71 |
+
format = request.args.get("format", app.config.get("FORMAT", "wav"))
|
72 |
+
lang = request.args.get("lang", app.config.get("LANG", "auto"))
|
73 |
+
length = float(request.args.get("length", app.config.get("LENGTH", 1)))
|
74 |
+
noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
|
75 |
+
noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
|
76 |
+
max = int(request.args.get("max", app.config.get("MAX", 50)))
|
77 |
+
elif request.method == "POST":
|
78 |
+
text = request.form.get("text", "")
|
79 |
+
id = int(request.form.get("id", app.config.get("ID", 0)))
|
80 |
+
format = request.form.get("format", app.config.get("FORMAT", "wav"))
|
81 |
+
lang = request.form.get("lang", app.config.get("LANG", "auto"))
|
82 |
+
length = float(request.form.get("length", app.config.get("LENGTH", 1)))
|
83 |
+
noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
|
84 |
+
noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
|
85 |
+
max = int(request.form.get("max", app.config.get("MAX", 50)))
|
86 |
+
except Exception as e:
|
87 |
+
logger.error(f"[VITS] {e}")
|
88 |
+
return make_response("parameter error", 400)
|
89 |
+
|
90 |
+
logger.info(f"[VITS] id:{id} format:{format} lang:{lang} length:{length} noise:{noise} noisew:{noisew}")
|
91 |
+
logger.info(f"[VITS] len:{len(text)} text:{text}")
|
92 |
+
|
93 |
+
if check_is_none(text):
|
94 |
+
logger.info(f"[VITS] text is empty")
|
95 |
+
return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
|
96 |
+
|
97 |
+
if check_is_none(id):
|
98 |
+
logger.info(f"[VITS] speaker id is empty")
|
99 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
100 |
+
|
101 |
+
if id < 0 or id >= tts.vits_speakers_count:
|
102 |
+
logger.info(f"[VITS] speaker id {id} does not exist")
|
103 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
104 |
+
|
105 |
+
speaker_lang = tts.voice_speakers["VITS"][id].get('lang')
|
106 |
+
if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
|
107 |
+
logger.info(f"[VITS] lang \"{lang}\" is not in {speaker_lang}")
|
108 |
+
return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
|
109 |
+
|
110 |
+
if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
|
111 |
+
speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
|
112 |
+
|
113 |
+
fname = f"{str(uuid.uuid1())}.{format}"
|
114 |
+
file_type = f"audio/{format}"
|
115 |
+
|
116 |
+
t1 = time.time()
|
117 |
+
output = tts.vits_infer({"text": text,
|
118 |
+
"id": id,
|
119 |
+
"format": format,
|
120 |
+
"length": length,
|
121 |
+
"noise": noise,
|
122 |
+
"noisew": noisew,
|
123 |
+
"max": max,
|
124 |
+
"lang": lang,
|
125 |
+
"speaker_lang": speaker_lang})
|
126 |
+
t2 = time.time()
|
127 |
+
logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
|
128 |
|
129 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
130 |
|
131 |
|
132 |
+
@app.route('/voice/hubert-vits', methods=["POST"])
|
133 |
+
@require_api_key
|
134 |
+
def voice_hubert_api():
|
|
|
135 |
if request.method == "POST":
|
136 |
+
try:
|
137 |
+
voice = request.files['upload']
|
138 |
+
id = int(request.form.get("id"))
|
139 |
+
format = request.form.get("format", app.config.get("LANG", "auto"))
|
140 |
+
length = float(request.form.get("length", app.config.get("LENGTH", 1)))
|
141 |
+
noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
|
142 |
+
noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
|
143 |
+
except Exception as e:
|
144 |
+
logger.error(f"[hubert] {e}")
|
145 |
+
return make_response("parameter error", 400)
|
146 |
|
147 |
+
logger.info(f"[hubert] id:{id} format:{format} length:{length} noise:{noise} noisew:{noisew}")
|
148 |
|
149 |
+
fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
|
150 |
+
voice.save(os.path.join(app.config['UPLOAD_FOLDER'], fname))
|
151 |
|
152 |
+
if check_is_none(id):
|
153 |
+
logger.info(f"[hubert] speaker id is empty")
|
154 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
155 |
+
|
156 |
+
if id < 0 or id >= tts.hubert_speakers_count:
|
157 |
+
logger.info(f"[hubert] speaker id {id} does not exist")
|
158 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
159 |
+
|
160 |
+
file_type = f"audio/{format}"
|
161 |
+
|
162 |
+
t1 = time.time()
|
163 |
+
output = tts.hubert_vits_infer({"id": id,
|
164 |
+
"format": format,
|
165 |
+
"length": length,
|
166 |
+
"noise": noise,
|
167 |
+
"noisew": noisew,
|
168 |
+
"audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
|
169 |
+
t2 = time.time()
|
170 |
+
logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
|
171 |
+
|
172 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
173 |
+
|
174 |
+
|
175 |
+
@app.route('/voice/w2v2-vits', methods=["GET", "POST"])
|
176 |
+
@require_api_key
|
177 |
+
def voice_w2v2_api():
|
178 |
+
try:
|
179 |
+
if request.method == "GET":
|
180 |
+
text = request.args.get("text", "")
|
181 |
+
id = int(request.args.get("id", app.config.get("ID", 0)))
|
182 |
+
format = request.args.get("format", app.config.get("FORMAT", "wav"))
|
183 |
+
lang = request.args.get("lang", app.config.get("LANG", "auto"))
|
184 |
+
length = float(request.args.get("length", app.config.get("LENGTH", 1)))
|
185 |
+
noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
|
186 |
+
noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
|
187 |
+
max = int(request.args.get("max", app.config.get("MAX", 50)))
|
188 |
+
emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
|
189 |
+
elif request.method == "POST":
|
190 |
+
text = request.form.get("text", "")
|
191 |
+
id = int(request.form.get("id", app.config.get("ID", 0)))
|
192 |
+
format = request.form.get("format", app.config.get("FORMAT", "wav"))
|
193 |
+
lang = request.form.get("lang", app.config.get("LANG", "auto"))
|
194 |
+
length = float(request.form.get("length"))
|
195 |
+
noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
|
196 |
+
noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
|
197 |
+
max = int(request.form.get("max", app.config.get("MAX", 50)))
|
198 |
+
emotion = int(request.form.get("emotion", app.config.get("EMOTION", 0)))
|
199 |
+
except Exception as e:
|
200 |
+
logger.error(f"[w2v2] {e}")
|
201 |
+
return make_response(f"parameter error", 400)
|
202 |
+
|
203 |
+
logger.info(f"[w2v2] id:{id} format:{format} lang:{lang} "
|
204 |
+
f"length:{length} noise:{noise} noisew:{noisew} emotion:{emotion}")
|
205 |
+
logger.info(f"[w2v2] len:{len(text)} text:{text}")
|
206 |
+
|
207 |
+
if check_is_none(text):
|
208 |
+
logger.info(f"[w2v2] text is empty")
|
209 |
+
return make_response(jsonify({"status": "error", "message": "text is empty"}), 400)
|
210 |
+
|
211 |
+
if check_is_none(id):
|
212 |
+
logger.info(f"[w2v2] speaker id is empty")
|
213 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
214 |
+
|
215 |
+
if id < 0 or id >= tts.w2v2_speakers_count:
|
216 |
+
logger.info(f"[w2v2] speaker id {id} does not exist")
|
217 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
218 |
+
|
219 |
+
speaker_lang = tts.voice_speakers["W2V2-VITS"][id].get('lang')
|
220 |
+
if lang.upper() != "AUTO" and lang.upper() != "MIX" and len(speaker_lang) != 1 and lang not in speaker_lang:
|
221 |
+
logger.info(f"[w2v2] lang \"{lang}\" is not in {speaker_lang}")
|
222 |
+
return make_response(jsonify({"status": "error", "message": f"lang '{lang}' is not in {speaker_lang}"}), 400)
|
223 |
+
|
224 |
+
if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
|
225 |
+
speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
|
226 |
+
|
227 |
+
fname = f"{str(uuid.uuid1())}.{format}"
|
228 |
+
file_type = f"audio/{format}"
|
229 |
|
230 |
+
t1 = time.time()
|
231 |
+
output = tts.w2v2_vits_infer({"text": text,
|
232 |
+
"id": id,
|
233 |
+
"format": format,
|
234 |
+
"length": length,
|
235 |
+
"noise": noise,
|
236 |
+
"noisew": noisew,
|
237 |
+
"max": max,
|
238 |
+
"lang": lang,
|
239 |
+
"emotion": emotion,
|
240 |
+
"speaker_lang": speaker_lang})
|
241 |
+
t2 = time.time()
|
242 |
+
logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
|
243 |
+
|
244 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
245 |
|
|
|
|
|
|
|
|
|
246 |
|
247 |
+
@app.route('/voice/conversion', methods=["POST"])
|
248 |
+
@app.route('/voice/vits/conversion', methods=["POST"])
|
249 |
+
@require_api_key
|
250 |
+
def vits_voice_conversion_api():
|
251 |
+
if request.method == "POST":
|
252 |
+
try:
|
253 |
+
voice = request.files['upload']
|
254 |
+
original_id = int(request.form["original_id"])
|
255 |
+
target_id = int(request.form["target_id"])
|
256 |
+
format = request.form.get("format", voice.filename.split(".")[1])
|
257 |
+
except Exception as e:
|
258 |
+
logger.error(f"[vits_voice_convertsion] {e}")
|
259 |
+
return make_response("parameter error", 400)
|
260 |
+
|
261 |
+
fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
|
262 |
+
audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
|
263 |
+
voice.save(audio_path)
|
264 |
file_type = f"audio/{format}"
|
265 |
|
266 |
+
logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
|
267 |
+
t1 = time.time()
|
268 |
+
try:
|
269 |
+
output = tts.vits_voice_conversion({"audio_path": audio_path,
|
270 |
+
"original_id": original_id,
|
271 |
+
"target_id": target_id,
|
272 |
+
"format": format})
|
273 |
+
except Exception as e:
|
274 |
+
logger.info(f"[vits_voice_convertsion] {e}")
|
275 |
+
return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
|
276 |
+
t2 = time.time()
|
277 |
+
logger.info(f"finish in {(t2 - t1):.2f}s")
|
278 |
+
|
279 |
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
|
|
280 |
|
281 |
|
282 |
+
@app.route('/voice/ssml', methods=["POST"])
|
283 |
+
@require_api_key
|
284 |
+
def ssml():
|
285 |
+
try:
|
286 |
+
ssml = request.form["ssml"]
|
287 |
+
except Exception as e:
|
288 |
+
logger.info(f"[ssml] {e}")
|
289 |
+
return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
|
290 |
+
|
291 |
+
logger.debug(ssml)
|
292 |
+
|
293 |
+
t1 = time.time()
|
294 |
+
try:
|
295 |
+
output, format = tts.create_ssml_infer_task(ssml)
|
296 |
+
except Exception as e:
|
297 |
+
logger.info(f"[ssml] {e}")
|
298 |
+
return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
|
299 |
+
t2 = time.time()
|
300 |
+
|
301 |
+
fname = f"{str(uuid.uuid1())}.{format}"
|
302 |
+
file_type = f"audio/{format}"
|
303 |
+
|
304 |
+
logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
|
305 |
+
|
306 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
307 |
+
|
308 |
+
|
309 |
+
@app.route('/voice/dimension-emotion', methods=["POST"])
|
310 |
+
def dimensional_emotion():
|
311 |
+
if request.method == "POST":
|
312 |
+
try:
|
313 |
+
audio = request.files['upload']
|
314 |
+
except Exception as e:
|
315 |
+
logger.error(f"[dimensional_emotion] {e}")
|
316 |
+
return make_response("parameter error", 400)
|
317 |
+
|
318 |
+
content = BytesIO(audio.read())
|
319 |
+
|
320 |
+
file_type = "application/octet-stream; charset=ascii"
|
321 |
+
fname = os.path.splitext(audio.filename)[0] + ".npy"
|
322 |
+
output = tts.get_dimensional_emotion_npy(content)
|
323 |
+
|
324 |
+
return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
|
325 |
+
|
326 |
+
|
327 |
+
@app.route('/voice/check', methods=["GET", "POST"])
|
328 |
+
def check():
|
329 |
+
try:
|
330 |
+
if request.method == "GET":
|
331 |
+
model = request.args.get("model")
|
332 |
+
id = int(request.args.get("id"))
|
333 |
+
elif request.method == "POST":
|
334 |
+
model = request.form["model"]
|
335 |
+
id = int(request.form["id"])
|
336 |
+
except Exception as e:
|
337 |
+
logger.info(f"[check] {e}")
|
338 |
+
return make_response(jsonify({"status": "error", "message": "parameter error"}), 400)
|
339 |
+
|
340 |
+
if check_is_none(model):
|
341 |
+
logger.info(f"[check] model {model} is empty")
|
342 |
+
return make_response(jsonify({"status": "error", "message": "model is empty"}), 400)
|
343 |
+
|
344 |
+
if model.upper() not in ("VITS", "HUBERT", "W2V2"):
|
345 |
+
res = make_response(jsonify({"status": "error", "message": f"model {model} does not exist"}))
|
346 |
+
res.status = 404
|
347 |
+
logger.info(f"[check] speaker id {id} error")
|
348 |
+
return res
|
349 |
+
|
350 |
+
if check_is_none(id):
|
351 |
+
logger.info(f"[check] speaker id is empty")
|
352 |
+
return make_response(jsonify({"status": "error", "message": "speaker id is empty"}), 400)
|
353 |
+
|
354 |
+
if model.upper() == "VITS":
|
355 |
+
speaker_list = tts.voice_speakers["VITS"]
|
356 |
+
elif model.upper() == "HUBERT":
|
357 |
+
speaker_list = tts.voice_speakers["HUBERT-VITS"]
|
358 |
+
elif model.upper() == "W2V2":
|
359 |
+
speaker_list = tts.voice_speakers["W2V2-VITS"]
|
360 |
+
|
361 |
+
if len(speaker_list) == 0:
|
362 |
+
logger.info(f"[check] {model} not loaded")
|
363 |
+
return make_response(jsonify({"status": "error", "message": f"{model} not loaded"}), 400)
|
364 |
+
|
365 |
+
if id < 0 or id >= len(speaker_list):
|
366 |
+
logger.info(f"[check] speaker id {id} does not exist")
|
367 |
+
return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
|
368 |
+
name = str(speaker_list[id]["name"])
|
369 |
+
lang = speaker_list[id]["lang"]
|
370 |
+
logger.info(f"[check] check id:{id} name:{name} lang:{lang}")
|
371 |
+
|
372 |
+
return make_response(jsonify({"status": "success", "id": id, "name": name, "lang": lang}), 200)
|
373 |
+
|
374 |
+
|
375 |
+
# regular cleaning
|
376 |
+
@scheduler.task('interval', id='clean_task', seconds=3600, misfire_grace_time=900)
|
377 |
def clean_task():
|
378 |
clean_folder(app.config["UPLOAD_FOLDER"])
|
379 |
+
clean_folder(app.config["CACHE_PATH"])
|
380 |
|
381 |
|
382 |
if __name__ == '__main__':
|
383 |
+
app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False)) # 对外开放
|
384 |
+
# app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True) # 本地运行、调试
|
|
|
|
docker-compose.yaml
CHANGED
@@ -1,12 +1,13 @@
|
|
1 |
version: '3.4'
|
2 |
services:
|
3 |
-
|
4 |
-
image: artrajz/
|
5 |
restart: always
|
6 |
ports:
|
7 |
- 23456:23456
|
8 |
environment:
|
9 |
LANG: 'C.UTF-8'
|
|
|
10 |
volumes:
|
11 |
- ./Model:/app/Model # 挂载模型文件夹
|
12 |
- ./config.py:/app/config.py # 挂载配置文件
|
|
|
1 |
version: '3.4'
|
2 |
services:
|
3 |
+
vits:
|
4 |
+
image: artrajz/vits-simple-api:latest
|
5 |
restart: always
|
6 |
ports:
|
7 |
- 23456:23456
|
8 |
environment:
|
9 |
LANG: 'C.UTF-8'
|
10 |
+
TZ: Asia/Shanghai #timezone
|
11 |
volumes:
|
12 |
- ./Model:/app/Model # 挂载模型文件夹
|
13 |
- ./config.py:/app/config.py # 挂载配置文件
|
models.py
CHANGED
@@ -363,7 +363,7 @@ class SynthesizerTrn(nn.Module):
|
|
363 |
else:
|
364 |
self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
|
365 |
|
366 |
-
if n_speakers
|
367 |
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
368 |
|
369 |
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
|
|
|
363 |
else:
|
364 |
self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
|
365 |
|
366 |
+
if n_speakers >= 1:
|
367 |
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
368 |
|
369 |
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., max_len=None, emotion_embedding=None):
|
optimizer_removal.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import load, save
|
2 |
+
|
3 |
+
if __name__ == '__main__':
|
4 |
+
print("优化器通常不会被用于推理阶段,如果只用于推理可以去除优化器以减小模型体积\n")
|
5 |
+
input_path = input("请输入模型的路径:")
|
6 |
+
output_path = f"{input_path.split('.')[0]}_inference.pth"
|
7 |
+
checkpoint_dict = load(input_path, map_location='cpu')
|
8 |
+
checkpoint_dict_new = {}
|
9 |
+
for k, v in checkpoint_dict.items():
|
10 |
+
if k == "optimizer":
|
11 |
+
print(f"remove optimizer")
|
12 |
+
continue
|
13 |
+
checkpoint_dict_new[k] = v
|
14 |
+
save(checkpoint_dict_new, output_path)
|
15 |
+
print("finish")
|
16 |
+
print(output_path)
|
request.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import requests
|
3 |
+
import os
|
4 |
+
import random
|
5 |
+
import string
|
6 |
+
from requests_toolbelt.multipart.encoder import MultipartEncoder
|
7 |
+
|
8 |
+
abs_path = os.path.dirname(__file__)
|
9 |
+
base = "http://127.0.0.1:23456"
|
10 |
+
|
11 |
+
|
12 |
+
# 映射表
|
13 |
+
def voice_speakers():
|
14 |
+
url = f"{base}/voice/speakers"
|
15 |
+
|
16 |
+
res = requests.post(url=url)
|
17 |
+
json = res.json()
|
18 |
+
for i in json:
|
19 |
+
print(i)
|
20 |
+
for j in json[i]:
|
21 |
+
print(j)
|
22 |
+
return json
|
23 |
+
|
24 |
+
|
25 |
+
# 语音合成 voice vits
|
26 |
+
def voice_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50):
|
27 |
+
fields = {
|
28 |
+
"text": text,
|
29 |
+
"id": str(id),
|
30 |
+
"format": format,
|
31 |
+
"lang": lang,
|
32 |
+
"length": str(length),
|
33 |
+
"noise": str(noise),
|
34 |
+
"noisew": str(noisew),
|
35 |
+
"max": str(max)
|
36 |
+
}
|
37 |
+
boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
|
38 |
+
|
39 |
+
m = MultipartEncoder(fields=fields, boundary=boundary)
|
40 |
+
headers = {"Content-Type": m.content_type}
|
41 |
+
url = f"{base}/voice"
|
42 |
+
|
43 |
+
res = requests.post(url=url, data=m, headers=headers)
|
44 |
+
fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
|
45 |
+
path = f"{abs_path}/{fname}"
|
46 |
+
|
47 |
+
with open(path, "wb") as f:
|
48 |
+
f.write(res.content)
|
49 |
+
print(path)
|
50 |
+
return path
|
51 |
+
|
52 |
+
|
53 |
+
# 语音转换 hubert-vits
|
54 |
+
def voice_hubert_vits(upload_path, id, format="wav", length=1, noise=0.667, noisew=0.8):
|
55 |
+
upload_name = os.path.basename(upload_path)
|
56 |
+
upload_type = f'audio/{upload_name.split(".")[1]}' # wav,ogg
|
57 |
+
|
58 |
+
with open(upload_path, 'rb') as upload_file:
|
59 |
+
fields = {
|
60 |
+
"upload": (upload_name, upload_file, upload_type),
|
61 |
+
"id": str(id),
|
62 |
+
"format": format,
|
63 |
+
"length": str(length),
|
64 |
+
"noise": str(noise),
|
65 |
+
"noisew": str(noisew),
|
66 |
+
}
|
67 |
+
boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
|
68 |
+
|
69 |
+
m = MultipartEncoder(fields=fields, boundary=boundary)
|
70 |
+
headers = {"Content-Type": m.content_type}
|
71 |
+
url = f"{base}/voice/hubert-vits"
|
72 |
+
|
73 |
+
res = requests.post(url=url, data=m, headers=headers)
|
74 |
+
fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
|
75 |
+
path = f"{abs_path}/{fname}"
|
76 |
+
|
77 |
+
with open(path, "wb") as f:
|
78 |
+
f.write(res.content)
|
79 |
+
print(path)
|
80 |
+
return path
|
81 |
+
|
82 |
+
|
83 |
+
# Dimensional-emotion model (w2v2-vits): synthesize `text` with an emotion
# reference index and save the returned audio next to this script.
def voice_w2v2_vits(text, id=0, format="wav", lang="auto", length=1, noise=0.667, noisew=0.8, max=50, emotion=0):
    """POST text to /voice/w2v2-vits and return the local path of the
    downloaded audio file."""
    form = {
        "text": text,
        "id": str(id),
        "format": format,
        "lang": lang,
        "length": str(length),
        "noise": str(noise),
        "noisew": str(noisew),
        "max": str(max),
        "emotion": str(emotion),
    }

    random_tail = ''.join(random.sample(string.ascii_letters + string.digits, 16))
    boundary = '----VoiceConversionFormBoundary' + random_tail

    encoder = MultipartEncoder(fields=form, boundary=boundary)
    response = requests.post(
        url=f"{base}/voice/w2v2-vits",
        data=encoder,
        headers={"Content-Type": encoder.content_type},
    )

    # The generated file name arrives in the Content-Disposition header.
    fname = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    saved_path = f"{abs_path}/{fname}"

    with open(saved_path, "wb") as out:
        out.write(response.content)
    print(saved_path)
    return saved_path
|
110 |
+
|
111 |
+
|
112 |
+
# Voice conversion between two speakers of the same VITS model.
def voice_conversion(upload_path, original_id, target_id):
    """POST an audio file to /voice/conversion, converting the voice of
    speaker `original_id` into that of `target_id`.

    :param upload_path: path of the source audio file (wav/ogg)
    :param original_id: speaker id of the uploaded audio
    :param target_id: speaker id to convert the audio to
    :return: local path of the downloaded converted audio
    """
    upload_name = os.path.basename(upload_path)
    # Fix: derive the MIME subtype from the actual extension instead of
    # upload_name.split(".")[1], which mishandled names with several dots
    # and raised IndexError for extensionless names.
    upload_type = f'audio/{os.path.splitext(upload_name)[1].lstrip(".")}'  # audio/wav, audio/ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
            "original_id": str(original_id),
            "target_id": str(target_id),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))
        m = MultipartEncoder(fields=fields, boundary=boundary)

        headers = {"Content-Type": m.content_type}
        url = f"{base}/voice/conversion"

        # POST while the uploaded file is still open for streaming.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    path = f"{abs_path}/{fname}"

    with open(path, "wb") as f:
        f.write(res.content)
    print(path)
    return path
|
138 |
+
|
139 |
+
|
140 |
+
def voice_ssml(ssml):
    """POST an SSML document to /voice/ssml and save the synthesized audio.

    Returns the local path of the downloaded file.
    """
    form = {"ssml": ssml}

    random_tail = ''.join(random.sample(string.ascii_letters + string.digits, 16))
    boundary = '----VoiceConversionFormBoundary' + random_tail

    encoder = MultipartEncoder(fields=form, boundary=boundary)
    response = requests.post(
        url=f"{base}/voice/ssml",
        data=encoder,
        headers={"Content-Type": encoder.content_type},
    )

    # File name of the generated audio comes back in Content-Disposition.
    fname = re.findall("filename=(.+)", response.headers["Content-Disposition"])[0]
    saved_path = f"{abs_path}/{fname}"

    with open(saved_path, "wb") as out:
        out.write(response.content)
    print(saved_path)
    return saved_path
|
158 |
+
|
159 |
+
|
160 |
+
def voice_dimensional_emotion(upload_path):
    """Upload an audio file to /voice/dimension-emotion and save the
    extracted emotion-embedding file the server returns.

    :param upload_path: path of the reference audio file (wav/ogg)
    :return: local path of the downloaded embedding file
    """
    upload_name = os.path.basename(upload_path)
    # Fix: derive the MIME subtype from the actual extension instead of
    # upload_name.split(".")[1], which broke on names with multiple dots
    # and raised IndexError for names without an extension.
    upload_type = f'audio/{os.path.splitext(upload_name)[1].lstrip(".")}'  # audio/wav, audio/ogg

    with open(upload_path, 'rb') as upload_file:
        fields = {
            "upload": (upload_name, upload_file, upload_type),
        }
        boundary = '----VoiceConversionFormBoundary' + ''.join(random.sample(string.ascii_letters + string.digits, 16))

        m = MultipartEncoder(fields=fields, boundary=boundary)
        headers = {"Content-Type": m.content_type}
        url = f"{base}/voice/dimension-emotion"

        # POST while the uploaded file is still open for streaming.
        res = requests.post(url=url, data=m, headers=headers)

    fname = re.findall("filename=(.+)", res.headers["Content-Disposition"])[0]
    path = f"{abs_path}/{fname}"

    with open(path, "wb") as f:
        f.write(res.content)
    print(path)
    return path
|
182 |
+
|
183 |
+
|
184 |
+
import time

# Manual smoke-test / benchmark section. Uncomment one of the calls below to
# exercise a different endpoint:
# voice_conversion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 91, 93)
# voice_hubert_vits("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav", 0)
# voice_vits(text, format="wav", lang="zh")
# voice_w2v2_vits(text, emotion=111)
# os.system(voice_ssml(ssml))
# voice_dimensional_emotion("H:/git/vits-simple-api/25ecb3f6-f968-11ed-b094-e0d4e84af078.wav")

# Sample SSML payload for the /voice/ssml endpoint.
ssml = """
<speak lang="auto">
<voice>这几天心里颇不宁静。</voice>
<voice>今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。</voice>
<voice>月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;</voice>
<voice>妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。</voice>
<voice>我悄悄地披了大衫,带上门出去。</voice><break time="2s"/>
<voice>沿着荷塘,是一条曲折的小煤屑路。</voice>
<voice>这是一条幽僻的路;白天也少人走,夜晚更加寂寞。</voice>
<voice>荷塘四面,长着许多树,蓊蓊郁郁的。</voice>
<voice>路的一旁,是些杨柳,和一些不知道名字的树。</voice>
<voice>没有月光的晚上,这路上阴森森的,有些怕人。</voice>
<voice>今晚却很好,虽然月光也还是淡淡的。</voice><break time="2s"/>
<voice>路上只我一个人,背着手踱着。</voice>
<voice>这一片天地好像是我的;我也像超出了平常的自己,到了另一个世界里。</voice>
<voice>我爱热闹,也爱冷静;<break strength="x-weak"/>爱群居,也爱独处。</voice>
<voice>像今晚上,一个人在这苍茫的月下,什么都可以想,什么都可以不想,便觉是个自由的人。</voice>
<voice>白天里一定要做的事,一定要说的话,现在都可不理。</voice>
<voice>这是独处的妙处,我且受用这无边的荷香月色好了。</voice>
</speak>
"""

# Sample plain-text payload.
text = """猫咪是爱撒娇、爱玩耍的小家伙,通常有着柔软的绒毛和温柔的眼神,是许多人都喜欢的宠物哦~它们特别喜欢舔自己的毛发,用柔顺的小脑袋搓人的脚丫子,还能给人带来很多欢乐和温馨。
"""

# Time a single /voice request and report text length vs. elapsed seconds.
t1 = time.time()
os.system(voice_vits(text, id=0, format="wav", max=0))
t2 = time.time()
print(f"len:{len(text)}耗时:{t2 - t1}")
|
test.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
from io import BytesIO

# Round-trip a small array through an in-memory .npy buffer.
array = np.array([1, 2, 3])

npy = BytesIO()
np.save(npy, array)
npy.seek(0)
# Fix: load from the in-memory buffer that was just written and rewound.
# The previous code loaded a hard-coded absolute path
# ("H:\git/vits-simple-api\Model/npy/...npy") that exists only on the
# original author's machine, leaving the BytesIO buffer as dead code.
tmp = np.load(npy)
print(tmp)
|
11 |
+
|
text/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/__init__.cpython-310.pyc and b/text/__pycache__/__init__.cpython-310.pyc differ
|
|
text/__pycache__/cantonese.cpython-310.pyc
ADDED
Binary file (2.34 kB). View file
|
|
text/__pycache__/cleaners.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/cleaners.cpython-310.pyc and b/text/__pycache__/cleaners.cpython-310.pyc differ
|
|
text/__pycache__/english.cpython-310.pyc
ADDED
Binary file (4.69 kB). View file
|
|
text/__pycache__/japanese.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/japanese.cpython-310.pyc and b/text/__pycache__/japanese.cpython-310.pyc differ
|
|
text/__pycache__/korean.cpython-310.pyc
ADDED
Binary file (5.58 kB). View file
|
|
text/__pycache__/mandarin.cpython-310.pyc
CHANGED
Binary files a/text/__pycache__/mandarin.cpython-310.pyc and b/text/__pycache__/mandarin.cpython-310.pyc differ
|
|
text/__pycache__/ngu_dialect.cpython-310.pyc
ADDED
Binary file (1.17 kB). View file
|
|
text/__pycache__/shanghainese.cpython-310.pyc
ADDED
Binary file (2.51 kB). View file
|
|
text/cantonese.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
|
|
4 |
|
5 |
-
|
6 |
-
converter = opencc.OpenCC('jyutjyu')
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
@@ -35,6 +35,16 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
35 |
('Z', 'iː˨sɛːt̚˥')
|
36 |
]]
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def number_to_cantonese(text):
|
40 |
return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
|
@@ -47,9 +57,10 @@ def latin_to_ipa(text):
|
|
47 |
|
48 |
|
49 |
def cantonese_to_ipa(text):
|
|
|
50 |
text = number_to_cantonese(text.upper())
|
51 |
-
text = converter.convert(text).replace('-','').replace('$',' ')
|
52 |
-
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
|
53 |
text = re.sub(r'[、;:]', ',', text)
|
54 |
text = re.sub(r'\s*,\s*', ', ', text)
|
55 |
text = re.sub(r'\s*。\s*', '. ', text)
|
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
4 |
+
import config
|
5 |
|
6 |
+
converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/jyutjyu_2')
|
|
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
|
35 |
('Z', 'iː˨sɛːt̚˥')
|
36 |
]]
|
37 |
|
38 |
+
# (pattern, replacement) pairs that spell out symbol notations in Chinese.
# Currently only percentages: "85%" -> "百分之85" (digits are converted to
# Chinese characters by a later stage).
_symbols_to_chinese = [(re.compile(f'{pattern}'), repl) for pattern, repl in [
    ('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
]]


def symbols_to_chinese(text):
    """Rewrite symbol expressions (e.g. percentages) as spoken Chinese."""
    for pattern, replacement in _symbols_to_chinese:
        text = re.sub(pattern, replacement, text)
    return text
|
47 |
+
|
48 |
|
49 |
def number_to_cantonese(text):
|
50 |
return re.sub(r'\d+(?:\.?\d+)?', lambda x: cn2an.an2cn(x.group()), text)
|
|
|
57 |
|
58 |
|
59 |
def cantonese_to_ipa(text):
|
60 |
+
text = symbols_to_chinese(text)
|
61 |
text = number_to_cantonese(text.upper())
|
62 |
+
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
63 |
+
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
|
64 |
text = re.sub(r'[、;:]', ',', text)
|
65 |
text = re.sub(r'\s*,\s*', ', ', text)
|
66 |
text = re.sub(r'\s*。\s*', '. ', text)
|
text/cleaners.py
CHANGED
@@ -1,10 +1,77 @@
|
|
1 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
|
4 |
def japanese_cleaners(text):
|
5 |
from text.japanese import japanese_to_romaji_with_accent
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
8 |
return text
|
9 |
|
10 |
|
@@ -15,20 +82,31 @@ def japanese_cleaners2(text):
|
|
15 |
def korean_cleaners(text):
|
16 |
'''Pipeline for Korean text'''
|
17 |
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
return text
|
23 |
|
24 |
|
25 |
def chinese_cleaners(text):
|
26 |
'''Pipeline for Chinese text'''
|
27 |
-
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
return text
|
33 |
|
34 |
|
@@ -36,9 +114,9 @@ def zh_ja_mixture_cleaners(text):
|
|
36 |
from text.mandarin import chinese_to_romaji
|
37 |
from text.japanese import japanese_to_romaji_with_accent
|
38 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
39 |
-
lambda x: chinese_to_romaji(x.group(1))+' ', text)
|
40 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
|
41 |
-
x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
|
42 |
text = re.sub(r'\s+$', '', text)
|
43 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
44 |
return text
|
@@ -57,15 +135,15 @@ def cjks_cleaners(text):
|
|
57 |
from text.sanskrit import devanagari_to_ipa
|
58 |
from text.english import english_to_lazy_ipa
|
59 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
60 |
-
lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
|
61 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
62 |
-
lambda x: japanese_to_ipa(x.group(1))+' ', text)
|
63 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
64 |
-
lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
|
65 |
text = re.sub(r'\[SA\](.*?)\[SA\]',
|
66 |
-
lambda x: devanagari_to_ipa(x.group(1))+' ', text)
|
67 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
68 |
-
lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
|
69 |
text = re.sub(r'\s+$', '', text)
|
70 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
71 |
return text
|
@@ -77,13 +155,13 @@ def cjke_cleaners(text):
|
|
77 |
from text.korean import korean_to_ipa
|
78 |
from text.english import english_to_ipa2
|
79 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
80 |
-
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')+' ', text)
|
81 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
82 |
-
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')+' ', text)
|
83 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
84 |
-
lambda x: korean_to_ipa(x.group(1))+' ', text)
|
85 |
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
86 |
-
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')+' ', text)
|
87 |
text = re.sub(r'\s+$', '', text)
|
88 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
89 |
return text
|
@@ -95,13 +173,28 @@ def cjke_cleaners2(text):
|
|
95 |
from text.korean import korean_to_ipa
|
96 |
from text.english import english_to_ipa2
|
97 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
98 |
-
lambda x: chinese_to_ipa(x.group(1))+' ', text)
|
99 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
100 |
-
lambda x: japanese_to_ipa2(x.group(1))+' ', text)
|
101 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
102 |
-
lambda x: korean_to_ipa(x.group(1))+' ', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
104 |
-
lambda x: english_to_ipa2(x.group(1))+' ', text)
|
105 |
text = re.sub(r'\s+$', '', text)
|
106 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
107 |
return text
|
@@ -109,15 +202,25 @@ def cjke_cleaners2(text):
|
|
109 |
|
110 |
def thai_cleaners(text):
|
111 |
from text.thai import num_to_thai, latin_to_thai
|
112 |
-
|
113 |
-
|
|
|
|
|
|
|
|
|
|
|
114 |
return text
|
115 |
|
116 |
|
117 |
def shanghainese_cleaners(text):
|
118 |
from text.shanghainese import shanghainese_to_ipa
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
121 |
return text
|
122 |
|
123 |
|
@@ -129,17 +232,18 @@ def chinese_dialect_cleaners(text):
|
|
129 |
from text.english import english_to_lazy_ipa2
|
130 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
131 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
132 |
-
lambda x: chinese_to_ipa2(x.group(1))+' ', text)
|
133 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
134 |
-
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ')+' ', text)
|
135 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
136 |
-
|
|
|
137 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
138 |
-
lambda x: cantonese_to_ipa(x.group(1))+' ', text)
|
139 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
140 |
-
lambda x: english_to_lazy_ipa2(x.group(1))+' ', text)
|
141 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
142 |
-
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ')+' ', text)
|
143 |
text = re.sub(r'\s+$', '', text)
|
144 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
145 |
return text
|
|
|
1 |
import re
|
2 |
+
import config
|
3 |
+
from unidecode import unidecode
|
4 |
+
from phonemizer import phonemize
|
5 |
+
from phonemizer.backend.espeak.wrapper import EspeakWrapper
|
6 |
+
|
7 |
+
ESPEAK_LIBRARY = getattr(config, "ESPEAK_LIBRARY", "")
|
8 |
+
if ESPEAK_LIBRARY != "":
|
9 |
+
EspeakWrapper.set_library(ESPEAK_LIBRARY)
|
10 |
+
|
11 |
+
# List of (regular expression, replacement) pairs for abbreviations:
|
12 |
+
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
|
13 |
+
('mrs', 'misess'),
|
14 |
+
('mr', 'mister'),
|
15 |
+
('dr', 'doctor'),
|
16 |
+
('st', 'saint'),
|
17 |
+
('co', 'company'),
|
18 |
+
('jr', 'junior'),
|
19 |
+
('maj', 'major'),
|
20 |
+
('gen', 'general'),
|
21 |
+
('drs', 'doctors'),
|
22 |
+
('rev', 'reverend'),
|
23 |
+
('lt', 'lieutenant'),
|
24 |
+
('hon', 'honorable'),
|
25 |
+
('sgt', 'sergeant'),
|
26 |
+
('capt', 'captain'),
|
27 |
+
('esq', 'esquire'),
|
28 |
+
('ltd', 'limited'),
|
29 |
+
('col', 'colonel'),
|
30 |
+
('ft', 'fort'),
|
31 |
+
]]
|
32 |
+
|
33 |
+
|
34 |
+
def expand_abbreviations(text):
|
35 |
+
for regex, replacement in _abbreviations:
|
36 |
+
text = re.sub(regex, replacement, text)
|
37 |
+
return text
|
38 |
+
|
39 |
+
|
40 |
+
def transliteration_cleaners(text):
|
41 |
+
'''Pipeline for non-English text that transliterates to ASCII.'''
|
42 |
+
text = unidecode(text)
|
43 |
+
text = text.lower()
|
44 |
+
text = re.sub(r'\s+', ' ', text)
|
45 |
+
text = expand_abbreviations(text)
|
46 |
+
return text
|
47 |
+
|
48 |
+
|
49 |
+
# for English text
|
50 |
+
def english_cleaners(text):
|
51 |
+
'''Pipeline for English text, including abbreviation expansion.'''
|
52 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
|
53 |
+
phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
|
54 |
+
return phonemes
|
55 |
+
|
56 |
+
|
57 |
+
# for non-English text that can be transliterated to ASCII
|
58 |
+
def english_cleaners2(text):
|
59 |
+
'''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
|
60 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: transliteration_cleaners(x.group(1)) + ' ', text)
|
61 |
+
phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True,
|
62 |
+
with_stress=True)
|
63 |
+
return phonemes
|
64 |
|
65 |
|
66 |
def japanese_cleaners(text):
|
67 |
from text.japanese import japanese_to_romaji_with_accent
|
68 |
+
|
69 |
+
def clean(text):
|
70 |
+
text = japanese_to_romaji_with_accent(text)
|
71 |
+
text = re.sub(r'([A-Za-z])$', r'\1.', text)
|
72 |
+
return text
|
73 |
+
|
74 |
+
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: clean(x.group(1)) + ' ', text)
|
75 |
return text
|
76 |
|
77 |
|
|
|
82 |
def korean_cleaners(text):
|
83 |
'''Pipeline for Korean text'''
|
84 |
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
|
85 |
+
|
86 |
+
def clean(text):
|
87 |
+
text = latin_to_hangul(text)
|
88 |
+
text = number_to_hangul(text)
|
89 |
+
text = divide_hangul(text)
|
90 |
+
text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
|
91 |
+
return text
|
92 |
+
|
93 |
+
text = re.sub(r'\[KO\](.*?)\[KO\]', lambda x: clean(x.group(1)) + ' ', text)
|
94 |
return text
|
95 |
|
96 |
|
97 |
def chinese_cleaners(text):
|
98 |
'''Pipeline for Chinese text'''
|
99 |
+
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, symbols_to_chinese
|
100 |
+
|
101 |
+
def clean(text):
|
102 |
+
text = symbols_to_chinese(text)
|
103 |
+
text = number_to_chinese(text)
|
104 |
+
text = chinese_to_bopomofo(text)
|
105 |
+
text = latin_to_bopomofo(text)
|
106 |
+
text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
|
107 |
+
return text
|
108 |
+
|
109 |
+
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: clean(x.group(1)) + ' ', text)
|
110 |
return text
|
111 |
|
112 |
|
|
|
114 |
from text.mandarin import chinese_to_romaji
|
115 |
from text.japanese import japanese_to_romaji_with_accent
|
116 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
117 |
+
lambda x: chinese_to_romaji(x.group(1)) + ' ', text)
|
118 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
|
119 |
+
x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…') + ' ', text)
|
120 |
text = re.sub(r'\s+$', '', text)
|
121 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
122 |
return text
|
|
|
135 |
from text.sanskrit import devanagari_to_ipa
|
136 |
from text.english import english_to_lazy_ipa
|
137 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
138 |
+
lambda x: chinese_to_lazy_ipa(x.group(1)) + ' ', text)
|
139 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
140 |
+
lambda x: japanese_to_ipa(x.group(1)) + ' ', text)
|
141 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
142 |
+
lambda x: korean_to_lazy_ipa(x.group(1)) + ' ', text)
|
143 |
text = re.sub(r'\[SA\](.*?)\[SA\]',
|
144 |
+
lambda x: devanagari_to_ipa(x.group(1)) + ' ', text)
|
145 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
146 |
+
lambda x: english_to_lazy_ipa(x.group(1)) + ' ', text)
|
147 |
text = re.sub(r'\s+$', '', text)
|
148 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
149 |
return text
|
|
|
155 |
from text.korean import korean_to_ipa
|
156 |
from text.english import english_to_ipa2
|
157 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
|
158 |
+
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
|
159 |
text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
|
160 |
+
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
|
161 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
162 |
+
lambda x: korean_to_ipa(x.group(1)) + ' ', text)
|
163 |
text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
|
164 |
+
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
|
165 |
text = re.sub(r'\s+$', '', text)
|
166 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
167 |
return text
|
|
|
173 |
from text.korean import korean_to_ipa
|
174 |
from text.english import english_to_ipa2
|
175 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
176 |
+
lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
|
177 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
178 |
+
lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
|
179 |
text = re.sub(r'\[KO\](.*?)\[KO\]',
|
180 |
+
lambda x: korean_to_ipa(x.group(1)) + ' ', text)
|
181 |
+
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
182 |
+
lambda x: english_to_ipa2(x.group(1)) + ' ', text)
|
183 |
+
text = re.sub(r'\s+$', '', text)
|
184 |
+
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
185 |
+
return text
|
186 |
+
|
187 |
+
|
188 |
+
def cje_cleaners(text):
|
189 |
+
from text.mandarin import chinese_to_ipa
|
190 |
+
from text.japanese import japanese_to_ipa2
|
191 |
+
from text.english import english_to_ipa2
|
192 |
+
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
193 |
+
lambda x: chinese_to_ipa(x.group(1)) + ' ', text)
|
194 |
+
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
195 |
+
lambda x: japanese_to_ipa2(x.group(1)) + ' ', text)
|
196 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
197 |
+
lambda x: english_to_ipa2(x.group(1)) + ' ', text)
|
198 |
text = re.sub(r'\s+$', '', text)
|
199 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
200 |
return text
|
|
|
202 |
|
203 |
def thai_cleaners(text):
|
204 |
from text.thai import num_to_thai, latin_to_thai
|
205 |
+
|
206 |
+
def clean(text):
|
207 |
+
text = num_to_thai(text)
|
208 |
+
text = latin_to_thai(text)
|
209 |
+
return text
|
210 |
+
|
211 |
+
text = re.sub(r'\[TH\](.*?)\[TH\]', lambda x: clean(x.group(1)) + ' ', text)
|
212 |
return text
|
213 |
|
214 |
|
215 |
def shanghainese_cleaners(text):
|
216 |
from text.shanghainese import shanghainese_to_ipa
|
217 |
+
|
218 |
+
def clean(text):
|
219 |
+
text = shanghainese_to_ipa(text)
|
220 |
+
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
221 |
+
return text
|
222 |
+
|
223 |
+
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: clean(x.group(1)) + ' ', text)
|
224 |
return text
|
225 |
|
226 |
|
|
|
232 |
from text.english import english_to_lazy_ipa2
|
233 |
from text.ngu_dialect import ngu_dialect_to_ipa
|
234 |
text = re.sub(r'\[ZH\](.*?)\[ZH\]',
|
235 |
+
lambda x: chinese_to_ipa2(x.group(1)) + ' ', text)
|
236 |
text = re.sub(r'\[JA\](.*?)\[JA\]',
|
237 |
+
lambda x: japanese_to_ipa3(x.group(1)).replace('Q', 'ʔ') + ' ', text)
|
238 |
text = re.sub(r'\[SH\](.*?)\[SH\]', lambda x: shanghainese_to_ipa(x.group(1)).replace('1', '˥˧').replace('5',
|
239 |
+
'˧˧˦').replace(
|
240 |
+
'6', '˩˩˧').replace('7', '˥').replace('8', '˩˨').replace('ᴀ', 'ɐ').replace('ᴇ', 'e') + ' ', text)
|
241 |
text = re.sub(r'\[GD\](.*?)\[GD\]',
|
242 |
+
lambda x: cantonese_to_ipa(x.group(1)) + ' ', text)
|
243 |
text = re.sub(r'\[EN\](.*?)\[EN\]',
|
244 |
+
lambda x: english_to_lazy_ipa2(x.group(1)) + ' ', text)
|
245 |
text = re.sub(r'\[([A-Z]{2})\](.*?)\[\1\]', lambda x: ngu_dialect_to_ipa(x.group(2), x.group(
|
246 |
+
1)).replace('ʣ', 'dz').replace('ʥ', 'dʑ').replace('ʦ', 'ts').replace('ʨ', 'tɕ') + ' ', text)
|
247 |
text = re.sub(r'\s+$', '', text)
|
248 |
text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
|
249 |
return text
|
text/mandarin.py
CHANGED
@@ -7,10 +7,9 @@ import cn2an
|
|
7 |
import logging
|
8 |
|
9 |
logging.getLogger('jieba').setLevel(logging.WARNING)
|
10 |
-
jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0]))+'/jieba/dict.txt')
|
11 |
jieba.initialize()
|
12 |
|
13 |
-
|
14 |
# List of (Latin alphabet, bopomofo) pairs:
|
15 |
_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
16 |
('a', 'ㄟˉ'),
|
@@ -236,9 +235,19 @@ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
236 |
('—', '-')
|
237 |
]]
|
238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
def number_to_chinese(text):
|
241 |
-
numbers = re.findall(r'
|
242 |
for number in numbers:
|
243 |
text = text.replace(number, cn2an.an2cn(number), 1)
|
244 |
return text
|
@@ -286,6 +295,7 @@ def bopomofo_to_ipa2(text):
|
|
286 |
|
287 |
|
288 |
def chinese_to_romaji(text):
|
|
|
289 |
text = number_to_chinese(text)
|
290 |
text = chinese_to_bopomofo(text)
|
291 |
text = latin_to_bopomofo(text)
|
@@ -306,6 +316,7 @@ def chinese_to_lazy_ipa(text):
|
|
306 |
|
307 |
|
308 |
def chinese_to_ipa(text):
|
|
|
309 |
text = number_to_chinese(text)
|
310 |
text = chinese_to_bopomofo(text)
|
311 |
text = latin_to_bopomofo(text)
|
@@ -319,6 +330,7 @@ def chinese_to_ipa(text):
|
|
319 |
|
320 |
|
321 |
def chinese_to_ipa2(text):
|
|
|
322 |
text = number_to_chinese(text)
|
323 |
text = chinese_to_bopomofo(text)
|
324 |
text = latin_to_bopomofo(text)
|
|
|
7 |
import logging
|
8 |
|
9 |
logging.getLogger('jieba').setLevel(logging.WARNING)
|
10 |
+
jieba.set_dictionary(os.path.dirname(os.path.realpath(sys.argv[0])) + '/jieba/dict.txt')
|
11 |
jieba.initialize()
|
12 |
|
|
|
13 |
# List of (Latin alphabet, bopomofo) pairs:
|
14 |
_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
|
15 |
('a', 'ㄟˉ'),
|
|
|
235 |
('—', '-')
|
236 |
]]
|
237 |
|
238 |
+
_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
|
239 |
+
('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
|
240 |
+
]]
|
241 |
+
|
242 |
+
|
243 |
+
def symbols_to_chinese(text):
|
244 |
+
for regex, replacement in _symbols_to_chinese:
|
245 |
+
text = re.sub(regex, replacement, text)
|
246 |
+
return text
|
247 |
+
|
248 |
|
249 |
def number_to_chinese(text):
|
250 |
+
numbers = re.findall(r'[0-9]+(?:\.?[0-9]+)?', text)
|
251 |
for number in numbers:
|
252 |
text = text.replace(number, cn2an.an2cn(number), 1)
|
253 |
return text
|
|
|
295 |
|
296 |
|
297 |
def chinese_to_romaji(text):
|
298 |
+
text = symbols_to_chinese(text)
|
299 |
text = number_to_chinese(text)
|
300 |
text = chinese_to_bopomofo(text)
|
301 |
text = latin_to_bopomofo(text)
|
|
|
316 |
|
317 |
|
318 |
def chinese_to_ipa(text):
|
319 |
+
text = symbols_to_chinese(text)
|
320 |
text = number_to_chinese(text)
|
321 |
text = chinese_to_bopomofo(text)
|
322 |
text = latin_to_bopomofo(text)
|
|
|
330 |
|
331 |
|
332 |
def chinese_to_ipa2(text):
|
333 |
+
text = symbols_to_chinese(text)
|
334 |
text = number_to_chinese(text)
|
335 |
text = chinese_to_bopomofo(text)
|
336 |
text = latin_to_bopomofo(text)
|
text/shanghainese.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
|
|
4 |
|
5 |
-
|
6 |
-
converter = opencc.OpenCC('zaonhe')
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
@@ -35,9 +35,19 @@ _latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
35 |
('Z', 'zᴇ')
|
36 |
]]
|
37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
|
39 |
def _number_to_shanghainese(num):
|
40 |
-
num = cn2an.an2cn(num).replace('一十','十').replace('二十', '廿').replace('二', '两')
|
41 |
return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
|
42 |
|
43 |
|
@@ -52,9 +62,10 @@ def latin_to_ipa(text):
|
|
52 |
|
53 |
|
54 |
def shanghainese_to_ipa(text):
|
|
|
55 |
text = number_to_shanghainese(text.upper())
|
56 |
-
text = converter.convert(text).replace('-','').replace('$',' ')
|
57 |
-
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group())+' ', text)
|
58 |
text = re.sub(r'[、;:]', ',', text)
|
59 |
text = re.sub(r'\s*,\s*', ', ', text)
|
60 |
text = re.sub(r'\s*。\s*', '. ', text)
|
|
|
1 |
import re
|
2 |
import cn2an
|
3 |
import opencc
|
4 |
+
import config
|
5 |
|
6 |
+
converter = opencc.OpenCC(config.ABS_PATH + '/chinese_dialect_lexicons/zaonhe')
|
|
|
7 |
|
8 |
# List of (Latin alphabet, ipa) pairs:
|
9 |
_latin_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
|
|
|
35 |
('Z', 'zᴇ')
|
36 |
]]
|
37 |
|
38 |
+
_symbols_to_chinese = [(re.compile(f'{x[0]}'), x[1]) for x in [
|
39 |
+
('([0-9]+(?:\.?[0-9]+)?)%', r'百分之\1'),
|
40 |
+
]]
|
41 |
+
|
42 |
+
|
43 |
+
def symbols_to_chinese(text):
|
44 |
+
for regex, replacement in _symbols_to_chinese:
|
45 |
+
text = re.sub(regex, replacement, text)
|
46 |
+
return text
|
47 |
+
|
48 |
|
49 |
def _number_to_shanghainese(num):
|
50 |
+
num = cn2an.an2cn(num).replace('一十', '十').replace('二十', '廿').replace('二', '两')
|
51 |
return re.sub(r'((?:^|[^三四五六七八九])十|廿)两', r'\1二', num)
|
52 |
|
53 |
|
|
|
62 |
|
63 |
|
64 |
def shanghainese_to_ipa(text):
|
65 |
+
text = symbols_to_chinese(text)
|
66 |
text = number_to_shanghainese(text.upper())
|
67 |
+
text = converter.convert(text).replace('-', '').replace('$', ' ')
|
68 |
+
text = re.sub(r'[A-Z]', lambda x: latin_to_ipa(x.group()) + ' ', text)
|
69 |
text = re.sub(r'[、;:]', ',', text)
|
70 |
text = re.sub(r'\s*,\s*', ', ', text)
|
71 |
text = re.sub(r'\s*。\s*', '. ', text)
|
utils/__pycache__/merge.cpython-310.pyc
ADDED
Binary file (3.95 kB). View file
|
|
utils/__pycache__/nlp.cpython-310.pyc
ADDED
Binary file (2.41 kB). View file
|
|
utils/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (4.02 kB). View file
|
|
utils/merge.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import config
|
5 |
+
import numpy as np
|
6 |
+
from utils.utils import check_is_none
|
7 |
+
from voice import vits, TTS
|
8 |
+
|
9 |
+
# Map each text cleaner name (from a model config's data.text_cleaners) to
# the language codes that cleaner supports; used to tag speakers with their
# languages when models are merged.
lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
}
|
28 |
+
|
29 |
+
|
30 |
+
def analysis(model_config_json):
    """Classify a model from its open config-JSON file object.

    A config that carries a symbol table is a text model: plain "vits", or
    "w2v2" when data.emotion_embedding is truthy. A config without symbols
    is assumed to be a hubert-soft voice-conversion model.

    :param model_config_json: open file-like object containing the config JSON
    :return: one of "vits", "w2v2", "hubert-soft"
    """
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    # Default to {} so a config without a "data" section doesn't crash.
    emotion_embedding = model_config.get("data", {}).get("emotion_embedding", False)
    if symbols is not None:
        mode_type = "vits" if not emotion_embedding else "w2v2"
    else:
        # NOTE(review): callers that compare the result against "hubert"
        # will never match this value — confirm whether "hubert-soft" vs
        # "hubert" is intended.
        mode_type = "hubert-soft"
    return mode_type
|
42 |
+
|
43 |
+
|
44 |
+
def load_npy(model_):
    """Load one or more .npy dimensional-emotion files into a single array.

    Accepts a list of .npy paths, a directory (searched recursively; non-.npy
    files are skipped), or a single .npy file. Each file is reshaped to
    (-1, 1024) and all rows are stacked; a single file is returned as loaded.

    :param model_: list of paths, a directory path, or a single file path
    :return: numpy array of emotion reference vectors
    :raises ValueError: on a non-.npy path or an unusable *model_* argument
    """
    if isinstance(model_, list):
        # Every entry must be a .npy file.
        for path in model_:
            ext = os.path.splitext(path)[1]
            if ext != ".npy":
                raise ValueError(f"Unsupported model type: {ext}")

        # Collect then concatenate once — avoids the O(n^2) cost of
        # np.append inside the loop.
        chunks = [np.load(path).reshape(-1, 1024) for path in model_]
        emotion_reference = np.concatenate(chunks, axis=0) if chunks else np.empty((0, 1024))

    elif os.path.isdir(model_):
        chunks = []
        for root, dirs, files in os.walk(model_):
            for file_name in files:
                # Silently skip anything that is not a .npy file.
                if os.path.splitext(file_name)[1] != ".npy":
                    continue
                file_path = os.path.join(root, file_name)
                chunks.append(np.load(file_path).reshape(-1, 1024))
        emotion_reference = np.concatenate(chunks, axis=0) if chunks else np.empty((0, 1024))

    elif os.path.isfile(model_):
        ext = os.path.splitext(model_)[1]
        if ext != ".npy":
            raise ValueError(f"Unsupported model type: {ext}")

        emotion_reference = np.load(model_)

    else:
        # Previously this fell through and raised an opaque NameError.
        raise ValueError(f"Cannot load emotion reference from {model_!r}: "
                         f"not a list, directory, or file")

    logging.info(f"Loaded emotional dimention npy range:{len(emotion_reference)}")
    return emotion_reference
|
81 |
+
|
82 |
+
|
83 |
+
def merge_model(merging_model):
    """Build a TTS object from a list of (model_path, config_path) pairs.

    Each config is inspected with analysis() to classify the model as
    vits / hubert / w2v2, the checkpoints are loaded, and the speakers of
    every model are flattened into one consecutively numbered speaker list
    per model type.

    :param merging_model: iterable of (model_path, config_path) pairs
    :return: a TTS instance wrapping all loaded models and their speakers
    :raises ValueError: when a required config.py entry is missing or a
        support model fails to load
    """
    vits_obj = []
    vits_speakers = []
    hubert_vits_obj = []
    hubert_vits_speakers = []
    w2v2_vits_obj = []
    w2v2_vits_speakers = []

    # model list
    vits_list = []
    hubert_vits_list = []
    w2v2_vits_list = []

    # Classify every (model, config) pair by reading its config JSON.
    # NOTE(review): analysis() returns "hubert-soft", not "hubert", so the
    # "hubert" branch below may never match — confirm the intended value.
    for l in merging_model:
        with open(l[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
            if model_type == "vits":
                vits_list.append(l)
            elif model_type == "hubert":
                hubert_vits_list.append(l)
            elif model_type == "w2v2":
                w2v2_vits_list.append(l)

    # merge vits: assign each speaker of each model a new global id.
    new_id = 0
    for obj_id, i in enumerate(vits_list):
        obj = vits(model=i[0], config=i[1], model_type="vits")
        # Map the model's text cleaner to its supported language codes.
        lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

        for id, name in enumerate(obj.return_speakers()):
            # [local speaker id, model object, model index] for dispatch.
            vits_obj.append([int(id), obj, obj_id])
            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1

    # merge hubert-vits: requires a shared hubert-soft encoder from config.
    if len(hubert_vits_list) != 0:
        if getattr(config, "HUBERT_SOFT_MODEL", None) == None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError(f"Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            from hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}")

        new_id = 0
        for obj_id, i in enumerate(hubert_vits_list):
            obj = vits(model=i[0], config=i[1], model_=hubert, model_type="hubert")
            lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

            for id, name in enumerate(obj.return_speakers()):
                hubert_vits_obj.append([int(id), obj, obj_id])
                hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    # merge w2v2-vits: requires shared dimensional-emotion references.
    if len(w2v2_vits_list) != 0:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}")

        new_id = 0
        for obj_id, i in enumerate(w2v2_vits_list):
            obj = vits(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2")
            lang = lang_dict.get(obj.get_cleaner(), obj.get_cleaner())

            for id, name in enumerate(obj.return_speakers()):
                w2v2_vits_obj.append([int(id), obj, obj_id])
                w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
                new_id += 1

    voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
    voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}

    tts = TTS(voice_obj, voice_speakers)

    return tts
|
utils/nlp.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import regex as re
|
2 |
+
import logging
|
3 |
+
import config
|
4 |
+
from fastlid import fastlid
|
5 |
+
from .utils import check_is_none
|
6 |
+
|
7 |
+
# Module-level logger; level is taken from config.LOGGING_LEVEL (default DEBUG).
logger = logging.getLogger("vits-simple-api")
level = getattr(config, "LOGGING_LEVEL", "DEBUG")
# Map the textual level name to the logging module's numeric constant.
level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
              'CRITICAL': logging.CRITICAL}
logger.setLevel(level_dict[level])
|
12 |
+
|
13 |
+
|
14 |
+
def clasify_lang(text):
    """Insert [LANG] markers around runs of same-language words in *text*.

    The text is split on punctuation, each fragment's language is detected
    with fastlid, and an opening tag is inserted before the first fragment
    and at every language change; a closing tag is appended at the end.
    The original punctuation is preserved because tags are spliced into the
    original string rather than rebuilt from the fragments.
    """
    pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
              r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
              r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
    words = re.split(pattern, text)

    pre = ""  # language of the previous fragment ("" before the first one)
    p = 0     # index into text up to which tags have already been inserted
    for word in words:

        if check_is_none(word): continue
        # fastlid returns (language_code, ...); keep only the code.
        lang = fastlid(word)[0]
        if pre == "":
            # First detected fragment: open the first language tag before it.
            text = text[:p] + text[p:].replace(word, f'[{lang.upper()}]' + word, 1)
            p += len(f'[{lang.upper()}]')
        elif pre != lang:
            # Language switch: close the previous tag, open the new one.
            text = text[:p] + text[p:].replace(word, f'[{pre.upper()}][{lang.upper()}]' + word, 1)
            p += len(f'[{pre.upper()}][{lang.upper()}]')
        pre = lang
        # Advance p past this word so later replacements only touch the tail.
        p += text[p:].index(word) + len(word)
    text += f"[{pre.upper()}]"

    return text
|
37 |
+
|
38 |
+
|
39 |
+
def cut(text, max):
    """Split *text* into chunks of at least *max* characters, cutting only
    at punctuation boundaries; a short trailing remainder becomes the last
    chunk."""
    delimiters = r'[\!\(\)\,\-\.\/\:\;\?\?\。\,\、\;\:]+'
    pieces = re.split(delimiters, text)
    chunks = []
    accumulated = 0  # running character count (each piece counts its length + 1)
    start = 0        # slice start of the next chunk within the original text
    for piece in pieces:
        accumulated += len(piece) + 1
        if accumulated >= max:
            chunks.append(text[start:start + accumulated])
            start += accumulated
            accumulated = 0
    if start < len(text):
        chunks.append(text[start:])
    return chunks
|
54 |
+
|
55 |
+
|
56 |
+
def sentence_split(text, max=50, lang="auto", speaker_lang=None):
    """Split *text* into chunks and wrap each in [LANG]...[LANG] markers.

    lang="auto" runs per-fragment language detection via clasify_lang();
    lang="mix" passes the text through untouched (caller supplies markers);
    any other value tags the whole text with that language. max<=0 disables
    length-based splitting.
    """
    # If this speaker only supports a single language, force that language.
    if speaker_lang is not None and len(speaker_lang) == 1:
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logger.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
            lang = speaker_lang[0]
    else:
        # Restrict fastlid detection to the speaker's supported languages.
        fastlid.set_languages = speaker_lang

    sentence_list = []
    if lang.upper() != "MIX":
        if max <= 0:
            # No length limit: handle the whole text as one chunk.
            sentence_list.append(
                clasify_lang(text) if lang.upper() == "AUTO" else f"[{lang.upper()}]{text}[{lang.upper()}]")
        else:
            for i in cut(text, max):
                if check_is_none(i): continue
                sentence_list.append(
                    clasify_lang(i) if lang.upper() == "AUTO" else f"[{lang.upper()}]{i}[{lang.upper()}]")
    else:
        sentence_list.append(text)

    for i in sentence_list:
        logger.debug(i)

    return sentence_list
|
utils/utils.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
from json import loads
|
4 |
+
import av
|
5 |
+
from torch import load, FloatTensor
|
6 |
+
from numpy import float32
|
7 |
+
import librosa
|
8 |
+
|
9 |
+
|
10 |
+
class HParams():
    """Dict-like hyper-parameter container with attribute access.

    Nested plain dicts are converted recursively, so config["data"]["x"]
    becomes hps.data.x. Supports len(), "key in hps", hps[key] get/set,
    and the keys()/items()/values() mapping views.
    """

    def __init__(self, **kwargs):
        for name, raw in kwargs.items():
            # Recurse into plain dicts so nesting is attribute-addressable.
            self[name] = HParams(**raw) if type(raw) == dict else raw

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return repr(self.__dict__)
|
40 |
+
|
41 |
+
|
42 |
+
def load_checkpoint(checkpoint_path, model):
    """Load the weights stored at *checkpoint_path* into *model*.

    Handles DataParallel-wrapped models (via the ``module`` attribute).
    Parameters missing from the checkpoint keep the model's current values
    and are reported via logging.

    :param checkpoint_path: path or file-like object readable by torch.load
    :param model: torch module (optionally wrapped) to receive the weights
    """
    checkpoint_dict = load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            # Narrowed from a bare except: only a missing key should fall
            # back to the model's existing parameter.
            logging.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logging.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return
|
64 |
+
|
65 |
+
|
66 |
+
def get_hparams_from_file(config_path):
    """Parse the JSON model config at *config_path* into a nested HParams object."""
    with open(config_path, 'r', encoding='utf-8') as f:
        parsed = loads(f.read())

    return HParams(**parsed)
|
73 |
+
|
74 |
+
|
75 |
+
def load_audio_to_torch(full_path, target_sampling_rate):
    """Load the audio file at *full_path* as a mono FloatTensor resampled to *target_sampling_rate*."""
    samples, _sr = librosa.load(full_path, sr=target_sampling_rate, mono=True)
    return FloatTensor(samples.astype(float32))
|
78 |
+
|
79 |
+
|
80 |
+
def wav2ogg(input, output):
    """Transcode a WAV file/stream (*input*) to Ogg Vorbis (*output*) using PyAV."""
    with av.open(input, 'rb') as src, av.open(output, 'wb', format='ogg') as dst:
        encoder = dst.add_stream('libvorbis')
        for frame in src.decode(audio=0):
            for packet in encoder.encode(frame):
                dst.mux(packet)

        # Flush any packets still buffered in the encoder.
        for packet in encoder.encode(None):
            dst.mux(packet)
|
90 |
+
|
91 |
+
def wav2mp3(input, output):
    """Transcode a WAV file/stream (*input*) to MP3 (*output*) using PyAV."""
    with av.open(input, 'rb') as src, av.open(output, 'wb', format='mp3') as dst:
        encoder = dst.add_stream('mp3')
        for frame in src.decode(audio=0):
            for packet in encoder.encode(frame):
                dst.mux(packet)

        # Flush any packets still buffered in the encoder.
        for packet in encoder.encode(None):
            dst.mux(packet)
|
101 |
+
|
102 |
+
def clean_folder(folder_path):
    """Delete every regular file directly inside *folder_path*; subdirectories are left untouched."""
    with os.scandir(folder_path) as entries:
        for entry in entries:
            if entry.is_file():
                os.remove(entry.path)
|
108 |
+
|
109 |
+
|
110 |
+
def check_is_none(s):
    """Return True when *s* is None, empty, or whitespace-only; otherwise False."""
    if s is None:
        return True
    if isinstance(s, str) and s.isspace():
        return True
    return str(s) == ""
|
vits-simple-api-installer-latest.sh
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Install or upgrade vits-simple-api into INSTALL_DIR via Docker Compose.
INSTALL_DIR=/usr/local/vits-simple-api

# ANSI color escape codes for highlighted terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
PLAIN='\033[0m'

mkdir -p $INSTALL_DIR
cd $INSTALL_DIR
# Fetch the default config only when none exists, so user edits survive upgrades.
if [ ! -f config.py ]; then
echo -e "${YELLOW}download config.py\n${PLAIN}"
wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
fi

# Always refresh the compose file to the latest published version.
wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml

echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"

# Pull the latest image and (re)start the service in the background.
docker compose pull
docker compose up -d

echo -e "\nThe upgrade or installation has been completed."
echo -e "The configuration file directory is $(realpath $INSTALL_DIR)"
echo -e "${YELLOW}If the vits model is not imported, it cannot be used. Import the model in the configuration file directory.${PLAIN}"
echo -e "After modifying the configuration file, restart the docker container for the modification to take effect."
echo -e "${YELLOW}If you have any questions, please put them in the issues.${PLAIN}"
echo -e "https://github.com/Artrajz/vits-simple-api"
|
voice.py
CHANGED
@@ -1,32 +1,30 @@
|
|
1 |
import os
|
2 |
-
|
3 |
import librosa
|
4 |
-
from scipy.io.wavfile import write
|
5 |
-
from mel_processing import spectrogram_torch
|
6 |
-
from text import text_to_sequence, _clean_text
|
7 |
-
from models import SynthesizerTrn
|
8 |
-
import utils
|
9 |
import commons
|
10 |
import sys
|
11 |
import re
|
12 |
import numpy as np
|
13 |
-
|
14 |
-
|
|
|
|
|
15 |
from torch import no_grad, LongTensor, inference_mode, FloatTensor
|
16 |
-
import audonnx
|
17 |
-
import uuid
|
18 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
|
|
|
|
20 |
|
21 |
-
class Voice:
|
22 |
-
def __init__(self, model, config, out_path=None):
|
23 |
-
self.out_path = out_path
|
24 |
-
if not os.path.exists(self.out_path):
|
25 |
-
try:
|
26 |
-
os.mkdir(self.out_path)
|
27 |
-
except:
|
28 |
-
pass
|
29 |
|
|
|
|
|
|
|
30 |
self.hps_ms = utils.get_hparams_from_file(config)
|
31 |
self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
|
32 |
self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
|
@@ -42,9 +40,19 @@ class Voice:
|
|
42 |
emotion_embedding=self.emotion_embedding,
|
43 |
**self.hps_ms.model)
|
44 |
_ = self.net_g_ms.eval()
|
|
|
|
|
|
|
|
|
|
|
45 |
utils.load_checkpoint(model, self.net_g_ms)
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
def
|
48 |
if cleaned:
|
49 |
text_norm = text_to_sequence(text, hps.symbols, [])
|
50 |
else:
|
@@ -54,7 +62,7 @@ class Voice:
|
|
54 |
text_norm = LongTensor(text_norm)
|
55 |
return text_norm
|
56 |
|
57 |
-
def get_label_value(self,
|
58 |
value = re.search(rf'\[{label}=(.+?)\]', text)
|
59 |
if value:
|
60 |
try:
|
@@ -65,16 +73,10 @@ class Voice:
|
|
65 |
sys.exit(1)
|
66 |
else:
|
67 |
value = default
|
68 |
-
|
69 |
-
|
70 |
-
def ex_return(self, text, escape=False):
|
71 |
-
if escape:
|
72 |
-
return text.encode('unicode_escape').decode()
|
73 |
else:
|
74 |
-
return text
|
75 |
-
|
76 |
-
def return_speakers(self, escape=False):
|
77 |
-
return self.speakers
|
78 |
|
79 |
def get_label(self, text, label):
|
80 |
if f'[{label}]' in text:
|
@@ -82,132 +84,152 @@ class Voice:
|
|
82 |
else:
|
83 |
return False, text
|
84 |
|
85 |
-
def
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
x_tst_lengths = LongTensor([stn_tst.size(0)])
|
98 |
-
sid = LongTensor([speaker_id])
|
99 |
-
audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid,
|
100 |
-
noise_scale=noise_scale,
|
101 |
-
noise_scale_w=noise_scale_w,
|
102 |
-
length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
104 |
# else:
|
105 |
-
#
|
106 |
-
#
|
107 |
-
#
|
108 |
-
#
|
109 |
-
#
|
110 |
-
#
|
111 |
-
#
|
112 |
-
#
|
113 |
-
|
114 |
-
# text, 'NOISE', 0.667, 'noise scale')
|
115 |
-
# noise_scale_w, text = self.get_label_value(
|
116 |
-
# text, 'NOISEW', 0.8, 'deviation of noise')
|
117 |
-
# cleaned, text = self.get_label(text, 'CLEANED')
|
118 |
-
#
|
119 |
-
# stn_tst = self.get_text(text, self.hps_ms, cleaned=cleaned)
|
120 |
-
#
|
121 |
-
# emotion_reference = input('Path of an emotion reference: ')
|
122 |
-
# if emotion_reference.endswith('.npy'):
|
123 |
-
# emotion = np.load(emotion_reference)
|
124 |
-
# emotion = FloatTensor(emotion).unsqueeze(0)
|
125 |
-
# else:
|
126 |
-
# audio16000, sampling_rate = librosa.load(
|
127 |
-
# emotion_reference, sr=16000, mono=True)
|
128 |
-
# emotion = w2v2_model(audio16000, sampling_rate)[
|
129 |
-
# 'hidden_states']
|
130 |
-
# emotion_reference = re.sub(
|
131 |
-
# r'\..*$', '', emotion_reference)
|
132 |
-
# np.save(emotion_reference, emotion.squeeze(0))
|
133 |
-
# emotion = FloatTensor(emotion)
|
134 |
-
#
|
135 |
-
#
|
136 |
-
# with no_grad():
|
137 |
-
# x_tst = stn_tst.unsqueeze(0)
|
138 |
-
# x_tst_lengths = LongTensor([stn_tst.size(0)])
|
139 |
-
# sid = LongTensor([speaker_id])
|
140 |
-
# audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
141 |
-
# noise_scale_w=noise_scale_w,
|
142 |
-
# length_scale=length_scale, emotion_embedding=emotion)[0][
|
143 |
-
# 0, 0].data.cpu().float().numpy()
|
144 |
-
|
145 |
-
# else:
|
146 |
-
# model = input('Path of a hubert-soft Model: ')
|
147 |
-
# from hubert_model import hubert_soft
|
148 |
-
# hubert = hubert_soft(model)
|
149 |
-
|
150 |
-
# if audio_path != '[VC]':
|
151 |
-
# if self.use_f0:
|
152 |
-
# audio, sampling_rate = librosa.load(
|
153 |
-
# audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
|
154 |
-
# audio16000 = librosa.resample(
|
155 |
-
# audio, orig_sr=sampling_rate, target_sr=16000)
|
156 |
-
# else:
|
157 |
-
# audio16000, sampling_rate = librosa.load(
|
158 |
-
# audio_path, sr=16000, mono=True)
|
159 |
-
#
|
160 |
-
# out_path = "H:/git/MoeGoe-Simple-API/upload/hubert.wav"
|
161 |
-
# length_scale, out_path = self.get_label_value(
|
162 |
-
# out_path, 'LENGTH', 1, 'length scale')
|
163 |
-
# noise_scale, out_path = self.get_label_value(
|
164 |
-
# out_path, 'NOISE', 0.1, 'noise scale')
|
165 |
-
# noise_scale_w, out_path = self.get_label_value(
|
166 |
-
# out_path, 'NOISEW', 0.1, 'deviation of noise')
|
167 |
-
#
|
168 |
-
# with inference_mode():
|
169 |
-
# units = hubert.units(FloatTensor(audio16000).unsqueeze(
|
170 |
-
# 0).unsqueeze(0)).squeeze(0).numpy()
|
171 |
-
# if self.use_f0:
|
172 |
-
# f0_scale, out_path = self.get_label_value(
|
173 |
-
# out_path, 'F0', 1, 'f0 scale')
|
174 |
-
# f0 = librosa.pyin(audio, sr=sampling_rate,
|
175 |
-
# fmin=librosa.note_to_hz('C0'),
|
176 |
-
# fmax=librosa.note_to_hz('C7'),
|
177 |
-
# frame_length=1780)[0]
|
178 |
-
# target_length = len(units[:, 0])
|
179 |
-
# f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
|
180 |
-
# np.arange(0, len(f0)), f0)) * f0_scale
|
181 |
-
# units[:, 0] = f0 / 10
|
182 |
-
#
|
183 |
-
# stn_tst = FloatTensor(units)
|
184 |
-
# with no_grad():
|
185 |
-
# x_tst = stn_tst.unsqueeze(0)
|
186 |
-
# x_tst_lengths = LongTensor([stn_tst.size(0)])
|
187 |
-
# sid = LongTensor([target_id])
|
188 |
-
# audio = self.net_g_ms.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale,
|
189 |
-
# noise_scale_w=noise_scale_w, length_scale=length_scale)[0][
|
190 |
-
# 0, 0].data.float().numpy()
|
191 |
|
192 |
-
with BytesIO() as f:
|
193 |
-
fname = str(uuid.uuid1())
|
194 |
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
file_path = self.out_path + "/" + fname + ".wav"
|
202 |
-
write(file_path, 24000, audio)
|
203 |
-
silk_path = utils.convert_to_silk(file_path)
|
204 |
-
os.remove(file_path)
|
205 |
-
return silk_path, "audio/silk", fname + ".silk"
|
206 |
else:
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
audio = utils.load_audio_to_torch(
|
213 |
audio_path, self.hps_ms.data.sampling_rate)
|
@@ -223,9 +245,242 @@ class Voice:
|
|
223 |
|
224 |
with no_grad():
|
225 |
sid_tgt = LongTensor([target_id])
|
226 |
-
audio = self.net_g_ms.voice_conversion(spec,
|
227 |
-
|
|
|
|
|
|
|
|
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
with BytesIO() as f:
|
230 |
-
write(f,
|
231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
|
|
2 |
import librosa
|
|
|
|
|
|
|
|
|
|
|
3 |
import commons
|
4 |
import sys
|
5 |
import re
|
6 |
import numpy as np
|
7 |
+
import torch
|
8 |
+
import xml.etree.ElementTree as ET
|
9 |
+
import config
|
10 |
+
import logging
|
11 |
from torch import no_grad, LongTensor, inference_mode, FloatTensor
|
|
|
|
|
12 |
from io import BytesIO
|
13 |
+
from graiax import silkcoder
|
14 |
+
from utils.nlp import cut, sentence_split
|
15 |
+
from scipy.io.wavfile import write
|
16 |
+
from mel_processing import spectrogram_torch
|
17 |
+
from text import text_to_sequence, _clean_text
|
18 |
+
from models import SynthesizerTrn
|
19 |
+
from utils import utils
|
20 |
|
21 |
+
# torch.set_num_threads(1) # 设置torch线程为1
|
22 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
class vits:
|
26 |
+
def __init__(self, model, config, model_=None, model_type=None):
|
27 |
+
self.model_type = model_type
|
28 |
self.hps_ms = utils.get_hparams_from_file(config)
|
29 |
self.n_speakers = self.hps_ms.data.n_speakers if 'n_speakers' in self.hps_ms.data.keys() else 0
|
30 |
self.n_symbols = len(self.hps_ms.symbols) if 'symbols' in self.hps_ms.keys() else 0
|
|
|
40 |
emotion_embedding=self.emotion_embedding,
|
41 |
**self.hps_ms.model)
|
42 |
_ = self.net_g_ms.eval()
|
43 |
+
|
44 |
+
# load model
|
45 |
+
self.load_model(model, model_)
|
46 |
+
|
47 |
+
def load_model(self, model, model_=None):
|
48 |
utils.load_checkpoint(model, self.net_g_ms)
|
49 |
+
self.net_g_ms.to(device)
|
50 |
+
if self.model_type == "hubert":
|
51 |
+
self.hubert = model_
|
52 |
+
elif self.model_type == "w2v2":
|
53 |
+
self.emotion_reference = model_
|
54 |
|
55 |
+
def get_cleaned_text(self, text, hps, cleaned=False):
|
56 |
if cleaned:
|
57 |
text_norm = text_to_sequence(text, hps.symbols, [])
|
58 |
else:
|
|
|
62 |
text_norm = LongTensor(text_norm)
|
63 |
return text_norm
|
64 |
|
65 |
+
def get_label_value(self, label, default, warning_name='value', text=""):
|
66 |
value = re.search(rf'\[{label}=(.+?)\]', text)
|
67 |
if value:
|
68 |
try:
|
|
|
73 |
sys.exit(1)
|
74 |
else:
|
75 |
value = default
|
76 |
+
if text == "":
|
77 |
+
return value
|
|
|
|
|
|
|
78 |
else:
|
79 |
+
return value, text
|
|
|
|
|
|
|
80 |
|
81 |
def get_label(self, text, label):
|
82 |
if f'[{label}]' in text:
|
|
|
84 |
else:
|
85 |
return False, text
|
86 |
|
87 |
+
def get_cleaner(self):
|
88 |
+
return getattr(self.hps_ms.data, 'text_cleaners', [None])[0]
|
89 |
+
|
90 |
+
def return_speakers(self, escape=False):
|
91 |
+
return self.speakers
|
92 |
+
|
93 |
+
def infer(self, params):
|
94 |
+
emotion = params.get("emotion", None)
|
95 |
+
|
96 |
+
with no_grad():
|
97 |
+
x_tst = params.get("stn_tst").unsqueeze(0)
|
98 |
+
x_tst_lengths = LongTensor([params.get("stn_tst").size(0)])
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
+
audio = self.net_g_ms.infer(x_tst.to(device), x_tst_lengths.to(device), sid=params.get("sid").to(device),
|
101 |
+
noise_scale=params.get("noise_scale"),
|
102 |
+
noise_scale_w=params.get("noise_scale_w"),
|
103 |
+
length_scale=params.get("length_scale"),
|
104 |
+
emotion_embedding=emotion.to(device) if emotion != None else None)[0][
|
105 |
+
0, 0].data.float().cpu().numpy()
|
106 |
+
|
107 |
+
torch.cuda.empty_cache()
|
108 |
+
return audio
|
109 |
+
|
110 |
+
def get_infer_param(self, length, noise, noisew, text=None, speaker_id=None, audio_path=None,
|
111 |
+
emotion=None):
|
112 |
+
emo = None
|
113 |
+
if self.model_type != "hubert":
|
114 |
+
length_scale, text = self.get_label_value('LENGTH', length, 'length scale', text)
|
115 |
+
noise_scale, text = self.get_label_value('NOISE', noise, 'noise scale', text)
|
116 |
+
noise_scale_w, text = self.get_label_value('NOISEW', noisew, 'deviation of noise', text)
|
117 |
+
cleaned, text = self.get_label(text, 'CLEANED')
|
118 |
+
|
119 |
+
stn_tst = self.get_cleaned_text(text, self.hps_ms, cleaned=cleaned)
|
120 |
+
sid = LongTensor([speaker_id])
|
121 |
+
|
122 |
+
if self.model_type == "w2v2":
|
123 |
+
# if emotion_reference.endswith('.npy'):
|
124 |
+
# emotion = np.load(emotion_reference)
|
125 |
+
# emotion = FloatTensor(emotion).unsqueeze(0)
|
126 |
# else:
|
127 |
+
# audio16000, sampling_rate = librosa.load(
|
128 |
+
# emotion_reference, sr=16000, mono=True)
|
129 |
+
# emotion = self.w2v2(audio16000, sampling_rate)[
|
130 |
+
# 'hidden_states']
|
131 |
+
# emotion_reference = re.sub(
|
132 |
+
# r'\..*$', '', emotion_reference)
|
133 |
+
# np.save(emotion_reference, emotion.squeeze(0))
|
134 |
+
# emotion = FloatTensor(emotion)
|
135 |
+
emo = torch.FloatTensor(self.emotion_reference[emotion]).unsqueeze(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
|
|
|
|
|
137 |
|
138 |
+
elif self.model_type == "hubert":
|
139 |
+
if self.use_f0:
|
140 |
+
audio, sampling_rate = librosa.load(
|
141 |
+
audio_path, sr=self.hps_ms.data.sampling_rate, mono=True)
|
142 |
+
audio16000 = librosa.resample(
|
143 |
+
audio, orig_sr=sampling_rate, target_sr=16000)
|
|
|
|
|
|
|
|
|
|
|
144 |
else:
|
145 |
+
audio16000, sampling_rate = librosa.load(
|
146 |
+
audio_path, sr=16000, mono=True)
|
147 |
+
|
148 |
+
length_scale = self.get_label_value('LENGTH', length, 'length scale')
|
149 |
+
noise_scale = self.get_label_value('NOISE', noise, 'noise scale')
|
150 |
+
noise_scale_w = self.get_label_value('NOISEW', noisew, 'deviation of noise')
|
151 |
+
|
152 |
+
with inference_mode():
|
153 |
+
units = self.hubert.units(FloatTensor(audio16000).unsqueeze(0).unsqueeze(0)).squeeze(0).numpy()
|
154 |
+
if self.use_f0:
|
155 |
+
f0_scale = self.get_label_value('F0', 1, 'f0 scale')
|
156 |
+
f0 = librosa.pyin(audio,
|
157 |
+
sr=sampling_rate,
|
158 |
+
fmin=librosa.note_to_hz('C0'),
|
159 |
+
fmax=librosa.note_to_hz('C7'),
|
160 |
+
frame_length=1780)[0]
|
161 |
+
target_length = len(units[:, 0])
|
162 |
+
f0 = np.nan_to_num(np.interp(np.arange(0, len(f0) * target_length, len(f0)) / target_length,
|
163 |
+
np.arange(0, len(f0)), f0)) * f0_scale
|
164 |
+
units[:, 0] = f0 / 10
|
165 |
+
|
166 |
+
stn_tst = FloatTensor(units)
|
167 |
+
sid = LongTensor([speaker_id])
|
168 |
+
params = {"length_scale": length_scale, "noise_scale": noise_scale,
|
169 |
+
"noise_scale_w": noise_scale_w, "stn_tst": stn_tst,
|
170 |
+
"sid": sid, "emotion": emo}
|
171 |
+
return params
|
172 |
+
|
173 |
+
def get_audio(self, voice, auto_break=False):
|
174 |
+
text = voice.get("text", None)
|
175 |
+
speaker_id = voice.get("id", 0)
|
176 |
+
length = voice.get("length", 1)
|
177 |
+
noise = voice.get("noise", 0.667)
|
178 |
+
noisew = voice.get("noisew", 0.8)
|
179 |
+
max = voice.get("max", 50)
|
180 |
+
lang = voice.get("lang", "auto")
|
181 |
+
speaker_lang = voice.get("speaker_lang", None)
|
182 |
+
audio_path = voice.get("audio_path", None)
|
183 |
+
emotion = voice.get("emotion", 0)
|
184 |
|
185 |
+
# 去除所有多余的空白字符
|
186 |
+
if text is not None: text = re.sub(r'\s+', ' ', text).strip()
|
187 |
+
|
188 |
+
# 停顿0.75s,避免语音分段合成再拼接后的连接突兀
|
189 |
+
brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
|
190 |
+
|
191 |
+
tasks = []
|
192 |
+
if self.model_type == "vits":
|
193 |
+
sentence_list = sentence_split(text, max, lang, speaker_lang)
|
194 |
+
for sentence in sentence_list:
|
195 |
+
tasks.append(
|
196 |
+
self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
|
197 |
+
noisew=noisew))
|
198 |
+
audios = []
|
199 |
+
|
200 |
+
for task in tasks:
|
201 |
+
audios.append(self.infer(task))
|
202 |
+
if auto_break:
|
203 |
+
audios.append(brk)
|
204 |
+
|
205 |
+
audio = np.concatenate(audios, axis=0)
|
206 |
+
|
207 |
+
elif self.model_type == "hubert":
|
208 |
+
params = self.get_infer_param(speaker_id=speaker_id, length=length, noise=noise, noisew=noisew,
|
209 |
+
audio_path=audio_path)
|
210 |
+
audio = self.infer(params)
|
211 |
+
|
212 |
+
elif self.model_type == "w2v2":
|
213 |
+
sentence_list = sentence_split(text, max, lang, speaker_lang)
|
214 |
+
for sentence in sentence_list:
|
215 |
+
tasks.append(
|
216 |
+
self.get_infer_param(text=sentence, speaker_id=speaker_id, length=length, noise=noise,
|
217 |
+
noisew=noisew, emotion=emotion))
|
218 |
+
|
219 |
+
audios = []
|
220 |
+
for task in tasks:
|
221 |
+
audios.append(self.infer(task))
|
222 |
+
if auto_break:
|
223 |
+
audios.append(brk)
|
224 |
+
|
225 |
+
audio = np.concatenate(audios, axis=0)
|
226 |
+
|
227 |
+
return audio
|
228 |
+
|
229 |
+
def voice_conversion(self, voice):
|
230 |
+
audio_path = voice.get("audio_path")
|
231 |
+
original_id = voice.get("original_id")
|
232 |
+
target_id = voice.get("target_id")
|
233 |
|
234 |
audio = utils.load_audio_to_torch(
|
235 |
audio_path, self.hps_ms.data.sampling_rate)
|
|
|
245 |
|
246 |
with no_grad():
|
247 |
sid_tgt = LongTensor([target_id])
|
248 |
+
audio = self.net_g_ms.voice_conversion(spec.to(device),
|
249 |
+
spec_lengths.to(device),
|
250 |
+
sid_src=sid_src.to(device),
|
251 |
+
sid_tgt=sid_tgt.to(device))[0][0, 0].data.cpu().float().numpy()
|
252 |
+
|
253 |
+
torch.cuda.empty_cache()
|
254 |
|
255 |
+
return audio
|
256 |
+
|
257 |
+
|
258 |
+
class TTS:
    """Facade over all loaded voice models (VITS / HuBERT-VITS / W2V2-VITS).

    Args:
        voice_obj: mapping of model-type name to a list of
            (real_speaker_id, model_object, model_index) entries.
        voice_speakers: mapping of model-type name to its speaker list.
    """

    def __init__(self, voice_obj, voice_speakers):
        self._voice_obj = voice_obj
        self._voice_speakers = voice_speakers
        self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
        self._speakers_count = sum(len(self._voice_speakers[i]) for i in self._voice_speakers)
        self._vits_speakers_count = len(self._voice_speakers["VITS"])
        self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
        self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])

        # BUG FIX: the logger must be created before the dimensional-emotion
        # model is loaded below — the except branch logs through self.logger,
        # which previously was not assigned yet (AttributeError on failure).
        self.logger = logging.getLogger("vits-simple-api")

        # Optional dimensional-emotion model (audonnx); best-effort load.
        self.dem = None
        if getattr(config, "DIMENSIONAL_EMOTION_MODEL", None) is not None:
            try:
                import audonnx
                root = os.path.dirname(config.DIMENSIONAL_EMOTION_MODEL)
                model_file = config.DIMENSIONAL_EMOTION_MODEL
                self.dem = audonnx.load(root=root, model_file=model_file)
            except Exception as e:
                self.logger.warning(f"Load DIMENSIONAL_EMOTION_MODEL failed {e}")

        # Initialization information
        self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
        self.logger.info(f'device:{device} device.type:{device.type}')
        if self._vits_speakers_count != 0: self.logger.info(f"[VITS] {self._vits_speakers_count} speakers")
        if self._hubert_speakers_count != 0: self.logger.info(f"[hubert] {self._hubert_speakers_count} speakers")
        if self._w2v2_speakers_count != 0: self.logger.info(f"[w2v2] {self._w2v2_speakers_count} speakers")
        self.logger.info(f"{self._speakers_count} speakers in total")
        if self._speakers_count == 0:
            self.logger.warning("No model was loaded")
@property
|
289 |
+
def voice_speakers(self):
|
290 |
+
return self._voice_speakers
|
291 |
+
|
292 |
+
@property
|
293 |
+
def speakers_count(self):
|
294 |
+
return self._speakers_count
|
295 |
+
|
296 |
+
@property
|
297 |
+
def vits_speakers_count(self):
|
298 |
+
return self._vits_speakers_count
|
299 |
+
|
300 |
+
@property
|
301 |
+
def hubert_speakers_count(self):
|
302 |
+
return self._hubert_speakers_count
|
303 |
+
|
304 |
+
@property
|
305 |
+
def w2v2_speakers_count(self):
|
306 |
+
return self._w2v2_speakers_count
|
307 |
+
|
308 |
+
def encode(self, sampling_rate, audio, format):
|
309 |
with BytesIO() as f:
|
310 |
+
write(f, sampling_rate, audio)
|
311 |
+
if format.upper() == 'OGG':
|
312 |
+
with BytesIO() as o:
|
313 |
+
utils.wav2ogg(f, o)
|
314 |
+
return BytesIO(o.getvalue())
|
315 |
+
elif format.upper() == 'SILK':
|
316 |
+
return BytesIO(silkcoder.encode(f))
|
317 |
+
elif format.upper() == 'MP3':
|
318 |
+
with BytesIO() as o:
|
319 |
+
utils.wav2mp3(f, o)
|
320 |
+
return BytesIO(o.getvalue())
|
321 |
+
elif format.upper() == 'WAV':
|
322 |
+
return BytesIO(f.getvalue())
|
323 |
+
|
324 |
+
def convert_time_string(self, time_string):
|
325 |
+
time_value = float(re.findall(r'\d+\.?\d*', time_string)[0])
|
326 |
+
time_unit = re.findall(r'[a-zA-Z]+', time_string)[0].lower()
|
327 |
+
|
328 |
+
if time_unit.upper() == 'MS':
|
329 |
+
return time_value / 1000
|
330 |
+
elif time_unit.upper() == 'S':
|
331 |
+
return time_value
|
332 |
+
elif time_unit.upper() == 'MIN':
|
333 |
+
return time_value * 60
|
334 |
+
elif time_unit.upper() == 'H':
|
335 |
+
return time_value * 3600
|
336 |
+
elif time_unit.upper() == 'D':
|
337 |
+
return time_value * 24 * 3600 # 不会有人真写D吧?
|
338 |
+
else:
|
339 |
+
raise ValueError("Unsupported time unit: {}".format(time_unit))
|
340 |
+
|
341 |
+
def parse_ssml(self, ssml):
|
342 |
+
root = ET.fromstring(ssml)
|
343 |
+
format = root.attrib.get("format", "wav")
|
344 |
+
voice_tasks = []
|
345 |
+
brk_count = 0
|
346 |
+
strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
|
347 |
+
|
348 |
+
for element in root.iter():
|
349 |
+
if element.tag == "voice":
|
350 |
+
id = int(element.attrib.get("id", root.attrib.get("id", config.ID)))
|
351 |
+
lang = element.attrib.get("lang", root.attrib.get("lang", config.LANG))
|
352 |
+
length = float(element.attrib.get("length", root.attrib.get("length", config.LENGTH)))
|
353 |
+
noise = float(element.attrib.get("noise", root.attrib.get("noise", config.NOISE)))
|
354 |
+
noisew = float(element.attrib.get("noisew", root.attrib.get("noisew", config.NOISEW)))
|
355 |
+
max = int(element.attrib.get("max", root.attrib.get("max", "0")))
|
356 |
+
# 不填写默认就是vits
|
357 |
+
model = element.attrib.get("model", root.attrib.get("model", "vits"))
|
358 |
+
# w2v2-vits/emotion-vits才有emotion
|
359 |
+
emotion = int(element.attrib.get("emotion", root.attrib.get("emotion", 0)))
|
360 |
+
|
361 |
+
voice_element = ET.tostring(element, encoding='unicode')
|
362 |
+
|
363 |
+
pattern_voice = r'<voice.*?>(.*?)</voice>'
|
364 |
+
pattern_break = r'<break\s*?(.*?)\s*?/>'
|
365 |
+
|
366 |
+
matches_voice = re.findall(pattern_voice, voice_element)[0]
|
367 |
+
matches_break = re.split(pattern_break, matches_voice)
|
368 |
+
for match in matches_break:
|
369 |
+
strength = re.search(r'\s*strength\s*=\s*[\'\"](.*?)[\'\"]', match)
|
370 |
+
time = re.search(r'\s*time\s*=\s*[\'\"](.*?)[\'\"]', match)
|
371 |
+
# break标签 strength属性
|
372 |
+
if strength:
|
373 |
+
brk = strength_dict[strength.group(1)]
|
374 |
+
voice_tasks.append({"break": brk})
|
375 |
+
brk_count += 1
|
376 |
+
# break标签 time属性
|
377 |
+
elif time:
|
378 |
+
brk = self.convert_time_string(time.group(1))
|
379 |
+
voice_tasks.append({"break": brk})
|
380 |
+
brk_count += 1
|
381 |
+
# break标签 为空说明只写了break,默认停顿0.75s
|
382 |
+
elif match == "":
|
383 |
+
voice_tasks.append({"break": 0.75})
|
384 |
+
brk_count += 1
|
385 |
+
# voice标签中除了break剩下的就是文本
|
386 |
+
else:
|
387 |
+
voice_tasks.append({"id": id,
|
388 |
+
"text": match,
|
389 |
+
"lang": lang,
|
390 |
+
"length": length,
|
391 |
+
"noise": noise,
|
392 |
+
"noisew": noisew,
|
393 |
+
"max": max,
|
394 |
+
"model": model,
|
395 |
+
"emotion": emotion
|
396 |
+
})
|
397 |
+
|
398 |
+
# 分段末尾停顿0.75s
|
399 |
+
voice_tasks.append({"break": 0.75})
|
400 |
+
elif element.tag == "break":
|
401 |
+
# brk_count大于0说明voice标签中有break
|
402 |
+
if brk_count > 0:
|
403 |
+
brk_count -= 1
|
404 |
+
continue
|
405 |
+
brk = strength_dict.get(element.attrib.get("strength"),
|
406 |
+
self.convert_time_string(element.attrib.get("time", "750ms")))
|
407 |
+
voice_tasks.append({"break": brk})
|
408 |
+
|
409 |
+
for i in voice_tasks:
|
410 |
+
self.logger.debug(i)
|
411 |
+
|
412 |
+
return voice_tasks, format
|
413 |
+
|
414 |
+
def create_ssml_infer_task(self, ssml):
|
415 |
+
voice_tasks, format = self.parse_ssml(ssml)
|
416 |
+
|
417 |
+
audios = []
|
418 |
+
for voice in voice_tasks:
|
419 |
+
if voice.get("break"):
|
420 |
+
audios.append(np.zeros(int(voice.get("break") * 22050), dtype=np.int16))
|
421 |
+
else:
|
422 |
+
model = voice.get("model").upper()
|
423 |
+
if model != "VITS" and model != "W2V2-VITS" and model != "EMOTION-VITS":
|
424 |
+
raise ValueError(f"Unsupported model: {voice.get('model')}")
|
425 |
+
voice_obj = self._voice_obj[model][voice.get("id")][1]
|
426 |
+
voice["id"] = self._voice_obj[model][voice.get("id")][0]
|
427 |
+
|
428 |
+
audios.append(voice_obj.get_audio(voice))
|
429 |
+
|
430 |
+
audio = np.concatenate(audios, axis=0)
|
431 |
+
|
432 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format), format
|
433 |
+
|
434 |
+
def vits_infer(self, voice):
|
435 |
+
format = voice.get("format", "wav")
|
436 |
+
voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
|
437 |
+
voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
|
438 |
+
audio = voice_obj.get_audio(voice, auto_break=True)
|
439 |
+
|
440 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
441 |
+
|
442 |
+
def hubert_vits_infer(self, voice):
|
443 |
+
format = voice.get("format", "wav")
|
444 |
+
voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
|
445 |
+
voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
|
446 |
+
audio = voice_obj.get_audio(voice)
|
447 |
+
|
448 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
449 |
+
|
450 |
+
def w2v2_vits_infer(self, voice):
|
451 |
+
format = voice.get("format", "wav")
|
452 |
+
voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
|
453 |
+
voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
|
454 |
+
audio = voice_obj.get_audio(voice, auto_break=True)
|
455 |
+
|
456 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
457 |
+
|
458 |
+
def vits_voice_conversion(self, voice):
|
459 |
+
original_id = voice.get("original_id")
|
460 |
+
target_id = voice.get("target_id")
|
461 |
+
format = voice.get("format")
|
462 |
+
|
463 |
+
original_id_obj = int(self._voice_obj["VITS"][original_id][2])
|
464 |
+
target_id_obj = int(self._voice_obj["VITS"][target_id][2])
|
465 |
+
|
466 |
+
if original_id_obj != target_id_obj:
|
467 |
+
raise ValueError(f"speakers are in diffrent VITS Model")
|
468 |
+
|
469 |
+
voice["original_id"] = int(self._voice_obj["VITS"][original_id][0])
|
470 |
+
voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
|
471 |
+
|
472 |
+
voice_obj = self._voice_obj["VITS"][original_id][1]
|
473 |
+
audio = voice_obj.voice_conversion(voice)
|
474 |
+
|
475 |
+
return self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
|
476 |
+
|
477 |
+
def get_dimensional_emotion_npy(self, audio):
|
478 |
+
if self.dem is None:
|
479 |
+
raise ValueError(f"Please configure DIMENSIONAL_EMOTION_MODEL path in config.py")
|
480 |
+
audio16000, sampling_rate = librosa.load(audio, sr=16000, mono=True)
|
481 |
+
emotion = self.dem(audio16000, sampling_rate)['hidden_states']
|
482 |
+
emotion_npy = BytesIO()
|
483 |
+
np.save(emotion_npy, emotion.squeeze(0))
|
484 |
+
emotion_npy.seek(0)
|
485 |
+
|
486 |
+
return emotion_npy
|