Upload 44 files
- Dockerfile +8 -8
- Dockerfile_GPU +37 -0
- README_zh.md +16 -14
- app.py +14 -28
- config.py +10 -4
- docker-compose-gpu.yaml +15 -0
- docker-compose.yaml +3 -1
- gunicorn_config.py +4 -0
- logger.py +42 -0
- requirements.txt +2 -1
- static/css/style.css +84 -0
- templates/index.html +267 -121
- text/cleaners.py +15 -0
- text/mandarin.py +2 -3
- utils/merge.py +16 -8
- utils/nlp.py +1 -7
- vits-simple-api-installer-latest.sh +26 -1
- voice.py +14 -15
Dockerfile
CHANGED
@@ -6,15 +6,13 @@ WORKDIR /app
 ENV DEBIAN_FRONTEND=noninteractive
 
 RUN apt-get update && \
-    apt install build-essential -yq && \
-    apt install espeak-ng -yq && \
-    apt install cmake -yq && \
-    apt install -y wget -yq && \
+    apt-get install -yq build-essential espeak-ng cmake wget && \
     apt-get clean && \
     apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
     rm -rf /var/lib/apt/lists/*
 
-RUN pip install …
+RUN pip install --upgrade pip --no-cache-dir && \
+    pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
 
 RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
     tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
@@ -25,13 +23,15 @@ RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openj
     rm -f openjtalk-0.3.0.dev2.tar.gz && \
     rm -rf openjtalk-0.3.0.dev2
 
-RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
+RUN pip install torch --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
 
 COPY requirements.txt /app
-RUN pip install -r requirements.txt
+RUN pip install -r requirements.txt --no-cache-dir
+
+RUN pip install gunicorn --no-cache-dir
 
 COPY . /app
 
 EXPOSE 23456
 
-CMD ["…
+CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
Dockerfile_GPU
ADDED
@@ -0,0 +1,37 @@
+FROM python:3.10.11-slim-bullseye
+
+RUN mkdir -p /app
+WORKDIR /app
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+    apt-get install -yq build-essential espeak-ng cmake wget && \
+    apt-get clean && \
+    apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN pip install --upgrade pip --no-cache-dir && \
+    pip install MarkupSafe==2.1.2 numpy==1.23.3 cython six==1.16.0 safetensors==0.3.2 --no-cache-dir
+
+RUN wget https://raw.githubusercontent.com/Artrajz/archived/main/openjtalk/openjtalk-0.3.0.dev2.tar.gz && \
+    tar -zxvf openjtalk-0.3.0.dev2.tar.gz && \
+    cd openjtalk-0.3.0.dev2 && \
+    rm -rf ./pyopenjtalk/open_jtalk_dic_utf_8-1.11 && \
+    python setup.py install && \
+    cd ../ && \
+    rm -f openjtalk-0.3.0.dev2.tar.gz && \
+    rm -rf openjtalk-0.3.0.dev2
+
+RUN pip install torch --index-url https://download.pytorch.org/whl/cu117 --no-cache-dir
+
+COPY requirements.txt /app
+RUN pip install -r requirements.txt --no-cache-dir
+
+RUN pip install gunicorn --no-cache-dir
+
+COPY . /app
+
+EXPOSE 23456
+
+CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"]
README_zh.md
CHANGED
@@ -63,7 +63,7 @@
 
 - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164`
-- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=…`
+- `https://artrajz-vits-simple-api.hf.space/voice/vits?text=我觉得1%2B1≠3&id=164&lang=zh` (some characters in a GET request must be escaped, otherwise they are filtered out)
 - `https://artrajz-vits-simple-api.hf.space/voice/vits?text=Difficult the first time, easy the second.&id=4`
 - Excited: `https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=111`
 - Whisper: `https://artrajz-vits-simple-api.hf.space/voice/w2v2-vits?text=こんにちは&id=3&emotion=2077`
@@ -495,14 +495,15 @@ def voice_dimensional_emotion(upload_path):
 
 | Name | Parameter | Is must | Default | Type | Instruction |
 | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
-| Synthesized text | text | true | | str | |
-| Speaker id | id | false | 0 | int | |
+| Synthesized text | text | true | | str | The text to synthesize. |
+| Speaker id | id | false | 0 | int | The speaker id. |
 | Audio format | format | false | wav | str | Supports wav, ogg, silk, mp3, flac |
 | Text language | lang | false | auto | str | auto (the default) detects the language automatically. With lang=mix, wrap the text in [ZH] or [JA]. Dialects cannot be auto-detected. |
-| Audio length / speed | length | false | 1.0 | float | |
-| Noise | noise | false | 0.… | float | |
-| … |
+| Audio length / speed | length | false | 1.0 | float | Adjusts the audio length, i.e. the speaking speed; larger values mean slower speech. |
+| Noise | noise | false | 0.33 | float | Sample noise, controls the randomness of the synthesis. |
+| SDP noise | noisew | false | 0.4 | float | Stochastic duration predictor noise, controls phoneme duration. |
 | Segmentation threshold | max | false | 50 | int | Splits the text at punctuation; pieces are merged into one segment until their combined length exceeds max. max<=0 disables segmentation. |
+| Streaming response | streaming | false | false | bool | Streamed synthesis, for a faster first-chunk response. |
 
 ## VITS Voice Conversion
 
@@ -516,12 +517,12 @@ def voice_dimensional_emotion(upload_path):
 
 | Name | Parameter | Is must | Default | Type | Instruction |
 | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------ |
-| Uploaded audio | upload | true | | file | |
-| Target speaker id | id | true | | int | |
+| Uploaded audio | upload | true | | file | The audio file whose speaker is to be converted. |
+| Target speaker id | id | true | | int | The target speaker id. |
 | Audio format | format | true | | str | wav, ogg, silk |
 | Audio length / speed | length | true | | float | Adjusts the audio length, i.e. the speaking speed; larger values mean slower speech |
-| Noise | noise | true | | float | |
-| … |
+| Noise | noise | true | | float | Sample noise, controls the randomness of the synthesis. |
+| SDP noise | noisew | true | | float | Stochastic duration predictor noise, controls phoneme duration. |
 
 ## Dimensional emotion
 
@@ -533,13 +534,13 @@ def voice_dimensional_emotion(upload_path):
 
 | Name | Parameter | Is must | Default | Type | Instruction |
 | ------------- | --------- | ------- | ------- | ----- | ------------------------------------------------------------ |
-| Synthesized text | text | true | | str | |
-| Speaker id | id | false | 0 | int | |
+| Synthesized text | text | true | | str | The text to synthesize. |
+| Speaker id | id | false | 0 | int | The speaker id. |
 | Audio format | format | false | wav | str | Supports wav, ogg, silk, mp3, flac |
 | Text language | lang | false | auto | str | auto (the default) detects the language automatically. With lang=mix, wrap the text in [ZH] or [JA]. Dialects cannot be auto-detected. |
 | Audio length / speed | length | false | 1.0 | float | Adjusts the audio length, i.e. the speaking speed; larger values mean slower speech |
-| Noise | noise | false | 0.… | float | |
-| … |
+| Noise | noise | false | 0.33 | float | Sample noise, controls the randomness of the synthesis. |
+| SDP noise | noisew | false | 0.4 | float | Stochastic duration predictor noise, controls phoneme duration. |
 | Segmentation threshold | max | false | 50 | int | Splits the text at punctuation; pieces are merged into one segment until their combined length exceeds max. max<=0 disables segmentation. |
 | Dimensional emotion | emotion | false | 0 | int | The range depends on the npy emotion reference file; e.g. the all_emotions.npy model from [innnky](https://huggingface.co/spaces/innnky/nene-emotion/tree/main) covers 0-5457 |
 
@@ -623,4 +624,5 @@ def voice_dimensional_emotion(upload_path):
 - MoeGoe: https://github.com/CjangCjengh/MoeGoe
 - emotional-vits: https://github.com/innnky/emotional-vits
 - vits-uma-genshin-honkai: https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai
+- vits_chinese: https://github.com/PlayVoice/vits_chinese
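For quick reference, a minimal Python sketch of calling the /voice/vits endpoint documented in the tables above. The base URL is the public demo Space from the README, and the parameter values are illustrative only; point it at your own deployment as needed.

    import requests

    # Minimal sketch: GET /voice/vits with the documented parameters.
    BASE_URL = "https://artrajz-vits-simple-api.hf.space"

    params = {
        "text": "你好,こんにちは",  # text to synthesize
        "id": 164,                  # speaker id
        "format": "wav",            # wav/ogg/silk/mp3/flac
        "lang": "auto",             # automatic language detection
        "length": 1.0,              # larger = slower speech
        "noise": 0.33,              # sample noise
        "noisew": 0.4,              # SDP noise
        "max": 50,                  # segmentation threshold
    }
    response = requests.get(f"{BASE_URL}/voice/vits", params=params)
    with open("output.wav", "wb") as f:
        f.write(response.content)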
app.py
CHANGED
@@ -1,8 +1,7 @@
 import os
-import logging
 import time
-import logzero
 import uuid
+from logger import logger
 from flask import Flask, request, send_file, jsonify, make_response, render_template
 from werkzeug.utils import secure_filename
 from flask_apscheduler import APScheduler
@@ -19,24 +18,15 @@ scheduler.init_app(app)
 
 if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
     scheduler.start()
 
-…
-logging.basicConfig(level=level_dict[level])
-logging.getLogger('numba').setLevel(logging.WARNING)
-logging.getLogger("langid.langid").setLevel(logging.INFO)
-logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
+for path in (app.config['LOGS_PATH'], app.config['UPLOAD_FOLDER'], app.config['CACHE_PATH']):
+    try:
+        os.makedirs(path, exist_ok=True)
+    except Exception as e:
+        logger.error(f"Unable to create directory {path}: {str(e)}")
 
+# load model
 tts = merge_model(app.config["MODEL_LIST"])
 
-if not os.path.exists(app.config['UPLOAD_FOLDER']):
-    os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
-
-if not os.path.exists(app.config['CACHE_PATH']):
-    os.makedirs(app.config['CACHE_PATH'], exist_ok=True)
 
 def require_api_key(func):
     @wraps(func)
@@ -57,7 +47,10 @@ def require_api_key(func):
 
 def index():
     kwargs = {
         "speakers": tts.voice_speakers,
-        "speakers_count": tts.speakers_count
+        "speakers_count": tts.speakers_count,
+        "vits_speakers_count": tts._vits_speakers_count,
+        "w2v2_speakers_count": tts._w2v2_speakers_count,
+        "w2v2_emotion_count": tts._w2v2_emotion_count
     }
     return render_template("index.html", **kwargs)
 
@@ -362,25 +355,18 @@ def ssml():
 
         return make_response(jsonify({"status": "error", "message": f"parameter error"}), 400)
 
     logger.debug(ssml)
+    voice_tasks, format = tts.parse_ssml(ssml)
     fname = f"{str(uuid.uuid1())}.{format}"
     file_type = f"audio/{format}"
 
     t1 = time.time()
-    audio = …
+    audio = tts.create_ssml_infer_task(voice_tasks, format, fname)
     t2 = time.time()
     if app.config.get("SAVE_AUDIO", False):
         logger.debug(f"[ssml] {fname}")
     logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
 
-    …
-        audio = tts.generate_audio_chunks(audio)
-        response = make_response(audio)
-        response.headers['Content-Disposition'] = f'attachment; filename={fname}'
-        response.headers['Content-Type'] = file_type
-        return response
-    else:
-        return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
+    return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
 
 
 @app.route('/voice/dimension-emotion', methods=["POST"])
config.py
CHANGED
@@ -12,7 +12,7 @@ DEBUG = False
 PORT = 7860
 
 # Absolute path of vits-simple-api
-ABS_PATH = os.path.…
+ABS_PATH = os.path.dirname(os.path.realpath(__file__))
 
 # Upload path
 UPLOAD_FOLDER = ABS_PATH + "/upload"
@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
 
 # Cache path
 CACHE_PATH = ABS_PATH + "/cache"
 
+# Logs path
+LOGS_PATH = ABS_PATH + "/logs"
+
+# Set the number of backup log files to keep.
+LOGS_BACKUPCOUNT = 30
+
 # If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
 CLEAN_INTERVAL_SECONDS = 3600
 
@@ -39,7 +45,7 @@ API_KEY = "api-key"
 
 LOGGING_LEVEL = "DEBUG"
 
 # Language identification library. Optional fastlid, langid
-LANGUAGE_IDENTIFICATION_LIBRARY = "…
+LANGUAGE_IDENTIFICATION_LIBRARY = "fastlid"
 
 # To use the english_cleaner, you need to install espeak and provide the path of libespeak-ng.dll as input here.
 # If ESPEAK_LIBRARY is set to empty, it will be read from the environment variable.
@@ -48,7 +54,7 @@ ESPEAK_LIBRARY = ""
 
 # Fill in the model path here
 MODEL_LIST = [
-
+    # VITS
     [ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/1374_epochs.pth", ABS_PATH + "/Model/Nene_Nanami_Rong_Tang/config.json"],
     [ABS_PATH + "/Model/vctk/pretrained_vctk.pth", ABS_PATH + "/Model/vctk/vctk_base.json"],
    [ABS_PATH + "/Model/paimon/paimon6k_390000.pth", ABS_PATH + "/Model/paimon/paimon6k.json"],
@@ -73,7 +79,7 @@ HUBERT_SOFT_MODEL = ABS_PATH + "/Model/hubert-soft-0d54a1f4.pt"
 
 DIMENSIONAL_EMOTION_NPY = ABS_PATH + "/Model/npy"
 
 # w2v2-vits: Need to have both `model.onnx` and `model.yaml` files in the same path.
-DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
+# DIMENSIONAL_EMOTION_MODEL = ABS_PATH + "/Model/model.yaml"
 
 """
 Default parameter
docker-compose-gpu.yaml
ADDED
@@ -0,0 +1,15 @@
+version: '3.4'
+services:
+  vits:
+    image: artrajz/vits-simple-api:latest-gpu
+    restart: always
+    ports:
+      - 23456:23456
+    environment:
+      LANG: 'C.UTF-8'
+      TZ: Asia/Shanghai # timezone
+    volumes:
+      - ./Model:/app/Model # mount the model folder
+      - ./config.py:/app/config.py # mount the config file
+      - ./logs:/app/logs # logging logs
+      - ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
docker-compose.yaml
CHANGED
@@ -10,4 +10,6 @@ services:
       TZ: Asia/Shanghai # timezone
     volumes:
       - ./Model:/app/Model # mount the model folder
-      - ./config.py:/app/config.py # mount the config file
+      - ./config.py:/app/config.py # mount the config file
+      - ./logs:/app/logs # logging logs
+      - ./gunicorn_config.py:/app/gunicorn_config.py # gunicorn configuration
gunicorn_config.py
ADDED
@@ -0,0 +1,4 @@
+import multiprocessing
+
+bind = "0.0.0.0:23456"
+workers = multiprocessing.cpu_count()
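This file is consumed by the image's start command (CMD ["gunicorn", "-c", "gunicorn_config.py", "app:app"] in both Dockerfiles). One caveat: each gunicorn worker imports app.py and therefore loads its own copy of the models. A hedged sketch of a capped variant follows; the cap of 4 is an illustrative assumption, not part of this commit.

    import multiprocessing

    bind = "0.0.0.0:23456"
    # Hypothetical variant: cap the worker count so that N workers do not
    # load N copies of the models and exhaust memory on small machines.
    workers = min(multiprocessing.cpu_count(), 4)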
logger.py
ADDED
@@ -0,0 +1,42 @@
+import os
+import sys
+import logging
+import logzero
+import config
+from logging.handlers import TimedRotatingFileHandler
+
+logzero.loglevel(logging.WARNING)
+logger = logging.getLogger("vits-simple-api")
+level = getattr(config, "LOGGING_LEVEL", "DEBUG")
+level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
+              'CRITICAL': logging.CRITICAL}
+logging.basicConfig(level=level_dict[level])
+logging.getLogger('numba').setLevel(logging.WARNING)
+logging.getLogger("langid.langid").setLevel(logging.INFO)
+logging.getLogger("apscheduler.scheduler").setLevel(logging.INFO)
+
+os.makedirs(config.LOGS_PATH, exist_ok=True)
+log_file = os.path.join(config.LOGS_PATH, 'latest.log')
+backup_count = getattr(config, "LOGS_BACKUPCOUNT", 30)
+handler = TimedRotatingFileHandler(log_file, when="midnight", interval=1, backupCount=backup_count, encoding='utf-8')
+handler.suffix = "%Y-%m-%d.log"
+formatter = logging.Formatter('%(levelname)s:%(name)s %(message)s')
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+logging.getLogger("werkzeug").addHandler(handler)
+logging.getLogger("apscheduler.scheduler").addHandler(handler)
+
+
+# Custom function to handle uncaught exceptions
+def handle_exception(exc_type, exc_value, exc_traceback):
+    # If it's a keyboard interrupt, don't handle it, just return
+    if issubclass(exc_type, KeyboardInterrupt):
+        sys.__excepthook__(exc_type, exc_value, exc_traceback)
+        return
+
+    logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
+
+
+# Set the global exception handler in Python
+sys.excepthook = handle_exception
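Usage is a single import, as the other files in this commit show (app.py, utils/nlp.py and voice.py all switch to it):

    from logger import logger

    # Messages go to the console and to logs/latest.log, rotated at
    # midnight and kept for LOGS_BACKUPCOUNT days.
    logger.info("server starting")
    logger.debug("loaded 3 models")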
requirements.txt
CHANGED
@@ -27,4 +27,5 @@ fasttext
 fastlid
 langid
 phonemizer==3.2.1
-transformers
+transformers
+pydantic==1.10.6
static/css/style.css
ADDED
@@ -0,0 +1,84 @@
+.main-container {
+    position: relative;
+    width: 100%;
+    min-height: 300px;
+}
+
+.container {
+    width: 300px;
+    position: relative;
+}
+
+
+/*tabs*/
+.tabs {
+    display: flex;
+    left: 0;
+}
+
+.tab-button {
+    display: inline-block;
+    background-color: transparent;
+    padding: 5px 10px;
+    cursor: pointer;
+    margin-bottom: -2px;
+    border-top: 2px solid transparent;
+    border-left: 2px solid transparent;
+    border-right: 2px solid transparent;
+    border-bottom: 0px;
+    border-top-left-radius: 0.5rem;
+    border-top-right-radius: 0.5rem;
+    color: gray;
+}
+
+.tab-button.active {
+    background-color: white;
+    border-top: 2px solid #dee2e6;
+    border-left: 2px solid #dee2e6;
+    border-right: 2px solid #dee2e6;
+    color: black;
+}
+
+/*content*/
+
+.content {
+    border: gray;
+    border-left-width: 2px;
+}
+
+.content-pane {
+    display: none;
+    padding: 20px;
+}
+
+.content-pane.active {
+    display: flex;
+    -ms-flex-wrap: wrap;
+    flex-wrap: wrap;
+}
+
+*, :before, :after {
+    box-sizing: border-box;
+    border-width: 0;
+    border-style: solid;
+    border-color: #e5e7eb;
+}
+
+
+.flex {
+    display: flex;
+}
+
+.border-transparent {
+    border-color: transparent;
+}
+
+.border-b-2 {
+    border-bottom: 2px solid #dee2e6;
+}
+
+.border-lr-2 {
+    border-left: 2px solid #dee2e6;
+    border-right: 2px solid #dee2e6;
+}
templates/index.html
CHANGED
@@ -4,126 +4,230 @@
     <meta charset="UTF-8"/>
     <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
     <title>vits-simple-api</title>
+    <link rel="stylesheet" href="/static/css/style.css">
     <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
 </head>
 <body>
-<main …>
-    <div>
-        <label>文档:</label>
-        <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
-           style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
-    </div>
-    <div>
-        <label>返回speakers(json):</label>
-        <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
-           style="text-decoration: none; color: black">
-            https://artrajz-vits-simple-api.hf.space/voice/speakers
-        </a>
-    </div>
-    <div>
-        <label>简单调用api:</label>
-        <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
-           style="text-decoration: none; color: black">
-            https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
-        </a>
-    </div>
-…
+<main class="main-container">
+    <div class="container flex flex-wrap mx-auto">
+        <div class="text-center d-flex align-items-center w-100" style="height: 100px;" id="component-1">
+            <h1 class="w-100">
+                <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
+                   style="text-decoration: none; color: black"> vits-simple-api </a>
+            </h1>
+        </div>
+
+        <div class="tabs w-100 border-b-2" id="component-2">
+            <button class="tab-button px-4 pb-2 pt-2 active " onclick="showContent(0)">VITS</button>
+            <button class="tab-button px-4 pb-2 pt-2" onclick="showContent(1)">W2V2-VITS</button>
+        </div>
+
+        <div class="content w-100 border-lr-2 border-b-2" id="component-3">
+            <div class="content-pane active w-100 flex-wrap">
+                <form class="w-100">
+                    <div class="form-group">
+                        <label>text</label>
+                        <textarea class="form-control" id="inputText1" rows="3"
+                                  oninput="updateLink()">你好,こんにちは</textarea>
+                    </div>
+                    <div class="form-group">
+                        <label>id</label>
+                        <select class="form-control" id="inputId1" oninput="updateLink()">
+                            {% for speaker in speakers["VITS"] %}
+                            {% if speaker["name"] == "雷电将军(雷神)" %}
                             <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
                                 | {{ speaker["lang"] }}</option>
                             {% else %}
                             <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
                                 | {{ speaker["lang"] }}</option>
                             {% endif %}
+                            {% endfor %}
+                        </select>
+                    </div>
+                </form>
+                <form class="w-100">
+                    <div class="row">
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="默认为wav">format</label>
+                            <select class="form-control" id="inputFormat1" oninput="updateLink()">
+                                <option></option>
+                                <option>wav</option>
+                                <option>mp3</option>
+                                <option>ogg</option>
+                                <option>silk</option>
+                            </select>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
+                            <input type="text" class="form-control" id="inputLang1" oninput="updateLink()" value=""
+                                   placeholder="auto"/>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
+                            <input type="number" class="form-control" id="inputLength1" oninput="updateLink()" value=""
+                                   placeholder="1" min="0" step="0.001"/>
+                        </div>
+                    </div>
+                    <div class="row">
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="样本噪声,控制合成的随机性。">noise</label>
+                            <input type="number" class="form-control" id="inputNoise1" oninput="updateLink()" value=""
+                                   placeholder="0.33" min="0" step="0.001"/>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
+                            <input type="number" class="form-control" id="inputNoisew1" oninput="updateLink()" value=""
+                                   placeholder="0.4" min="0" step="0.001"/>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
+                            <input type="number" class="form-control" id="inputMax1" oninput="updateLink()" value=""
+                                   placeholder="50" step="1"/>
+                        </div>
+                    </div>
+                </form>
+
+                <div class="flex flex-wrap w-100"
+                     style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
+                    <button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
+                            style="margin-right: 10px">
+                        播放器生成
+                    </button>
+                    <audio id="audioPlayer1" controls>
+                        <source src="" type="audio/mp3"/>
+                        Your browser does not support the audio element.
+                    </audio>
+                    <div class="form-group form-check">
+                        <input type="checkbox" id="streaming1" onchange="updateLink()">
+                        <label class="form-check-label" data-toggle="tooltip" data-placement="top"
+                               title="按照max分段推理文本,推理好一段即输出,无需等待所有文本都推理完毕">流式响应</label>
+                    </div>
                 </div>
+            </div>
+            <div class="content-pane">
+                <form class="w-100">
+                    <div class="form-group">
+                        <label>text</label>
+                        <textarea class="form-control" id="inputText2" rows="3"
+                                  oninput="updateLink()">你好,こんにちは</textarea>
+                    </div>
+                    <div class="form-group">
+                        <label>id</label>
+                        <select class="form-control" id="inputId2" oninput="updateLink()">
+                            {% for speaker in speakers["W2V2-VITS"] %}
+                            <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
+                                | {{ speaker["lang"] }}</option>
+                            {% endfor %}
+                        </select>
+                    </div>
+                    <div class="form-group mb-3">
+                        <label data-toggle="tooltip" data-placement="top"
+                               title="情感嵌入,{% if w2v2_emotion_count > 0 %}
+                               可输入范围是0-{{ w2v2_emotion_count-1 }}
+                               {% else %}
+                               未加载emotion
+                               {% endif %}">emotion</label>
+                        <input type="number" class="form-control" min="0" max="{{ w2v2_emotion_count-1 }}" step="1"
+                               id="emotion" value="0" oninput="updateLink()">
+                    </div>
+                </form>
+
+                <form class="w-100">
+                    <div class="row">
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="默认为wav">format</label>
+                            <select class="form-control" id="inputFormat2" oninput="updateLink()">
+                                <option></option>
+                                <option>wav</option>
+                                <option>mp3</option>
+                                <option>ogg</option>
+                                <option>silk</option>
+                            </select>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="自动识别语言auto:可识别的语言根据不同speaker而不同,方言无法自动识别。方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd">lang</label>
+                            <input type="text" class="form-control" id="inputLang2" oninput="updateLink()" value=""
+                                   placeholder="auto"/>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="调节语音长度,相当于调节语速,该数值越大语速越慢。">length</label>
+                            <input type="number" class="form-control" id="inputLength2" oninput="updateLink()" value=""
+                                   placeholder="1" min="0" step="0.001"/>
+                        </div>
+                    </div>
+                    <div class="row">
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="样本噪声,控制合成的随机性。">noise</label>
+                            <input type="number" class="form-control" id="inputNoise2" oninput="updateLink()" value=""
+                                   placeholder="0.33" min="0" step="0.001"/>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="随机时长预测器噪声,控制音素发音长度。">noisew</label>
+                            <input type="number" class="form-control" id="inputNoisew2" oninput="updateLink()" value=""
+                                   placeholder="0.4" min="0" step="0.001"/>
+                        </div>
+                        <div class="col-md-4 form-group">
+                            <label data-toggle="tooltip" data-placement="top"
+                                   title="按标点符号分段,加起来大于max时为一段文本。max<=0表示不分段。">max</label>
+                            <input type="number" class="form-control" id="inputMax2" oninput="updateLink()" value=""
+                                   placeholder="50" step="1"/>
+                        </div>
+                    </div>
+                </form>
+
+                <div class="flex flex-wrap w-100"
+                     style="justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
+                    <button type="button" class="btn btn-outline-secondary" onclick="setAudioSource()"
+                            style="margin-right: 10px">
+                        播放器生成
+                    </button>
+                    <audio id="audioPlayer2" controls>
+                        <source src="" type="audio/mp3"/>
+                        Your browser does not support the audio element.
+                    </audio>
+                    <div class="form-group form-check">
+                        <input type="checkbox" id="streaming2" onchange="updateLink()">
+                        <label class="form-check-label">流式响应</label>
+                    </div>
                 </div>
+            </div>
+        </div>
     </div>
 
+    <div class="mt-2">
+        {% if speakers_count == 0 %}
+        <div style="color: red;">未加载任何模型</div>
+        {% endif %}
+        <div>
+            <label>返回speakers(json):</label>
+            <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
+               style="text-decoration: none; color: black">
+                https://artrajz-vits-simple-api.hf.space/voice/speakers
+            </a>
+        </div>
+        <div>
+            <label>API调用:</label>
+            <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
+               style="text-decoration: none; color: black">
+                https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
+            </a>
+        </div>
+    </div>
-    <div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
-    <br/>
-    <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
+    <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
+    <h2>请严格遵循模型原作者使用协议!</h2>
     <p>
         Nene_Nanami_Rong_Tang:
         <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
@@ -164,6 +268,8 @@
         vits_chinese:
         <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
     </p>
+    </div>
+    <br/>
 
 </main>
 
@@ -171,6 +277,10 @@
 <script src="/static/js/bootstrap.bundle.min.js"></script>
 
 <script>
+    $(function () {
+        $('[data-toggle="tooltip"]').tooltip()
+    })
+
     function getProtocol() {
         return 'https:' == location.protocol ? "https://" : "http://";
     }
@@ -181,12 +291,21 @@
     }
 
     var baseUrl = getProtocol() + getUrl();
+    var modelType = 1;
+    var vitsStatus = false;
+    var w2v2Status = false;
+    {% if vits_speakers_count > 0 %}
+    vitsStatus = true;
+    {% endif %}
+    {% if w2v2_speakers_count > 0 %}
+    w2v2Status = true;
+    {% endif %}
 
     setBaseUrl();
 
     function setBaseUrl() {
-        var text = document.getElementById("inputText").value;
-        var id = document.getElementById("inputId").value;
+        var text = document.getElementById("inputText" + modelType).value;
+        var id = document.getElementById("inputId" + modelType).value;
 
         var vitsLink = document.getElementById("vitsLink");
         var speakersLink = document.getElementById("speakersLink");
@@ -202,17 +321,22 @@
     }
 
     function getLink() {
-        var text = document.getElementById("inputText").value;
-        var id = document.getElementById("inputId").value;
-        var format = document.getElementById("inputFormat").value;
-        var lang = document.getElementById("inputLang").value;
-        var length = document.getElementById("inputLength").value;
-        var noise = document.getElementById("inputNoise").value;
-        var noisew = document.getElementById("inputNoisew").value;
-        var max = document.getElementById("inputMax").value;
-        var streaming = document.getElementById('streaming');
+        var text = document.getElementById("inputText" + modelType).value;
+        var id = document.getElementById("inputId" + modelType).value;
+        var format = document.getElementById("inputFormat" + modelType).value;
+        var lang = document.getElementById("inputLang" + modelType).value;
+        var length = document.getElementById("inputLength" + modelType).value;
+        var noise = document.getElementById("inputNoise" + modelType).value;
+        var noisew = document.getElementById("inputNoisew" + modelType).value;
+        var max = document.getElementById("inputMax" + modelType).value;
+        var streaming = document.getElementById('streaming' + modelType);
 
-        …
+        if (modelType == 1) {
+            var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
+        } else if (modelType == 2) {
+            var emotion = document.getElementById('emotion').value;
+            var url = baseUrl + "/voice/w2v2-vits?text=" + text + "&id=" + id + "&emotion=" + emotion;
+        }
         if (format != "") {
             url += "&format=" + format;
         }
@@ -231,6 +355,7 @@
         if (max != "") {
            url += "&max=" + max;
         }
+
         if (streaming.checked) {
             url += '&streaming=true';
         }
@@ -245,16 +370,37 @@
     }
 
     function setAudioSource() {
+        if (modelType == 1 && !vitsStatus) {
+            alert("未加载VITS模型");
+            return;
+        }
+        if (modelType == 2 && !w2v2Status) {
+            alert("未加载W2V2-VITS模型");
+            return;
+        }
         var url = getLink();
-        var audioPlayer = document.getElementById("audioPlayer");
+        var audioPlayer = document.getElementById("audioPlayer" + modelType);
         audioPlayer.src = url;
         audioPlayer.play();
     }
 
-…
+    function showContent(index) {
+        const panes = document.querySelectorAll(".content-pane");
+        const buttons = document.querySelectorAll(".tab-button");
+        modelType = index + 1;
+
+        for (let i = 0; i < panes.length; i++) {
+            if (i === index) {
+                panes[i].classList.add("active");
+                buttons[i].classList.add("active");
+            } else {
+                panes[i].classList.remove("active");
+                buttons[i].classList.remove("active");
+            }
+        }
+        updateLink();
+    }
 </script>
 </body>
 </html>
text/cleaners.py
CHANGED
@@ -186,6 +186,21 @@ def cjke_cleaners2(text):
 
 
 def cje_cleaners(text):
+    from text.mandarin import chinese_to_lazy_ipa
+    from text.japanese import japanese_to_ipa
+    from text.english import english_to_ipa2
+    text = re.sub(r'\[ZH\](.*?)\[ZH\]', lambda x: chinese_to_lazy_ipa(x.group(1)).replace(
+        'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn') + ' ', text)
+    text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_ipa(x.group(1)).replace('ʧ', 'tʃ').replace(
+        'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz') + ' ', text)
+    text = re.sub(r'\[EN\](.*?)\[EN\]', lambda x: english_to_ipa2(x.group(1)).replace('ɑ', 'a').replace(
+        'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u') + ' ', text)
+    text = re.sub(r'\s+$', '', text)
+    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
+    return text
+
+
+def cje_cleaners2(text):
     from text.mandarin import chinese_to_ipa
     from text.japanese import japanese_to_ipa2
     from text.english import english_to_ipa2
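A hedged usage sketch for the new cleaner: text arrives wrapped in language tags, matching the README's lang=mix convention. The exact IPA output depends on the repo's text submodules, so none is asserted here.

    from text.cleaners import cje_cleaners

    # Mixed Chinese/Japanese/English input, tagged per segment.
    mixed = "[ZH]你好[ZH][JA]こんにちは[JA][EN]hello[EN]"
    ipa = cje_cleaners(mixed)
    # Trailing whitespace is stripped and a '.' is appended
    # when the result does not already end in punctuation.
    print(ipa)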
text/mandarin.py
CHANGED
@@ -1,5 +1,4 @@
-import …
-import sys
+import config
 import re
 from pypinyin import lazy_pinyin, BOPOMOFO
 import jieba
@@ -7,7 +6,7 @@ import cn2an
 import logging
 
 logging.getLogger('jieba').setLevel(logging.WARNING)
-jieba.set_dictionary(…)
+jieba.set_dictionary(config.ABS_PATH + '/jieba/dict.txt')
 jieba.initialize()
 
 # List of (Latin alphabet, bopomofo) pairs:
utils/merge.py
CHANGED
@@ -19,12 +19,13 @@ lang_dict = {
     "cjke_cleaners": ["zh", "ja", "ko", "en"],
     "cjke_cleaners2": ["zh", "ja", "ko", "en"],
     "cje_cleaners": ["zh", "ja", "en"],
+    "cje_cleaners2": ["zh", "ja", "en"],
     "thai_cleaners": ["th"],
     "shanghainese_cleaners": ["sh"],
     "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                  "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                  "YB"],
-    "bert_chinese_cleaners":["zh"],
+    "bert_chinese_cleaners": ["zh"],
 }
 
@@ -109,11 +110,16 @@ def merge_model(merging_model):
     for obj_id, i in enumerate(vits_list):
         obj = vits(model=i[0], config=i[1], model_type="vits")
         lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
-        …
+        if isinstance(obj.get_speakers(), list):
+            for id, name in enumerate(obj.get_speakers()):
+                vits_obj.append([int(id), obj, obj_id])
+                vits_speakers.append({"id": new_id, "name": name, "lang": lang})
+                new_id += 1
+        else:
+            for id, (name, _) in enumerate(obj.get_speakers().items()):
+                vits_obj.append([int(id), obj, obj_id])
+                vits_speakers.append({"id": new_id, "name": name, "lang": lang})
+                new_id += 1
 
     # merge hubert-vits
     if len(hubert_vits_list) != 0:
@@ -136,6 +142,7 @@ def merge_model(merging_model):
             new_id += 1
 
     # merge w2v2-vits
+    emotion_reference = None
     if len(w2v2_vits_list) != 0:
         if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) == None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
             raise ValueError(f"Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
@@ -156,7 +163,8 @@ def merge_model(merging_model):
 
     voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj}
     voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers}
-    …
+    w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0
+
+    tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count)
 
     return tts
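The new isinstance branch exists because model configs expose speakers in two shapes. A small illustration with made-up speaker names; the shapes, not the names, are what merge_model relies on:

    # Some configs store speakers as a plain list of names...
    speakers_as_list = ["speaker_a", "speaker_b"]
    for sid, name in enumerate(speakers_as_list):
        print(sid, name)

    # ...others as a mapping of name -> value, which is why the code
    # iterates .items() and discards the value.
    speakers_as_dict = {"speaker_a": 0, "speaker_b": 1}
    for sid, (name, _) in enumerate(speakers_as_dict.items()):
        print(sid, name)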
utils/nlp.py
CHANGED
@@ -1,13 +1,7 @@
 import regex as re
-import logging
 import config
 from .utils import check_is_none
-
-logger = logging.getLogger("vits-simple-api")
-level = getattr(config, "LOGGING_LEVEL", "DEBUG")
-level_dict = {'DEBUG': logging.DEBUG, 'INFO': logging.INFO, 'WARNING': logging.WARNING, 'ERROR': logging.ERROR,
-              'CRITICAL': logging.CRITICAL}
-logger.setLevel(level_dict[level])
+from logger import logger
 
 
 def clasify_lang(text, speaker_lang):
vits-simple-api-installer-latest.sh
CHANGED
@@ -12,7 +12,32 @@ if [ ! -f config.py ]; then
     wget -O $INSTALL_DIR/config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/config.py
 fi
 
-…
+if [ ! -f gunicorn_config.py ]; then
+    echo -e "${YELLOW}download gunicorn_config.py\n${PLAIN}"
+    wget -O $INSTALL_DIR/gunicorn_config.py https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/gunicorn_config.py
+fi
+
+while true; do
+    echo -e "${GREEN}Which version of docker-compose.yaml do you want to download?"
+    echo -e "1. docker-compose.yaml (CPU version)"
+    echo -e "2. docker-compose-gpu.yaml (GPU version)"
+    read -p "Enter your choice (1 or 2): " choice
+    case $choice in
+        1)
+            echo -e "${YELLOW}Downloading docker-compose.yaml (CPU version)\n${PLAIN}"
+            wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose.yaml
+            break
+            ;;
+        2)
+            echo -e "${YELLOW}Downloading docker-compose-gpu.yaml (GPU version)\n${PLAIN}"
+            wget -O $INSTALL_DIR/docker-compose.yaml https://raw.githubusercontent.com/Artrajz/vits-simple-api/main/docker-compose-gpu.yaml
+            break
+            ;;
+        *)
+            echo -e "${RED}Invalid choice. Please enter 1 or 2.${PLAIN}"
+            ;;
+    esac
+done
 
 echo -e "${YELLOW}Pulling the image might take a while, so why not grab a cup of java first?\n${PLAIN}"
voice.py
CHANGED
@@ -6,7 +6,6 @@ import numpy as np
 import torch
 import xml.etree.ElementTree as ET
 import config
-import logging
 import soundfile as sf
 from torch import no_grad, LongTensor, inference_mode, FloatTensor
 from io import BytesIO
@@ -16,6 +15,7 @@ from mel_processing import spectrogram_torch
 from text import text_to_sequence
 from models import SynthesizerTrn
 from utils import utils
+from logger import logger
 
 # torch.set_num_threads(1)  # set torch threads to 1
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -251,7 +251,7 @@ class vits:
 
 
 class TTS:
-    def __init__(self, voice_obj, voice_speakers):
+    def __init__(self, voice_obj, voice_speakers, w2v2_emotion_count=0):
         self._voice_obj = voice_obj
         self._voice_speakers = voice_speakers
         self._strength_dict = {"x-weak": 0.25, "weak": 0.5, "Medium": 0.75, "Strong": 1, "x-strong": 1.25}
@@ -259,10 +259,11 @@ class TTS:
         self._vits_speakers_count = len(self._voice_speakers["VITS"])
         self._hubert_speakers_count = len(self._voice_speakers["HUBERT-VITS"])
         self._w2v2_speakers_count = len(self._voice_speakers["W2V2-VITS"])
+        self._w2v2_emotion_count = w2v2_emotion_count
         self.dem = None
 
         # Initialization information
-        self.logger = …
+        self.logger = logger
         self.logger.info(f"torch:{torch.__version__} cuda_available:{torch.cuda.is_available()}")
         self.logger.info(f'device:{device} device.type:{device.type}')
 
@@ -420,9 +421,7 @@ class TTS:
 
         return voice_tasks, format
 
-    def create_ssml_infer_task(self, …):
-        voice_tasks, format = self.parse_ssml(ssml)
-
+    def create_ssml_infer_task(self, voice_tasks, format, fname):
         audios = []
         for voice in voice_tasks:
             if voice.get("break"):
@@ -438,10 +437,10 @@ class TTS:
 
         audio = np.concatenate(audios, axis=0)
         encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
-        if config…
+        if getattr(config, "SAVE_AUDIO", False):
             path = f"{config.CACHE_PATH}/{fname}"
             utils.save_audio(encoded_audio.getvalue(), path)
-        return encoded_audio
+        return encoded_audio
 
     def vits_infer(self, voice, fname):
         format = voice.get("format", "wav")
@@ -450,7 +449,7 @@ class TTS:
         sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice, auto_break=True)
         encoded_audio = self.encode(sampling_rate, audio, format)
-        if config…
+        if getattr(config, "SAVE_AUDIO", False):
             path = f"{config.CACHE_PATH}/{fname}"
             utils.save_audio(encoded_audio.getvalue(), path)
         return encoded_audio
@@ -466,9 +465,9 @@ class TTS:
             encoded_audio = self.encode(sampling_rate, chunk, format)
             for encoded_audio_chunk in self.generate_audio_chunks(encoded_audio):
                 yield encoded_audio_chunk
-            if config…
-                audio.write(encoded_audio.getvalue())
-        if config…
+            if getattr(config, "SAVE_AUDIO", False):
+                audio.write(encoded_audio.getvalue())
+        if getattr(config, "SAVE_AUDIO", False):
             path = f"{config.CACHE_PATH}/{fname}"
             utils.save_audio(audio.getvalue(), path)
 
@@ -479,7 +478,7 @@ class TTS:
         sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice)
         encoded_audio = self.encode(sampling_rate, audio, format)
-        if config…
+        if getattr(config, "SAVE_AUDIO", False):
             path = f"{config.CACHE_PATH}/{fname}"
             utils.save_audio(encoded_audio.getvalue(), path)
         return encoded_audio
@@ -491,7 +490,7 @@ class TTS:
         sampling_rate = voice_obj.hps_ms.data.sampling_rate
         audio = voice_obj.get_audio(voice, auto_break=True)
         encoded_audio = self.encode(sampling_rate, audio, format)
-        if config…
+        if getattr(config, "SAVE_AUDIO", False):
             path = f"{config.CACHE_PATH}/{fname}"
             utils.save_audio(encoded_audio.getvalue(), path)
         return encoded_audio
@@ -515,7 +514,7 @@ class TTS:
 
         audio = voice_obj.voice_conversion(voice)
         encoded_audio = self.encode(sampling_rate, audio, format)
-        if config…
+        if getattr(config, "SAVE_AUDIO", False):
             path = f"{config.CACHE_PATH}/{fname}"
             utils.save_audio(encoded_audio.getvalue(), path)
         return encoded_audio