Spaces:
Running
Running
nekoaoxiang
committed on
Commit
•
d290960
1
Parent(s):
94080f3
Add api support
Browse files- Synthesizers/base/Base_TTS_Synthesizer.py +115 -0
- Synthesizers/base/Base_TTS_Task.py +207 -0
- Synthesizers/base/__init__.py +3 -0
- Synthesizers/base/config_utils.py +40 -0
- Synthesizers/gsv_fast/GSV_Synthesizer.py +340 -0
- Synthesizers/gsv_fast/__init__.py +2 -0
- Synthesizers/gsv_fast/configs/i18n/locale/en_US.json +99 -0
- Synthesizers/gsv_fast/configs/i18n/locale/zh_CN.json +99 -0
- Synthesizers/gsv_fast/configs/i18n/locale/zh_TW.json +99 -0
- Synthesizers/gsv_fast/configs/params_config.json +201 -0
- Synthesizers/gsv_fast/configs/ui_config.json +13 -0
- Synthesizers/gsv_fast/gsv_config.py +141 -0
- Synthesizers/gsv_fast/gsv_task.py +70 -0
- Synthesizers/gsv_fast/ssml_dealer.py +216 -0
- Synthesizers/remote/Remote_Synthesizer.py +100 -0
- Synthesizers/remote/__init__.py +2 -0
- Synthesizers/remote/configs/config.json +6 -0
- Synthesizers/remote/configs/i18n/locale/en_US.json +99 -0
- Synthesizers/remote/configs/i18n/locale/zh_CN.json +99 -0
- Synthesizers/remote/configs/i18n/locale/zh_TW.json +99 -0
- Synthesizers/remote/configs/params_config.json +3 -0
- Synthesizers/remote/configs/ui_config.json +3 -0
- Synthesizers/remote/remote_task.py +82 -0
- app.py +33 -2
- config.json +1 -0
- gsv_config.json +10 -0
- pure_api.py +110 -0
- src/config_manager.py +1 -0
Synthesizers/base/Base_TTS_Synthesizer.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
from .Base_TTS_Task import Base_TTS_Task as TTS_Task
|
4 |
+
import json
|
5 |
+
from typing import List, Dict, Literal, Optional, Any, Union, Generator, Tuple
|
6 |
+
from pydantic import BaseModel, Field, model_validator
|
7 |
+
import numpy as np
|
8 |
+
from abc import ABC, abstractmethod
|
9 |
+
from typing import Dict, List, Union, Generator, Tuple
|
10 |
+
from typing_extensions import Literal
|
11 |
+
import numpy as np
|
12 |
+
import wave,io
|
13 |
+
|
14 |
+
class Base_TTS_Synthesizer(ABC):
    """
    Common interface that every Text-To-Speech (TTS) synthesizer backend implements.

    Attributes:
        ui_config (Dict[str, List]): UI configuration settings.
        debug_mode (bool): Toggles additional logging and debugging output.
    """

    ui_config: Dict[str, List] = {}
    debug_mode: bool = False

    def __init__(self, **kwargs):
        """
        Store the optional settings passed as keyword arguments.

        Args:
            ui_config (Dict[str, List], optional): UI settings; an empty dict when omitted.
            debug_mode (bool, optional): Debug flag; False when omitted.
        """
        debug_mode = kwargs.get("debug_mode", False)
        ui_config = kwargs.get("ui_config", {})
        self.ui_config = ui_config
        self.debug_mode = debug_mode

    @abstractmethod
    def generate(
        self,
        task: TTS_Task,
        return_type: Literal["filepath", "numpy"] = "numpy",
        save_path: Optional[str] = None,
    ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
        """
        Synthesize speech for the given task.

        Args:
            task (TTS_Task): Data and parameters describing what to synthesize.
            return_type (Literal["filepath", "numpy"], optional): Whether to return a
                file path or audio data.
            save_path (str, optional): Where to write the audio file.

        Returns:
            Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]: A file path,
            a generator of audio data, or another type, depending on return_type.
        """

    @abstractmethod
    def get_characters(self):
        """
        List the characters this synthesizer offers, with their emotions.

        Returns:
            Dict[str, List[str]]: Character name mapped to the list of its emotions.
        """

    @abstractmethod
    def params_parser(self, data):
        """
        Build a TTS_Task from raw input data.

        Args:
            data (Any): The raw input data to parse.

        Returns:
            TTS_Task: The task created from the input data.
        """

    @abstractmethod
    def ms_like_parser(self, data):
        """
        Build a TTS_Task from input data given in a Microsoft-like format.

        Args:
            data (Any): The raw input data to parse.

        Returns:
            TTS_Task: The task created from the Microsoft-like formatted input.
        """
|
94 |
+
|
95 |
+
|
96 |
+
def get_wave_header_chunk(sample_rate: int, channels: int = 1, sample_width: int = 2):
    """
    Build the header of a WAV stream that carries no audio frames.

    Args:
        sample_rate (int): The sample rate of the audio.
        channels (int, optional): The number of audio channels. Defaults to 1.
        sample_width (int, optional): The sample width in bytes. Defaults to 2.

    Returns:
        bytes: The wave header as bytes.
    """
    header_buffer = io.BytesIO()
    # The wave writer emits the header when the context closes, even with zero frames.
    with wave.open(header_buffer, "wb") as writer:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(sample_rate)

    return header_buffer.getvalue()
|
Synthesizers/base/Base_TTS_Task.py
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, json, sys
|
2 |
+
|
3 |
+
from uuid import uuid4
|
4 |
+
from typing import Literal
|
5 |
+
import urllib.parse
|
6 |
+
import hashlib
|
7 |
+
|
8 |
+
from pydantic import BaseModel, Field, model_validator
|
9 |
+
from typing import Literal, List, Optional, Dict, Any, Union
|
10 |
+
from uuid import uuid4
|
11 |
+
import hashlib
|
12 |
+
|
13 |
+
def convert_value_type(value: Any, type_: str):
    """
    Coerce ``value`` to the type named by ``type_``.

    String inputs are URL-unquoted before any conversion. A value whose Python
    type name already equals ``type_`` is returned unchanged (after unquoting).

    Args:
        value (Any): The value to convert. ``None`` is passed through untouched.
        type_ (str): Target type name: ``"int"``, ``"float"``, ``"bool"``;
            anything else is treated as ``"str"``.

    Returns:
        The converted value, or ``None`` when ``value`` is ``None``.

    Raises:
        ValueError: If ``value`` cannot be parsed as the requested number
            (including an empty string).
    """
    if value is None:
        return None
    if isinstance(value, str):
        value = urllib.parse.unquote(value)
    if type(value).__name__ == type_:
        # Already the requested type: return it as-is.
        return value
    if type_ == "int":
        return int(value)
    if type_ == "float":
        # endswith() rather than value[-1]: an empty string now falls through to
        # float(""), which raises ValueError instead of an unrelated IndexError.
        if isinstance(value, str) and value.endswith("%"):
            # Percent notation, e.g. "50%" -> 0.5.
            return float(value[:-1]) / 100
        return float(value)
    if type_ == "bool":
        # Real bools were already returned by the type-name check above.
        return str(value).lower() in ("true", "1", "t", "y", "yes", "allow", "allowed")
    # Any other type name falls back to a string.
    return str(value)
|
34 |
+
|
35 |
+
|
36 |
+
class ParamItem(BaseModel):
    """
    Description of a single parameter accepted by a TTS task.

    Attributes:
        type (str): Name of the parameter's data type; "str" is used when the
            supplied type is missing or falsy.
        component_type (Optional[str]): Component type associated with the parameter.
        default (Any): Default value, converted to ``type`` at construction.
        alias (List[str]): Alternative names for the parameter.
        label (Optional[str]): Label for the parameter.
        name (Optional[str]): Name of the parameter.
        description (Optional[str]): Description of the parameter.
        min_value (Optional[float]): Minimum value of the parameter.
        max_value (Optional[float]): Maximum value of the parameter.
        step (Optional[float]): Step value for the parameter.
        choices (Optional[List[str]]): Allowed choices for the parameter.
    """

    type: str
    component_type: Optional[str] = None
    default: Any
    alias: List[str]
    label: Optional[str] = None
    name: Optional[str]
    description: Optional[str] = None
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    step: Optional[float] = None
    choices: Optional[List[str]] = None

    def __init__(self, **data):
        # A missing or falsy type means the parameter is a plain string.
        if not data.get("type"):
            data["type"] = "str"
        super().__init__(**data)
        # Normalise the default so it has the declared type.
        converted_default = convert_value_type(self.default, self.type)
        self.default = converted_default
|
69 |
+
|
70 |
+
|
71 |
+
def init_params_config(res: dict):
    """
    Turn a raw parameter-configuration mapping into ParamItem objects.

    Each entry of ``res`` is updated in place: its "name" is set to the entry's
    key, and "label" is set to the key as well when it is missing or None.

    Args:
        res (dict): Mapping of parameter name to its raw settings dict.

    Returns:
        dict: Mapping of parameter name to the corresponding ParamItem.
    """
    params = {}
    for param_name in res:
        settings = res[param_name]
        if settings.get("label") is None:
            settings["label"] = param_name
        settings["name"] = param_name
        params[param_name] = ParamItem(**settings)
    return params
|
80 |
+
|
81 |
+
|
82 |
+
class Base_TTS_Task(BaseModel):
    """
    Base class for TTS (Text-to-Speech) tasks.

    Attributes:
        uuid (Optional[str]): Unique identifier for the task, assigned on construction.
        params_config (Dict[str, ParamItem]): Configuration parameters for the task.

        task_type (Literal["text", "ssml", "audio"]): Type of the task.
        audio_path (Optional[str]): Path to the audio file.
        src (Optional[str]): Source url.
        ssml (Optional[str]): SSML (Speech Synthesis Markup Language) text.

        text (Optional[str]): Text content.

        character (Optional[str]): Character to use.
        emotion (Optional[str]): Emotion to use.

        format (Optional[str]): Audio format.
        sample_rate (Optional[int]): Sample rate of the audio.
        loudness (Optional[float]): Loudness of the audio.
        speed (Optional[float]): Speed of the audio.
        stream (Optional[bool]): Flag indicating if the audio should be streamed.

        save_temp (Optional[bool]): Flag indicating if the temporary files should be saved.

        disabled_features (Optional[List[str]]): List of disabled features.
    """

    uuid: Optional[str] = None
    params_config: Dict[str, ParamItem]

    task_type: Literal["text", "ssml", "audio"] = Field(default="text")
    audio_path: Optional[str] = None
    src: Optional[str] = None
    ssml: Optional[str] = None

    text: Optional[str] = None

    character: Optional[str] = None
    emotion: Optional[str] = None

    format: Optional[str] = None
    sample_rate: Optional[int] = None
    loudness: Optional[float] = None
    speed: Optional[float] = None
    stream: Optional[bool] = None

    save_temp: Optional[bool] = False

    disabled_features: Optional[List[str]] = None

    class Config:
        populate_by_name = True
        extra = "ignore"

    def __init__(self, other_task: Union[BaseModel, dict, None] = None, **data):
        """
        Create a task from another model, from a dict, or from keyword arguments.

        Args:
            other_task: A pydantic model whose dumped fields are copied, or a dict
                used in place of ``data``. When None, ``data`` is used.
            **data: Field values and/or parameter aliases; must contain
                "params_config" unless ``other_task`` is a model.
        """
        if isinstance(other_task, BaseModel):
            # Copy every field from the other model instance.
            data = other_task.model_dump()
            super().__init__(**data)
        else:
            # A dict argument replaces the keyword data entirely.
            if isinstance(other_task, dict):
                data = other_task
            assert "params_config" in data, "params_config is not defined."
            super().__init__(params_config=data.get("params_config"))
            self.set_default_values()
            self.set_values(**data)
        # NOTE(review): the source this was recovered from had lost its
        # indentation; the id is assigned for both branches so that every task
        # object gets a fresh identifier - confirm against the original file.
        self.uuid = str(uuid4())

    def update_value(self, key: str, value: Any, allow_none: bool = False):
        """
        Set every configured parameter that lists ``key`` among its aliases.

        Args:
            key (str): Alias supplied by the caller.
            value (Any): Raw value; converted to the parameter's declared type.
            allow_none (bool): When False (default) a None value is ignored.
        """
        if not allow_none and value is None:
            return

        assert self.params_config is not None, "params_config is not defined."
        for param_key, param_value in self.params_config.items():
            # Parameters without a matching attribute on the task are skipped silently.
            if key in param_value.alias and hasattr(self, param_key):
                value = convert_value_type(value, param_value.type)
                setattr(self, param_key, value)

    def set_values(self, **data):
        """
        Assign the given values to the task.

        Keys naming an existing attribute are set directly; other keys are
        treated as parameter aliases and routed through ``update_value``.
        """
        assert self.params_config is not None, "params_config is not defined."
        for key, value in data.items():
            if not hasattr(self, key):
                self.update_value(key, value)
                continue
            current = getattr(self, key)
            if current is not None:
                target_type = type(current).__name__
            else:
                # The attribute has no value yet, so its runtime type ("NoneType")
                # says nothing about the intended type and would make every value
                # a string. Prefer the type declared in params_config, else keep
                # the incoming value's own type.
                declared = getattr(self.params_config.get(key), "type", None)
                target_type = declared or type(value).__name__
            setattr(self, key, convert_value_type(value, target_type))

    def set_default_values(self):
        """Fill still-unset attributes with the defaults declared in params_config."""
        assert self.params_config is not None, "params_config is not defined."
        for key, value in self.params_config.items():
            if (
                hasattr(self, key)
                and getattr(self, key) is None
                and value.default is not None
            ):
                setattr(self, key, value.default)

    @property
    def md5(self):
        """
        Hex MD5 digest of the task's content (text, ssml or src, by task_type).

        Missing content hashes as the empty string instead of raising.
        """
        m = hashlib.md5()
        content = {
            "text": self.text,
            "ssml": self.ssml,
            "audio": self.src,
        }.get(self.task_type)
        m.update((content or "").encode())
        return m.hexdigest()

    def __str__(self):
        """Return the task as indented JSON, without params_config and None fields."""
        dict_content: dict = self.model_dump(exclude={"params_config"})
        dict_content = {
            key: value for key, value in dict_content.items() if value is not None
        }
        return json.dumps(dict_content, indent=4, ensure_ascii=False)

    def copy(self, update: Optional[Dict[str, Any]] = None, deep: bool = False):
        """
        Return a copy of the task that carries a new uuid.

        Args:
            update (Optional[Dict[str, Any]]): Field overrides for the copy. The
                mapping is not modified.
            deep (bool): Whether to make a deep copy.
        """
        # Work on a private dict: the previous shared ``{}`` default was mutated
        # on every call, and a caller-supplied dict was mutated as well.
        overrides = dict(update) if update else {}
        overrides["uuid"] = str(uuid4())
        return super().model_copy(update=overrides, deep=deep)
|
Synthesizers/base/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
from .Base_TTS_Task import Base_TTS_Task, ParamItem, init_params_config
|
2 |
+
from .Base_TTS_Synthesizer import Base_TTS_Synthesizer, get_wave_header_chunk
|
3 |
+
from .config_utils import load_config
|
Synthesizers/base/config_utils.py
ADDED
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Optional, Dict, List, Literal
|
2 |
+
from pydantic import BaseModel
|
3 |
+
import os, json
|
4 |
+
|
5 |
+
class ConfigItem(BaseModel):
    """
    One entry of a configuration file.

    Attributes:
        value: The configured value; falls back to ``default`` when not given.
        default: The default value.
        type: Name of the value's type.
        description: Human-readable description of the entry.
    """

    value: Optional[Any] = None
    default: Optional[Any] = None
    type: Optional[str] = None
    description: Optional[str] = None

    def __init__(self, **data):
        super().__init__(**data)
        # Nothing to do when a value was given or there is no default to fall back on.
        if self.value is not None or self.default is None:
            return
        self.value = self.default
|
15 |
+
|
16 |
+
def is_config_item(item: Dict[str, Any]) -> bool:
    """Tell whether ``item`` is a config item: a dict with a "value" or "default" key."""
    if not isinstance(item, dict):
        return False
    return any(marker in item for marker in ("value", "default"))
|
19 |
+
|
20 |
+
def parse_config_dict(input_config: Dict[str, Any], output_config) -> Dict[str, Any]:
    """
    Resolve a raw configuration mapping into plain values.

    Config items (see ``is_config_item``) are replaced by their effective value,
    other dicts are resolved recursively, and anything else is copied as-is.

    Args:
        input_config: The raw configuration mapping.
        output_config: The mapping that receives the resolved entries.

    Returns:
        ``output_config``, after it has been filled.
    """
    for name in input_config:
        entry = input_config[name]
        if is_config_item(entry):
            output_config[name] = ConfigItem(**entry).value
        elif isinstance(entry, dict):
            output_config[name] = parse_config_dict(entry, {})
        else:
            output_config[name] = entry
    return output_config
|
32 |
+
|
33 |
+
def load_config(config_path: str) -> Dict[str, Any]:
    """
    Load a JSON configuration file and resolve it with ``parse_config_dict``.

    Args:
        config_path: Path of the JSON file; it must exist.

    Returns:
        The resolved configuration mapping.
    """
    assert os.path.exists(config_path), f"配置文件不存在: {config_path}"
    with open(config_path, 'r', encoding='utf-8') as config_file:
        raw_config = json.load(config_file)
    return parse_config_dict(raw_config, {})
|
40 |
+
|
Synthesizers/gsv_fast/GSV_Synthesizer.py
ADDED
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io, wave
|
2 |
+
import os, json, sys
|
3 |
+
import threading
|
4 |
+
from typing import Any, Union, Generator, Literal, List, Dict, Tuple
|
5 |
+
from Synthesizers.base import Base_TTS_Synthesizer, load_config
|
6 |
+
|
7 |
+
from .gsv_task import GSV_TTS_Task as TTS_Task
|
8 |
+
from .ssml_dealer import SSML_Dealer
|
9 |
+
|
10 |
+
from time import time as tt
|
11 |
+
import numpy as np
|
12 |
+
import hashlib
|
13 |
+
import soundfile as sf
|
14 |
+
|
15 |
+
from .gsv_config import load_infer_config, auto_generate_infer_config, get_device_info
|
16 |
+
from datetime import datetime
|
17 |
+
|
18 |
+
# Maps a language selector (display name or code) to the language code used for inference.
dict_language = {
    "中文": "all_zh",  # treat everything as Chinese
    "英文": "en",  # treat everything as English
    "日文": "all_ja",  # treat everything as Japanese
    "中英混合": "zh",  # mixed Chinese and English
    "日英混合": "ja",  # mixed Japanese and English
    "多语种混合": "auto",  # multilingual: split the text and detect each language
    # The language codes themselves are accepted too and map to themselves.
    **{code: code for code in ("auto", "zh", "en", "ja", "all_zh", "all_ja")},
}
|
32 |
+
|
33 |
+
from Adapters.gsv_fast.TTS_infer_pack.TTS import TTS, TTS_Config
|
34 |
+
class GSV_Synthesizer(Base_TTS_Synthesizer):
    """
    Synthesizer backed by the GPT-SoVITS fast-inference pipeline (``TTS``).

    Character models are looked up as sub-folders of ``models_path``. Settings
    are read from a JSON config file and may be overridden by keyword arguments.
    """

    device: str = "auto"
    is_half: bool = False
    models_path: str = "models/gsv"
    cnhubert_base_path: str = "models/pretrained_models/gsv/chinese-hubert-base"
    bert_base_path: str = "models/pretrained_models/gsv/chinese-roberta-wwm-ext-large"
    save_prompt_cache: bool = True
    prompt_cache_dir: str = "cache/prompt_cache"
    default_character: str = None

    ui_config: dict = None
    tts_pipline: TTS = None
    character: str = None
    lock: threading.Lock = None

    def __init__(self, config_path: str = None, **kwargs):
        """
        Load the configuration, build the TTS pipeline and load the default character.

        Args:
            config_path (str, optional): JSON config file; "gsv_config.json" when None.
            **kwargs: Overrides applied on top of the config file. Only keys that
                match an existing attribute are used.
        """
        super().__init__()

        if config_path is None:
            config_path = "gsv_config.json"
        config_dict = load_config(config_path)
        config_dict.update(kwargs)
        for key, value in config_dict.items():
            if hasattr(self, key):
                setattr(self, key, value)
        if self.debug_mode:
            print(f"GSV_Synthesizer config: {config_dict}")

        self.device, self.is_half = get_device_info(self.device, self.is_half)
        tts_config = TTS_Config("")
        tts_config.device, tts_config.is_half = self.device, self.is_half
        tts_config.cnhubert_base_path = self.cnhubert_base_path
        tts_config.bert_base_path = self.bert_base_path
        self.tts_pipline = TTS(tts_config)

        if self.default_character is None:
            # Fall back to the first character found (None if there is none).
            self.default_character = next(iter(self.get_characters()), None)

        self.lock = threading.Lock()
        self.load_character(self.default_character)
        ui_config_path = os.path.join("Synthesizers/gsv_fast/configs", "ui_config.json")
        with open(ui_config_path, 'r', encoding='utf-8') as f:
            self.ui_config = json.load(f)

    # from https://github.com/RVC-Boss/GPT-SoVITS/pull/448
    def get_streaming_tts_wav(self, params):
        """
        Yield a WAV header followed by the raw bytes of each synthesized chunk.

        Args:
            params: Parameter dict passed to the pipeline's ``run``.
        """
        # from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
        def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
            wav_buf = io.BytesIO()
            with wave.open(wav_buf, "wb") as vfout:
                vfout.setnchannels(channels)
                vfout.setsampwidth(sample_width)
                vfout.setframerate(sample_rate)
                vfout.writeframes(frame_input)

            wav_buf.seek(0)
            return wav_buf.read()

        chunks = self.tts_pipline.run(params)
        # NOTE(review): the header is sent before any chunk is produced, so it
        # always advertises the helper's default 32000 Hz - confirm this matches
        # the pipeline's output rate.
        yield wave_header_chunk()
        # Each chunk is a (sample_rate, audio_data) tuple.
        for chunk in chunks:
            sample_rate, audio_data = chunk
            if audio_data is not None:
                yield audio_data.tobytes()

    def get_characters(self) -> dict:
        """
        Scan ``models_path`` and return the characters with their emotions.

        Returns:
            dict: Folder name mapped to its emotion names. ["default"] is used
            when the folder has no "infer_config.json", the file has no
            "emotion_list", or the file is not valid JSON.
        """
        characters_and_emotions = {}

        for character_subdir in os.listdir(self.models_path):
            subdir_path = os.path.join(self.models_path, character_subdir)
            if not os.path.isdir(subdir_path):
                continue
            emotion_list = ["default"]
            config_path = os.path.join(subdir_path, "infer_config.json")
            if os.path.exists(config_path):
                try:
                    with open(config_path, "r", encoding='utf-8') as f:
                        config = json.load(f)
                    emotion_dict_list = config.get('emotion_list', None)
                    if emotion_dict_list is not None:
                        emotion_list = list(emotion_dict_list.keys())
                except json.JSONDecodeError:
                    # Unparseable config: keep the default emotion.
                    emotion_list = ["default"]

            characters_and_emotions[character_subdir] = emotion_list
        return characters_and_emotions

    def load_character_id(self, speaker_id):
        """Load the character at index ``speaker_id`` in the ``get_characters`` listing."""
        character = list(self.get_characters())[speaker_id]
        return self.load_character(character)

    @staticmethod
    def _resolve_weight_paths(character_path):
        """Return (gpt_path, sovits_path) read from the character's infer config."""
        config = load_infer_config(character_path)
        # os.path.join raises TypeError when a key is missing (None), which the
        # caller treats like any other failure to read the config.
        gpt_path = os.path.join(character_path, config.get("gpt_path"))
        sovits_path = os.path.join(character_path, config.get("sovits_path"))
        return gpt_path, sovits_path

    def load_character(self, character):
        """
        Switch the pipeline to the given character's model weights.

        An empty ``character`` keeps the current character, or loads the default
        one when nothing is loaded yet. An unknown folder keeps the current
        character. Loading the already-active character is a no-op.

        Args:
            character: Name of the character folder under ``models_path``.

        Raises:
            ValueError: If no character is given, none is loaded and there is
                no default character to fall back to.
            Exception: If the weight paths cannot be determined, even after
                auto-generating the infer config.
        """
        if character in ["", None]:
            if self.character not in ["", None]:
                return
            if self.default_character in ["", None]:
                # Previously this recursed without end (RecursionError).
                raise ValueError(
                    "No character was given and no default character is available; "
                    f"put at least one model folder under {self.models_path!r}."
                )
            character = self.default_character
            print(f"{character}为空,尝试切换到默认角色{self.default_character}")
            return self.load_character(character)
        if str(character).lower() == str(self.character).lower():
            return
        character_path = os.path.join(self.models_path, character)
        if not os.path.exists(character_path):
            print(f"找不到角色文件夹: {character},沿用之前的角色{self.character}")
            return
        try:
            gpt_path, sovits_path = self._resolve_weight_paths(character_path)
        except Exception:
            try:
                # No usable config: generate one, then read it once more
                # (instead of recursing into load_character).
                auto_generate_infer_config(character_path)
                gpt_path, sovits_path = self._resolve_weight_paths(character_path)
            except Exception as err:
                raise Exception("找不到模型文件!请把有效模型放置在模型文件夹下,确保其中至少有pth、ckpt和wav三种文件。") from err

        t0 = tt()
        self.tts_pipline.init_t2s_weights(gpt_path)
        self.tts_pipline.init_vits_weights(sovits_path)
        t1 = tt()
        # Record the character only once its weights are loaded, so a failed
        # load is not later mistaken for "already loaded".
        self.character = character
        print(f"加载角色成功: {character}, 耗时: {t1-t0:.2f}s")

    def generate_from_text(self, task: TTS_Task):
        """
        Synthesize a plain-text task.

        Loads the task's character and, when the task has no existing reference
        audio, takes the reference audio and prompt from the character's config.
        """
        self.load_character(task.character)
        task.character = self.character
        if task.ref_audio_path is None or not os.path.exists(task.ref_audio_path):
            task.ref_audio_path, task.prompt_text, task.prompt_language = self.get_ref_infos(self.character, task.emotion)

        return self.get_wav_from_text_api(
            text=task.text,
            text_language=task.text_language,
            ref_audio_path=task.ref_audio_path,
            prompt_text=task.prompt_text,
            prompt_language=task.prompt_language,
            batch_size=task.batch_size,
            speed=task.speed,
            top_k=task.top_k,
            top_p=task.top_p,
            temperature=task.temperature,
            cut_method=task.cut_method,
            max_cut_length=task.max_cut_length,
            seed=task.seed,
            parallel_infer=task.parallel_infer,
            repetition_penalty=task.repetition_penalty,
            stream=task.stream
        )

    def generate_from_ssml(self, task: TTS_Task):
        """Synthesize an SSML task by delegating to ``SSML_Dealer``."""
        dealer = SSML_Dealer()
        return dealer.generate_from_ssml(task.ssml, self)

    def generate(
        self,
        task: TTS_Task,
        return_type: Literal["filepath", "numpy"] = "numpy",
        save_path: str = None,
    ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
        """
        Run a task and return its audio.

        Args:
            task (TTS_Task): The task; "text" and "ssml" task types are handled.
            return_type: "numpy" returns the audio generator (None for other
                task types); "filepath" writes the first produced item to a file.
            save_path (str, optional): Target file for "filepath"; a timestamped
                file under "tmp_audio/" when None.

        Returns:
            The generator, or the path of the written file.

        Raises:
            ValueError: If "filepath" is requested for a task type that yields no audio.
        """
        if self.debug_mode:
            print(f"task: {task}")
        gen = None
        if task.task_type == "text":
            gen = self.generate_from_text(task)
        elif task.task_type == "ssml":
            gen = self.generate_from_ssml(task)

        if return_type == "numpy":
            return gen
        elif return_type == "filepath":
            if gen is None:
                raise ValueError(f"Cannot write audio for task_type {task.task_type!r}")
            if save_path is None:
                # Fall back to wav so an unset format does not yield a ".None" file.
                save_path = f"tmp_audio/{datetime.now().strftime('%Y%m%d%H%M%S')}.{task.format or 'wav'}"
            sr, audio_data = next(gen)
            save_dir = os.path.dirname(save_path)
            if save_dir:
                # A bare file name has no directory part; makedirs("") would fail.
                os.makedirs(save_dir, exist_ok=True)
            sf.write(save_path, audio_data, sr)
            return save_path

    @staticmethod
    def calc_short_md5(string):
        """Return the first 8 hex characters of the MD5 digest of ``string``."""
        m = hashlib.md5()
        m.update(string.encode())
        return m.hexdigest()[:8]

    def get_ref_infos(self, character, emotion) -> Tuple[str, str, str]:
        """
        Look up the reference audio and prompt for a character's emotion.

        Args:
            character: Character folder name under ``models_path``.
            emotion: Emotion name; the first configured emotion is used when it
                is not configured for the character.

        Returns:
            (ref_audio_path, prompt_text, prompt_language), or (None, None, None)
            when the character's config has no emotions.
        """
        if self.debug_mode:
            print(f"try to get ref infos, character: {character}, emotion: {emotion}")
        character_path = os.path.join(self.models_path, character)
        config: Dict[str, Any] = load_infer_config(character_path)
        emotion_dict: Dict = config.get("emotion_list", None)
        if not emotion_dict:
            # Missing or empty emotion list.
            return None, None, None
        if emotion not in emotion_dict:
            emotion = next(iter(emotion_dict))
        details = emotion_dict[emotion]
        # Resolve against the requested character's folder (the path used to be
        # built from self.character, ignoring the argument).
        ref_audio_path = os.path.join(character_path, details['ref_wav_path'])
        prompt_text = details['prompt_text']
        prompt_language = details['prompt_language']
        return ref_audio_path, prompt_text, prompt_language

    def get_wav_from_text_api(
        self,
        text: str,
        text_language="auto",
        ref_audio_path=None,
        prompt_text=None,
        prompt_language="auto",
        batch_size=1,
        speed=1.0,
        top_k=12,
        top_p=0.6,
        temperature=0.6,
        cut_method="auto_cut",
        max_cut_length=100,
        seed=-1,
        stream=False,
        parallel_infer=True,
        repetition_penalty=1.35,
        **kwargs
    ):
        """
        Normalise the text, assemble the pipeline parameters and start synthesis.

        Args:
            text (str): Text to synthesize.
            text_language / prompt_language: Keys of ``dict_language``; both fall
                back to "auto" when either is not a known key.
            ref_audio_path: Existing reference audio file (required).
            prompt_text: Transcript of the reference audio.
            cut_method: Text split method; "auto_cut" is combined with ``max_cut_length``.
            stream: When exactly False the pipeline result is returned directly,
                otherwise a WAV byte stream is returned.
            Remaining arguments are forwarded to the pipeline unchanged.

        Returns:
            The pipeline's result, or the generator from ``get_streaming_tts_wav``.

        Raises:
            AssertionError: If the reference audio is missing.
        """
        text = text.replace("\r", "\n").replace("<br>", "\n").replace("\t", " ")
        text = text.replace("……","。").replace("…","。").replace("\n\n","\n").replace("。\n","\n").replace("\n", "。\n")

        # Guard None explicitly: os.path.exists(None) raises TypeError.
        assert ref_audio_path is not None and os.path.exists(ref_audio_path), f"找不到参考音频文件: {ref_audio_path}"
        prompt_cache_path = ""

        if self.save_prompt_cache:
            # f-string keeps the key identical for strings and tolerates None fields.
            cache_key = self.calc_short_md5(f"{ref_audio_path}{prompt_text}{prompt_language}")
            prompt_cache_path = f"{self.prompt_cache_dir}/prompt_cache_{cache_key}.pickle"

        try:
            text_language = dict_language[text_language]
            prompt_language = dict_language[prompt_language]
            if "-" in text_language:
                text_language = text_language.split("-")[0]
            if "-" in prompt_language:
                prompt_language = prompt_language.split("-")[0]
        except (KeyError, TypeError):
            # Unknown (or unhashable) language selector: let the pipeline detect it.
            text_language = "auto"
            prompt_language = "auto"
        ref_free = False

        if cut_method == "auto_cut":
            cut_method = f"auto_cut_{max_cut_length}"

        params = {
            "text": text,
            "text_lang": text_language.lower(),
            "prompt_cache_path": prompt_cache_path,
            "ref_audio_path": ref_audio_path,
            "prompt_text": prompt_text,
            "prompt_lang": prompt_language.lower(),
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "text_split_method": cut_method,
            "batch_size": batch_size,
            "speed_factor": speed,
            "ref_text_free": ref_free,
            "split_bucket": True,
            "return_fragment": stream,
            "seed": seed,
            "parallel_infer": parallel_infer,
            "repetition_penalty": repetition_penalty
        }
        # NOTE(review): get_streaming_tts_wav is a generator and the pipeline's
        # run() result is iterated lazily elsewhere, so the lock presumably only
        # covers creating the generator, not the inference itself - confirm
        # whether synthesis is meant to be serialized.
        with self.lock:
            # Kept as an explicit comparison: a None stream takes the streaming
            # branch, exactly as before.
            if stream == False:  # noqa: E712
                return self.tts_pipline.run(params)
            else:
                return self.get_streaming_tts_wav(params)

    @staticmethod
    def params_parser(data) -> TTS_Task:
        """Build a task directly from a parameter mapping."""
        task = TTS_Task(**data)
        return task

    @staticmethod
    def ms_like_parser(data) -> TTS_Task:
        """
        Build a task from a Microsoft-like request.

        When ``data["inputs"][0]["text"]`` exists it is copied into
        ``data["text"]`` (``data`` is modified in place) before the task is built.
        """
        inputs = data.get("inputs", [])
        try:
            data["text"] = inputs[0]["text"]
        except (IndexError, KeyError, TypeError):
            # No usable "inputs" entry: keep whatever text data already has.
            pass
        task = TTS_Task(**data)
        return task
|
Synthesizers/gsv_fast/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .GSV_Synthesizer import GSV_Synthesizer as TTS_Synthesizer
|
2 |
+
from .gsv_task import GSV_TTS_Task as TTS_Task
|
Synthesizers/gsv_fast/configs/i18n/locale/en_US.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
", 返回内容:": ", Return Content:",
|
3 |
+
"<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>This is the model management interface. It allows you to assign emotions to multiple reference audio segments. If you only have one segment, you can skip using this interface.</p><p>If you have questions or need further information, please refer to the documentation: <a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">Click to view detailed documentation</a>.</p>",
|
4 |
+
"Endpoint": "Endpoint",
|
5 |
+
"GPT模型路径": "GPT Model Path",
|
6 |
+
"Sovits模型路径": "SoVITS Model Path",
|
7 |
+
"Temperature": "Temperature",
|
8 |
+
"Top K": "Top K",
|
9 |
+
"Top P": "Top P",
|
10 |
+
"all_ja": "Japanese Only",
|
11 |
+
"all_zh": "Chinese Only",
|
12 |
+
"auto": "Auto Detect",
|
13 |
+
"auto_cut": "Smart Split",
|
14 |
+
"batch_size,1代表不并行,越大越快,但是越可能出问题": "Batch Size: 1 means no parallel processing. Larger values are faster but more prone to issues.",
|
15 |
+
"cut0": "Split by Line Break Only",
|
16 |
+
"cut1": "Group Four Sentences Together",
|
17 |
+
"cut2": "Group 50 Characters Together",
|
18 |
+
"cut3": "Split by Chinese Period",
|
19 |
+
"cut4": "Split by English Period",
|
20 |
+
"cut5": "Split by Punctuation",
|
21 |
+
"en": "English",
|
22 |
+
"https://space.bilibili.com/66633770": "https://github.com/X-T-E-R",
|
23 |
+
"https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
|
24 |
+
"ja": "Japanese",
|
25 |
+
"json设置(一般不动)": "JSON Settings (Do not change it unless you know what you are doing)",
|
26 |
+
"zh": "Chinese",
|
27 |
+
"不切": "Do Not Split",
|
28 |
+
"人物情感列表网址": "Character Emotion List URL",
|
29 |
+
"从json中读取": "Read from JSON",
|
30 |
+
"使用前,请确认后端服务已启动。": "Before using, please ensure the backend service is running.",
|
31 |
+
"保存json\n(可能不会有完成提示,没报错就是成功)": "Save JSON\n(There may not be a completion notice; no error means success)",
|
32 |
+
"保存失败!": "Save Failed!",
|
33 |
+
"保存成功!": "Save Successful!",
|
34 |
+
"停止播放": "Stop Playback",
|
35 |
+
"切句方式": "Sentence Splitting Method",
|
36 |
+
"前端处理后的文本(每句):": "Front-end Processed Text (Per Sentence):",
|
37 |
+
"参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range. Please replace it!",
|
38 |
+
"参考音频路径": "Reference Audio Path",
|
39 |
+
"发送json格式": "Send in JSON",
|
40 |
+
"发送并开始播放": "Send and Start Playback",
|
41 |
+
"发送请求": "Send Request",
|
42 |
+
"发送请求到": "Send Request to",
|
43 |
+
"吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, or adjusting the batch size.",
|
44 |
+
"吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, changing the reference audio (using the model management interface), or adjusting the batch size.",
|
45 |
+
"基础选项": "Basic Options",
|
46 |
+
"实际输入的参考文本:": "Actual Reference Text Input:",
|
47 |
+
"实际输入的目标文本(切句后):": "Actual Target Text Input (After Splitting):",
|
48 |
+
"实际输入的目标文本(每句):": "Actual Target Text Input (Per Sentence):",
|
49 |
+
"实际输入的目标文本:": "Actual Target Text Input:",
|
50 |
+
"密码": "Password",
|
51 |
+
"当前人物": "Current Character",
|
52 |
+
"当前人物变更为: ": "Current Character Changed to: ",
|
53 |
+
"您在使用经典推理模式,部分选项不可用": "You are using Classic Inference Mode. Some options are unavailable.",
|
54 |
+
"情感列表": "Emotion",
|
55 |
+
"情感风格": "Emotion",
|
56 |
+
"我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "Five little monkeys jumping on the bed, one fell off and bumped his head. Mama called the doctor, and the doctor said, \"No more monkeys jumping on the bed!\"",
|
57 |
+
"扫描": "Scan",
|
58 |
+
"扫描人物列表": "Scan Character List",
|
59 |
+
"扫描模型文件夹:": "Scan Model Folder:",
|
60 |
+
"找不到模型文件!请把有效文件放置在文件夹下!!!": "Model file not found! Please place valid files in the folder!!!",
|
61 |
+
"提供的推理特化包,当前版本:": ", Current Version: ",
|
62 |
+
"提示": "Tip",
|
63 |
+
"提示文本": "Prompt Text",
|
64 |
+
"提示语言": "Prompt Language",
|
65 |
+
"文件打开失败,保存失败!": "File Opening Failed, Save Failed!",
|
66 |
+
"文本语言": "Text Language",
|
67 |
+
"是否自动匹配情感": "Automatically Match Emotions",
|
68 |
+
"模型文件夹路径": "Model Folder Path",
|
69 |
+
"每句允许最大切分字词数": "Max Words per Split Sentence",
|
70 |
+
"流式音频": "Streaming Audio",
|
71 |
+
"添加情感": "Add Emotion",
|
72 |
+
"点击查看详细文档": "Click to View Detailed Documentation",
|
73 |
+
"版本": "Version",
|
74 |
+
"用户名": "Username",
|
75 |
+
"种子": "Seed",
|
76 |
+
"简介": "Introduction",
|
77 |
+
"缺失某些项,保存失败!": "Missing Some Items, Save Failed!",
|
78 |
+
"网址设置": "URL Settings",
|
79 |
+
"自动生成info": "Auto Generate Info",
|
80 |
+
"若有疑问或需要进一步了解,可参考文档:": "If you have questions or need further information, please refer to the documentation: ",
|
81 |
+
"认证信息": "Authentication Info",
|
82 |
+
"认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "Authentication is enabled. You can disable it in config.json.\nHowever, this feature is not fully implemented yet and is just for show.",
|
83 |
+
"语速": "Speed",
|
84 |
+
"请修改后点击下方按钮进行保存": "Please modify and click the button below to save",
|
85 |
+
"请求失败,状态码:": "Request Failed, Status Code:",
|
86 |
+
"请求失败,请检查URL是否正确": "Request Failed. Please check if the URL is correct.",
|
87 |
+
"请求完整音频": "Request Complete Audio",
|
88 |
+
"请求网址": "Request URL",
|
89 |
+
"输入文本": "Input Text",
|
90 |
+
"这是一个由": "This is an Inference Specialization Package provided by ",
|
91 |
+
"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "This is a configuration file for https://github.com/X-T-E-R/TTS-for-GPT-soVITS, a simple and easy-to-use frontend and backend project",
|
92 |
+
"这是展示页面的版本,并未使用后端服务,下面参数无效。": "This is a demonstration page version and does not utilize backend services, the parameters below are invalid.",
|
93 |
+
"选择角色": "Select Character",
|
94 |
+
"音频输出": "Audio Output",
|
95 |
+
"音频预览": "Audio Preview",
|
96 |
+
"项目开源地址:": "Github Link: ",
|
97 |
+
"高级选项": "Advanced Options",
|
98 |
+
"最大允许长度": "Max Length Allowed"
|
99 |
+
}
|
Synthesizers/gsv_fast/configs/i18n/locale/zh_CN.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
", 返回内容:": ", 返回内容:",
|
3 |
+
"<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>",
|
4 |
+
"Endpoint": "Endpoint",
|
5 |
+
"GPT模型路径": "GPT模型路径",
|
6 |
+
"Sovits模型路径": "Sovits模型路径",
|
7 |
+
"Temperature": "Temperature",
|
8 |
+
"Top K": "Top K",
|
9 |
+
"Top P": "Top P",
|
10 |
+
"all_ja": "只有日文",
|
11 |
+
"all_zh": "只有中文",
|
12 |
+
"auto": "自动判断",
|
13 |
+
"auto_cut": "智能切分",
|
14 |
+
"batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题",
|
15 |
+
"cut0": "仅凭换行切分",
|
16 |
+
"cut1": "凑四句一切",
|
17 |
+
"cut2": "凑50字一切",
|
18 |
+
"cut3": "按中文句号。切",
|
19 |
+
"cut4": "按英文句号.切",
|
20 |
+
"cut5": "按标点符号切",
|
21 |
+
"en": "英文",
|
22 |
+
"https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
|
23 |
+
"https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
|
24 |
+
"ja": "日文",
|
25 |
+
"json设置(一般不动)": "json设置(一般不动)",
|
26 |
+
"zh": "中文",
|
27 |
+
"不切": "不切",
|
28 |
+
"人物情感列表网址": "人物情感列表网址",
|
29 |
+
"从json中读取": "从json中读取",
|
30 |
+
"使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。",
|
31 |
+
"保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)",
|
32 |
+
"保存失败!": "保存失败!",
|
33 |
+
"保存成功!": "保存成功!",
|
34 |
+
"停止播放": "停止播放",
|
35 |
+
"切句方式": "切句方式",
|
36 |
+
"前端处理后的文本(每句):": "前端处理后的文本(每句):",
|
37 |
+
"参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!",
|
38 |
+
"参考音频路径": "参考音频路径",
|
39 |
+
"发送json格式": "发送json格式",
|
40 |
+
"发送并开始播放": "发送并开始播放",
|
41 |
+
"发送请求": "发送请求",
|
42 |
+
"发送请求到": "发送请求到",
|
43 |
+
"吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。",
|
44 |
+
"吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。",
|
45 |
+
"基础选项": "基础选项",
|
46 |
+
"实际输入的参考文本:": "实际输入的参考文本:",
|
47 |
+
"实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):",
|
48 |
+
"实际输入的目标文本(每句):": "实际输入的目标文本(每句):",
|
49 |
+
"实际输入的目标文本:": "实际输入的目标文本:",
|
50 |
+
"密码": "密码",
|
51 |
+
"当前人物": "当前人物",
|
52 |
+
"当前人物变更为: ": "当前人物变更为: ",
|
53 |
+
"您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用",
|
54 |
+
"情感列表": "情感列表",
|
55 |
+
"情感风格": "情感风格",
|
56 |
+
"我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。",
|
57 |
+
"扫描": "扫描",
|
58 |
+
"扫描人物列表": "扫描人物列表",
|
59 |
+
"扫描模型文件夹:": "扫描模型文件夹:",
|
60 |
+
"找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!",
|
61 |
+
"提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:",
|
62 |
+
"提示": "提示",
|
63 |
+
"提示文本": "提示文本",
|
64 |
+
"提示语言": "提示语言",
|
65 |
+
"文件打开失败,保存失败!": "文件打开失败,保存失败!",
|
66 |
+
"文本语言": "文本语言",
|
67 |
+
"是否自动匹配情感": "是否自动匹配情感",
|
68 |
+
"模型文件夹路径": "模型文件夹路径",
|
69 |
+
"每句允许最大切分字词数": "每句允许最大切分字词数",
|
70 |
+
"流式音频": "流式音频",
|
71 |
+
"添加情感": "添加情感",
|
72 |
+
"点击查看详细文档": "点击查看详细文档",
|
73 |
+
"版本": "版本",
|
74 |
+
"用户名": "用户名",
|
75 |
+
"种子": "种子",
|
76 |
+
"简介": "简介",
|
77 |
+
"缺失某些项,保存失败!": "缺失某些项,保存失败!",
|
78 |
+
"网址设置": "网址设置",
|
79 |
+
"自动生成info": "自动生成info",
|
80 |
+
"若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:",
|
81 |
+
"认证信息": "认证信息",
|
82 |
+
"认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设",
|
83 |
+
"语速": "语速",
|
84 |
+
"请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存",
|
85 |
+
"请求失败,状态码:": "请求失败,状态码:",
|
86 |
+
"请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确",
|
87 |
+
"请求完整音频": "请求完整音频",
|
88 |
+
"请求网址": "请求网址",
|
89 |
+
"输入文本": "输入文本",
|
90 |
+
"这是一个由": "这是一个由",
|
91 |
+
"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目",
|
92 |
+
"这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。",
|
93 |
+
"选择角色": "选择角色",
|
94 |
+
"音频输出": "音频输出",
|
95 |
+
"音频预览": "音频预览",
|
96 |
+
"项目开源地址:": "项目开源地址:",
|
97 |
+
"高级选项": "高级选项",
|
98 |
+
"最大允许长度": "最大允许长度"
|
99 |
+
}
|
Synthesizers/gsv_fast/configs/i18n/locale/zh_TW.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
", 返回内容:": ", 返回內容:",
|
3 |
+
"<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面</p><p>若有疑問或需要進一步了解,可參考文件:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">點擊查看詳細文件</a>。</p>",
|
4 |
+
"Endpoint": "Endpoint",
|
5 |
+
"GPT模型路径": "GPT模型路徑",
|
6 |
+
"Sovits模型路径": "Sovits模型路徑",
|
7 |
+
"Temperature": "Temperature",
|
8 |
+
"Top K": "Top K",
|
9 |
+
"Top P": "Top P",
|
10 |
+
"all_ja": "僅日文",
|
11 |
+
"all_zh": "僅中文",
|
12 |
+
"auto": "自動判斷",
|
13 |
+
"auto_cut": "智慧切分",
|
14 |
+
"batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題",
|
15 |
+
"cut0": "僅憑換行切分",
|
16 |
+
"cut1": "湊四句一切",
|
17 |
+
"cut2": "湊50字一切",
|
18 |
+
"cut3": "按中文句號。切",
|
19 |
+
"cut4": "按英文句號.切",
|
20 |
+
"cut5": "按標點符號切",
|
21 |
+
"en": "英文",
|
22 |
+
"https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
|
23 |
+
"https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
|
24 |
+
"ja": "日文",
|
25 |
+
"json设置(一般不动)": "json設置(一般不動)",
|
26 |
+
"zh": "中文",
|
27 |
+
"不切": "不切",
|
28 |
+
"人物情感列表网址": "人物情緒列表網址",
|
29 |
+
"从json中读取": "從json中讀取",
|
30 |
+
"使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。",
|
31 |
+
"保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)",
|
32 |
+
"保存失败!": "保存失敗!",
|
33 |
+
"保存成功!": "保存成功!",
|
34 |
+
"停止播放": "停止播放",
|
35 |
+
"切句方式": "切句方式",
|
36 |
+
"前端处理后的文本(每句):": "前端處理後的文本(每句):",
|
37 |
+
"参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
|
38 |
+
"参考音频路径": "參考音頻路徑",
|
39 |
+
"发送json格式": "發送json格式",
|
40 |
+
"发送并开始播放": "發送並開始播放",
|
41 |
+
"发送请求": "發送請求",
|
42 |
+
"发送请求到": "發送請求到",
|
43 |
+
"吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。",
|
44 |
+
"吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。",
|
45 |
+
"基础选项": "基礎選項",
|
46 |
+
"实际输入的参考文本:": "實際輸入的參考文本:",
|
47 |
+
"实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
|
48 |
+
"实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
|
49 |
+
"实际输入的目标文本:": "實際輸入的目標文本:",
|
50 |
+
"密码": "密碼",
|
51 |
+
"当前人物": "當前人物",
|
52 |
+
"当前人物变更为: ": "當前人物變更為: ",
|
53 |
+
"您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用",
|
54 |
+
"情感列表": "情緒列表",
|
55 |
+
"情感风格": "情緒風格",
|
56 |
+
"我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。",
|
57 |
+
"扫描": "掃描",
|
58 |
+
"扫描人物列表": "掃描人物列表",
|
59 |
+
"扫描模型文件夹:": "掃描模型文件夾:",
|
60 |
+
"找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!",
|
61 |
+
"提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:",
|
62 |
+
"提示": "提示",
|
63 |
+
"提示文本": "提示文本",
|
64 |
+
"提示语言": "提示語言",
|
65 |
+
"文件打开失败,保存失败!": "文件開啟失敗,保存失敗!",
|
66 |
+
"文本语言": "文本語言",
|
67 |
+
"是否自动匹配情感": "是否自動匹配情緒",
|
68 |
+
"模型文件夹路径": "模型文件夾路徑",
|
69 |
+
"每句允许最大切分字词数": "每句允許最大切分字詞數",
|
70 |
+
"流式音频": "流式音頻",
|
71 |
+
"添加情感": "添加情緒",
|
72 |
+
"点击查看详细文档": "點擊查看詳細文件",
|
73 |
+
"版本": "版本",
|
74 |
+
"用户名": "使用者名稱",
|
75 |
+
"种子": "種子",
|
76 |
+
"简介": "簡介",
|
77 |
+
"缺失某些项,保存失败!": "缺失某些項,保存失敗!",
|
78 |
+
"网址设置": "網址設置",
|
79 |
+
"自动生成info": "自動生成info",
|
80 |
+
"若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:",
|
81 |
+
"认证信息": "認證信息",
|
82 |
+
"认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設",
|
83 |
+
"语速": "語速",
|
84 |
+
"请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存",
|
85 |
+
"请求失败,状态码:": "請求失敗,狀態碼:",
|
86 |
+
"请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確",
|
87 |
+
"请求完整音频": "請求完整音頻",
|
88 |
+
"请求网址": "請求網址",
|
89 |
+
"输入文本": "輸入文本",
|
90 |
+
"这是一个由": "這是一個由",
|
91 |
+
"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目",
|
92 |
+
"这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。",
|
93 |
+
"选择角色": "選擇角色",
|
94 |
+
"音频输出": "音頻輸出",
|
95 |
+
"音频预览": "音頻預覽",
|
96 |
+
"项目开源地址:": "Github Link:",
|
97 |
+
"高级选项": "高級選項",
|
98 |
+
"最大允许长度": "最大允許長度"
|
99 |
+
}
|
Synthesizers/gsv_fast/configs/params_config.json
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"task_type":{
|
3 |
+
"type": "str",
|
4 |
+
"description": "Task type for the API.",
|
5 |
+
"alias": ["task_type", "task", "type", "textType"],
|
6 |
+
"default": "tts"
|
7 |
+
},
|
8 |
+
"text": {
|
9 |
+
"type": "str",
|
10 |
+
"label": "文本",
|
11 |
+
"description": "The text to be synthesized.",
|
12 |
+
"alias": ["text", "txt", "tex", "t"],
|
13 |
+
"default": ""
|
14 |
+
},
|
15 |
+
"ssml": {
|
16 |
+
"type": "str",
|
17 |
+
"label": "SSML文本",
|
18 |
+
"description": "The SSML text to be synthesized.",
|
19 |
+
"alias": ["ssml", "text", "txt", "tex", "t"],
|
20 |
+
"default": null
|
21 |
+
},
|
22 |
+
"text_language": {
|
23 |
+
"type": "str",
|
24 |
+
"label": "文本语言",
|
25 |
+
"description": "Language of the text.",
|
26 |
+
"alias": ["text_language", "lang", "language", "lan", "text_lang", "xml:lang"],
|
27 |
+
"choices": ["auto", "zh", "en", "ja", "all_zh", "all_ja"],
|
28 |
+
"default": "auto"
|
29 |
+
},
|
30 |
+
"character": {
|
31 |
+
"type": "str",
|
32 |
+
"label": "角色模型",
|
33 |
+
"description": "Character name for the model.",
|
34 |
+
"alias": ["cha_name", "character", "model_name", "cha", "spk" , "speaker", "name", "role"],
|
35 |
+
"default": ""
|
36 |
+
},
|
37 |
+
"emotion": {
|
38 |
+
"type": "str",
|
39 |
+
"label": "情感风格",
|
40 |
+
"description": "Emotion of the character.",
|
41 |
+
"alias": ["character_emotion", "emotion", "style"],
|
42 |
+
"default": "default"
|
43 |
+
},
|
44 |
+
"ref_audio_path":{
|
45 |
+
"type": "str",
|
46 |
+
"component_type":"audio",
|
47 |
+
"label": "参考音频路径, 启用后将忽视emotion参数",
|
48 |
+
"description": "Reference audio path for the model.",
|
49 |
+
"alias": ["ref_audio_path", "ref_audio", "ref_path"],
|
50 |
+
"default": null
|
51 |
+
},
|
52 |
+
"prompt_text": {
|
53 |
+
"type": "str",
|
54 |
+
"label": "参考音频文本",
|
55 |
+
"description": "Reference audio text for the model.",
|
56 |
+
"alias": ["prompt_text", "ref_text"],
|
57 |
+
"default": null
|
58 |
+
},
|
59 |
+
"prompt_language": {
|
60 |
+
"type": "str",
|
61 |
+
"label": "参考音频语言",
|
62 |
+
"description": "Reference audio language for the model.",
|
63 |
+
"alias": ["prompt_language", "ref_lang"],
|
64 |
+
"choices": ["auto", "zh", "en", "ja", "all_zh", "all_ja"],
|
65 |
+
"default": "auto"
|
66 |
+
},
|
67 |
+
"speaker_id": {
|
68 |
+
"type": "int",
|
69 |
+
"label": "角色ID",
|
70 |
+
"description": "Speaker ID for the model.",
|
71 |
+
"alias": ["speaker_id", "id"],
|
72 |
+
"default": null
|
73 |
+
},
|
74 |
+
"batch_size": {
|
75 |
+
"type": "int",
|
76 |
+
"label": "批处理大小",
|
77 |
+
"description": "Batch size for processing.",
|
78 |
+
"alias": ["batch_size", "batch"],
|
79 |
+
"default": 10,
|
80 |
+
"min_value": 1,
|
81 |
+
"max_value": 100,
|
82 |
+
"step": 1
|
83 |
+
},
|
84 |
+
"speed": {
|
85 |
+
"type": "float",
|
86 |
+
"label": "语速",
|
87 |
+
"description": "Speed factor for synthesis.",
|
88 |
+
"alias": ["speed", "speed_factor", "spd", "rate"],
|
89 |
+
"default": 1.0,
|
90 |
+
"min_value": 0.5,
|
91 |
+
"max_value": 2.0,
|
92 |
+
"step": 0.05
|
93 |
+
},
|
94 |
+
"top_k": {
|
95 |
+
"type": "int",
|
96 |
+
"label": "采样Top K",
|
97 |
+
"description": "Top K parameter for sampling.",
|
98 |
+
"alias": ["top_k", "topk"],
|
99 |
+
"default": 5,
|
100 |
+
"min_value": 1,
|
101 |
+
"max_value": 40,
|
102 |
+
"step": 1
|
103 |
+
},
|
104 |
+
"top_p": {
|
105 |
+
"type": "float",
|
106 |
+
"label": "采样Top P",
|
107 |
+
"description": "Top P parameter for sampling.",
|
108 |
+
"alias": ["top_p", "topp"],
|
109 |
+
"default": 0.8,
|
110 |
+
"min_value": 0.1,
|
111 |
+
"max_value": 2.0,
|
112 |
+
"step": 0.01
|
113 |
+
},
|
114 |
+
"temperature": {
|
115 |
+
"type": "float",
|
116 |
+
"label": "采样温度",
|
117 |
+
"description": "Temperature for sampling.",
|
118 |
+
"alias": ["temperature"],
|
119 |
+
"default": 0.8,
|
120 |
+
"min_value": 0.1,
|
121 |
+
"max_value": 2.0,
|
122 |
+
"step": 0.01
|
123 |
+
},
|
124 |
+
"seed": {
|
125 |
+
"type": "int",
|
126 |
+
"label": "随机种子",
|
127 |
+
"description": "Seed for randomness.",
|
128 |
+
"alias": ["seed"],
|
129 |
+
"default": -1
|
130 |
+
},
|
131 |
+
"stream": {
|
132 |
+
"type": "bool",
|
133 |
+
"label": "流式输出",
|
134 |
+
"description": "Stream the audio or not.",
|
135 |
+
"alias": ["stream", "streaming"],
|
136 |
+
"default": false
|
137 |
+
},
|
138 |
+
"save_temp": {
|
139 |
+
"type": "bool",
|
140 |
+
"label": "保存临时输出",
|
141 |
+
"description": "Save the output temporarily.",
|
142 |
+
"alias": ["save_temp", "save"],
|
143 |
+
"default": false
|
144 |
+
},
|
145 |
+
"cut_method": {
|
146 |
+
"type": "str",
|
147 |
+
"label": "文本切割方法",
|
148 |
+
"description": "Method for text cutting.",
|
149 |
+
"alias": ["cut_method", "cut"],
|
150 |
+
"choices": ["auto_cut", "cut0", "cut1", "cut2", "cut3", "cut4", "cut5"],
|
151 |
+
"default": "auto_cut"
|
152 |
+
},
|
153 |
+
"max_cut_length": {
|
154 |
+
"type": "int",
|
155 |
+
"label": "文本切割最大长度",
|
156 |
+
"description": "Maximum length of the text cut.",
|
157 |
+
"alias": ["max_cut_length", "max_cut"],
|
158 |
+
"default": 50,
|
159 |
+
"min_value": 5,
|
160 |
+
"max_value": 1000,
|
161 |
+
"step": 1
|
162 |
+
},
|
163 |
+
"parallel_infer": {
|
164 |
+
"type": "bool",
|
165 |
+
"label": "并行推理",
|
166 |
+
"description": "Parallel inference or not.",
|
167 |
+
"alias": ["parallel_infer", "parallel"],
|
168 |
+
"default": true
|
169 |
+
},
|
170 |
+
"repetition_penalty": {
|
171 |
+
"type": "float",
|
172 |
+
"label": "重复惩罚",
|
173 |
+
"description": "Repetition penalty for sampling.",
|
174 |
+
"alias": ["repetition_penalty", "rep_penalty"],
|
175 |
+
"default": 1.35,
|
176 |
+
"min_value": 0,
|
177 |
+
"max_value": 5,
|
178 |
+
"step": 0.01
|
179 |
+
},
|
180 |
+
"format": {
|
181 |
+
"type": "str",
|
182 |
+
"label": "输出格式",
|
183 |
+
"description": "Format of the output audio.",
|
184 |
+
"alias": ["format"],
|
185 |
+
"default": "wav"
|
186 |
+
},
|
187 |
+
"loudness": {
|
188 |
+
"type": "float",
|
189 |
+
"label": "音量",
|
190 |
+
"description": "Loudness of the audio. Now is unsupported.",
|
191 |
+
"alias": ["loudness", "volume", "vol"],
|
192 |
+
"default": null
|
193 |
+
},
|
194 |
+
"pitch": {
|
195 |
+
"type": "float",
|
196 |
+
"label": "音调",
|
197 |
+
"description": "Pitch of the audio. Now is unsupported.",
|
198 |
+
"alias": ["pitch"],
|
199 |
+
"default": null
|
200 |
+
}
|
201 |
+
}
|
Synthesizers/gsv_fast/configs/ui_config.json
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"ref_settings": [["ref_audio_path", "prompt_text", "prompt_language"]],
|
3 |
+
"basic_settings": [
|
4 |
+
"speed",
|
5 |
+
|
6 |
+
["text_language", "cut_method", "max_cut_length", "batch_size"]
|
7 |
+
],
|
8 |
+
"advanced_settings": [
|
9 |
+
"seed",
|
10 |
+
"parallel_infer",
|
11 |
+
["top_k", "top_p", "temperature", "repetition_penalty"]
|
12 |
+
]
|
13 |
+
}
|
Synthesizers/gsv_fast/gsv_config.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__version__ = "2.4.3 240414"
|
2 |
+
|
3 |
+
import os, json
|
4 |
+
import torch
|
5 |
+
|
6 |
+
import logging
|
7 |
+
|
8 |
+
from pydantic import BaseModel, Field
|
9 |
+
# Only let errors through from these chatty third-party loggers.
for _quiet_logger_name in (
    "markdown_it",
    "urllib3",
    "httpcore",
    "httpx",
    "asyncio",
    "charset_normalizer",
    "torchaudio._extension",
):
    logging.getLogger(_quiet_logger_name).setLevel(logging.ERROR)
|
16 |
+
def test_fp16_computation():
    """Probe whether the CUDA device can run a half-precision matmul.

    Returns:
        A ``(supported, message)`` tuple. ``supported`` is ``False`` when CUDA
        is unavailable or when the probe raises; ``message`` explains why.
    """
    # Nothing to probe without CUDA.
    if not torch.cuda.is_available():
        return False, "CUDA is not available. Please check your installation."

    try:
        # A tiny float16 matrix product on the GPU serves as the probe.
        lhs = torch.randn(3, 3, dtype=torch.float16).cuda()
        rhs = torch.randn(3, 3, dtype=torch.float16).cuda()
        torch.matmul(lhs, rhs)
    except Exception as e:
        # Any failure during the probe is treated as "FP16 not supported".
        return False, f"Your GPU does not support FP16 computation. Error: {e}"

    # The probe ran without raising, so FP16 is considered usable.
    return True, "Your GPU supports FP16 computation."
|
32 |
+
|
33 |
+
|
34 |
+
def get_device_info(device_config="auto", is_half_config="auto")-> tuple[str, bool]:
    """Resolve the inference device and whether to use half precision.

    The result is cached in the module-level globals ``device`` and
    ``is_half``: once they exist, later calls return them unchanged and
    ignore their arguments.

    Args:
        device_config: ``"auto"`` to choose ``"cuda"`` when CUDA is available
            and ``"cpu"`` otherwise, or an explicit device string.
        is_half_config: ``"auto"`` to derive the setting from the device, or
            a value whose string form is compared case-insensitively with
            ``"true"``.

    Returns:
        ``(device, is_half)``.
    """
    global device, is_half
    try:
        # Cached by a previous call.
        return device, is_half
    except NameError:
        # First call: the globals do not exist yet, so compute them below.
        pass

    if torch.cuda.is_available():
        device, is_half = "cuda", True
    else:
        device, is_half = "cpu", False

    if device_config != "auto":
        device = device_config
        # Mirror the automatic choice above: half precision for accelerator
        # devices, full precision on CPU.
        is_half = device != "cpu"
    if is_half_config != "auto":
        is_half = str(is_half_config).lower() == "true"

    # Fall back to full precision when the FP16 probe fails.
    supports_fp16, message = test_fp16_computation()
    if not supports_fp16 and is_half:
        is_half = False
        print(message)

    return device, is_half
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
def load_infer_config(character_path):
    """Load and return the parsed ``infer_config.json`` of a character folder.

    Args:
        character_path: Directory that contains ``infer_config.json``.

    Returns:
        The decoded JSON content of the file.
    """
    with open(os.path.join(character_path, "infer_config.json"), 'r', encoding='utf-8') as config_file:
        return json.load(config_file)
|
68 |
+
|
69 |
+
def auto_generate_infer_config(character_path):
    """Scan a character folder and write an ``infer_config.json`` for it.

    The folder is walked recursively. The first ``.ckpt``, ``.pth`` and
    ``.wav`` files found are recorded (as paths relative to
    ``character_path``). Every ``.mp3`` found is converted to a ``.wav`` next
    to it, and that wav is used as the reference audio if none was found yet.
    The reference audio's file name (without extension) is used as its
    prompt text.

    Args:
        character_path: Directory holding the model and reference audio files.

    Returns:
        The path of the written ``infer_config.json`` when both a ``.ckpt``
        and a ``.pth`` file were found; otherwise an error-message string.

    Raises:
        Exception: If no wav (or mp3) reference audio was found. This check
            happens before the model-file check.
    """
    # TODO: Auto-generate wav-list and prompt-list from character_path

    print(f"正在自动生成配置文件: {character_path}")
    ckpt_file_found = None
    pth_file_found = None
    wav_file_found = None

    # Walk the folder and remember the first file of each relevant type.
    for dirpath, _, filenames in os.walk(character_path):
        for file in filenames:
            full_path = os.path.join(dirpath, file)
            # Paths are stored relative to character_path.
            relative_path = remove_character_path(full_path, character_path)
            lowered = file.lower()
            if lowered.endswith(".ckpt") and ckpt_file_found is None:
                ckpt_file_found = relative_path
            elif lowered.endswith(".pth") and pth_file_found is None:
                pth_file_found = relative_path
            elif lowered.endswith(".wav") and wav_file_found is None:
                wav_file_found = relative_path
            elif lowered.endswith(".mp3"):
                # Imported lazily so pydub is only needed when an mp3 exists.
                import pydub

                # Convert the mp3 to a wav stored next to it.
                wav_file_path = os.path.join(dirpath, os.path.splitext(file)[0] + ".wav")
                pydub.AudioSegment.from_mp3(full_path).export(wav_file_path, format="wav")
                if wav_file_found is None:
                    wav_file_found = remove_character_path(wav_file_path, character_path)

    # gpt_path / sovits_path are always present, even when they are None.
    infer_config = {
        "gpt_path": ckpt_file_found,
        "sovits_path": pth_file_found,
        "software_version": "1.1",
        r"简介": r"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目"
    }

    if not wav_file_found:
        raise Exception("找不到wav参考文件!请把有效wav文件放置在模型文件夹下。")

    # The reference audio's file name (without extension) doubles as its prompt text.
    wav_file_name = os.path.splitext(os.path.basename(wav_file_found))[0]
    infer_config["emotion_list"] = {
        "default": {
            "ref_audio_path": wav_file_found,
            "prompt_text": wav_file_name,
            "prompt_language": "多语种混合"
        }
    }

    # Both model files are required before anything is written.
    if not (ckpt_file_found and pth_file_found):
        return "Required model files (.ckpt or .pth) not found in character_path directory."

    infer_config_path = os.path.join(character_path, "infer_config.json")
    try:
        with open(infer_config_path, 'w', encoding='utf-8') as f:
            json.dump(infer_config, f, ensure_ascii=False, indent=4)
    except IOError as e:
        # Best effort: the failure is reported and the path is still returned.
        print(f"无法写入文件: {infer_config_path}. 错误: {e}")

    return infer_config_path
|
137 |
+
|
138 |
+
|
139 |
+
def remove_character_path(full_path,character_path):
    """Return ``full_path`` expressed relative to ``character_path``."""
    relative = os.path.relpath(full_path, start=character_path)
    return relative
|
Synthesizers/gsv_fast/gsv_task.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os, json, sys
|
3 |
+
sys.path.append(".")
|
4 |
+
|
5 |
+
from uuid import uuid4
|
6 |
+
from typing import List, Dict, Literal, Optional, Any, Union
|
7 |
+
import urllib.parse
|
8 |
+
import hashlib
|
9 |
+
|
10 |
+
from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config
|
11 |
+
|
12 |
+
def get_params_config():
    """Load ``params_config.json`` and pass it through ``init_params_config``.

    The file is looked up under ``Synthesizers/gsv_fast/configs`` relative to
    the current working directory.

    Returns:
        Whatever ``init_params_config`` returns for the decoded JSON.

    Raises:
        FileNotFoundError: If the file cannot be opened, decoded or
            initialised. The underlying error is attached as ``__cause__``.
    """
    try:
        with open(os.path.join("Synthesizers/gsv_fast/configs", "params_config.json"), "r", encoding="utf-8") as f:
            return init_params_config(json.load(f))
    except Exception as err:
        # Keep the exception type callers see, but preserve the real cause
        # and let KeyboardInterrupt / SystemExit propagate untouched.
        raise FileNotFoundError("params_config.json not found or invalid.") from err
|
18 |
+
|
19 |
+
|
20 |
+
params_config = get_params_config()
|
21 |
+
|
22 |
+
from pydantic import BaseModel, Field, model_validator
|
23 |
+
|
24 |
+
class GSV_TTS_Task(Base_TTS_Task):
    """Task parameters for the ``gsv_fast`` synthesizer.

    Extends ``Base_TTS_Task`` with the reference-audio, sampling and
    text-splitting options declared below. Attributes read in ``md5`` that
    are not declared here (``task_type``, ``src``, ``ssml``, ``text``,
    ``character``, ``speed``, ``emotion``) are presumably provided by
    ``Base_TTS_Task`` -- TODO confirm.
    """
    # character: Optional[str] = None
    # emotion: Optional[str] = None
    ref_audio_path: Optional[str] = None
    prompt_text: Optional[str] = None
    prompt_language: Optional[str] = None
    text_language: Optional[str] = None
    speaker_id: Optional[int] = None
    batch_size: Optional[int] = None
    top_k: Optional[int] = None
    top_p: Optional[float] = None
    temperature: Optional[float] = None
    cut_method: Optional[str] = None
    max_cut_length: Optional[int] = None
    seed: Optional[int] = None
    save_temp: Optional[bool] = False
    parallel_infer : Optional[bool] = True
    repetition_penalty : Optional[float] = 1.35
    # the gsv_fast model only supports 32000 sample rate
    sample_rate: int = 32000

    def __init__(self, other_task: Union[BaseModel, dict, None] = None, **data):
        """Create the task, defaulting ``params_config`` to this module's config.

        Args:
            other_task: Forwarded unchanged to ``Base_TTS_Task.__init__``.
            **data: Field values; ``params_config`` is filled in with the
                module-level ``params_config`` when the caller did not pass one.
        """
        data.setdefault('params_config', params_config)
        super().__init__(other_task, **data)

    @property
    def md5(self):
        """Return the hex MD5 digest of the fields selected by ``task_type``.

        ``"audio"`` hashes ``src``, ``"ssml"`` hashes ``ssml``, and ``"text"``
        hashes the text together with the language, character and listed
        synthesis settings. Any other ``task_type`` yields the digest of
        empty input.
        """
        m = hashlib.md5()
        if self.task_type == "audio":
            m.update(self.src.encode())
        elif self.task_type == "ssml":
            m.update(self.ssml.encode())
        elif self.task_type == "text":
            m.update(self.text.encode())
            # NOTE(review): these two are encoded directly, so they must be
            # strings here; the remaining fields go through str() first.
            m.update(self.text_language.encode())
            m.update(self.character.encode())
            m.update(str(self.speaker_id).encode())
            m.update(str(self.speed).encode())
            m.update(str(self.top_k).encode())
            m.update(str(self.top_p).encode())
            m.update(str(self.temperature).encode())
            m.update(str(self.cut_method).encode())
            m.update(str(self.emotion).encode())
        return m.hexdigest()
|
68 |
+
|
69 |
+
|
70 |
+
|
Synthesizers/gsv_fast/ssml_dealer.py
ADDED
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os, json
|
2 |
+
from typing import List , Dict
|
3 |
+
from uuid import uuid4
|
4 |
+
|
5 |
+
import sys
|
6 |
+
sys.path.append(".")
|
7 |
+
|
8 |
+
import xml.etree.ElementTree as ET
|
9 |
+
from .gsv_task import GSV_TTS_Task as TTS_Task
|
10 |
+
from Synthesizers.base import Base_TTS_Synthesizer, ParamItem, init_params_config
|
11 |
+
|
12 |
+
import tempfile
|
13 |
+
import soundfile as sf
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
import requests, librosa
|
17 |
+
|
18 |
+
|
19 |
+
# Named speed keywords mapped to numeric values; used when resolving the
# 'speed' attribute of an SSML element (see SSML_Dealer.analyze_element).
special_dict_speed = {
    "x-slow": 0.5,
    "slow": 0.75,
    "medium": 1.0,
    "fast": 1.25,
    "x-fast": 1.5,
    "default": 1.0
}
|
27 |
+
|
28 |
+
|
29 |
+
# <break strength="..."> keywords mapped to a pause length in seconds
# (the value is multiplied by the sample rate to size the silence buffer).
special_dict_break_strength = {
    "x-weak": 0.25,
    "weak": 0.5,
    "medium": 0.75,
    "strong": 1.0,
    "x-strong": 1.25,
    "default": 0.75
}
|
37 |
+
|
38 |
+
|
39 |
+
def load_time(time: str) -> float:
    """Convert a duration string to seconds.

    Recognised suffixes are ``ms`` (milliseconds), ``s`` (seconds) and
    ``min`` (minutes); a bare number is taken as seconds. Leading and
    trailing whitespace is ignored.

    Args:
        time: Duration such as ``"750ms"``, ``"2s"``, ``"1.5min"`` or ``"3"``.

    Returns:
        The duration in seconds.

    Raises:
        ValueError: If the numeric part cannot be parsed as a float.
    """
    value = time.strip()
    # "ms" must be tested before "s", otherwise "750ms" would be read as "750m" seconds.
    if value.endswith("ms"):
        return float(value[:-2]) / 1000
    if value.endswith("s"):
        return float(value[:-1])
    if value.endswith("min"):
        return float(value[:-3]) * 60
    return float(value)
|
47 |
+
|
48 |
+
def get_value_from_special_dict(key: str, special_dict: Dict[str, float]) -> float:
    """Translate *key* through *special_dict*, falling back to *key* itself.

    Args:
        key: Keyword (or literal value) taken from an SSML attribute.
        special_dict: Mapping from known keywords to numeric values.

    Returns:
        The mapped number when *key* is a known keyword; otherwise *key*
        unchanged (so the result may still be a string).
    """
    return special_dict.get(key, key)
|
52 |
+
|
53 |
+
class SSML_Dealer:
    """Turn an SSML document into TTS tasks and stitch the audio back together.

    Typical flow (see ``generate_from_ssml``): parse the document, create one
    task per element / tail text, synthesize every queued task to a wav file,
    download any ``<audio src=...>`` clips, then walk the tree again and
    concatenate everything in document order.
    """

    def __init__(self, params_config: Dict[str, ParamItem]):
        """Create an empty dealer.

        Args:
            params_config: NOTE(review): currently unused - the schema is read
                from a default-constructed task instead. Confirm whether the
                argument should take precedence.
        """
        self.ssml: str = ""
        # Every task created while parsing, keyed by task uuid.
        self.task_list: Dict[str, TTS_Task] = {}
        # uuids of the tasks that have non-empty text and must be synthesized.
        self.task_queue: List[str] = []
        # Entries are {"uuid": <task uuid>, "src": <audio url>}.
        self.audio_download_queue: List[Dict[str, str]] = []
        self.root: ET.Element = None
        self.tts_synthesizer = None
        self.params_config = TTS_Task().params_config

    def get_value_from_root(self, root: ET.Element, key: str, special_dict: Dict[str, float] = None):
        """Read parameter *key* from the attributes of *root*.

        Each alias declared for *key* in ``self.params_config`` is tried in
        order; the first attribute present wins.

        Args:
            root: Element whose attributes are inspected.
            key: Parameter name as used in ``self.params_config``.
            special_dict: Optional keyword table applied to the raw value.

        Returns:
            The attribute value (translated through *special_dict* when
            given), or ``None`` when *key* is unknown or no alias is present.
        """
        if key not in self.params_config:
            return None
        for alias in self.params_config[key].alias:
            raw_value = root.get(alias)
            if raw_value is None:
                continue
            if special_dict is not None:
                return get_value_from_special_dict(raw_value, special_dict)
            return raw_value
        return None

    def analyze_element(self, root: ET.Element, father_task: TTS_Task):
        """Recursively register tasks for *root*, its tail text and its children.

        Side effects: adds tasks to ``task_list`` / ``task_queue``, queues
        ``src`` downloads, normalises ``root.tag`` and stores the task uuids
        on the element as the ``uuid`` / ``tail_uuid`` attributes.

        Args:
            root: Element to analyse.
            father_task: Task passed to ``TTS_Task(...)`` as the template.
                NOTE(review): children receive this same ``father_task``
                rather than the task created for *root*, so values set on
                *root* do not appear to propagate downwards - confirm intent.
        """
        task = TTS_Task(father_task)
        self.task_list[task.uuid] = task
        root.set("uuid", task.uuid)
        # Drop the "{namespace}" part that ElementTree prepends to tags.
        # NOTE(review): after this the tag is a bare local name, so the
        # "mstts:"-prefixed entries below presumably never match - confirm.
        root.tag = root.tag.split('}')[-1].lower()
        task.text = root.text.strip() if root.text is not None else ""
        print(f"--------{root.tag} : {task.text}") # debug
        if root.tag in ["audio", "mstts:backgroundaudio"]:
            if root.get("src") is not None:
                self.audio_download_queue.append({"uuid": task.uuid, "src": root.get("src")})
                task.text = ""
        else:
            # These elements never carry text that should be spoken.
            if root.tag in ["bookmark", "break", "mstts:silence", "mstts:viseme"]:
                task.text = ""

        task.update_value('text_language', self.get_value_from_root(root, 'text_language'))
        task.update_value('character', self.get_value_from_root(root, 'character'))
        task.update_value('emotion', self.get_value_from_root(root, 'emotion'))
        task.update_value('speed', self.get_value_from_root(root, 'speed', special_dict_speed))

        # task.update_value('top_k', root)
        # task.update_value('top_p', root)
        # task.update_value('temperature', root)
        # task.update_value('batch_size', root)

        # task.update_value('loudness', root) # need to recheck
        # task.update_value('pitch', root)

        task.stream = False
        if task.text.strip() != "":
            self.task_queue.append(task.uuid)
        if root.tail is not None:
            # Text that follows the closing tag gets its own task.
            new_task = TTS_Task(father_task)
            self.task_list[new_task.uuid] = new_task
            new_task.text = root.tail.strip()
            if new_task.text != "":
                self.task_queue.append(new_task.uuid)
                root.set("tail_uuid", new_task.uuid)
        for child in root:
            self.analyze_element(child, father_task)

    def generate_audio_from_element(self, root: ET.Element, default_silence: float = 0.3) -> np.ndarray:
        """Concatenate the audio of *root*, its children and its tail text.

        Assumes every audio file has already been generated or downloaded.

        Args:
            root: Element previously processed by ``analyze_element``.
            default_silence: Seconds of silence appended after each element;
                ``0`` or less disables it. The value is applied to the whole
                subtree.

        Returns:
            The samples for this subtree, in document order.
        """
        audio_data = np.array([])
        task_uuid = root.get("uuid")
        task = self.task_list[task_uuid]
        sr = 32000
        # print(f"--------{root.tag}") # debug
        if root.tag in ["break"]:
            # print(f"--------break: {root.get('time')}") # debug
            time_ = root.get("time")
            duration = 0.75
            if time_ is not None:
                duration = load_time(time_)
            # A recognised strength keyword takes precedence over "time".
            strength_ = root.get("strength")
            if strength_ in special_dict_break_strength:
                duration = special_dict_break_strength[strength_]
            audio_data = np.zeros(int(duration * sr))
        elif task.audio_path not in ["", None]:
            audio_data, sr = sf.read(task.audio_path)

        for child in root:
            # Forward default_silence so a caller-supplied value is honoured
            # at every depth, not only on the outermost element.
            child_audio = self.generate_audio_from_element(child, default_silence)
            audio_data = np.concatenate([audio_data, child_audio])

        if default_silence > 0:
            audio_data = np.concatenate([audio_data, np.zeros(int(default_silence * sr))])

        if root.get("tail_uuid") is not None:
            audio_path = self.task_list[root.get("tail_uuid")].audio_path
            if audio_path not in ["", None]:
                audio_data_tail, sr = sf.read(audio_path)
                audio_data = np.concatenate([audio_data, audio_data_tail])

        return audio_data

    def read_ssml(self, ssml: str):
        """Parse *ssml* and build the task tree.

        Raises:
            ValueError: If parsing or analysis fails; the original error is
                chained as ``__cause__``.
        """
        self.ssml = ssml
        try:
            self.root = ET.fromstring(ssml)
            self.analyze_element(self.root, None)
        except Exception as e:
            raise ValueError("Invalid SSML.") from e

    def generate_tasks(self, tts_synthesizer, tmp_dir: str):
        """Synthesize every queued task into ``<tmp_dir>/<task uuid>.wav``.

        Args:
            tts_synthesizer: Object exposing ``generate_from_text(task)`` that
                returns an iterator of ``(sample_rate, samples)``; only the
                first item is used.
            tmp_dir: Directory receiving the wav files.
        """
        # Sort by character first so tasks for the same character run back to
        # back. ``or ""`` keeps the sort from failing when a character is None.
        self.task_queue.sort(key=lambda x: self.task_list[x].character or "")
        for task_uuid in self.task_queue:
            task = self.task_list[task_uuid]
            if task.text.strip() == "":
                continue
            gen = tts_synthesizer.generate_from_text(task)
            sr, audio_data = next(gen)

            tmp_file = os.path.join(tmp_dir, f"{task.uuid}.wav")

            sf.write(tmp_file, audio_data, sr, format='wav')
            task.audio_path = tmp_file

    def download_audio(self, tmp_dir: str, sample_rate: int = 32000):
        """Download every queued ``src`` clip and resample it to *sample_rate*.

        Each clip is fetched, loaded at *sample_rate*, rewritten in wav format
        to a file in *tmp_dir*, and recorded as the ``audio_path`` of its task.

        Raises:
            requests.RequestException: On network failure, timeout or a
                non-success HTTP status.
        """
        from urllib.parse import urlparse  # local import keeps the block self-contained

        for audio in self.audio_download_queue:
            # TODO: downloads run sequentially; the original note suggests
            # moving them to a separate thread.
            src = audio["src"]
            response = requests.get(src, timeout=30)
            # Fail here with a clear HTTP error instead of later while
            # decoding an error page as audio.
            response.raise_for_status()
            # Take the extension from the URL path only, so query strings or
            # extension-less URLs cannot leak "/" or "?" into the file name.
            extension = os.path.splitext(urlparse(src).path)[1] or ".wav"
            tmp_file = os.path.join(tmp_dir, f"{uuid4()}{extension}")
            with open(tmp_file, 'wb') as f:
                f.write(response.content)
            # Resample.
            audio_data, sr = librosa.load(tmp_file, sr=sample_rate)
            sf.write(tmp_file, audio_data, sr, format='wav')
            self.task_list[audio["uuid"]].audio_path = tmp_file

    def generate_from_ssml(self, ssml: str, tts_synthesizer, format: str = "wav"):
        """Render *ssml* to a single audio file and return its path.

        Intermediate per-task files live in a temporary directory that is
        removed before returning, so ``audio_path`` values stored on the
        tasks are no longer valid afterwards. The returned file is not
        deleted automatically.

        Args:
            ssml: SSML document text.
            tts_synthesizer: Synthesizer passed to ``generate_tasks``.
            format: Output container format, also used as the file suffix.

        Returns:
            Path of the rendered audio file (written at 32000 Hz).
        """
        self.read_ssml(ssml)
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.generate_tasks(tts_synthesizer, tmp_dir)
            self.download_audio(tmp_dir)
            audio_data = self.generate_audio_from_element(self.root)
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{format}") as tmp_file:
            sf.write(tmp_file, audio_data, 32000, format=format)
            return tmp_file.name
|
198 |
+
|
199 |
+
if __name__ == "__main__":
|
200 |
+
ssml = """
|
201 |
+
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
|
202 |
+
<audio src="https://d38nvwmjovqyq6.cloudfront.net/va90web25003/companions/Foundations%20of%20Rock/5.04.mp3" >
|
203 |
+
</audio>
|
204 |
+
<voice name="en-US-AvaNeural">
|
205 |
+
Welcome <break /> to text to speech.
|
206 |
+
Welcome <break strength="medium" /> to text to speech.
|
207 |
+
Welcome <break time="750ms" /> to text to speech.
|
208 |
+
</voice>
|
209 |
+
</speak>
|
210 |
+
"""
|
211 |
+
# ssml_dealer = SSML_Dealer()
|
212 |
+
# # tts_synthesizer = TTS_synthesizer()
|
213 |
+
# print(ssml_dealer.generate_from_ssml(ssml, tts_synthesizer))
|
214 |
+
|
215 |
+
# for task in ssml_dealer.task_list.values():
|
216 |
+
# print(task)
|
Synthesizers/remote/Remote_Synthesizer.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io, wave
|
2 |
+
import os, json, sys
|
3 |
+
import threading
|
4 |
+
|
5 |
+
from Synthesizers.base import Base_TTS_Synthesizer ,load_config
|
6 |
+
|
7 |
+
from .remote_task import Remote_TTS_Task as TTS_Task, set_based_synthesizer, get_ui_config
|
8 |
+
import requests
|
9 |
+
from urllib import parse
|
10 |
+
from datetime import datetime
|
11 |
+
from typing import Union, Generator, Tuple, Any, Optional, Dict, Literal
|
12 |
+
import numpy as np
|
13 |
+
import soundfile as sf
|
14 |
+
|
15 |
+
class Remote_Synthesizer(Base_TTS_Synthesizer):
    """Synthesizer that forwards tasks to a remote TTS HTTP service.

    Connection settings come from ``configs/config.json`` next to this module
    (or ``config_path``) and can be overridden by keyword arguments.
    """

    url: str = "http://127.0.0.1:5000"
    tts_endpoint: str = "/tts"
    character_endpoint: str = "/character_list"
    based_synthesizer: str = "gsv_fast"

    class Config:
        extra = "ignore"

    def __init__(self, config_path: str = None, **kwargs):
        """Load settings and register the underlying synthesizer name.

        Args:
            config_path: JSON config file; defaults to ``configs/config.json``
                beside this module.
            **kwargs: Passed to the base class and applied on top of the file
                values for attributes that already exist on the instance.
        """
        super().__init__(**kwargs)
        if config_path is None:
            config_path = os.path.join(os.path.dirname(__file__), "configs", "config.json")
        config_dict = load_config(config_path)
        config_dict.update(kwargs)
        for key, value in config_dict.items():
            # Only known attributes are overridden; unknown keys are ignored.
            if hasattr(self, key):
                setattr(self, key, value)
        set_based_synthesizer(self.based_synthesizer)
        self.ui_config = get_ui_config(self.based_synthesizer)

    def get_characters(self) -> dict:
        """Fetch the character list from the remote service and decode it as JSON."""
        url = self.url + self.character_endpoint
        res = requests.get(url)
        return json.loads(res.text)

    @staticmethod
    def stream_audio(url, data: Dict[str, Any]) -> Generator[Tuple[int, np.ndarray], None, None]:
        """POST *data* as JSON to *url* and yield the response as audio chunks.

        The body is interpreted as 16-bit integer samples and each yielded
        item is ``(32000, samples)``.

        Raises:
            Exception: If the service does not answer with status 200.
        """
        headers = {"Content-Type": "application/json"}
        chunk_size = 1024
        # Open the POST request as a stream; the context manager releases the
        # connection once the generator finishes or is closed.
        with requests.post(
            url, data=json.dumps(data), headers=headers, stream=True
        ) as response:
            # Make sure the request succeeded.
            if response.status_code != 200:
                raise Exception(
                    f"Failed to get audio stream, status code: {response.status_code}"
                )
            pending = b""
            # Read the audio stream chunk by chunk.
            for chunk in response.iter_content(chunk_size):
                # int16 needs an even byte count: carry a dangling byte over
                # to the next chunk instead of letting np.frombuffer raise.
                buffer = pending + chunk
                usable = len(buffer) - len(buffer) % 2
                pending = buffer[usable:]
                if usable:
                    audiodata = np.frombuffer(buffer[:usable], dtype=np.int16)
                    yield 32000, audiodata

    def generate(
        self,
        task: TTS_Task,
        return_type: Literal["filepath", "numpy"] = "numpy",
        save_path: Optional[str] = None,
    ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
        """Run *task* on the remote service.

        Args:
            task: Task whose ``data`` dict is sent as the JSON request body.
            return_type: ``"filepath"`` to get the saved file path,
                ``"numpy"`` to get ``(sample_rate, samples)`` items.
            save_path: Where to store the audio for non-streaming requests;
                defaults to a timestamped file under ``tmp_audio/``.

        Returns:
            The file path for ``"filepath"``; a generator of
            ``(sample_rate, samples)`` for ``"numpy"`` (a single item unless
            ``task.stream`` is set); ``None`` for any other *return_type*.

        Raises:
            Exception: If the remote service does not answer with status 200.
        """
        url = self.url + self.tts_endpoint
        data = task.data

        if self.debug_mode:
            print(return_type)
            print(f"generate task: \n{data}")
        headers = {"Content-Type": "application/json"}
        if return_type == "filepath" or (
            return_type == "numpy" and not task.stream
        ):
            if save_path is None:
                save_path = f"tmp_audio/{datetime.now().strftime('%Y%m%d%H%M%S')}.wav"
            res = requests.post(url, data=json.dumps(data), headers=headers)
            if res.status_code == 200:
                # The target directory (e.g. tmp_audio/) may not exist yet.
                parent_dir = os.path.dirname(save_path)
                if parent_dir:
                    os.makedirs(parent_dir, exist_ok=True)
                with open(save_path, "wb") as f:
                    f.write(res.content)
                if return_type == "filepath":
                    return save_path
                else:
                    audiodata, sr = sf.read(save_path)
                    return ((sr, audiodata) for _ in range(1))
            else:
                raise Exception(f"remote synthesizer error: {res.text}")

        elif return_type == "numpy" and task.stream:
            return self.stream_audio(url, data)

    def params_parser(self, data) -> TTS_Task:
        """Build a task for the configured underlying synthesizer from *data*."""
        task = TTS_Task(based_synthesizer=self.based_synthesizer, **data)
        return task

    def ms_like_parser(self, data) -> TTS_Task:
        """Build a task from *data*; currently identical to ``params_parser``."""
        task = TTS_Task(based_synthesizer=self.based_synthesizer, **data)
        return task
|
Synthesizers/remote/__init__.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
from .Remote_Synthesizer import Remote_Synthesizer as TTS_Synthesizer
|
2 |
+
from .remote_task import Remote_TTS_Task as TTS_Task
|
Synthesizers/remote/configs/config.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"url": "http://localhost:5000",
|
3 |
+
"tts_endpoint": "/tts",
|
4 |
+
"character_endpoint": "/character_list",
|
5 |
+
"based_synthesizer": "gsv_fast"
|
6 |
+
}
|
Synthesizers/remote/configs/i18n/locale/en_US.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
", 返回内容:": ", Return Content:",
|
3 |
+
"<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>This is the model management interface. It allows you to assign emotions to multiple reference audio segments. If you only have one segment, you can skip using this interface.</p><p>If you have questions or need further information, please refer to the documentation: <a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">Click to view detailed documentation</a>.</p>",
|
4 |
+
"Endpoint": "Endpoint",
|
5 |
+
"GPT模型路径": "GPT Model Path",
|
6 |
+
"Sovits模型路径": "SoVITS Model Path",
|
7 |
+
"Temperature": "Temperature",
|
8 |
+
"Top K": "Top K",
|
9 |
+
"Top P": "Top P",
|
10 |
+
"all_ja": "Japanese Only",
|
11 |
+
"all_zh": "Chinese Only",
|
12 |
+
"auto": "Auto Detect",
|
13 |
+
"auto_cut": "Smart Split",
|
14 |
+
"batch_size,1代表不并行,越大越快,但是越可能出问题": "Batch Size: 1 means no parallel processing. Larger values are faster but more prone to issues.",
|
15 |
+
"cut0": "Split by Line Break Only",
|
16 |
+
"cut1": "Group Four Sentences Together",
|
17 |
+
"cut2": "Group 50 Characters Together",
|
18 |
+
"cut3": "Split by Chinese Period",
|
19 |
+
"cut4": "Split by English Period",
|
20 |
+
"cut5": "Split by Punctuation",
|
21 |
+
"en": "English",
|
22 |
+
"https://space.bilibili.com/66633770": "https://github.com/X-T-E-R",
|
23 |
+
"https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
|
24 |
+
"ja": "Japanese",
|
25 |
+
"json设置(一般不动)": "JSON Settings (Do not change it unless you know what you are doing)",
|
26 |
+
"zh": "Chinese",
|
27 |
+
"不切": "Do Not Split",
|
28 |
+
"人物情感列表网址": "Character Emotion List URL",
|
29 |
+
"从json中读取": "Read from JSON",
|
30 |
+
"使用前,请确认后端服务已启动。": "Before using, please ensure the backend service is running.",
|
31 |
+
"保存json\n(可能不会有完成提示,没报错就是成功)": "Save JSON\n(There may not be a completion notice; no error means success)",
|
32 |
+
"保存失败!": "Save Failed!",
|
33 |
+
"保存成功!": "Save Successful!",
|
34 |
+
"停止播放": "Stop Playback",
|
35 |
+
"切句方式": "Sentence Splitting Method",
|
36 |
+
"前端处理后的文本(每句):": "Front-end Processed Text (Per Sentence):",
|
37 |
+
"参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range. Please replace it!",
|
38 |
+
"参考音频路径": "Reference Audio Path",
|
39 |
+
"发送json格式": "Send in JSON",
|
40 |
+
"发送并开始播放": "Send and Start Playback",
|
41 |
+
"发送请求": "Send Request",
|
42 |
+
"发送请求到": "Send Request to",
|
43 |
+
"吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, or adjusting the batch size.",
|
44 |
+
"吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, changing the reference audio (using the model management interface), or adjusting the batch size.",
|
45 |
+
"基础选项": "Basic Options",
|
46 |
+
"实际输入的参考文本:": "Actual Reference Text Input:",
|
47 |
+
"实际输入的目标文本(切句后):": "Actual Target Text Input (After Splitting):",
|
48 |
+
"实际输入的目标文本(每句):": "Actual Target Text Input (Per Sentence):",
|
49 |
+
"实际输入的目标文本:": "Actual Target Text Input:",
|
50 |
+
"密码": "Password",
|
51 |
+
"当前人物": "Current Character",
|
52 |
+
"当前人物变更为: ": "Current Character Changed to: ",
|
53 |
+
"您在使用经典推理模式,部分选项不可用": "You are using Classic Inference Mode. Some options are unavailable.",
|
54 |
+
"情感列表": "Emotion",
|
55 |
+
"情感风格": "Emotion",
|
56 |
+
"我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "Five little monkeys jumping on the bed, one fell off and bumped his head. Mama called the doctor, and the doctor said, \"No more monkeys jumping on the bed!\"",
|
57 |
+
"扫描": "Scan",
|
58 |
+
"扫描人物列表": "Scan Character List",
|
59 |
+
"扫描模型文件夹:": "Scan Model Folder:",
|
60 |
+
"找不到模型文件!请把有效文件放置在文件夹下!!!": "Model file not found! Please place valid files in the folder!!!",
|
61 |
+
"提供的推理特化包,当前版本:": ", Current Version: ",
|
62 |
+
"提示": "Tip",
|
63 |
+
"提示文本": "Prompt Text",
|
64 |
+
"提示语言": "Prompt Language",
|
65 |
+
"文件打开失败,保存失败!": "File Opening Failed, Save Failed!",
|
66 |
+
"文本语言": "Text Language",
|
67 |
+
"是否自动匹配情感": "Automatically Match Emotions",
|
68 |
+
"模型文件夹路径": "Model Folder Path",
|
69 |
+
"每句允许最大切分字词数": "Max Words per Split Sentence",
|
70 |
+
"流式音频": "Streaming Audio",
|
71 |
+
"添加情感": "Add Emotion",
|
72 |
+
"点击查看详细文档": "Click to View Detailed Documentation",
|
73 |
+
"版本": "Version",
|
74 |
+
"用户名": "Username",
|
75 |
+
"种子": "Seed",
|
76 |
+
"简介": "Introduction",
|
77 |
+
"缺失某些项,保存失败!": "Missing Some Items, Save Failed!",
|
78 |
+
"网址设置": "URL Settings",
|
79 |
+
"自动生成info": "Auto Generate Info",
|
80 |
+
"若有疑问或需要进一步了解,可参考文档:": "If you have questions or need further information, please refer to the documentation: ",
|
81 |
+
"认证信息": "Authentication Info",
|
82 |
+
"认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "Authentication is enabled. You can disable it in config.json.\nHowever, this feature is not fully implemented yet and is just for show.",
|
83 |
+
"语速": "Speed",
|
84 |
+
"请修改后点击下方按钮进行保存": "Please modify and click the button below to save",
|
85 |
+
"请求失败,状态码:": "Request Failed, Status Code:",
|
86 |
+
"请求失败,请检查URL是否正确": "Request Failed. Please check if the URL is correct.",
|
87 |
+
"请求完整音频": "Request Complete Audio",
|
88 |
+
"请求网址": "Request URL",
|
89 |
+
"输入文本": "Input Text",
|
90 |
+
"这是一个由": "This is a Inference Specialization Package provided by ",
|
91 |
+
"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "This is a configuration file for https://github.com/X-T-E-R/TTS-for-GPT-soVITS, a simple and easy-to-use frontend and backend project",
|
92 |
+
"这是展示页面的版本,并未使用后端服务,下面参数无效。": "This is a demonstration page version and does not utilize backend services, the parameters below are invalid.",
|
93 |
+
"选择角色": "Select Character",
|
94 |
+
"音频输出": "Audio Output",
|
95 |
+
"音频预览": "Audio Preview",
|
96 |
+
"项目开源地址:": "Github Link: ",
|
97 |
+
"高级选项": "Advanced Options",
|
98 |
+
"最大允许长度": "Max Length Allowed"
|
99 |
+
}
|
Synthesizers/remote/configs/i18n/locale/zh_CN.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
", 返回内容:": ", 返回内容:",
|
3 |
+
"<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>",
|
4 |
+
"Endpoint": "Endpoint",
|
5 |
+
"GPT模型路径": "GPT模型路径",
|
6 |
+
"Sovits模型路径": "Sovits模型路径",
|
7 |
+
"Temperature": "Temperature",
|
8 |
+
"Top K": "Top K",
|
9 |
+
"Top P": "Top P",
|
10 |
+
"all_ja": "只有日文",
|
11 |
+
"all_zh": "只有中文",
|
12 |
+
"auto": "自动判断",
|
13 |
+
"auto_cut": "智能切分",
|
14 |
+
"batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题",
|
15 |
+
"cut0": "仅凭换行切分",
|
16 |
+
"cut1": "凑四句一切",
|
17 |
+
"cut2": "凑50字一切",
|
18 |
+
"cut3": "按中文句号。切",
|
19 |
+
"cut4": "按英文句号.切",
|
20 |
+
"cut5": "按标点符号切",
|
21 |
+
"en": "英文",
|
22 |
+
"https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
|
23 |
+
"https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
|
24 |
+
"ja": "日文",
|
25 |
+
"json设置(一般不动)": "json设置(一般不动)",
|
26 |
+
"zh": "中文",
|
27 |
+
"不切": "不切",
|
28 |
+
"人物情感列表网址": "人物情感列表网址",
|
29 |
+
"从json中读取": "从json中读取",
|
30 |
+
"使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。",
|
31 |
+
"保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)",
|
32 |
+
"保存失败!": "保存失败!",
|
33 |
+
"保存成功!": "保存成功!",
|
34 |
+
"停止播放": "停止播放",
|
35 |
+
"切句方式": "切句方式",
|
36 |
+
"前端处理后的文本(每句):": "前端处理后的文本(每句):",
|
37 |
+
"参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!",
|
38 |
+
"参考音频路径": "参考音频路径",
|
39 |
+
"发送json格式": "发送json格式",
|
40 |
+
"发送并开始播放": "发送并开始播放",
|
41 |
+
"发送请求": "发送请求",
|
42 |
+
"发送请求到": "发送请求到",
|
43 |
+
"吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。",
|
44 |
+
"吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。",
|
45 |
+
"基础选项": "基础选项",
|
46 |
+
"实际输入的参考文本:": "实际输入的参考文本:",
|
47 |
+
"实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):",
|
48 |
+
"实际输入的目标文本(每句):": "实际输入的目标文本(每句):",
|
49 |
+
"实际输入的目标文本:": "实际输入的目标文本:",
|
50 |
+
"密码": "密码",
|
51 |
+
"当前人物": "当前人物",
|
52 |
+
"当前人物变更为: ": "当前人物变更为: ",
|
53 |
+
"您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用",
|
54 |
+
"情感列表": "情感列表",
|
55 |
+
"情感风格": "情感风格",
|
56 |
+
"我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。",
|
57 |
+
"扫描": "扫描",
|
58 |
+
"扫描人物列表": "扫描人物列表",
|
59 |
+
"扫描模型文件夹:": "扫描模型文件夹:",
|
60 |
+
"找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!",
|
61 |
+
"提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:",
|
62 |
+
"提示": "提示",
|
63 |
+
"提示文本": "提示文本",
|
64 |
+
"提示语言": "提示语言",
|
65 |
+
"文件打开失败,保存失败!": "文件打开失败,保存失败!",
|
66 |
+
"文本语言": "文本语言",
|
67 |
+
"是否自动匹配情感": "是否自动匹配情感",
|
68 |
+
"模型文件夹路径": "模型文件夹路径",
|
69 |
+
"每句允许最大切分字词数": "每句允许最大切分字词数",
|
70 |
+
"流式音频": "流式音频",
|
71 |
+
"添加情感": "添加情感",
|
72 |
+
"点击查看详细文档": "点击查看详细文档",
|
73 |
+
"��本": "版本",
|
74 |
+
"用户名": "用户名",
|
75 |
+
"种子": "种子",
|
76 |
+
"简介": "简介",
|
77 |
+
"缺失某些项,保存失败!": "缺失某些项,保存失败!",
|
78 |
+
"网址设置": "网址设置",
|
79 |
+
"自动生成info": "自动生成info",
|
80 |
+
"若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:",
|
81 |
+
"认证信息": "认证信息",
|
82 |
+
"认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设",
|
83 |
+
"语速": "语速",
|
84 |
+
"请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存",
|
85 |
+
"请求失败,状态码:": "请求失败,状态码:",
|
86 |
+
"请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确",
|
87 |
+
"请求完整音频": "请求完整音频",
|
88 |
+
"请求网址": "请求网址",
|
89 |
+
"输入文本": "输入文本",
|
90 |
+
"这是一个由": "这是一个由",
|
91 |
+
"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目",
|
92 |
+
"这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。",
|
93 |
+
"选择角色": "选择角色",
|
94 |
+
"音频输出": "音频输出",
|
95 |
+
"音频预览": "音频预览",
|
96 |
+
"项目开源地址:": "项目开源地址:",
|
97 |
+
"高级选项": "高级选项",
|
98 |
+
"最大允许长度": "最大允许长度"
|
99 |
+
}
|
Synthesizers/remote/configs/i18n/locale/zh_TW.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
", 返回内容:": ", 返回內容:",
|
3 |
+
"<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面</p><p>若有疑問或需要進一步了解,可參考文件:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">點擊查看詳細文件</a>。</p>",
|
4 |
+
"Endpoint": "Endpoint",
|
5 |
+
"GPT模型路径": "GPT模型路徑",
|
6 |
+
"Sovits模型路径": "Sovits模型路徑",
|
7 |
+
"Temperature": "Temperature",
|
8 |
+
"Top K": "Top K",
|
9 |
+
"Top P": "Top P",
|
10 |
+
"all_ja": "僅日文",
|
11 |
+
"all_zh": "僅中文",
|
12 |
+
"auto": "自動判斷",
|
13 |
+
"auto_cut": "智慧切分",
|
14 |
+
"batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題",
|
15 |
+
"cut0": "僅憑換行切分",
|
16 |
+
"cut1": "湊四句一切",
|
17 |
+
"cut2": "湊50字一切",
|
18 |
+
"cut3": "按中文句號。切",
|
19 |
+
"cut4": "按英文句號.切",
|
20 |
+
"cut5": "按標點符號切",
|
21 |
+
"en": "英文",
|
22 |
+
"https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
|
23 |
+
"https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
|
24 |
+
"ja": "日文",
|
25 |
+
"json设置(一般不动)": "json設置(一般不動)",
|
26 |
+
"zh": "中文",
|
27 |
+
"不切": "不切",
|
28 |
+
"人物情感列表网址": "人物情緒列表網址",
|
29 |
+
"从json中读取": "從json中讀取",
|
30 |
+
"使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。",
|
31 |
+
"保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)",
|
32 |
+
"保存失败!": "保存失敗!",
|
33 |
+
"保存成功!": "保存成功!",
|
34 |
+
"停止播放": "停止播放",
|
35 |
+
"切句方式": "切句方式",
|
36 |
+
"前端处理后的文本(每句):": "前端處理後的文本(每句):",
|
37 |
+
"参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
|
38 |
+
"参考音频路径": "參考音頻路徑",
|
39 |
+
"发送json格式": "發送json格式",
|
40 |
+
"发送并开始播放": "發送並開始播放",
|
41 |
+
"发送请求": "發送請求",
|
42 |
+
"发送请求到": "發送請求到",
|
43 |
+
"吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。",
|
44 |
+
"吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。",
|
45 |
+
"基础选项": "基礎選項",
|
46 |
+
"实际输入的参考文本:": "實際輸入的參考文本:",
|
47 |
+
"实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
|
48 |
+
"实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
|
49 |
+
"实际输入的目标文本:": "實際輸入的目標文本:",
|
50 |
+
"密码": "密碼",
|
51 |
+
"当前人物": "當前人物",
|
52 |
+
"当前人物变更为: ": "當前人物變更為: ",
|
53 |
+
"您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用",
|
54 |
+
"情感列表": "情緒列表",
|
55 |
+
"情感风格": "情緒風格",
|
56 |
+
"我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。",
|
57 |
+
"扫描": "掃描",
|
58 |
+
"扫描人物列表": "掃描人物列表",
|
59 |
+
"扫描模型文件夹:": "掃描模型文件夾:",
|
60 |
+
"找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!",
|
61 |
+
"提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:",
|
62 |
+
"提示": "提示",
|
63 |
+
"提示文本": "提示文本",
|
64 |
+
"提示语言": "提示語言",
|
65 |
+
"文件打开失败,保存失败!": "文件開啟失敗,保存失敗!",
|
66 |
+
"文本语言": "文本語言",
|
67 |
+
"是否自动匹配情感": "是否自動匹配情緒",
|
68 |
+
"模型文件夹路径": "模型文件夾路徑",
|
69 |
+
"每句允许最大切分字词数": "每句允許最大切分字詞數",
|
70 |
+
"流式音频": "流式音頻",
|
71 |
+
"添加情感": "添加情緒",
|
72 |
+
"点击查看详细文档": "點擊查看詳細��件",
|
73 |
+
"版本": "版本",
|
74 |
+
"用户名": "使用者名稱",
|
75 |
+
"种子": "種子",
|
76 |
+
"简介": "簡介",
|
77 |
+
"缺失某些项,保存失败!": "缺失某些項,保存失敗!",
|
78 |
+
"网址设置": "網址設置",
|
79 |
+
"自动生成info": "自動生成info",
|
80 |
+
"若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:",
|
81 |
+
"认证信息": "認證信息",
|
82 |
+
"认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設",
|
83 |
+
"语速": "語速",
|
84 |
+
"请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存",
|
85 |
+
"请求失败,状态码:": "請求失敗,狀態碼:",
|
86 |
+
"请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確",
|
87 |
+
"请求完整音频": "請求完整音頻",
|
88 |
+
"请求网址": "請求網址",
|
89 |
+
"输入文本": "輸入文本",
|
90 |
+
"这是一个由": "這是一個由",
|
91 |
+
"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目",
|
92 |
+
"这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。",
|
93 |
+
"选择角色": "選擇角色",
|
94 |
+
"音频输出": "音頻輸出",
|
95 |
+
"音频预览": "音頻預覽",
|
96 |
+
"项目开源地址:": "Github Link:",
|
97 |
+
"高级选项": "高級選項",
|
98 |
+
"最大允许长度": "最大允許長度"
|
99 |
+
}
|
Synthesizers/remote/configs/params_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
|
3 |
+
}
|
Synthesizers/remote/configs/ui_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
|
3 |
+
}
|
Synthesizers/remote/remote_task.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os, json, sys
|
3 |
+
sys.path.append(".")
|
4 |
+
|
5 |
+
from uuid import uuid4
|
6 |
+
from typing import List, Dict, Literal, Optional, Any, Union
|
7 |
+
import urllib.parse
|
8 |
+
import hashlib
|
9 |
+
|
10 |
+
from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config
|
11 |
+
|
12 |
+
# Name of the underlying synthesizer; None until set_based_synthesizer() is called.
# (The module-level ``global`` statement has no effect and is kept as-is.)
global global_based_synthesizer
global_based_synthesizer = None
|
14 |
+
|
15 |
+
def set_based_synthesizer(based_synthesizer:str):
    """Record *based_synthesizer* as the module-wide underlying synthesizer name.

    get_ui_config() falls back to this value when called without an argument.
    """
    global global_based_synthesizer
    global_based_synthesizer = based_synthesizer
|
18 |
+
|
19 |
+
def get_params_config(based_synthesizer:str= None):
    """Load and initialise the merged parameter configuration.

    The remote synthesizer's own ``configs/params_config.json`` is read first,
    then the based synthesizer's ``params_config.json`` is layered on top
    (its keys win), and the merged dict is passed to ``init_params_config``.

    Args:
        based_synthesizer: Name of the package under ``Synthesizers/`` to take
            the base configuration from. When omitted, the module-wide default
            set through ``set_based_synthesizer`` is used.

    Returns:
        Whatever ``init_params_config`` returns for the merged configuration.

    Raises:
        AssertionError: If no based synthesizer is known.
        FileNotFoundError: If either file cannot be read or parsed, or the
            merged configuration cannot be initialised. The underlying error
            is attached as ``__cause__``.
    """
    if based_synthesizer is None:
        # Same fallback that get_ui_config applies.
        based_synthesizer = global_based_synthesizer
    if based_synthesizer is None:
        # Raised explicitly so the check survives `python -O`, while keeping
        # the exception type the original `assert` produced.
        raise AssertionError("based_synthesizer is not set, please init the remote synthesizer first.")

    try:
        remote_path = os.path.join(os.path.dirname(__file__), "configs", "params_config.json")
        with open(remote_path, "r", encoding="utf-8") as f:
            res: dict = json.load(f)

        # NOTE: this path is relative to the current working directory.
        based_path = os.path.join("Synthesizers", based_synthesizer, "configs", "params_config.json")
        with open(based_path, "r", encoding="utf-8") as f:
            res.update(json.load(f))

        return init_params_config(res)
    except Exception as err:
        # `Exception` rather than a bare `except` so KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real cause visible.
        raise FileNotFoundError("params_config.json not found or invalid.") from err


# Cached result of get_params_config(); filled lazily by Remote_TTS_Task.__init__.
params_config = None
|
31 |
+
|
32 |
+
def get_ui_config(based_synthesizer:str= None)->Dict[str, Any]:
    """Load the merged UI configuration.

    The remote synthesizer's own ``configs/ui_config.json`` is read first and
    the based synthesizer's ``ui_config.json`` is layered on top (its keys win).

    Args:
        based_synthesizer: Name of the package under ``Synthesizers/`` to take
            the base configuration from. When omitted, the module-wide default
            set through ``set_based_synthesizer`` is used.

    Returns:
        The merged configuration dictionary.

    Raises:
        AssertionError: If no based synthesizer is known.
        FileNotFoundError: If either file cannot be read or parsed. The
            underlying error is attached as ``__cause__``.
    """
    if based_synthesizer is None:
        based_synthesizer = global_based_synthesizer
    if based_synthesizer is None:
        # Raised explicitly so the check survives `python -O`, while keeping
        # the exception type the original `assert` produced.
        raise AssertionError("based_synthesizer is not set, please init the remote synthesizer first.")

    remote_ui_config_path = os.path.join(os.path.dirname(__file__), "configs", "ui_config.json")
    # NOTE: this path is relative to the current working directory.
    based_ui_config_path = os.path.join("Synthesizers", based_synthesizer, "configs", "ui_config.json")

    ui_config: Dict[str, Any] = {}
    try:
        # Order matters: the based synthesizer's values override the remote ones.
        for config_path in (remote_ui_config_path, based_ui_config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                ui_config.update(json.load(f))
        return ui_config
    except Exception as err:
        # `Exception` rather than a bare `except` so KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real cause visible.
        raise FileNotFoundError("ui_config.json not found or invalid.") from err
|
49 |
+
|
50 |
+
from pydantic import BaseModel, Field, model_validator
|
51 |
+
from copy import deepcopy
|
52 |
+
class Remote_TTS_Task(Base_TTS_Task):
    """TTS task built from raw request parameters for the remote synthesizer.

    The keyword arguments given to the constructor are kept untouched in
    ``data`` and are what the ``md5`` fingerprint is computed from.
    """

    # Defaults to True for tasks of this class.
    is_remote: Optional[bool] = True
    # Raw keyword arguments this task was created with.
    data : dict = {}

    class Config:
        # NOTE(review): in pydantic this setting drops unknown fields instead
        # of rejecting them - confirm against the pydantic version in use.
        extra = "ignore"

    def __init__(self, based_synthesizer:str=None, **data):
        """Create the task, loading the shared params config on first use.

        Args:
            based_synthesizer: Synthesizer package to take the parameter
                configuration from; falls back to the module-wide default.
            **data: Task parameters forwarded to the base class.
        """
        global params_config

        if based_synthesizer is None:
            based_synthesizer = global_based_synthesizer
        assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first."

        # The parsed parameter configuration is loaded once and then reused.
        if params_config is None:
            params_config = get_params_config(based_synthesizer)

        # The base class receives a deep copy, so the kwargs stored in
        # `self.data` are the caller's own, without the injected entry.
        init_kwargs = deepcopy(data)
        init_kwargs.setdefault("params_config", params_config)
        super().__init__(**init_kwargs)
        self.data = data

    @property
    def md5(self):
        """Hex MD5 digest of the string form of the raw task parameters."""
        return hashlib.md5(str(self.data).encode()).hexdigest()

    def __str__(self):
        """Return the base class's string representation unchanged."""
        return f"{super().__str__()}"
|
81 |
+
|
82 |
+
|
app.py
CHANGED
@@ -28,6 +28,20 @@ max_text_length = inference_config.max_text_length
|
|
28 |
from tools.i18n.i18n import I18nAuto
|
29 |
i18n = I18nAuto(locale_path="i18n/locale")
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
import nltk
|
32 |
nltk.data.path.append(os.path.abspath(os.path.join(now_dir,"nltk_data")))
|
33 |
|
@@ -400,6 +414,23 @@ with gr.Blocks() as app:
|
|
400 |
],
|
401 |
)
|
402 |
|
403 |
-
|
404 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
405 |
|
|
|
|
|
|
28 |
from tools.i18n.i18n import I18nAuto
|
29 |
i18n = I18nAuto(locale_path="i18n/locale")
|
30 |
|
31 |
+
|
32 |
+
from Synthesizers.base import Base_TTS_Synthesizer, Base_TTS_Task, get_wave_header_chunk
|
33 |
+
from importlib import import_module
|
34 |
+
|
35 |
+
synthesizer_name = inference_config.synthesizer
|
36 |
+
|
37 |
+
# 动态导入合成器模块, 此处可写成 from Synthesizers.xxx import TTS_Synthesizer, TTS_Task
|
38 |
+
synthesizer_module = import_module(f"Synthesizers.{synthesizer_name}")
|
39 |
+
TTS_Synthesizer = synthesizer_module.TTS_Synthesizer
|
40 |
+
TTS_Task = synthesizer_module.TTS_Task
|
41 |
+
|
42 |
+
# 创建合成器实例
|
43 |
+
tts_synthesizer:Base_TTS_Synthesizer = TTS_Synthesizer(debug_mode=True)
|
44 |
+
|
45 |
import nltk
|
46 |
nltk.data.path.append(os.path.abspath(os.path.join(now_dir,"nltk_data")))
|
47 |
|
|
|
414 |
],
|
415 |
)
|
416 |
|
417 |
+
import uvicorn
|
418 |
+
from pure_api import tts, character_list, set_tts_synthesizer
|
419 |
+
from fastapi import FastAPI
|
420 |
+
from fastapi.middleware.cors import CORSMiddleware
|
421 |
+
|
422 |
+
set_tts_synthesizer(tts_synthesizer)
|
423 |
+
fastapi_app:FastAPI = app.app
|
424 |
+
fastapi_app.add_api_route("/tts", tts, methods=["POST", "GET"])
|
425 |
+
fastapi_app.add_api_route("/character_list", character_list, methods=["GET"])
|
426 |
+
|
427 |
+
fastapi_app.add_middleware(
|
428 |
+
CORSMiddleware,
|
429 |
+
allow_origins=["*"],
|
430 |
+
allow_credentials=True,
|
431 |
+
allow_methods=["*"],
|
432 |
+
allow_headers=["*"],
|
433 |
+
)
|
434 |
|
435 |
+
fastapi_app = gr.mount_gradio_app(fastapi_app, app, path="/")
|
436 |
+
uvicorn.run(fastapi_app, host=inference_config.tts_host, port=inference_config.tts_port)
|
config.json
CHANGED
@@ -10,6 +10,7 @@
|
|
10 |
"max_text_length": -1,
|
11 |
"save_prompt_cache": "true",
|
12 |
"save_model_cache": "false",
|
|
|
13 |
"备注0": "locale是语言环境,auto表示自动选择,如果你想要强制指定语言环境,可以填写zh_CN或者en_US等等",
|
14 |
"备注1": "路径可以填写绝对路径或者相对路径,相对路径指的是在主项目根目录的相对路径",
|
15 |
"备注2": "tts_port是tts服务的端口号,可以自己定义,只要不和其他服务的端口号冲突就行,默认是5000",
|
|
|
10 |
"max_text_length": -1,
|
11 |
"save_prompt_cache": "true",
|
12 |
"save_model_cache": "false",
|
13 |
+
"synthesizer": "gsv_fast",
|
14 |
"备注0": "locale是语言环境,auto表示自动选择,如果你想要强制指定语言环境,可以填写zh_CN或者en_US等等",
|
15 |
"备注1": "路径可以填写绝对路径或者相对路径,相对路径指的是在主项目根目录的相对路径",
|
16 |
"备注2": "tts_port是tts服务的端口号,可以自己定义,只要不和其他服务的端口号冲突就行,默认是5000",
|
gsv_config.json
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"device": "auto",
|
3 |
+
"is_half": "auto",
|
4 |
+
|
5 |
+
"models_path": "trained",
|
6 |
+
"cnhubert_base_path": "pretrained_models/chinese-hubert-base",
|
7 |
+
"bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large",
|
8 |
+
"save_prompt_cache": true,
|
9 |
+
"prompt_cache_dir": "cache/prompt_cache"
|
10 |
+
}
|
pure_api.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 在开头加入路径
|
2 |
+
import os, sys
|
3 |
+
import importlib
|
4 |
+
|
5 |
+
now_dir = os.getcwd()
|
6 |
+
sys.path.append(now_dir)
|
7 |
+
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
8 |
+
|
9 |
+
from src.config_manager import Inference_Config
|
10 |
+
from src.config_manager import __version__ as frontend_version
|
11 |
+
|
12 |
+
inference_config = Inference_Config()
|
13 |
+
|
14 |
+
import soundfile as sf
|
15 |
+
from fastapi import FastAPI, Request, HTTPException
|
16 |
+
from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
|
17 |
+
from fastapi.middleware.cors import CORSMiddleware
|
18 |
+
import tempfile
|
19 |
+
import uvicorn
|
20 |
+
import json
|
21 |
+
|
22 |
+
# 将当前文件所在的目录添加到 sys.path
|
23 |
+
from Synthesizers.base import Base_TTS_Task, Base_TTS_Synthesizer
|
24 |
+
|
25 |
+
# Synthesizer instance shared by the route handlers below.
# It is None until set_tts_synthesizer() is called (or the script entry
# point at the bottom of this file assigns it).
tts_synthesizer:Base_TTS_Synthesizer = None


def set_tts_synthesizer(synthesizer:Base_TTS_Synthesizer):
    """Install the synthesizer instance the /tts and /character_list handlers use."""
    global tts_synthesizer
    tts_synthesizer = synthesizer


# Maps a task's md5 to the path of the audio file already generated for it.
temp_files = {}
|
34 |
+
|
35 |
+
async def character_list(request: Request):
    """Return the synthesizer's available characters as a JSON response.

    The ``request`` argument is accepted but not used by the handler.
    """
    return JSONResponse(tts_synthesizer.get_characters())
|
38 |
+
|
39 |
+
async def tts(request: Request):
    """Handle a ``/tts`` request and respond with synthesized audio.

    Parameters are read from the query string for GET requests and from the
    JSON body otherwise, then turned into a task by the synthesizer's
    ``params_parser``.

    Returns:
        A ``FileResponse`` with the generated (or previously cached) audio
        file for non-streaming tasks, or a ``StreamingResponse`` for
        streaming tasks.

    Raises:
        HTTPException: 400 when the body is not valid JSON or the text/SSML
            is empty; 500 when audio generation fails.
    """
    from time import time as tt

    t1 = tt()
    print(f"Request Time: {t1}")

    # GET carries its parameters in the query string; other methods send JSON.
    if request.method == "GET":
        data = request.query_params
    else:
        try:
            data = await request.json()
        except ValueError as e:
            # json.JSONDecodeError is a ValueError; answer with a client error
            # instead of letting it surface as an unhandled server error.
            raise HTTPException(status_code=400, detail="Request body is not valid JSON") from e

    task: Base_TTS_Task = tts_synthesizer.params_parser(data)

    # HTTPException must be raised, not returned: a returned exception object
    # is treated as an ordinary response value and never sets the status code.
    if task.task_type == "text" and task.text.strip() == "":
        raise HTTPException(status_code=400, detail="Text is empty")
    elif task.task_type == "ssml" and task.ssml.strip() == "":
        raise HTTPException(status_code=400, detail="SSML is empty")

    md5_value = task.md5

    # Compared with `== False` on purpose so that a `stream` value of None
    # keeps taking the streaming branch, exactly as before.
    if task.stream == False:
        # TODO: use SQL instead of dict
        if task.save_temp and md5_value in temp_files:
            cached_path = temp_files[md5_value]
            # Only serve the cached file while it still exists on disk;
            # otherwise fall through and generate it again.
            if os.path.exists(cached_path):
                return FileResponse(path=cached_path, media_type=f'audio/{task.format}')

        try:
            save_path = tts_synthesizer.generate(task, return_type="filepath")
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e

        if task.save_temp:
            temp_files[md5_value] = save_path

        t2 = tt()
        print(f"total time: {t2-t1}")
        # FileResponse takes care of sending the file to the client.
        return FileResponse(save_path, media_type=f"audio/{task.format}", filename=os.path.basename(save_path))
    else:
        gen = tts_synthesizer.generate(task, return_type="numpy")
        return StreamingResponse(gen, media_type='audio/wav')
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
if __name__ == "__main__":
    # Script entry point: build the configured synthesizer and serve the API
    # on its own FastAPI application.

    # Dynamic import of the synthesizer package; equivalent to writing
    # `from Synthesizers.xxx import TTS_Synthesizer, TTS_Task`.
    from importlib import import_module
    from src.api_utils import get_localhost_ipv4_address
    synthesizer_name = inference_config.synthesizer
    synthesizer_module = import_module(f"Synthesizers.{synthesizer_name}")
    TTS_Synthesizer = synthesizer_module.TTS_Synthesizer
    TTS_Task = synthesizer_module.TTS_Task
    tts_synthesizer = TTS_Synthesizer(debug_mode=True)
    # `__version__` is not defined in this module; the version string is
    # imported from src.config_manager under the name `frontend_version`.
    print(f"Backend Version: {frontend_version}")
    tts_host = inference_config.tts_host
    tts_port = inference_config.tts_port
    ipv4_address = get_localhost_ipv4_address(tts_host)
    ipv4_link = f"http://{ipv4_address}:{tts_port}"
    print(f"INFO: Local Network URL: {ipv4_link}")

    app = FastAPI()

    # Configure CORS.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    app.add_api_route('/tts', tts, methods=["GET", "POST"])
    app.add_api_route('/character_list', character_list, methods=["GET"])
    uvicorn.run(app, host=tts_host, port=tts_port)
|
src/config_manager.py
CHANGED
@@ -37,6 +37,7 @@ class Inference_Config():
|
|
37 |
self.locale_language = None if locale_language.lower() == "auto" else locale_language
|
38 |
if self.enable_auth:
|
39 |
self.users = config.get("user", {})
|
|
|
40 |
|
41 |
global inference_config
|
42 |
inference_config = Inference_Config()
|
|
|
37 |
self.locale_language = None if locale_language.lower() == "auto" else locale_language
|
38 |
if self.enable_auth:
|
39 |
self.users = config.get("user", {})
|
40 |
+
self.synthesizer = config.get("synthesizer", "gsv_fast")
|
41 |
|
42 |
global inference_config
|
43 |
inference_config = Inference_Config()
|