nekoaoxiang committed on
Commit
d290960
1 Parent(s): 94080f3

Add api support

Browse files
Synthesizers/base/Base_TTS_Synthesizer.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+
3
+ from .Base_TTS_Task import Base_TTS_Task as TTS_Task
4
+ import json
5
+ from typing import List, Dict, Literal, Optional, Any, Union, Generator, Tuple
6
+ from pydantic import BaseModel, Field, model_validator
7
+ import numpy as np
8
+ from abc import ABC, abstractmethod
9
+ from typing import Dict, List, Union, Generator, Tuple
10
+ from typing_extensions import Literal
11
+ import numpy as np
12
+ import wave,io
13
+
14
class Base_TTS_Synthesizer(ABC):
    """
    Abstract base class that every concrete TTS synthesizer backend implements.

    Attributes:
        ui_config (Dict[str, List]): UI configuration settings for the frontend.
        debug_mode (bool): Enables extra logging/debug output in implementations.
    """

    ui_config: Dict[str, List] = {}
    debug_mode: bool = False

    def __init__(self, **kwargs):
        """
        Store the common synthesizer options.

        Args:
            ui_config (Dict[str, List], optional): UI configuration settings.
            debug_mode (bool, optional): Enables or disables debug mode.
        """
        option_defaults = {"ui_config": {}, "debug_mode": False}
        for option, fallback in option_defaults.items():
            setattr(self, option, kwargs.get(option, fallback))

    @abstractmethod
    def generate(
        self,
        task: TTS_Task,
        return_type: Literal["filepath", "numpy"] = "numpy",
        save_path: Optional[str] = None,
    ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
        """
        Synthesize speech for the given TTS task.

        Args:
            task (TTS_Task): Data and parameters for speech synthesis.
            return_type (Literal["filepath", "numpy"], optional): Whether to
                return a saved-file path or in-memory audio data.
            save_path (str, optional): Where to write the audio file.

        Returns:
            Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
                A file path, a generator of (sample_rate, audio) tuples, or an
                implementation-defined value, depending on return_type.
        """
        ...

    @abstractmethod
    def get_characters(self):
        """
        List the available characters and their emotions.

        Returns:
            Dict[str, List[str]]: Character name mapped to its emotion names.
        """
        ...

    @abstractmethod
    def params_parser(self, data):
        """
        Convert raw request data into a TTS_Task.

        Args:
            data (Any): The raw input data to be parsed.

        Returns:
            TTS_Task: The task built from the input data.
        """
        ...

    @abstractmethod
    def ms_like_parser(self, data):
        """
        Convert Microsoft-TTS-style request data into a TTS_Task.

        Args:
            data (Any): The raw input data to be parsed.

        Returns:
            TTS_Task: The task built from the Microsoft-like input data.
        """
        ...
94
+
95
+
96
def get_wave_header_chunk(sample_rate: int, channels: int = 1, sample_width: int = 2):
    """
    Generate a RIFF/WAVE header describing an empty (zero-frame) stream.

    Useful for HTTP streaming: send this header first, then raw PCM chunks.

    Args:
        sample_rate (int): The sample rate of the audio.
        channels (int, optional): Number of audio channels. Defaults to 1.
        sample_width (int, optional): Sample width in bytes. Defaults to 2.

    Returns:
        bytes: The wave header as bytes.
    """
    buffer = io.BytesIO()
    writer = wave.open(buffer, "wb")
    try:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(sample_rate)
    finally:
        # Closing flushes the header (with a zero frame count) into the buffer.
        writer.close()
    buffer.seek(0)
    return buffer.read()
Synthesizers/base/Base_TTS_Task.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json, sys
2
+
3
+ from uuid import uuid4
4
+ from typing import Literal
5
+ import urllib.parse
6
+ import hashlib
7
+
8
+ from pydantic import BaseModel, Field, model_validator
9
+ from typing import Literal, List, Optional, Dict, Any, Union
10
+ from uuid import uuid4
11
+ import hashlib
12
+
13
def convert_value_type(value: Any, type_: str):
    """
    Coerce *value* to the type named by *type_* ("int", "float", "bool", else str).

    Strings are URL-unquoted first. For floats, a trailing "%" is interpreted
    as a percentage (e.g. "85%" -> 0.85). None always passes through as None.

    Args:
        value (Any): The raw value to convert.
        type_ (str): Target type name as used in params_config.

    Returns:
        The converted value, or None when value is None.
    """
    if value is None:
        return None
    if isinstance(value, str):
        value = urllib.parse.unquote(value)
    if type(value).__name__ == type_:
        # Value already has the requested type; return it unchanged.
        return value
    if type_ == "int":
        return int(value)
    elif type_ == "float":
        # Percentage strings like "85%". Fixed: endswith() instead of
        # value[-1], which raised IndexError on an empty string.
        if isinstance(value, str) and value.endswith("%"):
            return float(value[:-1]) / 100
        else:
            return float(value)
    elif type_ == "bool":
        if isinstance(value, bool):
            return value
        return str(value).lower() in ("true", "1", "t", "y", "yes", "allow", "allowed")
    else:  # default: string
        return str(value)
34
+
35
+
36
class ParamItem(BaseModel):
    """
    Represents a parameter item for a TTS task.

    Attributes:
        type (str): The data type of the parameter ("int", "float", "bool", "str").
        component_type (Optional[str]): UI component hint for the frontend.
        default (Any): The default value of the parameter (coerced to `type`).
        alias (List[str]): The list of aliases for the parameter.
        label (Optional[str]): The label for the parameter.
        name (Optional[str]): The name of the parameter.
        description (Optional[str]): The description of the parameter.
        min_value (Optional[float]): The minimum value of the parameter.
        max_value (Optional[float]): The maximum value of the parameter.
        step (Optional[float]): The step value for the parameter.
        choices (Optional[List[str]]): The list of choices for the parameter.
    """
    type: str
    component_type: Optional[str] = None
    default: Any
    alias: List[str]
    label: Optional[str] = None
    # NOTE(review): declared without a default, so pydantic treats `name` as a
    # required field; callers are expected to supply it (init_params_config does).
    name: Optional[str]
    description: Optional[str] = None
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    step: Optional[float] = None
    choices: Optional[List[str]] = None

    def __init__(self, **data):
        # Fall back to "str" when no explicit (or an empty) type is given.
        if not data.get("type"):
            data.update({"type": "str"})
        super().__init__(**data)
        # Normalize the declared default to the declared type.
        self.default = convert_value_type(self.default, self.type)
69
+
70
+
71
def init_params_config(res: dict):
    """
    Build a {key: ParamItem} mapping from a raw params-config dict.

    Each entry's "name" is forced to its key, and "label" falls back to the
    key when absent. Note: the per-entry input dicts are updated in place.
    """
    parsed = {}
    for name, raw in res.items():
        if raw.get("label") is None:
            raw.update({"label": name})
        raw.update({"name": name})
        parsed[name] = ParamItem(**raw)
    return parsed
80
+
81
+
82
class Base_TTS_Task(BaseModel):
    """
    Base class for TTS (Text-to-Speech) tasks.

    Attributes:
        uuid (str): Unique identifier for the task (regenerated on copy).
        params_config (Dict[str, ParamItem]): Parameter schema for the task.

        task_type (Literal["text", "ssml", "audio"]): Kind of payload carried.
        audio_path (Optional[str]): Path to the audio file.
        src (Optional[str]): Source url (for "audio" tasks).
        ssml (Optional[str]): SSML (Speech Synthesis Markup Language) text.

        text (Optional[str]): Plain text content.

        character (Optional[str]): Voice character name.
        emotion (Optional[str]): Emotion name for the character.

        format (Optional[str]): Audio format.
        sample_rate (Optional[int]): Sample rate of the audio.
        loudness (Optional[float]): Loudness of the audio.
        speed (Optional[float]): Speed of the audio.
        stream (Optional[bool]): Whether the audio should be streamed.

        save_temp (Optional[bool]): Whether temporary files should be kept.

        disabled_features (Optional[List[str]]): List of disabled features.
    """

    uuid: str = None
    params_config: Dict[str, ParamItem]

    task_type: Literal["text", "ssml", "audio"] = Field(default="text")
    audio_path: Optional[str] = None
    src: Optional[str] = None
    ssml: Optional[str] = None

    text: Optional[str] = None

    character: Optional[str] = None
    emotion: Optional[str] = None

    format: Optional[str] = None
    sample_rate: Optional[int] = None
    loudness: Optional[float] = None
    speed: Optional[float] = None
    stream: Optional[bool] = None

    save_temp: Optional[bool] = False

    disabled_features: Optional[List[str]] = None

    class Config:
        populate_by_name = True
        extra = "ignore"

    def __init__(self, other_task: Union[BaseModel, dict, None] = None, **data):
        """
        Build a task, optionally copying another task (model instance or dict).

        Args:
            other_task: Another task to copy fields from; a dict is treated as
                raw field data (must include "params_config").
            **data: Field values, applied through set_values/alias resolution.
        """
        if isinstance(other_task, BaseModel):
            # Copy every field from the given task instance.
            data = other_task.model_dump()
            super().__init__(**data)
        else:
            # A dict replaces **data wholesale.
            if isinstance(other_task, dict):
                data = other_task
            assert "params_config" in data, "params_config is not defined."
            super().__init__(params_config=data.get("params_config"))
            self.set_default_values()
            self.set_values(**data)
        # Every task gets a fresh identity, even when copied.
        self.uuid = str(uuid4())

    def update_value(self, key: str, value: Any, allow_none: bool = False):
        """Set the attribute whose params_config alias list contains *key*."""
        if not allow_none and value is None:
            return

        assert self.params_config is not None, "params_config is not defined."
        for param_key, param_value in self.params_config.items():
            if key in param_value.alias:
                if hasattr(self, param_key):
                    value = convert_value_type(value, param_value.type)
                    setattr(self, param_key, value)
                # else: params_config may declare parameters this task class
                # does not model; such keys are silently ignored.

    def set_values(self, **data):
        """
        Apply keyword values, coercing each to its attribute's current type.

        Keys that are not direct attributes are resolved through the
        params_config alias lists via update_value().
        """
        assert self.params_config is not None, "params_config is not defined."
        for key, value in data.items():
            if hasattr(self, key):
                value = convert_value_type(value, type(getattr(self, key)).__name__)
                setattr(self, key, value)
            else:
                self.update_value(key, value)

    def set_default_values(self):
        """Fill attributes that are still None with params_config defaults."""
        assert self.params_config is not None, "params_config is not defined."
        for key, value in self.params_config.items():
            if (
                hasattr(self, key)
                and getattr(self, key) is None
                and value.default is not None
            ):
                setattr(self, key, value.default)

    @property
    def md5(self):
        """MD5 hex digest of the task payload (text / ssml / src by task_type)."""
        m = hashlib.md5()
        if self.task_type == "text":
            m.update(self.text.encode())
        elif self.task_type == "ssml":
            m.update(self.ssml.encode())
        elif self.task_type == "audio":
            m.update(self.src.encode())
        return m.hexdigest()

    def __str__(self):
        dict_content: dict = self.model_dump(exclude={"params_config"})
        # Collect and drop all None-valued keys so only set fields are shown.
        keys_to_remove = [key for key, value in dict_content.items() if value is None]
        for key in keys_to_remove:
            dict_content.pop(key)
        return json.dumps(dict_content, indent=4, ensure_ascii=False)

    def copy(self, update: Optional[Dict[str, Any]] = None, deep: bool = False):
        """
        Return a copy of this task with a fresh uuid; *update* overrides fields.

        Fixed: the original used a mutable dict as the default for *update* and
        mutated it in place, leaking state across calls (and into the caller's
        dict). We now copy before inserting the new uuid.
        """
        update = {} if update is None else dict(update)
        update["uuid"] = str(uuid4())
        return super().model_copy(update=update, deep=deep)
Synthesizers/base/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .Base_TTS_Task import Base_TTS_Task, ParamItem, init_params_config
2
+ from .Base_TTS_Synthesizer import Base_TTS_Synthesizer, get_wave_header_chunk
3
+ from .config_utils import load_config
Synthesizers/base/config_utils.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional, Dict, List, Literal
2
+ from pydantic import BaseModel
3
+ import os, json
4
+
5
class ConfigItem(BaseModel):
    # A single leaf entry of a config file: an explicit "value" with an
    # optional "default" fallback, plus optional type/description metadata.
    value : Optional[Any] = None
    default : Optional[Any] = None
    type : Optional[str] = None
    description : Optional[str] = None

    def __init__(self, **data):
        """Build the item; when no explicit value is given, use the default."""
        super().__init__(**data)
        if (self.value is None) and self.default is not None:
            self.value = self.default
15
+
16
def is_config_item(item:Dict[str, Any])->bool:
    """Return True when *item* is a leaf config entry: a dict carrying a "value" or "default" key."""
    if not isinstance(item, dict):
        return False
    return "value" in item or "default" in item
19
+
20
def parse_config_dict(input_config:Dict[str, Any], output_config)->Dict[str, Any]:
    """
    Flatten a raw config tree into plain values.

    Leaf entries (dicts with "value"/"default") are resolved through
    ConfigItem, which applies the value/default fallback; nested dicts are
    flattened recursively; anything else is copied through unchanged.
    Writes into *output_config* and returns it.
    """
    for key, node in input_config.items():
        if is_config_item(node):
            resolved = ConfigItem(**node).value
        elif isinstance(node, dict):
            resolved = parse_config_dict(node, {})
        else:
            resolved = node
        output_config[key] = resolved
    return output_config
32
+
33
def load_config(config_path:str)->Dict[str, Any]:
    """
    Load and flatten a JSON configuration file.

    Args:
        config_path (str): Path to the JSON config file.

    Returns:
        Dict[str, Any]: The parsed config with leaf items resolved to values.

    Raises:
        FileNotFoundError: When the file does not exist. (Fixed: the original
        used `assert`, which is stripped under `python -O`.)
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"配置文件不存在: {config_path}")
    config:Dict[str, Any] = {}
    with open(config_path, 'r', encoding='utf-8') as f:
        config = parse_config_dict(json.load(f), {})
    return config
+
Synthesizers/gsv_fast/GSV_Synthesizer.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, wave
2
+ import os, json, sys
3
+ import threading
4
+ from typing import Any, Union, Generator, Literal, List, Dict, Tuple
5
+ from Synthesizers.base import Base_TTS_Synthesizer, load_config
6
+
7
+ from .gsv_task import GSV_TTS_Task as TTS_Task
8
+ from .ssml_dealer import SSML_Dealer
9
+
10
+ from time import time as tt
11
+ import numpy as np
12
+ import hashlib
13
+ import soundfile as sf
14
+
15
+ from .gsv_config import load_infer_config, auto_generate_infer_config, get_device_info
16
+ from datetime import datetime
17
+
18
# Maps UI display labels (Chinese names) and raw codes to the internal
# language codes understood by the TTS pipeline.
dict_language = {
    "中文": "all_zh",        # treat all text as Chinese
    "英文": "en",            # treat all text as English (unchanged)
    "日文": "all_ja",        # treat all text as Japanese
    "中英混合": "zh",        # mixed Chinese/English (unchanged)
    "日英混合": "ja",        # mixed Japanese/English (unchanged)
    "多语种混合": "auto",    # multilingual: auto-detect language per segment
    "auto": "auto",
    "zh": "zh",
    "en": "en",
    "ja": "ja",
    "all_zh": "all_zh",
    "all_ja": "all_ja",
}
32
+
33
+ from Adapters.gsv_fast.TTS_infer_pack.TTS import TTS, TTS_Config
34
class GSV_Synthesizer(Base_TTS_Synthesizer):
    """
    GPT-SoVITS ("gsv_fast") implementation of Base_TTS_Synthesizer.

    Loads per-character voice models from `models_path`, keeps one shared
    TTS pipeline instance, and serializes inference behind a lock.
    """
    # --- configuration (overridable via gsv_config.json and **kwargs) ---
    device: str = "auto"
    is_half: bool = False
    models_path:str = "models/gsv"
    cnhubert_base_path:str = "models/pretrained_models/gsv/chinese-hubert-base"
    bert_base_path:str = "models/pretrained_models/gsv/chinese-roberta-wwm-ext-large"
    save_prompt_cache:bool = True
    prompt_cache_dir:str = "cache/prompt_cache"
    default_character:str = None

    # --- runtime state ---
    ui_config:dict = None
    # NOTE(review): attribute keeps the original "pipline" spelling; renaming
    # would break any external code that references it.
    tts_pipline:TTS = None
    character:str = None
    lock:threading.Lock = None

    def __init__(self, config_path:str=None, **kwargs):
        """
        Build the synthesizer.

        Args:
            config_path (str, optional): Path to the JSON config; defaults to
                "gsv_config.json". File keys matching declared attributes
                override the class defaults, and **kwargs override the file.
        """
        super().__init__()

        if config_path is None:
            config_path = "gsv_config.json"
        config_dict = load_config(config_path)
        config_dict.update(kwargs)
        # Only keys corresponding to declared attributes are applied.
        for key, value in config_dict.items():
            if hasattr(self, key):
                setattr(self, key, value)
        if self.debug_mode:
            print(f"GSV_Synthesizer config: {config_dict}")

        self.device, self.is_half = get_device_info(self.device, self.is_half)
        tts_config = TTS_Config("")
        tts_config.device , tts_config.is_half = self.device, self.is_half
        tts_config.cnhubert_base_path = self.cnhubert_base_path
        tts_config.bert_base_path = self.bert_base_path
        self.tts_pipline = TTS(tts_config)

        # Fall back to the first discovered character when none is configured.
        if self.default_character is None:
            self.default_character = next(iter(self.get_characters()), None)

        self.lock = threading.Lock()
        self.load_character(self.default_character)
        ui_config_path = os.path.join("Synthesizers/gsv_fast/configs", "ui_config.json")
        with open(ui_config_path, 'r', encoding='utf-8') as f:
            self.ui_config = json.load(f)

    # from https://github.com/RVC-Boss/GPT-SoVITS/pull/448
    def get_streaming_tts_wav(self, params):
        """Yield a WAV header first, then raw PCM chunks, for streamed playback."""
        # from https://huggingface.co/spaces/coqui/voice-chat-with-mistral/blob/main/app.py
        def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
            # Build a minimal WAV header (optionally with initial frames).
            wav_buf = io.BytesIO()
            with wave.open(wav_buf, "wb") as vfout:
                vfout.setnchannels(channels)
                vfout.setsampwidth(sample_width)
                vfout.setframerate(sample_rate)
                vfout.writeframes(frame_input)

            wav_buf.seek(0)
            return wav_buf.read()
        chunks = self.tts_pipline.run(params)
        yield wave_header_chunk()
        # Each chunk is tuple[int, np.ndarray]: (sample_rate, audio data).
        for chunk in chunks:
            sample_rate, audio_data = chunk
            if audio_data is not None:
                yield audio_data.tobytes()

    def get_characters(self) -> dict:
        """
        Scan `models_path` and return {character_name: [emotion, ...]}.

        Characters without a readable infer_config.json get ["default"].
        """
        characters_and_emotions = {}

        # Iterate over every folder under the models path.
        for character_subdir in os.listdir(self.models_path):
            subdir_path = os.path.join(self.models_path, character_subdir)
            config_path = os.path.join(subdir_path, "infer_config.json")
            if not os.path.isdir(subdir_path):
                continue
            # Only a folder with a config file can declare custom emotions.
            if os.path.exists(config_path):
                try:
                    # Read the config and extract its emotion list.
                    with open(config_path, "r", encoding='utf-8') as f:
                        config = json.load(f)
                        emotion_dict_list = config.get('emotion_list', None)
                        if emotion_dict_list is None:
                            emotion_list = ["default"]
                        else:
                            emotion_list = list(emotion_dict_list.keys())
                except json.JSONDecodeError:
                    # Unreadable/unparseable config: use the default emotion.
                    emotion_list = ["default"]
            else:
                # Not a folder, or no config file: use the default emotion.
                emotion_list = ["default"]

            characters_and_emotions[character_subdir] = emotion_list
        return characters_and_emotions

    def load_character_id(self, speaker_id):
        """Load a character by its index in get_characters() ordering."""
        character = list(self.get_characters())[speaker_id]
        return self.load_character(character)

    def load_character(self, character):
        """
        Switch the pipeline to `character`, loading its GPT/SoVITS weights.

        No-op when the character is already loaded; falls back to the default
        character when `character` is empty and nothing is loaded yet.
        """
        if character in ["", None]:
            if self.character not in ["", None]:
                return
            else:
                character = self.default_character
                print(f"{character}为空,尝试切换到默认角色{self.default_character}")
                return self.load_character(character)
        # Case-insensitive match against the currently loaded character.
        if str(character).lower() == str(self.character).lower():
            return
        character_path=os.path.join(self.models_path, character)
        if not os.path.exists(character_path):
            print(f"找不到角色文件夹: {character},沿用之前的角色{self.character}")
            return
            # raise Exception(f"Can't find character folder: {character}")
        assert os.path.exists(character_path), f"找不到角色文件夹: {character}"
        try:
            # Load the inference config for this character.
            config = load_infer_config(character_path)

            # Resolve the GPT weights path from the config.
            gpt_path = os.path.join(character_path,config.get("gpt_path"))
            # Resolve the SoVITS weights path from the config.
            sovits_path = os.path.join(character_path,config.get("sovits_path"))
        except:
            try:
                # Try to auto-generate an inference config, then retry loading.
                auto_generate_infer_config(character_path)
                self.load_character(character)
                return
            except:
                # Give up: the folder lacks usable model files.
                raise Exception("找不到模型文件!请把有效模型放置在模型文件夹下,确保其中至少有pth、ckpt和wav三种文件。")

        self.character = character

        t0 = tt()
        self.tts_pipline.init_t2s_weights(gpt_path)
        self.tts_pipline.init_vits_weights(sovits_path)
        t1 = tt()
        print(f"加载角色成功: {character}, 耗时: {t1-t0:.2f}s")

    def generate_from_text(self, task: TTS_Task):
        """Synthesize a plain-text task, resolving the reference audio from the character's emotion config when the task provides none."""
        self.load_character(task.character)
        task.character = self.character
        # Fill in reference info when the task has no usable reference audio.
        if task.ref_audio_path is None or not os.path.exists(task.ref_audio_path):
            task.ref_audio_path, task.prompt_text, task.prompt_language = self.get_ref_infos(self.character, task.emotion)

        return self.get_wav_from_text_api(
            text=task.text,
            text_language=task.text_language,
            ref_audio_path=task.ref_audio_path,
            prompt_text=task.prompt_text,
            prompt_language=task.prompt_language,
            batch_size=task.batch_size,
            speed=task.speed,
            top_k=task.top_k,
            top_p=task.top_p,
            temperature=task.temperature,
            cut_method=task.cut_method,
            max_cut_length=task.max_cut_length,
            seed=task.seed,
            parallel_infer=task.parallel_infer,
            repetition_penalty=task.repetition_penalty,
            stream=task.stream
        )

    def generate_from_ssml(self, task: TTS_Task):
        """Synthesize an SSML task by delegating to SSML_Dealer."""
        dealer = SSML_Dealer()
        return dealer.generate_from_ssml(task.ssml, self)

    def generate(
        self,
        task: TTS_Task,
        return_type: Literal["filepath", "numpy"] = "numpy",
        save_path: str = None,
    ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
        """
        Run a TTS task ("text" or "ssml") and return audio.

        Args:
            task (TTS_Task): The task to synthesize.
            return_type: "numpy" returns the pipeline generator; "filepath"
                writes the generated audio to disk and returns the path.
            save_path (str, optional): Target file; auto-named under
                tmp_audio/ when None.
        """
        if self.debug_mode:
            print(f"task: {task}")
        gen = None
        if task.task_type == "text":
            gen = self.generate_from_text(task)
        elif task.task_type == "ssml":
            gen = self.generate_from_ssml(task)

        if return_type == "numpy":
            return gen
        elif return_type == "filepath":
            if save_path is None:
                save_path = f"tmp_audio/{datetime.now().strftime('%Y%m%d%H%M%S')}.{task.format}"
            # NOTE(review): only the first (sample_rate, audio) item is written;
            # presumably a non-streaming run yields the full clip in one item — confirm.
            sr, audio_data = next(gen)
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            sf.write(save_path, audio_data, sr)
            return save_path
    @staticmethod
    def calc_short_md5(string):
        """Return the first 8 hex chars of MD5(string); used for cache keys."""
        m = hashlib.md5()
        m.update(string.encode())
        return m.hexdigest()[:8]
    def get_ref_infos(self, character, emotion) -> Tuple[str, str, str]:
        """
        Look up (ref_audio_path, prompt_text, prompt_language) for a
        character/emotion pair. Unknown emotions fall back to the first entry;
        returns (None, None, None) when the config has no emotion list.
        """
        if self.debug_mode:
            print(f"try to get ref infos, character: {character}, emotion: {emotion}")
        character_path = os.path.join(self.models_path, character)
        config: Dict[str, Any] = load_infer_config(character_path)
        emotion_dict: Dict = config.get("emotion_list", None)
        if emotion_dict is None:
            return None, None, None
        emotion_name_list = list(emotion_dict.keys())
        if emotion not in emotion_name_list:
            emotion = emotion_name_list[0]
        for emotion_name, details in emotion_dict.items():
            if emotion_name == emotion:
                relative_path = details['ref_wav_path']
                ref_audio_path = os.path.join(os.path.join(self.models_path,self.character), relative_path)
                prompt_text = details['prompt_text']
                prompt_language = details['prompt_language']

                return ref_audio_path, prompt_text, prompt_language
        return None, None, None

    def get_wav_from_text_api(
        self,
        text: str,
        text_language="auto",
        ref_audio_path=None,
        prompt_text=None,
        prompt_language="auto",
        batch_size=1,
        speed=1.0,
        top_k=12,
        top_p=0.6,
        temperature=0.6,
        cut_method="auto_cut",
        max_cut_length=100,
        seed=-1,
        stream=False,
        parallel_infer=True,
        repetition_penalty=1.35,
        **kwargs
    ):
        """
        Normalize text/language options and run the underlying TTS pipeline.

        Returns the pipeline generator (non-streaming) or a WAV-bytes
        generator from get_streaming_tts_wav when `stream` is True.
        """
        # Normalize line breaks and punctuation for better sentence splitting.
        text = text.replace("\r", "\n").replace("<br>", "\n").replace("\t", " ")
        text = text.replace("……","。").replace("…","。").replace("\n\n","\n").replace("。\n","\n").replace("\n", "。\n")

        assert os.path.exists(ref_audio_path), f"找不到参考音频文件: {ref_audio_path}"
        prompt_cache_path = ""

        # Prompt features are cached, keyed on (audio path, text, language).
        if self.save_prompt_cache:
            prompt_cache_path = f"{self.prompt_cache_dir}/prompt_cache_{self.calc_short_md5(ref_audio_path + prompt_text + prompt_language)}.pickle"

        try:
            # Map display labels / aliases to internal language codes.
            text_language = dict_language[text_language]
            prompt_language = dict_language[prompt_language]
            if "-" in text_language:
                text_language = text_language.split("-")[0]
            if "-" in prompt_language:
                prompt_language = prompt_language.split("-")[0]
        except:
            # Unknown labels fall back to automatic detection.
            text_language = "auto"
            prompt_language = "auto"
        ref_free = False

        if cut_method == "auto_cut":
            cut_method = f"auto_cut_{max_cut_length}"

        params = {
            "text": text,
            "text_lang": text_language.lower(),
            "prompt_cache_path": prompt_cache_path,
            "ref_audio_path": ref_audio_path,
            "prompt_text": prompt_text,
            "prompt_lang": prompt_language.lower(),
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "text_split_method": cut_method,
            "batch_size": batch_size,
            "speed_factor": speed,
            "ref_text_free": ref_free,
            "split_bucket":True,
            "return_fragment":stream,
            "seed": seed,
            "parallel_infer": parallel_infer,
            "repetition_penalty": repetition_penalty
        }
        # Hand the request to the underlying pipeline (get_tts_wav equivalent);
        # the lock serializes access to the shared model weights.
        with self.lock:
            if stream == False:
                return self.tts_pipline.run(params)
            else:
                return self.get_streaming_tts_wav(params)

    @staticmethod
    def params_parser(data) -> TTS_Task:
        """Build a TTS task directly from a request dict."""
        task = TTS_Task(**data)
        return task

    @staticmethod
    def ms_like_parser(data) -> TTS_Task:
        """Build a TTS task from a Microsoft-TTS-like payload: {"inputs": [{"text": ...}]}."""
        inputs = data.get("inputs", [])
        try:
            data["text"] = inputs[0]["text"]
        except:
            pass
        task = TTS_Task(**data)
        return task
Synthesizers/gsv_fast/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .GSV_Synthesizer import GSV_Synthesizer as TTS_Synthesizer
2
+ from .gsv_task import GSV_TTS_Task as TTS_Task
Synthesizers/gsv_fast/configs/i18n/locale/en_US.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ ", 返回内容:": ", Return Content:",
3
+ "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>This is the model management interface. It allows you to assign emotions to multiple reference audio segments. If you only have one segment, you can skip using this interface.</p><p>If you have questions or need further information, please refer to the documentation: <a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">Click to view detailed documentation</a>.</p>",
4
+ "Endpoint": "Endpoint",
5
+ "GPT模型路径": "GPT Model Path",
6
+ "Sovits模型路径": "SoVITS Model Path",
7
+ "Temperature": "Temperature",
8
+ "Top K": "Top K",
9
+ "Top P": "Top P",
10
+ "all_ja": "Japanese Only",
11
+ "all_zh": "Chinese Only",
12
+ "auto": "Auto Detect",
13
+ "auto_cut": "Smart Split",
14
+ "batch_size,1代表不并行,越大越快,但是越可能出问题": "Batch Size: 1 means no parallel processing. Larger values are faster but more prone to issues.",
15
+ "cut0": "Split by Line Break Only",
16
+ "cut1": "Group Four Sentences Together",
17
+ "cut2": "Group 50 Characters Together",
18
+ "cut3": "Split by Chinese Period",
19
+ "cut4": "Split by English Period",
20
+ "cut5": "Split by Punctuation",
21
+ "en": "English",
22
+ "https://space.bilibili.com/66633770": "https://github.com/X-T-E-R",
23
+ "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
24
+ "ja": "Japanese",
25
+ "json设置(一般不动)": "JSON Settings (Do not change it unless you know what you are doing)",
26
+ "zh": "Chinese",
27
+ "不切": "Do Not Split",
28
+ "人物情感列表网址": "Character Emotion List URL",
29
+ "从json中读取": "Read from JSON",
30
+ "使用前,请确认后端服务已启动。": "Before using, please ensure the backend service is running.",
31
+ "保存json\n(可能不会有完成提示,没报错就是成功)": "Save JSON\n(There may not be a completion notice; no error means success)",
32
+ "保存失败!": "Save Failed!",
33
+ "保存成功!": "Save Successful!",
34
+ "停止播放": "Stop Playback",
35
+ "切句方式": "Sentence Splitting Method",
36
+ "前端处理后的文本(每句):": "Front-end Processed Text (Per Sentence):",
37
+ "参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range. Please replace it!",
38
+ "参考音频路径": "Reference Audio Path",
39
+ "发送json格式": "Send in JSON",
40
+ "发送并开始播放": "Send and Start Playback",
41
+ "发送请求": "Send Request",
42
+ "发送请求到": "Send Request to",
43
+ "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, or adjusting the batch size.",
44
+ "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, changing the reference audio (using the model management interface), or adjusting the batch size.",
45
+ "基础选项": "Basic Options",
46
+ "实际输入的参考文本:": "Actual Reference Text Input:",
47
+ "实际输入的目标文本(切句后):": "Actual Target Text Input (After Splitting):",
48
+ "实际输入的目标文本(每句):": "Actual Target Text Input (Per Sentence):",
49
+ "实际输入的目标文本:": "Actual Target Text Input:",
50
+ "密码": "Password",
51
+ "当前人物": "Current Character",
52
+ "当前人物变更为: ": "Current Character Changed to: ",
53
+ "您在使用经典推理模式,部分选项不可用": "You are using Classic Inference Mode. Some options are unavailable.",
54
+ "情感列表": "Emotion",
55
+ "情感风格": "Emotion",
56
+ "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "Five little monkeys jumping on the bed, one fell off and bumped his head. Mama called the doctor, and the doctor said, \"No more monkeys jumping on the bed!\"",
57
+ "扫描": "Scan",
58
+ "扫描人物列表": "Scan Character List",
59
+ "扫描模型文件夹:": "Scan Model Folder:",
60
+ "找不到模型文件!请把有效文件放置在文件夹下!!!": "Model file not found! Please place valid files in the folder!!!",
61
+ "提供的推理特化包,当前版本:": ", Current Version: ",
62
+ "提示": "Tip",
63
+ "提示文本": "Prompt Text",
64
+ "提示语言": "Prompt Language",
65
+ "文件打开失败,保存失败!": "File Opening Failed, Save Failed!",
66
+ "文本语言": "Text Language",
67
+ "是否自动匹配情感": "Automatically Match Emotions",
68
+ "模型文件夹路径": "Model Folder Path",
69
+ "每句允许最大切分字词数": "Max Words per Split Sentence",
70
+ "流式音频": "Streaming Audio",
71
+ "添加情感": "Add Emotion",
72
+ "点击查看详细文档": "Click to View Detailed Documentation",
73
+ "版本": "Version",
74
+ "用户名": "Username",
75
+ "种子": "Seed",
76
+ "简介": "Introduction",
77
+ "缺失某些项,保存失败!": "Missing Some Items, Save Failed!",
78
+ "网址设置": "URL Settings",
79
+ "自动生成info": "Auto Generate Info",
80
+ "若有疑问或需要进一步了解,可参考文档:": "If you have questions or need further information, please refer to the documentation: ",
81
+ "认证信息": "Authentication Info",
82
+ "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "Authentication is enabled. You can disable it in config.json.\nHowever, this feature is not fully implemented yet and is just for show.",
83
+ "语速": "Speed",
84
+ "请修改后点击下方按钮进行保存": "Please modify and click the button below to save",
85
+ "请求失败,状态码:": "Request Failed, Status Code:",
86
+ "请求失败,请检查URL是否正确": "Request Failed. Please check if the URL is correct.",
87
+ "请求完整音频": "Request Complete Audio",
88
+ "请求网址": "Request URL",
89
+ "输入文本": "Input Text",
90
+ "这是一个由": "This is a Inference Specialization Package provided by ",
91
+ "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "This is a configuration file for https://github.com/X-T-E-R/TTS-for-GPT-soVITS, a simple and easy-to-use frontend and backend project",
92
+ "这是展示页面的版本,并未使用后端服务,下面参数无效。": "This is a demonstration page version and does not utilize backend services, the parameters below are invalid.",
93
+ "选择角色": "Select Character",
94
+ "音频输出": "Audio Output",
95
+ "音频预览": "Audio Preview",
96
+ "项目开源地址:": "Github Link: ",
97
+ "高级选项": "Advanced Options",
98
+ "最大允许长度": "Max Length Allowed"
99
+ }
Synthesizers/gsv_fast/configs/i18n/locale/zh_CN.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ ", 返回内容:": ", 返回内容:",
3
+ "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>",
4
+ "Endpoint": "Endpoint",
5
+ "GPT模型路径": "GPT模型路径",
6
+ "Sovits模型路径": "Sovits模型路径",
7
+ "Temperature": "Temperature",
8
+ "Top K": "Top K",
9
+ "Top P": "Top P",
10
+ "all_ja": "只有日文",
11
+ "all_zh": "只有中文",
12
+ "auto": "自动判断",
13
+ "auto_cut": "智能切分",
14
+ "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题",
15
+ "cut0": "仅凭换行切分",
16
+ "cut1": "凑四句一切",
17
+ "cut2": "凑50字一切",
18
+ "cut3": "按中文句号。切",
19
+ "cut4": "按英文句号.切",
20
+ "cut5": "按标点符号切",
21
+ "en": "英文",
22
+ "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
23
+ "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
24
+ "ja": "日文",
25
+ "json设置(一般不动)": "json设置(一般不动)",
26
+ "zh": "中文",
27
+ "不切": "不切",
28
+ "人物情感列表网址": "人物情感列表网址",
29
+ "从json中读取": "从json中读取",
30
+ "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。",
31
+ "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)",
32
+ "保存失败!": "保存失败!",
33
+ "保存成功!": "保存成功!",
34
+ "停止播放": "停止播放",
35
+ "切句方式": "切句方式",
36
+ "前端处理后的文本(每句):": "前端处理后的文本(每句):",
37
+ "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!",
38
+ "参考音频路径": "参考音频路径",
39
+ "发送json格式": "发送json格式",
40
+ "发送并开始播放": "发送并开始播放",
41
+ "发送请求": "发送请求",
42
+ "发送请求到": "发送请求到",
43
+ "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。",
44
+ "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。",
45
+ "基础选项": "基础选项",
46
+ "实际输入的参考文本:": "实际输入的参考文本:",
47
+ "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):",
48
+ "实际输入的目标文本(每句):": "实际输入的目标文本(每句):",
49
+ "实际输入的目标文本:": "实际输入的目标文本:",
50
+ "密码": "密码",
51
+ "当前人物": "当前人物",
52
+ "当前人物变更为: ": "当前人物变更为: ",
53
+ "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用",
54
+ "情感列表": "情感列表",
55
+ "情感风格": "情感风格",
56
+ "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。",
57
+ "扫描": "扫描",
58
+ "扫描人物列表": "扫描人物列表",
59
+ "扫描模型文件夹:": "扫描模型文件夹:",
60
+ "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!",
61
+ "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:",
62
+ "提示": "提示",
63
+ "提示文本": "提示文本",
64
+ "提示语言": "提示语言",
65
+ "文件打开失败,保存失败!": "文件打开失败,保存失败!",
66
+ "文本语言": "文本语言",
67
+ "是否自动匹配情感": "是否自动匹配情感",
68
+ "模型文件夹路径": "模型文件夹路径",
69
+ "每句允许最大切分字词数": "每句允许最大切分字词数",
70
+ "流式音频": "流式音频",
71
+ "添加情感": "添加情感",
72
+ "点击查看详细文档": "点击查看详细文档",
73
+ "版本": "版本",
74
+ "用户名": "用户名",
75
+ "种子": "种子",
76
+ "简介": "简介",
77
+ "缺失某些项,保存失败!": "缺失某些项,保存失败!",
78
+ "网址设置": "网址设置",
79
+ "自动生成info": "自动生成info",
80
+ "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:",
81
+ "认证信息": "认证信息",
82
+ "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设",
83
+ "语速": "语速",
84
+ "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存",
85
+ "请求失败,状态码:": "请求失败,状态码:",
86
+ "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确",
87
+ "请求完整音频": "请求完整音频",
88
+ "请求网址": "请求网址",
89
+ "输入文本": "输入文本",
90
+ "这是一个由": "这是一个由",
91
+ "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目",
92
+ "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。",
93
+ "选择角色": "选择角色",
94
+ "音频输出": "音频输出",
95
+ "音频预览": "音频预览",
96
+ "项目开源地址:": "项目开源地址:",
97
+ "高级选项": "高级选项",
98
+ "最大允许长度": "最大允许长度"
99
+ }
Synthesizers/gsv_fast/configs/i18n/locale/zh_TW.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ ", 返回内容:": ", 返回內容:",
3
+ "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面</p><p>若有疑問或需要進一步了解,可參考文件:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">點擊查看詳細文件</a>。</p>",
4
+ "Endpoint": "Endpoint",
5
+ "GPT模型路径": "GPT模型路徑",
6
+ "Sovits模型路径": "Sovits模型路徑",
7
+ "Temperature": "Temperature",
8
+ "Top K": "Top K",
9
+ "Top P": "Top P",
10
+ "all_ja": "僅日文",
11
+ "all_zh": "僅中文",
12
+ "auto": "自動判斷",
13
+ "auto_cut": "智慧切分",
14
+ "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題",
15
+ "cut0": "僅憑換行切分",
16
+ "cut1": "湊四句一切",
17
+ "cut2": "湊50字一切",
18
+ "cut3": "按中文句號。切",
19
+ "cut4": "按英文句號.切",
20
+ "cut5": "按標點符號切",
21
+ "en": "英文",
22
+ "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
23
+ "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
24
+ "ja": "日文",
25
+ "json设置(一般不动)": "json設置(一般不動)",
26
+ "zh": "中文",
27
+ "不切": "不切",
28
+ "人物情感列表网址": "人物情緒列表網址",
29
+ "从json中读取": "從json中讀取",
30
+ "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。",
31
+ "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)",
32
+ "保存失败!": "保存失敗!",
33
+ "保存成功!": "保存成功!",
34
+ "停止播放": "停止播放",
35
+ "切句方式": "切句方式",
36
+ "前端处理后的文本(每句):": "前端處理後的文本(每句):",
37
+ "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
38
+ "参考音频路径": "參考音頻路徑",
39
+ "发送json格式": "發送json格式",
40
+ "发送并开始播放": "發送並開始播放",
41
+ "发送请求": "發送請求",
42
+ "发送请求到": "發送請求到",
43
+ "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。",
44
+ "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。",
45
+ "基础选项": "基礎選項",
46
+ "实际输入的参考文本:": "實際輸入的參考文本:",
47
+ "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
48
+ "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
49
+ "实际输入的目标文本:": "實際輸入的目標文本:",
50
+ "密码": "密碼",
51
+ "当前人物": "當前人物",
52
+ "当前人物变更为: ": "當前人物變更為: ",
53
+ "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用",
54
+ "情感列表": "情緒列表",
55
+ "情感风格": "情緒風格",
56
+ "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。",
57
+ "扫描": "掃描",
58
+ "扫描人物列表": "掃描人物列表",
59
+ "扫描模型文件夹:": "掃描模型文件夾:",
60
+ "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!",
61
+ "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:",
62
+ "提示": "提示",
63
+ "提示文本": "提示文本",
64
+ "提示语言": "提示語言",
65
+ "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!",
66
+ "文本语言": "文本語言",
67
+ "是否自动匹配情感": "是否自動匹配情緒",
68
+ "模型文件夹路径": "模型文件夾路徑",
69
+ "每句允许最大切分字词数": "每句允許最大切分字詞數",
70
+ "流式音频": "流式音頻",
71
+ "添加情感": "添加情緒",
72
+ "点击查看详细文档": "點擊查看詳細文件",
73
+ "版本": "版本",
74
+ "用户名": "使用者名稱",
75
+ "种子": "種子",
76
+ "简介": "簡介",
77
+ "缺失某些项,保存失败!": "缺失某些項,保存失敗!",
78
+ "网址设置": "網址設置",
79
+ "自动生成info": "自動生成info",
80
+ "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:",
81
+ "认证信息": "認證信息",
82
+ "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設",
83
+ "语速": "語速",
84
+ "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存",
85
+ "请求失败,状态码:": "請求失敗,狀態碼:",
86
+ "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確",
87
+ "请求完整音频": "請求完整音頻",
88
+ "请求网址": "請求網址",
89
+ "输入文本": "輸入文本",
90
+ "这是一个由": "這是一個由",
91
+ "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目",
92
+ "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。",
93
+ "选择角色": "選擇角色",
94
+ "音频输出": "音頻輸出",
95
+ "音频预览": "音頻預覽",
96
+ "项目开源地址:": "Github Link:",
97
+ "高级选项": "高級選項",
98
+ "最大允许长度": "最大允許長度"
99
+ }
Synthesizers/gsv_fast/configs/params_config.json ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_type":{
3
+ "type": "str",
4
+ "description": "Task type for the API.",
5
+ "alias": ["task_type", "task", "type", "textType"],
6
+ "default": "tts"
7
+ },
8
+ "text": {
9
+ "type": "str",
10
+ "label": "文本",
11
+ "description": "The text to be synthesized.",
12
+ "alias": ["text", "txt", "tex", "t"],
13
+ "default": ""
14
+ },
15
+ "ssml": {
16
+ "type": "str",
17
+ "label": "SSML文本",
18
+ "description": "The SSML text to be synthesized.",
19
+ "alias": ["ssml", "text", "txt", "tex", "t"],
20
+ "default": null
21
+ },
22
+ "text_language": {
23
+ "type": "str",
24
+ "label": "文本语言",
25
+ "description": "Language of the text.",
26
+ "alias": ["text_language", "lang", "language", "lan", "text_lang", "xml:lang"],
27
+ "choices": ["auto", "zh", "en", "ja", "all_zh", "all_ja"],
28
+ "default": "auto"
29
+ },
30
+ "character": {
31
+ "type": "str",
32
+ "label": "角色模型",
33
+ "description": "Character name for the model.",
34
+ "alias": ["cha_name", "character", "model_name", "cha", "spk" , "speaker", "name", "role"],
35
+ "default": ""
36
+ },
37
+ "emotion": {
38
+ "type": "str",
39
+ "label": "情感风格",
40
+ "description": "Emotion of the character.",
41
+ "alias": ["character_emotion", "emotion", "style"],
42
+ "default": "default"
43
+ },
44
+ "ref_audio_path":{
45
+ "type": "str",
46
+ "component_type":"audio",
47
+ "label": "参考音频路径, 启用后将忽视emotion参数",
48
+ "description": "Reference audio path for the model.",
49
+ "alias": ["ref_audio_path", "ref_audio", "ref_path"],
50
+ "default": null
51
+ },
52
+ "prompt_text": {
53
+ "type": "str",
54
+ "label": "参考音频文本",
55
+ "description": "Reference audio text for the model.",
56
+ "alias": ["prompt_text", "ref_text"],
57
+ "default": null
58
+ },
59
+ "prompt_language": {
60
+ "type": "str",
61
+ "label": "参考音频语言",
62
+ "description": "Reference audio language for the model.",
63
+ "alias": ["prompt_language", "ref_lang"],
64
+ "choices": ["auto", "zh", "en", "ja", "all_zh", "all_ja"],
65
+ "default": "auto"
66
+ },
67
+ "speaker_id": {
68
+ "type": "int",
69
+ "label": "角色ID",
70
+ "description": "Speaker ID for the model.",
71
+ "alias": ["speaker_id", "id"],
72
+ "default": null
73
+ },
74
+ "batch_size": {
75
+ "type": "int",
76
+ "label": "批处理大小",
77
+ "description": "Batch size for processing.",
78
+ "alias": ["batch_size", "batch"],
79
+ "default": 10,
80
+ "min_value": 1,
81
+ "max_value": 100,
82
+ "step": 1
83
+ },
84
+ "speed": {
85
+ "type": "float",
86
+ "label": "语速",
87
+ "description": "Speed factor for synthesis.",
88
+ "alias": ["speed", "speed_factor", "spd", "rate"],
89
+ "default": 1.0,
90
+ "min_value": 0.5,
91
+ "max_value": 2.0,
92
+ "step": 0.05
93
+ },
94
+ "top_k": {
95
+ "type": "int",
96
+ "label": "采样Top K",
97
+ "description": "Top K parameter for sampling.",
98
+ "alias": ["top_k", "topk"],
99
+ "default": 5,
100
+ "min_value": 1,
101
+ "max_value": 40,
102
+ "step": 1
103
+ },
104
+ "top_p": {
105
+ "type": "float",
106
+ "label": "采样Top P",
107
+ "description": "Top P parameter for sampling.",
108
+ "alias": ["top_p", "topp"],
109
+ "default": 0.8,
110
+ "min_value": 0.1,
111
+ "max_value": 2.0,
112
+ "step": 0.01
113
+ },
114
+ "temperature": {
115
+ "type": "float",
116
+ "label": "采样温度",
117
+ "description": "Temperature for sampling.",
118
+ "alias": ["temperature"],
119
+ "default": 0.8,
120
+ "min_value": 0.1,
121
+ "max_value": 2.0,
122
+ "step": 0.01
123
+ },
124
+ "seed": {
125
+ "type": "int",
126
+ "label": "随机种子",
127
+ "description": "Seed for randomness.",
128
+ "alias": ["seed"],
129
+ "default": -1
130
+ },
131
+ "stream": {
132
+ "type": "bool",
133
+ "label": "流式输出",
134
+ "description": "Stream the audio or not.",
135
+ "alias": ["stream", "streaming"],
136
+ "default": false
137
+ },
138
+ "save_temp": {
139
+ "type": "bool",
140
+ "label": "保存临时输出",
141
+ "description": "Save the output temporarily.",
142
+ "alias": ["save_temp", "save"],
143
+ "default": false
144
+ },
145
+ "cut_method": {
146
+ "type": "str",
147
+ "label": "文本切割方法",
148
+ "description": "Method for text cutting.",
149
+ "alias": ["cut_method", "cut"],
150
+ "choices": ["auto_cut", "cut0", "cut1", "cut2", "cut3", "cut4", "cut5"],
151
+ "default": "auto_cut"
152
+ },
153
+ "max_cut_length": {
154
+ "type": "int",
155
+ "label": "文本切割最大长度",
156
+ "description": "Maximum length of the text cut.",
157
+ "alias": ["max_cut_length", "max_cut"],
158
+ "default": 50,
159
+ "min_value": 5,
160
+ "max_value": 1000,
161
+ "step": 1
162
+ },
163
+ "parallel_infer": {
164
+ "type": "bool",
165
+ "label": "并行推理",
166
+ "description": "Parallel inference or not.",
167
+ "alias": ["parallel_infer", "parallel"],
168
+ "default": true
169
+ },
170
+ "repetition_penalty": {
171
+ "type": "float",
172
+ "label": "重复惩罚",
173
+ "description": "Repetition penalty for sampling.",
174
+ "alias": ["repetition_penalty", "rep_penalty"],
175
+ "default": 1.35,
176
+ "min_value": 0,
177
+ "max_value": 5,
178
+ "step": 0.01
179
+ },
180
+ "format": {
181
+ "type": "str",
182
+ "label": "输出格式",
183
+ "description": "Format of the output audio.",
184
+ "alias": ["format"],
185
+ "default": "wav"
186
+ },
187
+ "loudness": {
188
+ "type": "float",
189
+ "label": "音量",
190
+ "description": "Loudness of the audio. Now is unsupported.",
191
+ "alias": ["loudness", "volume", "vol"],
192
+ "default": null
193
+ },
194
+ "pitch": {
195
+ "type": "float",
196
+ "label": "音调",
197
+ "description": "Pitch of the audio. Now is unsupported.",
198
+ "alias": ["pitch"],
199
+ "default": null
200
+ }
201
+ }
Synthesizers/gsv_fast/configs/ui_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "ref_settings": [["ref_audio_path", "prompt_text", "prompt_language"]],
3
+ "basic_settings": [
4
+ "speed",
5
+
6
+ ["text_language", "cut_method", "max_cut_length", "batch_size"]
7
+ ],
8
+ "advanced_settings": [
9
+ "seed",
10
+ "parallel_infer",
11
+ ["top_k", "top_p", "temperature", "repetition_penalty"]
12
+ ]
13
+ }
Synthesizers/gsv_fast/gsv_config.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __version__ = "2.4.3 240414"
2
+
3
+ import os, json
4
+ import torch
5
+
6
+ import logging
7
+
8
+ from pydantic import BaseModel, Field
9
+ logging.getLogger("markdown_it").setLevel(logging.ERROR)
10
+ logging.getLogger("urllib3").setLevel(logging.ERROR)
11
+ logging.getLogger("httpcore").setLevel(logging.ERROR)
12
+ logging.getLogger("httpx").setLevel(logging.ERROR)
13
+ logging.getLogger("asyncio").setLevel(logging.ERROR)
14
+ logging.getLogger("charset_normalizer").setLevel(logging.ERROR)
15
+ logging.getLogger("torchaudio._extension").setLevel(logging.ERROR)
16
+ def test_fp16_computation():
17
+ # 检查CUDA是否可用
18
+ if not torch.cuda.is_available():
19
+ return False, "CUDA is not available. Please check your installation."
20
+
21
+ try:
22
+ # 创建一个简单的半精度张量计算任务
23
+ # 例如,执行一个半精度的矩阵乘法
24
+ a = torch.randn(3, 3, dtype=torch.float16).cuda() # 将张量a转换为半精度并移动到GPU
25
+ b = torch.randn(3, 3, dtype=torch.float16).cuda() # 将张量b转换为半精度并移动到GPU
26
+ c = torch.matmul(a, b) # 执行半精度的矩阵乘法
27
+ # 如果没有发生错误,我们认为GPU支持半精度运算
28
+ return True, "Your GPU supports FP16 computation."
29
+ except Exception as e:
30
+ # 如果执行过程中发生异常,我们认为GPU不支持半精度运算
31
+ return False, f"Your GPU does not support FP16 computation. Error: {e}"
32
+
33
+
34
+ def get_device_info(device_config="auto", is_half_config="auto")-> tuple[str, bool]:
35
+ global device, is_half
36
+ try:
37
+ return device, is_half
38
+ except:
39
+ if torch.cuda.is_available():
40
+ device = "cuda"
41
+ is_half = True
42
+ else:
43
+ device = "cpu"
44
+ is_half = False
45
+
46
+ if device_config != "auto":
47
+ device = device_config
48
+ is_half = (device == "cpu")
49
+ if is_half_config != "auto":
50
+ is_half = str(is_half_config).lower() == "true"
51
+
52
+ supports_fp16, message = test_fp16_computation()
53
+ if not supports_fp16 and is_half:
54
+ is_half = False
55
+ print(message)
56
+
57
+ return device, is_half
58
+
59
+
60
+
61
+
62
+ def load_infer_config(character_path):
63
+ config_path = os.path.join(character_path, "infer_config.json")
64
+ """加载环境配置文件"""
65
+ with open(config_path, 'r', encoding='utf-8') as f:
66
+ config = json.load(f)
67
+ return config
68
+
69
+ def auto_generate_infer_config(character_path):
70
+ ## TODO: Auto-generate wav-list and prompt-list from character_path
71
+ ##
72
+ # Initialize variables for file detection
73
+
74
+ print(f"正在自动生成配置文件: {character_path}")
75
+ ckpt_file_found = None
76
+ pth_file_found = None
77
+ wav_file_found = None
78
+
79
+ # Iterate through files in character_path to find matching file types
80
+ for dirpath, dirnames, filenames in os.walk(character_path):
81
+ for file in filenames:
82
+ # 构建文件的完整路径
83
+ full_path = os.path.join(dirpath, file)
84
+ # 从full_path中移除character_path部分
85
+ relative_path = remove_character_path(full_path,character_path)
86
+ # 根据文件扩展名和变量是否已赋值来更新变量
87
+ if file.lower().endswith(".ckpt") and ckpt_file_found is None:
88
+ ckpt_file_found = relative_path
89
+ elif file.lower().endswith(".pth") and pth_file_found is None:
90
+ pth_file_found = relative_path
91
+ elif file.lower().endswith(".wav") and wav_file_found is None:
92
+ wav_file_found = relative_path
93
+ elif file.lower().endswith(".mp3"):
94
+ import pydub
95
+ # Convert mp3 to wav
96
+ wav_file_path = os.path.join(dirpath,os.path.splitext(file)[0] + ".wav")
97
+
98
+
99
+ pydub.AudioSegment.from_mp3(full_path).export(wav_file_path, format="wav")
100
+ if wav_file_found is None:
101
+ wav_file_found = remove_character_path(os.path.join(dirpath,os.path.splitext(file)[0] + ".wav"),character_path)
102
+
103
+
104
+ # Initialize infer_config with gpt_path and sovits_path regardless of wav_file_found
105
+ infer_config = {
106
+ "gpt_path": ckpt_file_found,
107
+ "sovits_path": pth_file_found,
108
+ "software_version": "1.1",
109
+ r"简介": r"这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目"
110
+ }
111
+
112
+ # If wav file is also found, update infer_config to include ref_audio_path, prompt_text, and prompt_language
113
+ if wav_file_found:
114
+ wav_file_name = os.path.splitext(os.path.basename(wav_file_found))[0] # Extract the filename without extension
115
+ infer_config["emotion_list"] = {
116
+ "default": {
117
+ "ref_audio_path": wav_file_found,
118
+ "prompt_text": wav_file_name,
119
+ "prompt_language": "多语种混合"
120
+ }
121
+ }
122
+ else:
123
+ raise Exception("找不到wav参考文件!请把��效wav文件放置在模型文件夹下。")
124
+ pass
125
+ # Check if the essential model files were found
126
+ if ckpt_file_found and pth_file_found:
127
+ infer_config_path = os.path.join(character_path, "infer_config.json")
128
+ try:
129
+ with open(infer_config_path , 'w', encoding='utf-8') as f:
130
+ json.dump(infer_config, f, ensure_ascii=False, indent=4)
131
+ except IOError as e:
132
+ print(f"无法写入文件: {infer_config_path}. 错误: {e}")
133
+
134
+ return infer_config_path
135
+ else:
136
+ return "Required model files (.ckpt or .pth) not found in character_path directory."
137
+
138
+
139
+ def remove_character_path(full_path,character_path):
140
+ # 从full_path中移除character_path部分
141
+ return os.path.relpath(full_path, character_path)
Synthesizers/gsv_fast/gsv_task.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os, json, sys
3
+ sys.path.append(".")
4
+
5
+ from uuid import uuid4
6
+ from typing import List, Dict, Literal, Optional, Any, Union
7
+ import urllib.parse
8
+ import hashlib
9
+
10
+ from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config
11
+
12
+ def get_params_config():
13
+ try:
14
+ with open(os.path.join("Synthesizers/gsv_fast/configs", "params_config.json"), "r", encoding="utf-8") as f:
15
+ return init_params_config(json.load(f))
16
+ except:
17
+ raise FileNotFoundError("params_config.json not found or invalid.")
18
+
19
+
20
+ params_config = get_params_config()
21
+
22
+ from pydantic import BaseModel, Field, model_validator
23
+
24
+ class GSV_TTS_Task(Base_TTS_Task):
25
+ # character: Optional[str] = None
26
+ # emotion: Optional[str] = None
27
+ ref_audio_path: Optional[str] = None
28
+ prompt_text: Optional[str] = None
29
+ prompt_language: Optional[str] = None
30
+ text_language: Optional[str] = None
31
+ speaker_id: Optional[int] = None
32
+ batch_size: Optional[int] = None
33
+ top_k: Optional[int] = None
34
+ top_p: Optional[float] = None
35
+ temperature: Optional[float] = None
36
+ cut_method: Optional[str] = None
37
+ max_cut_length: Optional[int] = None
38
+ seed: Optional[int] = None
39
+ save_temp: Optional[bool] = False
40
+ parallel_infer : Optional[bool] = True
41
+ repetition_penalty : Optional[float] = 1.35
42
+ # the gsv_fast model only supports 32000 sample rate
43
+ sample_rate: int = 32000
44
+
45
+ def __init__(self, other_task: Union[BaseModel, dict, None] = None, **data):
46
+ data.setdefault('params_config', params_config)
47
+ super().__init__(other_task, **data)
48
+
49
+ @property
50
+ def md5(self):
51
+ m = hashlib.md5()
52
+ if self.task_type == "audio":
53
+ m.update(self.src.encode())
54
+ elif self.task_type == "ssml":
55
+ m.update(self.ssml.encode())
56
+ elif self.task_type == "text":
57
+ m.update(self.text.encode())
58
+ m.update(self.text_language.encode())
59
+ m.update(self.character.encode())
60
+ m.update(str(self.speaker_id).encode())
61
+ m.update(str(self.speed).encode())
62
+ m.update(str(self.top_k).encode())
63
+ m.update(str(self.top_p).encode())
64
+ m.update(str(self.temperature).encode())
65
+ m.update(str(self.cut_method).encode())
66
+ m.update(str(self.emotion).encode())
67
+ return m.hexdigest()
68
+
69
+
70
+
Synthesizers/gsv_fast/ssml_dealer.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, json
2
+ from typing import List , Dict
3
+ from uuid import uuid4
4
+
5
+ import sys
6
+ sys.path.append(".")
7
+
8
+ import xml.etree.ElementTree as ET
9
+ from .gsv_task import GSV_TTS_Task as TTS_Task
10
+ from Synthesizers.base import Base_TTS_Synthesizer, ParamItem, init_params_config
11
+
12
+ import tempfile
13
+ import soundfile as sf
14
+
15
+ import numpy as np
16
+ import requests, librosa
17
+
18
+
19
+ special_dict_speed = {
20
+ "x-slow": 0.5,
21
+ "slow": 0.75,
22
+ "medium": 1.0,
23
+ "fast": 1.25,
24
+ "x-fast": 1.5,
25
+ "default": 1.0
26
+ }
27
+
28
+
29
+ special_dict_break_strength = {
30
+ "x-weak": 0.25,
31
+ "weak": 0.5,
32
+ "medium": 0.75,
33
+ "strong": 1.0,
34
+ "x-strong": 1.25,
35
+ "default": 0.75
36
+ }
37
+
38
+
39
+ def load_time(time:str) -> float:
40
+ if time.endswith("ms"):
41
+ return float(time[:-2]) / 1000
42
+ if time.endswith("s"):
43
+ return float(time[:-1])
44
+ if time.endswith("min"):
45
+ return float(time[:-3]) * 60
46
+ return float(time)
47
+
48
+ def get_value_from_special_dict(key:str, special_dict:Dict[str, float]) -> float:
49
+ if key in special_dict:
50
+ return special_dict[key]
51
+ return key
52
+
53
+ class SSML_Dealer:
54
+ def __init__(self,params_config:Dict[str, ParamItem]):
55
+ self.ssml: str = ""
56
+ self.task_list: Dict[str, TTS_Task] = {}
57
+ self.task_queue : List[str] = []
58
+ self.audio_download_queue : List[str] = []
59
+ self.root : ET.Element = None
60
+ self.tts_synthesizer = None
61
+ self.params_config = TTS_Task().params_config
62
+
63
+ def get_value_from_root(self, root:ET.Element, key:str, special_dict:Dict[str, float]=None):
64
+ if key in self.params_config:
65
+ for alias in self.params_config[key].alias:
66
+ if root.get(alias) is not None:
67
+ if special_dict is not None:
68
+ return get_value_from_special_dict(root.get(alias), special_dict)
69
+ else:
70
+ return root.get(alias)
71
+
72
+
73
+
74
+ def analyze_element(self, root: ET.Element, father_task:TTS_Task):
75
+ task = TTS_Task(father_task)
76
+ self.task_list[task.uuid] = task
77
+ root.set("uuid", task.uuid)
78
+ root.tag = root.tag.split('}')[-1].lower()
79
+ task.text = root.text.strip() if root.text is not None else ""
80
+ print(f"--------{root.tag} : {task.text}") # debug
81
+ if root.tag in ["audio", "mstts:backgroundaudio"]:
82
+ if root.get("src") is not None:
83
+ self.audio_download_queue.append({"uuid": task.uuid, "src": root.get("src")})
84
+ task.text = ""
85
+ else:
86
+ if root.tag in ["bookmark", "break", "mstts:silence", "mstts:viseme"]:
87
+ task.text = ""
88
+
89
+
90
+ task.update_value('text_language', self.get_value_from_root(root, 'text_language'))
91
+ task.update_value('character', self.get_value_from_root(root, 'character'))
92
+ task.update_value('emotion', self.get_value_from_root(root, 'emotion'))
93
+ task.update_value('speed', self.get_value_from_root(root, 'speed', special_dict_speed))
94
+
95
+ # task.update_value('top_k', root)
96
+ # task.update_value('top_p', root)
97
+ # task.update_value('temperature', root)
98
+ # task.update_value('batch_size', root)
99
+
100
+ # task.update_value('loudness', root) # need to recheck
101
+ # task.update_value('pitch', root)
102
+
103
+
104
+ task.stream = False
105
+ if task.text.strip() != "":
106
+ self.task_queue.append(task.uuid)
107
+ if root.tail is not None:
108
+ new_task = TTS_Task(father_task)
109
+ self.task_list[new_task.uuid] = new_task
110
+ new_task.text = root.tail.strip()
111
+ if new_task.text != "":
112
+ self.task_queue.append(new_task.uuid)
113
+ root.set("tail_uuid", new_task.uuid)
114
+ for child in root:
115
+ self.analyze_element(child, father_task)
116
+
117
+
118
+
119
+ def generate_audio_from_element(self, root: ET.Element, default_silence: float = 0.3) -> np.ndarray:
120
+ # 认定所有的音频文件都已经生成
121
+ audio_data = np.array([])
122
+ uuid = root.get("uuid")
123
+ task = self.task_list[uuid]
124
+ sr = 32000
125
+ # print(f"--------{root.tag}") # debug
126
+ if root.tag in ["break"]:
127
+ # print(f"--------break: {root.get('time')}") # debug
128
+ time_ = root.get("time")
129
+ duration = 0.75
130
+ if time_ is not None:
131
+ duration = load_time(time_)
132
+ strength_ = root.get("strength")
133
+ if strength_ in special_dict_break_strength:
134
+ duration = special_dict_break_strength[strength_]
135
+ audio_data = np.zeros(int(duration * sr))
136
+ elif task.audio_path not in ["", None]:
137
+ audio_data, sr = sf.read(task.audio_path)
138
+
139
+ for child in root:
140
+ audio_data = np.concatenate([audio_data, self.generate_audio_from_element(child)])
141
+
142
+ if default_silence > 0:
143
+ audio_data = np.concatenate([audio_data, np.zeros(int(default_silence * sr))])
144
+
145
+ if root.get("tail_uuid") is not None:
146
+ audio_path = self.task_list[root.get("tail_uuid")].audio_path
147
+ if audio_path not in ["", None]:
148
+ audio_data_tail, sr = sf.read(audio_path)
149
+ audio_data = np.concatenate([audio_data, audio_data_tail])
150
+
151
+ return audio_data
152
+
153
+ def read_ssml(self, ssml:str):
154
+ self.ssml = ssml
155
+ try:
156
+ self.root = ET.fromstring(ssml)
157
+ self.analyze_element(self.root, None)
158
+ except Exception as e:
159
+ raise ValueError("Invalid SSML.")
160
+
161
+ def generate_tasks(self, tts_synthesizer, tmp_dir:str):
162
+ # 先按照人物排序
163
+ self.task_queue.sort(key=lambda x: self.task_list[x].character)
164
+ for uuid in self.task_queue:
165
+ task = self.task_list[uuid]
166
+ if task.text.strip() == "":
167
+ continue
168
+ gen = tts_synthesizer.generate_from_text(task)
169
+ sr, audio_data = next(gen)
170
+
171
+ tmp_file = os.path.join(tmp_dir, f"{task.uuid}.wav")
172
+
173
+ sf.write(tmp_file, audio_data, sr, format='wav')
174
+ task.audio_path = tmp_file
175
+
176
+ def download_audio(self, tmp_dir:str, sample_rate:int=32000):
177
+ for audio in self.audio_download_queue:
178
+ # 另开一个线程下载音频
179
+ response = requests.get(audio["src"])
180
+ # 重采样
181
+ audio_format = audio["src"].split(".")[-1]
182
+ tmp_file = os.path.join(tmp_dir, f"{uuid4()}.{audio_format}")
183
+ with open(tmp_file, 'wb') as f:
184
+ f.write(response.content)
185
+ audio_data, sr = librosa.load(tmp_file, sr=sample_rate)
186
+ sf.write(tmp_file, audio_data, sr, format='wav')
187
+ self.task_list[audio["uuid"]].audio_path = tmp_file
188
+
189
+ def generate_from_ssml(self, ssml:str, tts_synthesizer, format:str="wav"):
190
+ self.read_ssml(ssml)
191
+ tmp_dir = tempfile.mkdtemp()
192
+ self.generate_tasks(tts_synthesizer, tmp_dir)
193
+ self.download_audio(tmp_dir)
194
+ audio_data = self.generate_audio_from_element(self.root)
195
+ with tempfile.NamedTemporaryFile(delete=False, suffix=f".{format}") as tmp_file:
196
+ sf.write(tmp_file, audio_data, 32000, format=format)
197
+ return tmp_file.name
198
+
199
+ if __name__ == "__main__":
200
+ ssml = """
201
+ <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
202
+ <audio src="https://d38nvwmjovqyq6.cloudfront.net/va90web25003/companions/Foundations%20of%20Rock/5.04.mp3" >
203
+ </audio>
204
+ <voice name="en-US-AvaNeural">
205
+ Welcome <break /> to text to speech.
206
+ Welcome <break strength="medium" /> to text to speech.
207
+ Welcome <break time="750ms" /> to text to speech.
208
+ </voice>
209
+ </speak>
210
+ """
211
+ # ssml_dealer = SSML_Dealer()
212
+ # # tts_synthesizer = TTS_synthesizer()
213
+ # print(ssml_dealer.generate_from_ssml(ssml, tts_synthesizer))
214
+
215
+ # for task in ssml_dealer.task_list.values():
216
+ # print(task)
Synthesizers/remote/Remote_Synthesizer.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io, wave
2
+ import os, json, sys
3
+ import threading
4
+
5
+ from Synthesizers.base import Base_TTS_Synthesizer ,load_config
6
+
7
+ from .remote_task import Remote_TTS_Task as TTS_Task, set_based_synthesizer, get_ui_config
8
+ import requests
9
+ from urllib import parse
10
+ from datetime import datetime
11
+ from typing import Union, Generator, Tuple, Any, Optional, Dict, Literal
12
+ import numpy as np
13
+ import soundfile as sf
14
+
15
+ class Remote_Synthesizer(Base_TTS_Synthesizer):
16
+ url :str = "http://127.0.0.1:5000"
17
+ tts_endpoint:str = "/tts"
18
+ character_endpoint:str = "/character_list"
19
+ based_synthesizer :str = "gsv_fast"
20
+ class Config:
21
+ extra = "ignore"
22
+ def __init__(self, config_path:str = None, **kwargs):
23
+ super().__init__(**kwargs)
24
+ if config_path is None:
25
+ config_path = os.path.join(os.path.dirname(__file__), "configs", "config.json")
26
+ config_dict = load_config(config_path)
27
+ config_dict.update(kwargs)
28
+ for key, value in config_dict.items():
29
+ if hasattr(self, key):
30
+ setattr(self, key, value)
31
+ set_based_synthesizer(self.based_synthesizer)
32
+ self.ui_config = get_ui_config(self.based_synthesizer)
33
+
34
+ def get_characters(self)-> dict:
35
+ url = self.url + self.character_endpoint
36
+ res = requests.get(url)
37
+ return json.loads(res.text)
38
+
39
+ @staticmethod
40
+ def stream_audio(url, data: Dict[str, Any]) -> Generator[Tuple[int, np.ndarray], None, None]:
41
+ headers = {"Content-Type": "application/json"}
42
+ # 发起POST请求,获取响应流
43
+ response = requests.post(
44
+ url, data=json.dumps(data), headers=headers, stream=True
45
+ )
46
+ chunk_size = 1024
47
+ # 确保请求成功
48
+ if response.status_code == 200:
49
+ # 循环读取音频流
50
+ for chunk in response.iter_content(chunk_size):
51
+ # 将二进制数据转换为numpy数组,这里假设音频数据是16位整数格式
52
+ audiodata = np.frombuffer(chunk, dtype=np.int16)
53
+ yield 32000, audiodata
54
+ else:
55
+ raise Exception(
56
+ f"Failed to get audio stream, status code: {response.status_code}"
57
+ )
58
+ def generate(
59
+ self,
60
+ task: TTS_Task,
61
+ return_type: Literal["filepath", "numpy"] = "numpy",
62
+ save_path: Optional[str] = None,
63
+ ) -> Union[str, Generator[Tuple[int, np.ndarray], None, None], Any]:
64
+
65
+
66
+ url = self.url + self.tts_endpoint
67
+ data = task.data
68
+ print(return_type)
69
+
70
+ if self.debug_mode:
71
+ print(f"generate task: \n{data}")
72
+ headers = {"Content-Type": "application/json"}
73
+ if return_type == "filepath" or (
74
+ return_type == "numpy" and not task.stream
75
+ ):
76
+ if save_path is None:
77
+ save_path = f"tmp_audio/{datetime.now().strftime('%Y%m%d%H%M%S')}.wav"
78
+ res = requests.post(url, data=json.dumps(data), headers=headers)
79
+ if res.status_code == 200:
80
+ with open(save_path, "wb") as f:
81
+ f.write(res.content)
82
+ if return_type == "filepath":
83
+ return save_path
84
+ else:
85
+ audiodata, sr = sf.read(save_path)
86
+ return ((sr, audiodata) for _ in range(1))
87
+ else:
88
+ raise Exception(f"remote synthesizer error: {res.text}")
89
+
90
+ elif return_type == "numpy" and task.stream:
91
+ return self.stream_audio(url, data)
92
+
93
+
94
+ def params_parser(self, data) -> TTS_Task:
95
+ task = TTS_Task(based_synthesizer=self.based_synthesizer, **data)
96
+ return task
97
+
98
+ def ms_like_parser(self,data) -> TTS_Task:
99
+ task = TTS_Task(based_synthesizer=self.based_synthesizer, **data)
100
+ return task
Synthesizers/remote/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .Remote_Synthesizer import Remote_Synthesizer as TTS_Synthesizer
2
+ from .remote_task import Remote_TTS_Task as TTS_Task
Synthesizers/remote/configs/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "url": "http://localhost:5000",
3
+ "tts_endpoint": "/tts",
4
+ "character_endpoint": "/character_list",
5
+ "based_synthesizer": "gsv_fast"
6
+ }
Synthesizers/remote/configs/i18n/locale/en_US.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ ", 返回内容:": ", Return Content:",
3
+ "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>This is the model management interface. It allows you to assign emotions to multiple reference audio segments. If you only have one segment, you can skip using this interface.</p><p>If you have questions or need further information, please refer to the documentation: <a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">Click to view detailed documentation</a>.</p>",
4
+ "Endpoint": "Endpoint",
5
+ "GPT模型路径": "GPT Model Path",
6
+ "Sovits模型路径": "SoVITS Model Path",
7
+ "Temperature": "Temperature",
8
+ "Top K": "Top K",
9
+ "Top P": "Top P",
10
+ "all_ja": "Japanese Only",
11
+ "all_zh": "Chinese Only",
12
+ "auto": "Auto Detect",
13
+ "auto_cut": "Smart Split",
14
+ "batch_size,1代表不并行,越大越快,但是越可能出问题": "Batch Size: 1 means no parallel processing. Larger values are faster but more prone to issues.",
15
+ "cut0": "Split by Line Break Only",
16
+ "cut1": "Group Four Sentences Together",
17
+ "cut2": "Group 50 Characters Together",
18
+ "cut3": "Split by Chinese Period",
19
+ "cut4": "Split by English Period",
20
+ "cut5": "Split by Punctuation",
21
+ "en": "English",
22
+ "https://space.bilibili.com/66633770": "https://github.com/X-T-E-R",
23
+ "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
24
+ "ja": "Japanese",
25
+ "json设置(一般不动)": "JSON Settings (Do not change it unless you know what you are doing)",
26
+ "zh": "Chinese",
27
+ "不切": "Do Not Split",
28
+ "人物情感列表网址": "Character Emotion List URL",
29
+ "从json中读取": "Read from JSON",
30
+ "使用前,请确认后端服务已启动。": "Before using, please ensure the backend service is running.",
31
+ "保存json\n(可能不会有完成提示,没报错就是成功)": "Save JSON\n(There may not be a completion notice; no error means success)",
32
+ "保存失败!": "Save Failed!",
33
+ "保存成功!": "Save Successful!",
34
+ "停止播放": "Stop Playback",
35
+ "切句方式": "Sentence Splitting Method",
36
+ "前端处理后的文本(每句):": "Front-end Processed Text (Per Sentence):",
37
+ "参考音频在3~10秒范围外,请更换!": "Reference audio is outside the 3-10 second range. Please replace it!",
38
+ "参考音频路径": "Reference Audio Path",
39
+ "发送json格式": "Send in JSON",
40
+ "发送并开始播放": "Send and Start Playback",
41
+ "发送请求": "Send Request",
42
+ "发送请求到": "Send Request to",
43
+ "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, or adjusting the batch size.",
44
+ "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "Missing or swallowed words are normal. Severe issues can be resolved by adding new lines or periods, changing the reference audio (using the model management interface), or adjusting the batch size.",
45
+ "基础选项": "Basic Options",
46
+ "实际输入的参考文本:": "Actual Reference Text Input:",
47
+ "实际输入的目标文本(切句后):": "Actual Target Text Input (After Splitting):",
48
+ "实际输入的目标文本(每句):": "Actual Target Text Input (Per Sentence):",
49
+ "实际输入的目标文本:": "Actual Target Text Input:",
50
+ "密码": "Password",
51
+ "当前人物": "Current Character",
52
+ "当前人物变更为: ": "Current Character Changed to: ",
53
+ "您在使用经典推理模式,部分选项不可用": "You are using Classic Inference Mode. Some options are unavailable.",
54
+ "情感列表": "Emotion",
55
+ "情感风格": "Emotion",
56
+ "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "Five little monkeys jumping on the bed, one fell off and bumped his head. Mama called the doctor, and the doctor said, \"No more monkeys jumping on the bed!\"",
57
+ "扫描": "Scan",
58
+ "扫描人物列表": "Scan Character List",
59
+ "扫描模型文件夹:": "Scan Model Folder:",
60
+ "找不到模型文件!请把有效文件放置在文件夹下!!!": "Model file not found! Please place valid files in the folder!!!",
61
+ "提供的推理特化包,当前版本:": ", Current Version: ",
62
+ "提示": "Tip",
63
+ "提示文本": "Prompt Text",
64
+ "提示语言": "Prompt Language",
65
+ "文件打开失败,保存失败!": "File Opening Failed, Save Failed!",
66
+ "文本语言": "Text Language",
67
+ "是否自动匹配情感": "Automatically Match Emotions",
68
+ "模型文件夹路径": "Model Folder Path",
69
+ "每句允许最大切分字词数": "Max Words per Split Sentence",
70
+ "流式音频": "Streaming Audio",
71
+ "添加情感": "Add Emotion",
72
+ "点击查看详细文档": "Click to View Detailed Documentation",
73
+ "版本": "Version",
74
+ "用户名": "Username",
75
+ "种子": "Seed",
76
+ "简介": "Introduction",
77
+ "缺失某些项,保存失败!": "Missing Some Items, Save Failed!",
78
+ "网址设置": "URL Settings",
79
+ "自动生成info": "Auto Generate Info",
80
+ "若有疑问或需要进一步了解,可参考文档:": "If you have questions or need further information, please refer to the documentation: ",
81
+ "认证信息": "Authentication Info",
82
+ "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "Authentication is enabled. You can disable it in config.json.\nHowever, this feature is not fully implemented yet and is just for show.",
83
+ "语速": "Speed",
84
+ "请修改后点击下方按钮进行保存": "Please modify and click the button below to save",
85
+ "请求失败,状态码:": "Request Failed, Status Code:",
86
+ "请求失败,请检查URL是否正确": "Request Failed. Please check if the URL is correct.",
87
+ "请求完整音频": "Request Complete Audio",
88
+ "请求网址": "Request URL",
89
+ "输入文本": "Input Text",
90
+ "这是一个由": "This is a Inference Specialization Package provided by ",
91
+ "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "This is a configuration file for https://github.com/X-T-E-R/TTS-for-GPT-soVITS, a simple and easy-to-use frontend and backend project",
92
+ "这是展示页面的版本,并未使用后端服务,下面参数无效。": "This is a demonstration page version and does not utilize backend services, the parameters below are invalid.",
93
+ "选择角色": "Select Character",
94
+ "音频输出": "Audio Output",
95
+ "音频预览": "Audio Preview",
96
+ "项目开源地址:": "Github Link: ",
97
+ "高级选项": "Advanced Options",
98
+ "最大允许长度": "Max Length Allowed"
99
+ }
Synthesizers/remote/configs/i18n/locale/zh_CN.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ ", 返回内容:": ", 返回内容:",
3
+ "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>",
4
+ "Endpoint": "Endpoint",
5
+ "GPT模型路径": "GPT模型路径",
6
+ "Sovits模型路径": "Sovits模型路径",
7
+ "Temperature": "Temperature",
8
+ "Top K": "Top K",
9
+ "Top P": "Top P",
10
+ "all_ja": "只有日文",
11
+ "all_zh": "只有中文",
12
+ "auto": "自动判断",
13
+ "auto_cut": "智能切分",
14
+ "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不并行,越大越快,但是越可能出问题",
15
+ "cut0": "仅凭换行切分",
16
+ "cut1": "凑四句一切",
17
+ "cut2": "凑50字一切",
18
+ "cut3": "按中文句号。切",
19
+ "cut4": "按英文句号.切",
20
+ "cut5": "按标点符号切",
21
+ "en": "英文",
22
+ "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
23
+ "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
24
+ "ja": "日文",
25
+ "json设置(一般不动)": "json设置(一般不动)",
26
+ "zh": "中文",
27
+ "不切": "不切",
28
+ "人物情感列表网址": "人物情感列表网址",
29
+ "从json中读取": "从json中读取",
30
+ "使用前,请确认后端服务已启动。": "使用前,请确认后端服务已启动。",
31
+ "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不会有完成提示,没报错就是成功)",
32
+ "保存失败!": "保存失败!",
33
+ "保存成功!": "保存成功!",
34
+ "停止播放": "停止播放",
35
+ "切句方式": "切句方式",
36
+ "前端处理后的文本(每句):": "前端处理后的文本(每句):",
37
+ "参考音频在3~10秒范围外,请更换!": "参考音频在3~10秒范围外,请更换!",
38
+ "参考音频路径": "参考音频路径",
39
+ "发送json格式": "发送json格式",
40
+ "发送并开始播放": "发送并开始播放",
41
+ "发送请求": "发送请求",
42
+ "发送请求到": "发送请求到",
43
+ "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。",
44
+ "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。",
45
+ "基础选项": "基础选项",
46
+ "实际输入的参考文本:": "实际输入的参考文本:",
47
+ "实际输入的目标文本(切句后):": "实际输入的目标文本(切句后):",
48
+ "实际输入的目标文本(每句):": "实际输入的目标文本(每句):",
49
+ "实际输入的目标文本:": "实际输入的目标文本:",
50
+ "密码": "密码",
51
+ "当前人物": "当前人物",
52
+ "当前人物变更为: ": "当前人物变更为: ",
53
+ "您在使用经典推理模式,部分选项不可用": "您在使用经典推理模式,部分选项不可用",
54
+ "情感列表": "情感列表",
55
+ "情感风格": "情感风格",
56
+ "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。",
57
+ "扫描": "扫描",
58
+ "扫描人物列表": "扫描人物列表",
59
+ "扫描模型文件夹:": "扫描模型文件夹:",
60
+ "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!请把有效文件放置在文件夹下!!!",
61
+ "提供的推理特化包,当前版本:": "提供的推理特化包,当前版本:",
62
+ "提示": "提示",
63
+ "提示文本": "提示文本",
64
+ "提示语言": "提示语言",
65
+ "文件打开失败,保存失败!": "文件打开失败,保存失败!",
66
+ "文本语言": "文本语言",
67
+ "是否自动匹配情感": "是否自动匹配情感",
68
+ "模型文件夹路径": "模型文件夹路径",
69
+ "每句允许最大切分字词数": "每句允许最大切分字词数",
70
+ "流式音频": "流式音频",
71
+ "添加情感": "添加情感",
72
+ "点击查看详细文档": "点击查看详细文档",
73
+ "版本": "版本",
74
+ "用户名": "用户名",
75
+ "种子": "种子",
76
+ "简介": "简介",
77
+ "缺失某些项,保存失败!": "缺失某些项,保存失败!",
78
+ "网址设置": "网址设置",
79
+ "自动生成info": "自动生成info",
80
+ "若有疑问或需要进一步了解,可参考文档:": "若有疑问或需要进一步了解,可参考文档:",
81
+ "认证信息": "认证信息",
82
+ "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设",
83
+ "语速": "语速",
84
+ "请修改后点击下方按钮进行保存": "请修改后点击下方按钮进行保存",
85
+ "请求失败,状态码:": "请求失败,状态码:",
86
+ "请求失败,请检查URL是否正确": "请求失败,请检查URL是否正确",
87
+ "请求完整音频": "请求完整音频",
88
+ "请求网址": "请求网址",
89
+ "输入文本": "输入文本",
90
+ "这是一个由": "这是一个由",
91
+ "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目",
92
+ "这是展示页面的版本,并未使用后端服务,下面参数无效。": "这是展示页面的版本,并未使用后端服务,下面参数无效。",
93
+ "选择角色": "选择角色",
94
+ "音频输出": "音频输出",
95
+ "音频预览": "音频预览",
96
+ "项目开源地址:": "项目开源地址:",
97
+ "高级选项": "高级选项",
98
+ "最大允许长度": "最大允许长度"
99
+ }
Synthesizers/remote/configs/i18n/locale/zh_TW.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ ", 返回内容:": ", 返回內容:",
3
+ "<p>这是模型管理界面,为了实现对多段参考音频分配情感设计,如果您只有一段可不使用这个界面</p><p>若有疑问或需要进一步了解,可参考文档:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">点击查看详细文档</a>。</p>": "<p>這是模型管理介面,為了實現對多段參考音頻分配情緒設計,如果您只有一段可不使用這個介面</p><p>若有疑問或需要進一步了解,可參考文件:<a href=\"https://www.yuque.com/xter/zibxlp/hme8bw2r28vad3le\">點擊查看詳細文件</a>。</p>",
4
+ "Endpoint": "Endpoint",
5
+ "GPT模型路径": "GPT模型路徑",
6
+ "Sovits模型路径": "Sovits模型路徑",
7
+ "Temperature": "Temperature",
8
+ "Top K": "Top K",
9
+ "Top P": "Top P",
10
+ "all_ja": "僅日文",
11
+ "all_zh": "僅中文",
12
+ "auto": "自動判斷",
13
+ "auto_cut": "智慧切分",
14
+ "batch_size,1代表不并行,越大越快,但是越可能出问题": "batch_size,1代表不並行,越大越快,但是越可能出現問題",
15
+ "cut0": "僅憑換行切分",
16
+ "cut1": "湊四句一切",
17
+ "cut2": "湊50字一切",
18
+ "cut3": "按中文句號。切",
19
+ "cut4": "按英文句號.切",
20
+ "cut5": "按標點符號切",
21
+ "en": "英文",
22
+ "https://space.bilibili.com/66633770": "https://space.bilibili.com/66633770",
23
+ "https://www.yuque.com/xter/zibxlp": "https://www.yuque.com/xter/zibxlp",
24
+ "ja": "日文",
25
+ "json设置(一般不动)": "json設置(一般不動)",
26
+ "zh": "中文",
27
+ "不切": "不切",
28
+ "人物情感列表网址": "人物情緒列表網址",
29
+ "从json中读取": "從json中讀取",
30
+ "使用前,请确认后端服务已启动。": "使用前,請確認後端服務已啟動。",
31
+ "保存json\n(可能不会有完成提示,没报错就是成功)": "保存json\n(可能不會有完成提示,沒報錯就是成功)",
32
+ "保存失败!": "保存失敗!",
33
+ "保存成功!": "保存成功!",
34
+ "停止播放": "停止播放",
35
+ "切句方式": "切句方式",
36
+ "前端处理后的文本(每句):": "前端處理後的文本(每句):",
37
+ "参考音频在3~10秒范围外,请更换!": "參考音頻在3~10秒範圍外,請更換!",
38
+ "参考音频路径": "參考音頻路徑",
39
+ "发送json格式": "發送json格式",
40
+ "发送并开始播放": "發送並開始播放",
41
+ "发送请求": "發送請求",
42
+ "发送请求到": "發送請求到",
43
+ "吞字漏字属于正常现象,太严重可尝试换行、加句号或调节batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或調節batch size滑條。",
44
+ "吞字漏字属于正常现象,太严重可通过换行或加句号解决,或者更换参考音频(使用模型管理界面)、调节下方batch size滑条。": "吞字漏字屬於正常現象,太嚴重可通過換行或加句號解決,或者更換參考音頻(使用模型管理介面)、調節下方batch size滑條。",
45
+ "基础选项": "基礎選項",
46
+ "实际输入的参考文本:": "實際輸入的參考文本:",
47
+ "实际输入的目标文本(切句后):": "實際輸入的目標文本(切句後):",
48
+ "实际输入的目标文本(每句):": "實際輸入的目標文本(每句):",
49
+ "实际输入的目标文本:": "實際輸入的目標文本:",
50
+ "密码": "密碼",
51
+ "当前人物": "當前人物",
52
+ "当前人物变更为: ": "當前人物變更為: ",
53
+ "您在使用经典推理模式,部分选项不可用": "您在使用經典推理模式,部分選項不可用",
54
+ "情感列表": "情緒列表",
55
+ "情感风格": "情緒風格",
56
+ "我是一个粉刷匠,粉刷本领强。我要把那新房子,刷得更漂亮。刷了房顶又刷墙,刷子像飞一样。哎呀我的小鼻子,变呀变了样。": "有時掉進黑洞,有時候爬上彩虹。在下一秒鐘,命運如何轉動,沒有人會曉得。我說希望無窮,你猜美夢成空,相信和懷疑,總要決鬥。",
57
+ "扫描": "掃描",
58
+ "扫描人物列表": "掃描人物列表",
59
+ "扫描模型文件夹:": "掃描模型文件夾:",
60
+ "找不到模型文件!请把有效文件放置在文件夹下!!!": "找不到模型文件!請把有效文件放置在文件夾下!!!",
61
+ "提供的推理特化包,当前版本:": "提供的推理特化包,當前版本:",
62
+ "提示": "提示",
63
+ "提示文本": "提示文本",
64
+ "提示语言": "提示語言",
65
+ "文件打开失败,保存失败!": "文件開啟失敗,保存失敗!",
66
+ "文本语言": "文本語言",
67
+ "是否自动匹配情感": "是否自動匹配情緒",
68
+ "模型文件夹路径": "模型文件夾路徑",
69
+ "每句允许最大切分字词数": "每句允許最大切分字詞數",
70
+ "流式音频": "流式音頻",
71
+ "添加情感": "添加情緒",
72
+ "点击查看详细文档": "點擊查看詳細文件",
73
+ "版本": "版本",
74
+ "用户名": "使用者名稱",
75
+ "种子": "種子",
76
+ "简介": "簡介",
77
+ "缺失某些项,保存失败!": "缺失某些項,保存失敗!",
78
+ "网址设置": "網址設置",
79
+ "自动生成info": "自動生成info",
80
+ "若有疑问或需要进一步了解,可参考文档:": "若有疑問或需要進一步了解,可參考文件:",
81
+ "认证信息": "認證信息",
82
+ "认证信息已启用,您可以在config.json中关闭。\n但是这个功能还没做好,只是摆设": "認證信息已啟用,您可以在config.json中關閉。\n但是這個功能還沒做好,只是擺設",
83
+ "语速": "語速",
84
+ "请修改后点击下方按钮进行保存": "請修改後點擊下方按鈕進行保存",
85
+ "请求失败,状态码:": "請求失敗,狀態碼:",
86
+ "请求失败,请检查URL是否正确": "請求失敗,請檢查URL是否正確",
87
+ "请求完整音频": "請求完整音頻",
88
+ "请求网址": "請求網址",
89
+ "输入文本": "輸入文本",
90
+ "这是一个由": "這是一個由",
91
+ "这是一个配置文件适用于https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一个简单好用的前后端项目": "這是一個配置文件適用於https://github.com/X-T-E-R/TTS-for-GPT-soVITS,是一個簡單好用的前後端項目",
92
+ "这是展示页面的版本,并未使用后端服务,下面参数无效。": "這是展示頁面的版本,並未使用後端服務,下面參數無效。",
93
+ "选择角色": "選擇角色",
94
+ "音频输出": "音頻輸出",
95
+ "音频预览": "音頻預覽",
96
+ "项目开源地址:": "Github Link:",
97
+ "高级选项": "高級選項",
98
+ "最大允许长度": "最大允許長度"
99
+ }
Synthesizers/remote/configs/params_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+
3
+ }
Synthesizers/remote/configs/ui_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+
3
+ }
Synthesizers/remote/remote_task.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os, json, sys
3
+ sys.path.append(".")
4
+
5
+ from uuid import uuid4
6
+ from typing import List, Dict, Literal, Optional, Any, Union
7
+ import urllib.parse
8
+ import hashlib
9
+
10
+ from Synthesizers.base import Base_TTS_Task, ParamItem, init_params_config
11
+
12
+ global global_based_synthesizer
13
+ global_based_synthesizer = None
14
+
15
+ def set_based_synthesizer(based_synthesizer:str):
16
+ global global_based_synthesizer
17
+ global_based_synthesizer = based_synthesizer
18
+
19
+ def get_params_config(based_synthesizer:str= None):
20
+ assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first."
21
+ try:
22
+ with open(os.path.join(os.path.dirname(__file__), "configs", "params_config.json"), "r", encoding="utf-8") as f:
23
+ res:dict = json.load(f)
24
+ with open(os.path.join("Synthesizers", based_synthesizer ,"configs", "params_config.json"), "r", encoding="utf-8") as f:
25
+ res.update(json.load(f))
26
+ return init_params_config(res)
27
+ except:
28
+ raise FileNotFoundError("params_config.json not found or invalid.")
29
+
30
+ params_config = None
31
+
32
+ def get_ui_config(based_synthesizer:str= None)->Dict[str, Any]:
33
+ if based_synthesizer is None:
34
+ based_synthesizer = global_based_synthesizer
35
+ assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first."
36
+
37
+ remote_ui_config_path = os.path.join(os.path.dirname(__file__), "configs", "ui_config.json")
38
+ based_ui_config_path = os.path.join("Synthesizers", based_synthesizer ,"configs", "ui_config.json")
39
+
40
+ ui_config :Dict[str, Any] = {}
41
+ try:
42
+ with open(remote_ui_config_path, "r", encoding="utf-8") as f:
43
+ ui_config.update(json.load(f))
44
+ with open(based_ui_config_path, "r", encoding="utf-8") as f:
45
+ ui_config.update(json.load(f))
46
+ return ui_config
47
+ except:
48
+ raise FileNotFoundError("ui_config.json not found or invalid.")
49
+
50
+ from pydantic import BaseModel, Field, model_validator
51
+ from copy import deepcopy
52
+ class Remote_TTS_Task(Base_TTS_Task):
53
+
54
+ is_remote: Optional[bool] = True
55
+ data : dict = {}
56
+
57
+ class Config:
58
+ extra = "ignore"
59
+
60
+ def __init__(self, based_synthesizer:str=None, **data):
61
+
62
+ global params_config
63
+ based_synthesizer = based_synthesizer if based_synthesizer is not None else global_based_synthesizer
64
+ assert based_synthesizer is not None, "based_synthesizer is not set, please init the remote synthesizer first."
65
+ if params_config is None:
66
+ params_config = get_params_config(based_synthesizer)
67
+ copyed_data = deepcopy(data)
68
+ copyed_data.setdefault("params_config",params_config)
69
+ super().__init__(**copyed_data)
70
+ self.data = data
71
+
72
+ @property
73
+ def md5(self):
74
+ m = hashlib.md5()
75
+ m.update(self.data.__str__().encode())
76
+ return m.hexdigest()
77
+
78
+ def __str__(self):
79
+ content = super().__str__()
80
+ return f"{content}"
81
+
82
+
app.py CHANGED
@@ -28,6 +28,20 @@ max_text_length = inference_config.max_text_length
28
  from tools.i18n.i18n import I18nAuto
29
  i18n = I18nAuto(locale_path="i18n/locale")
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  import nltk
32
  nltk.data.path.append(os.path.abspath(os.path.join(now_dir,"nltk_data")))
33
 
@@ -400,6 +414,23 @@ with gr.Blocks() as app:
400
  ],
401
  )
402
 
403
- is_share = inference_config.is_share
404
- app.queue().launch(show_error=True, share=is_share, inbrowser=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
 
 
 
28
  from tools.i18n.i18n import I18nAuto
29
  i18n = I18nAuto(locale_path="i18n/locale")
30
 
31
+
32
+ from Synthesizers.base import Base_TTS_Synthesizer, Base_TTS_Task, get_wave_header_chunk
33
+ from importlib import import_module
34
+
35
+ synthesizer_name = inference_config.synthesizer
36
+
37
+ # 动态导入合成器模块, 此处可写成 from Synthesizers.xxx import TTS_Synthesizer, TTS_Task
38
+ synthesizer_module = import_module(f"Synthesizers.{synthesizer_name}")
39
+ TTS_Synthesizer = synthesizer_module.TTS_Synthesizer
40
+ TTS_Task = synthesizer_module.TTS_Task
41
+
42
+ # 创建合成器实例
43
+ tts_synthesizer:Base_TTS_Synthesizer = TTS_Synthesizer(debug_mode=True)
44
+
45
  import nltk
46
  nltk.data.path.append(os.path.abspath(os.path.join(now_dir,"nltk_data")))
47
 
 
414
  ],
415
  )
416
 
417
+ import uvicorn
418
+ from pure_api import tts, character_list, set_tts_synthesizer
419
+ from fastapi import FastAPI
420
+ from fastapi.middleware.cors import CORSMiddleware
421
+
422
+ set_tts_synthesizer(tts_synthesizer)
423
+ fastapi_app:FastAPI = app.app
424
+ fastapi_app.add_api_route("/tts", tts, methods=["POST", "GET"])
425
+ fastapi_app.add_api_route("/character_list", character_list, methods=["GET"])
426
+
427
+ fastapi_app.add_middleware(
428
+ CORSMiddleware,
429
+ allow_origins=["*"],
430
+ allow_credentials=True,
431
+ allow_methods=["*"],
432
+ allow_headers=["*"],
433
+ )
434
 
435
+ fastapi_app = gr.mount_gradio_app(fastapi_app, app, path="/")
436
+ uvicorn.run(fastapi_app, host=inference_config.tts_host, port=inference_config.tts_port)
config.json CHANGED
@@ -10,6 +10,7 @@
10
  "max_text_length": -1,
11
  "save_prompt_cache": "true",
12
  "save_model_cache": "false",
 
13
  "备注0": "locale是语言环境,auto表示自动选择,如果你想要强制指定语言环境,可以填写zh_CN或者en_US等等",
14
  "备注1": "路径可以填写绝对路径或者相对路径,相对路径指的是在主项目根目录的相对路径",
15
  "备注2": "tts_port是tts服务的端口号,可以自己定义,只要不和其他服务的端口号冲突就行,默认是5000",
 
10
  "max_text_length": -1,
11
  "save_prompt_cache": "true",
12
  "save_model_cache": "false",
13
+ "synthesizer": "gsv_fast",
14
  "备注0": "locale是语言环境,auto表示自动选择,如果你想要强制指定语言环境,可以填写zh_CN或者en_US等等",
15
  "备注1": "路径可以填写绝对路径或者相对路径,相对路径指的是在主项目根目录的相对路径",
16
  "备注2": "tts_port是tts服务的端口号,可以自己定义,只要不和其他服务的端口号冲突就行,默认是5000",
gsv_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "device": "auto",
3
+ "is_half": "auto",
4
+
5
+ "models_path": "trained",
6
+ "cnhubert_base_path": "pretrained_models/chinese-hubert-base",
7
+ "bert_base_path": "pretrained_models/chinese-roberta-wwm-ext-large",
8
+ "save_prompt_cache": true,
9
+ "prompt_cache_dir": "cache/prompt_cache"
10
+ }
pure_api.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 在开头加入路径
2
+ import os, sys
3
+ import importlib
4
+
5
+ now_dir = os.getcwd()
6
+ sys.path.append(now_dir)
7
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
8
+
9
+ from src.config_manager import Inference_Config
10
+ from src.config_manager import __version__ as frontend_version
11
+
12
+ inference_config = Inference_Config()
13
+
14
+ import soundfile as sf
15
+ from fastapi import FastAPI, Request, HTTPException
16
+ from fastapi.responses import JSONResponse, FileResponse, StreamingResponse
17
+ from fastapi.middleware.cors import CORSMiddleware
18
+ import tempfile
19
+ import uvicorn
20
+ import json
21
+
22
+ # 将当前文件所在的目录添加到 sys.path
23
+ from Synthesizers.base import Base_TTS_Task, Base_TTS_Synthesizer
24
+
25
+ # 创建合成器实例
26
+ tts_synthesizer:Base_TTS_Synthesizer = None
27
+
28
+ def set_tts_synthesizer(synthesizer:Base_TTS_Synthesizer):
29
+ global tts_synthesizer
30
+ tts_synthesizer = synthesizer
31
+
32
+ # 存储临时文件的字典
33
+ temp_files = {}
34
+
35
+ async def character_list(request: Request):
36
+ res = JSONResponse(tts_synthesizer.get_characters())
37
+ return res
38
+
39
+ async def tts(request: Request):
40
+
41
+ from time import time as tt
42
+ t1 = tt()
43
+ print(f"Request Time: {t1}")
44
+
45
+ # 尝试从JSON中获取数据,如果不是JSON,则从查询参数中获取
46
+ if request.method == "GET":
47
+ data = request.query_params
48
+ else:
49
+ data = await request.json()
50
+
51
+ task:Base_TTS_Task = tts_synthesizer.params_parser(data)
52
+
53
+ if task.task_type == "text" and task.text.strip() == "":
54
+ return HTTPException(status_code=400, detail="Text is empty")
55
+ elif task.task_type == "ssml" and task.ssml.strip() == "":
56
+ return HTTPException(status_code=400, detail="SSML is empty")
57
+ md5_value = task.md5
58
+ if task.stream == False:
59
+ # TODO: use SQL instead of dict
60
+ if task.save_temp and md5_value in temp_files:
61
+ return FileResponse(path=temp_files[md5_value], media_type=f'audio/{task.format}')
62
+ else:
63
+ # 假设 gen 是你的音频生成器
64
+ try:
65
+ save_path = tts_synthesizer.generate(task, return_type="filepath")
66
+ except Exception as e:
67
+ return HTTPException(status_code=500, detail=str(e))
68
+ if task.save_temp:
69
+ temp_files[md5_value] = save_path
70
+
71
+ t2 = tt()
72
+ print(f"total time: {t2-t1}")
73
+ # 返回文件响应,FileResponse 会负责将文件发送给客户端
74
+ return FileResponse(save_path, media_type=f"audio/{task.format}", filename=os.path.basename(save_path))
75
+ else:
76
+ gen = tts_synthesizer.generate(task, return_type="numpy")
77
+ return StreamingResponse(gen, media_type='audio/wav')
78
+
79
+
80
+
81
+
82
+ if __name__ == "__main__":
83
+ # 动态导入合成器模块, 此处可写成 from Synthesizers.xxx import TTS_Synthesizer, TTS_Task
84
+ from importlib import import_module
85
+ from src.api_utils import get_localhost_ipv4_address
86
+ synthesizer_name = inference_config.synthesizer
87
+ synthesizer_module = import_module(f"Synthesizers.{synthesizer_name}")
88
+ TTS_Synthesizer = synthesizer_module.TTS_Synthesizer
89
+ TTS_Task = synthesizer_module.TTS_Task
90
+ tts_synthesizer = TTS_Synthesizer(debug_mode=True)
91
+ print(f"Backend Version: {__version__}")
92
+ tts_host = inference_config.tts_host
93
+ tts_port = inference_config.tts_port
94
+ ipv4_address = get_localhost_ipv4_address(tts_host)
95
+ ipv4_link = f"http://{ipv4_address}:{tts_port}"
96
+ print(f"INFO: Local Network URL: {ipv4_link}")
97
+
98
+ app = FastAPI()
99
+
100
+ # 设置CORS
101
+ app.add_middleware(
102
+ CORSMiddleware,
103
+ allow_origins=["*"],
104
+ allow_credentials=True,
105
+ allow_methods=["*"],
106
+ allow_headers=["*"],
107
+ )
108
+ app.add_api_route('/tts', tts, methods=["GET", "POST"])
109
+ app.add_api_route('/character_list', character_list, methods=["GET"])
110
+ uvicorn.run(app, host=tts_host, port=tts_port)
src/config_manager.py CHANGED
@@ -37,6 +37,7 @@ class Inference_Config():
37
  self.locale_language = None if locale_language.lower() == "auto" else locale_language
38
  if self.enable_auth:
39
  self.users = config.get("user", {})
 
40
 
41
  global inference_config
42
  inference_config = Inference_Config()
 
37
  self.locale_language = None if locale_language.lower() == "auto" else locale_language
38
  if self.enable_auth:
39
  self.users = config.get("user", {})
40
+ self.synthesizer = config.get("synthesizer", "gsv_fast")
41
 
42
  global inference_config
43
  inference_config = Inference_Config()