zhzluke96
committed on
Commit
·
ae79826
1
Parent(s):
b473486
update
Browse files- modules/repos_static/resemble_enhance/inference.py +2 -0
- modules/utils/audio.py +5 -1
- modules/webui/app.py +14 -6
- modules/webui/speaker/speaker_merger.py +6 -9
- modules/webui/ssml/__init__.py +0 -0
- modules/webui/ssml/podcast_tab.py +210 -0
- modules/webui/ssml/spliter_tab.py +169 -0
- modules/webui/ssml/ssml_tab.py +61 -0
- modules/webui/tts_tab.py +12 -9
- modules/webui/webui_utils.py +31 -2
modules/repos_static/resemble_enhance/inference.py
CHANGED
@@ -127,6 +127,8 @@ def inference(
|
|
127 |
):
|
128 |
if config.runtime_env_vars.off_tqdm:
|
129 |
trange = range
|
|
|
|
|
130 |
|
131 |
remove_weight_norm_recursively(model)
|
132 |
|
|
|
127 |
):
|
128 |
if config.runtime_env_vars.off_tqdm:
|
129 |
trange = range
|
130 |
+
else:
|
131 |
+
from tqdm import trange
|
132 |
|
133 |
remove_weight_norm_recursively(model)
|
134 |
|
modules/utils/audio.py
CHANGED
@@ -19,7 +19,11 @@ def audio_to_int16(audio_data):
|
|
19 |
return audio_data
|
20 |
|
21 |
|
22 |
-
def audiosegment_to_librosawav(audiosegment):
|
|
|
|
|
|
|
|
|
23 |
channel_sounds = audiosegment.split_to_mono()
|
24 |
samples = [s.get_array_of_samples() for s in channel_sounds]
|
25 |
|
|
|
19 |
return audio_data
|
20 |
|
21 |
|
22 |
+
def audiosegment_to_librosawav(audiosegment: AudioSegment) -> np.ndarray:
|
23 |
+
"""
|
24 |
+
Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels],
|
25 |
+
where each value is in range [-1.0, 1.0].
|
26 |
+
"""
|
27 |
channel_sounds = audiosegment.split_to_mono()
|
28 |
samples = [s.get_array_of_samples() for s in channel_sounds]
|
29 |
|
modules/webui/app.py
CHANGED
@@ -8,10 +8,11 @@ from modules import config
|
|
8 |
from modules.webui import webui_config
|
9 |
|
10 |
from modules.webui.changelog_tab import create_changelog_tab
|
|
|
11 |
from modules.webui.system_tab import create_system_tab
|
12 |
from modules.webui.tts_tab import create_tts_interface
|
13 |
-
from modules.webui.ssml_tab import create_ssml_interface
|
14 |
-
from modules.webui.spliter_tab import create_spliter_tab
|
15 |
from modules.webui.speaker_tab import create_speaker_panel
|
16 |
from modules.webui.readme_tab import create_readme_tab
|
17 |
|
@@ -86,10 +87,17 @@ def create_interface():
|
|
86 |
create_tts_interface()
|
87 |
|
88 |
with gr.TabItem("SSML", id="ssml"):
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
with gr.TabItem("Speaker"):
|
95 |
create_speaker_panel()
|
|
|
8 |
from modules.webui import webui_config
|
9 |
|
10 |
from modules.webui.changelog_tab import create_changelog_tab
|
11 |
+
from modules.webui.ssml.podcast_tab import create_ssml_podcast_tab
|
12 |
from modules.webui.system_tab import create_system_tab
|
13 |
from modules.webui.tts_tab import create_tts_interface
|
14 |
+
from modules.webui.ssml.ssml_tab import create_ssml_interface
|
15 |
+
from modules.webui.ssml.spliter_tab import create_spliter_tab
|
16 |
from modules.webui.speaker_tab import create_speaker_panel
|
17 |
from modules.webui.readme_tab import create_readme_tab
|
18 |
|
|
|
87 |
create_tts_interface()
|
88 |
|
89 |
with gr.TabItem("SSML", id="ssml"):
|
90 |
+
with gr.Tabs() as ssml_tabs:
|
91 |
+
with gr.TabItem("Editor", id="ssml.editor"):
|
92 |
+
ssml_input = create_ssml_interface()
|
93 |
+
with gr.TabItem("Spilter"):
|
94 |
+
create_spliter_tab(
|
95 |
+
ssml_input=ssml_input, tabs1=tabs, tabs2=ssml_tabs
|
96 |
+
)
|
97 |
+
with gr.TabItem("Podcast"):
|
98 |
+
create_ssml_podcast_tab(
|
99 |
+
ssml_input=ssml_input, tabs1=tabs, tabs2=ssml_tabs
|
100 |
+
)
|
101 |
|
102 |
with gr.TabItem("Speaker"):
|
103 |
create_speaker_panel()
|
modules/webui/speaker/speaker_merger.py
CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
|
|
3 |
import torch
|
4 |
|
5 |
from modules.hf import spaces
|
|
|
6 |
from modules.webui.webui_utils import get_speakers, tts_generate
|
7 |
from modules.speaker import speaker_mgr, Speaker
|
8 |
|
@@ -138,23 +139,19 @@ merge_desc = """
|
|
138 |
"""
|
139 |
|
140 |
|
141 |
-
def get_spk_choices():
|
142 |
-
speakers = get_speakers()
|
143 |
-
|
144 |
-
speaker_names = ["None"] + [get_speaker_show_name(speaker) for speaker in speakers]
|
145 |
-
return speaker_names
|
146 |
-
|
147 |
-
|
148 |
# 显示 a b c d 四个选择框,选择一个或多个,然后可以试音,并导出
|
149 |
def create_speaker_merger():
|
150 |
-
|
|
|
|
|
|
|
151 |
|
152 |
gr.Markdown(merge_desc)
|
153 |
|
154 |
def spk_picker(label_tail: str):
|
155 |
with gr.Row():
|
156 |
spk_a = gr.Dropdown(
|
157 |
-
choices=
|
158 |
)
|
159 |
refresh_a_btn = gr.Button("🔄", variant="secondary")
|
160 |
|
|
|
3 |
import torch
|
4 |
|
5 |
from modules.hf import spaces
|
6 |
+
from modules.webui import webui_utils
|
7 |
from modules.webui.webui_utils import get_speakers, tts_generate
|
8 |
from modules.speaker import speaker_mgr, Speaker
|
9 |
|
|
|
139 |
"""
|
140 |
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
# 显示 a b c d 四个选择框,选择一个或多个,然后可以试音,并导出
|
143 |
def create_speaker_merger():
|
144 |
+
def get_spk_choices():
|
145 |
+
speakers, speaker_names = webui_utils.get_speaker_names()
|
146 |
+
speaker_names = ["None"] + speaker_names
|
147 |
+
return speaker_names
|
148 |
|
149 |
gr.Markdown(merge_desc)
|
150 |
|
151 |
def spk_picker(label_tail: str):
|
152 |
with gr.Row():
|
153 |
spk_a = gr.Dropdown(
|
154 |
+
choices=get_spk_choices(), value="None", label=f"Speaker {label_tail}"
|
155 |
)
|
156 |
refresh_a_btn = gr.Button("🔄", variant="secondary")
|
157 |
|
modules/webui/ssml/__init__.py
ADDED
File without changes
|
modules/webui/ssml/podcast_tab.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import torch
|
4 |
+
|
5 |
+
from modules.normalization import text_normalize
|
6 |
+
from modules.webui import webui_utils
|
7 |
+
from modules.hf import spaces
|
8 |
+
|
9 |
+
# Every line of the default podcast script uses this style.
_PODCAST_DEFAULT_STYLE = "chat"

# (speaker, text) pairs of the default podcast script, in speaking order.
_PODCAST_DEFAULT_LINES = (
    ("female2", "你好,欢迎收听今天的播客内容。今天我们要聊的是中华料理。"),
    ("Alice", "嗨,我特别期待这个话题!中华料理真的是博大精深。"),
    ("Bob", "没错,中华料理有着几千年的历史,而且每个地区都有自己的特色菜。"),
    ("female2", "那我们先从最有名的川菜开始吧。川菜以其麻辣著称,是很多人的最爱。"),
    ("Alice", "对,我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。"),
    ("Bob", "除了川菜,粤菜也是很受欢迎的。粤菜讲究鲜美,像是白切鸡和蒸鱼都是经典。"),
    ("female2", "对啊,粤菜的烹饪方式比较清淡,更注重食材本身的味道。"),
    ("Alice", "还有北京的京菜,像北京烤鸭,那可是来北京必吃的美食。"),
    ("Bob", "不仅如此,还有淮扬菜、湘菜、鲁菜等等,每个菜系都有其独特的风味。"),
    ("female2", "对对对,像淮扬菜的狮子头,湘菜的剁椒鱼头,都是让人垂涎三尺的美味。"),
)

# Initial rows of the script table; each row is [index, speaker, text, style]
# with a 1-based index.
podcast_default_case = [
    [position, speaker, text, _PODCAST_DEFAULT_STYLE]
    for position, (speaker, text) in enumerate(_PODCAST_DEFAULT_LINES, start=1)
]
|
51 |
+
|
52 |
+
|
53 |
+
# NOTE: decorated because text_normalize needs to use the tokenizer
@torch.inference_mode()
@spaces.GPU
def merge_dataframe_to_ssml(msg, spk, style, df: pd.DataFrame):
    """Render the podcast script table as an SSML document.

    Each row of ``df`` becomes one ``<voice>`` element whose ``spk`` and
    ``style`` attributes come from the row's "speaker" and "style" columns
    (an attribute is omitted when its value is falsy) and whose body is the
    normalized "text" column.

    Args:
        msg: Current message textbox value; returned unchanged.
        spk: Current speaker selection; returned unchanged.
        style: Current style selection; returned unchanged.
        df: Script table with "speaker", "text" and "style" columns.

    Returns:
        ``(msg, spk, style, ssml)`` where ``ssml`` is the ``<speak>`` document.
    """
    indent = " " * 2
    voices = []

    # Row values get their own names: reusing `spk`/`style` here would
    # overwrite the parameters that must be echoed back untouched.
    for _, row in df.iterrows():
        row_text = row.get("text")
        row_spk = row.get("speaker")
        row_style = row.get("style")

        attrs = ""
        if row_spk:
            attrs += f' spk="{row_spk}"'
        # NOTE(review): a "*auto" style is emitted verbatim here, while the
        # splitter tab drops it — confirm which behavior the SSML parser expects.
        if row_style:
            attrs += f' style="{row_style}"'

        voices.append(
            f"{indent}<voice{attrs}>\n"
            f"{indent}{indent}{text_normalize(row_text)}\n"
            f"{indent}</voice>\n"
        )

    ssml = "".join(voices)
    # The inputs are returned as-is so the UI shows its loading effect on them.
    return msg, spk, style, f"<speak version='0.1'>\n{ssml}</speak>"
|
75 |
+
|
76 |
+
|
77 |
+
def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Tabs):
    """Build the podcast script editor and wire it to the SSML editor.

    The tab lets the user pick a speaker and style, type a message, and
    append it to a script table. "Send to SSML" converts the table to SSML,
    writes it into ``ssml_input`` and selects the SSML editor tab.

    Args:
        ssml_input: Textbox that receives the generated SSML.
        tabs1: Tabs container switched to the tab with id "ssml".
        tabs2: Tabs container switched to the tab with id "ssml.editor".
    """
    script_columns = ["index", "speaker", "text", "style"]

    def get_spk_choices():
        """Return the speaker dropdown choices, with "-1" listed first."""
        _, speaker_names = webui_utils.get_speaker_names()
        return ["-1"] + speaker_names

    def speaker_name_of(choice):
        """Extract the bare speaker name from a dropdown choice.

        Choices are either "gender : name" or a bare name (including "-1"),
        so the last " : "-separated part is always the name.
        """
        return str(choice or "").split(" : ")[-1].strip()

    styles = ["*auto"] + [s.get("name") for s in webui_utils.get_styles()]

    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group():
                spk_input_dropdown = gr.Dropdown(
                    choices=get_spk_choices(),
                    interactive=True,
                    value="female : female2",
                    show_label=False,
                )
                style_input_dropdown = gr.Dropdown(
                    choices=styles,
                    # label="Choose Style",
                    interactive=True,
                    show_label=False,
                    value="*auto",
                )
            with gr.Group():
                msg = gr.Textbox(
                    lines=5, label="Message", placeholder="Type speaker message here"
                )
                add = gr.Button("Add")
                undo = gr.Button("Undo")
                clear = gr.Button("Clear")
        with gr.Column(scale=5):
            with gr.Group():
                gr.Markdown("📔Script")
                script_table = gr.DataFrame(
                    headers=script_columns,
                    datatype=["number", "str", "str", "str"],
                    interactive=False,
                    wrap=True,
                    value=podcast_default_case,
                    row_count=(0, "dynamic"),
                )

            send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")

    def add_message(msg, spk, style, sheet: pd.DataFrame):
        """Append the message as a new script row and clear the textbox."""
        if not msg:
            return "", sheet

        # Treat a single row without a "text" column as an empty table.
        is_empty = sheet.empty or (sheet.shape[0] == 1 and "text" not in sheet.iloc[0])
        existing_rows = 0 if is_empty else sheet.shape[0]

        data = pd.DataFrame(
            {
                # 1-based, continuing the numbering of the default script.
                "index": [existing_rows + 1],
                "speaker": [speaker_name_of(spk)],
                "text": [msg],
                "style": [style],
            },
        )

        if is_empty:
            return "", data
        return "", pd.concat([sheet, data], ignore_index=True)

    def undo_message(msg, spk, style, sheet: pd.DataFrame):
        """Remove the last script row and load it back into the inputs."""
        if sheet.empty:
            return msg, spk, style, sheet

        last = sheet.iloc[-1]
        remaining = sheet.iloc[:-1]

        # Map the stored bare name back to its dropdown choice; keep the
        # current selection when the speaker is no longer available.
        wanted = str(last["speaker"])
        restored = next(
            (choice for choice in get_spk_choices() if speaker_name_of(choice) == wanted),
            spk,
        )
        return last["text"], restored, last["style"], remaining

    def clear_message():
        """Clear the message textbox and empty the script table."""
        return "", pd.DataFrame(columns=script_columns)

    def send_to_ssml(msg, spk, style, sheet: pd.DataFrame):
        """Convert the script table to SSML and switch to the SSML editor."""
        if sheet.empty:
            # gr.Error is an exception: it must be raised to reach the user.
            raise gr.Error("Please add some text to the script table.")

        *_, ssml = merge_dataframe_to_ssml(msg, spk, style, sheet)
        # The inputs are echoed back unchanged to trigger the loading effect.
        return [
            msg,
            spk,
            style,
            gr.Textbox(value=ssml),
            gr.Tabs(selected="ssml"),
            gr.Tabs(selected="ssml.editor"),
        ]

    editor_inputs = [msg, spk_input_dropdown, style_input_dropdown, script_table]

    msg.submit(
        add_message,
        inputs=editor_inputs,
        outputs=[msg, script_table],
    )
    add.click(
        add_message,
        inputs=editor_inputs,
        outputs=[msg, script_table],
    )
    undo.click(
        undo_message,
        inputs=editor_inputs,
        outputs=[msg, spk_input_dropdown, style_input_dropdown, script_table],
    )
    clear.click(
        clear_message,
        outputs=[msg, script_table],
    )
    send_to_ssml_btn.click(
        send_to_ssml,
        inputs=editor_inputs,
        outputs=[
            msg,
            spk_input_dropdown,
            style_input_dropdown,
            ssml_input,
            tabs1,
            tabs2,
        ],
    )
|
modules/webui/ssml/spliter_tab.py
ADDED
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from modules.normalization import text_normalize
|
4 |
+
from modules.webui import webui_utils
|
5 |
+
from modules.webui.webui_utils import (
|
6 |
+
get_speakers,
|
7 |
+
get_styles,
|
8 |
+
split_long_text,
|
9 |
+
)
|
10 |
+
from modules.hf import spaces
|
11 |
+
|
12 |
+
|
13 |
+
# NOTE: decorated because text_normalize needs to use the tokenizer
@torch.inference_mode()
@spaces.GPU
def merge_dataframe_to_ssml(dataframe, spk, style, seed):
    """Render the split-text table as an SSML document.

    Every row becomes one ``<voice>`` element carrying the same speaker,
    style and seed attributes; the element body is the normalized value of
    the row's second column.

    Args:
        dataframe: Split text table; the text is read from column position 1.
        spk: Speaker name or seed; "-1" / -1 means no ``spk`` attribute.
        style: Style name; "*auto" means no ``style`` attribute.
        seed: Inference seed; -1 / "-1" means no ``seed`` attribute.

    Returns:
        ``(dataframe, spk, style, seed, ssml)`` with the inputs unchanged and
        ``ssml`` the generated ``<speak>`` document.
    """
    # Normalize the sentinels into separate locals so the values echoed back
    # to the UI stay exactly what the user selected.
    voice_style = None if style == "*auto" else style
    voice_spk = None if spk in ("-1", -1) else spk
    voice_seed = None if seed in (-1, "-1") else seed

    attrs = ""
    if voice_spk:
        attrs += f' spk="{voice_spk}"'
    if voice_style:
        attrs += f' style="{voice_style}"'
    # Explicit None check: 0 is a usable seed and must not be dropped.
    if voice_seed is not None:
        attrs += f' seed="{voice_seed}"'

    indent = " " * 2
    ssml = "".join(
        f"{indent}<voice{attrs}>\n"
        f"{indent}{indent}{text_normalize(row.iloc[1])}\n"
        f"{indent}</voice>\n"
        for _, row in dataframe.iterrows()
    )
    # The inputs are returned as-is so the UI shows its loading effect on them.
    return dataframe, spk, style, seed, f"<speak version='0.1'>\n{ssml}</speak>"
|
40 |
+
|
41 |
+
|
42 |
+
# Long text handling:
# the user enters a long text and splits it; the resulting segments are shown
# in a data table and can be merged into SSML and sent to the SSML tab.
def create_spliter_tab(ssml_input, tabs1, tabs2):
    """Build the long-text splitter tab and wire it to the SSML editor.

    Args:
        ssml_input: Textbox that receives the generated SSML.
        tabs1: Tabs container switched to the tab with id "ssml".
        tabs2: Tabs container switched to the tab with id "ssml.editor".
    """
    _, speaker_names = webui_utils.get_speaker_names()
    speaker_names = ["*random"] + speaker_names

    styles = ["*auto"] + [s.get("name") for s in get_styles()]

    def speaker_choice_to_text(choice):
        """Map a dropdown choice to the speaker textbox value."""
        if not choice:
            # A cleared dropdown yields no value; leave the textbox empty.
            return ""
        if choice.startswith("*"):
            return "-1"
        return choice.split(":")[-1].strip()

    def random_seed(_current):
        """Return a random integer in [0, 2**32 - 1)."""
        return int(torch.randint(0, 2**32 - 1, (1,)).item())

    with gr.Row():
        with gr.Column(scale=1):
            # Speaker, style and seed selection
            with gr.Group():
                gr.Markdown("🗣️Speaker")
                spk_input_text = gr.Textbox(
                    label="Speaker (Text or Seed)",
                    value="female2",
                    show_label=False,
                )
                spk_input_dropdown = gr.Dropdown(
                    choices=speaker_names,
                    interactive=True,
                    value="female : female2",
                    show_label=False,
                )
                spk_rand_button = gr.Button(
                    value="🎲",
                    variant="secondary",
                )
            with gr.Group():
                gr.Markdown("🎭Style")
                style_input_dropdown = gr.Dropdown(
                    choices=styles,
                    interactive=True,
                    show_label=False,
                    value="*auto",
                )
            with gr.Group():
                gr.Markdown("🗣️Seed")
                infer_seed_input = gr.Number(
                    value=42,
                    label="Inference Seed",
                    show_label=False,
                    minimum=-1,
                    maximum=2**32 - 1,
                )
                infer_seed_rand_button = gr.Button(
                    value="🎲",
                    variant="secondary",
                )

            send_btn = gr.Button("📩Send to SSML", variant="primary")

        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("📝Long Text Input")
                gr.Markdown("- 此页面用于处理超长文本")
                gr.Markdown("- 切割后,可以选择说话人、风格、seed,然后发送到SSML")
                long_text_input = gr.Textbox(
                    label="Long Text Input",
                    lines=10,
                    placeholder="输入长文本",
                    elem_id="long-text-input",
                    show_label=False,
                )
                long_text_split_button = gr.Button("🔪Split Text")

    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("🎨Output")
                long_text_output = gr.DataFrame(
                    headers=["index", "text", "length"],
                    datatype=["number", "str", "number"],
                    elem_id="long-text-output",
                    interactive=False,
                    wrap=True,
                    value=[],
                )

    spk_input_dropdown.change(
        fn=speaker_choice_to_text,
        inputs=[spk_input_dropdown],
        outputs=[spk_input_text],
    )
    spk_rand_button.click(
        random_seed,
        inputs=[spk_input_text],
        outputs=[spk_input_text],
    )
    # Registered once: a second identical listener would only draw and
    # discard an extra random seed on every click.
    infer_seed_rand_button.click(
        random_seed,
        inputs=[infer_seed_input],
        outputs=[infer_seed_input],
    )
    long_text_split_button.click(
        split_long_text,
        inputs=[long_text_input],
        outputs=[long_text_output],
    )

    send_btn.click(
        merge_dataframe_to_ssml,
        inputs=[
            long_text_output,
            spk_input_text,
            style_input_dropdown,
            infer_seed_input,
        ],
        outputs=[
            long_text_output,
            spk_input_text,
            style_input_dropdown,
            infer_seed_input,
            ssml_input,
        ],
    )

    def change_tab():
        """Select the SSML tab and, inside it, the editor tab."""
        return gr.Tabs(selected="ssml"), gr.Tabs(selected="ssml.editor")

    send_btn.click(change_tab, inputs=[], outputs=[tabs1, tabs2])
|
modules/webui/ssml/ssml_tab.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from modules.webui.webui_utils import (
|
3 |
+
synthesize_ssml,
|
4 |
+
)
|
5 |
+
from modules.webui import webui_config
|
6 |
+
from modules.webui.examples import ssml_examples, default_ssml
|
7 |
+
|
8 |
+
|
9 |
+
def create_ssml_interface() -> gr.Textbox:
    """Build the SSML editor UI and return its input textbox.

    Lays out the SSML textbox with a synthesize button, a batch-size slider,
    enhance / de-noise checkboxes, an examples picker and an audio output,
    and wires the button to ``synthesize_ssml``.

    Returns:
        The textbox holding the SSML text.
    """
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Group():
                gr.Markdown("📝SSML Input")
                gr.Markdown(f"- 最长{webui_config.ssml_max:,}字符,超过会被截断")
                gr.Markdown("- 尽量保证使用相同的 seed")
                gr.Markdown(
                    "- 关于SSML可以看这个 [文档](https://github.com/lenML/ChatTTS-Forge/blob/main/docs/SSML.md)"
                )
                ssml_input = gr.Textbox(
                    label="SSML Input",
                    lines=10,
                    value=default_ssml,
                    placeholder="输入 SSML 或选择示例",
                    elem_id="ssml_input",
                    show_label=False,
                )
                ssml_button = gr.Button("🔊Synthesize SSML", variant="primary")
        with gr.Column(scale=1):
            with gr.Group():
                # Parameters
                gr.Markdown("🎛️Parameters")
                # batch size
                batch_size_input = gr.Slider(
                    label="Batch Size",
                    value=4,
                    minimum=1,
                    maximum=webui_config.max_batch_size,
                    step=1,
                )

            with gr.Group():
                gr.Markdown("💪🏼Enhance")
                enable_enhance = gr.Checkbox(value=True, label="Enable Enhance")
                enable_de_noise = gr.Checkbox(value=False, label="Enable De-noise")

            with gr.Group():
                gr.Markdown("🎄Examples")
                # Selecting an example fills the SSML textbox.
                gr.Examples(
                    examples=ssml_examples,
                    inputs=[ssml_input],
                )

    ssml_output = gr.Audio(label="Generated Audio", format="mp3")

    # The inputs are passed to synthesize_ssml in this order.
    ssml_button.click(
        synthesize_ssml,
        inputs=[ssml_input, batch_size_input, enable_enhance, enable_de_noise],
        outputs=ssml_output,
    )

    return ssml_input
|
modules/webui/tts_tab.py
CHANGED
@@ -27,6 +27,7 @@ def create_tts_interface():
|
|
27 |
speaker_names = ["*random"] + [
|
28 |
get_speaker_show_name(speaker) for speaker in speakers
|
29 |
]
|
|
|
30 |
|
31 |
styles = ["*auto"] + [s.get("name") for s in get_styles()]
|
32 |
|
@@ -121,18 +122,10 @@ def create_tts_interface():
|
|
121 |
# tooltip="Random Seed",
|
122 |
variant="secondary",
|
123 |
)
|
|
|
124 |
use_decoder_input = gr.Checkbox(
|
125 |
value=True, label="Use Decoder", visible=False
|
126 |
)
|
127 |
-
with gr.Group():
|
128 |
-
gr.Markdown("🔧Prompt engineering")
|
129 |
-
prompt1_input = gr.Textbox(label="Prompt 1")
|
130 |
-
prompt2_input = gr.Textbox(label="Prompt 2")
|
131 |
-
prefix_input = gr.Textbox(label="Prefix")
|
132 |
-
|
133 |
-
prompt_audio = gr.File(
|
134 |
-
label="prompt_audio", visible=webui_config.experimental
|
135 |
-
)
|
136 |
|
137 |
infer_seed_rand_button.click(
|
138 |
lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
|
@@ -214,6 +207,16 @@ def create_tts_interface():
|
|
214 |
)
|
215 |
refine_button = gr.Button("✍️Refine Text")
|
216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
with gr.Group():
|
218 |
gr.Markdown("🔊Generate")
|
219 |
disable_normalize_input = gr.Checkbox(
|
|
|
27 |
speaker_names = ["*random"] + [
|
28 |
get_speaker_show_name(speaker) for speaker in speakers
|
29 |
]
|
30 |
+
speaker_names.sort(key=lambda x: x.startswith("*") and "-1" or x)
|
31 |
|
32 |
styles = ["*auto"] + [s.get("name") for s in get_styles()]
|
33 |
|
|
|
122 |
# tooltip="Random Seed",
|
123 |
variant="secondary",
|
124 |
)
|
125 |
+
# 感觉这个没必要设置...
|
126 |
use_decoder_input = gr.Checkbox(
|
127 |
value=True, label="Use Decoder", visible=False
|
128 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
infer_seed_rand_button.click(
|
131 |
lambda x: int(torch.randint(0, 2**32 - 1, (1,)).item()),
|
|
|
207 |
)
|
208 |
refine_button = gr.Button("✍️Refine Text")
|
209 |
|
210 |
+
with gr.Group():
|
211 |
+
gr.Markdown("🔧Prompt engineering")
|
212 |
+
prompt1_input = gr.Textbox(label="Prompt 1")
|
213 |
+
prompt2_input = gr.Textbox(label="Prompt 2")
|
214 |
+
prefix_input = gr.Textbox(label="Prefix")
|
215 |
+
|
216 |
+
prompt_audio = gr.File(
|
217 |
+
label="prompt_audio", visible=webui_config.experimental
|
218 |
+
)
|
219 |
+
|
220 |
with gr.Group():
|
221 |
gr.Markdown("🔊Generate")
|
222 |
disable_normalize_input = gr.Checkbox(
|
modules/webui/webui_utils.py
CHANGED
@@ -32,6 +32,20 @@ def get_speakers():
|
|
32 |
return speaker_mgr.list_speakers()
|
33 |
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def get_styles():
|
36 |
return styles_mgr.list_items()
|
37 |
|
@@ -93,7 +107,12 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
|
|
93 |
|
94 |
@torch.inference_mode()
|
95 |
@spaces.GPU
|
96 |
-
def synthesize_ssml(
|
|
|
|
|
|
|
|
|
|
|
97 |
try:
|
98 |
batch_size = int(batch_size)
|
99 |
except Exception:
|
@@ -116,7 +135,16 @@ def synthesize_ssml(ssml: str, batch_size=4):
|
|
116 |
audio_segments = synthesize.synthesize_segments(segments)
|
117 |
combined_audio = combine_audio_segments(audio_segments)
|
118 |
|
119 |
-
sr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
|
121 |
return sr, audio_data
|
122 |
|
@@ -193,6 +221,7 @@ def tts_generate(
|
|
193 |
audio_data, sample_rate = apply_audio_enhance(
|
194 |
audio_data, sample_rate, enable_denoise, enable_enhance
|
195 |
)
|
|
|
196 |
audio_data = audio.audio_to_int16(audio_data)
|
197 |
return sample_rate, audio_data
|
198 |
|
|
|
32 |
return speaker_mgr.list_speakers()
|
33 |
|
34 |
|
35 |
+
def get_speaker_names() -> tuple[list[Speaker], list[str]]:
    """Return the available speakers together with their display names.

    A display name is "<gender> : <name>", or just the name when the gender
    is "*" or empty. The names are sorted alphabetically, with names starting
    with "*" ordered as if they were "-1".

    Returns:
        ``(speakers, speaker_names)``. ``speakers`` keeps the order given by
        ``get_speakers()``; ``speaker_names`` is sorted independently, so the
        two lists are not index-aligned.
    """

    def display_name(speaker):
        if speaker.gender in ("*", ""):
            return speaker.name
        return f"{speaker.gender} : {speaker.name}"

    def sort_key(label):
        return "-1" if label.startswith("*") else label

    speakers = get_speakers()
    speaker_names = sorted((display_name(item) for item in speakers), key=sort_key)

    return speakers, speaker_names
|
47 |
+
|
48 |
+
|
49 |
def get_styles():
|
50 |
return styles_mgr.list_items()
|
51 |
|
|
|
107 |
|
108 |
@torch.inference_mode()
|
109 |
@spaces.GPU
|
110 |
+
def synthesize_ssml(
|
111 |
+
ssml: str,
|
112 |
+
batch_size=4,
|
113 |
+
enable_enhance=False,
|
114 |
+
enable_denoise=False,
|
115 |
+
):
|
116 |
try:
|
117 |
batch_size = int(batch_size)
|
118 |
except Exception:
|
|
|
135 |
audio_segments = synthesize.synthesize_segments(segments)
|
136 |
combined_audio = combine_audio_segments(audio_segments)
|
137 |
|
138 |
+
sr = combined_audio.frame_rate
|
139 |
+
audio_data, sr = apply_audio_enhance(
|
140 |
+
audio.audiosegment_to_librosawav(combined_audio),
|
141 |
+
sr,
|
142 |
+
enable_denoise,
|
143 |
+
enable_enhance,
|
144 |
+
)
|
145 |
+
|
146 |
+
# NOTE: 这里必须要加,不然 gradio 没法解析成 mp3 格式
|
147 |
+
audio_data = audio.audio_to_int16(audio_data)
|
148 |
|
149 |
return sr, audio_data
|
150 |
|
|
|
221 |
audio_data, sample_rate = apply_audio_enhance(
|
222 |
audio_data, sample_rate, enable_denoise, enable_enhance
|
223 |
)
|
224 |
+
# NOTE: 这里必须要加,不然 gradio 没法解析成 mp3 格式
|
225 |
audio_data = audio.audio_to_int16(audio_data)
|
226 |
return sample_rate, audio_data
|
227 |
|