Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import os
|
|
4 |
import re
|
5 |
import tempfile
|
6 |
import logging
|
|
|
7 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
8 |
import librosa
|
9 |
import numpy as np
|
@@ -22,7 +23,6 @@ from mel_processing import spectrogram_torch
|
|
22 |
import psutil
|
23 |
from datetime import datetime
|
24 |
|
25 |
-
|
26 |
language_marks = {
|
27 |
"Japanese": "",
|
28 |
"日本語": "[JA]",
|
@@ -32,6 +32,8 @@ language_marks = {
|
|
32 |
}
|
33 |
|
34 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
|
|
|
|
35 |
def create_tts_fn(model, hps, speaker_ids):
|
36 |
def tts_fn(text, speaker, language, speed, is_symbol):
|
37 |
if limitation:
|
@@ -56,6 +58,7 @@ def create_tts_fn(model, hps, speaker_ids):
|
|
56 |
|
57 |
return tts_fn
|
58 |
|
|
|
59 |
def create_vc_fn(model, hps, speaker_ids):
|
60 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
61 |
if input_audio is None:
|
@@ -88,6 +91,7 @@ def create_vc_fn(model, hps, speaker_ids):
|
|
88 |
|
89 |
return vc_fn
|
90 |
|
|
|
91 |
def get_text(text, hps, is_symbol):
|
92 |
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
|
93 |
if hps.data.add_blank:
|
@@ -95,6 +99,7 @@ def get_text(text, hps, is_symbol):
|
|
95 |
text_norm = LongTensor(text_norm)
|
96 |
return text_norm
|
97 |
|
|
|
98 |
def create_to_symbol_fn(hps):
|
99 |
def to_symbol_fn(is_symbol_input, input_text, temp_text):
|
100 |
return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
|
@@ -102,38 +107,51 @@ def create_to_symbol_fn(hps):
|
|
102 |
|
103 |
return to_symbol_fn
|
104 |
|
|
|
105 |
models_tts = []
|
106 |
models_vc = []
|
107 |
models_info = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
{
|
109 |
"title": "Japanese",
|
110 |
"languages": ["Japanese"],
|
111 |
-
"description": ""
|
|
|
|
|
|
|
112 |
"model_path": "./pretrained_models/G_1153000.pth",
|
113 |
"config_path": "./configs/uma87.json",
|
114 |
"examples": [['お疲れ様です,トレーナーさん。', '无声铃鹿 Silence Suzuka (Umamusume Pretty Derby)', 'Japanese', 1, False],
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
"type": "onnx"
|
121 |
},
|
122 |
-
{
|
123 |
-
"title": "Trilingual",
|
124 |
-
"languages": ['日本語', '简体中文', 'English', 'Mix'],
|
125 |
-
"description": "",
|
126 |
-
"model_path": "./pretrained_models/G_1396000.pth",
|
127 |
-
"config_path": "./configs/uma_trilingual.json",
|
128 |
-
"examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
|
129 |
-
['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English', 1, False],
|
130 |
-
['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False]],
|
131 |
-
"type": "torch"
|
132 |
-
}
|
133 |
]
|
134 |
|
135 |
-
|
136 |
-
|
137 |
if __name__ == "__main__":
|
138 |
parser = argparse.ArgumentParser()
|
139 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
@@ -145,6 +163,7 @@ if __name__ == "__main__":
|
|
145 |
config_path = info['config_path']
|
146 |
model_path = info['model_path']
|
147 |
type = info['type']
|
|
|
148 |
hps = utils.get_hparams_from_file(config_path)
|
149 |
if type == "onnx":
|
150 |
model = ONNXVITS_infer.SynthesizerTrn(
|
@@ -164,26 +183,30 @@ if __name__ == "__main__":
|
|
164 |
model.eval()
|
165 |
speaker_ids = hps.speakers
|
166 |
speakers = list(hps.speakers.keys())
|
167 |
-
models_tts.append((name, speakers, lang, examples,
|
168 |
hps.symbols, create_tts_fn(model, hps, speaker_ids),
|
169 |
create_to_symbol_fn(hps)))
|
170 |
-
models_vc.append((name, speakers, create_vc_fn(model, hps, speaker_ids)))
|
171 |
app = gr.Blocks()
|
172 |
with app:
|
173 |
gr.Markdown("# English & Chinese & Japanese Anime TTS\n\n"
|
174 |
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
|
175 |
-
"Including Japanese TTS & Trilingual TTS, speakers are all anime characters. 包含一个纯日语TTS和一个中日英三语TTS
|
176 |
"If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
|
177 |
"若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
|
178 |
)
|
179 |
with gr.Tabs():
|
180 |
with gr.TabItem("TTS"):
|
181 |
with gr.Tabs():
|
182 |
-
for i, (name, speakers, lang, example, symbols, tts_fn, to_symbol_fn) in enumerate(
|
|
|
183 |
with gr.TabItem(name):
|
|
|
184 |
with gr.Row():
|
185 |
with gr.Column():
|
186 |
-
textbox = gr.TextArea(label="Text",
|
|
|
|
|
187 |
with gr.Accordion(label="Phoneme Input", open=False):
|
188 |
temp_text_var = gr.Variable()
|
189 |
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
@@ -212,21 +235,24 @@ if __name__ == "__main__":
|
|
212 |
text_input.selectionEnd = startPos + symbols[i].length;
|
213 |
text_input.blur();
|
214 |
window.scrollTo(x, y);
|
215 |
-
|
216 |
text = text_input.value;
|
217 |
-
|
218 |
return text;
|
219 |
}}""")
|
220 |
# select character
|
221 |
char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
|
222 |
language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
|
223 |
-
duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
|
|
|
224 |
with gr.Column():
|
225 |
text_output = gr.Textbox(label="Message")
|
226 |
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
227 |
btn = gr.Button("Generate!")
|
228 |
-
btn.click(tts_fn,
|
229 |
-
|
|
|
|
|
230 |
gr.Examples(
|
231 |
examples=example,
|
232 |
inputs=[textbox, char_dropdown, language_dropdown,
|
|
|
4 |
import re
|
5 |
import tempfile
|
6 |
import logging
|
7 |
+
|
8 |
logging.getLogger('numba').setLevel(logging.WARNING)
|
9 |
import librosa
|
10 |
import numpy as np
|
|
|
23 |
import psutil
|
24 |
from datetime import datetime
|
25 |
|
|
|
26 |
language_marks = {
|
27 |
"Japanese": "",
|
28 |
"日本語": "[JA]",
|
|
|
32 |
}
|
33 |
|
34 |
limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
|
35 |
+
|
36 |
+
|
37 |
def create_tts_fn(model, hps, speaker_ids):
|
38 |
def tts_fn(text, speaker, language, speed, is_symbol):
|
39 |
if limitation:
|
|
|
58 |
|
59 |
return tts_fn
|
60 |
|
61 |
+
|
62 |
def create_vc_fn(model, hps, speaker_ids):
|
63 |
def vc_fn(original_speaker, target_speaker, input_audio):
|
64 |
if input_audio is None:
|
|
|
91 |
|
92 |
return vc_fn
|
93 |
|
94 |
+
|
95 |
def get_text(text, hps, is_symbol):
|
96 |
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
|
97 |
if hps.data.add_blank:
|
|
|
99 |
text_norm = LongTensor(text_norm)
|
100 |
return text_norm
|
101 |
|
102 |
+
|
103 |
def create_to_symbol_fn(hps):
|
104 |
def to_symbol_fn(is_symbol_input, input_text, temp_text):
|
105 |
return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
|
|
|
107 |
|
108 |
return to_symbol_fn
|
109 |
|
110 |
+
|
111 |
models_tts = []
|
112 |
models_vc = []
|
113 |
models_info = [
|
114 |
+
{
|
115 |
+
"title": "Trilingual",
|
116 |
+
"languages": ['日本語', '简体中文', 'English', 'Mix'],
|
117 |
+
"description": """
|
118 |
+
This model is trained on a mix up of Umamusume, Genshin Impact, Sanoba Witch & VCTK voice data to learn multilanguage.
|
119 |
+
All characters can speak English, Chinese & Japanese.\n\n
|
120 |
+
To mix multiple languages in a single sentence, wrap the corresponding part with language tokens
|
121 |
+
([JA] for Japanese, [ZH] for Chinese, [EN] for English), as shown in the examples.\n\n
|
122 |
+
这个模型在赛马娘,原神,魔女的夜宴以及VCTK数据集上混合训练以学习多种语言。
|
123 |
+
所有角色均可说中日英三语。\n\n
|
124 |
+
若需要在同一个句子中混合多种语言,使用相应的语言标记包裹句子。
|
125 |
+
(日语用[JA], 中文用[ZH], 英文用[EN]),参考Examples中的示例。
|
126 |
+
""",
|
127 |
+
"model_path": "./pretrained_models/G_1396000.pth",
|
128 |
+
"config_path": "./configs/uma_trilingual.json",
|
129 |
+
"examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
|
130 |
+
['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English',
|
131 |
+
1, False],
|
132 |
+
['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False],
|
133 |
+
['[JA]こんにちわ。[JA][ZH]你好![ZH][EN]Hello![EN]', '綾地 寧々 Ayachi Nene (Sanoba Witch)', 'Mix', 1, False]],
|
134 |
+
"type": "torch"
|
135 |
+
},
|
136 |
{
|
137 |
"title": "Japanese",
|
138 |
"languages": ["Japanese"],
|
139 |
+
"description": """
|
140 |
+
This model contains 87 characters from Umamusume: Pretty Derby, Japanese only.\n\n
|
141 |
+
这个模型包含赛马娘的所有87名角色,只能合成日语。
|
142 |
+
""",
|
143 |
"model_path": "./pretrained_models/G_1153000.pth",
|
144 |
"config_path": "./configs/uma87.json",
|
145 |
"examples": [['お疲れ様です,トレーナーさん。', '无声铃鹿 Silence Suzuka (Umamusume Pretty Derby)', 'Japanese', 1, False],
|
146 |
+
['張り切っていこう!', '北部玄驹 Kitasan Black (Umamusume Pretty Derby)', 'Japanese', 1, False],
|
147 |
+
['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', 'Japanese', 1, False],
|
148 |
+
['授業中に出しだら,学校生活終わるですわ。', '目白麦昆 Mejiro Mcqueen (Umamusume Pretty Derby)', 'Japanese', 1, False],
|
149 |
+
['お帰りなさい,お兄様!', '米浴 Rice Shower (Umamusume Pretty Derby)', 'Japanese', 1, False],
|
150 |
+
['私の処女をもらっでください!', '米浴 Rice Shower (Umamusume Pretty Derby)', 'Japanese', 1, False]],
|
151 |
"type": "onnx"
|
152 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
]
|
154 |
|
|
|
|
|
155 |
if __name__ == "__main__":
|
156 |
parser = argparse.ArgumentParser()
|
157 |
parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
|
|
|
163 |
config_path = info['config_path']
|
164 |
model_path = info['model_path']
|
165 |
type = info['type']
|
166 |
+
description = info['description']
|
167 |
hps = utils.get_hparams_from_file(config_path)
|
168 |
if type == "onnx":
|
169 |
model = ONNXVITS_infer.SynthesizerTrn(
|
|
|
183 |
model.eval()
|
184 |
speaker_ids = hps.speakers
|
185 |
speakers = list(hps.speakers.keys())
|
186 |
+
models_tts.append((name, description, speakers, lang, examples,
|
187 |
hps.symbols, create_tts_fn(model, hps, speaker_ids),
|
188 |
create_to_symbol_fn(hps)))
|
189 |
+
models_vc.append((name, description, speakers, create_vc_fn(model, hps, speaker_ids)))
|
190 |
app = gr.Blocks()
|
191 |
with app:
|
192 |
gr.Markdown("# English & Chinese & Japanese Anime TTS\n\n"
|
193 |
"![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
|
194 |
+
"Including Japanese TTS & Trilingual TTS, speakers are all anime characters. \n\n包含一个纯日语TTS和一个中日英三语TTS模型,主要为二次元角色。\n\n"
|
195 |
"If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
|
196 |
"若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
|
197 |
)
|
198 |
with gr.Tabs():
|
199 |
with gr.TabItem("TTS"):
|
200 |
with gr.Tabs():
|
201 |
+
for i, (name, description, speakers, lang, example, symbols, tts_fn, to_symbol_fn) in enumerate(
|
202 |
+
models_tts):
|
203 |
with gr.TabItem(name):
|
204 |
+
gr.Markdown(description)
|
205 |
with gr.Row():
|
206 |
with gr.Column():
|
207 |
+
textbox = gr.TextArea(label="Text",
|
208 |
+
placeholder="Type your sentence here (Maximum 150 words)",
|
209 |
+
value="こんにちわ。", elem_id=f"tts-input")
|
210 |
with gr.Accordion(label="Phoneme Input", open=False):
|
211 |
temp_text_var = gr.Variable()
|
212 |
symbol_input = gr.Checkbox(value=False, label="Symbol input")
|
|
|
235 |
text_input.selectionEnd = startPos + symbols[i].length;
|
236 |
text_input.blur();
|
237 |
window.scrollTo(x, y);
|
238 |
+
|
239 |
text = text_input.value;
|
240 |
+
|
241 |
return text;
|
242 |
}}""")
|
243 |
# select character
|
244 |
char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
|
245 |
language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
|
246 |
+
duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
|
247 |
+
label='速度 Speed')
|
248 |
with gr.Column():
|
249 |
text_output = gr.Textbox(label="Message")
|
250 |
audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
|
251 |
btn = gr.Button("Generate!")
|
252 |
+
btn.click(tts_fn,
|
253 |
+
inputs=[textbox, char_dropdown, language_dropdown, duration_slider,
|
254 |
+
symbol_input],
|
255 |
+
outputs=[text_output, audio_output])
|
256 |
gr.Examples(
|
257 |
examples=example,
|
258 |
inputs=[textbox, char_dropdown, language_dropdown,
|