Plachta committed on
Commit
86e39a8
1 Parent(s): f7ac706

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -30
app.py CHANGED
@@ -4,6 +4,7 @@ import os
4
  import re
5
  import tempfile
6
  import logging
 
7
  logging.getLogger('numba').setLevel(logging.WARNING)
8
  import librosa
9
  import numpy as np
@@ -22,7 +23,6 @@ from mel_processing import spectrogram_torch
22
  import psutil
23
  from datetime import datetime
24
 
25
-
26
  language_marks = {
27
  "Japanese": "",
28
  "日本語": "[JA]",
@@ -32,6 +32,8 @@ language_marks = {
32
  }
33
 
34
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
 
 
35
  def create_tts_fn(model, hps, speaker_ids):
36
  def tts_fn(text, speaker, language, speed, is_symbol):
37
  if limitation:
@@ -56,6 +58,7 @@ def create_tts_fn(model, hps, speaker_ids):
56
 
57
  return tts_fn
58
 
 
59
  def create_vc_fn(model, hps, speaker_ids):
60
  def vc_fn(original_speaker, target_speaker, input_audio):
61
  if input_audio is None:
@@ -88,6 +91,7 @@ def create_vc_fn(model, hps, speaker_ids):
88
 
89
  return vc_fn
90
 
 
91
  def get_text(text, hps, is_symbol):
92
  text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
93
  if hps.data.add_blank:
@@ -95,6 +99,7 @@ def get_text(text, hps, is_symbol):
95
  text_norm = LongTensor(text_norm)
96
  return text_norm
97
 
 
98
  def create_to_symbol_fn(hps):
99
  def to_symbol_fn(is_symbol_input, input_text, temp_text):
100
  return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
@@ -102,38 +107,51 @@ def create_to_symbol_fn(hps):
102
 
103
  return to_symbol_fn
104
 
 
105
  models_tts = []
106
  models_vc = []
107
  models_info = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  {
109
  "title": "Japanese",
110
  "languages": ["Japanese"],
111
- "description": "",
 
 
 
112
  "model_path": "./pretrained_models/G_1153000.pth",
113
  "config_path": "./configs/uma87.json",
114
  "examples": [['お疲れ様です,トレーナーさん。', '无声铃鹿 Silence Suzuka (Umamusume Pretty Derby)', 'Japanese', 1, False],
115
- ['張り切っていこう!', '北部玄驹 Kitasan Black (Umamusume Pretty Derby)', 'Japanese', 1, False],
116
- ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', 'Japanese', 1, False],
117
- ['授業中に出しだら,学校生活終わるですわ。', '目白麦昆 Mejiro Mcqueen (Umamusume Pretty Derby)', 'Japanese', 1, False],
118
- ['お帰りなさい,お兄様!', '米浴 Rice Shower (Umamusume Pretty Derby)', 'Japanese', 1, False],
119
- ['私の処女をもらっでください!', '米浴 Rice Shower (Umamusume Pretty Derby)', 'Japanese', 1, False]],
120
  "type": "onnx"
121
  },
122
- {
123
- "title": "Trilingual",
124
- "languages": ['日本語', '简体中文', 'English', 'Mix'],
125
- "description": "",
126
- "model_path": "./pretrained_models/G_1396000.pth",
127
- "config_path": "./configs/uma_trilingual.json",
128
- "examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
129
- ['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English', 1, False],
130
- ['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False]],
131
- "type": "torch"
132
- }
133
  ]
134
 
135
-
136
-
137
  if __name__ == "__main__":
138
  parser = argparse.ArgumentParser()
139
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
@@ -145,6 +163,7 @@ if __name__ == "__main__":
145
  config_path = info['config_path']
146
  model_path = info['model_path']
147
  type = info['type']
 
148
  hps = utils.get_hparams_from_file(config_path)
149
  if type == "onnx":
150
  model = ONNXVITS_infer.SynthesizerTrn(
@@ -164,26 +183,30 @@ if __name__ == "__main__":
164
  model.eval()
165
  speaker_ids = hps.speakers
166
  speakers = list(hps.speakers.keys())
167
- models_tts.append((name, speakers, lang, examples,
168
  hps.symbols, create_tts_fn(model, hps, speaker_ids),
169
  create_to_symbol_fn(hps)))
170
- models_vc.append((name, speakers, create_vc_fn(model, hps, speaker_ids)))
171
  app = gr.Blocks()
172
  with app:
173
  gr.Markdown("# English & Chinese & Japanese Anime TTS\n\n"
174
  "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
175
- "Including Japanese TTS & Trilingual TTS, speakers are all anime characters. 包含一个纯日语TTS和一个中日英三语TTS模型,主要为二次元角色。"
176
  "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
177
  "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
178
  )
179
  with gr.Tabs():
180
  with gr.TabItem("TTS"):
181
  with gr.Tabs():
182
- for i, (name, speakers, lang, example, symbols, tts_fn, to_symbol_fn) in enumerate(models_tts):
 
183
  with gr.TabItem(name):
 
184
  with gr.Row():
185
  with gr.Column():
186
- textbox = gr.TextArea(label="Text", placeholder="Type your sentence here (Maximum 150 words)", value="こんにちわ。", elem_id=f"tts-input")
 
 
187
  with gr.Accordion(label="Phoneme Input", open=False):
188
  temp_text_var = gr.Variable()
189
  symbol_input = gr.Checkbox(value=False, label="Symbol input")
@@ -212,21 +235,24 @@ if __name__ == "__main__":
212
  text_input.selectionEnd = startPos + symbols[i].length;
213
  text_input.blur();
214
  window.scrollTo(x, y);
215
-
216
  text = text_input.value;
217
-
218
  return text;
219
  }}""")
220
  # select character
221
  char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
222
  language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
223
- duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1, label='速度 Speed')
 
224
  with gr.Column():
225
  text_output = gr.Textbox(label="Message")
226
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
227
  btn = gr.Button("Generate!")
228
- btn.click(tts_fn, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, symbol_input],
229
- outputs=[text_output, audio_output])
 
 
230
  gr.Examples(
231
  examples=example,
232
  inputs=[textbox, char_dropdown, language_dropdown,
 
4
  import re
5
  import tempfile
6
  import logging
7
+
8
  logging.getLogger('numba').setLevel(logging.WARNING)
9
  import librosa
10
  import numpy as np
 
23
  import psutil
24
  from datetime import datetime
25
 
 
26
  language_marks = {
27
  "Japanese": "",
28
  "日本語": "[JA]",
 
32
  }
33
 
34
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
35
+
36
+
37
  def create_tts_fn(model, hps, speaker_ids):
38
  def tts_fn(text, speaker, language, speed, is_symbol):
39
  if limitation:
 
58
 
59
  return tts_fn
60
 
61
+
62
  def create_vc_fn(model, hps, speaker_ids):
63
  def vc_fn(original_speaker, target_speaker, input_audio):
64
  if input_audio is None:
 
91
 
92
  return vc_fn
93
 
94
+
95
  def get_text(text, hps, is_symbol):
96
  text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
97
  if hps.data.add_blank:
 
99
  text_norm = LongTensor(text_norm)
100
  return text_norm
101
 
102
+
103
  def create_to_symbol_fn(hps):
104
  def to_symbol_fn(is_symbol_input, input_text, temp_text):
105
  return (_clean_text(input_text, hps.data.text_cleaners), input_text) if is_symbol_input \
 
107
 
108
  return to_symbol_fn
109
 
110
+
111
  models_tts = []
112
  models_vc = []
113
  models_info = [
114
+ {
115
+ "title": "Trilingual",
116
+ "languages": ['日本語', '简体中文', 'English', 'Mix'],
117
+ "description": """
118
+ This model is trained on a mix up of Umamusume, Genshin Impact, Sanoba Witch & VCTK voice data to learn multilanguage.
119
+ All characters can speak English, Chinese & Japanese.\n\n
120
+ To mix multiple languages in a single sentence, wrap the corresponding part with language tokens
121
+ ([JA] for Japanese, [ZH] for Chinese, [EN] for English), as shown in the examples.\n\n
122
+ 这个模型在赛马娘,原神,魔女的夜宴以及VCTK数据集上混合训练以学习多种语言。
123
+ 所有角色均可说中日英三语。\n\n
124
+ 若需要在同一个句子中混合多种语言,使用相应的语言标记包裹句子。
125
+ (日语用[JA], 中文用[ZH], 英文用[EN]),参考Examples中的示例。
126
+ """,
127
+ "model_path": "./pretrained_models/G_1396000.pth",
128
+ "config_path": "./configs/uma_trilingual.json",
129
+ "examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
130
+ ['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English',
131
+ 1, False],
132
+ ['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False],
133
+ ['[JA]こんにちわ。[JA][ZH]你好![ZH][EN]Hello![EN]', '綾地 寧々 Ayachi Nene (Sanoba Witch)', 'Mix', 1, False]],
134
+ "type": "torch"
135
+ },
136
  {
137
  "title": "Japanese",
138
  "languages": ["Japanese"],
139
+ "description": """
140
+ This model contains 87 characters from Umamusume: Pretty Derby, Japanese only.\n\n
141
+ 这个模型包含赛马娘的所有87名角色,只能合成日语。
142
+ """,
143
  "model_path": "./pretrained_models/G_1153000.pth",
144
  "config_path": "./configs/uma87.json",
145
  "examples": [['お疲れ様です,トレーナーさん。', '无声铃鹿 Silence Suzuka (Umamusume Pretty Derby)', 'Japanese', 1, False],
146
+ ['張り切っていこう!', '北部玄驹 Kitasan Black (Umamusume Pretty Derby)', 'Japanese', 1, False],
147
+ ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', 'Japanese', 1, False],
148
+ ['授業中に出しだら,学校生活終わるですわ。', '目白麦昆 Mejiro Mcqueen (Umamusume Pretty Derby)', 'Japanese', 1, False],
149
+ ['お帰りなさい,お兄様!', '米浴 Rice Shower (Umamusume Pretty Derby)', 'Japanese', 1, False],
150
+ ['私の処女をもらっでください!', '米浴 Rice Shower (Umamusume Pretty Derby)', 'Japanese', 1, False]],
151
  "type": "onnx"
152
  },
 
 
 
 
 
 
 
 
 
 
 
153
  ]
154
 
 
 
155
  if __name__ == "__main__":
156
  parser = argparse.ArgumentParser()
157
  parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
 
163
  config_path = info['config_path']
164
  model_path = info['model_path']
165
  type = info['type']
166
+ description = info['description']
167
  hps = utils.get_hparams_from_file(config_path)
168
  if type == "onnx":
169
  model = ONNXVITS_infer.SynthesizerTrn(
 
183
  model.eval()
184
  speaker_ids = hps.speakers
185
  speakers = list(hps.speakers.keys())
186
+ models_tts.append((name, description, speakers, lang, examples,
187
  hps.symbols, create_tts_fn(model, hps, speaker_ids),
188
  create_to_symbol_fn(hps)))
189
+ models_vc.append((name, description, speakers, create_vc_fn(model, hps, speaker_ids)))
190
  app = gr.Blocks()
191
  with app:
192
  gr.Markdown("# English & Chinese & Japanese Anime TTS\n\n"
193
  "![visitor badge](https://visitor-badge.glitch.me/badge?page_id=Plachta.VITS-Umamusume-voice-synthesizer)\n\n"
194
+ "Including Japanese TTS & Trilingual TTS, speakers are all anime characters. \n\n包含一个纯日语TTS和一个中日英三语TTS模型,主要为二次元角色。\n\n"
195
  "If you have any suggestions or bug reports, feel free to open discussion in [Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions).\n\n"
196
  "若有bug反馈或建议,请在[Community](https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/discussions)下开启一个新的Discussion。 \n\n"
197
  )
198
  with gr.Tabs():
199
  with gr.TabItem("TTS"):
200
  with gr.Tabs():
201
+ for i, (name, description, speakers, lang, example, symbols, tts_fn, to_symbol_fn) in enumerate(
202
+ models_tts):
203
  with gr.TabItem(name):
204
+ gr.Markdown(description)
205
  with gr.Row():
206
  with gr.Column():
207
+ textbox = gr.TextArea(label="Text",
208
+ placeholder="Type your sentence here (Maximum 150 words)",
209
+ value="こんにちわ。", elem_id=f"tts-input")
210
  with gr.Accordion(label="Phoneme Input", open=False):
211
  temp_text_var = gr.Variable()
212
  symbol_input = gr.Checkbox(value=False, label="Symbol input")
 
235
  text_input.selectionEnd = startPos + symbols[i].length;
236
  text_input.blur();
237
  window.scrollTo(x, y);
238
+
239
  text = text_input.value;
240
+
241
  return text;
242
  }}""")
243
  # select character
244
  char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
245
  language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
246
+ duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
247
+ label='速度 Speed')
248
  with gr.Column():
249
  text_output = gr.Textbox(label="Message")
250
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
251
  btn = gr.Button("Generate!")
252
+ btn.click(tts_fn,
253
+ inputs=[textbox, char_dropdown, language_dropdown, duration_slider,
254
+ symbol_input],
255
+ outputs=[text_output, audio_output])
256
  gr.Examples(
257
  examples=example,
258
  inputs=[textbox, char_dropdown, language_dropdown,