Plachta committed on
Commit
372afc3
1 Parent(s): 9473c98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -55
app.py CHANGED
@@ -14,40 +14,23 @@ import utils
14
  import gradio as gr
15
  import gradio.utils as gr_utils
16
  import gradio.processing_utils as gr_processing_utils
17
- from ONNXVITS_infer import SynthesizerTrn
 
18
  from text import text_to_sequence, _clean_text
19
  from text.symbols import symbols
20
  from mel_processing import spectrogram_torch
21
  import psutil
22
  from datetime import datetime
23
 
24
- def audio_postprocess(self, y):
25
- if y is None:
26
- return None
27
-
28
- if gr_utils.validate_url(y):
29
- file = gr_processing_utils.download_to_file(y, dir=self.temp_dir)
30
- elif isinstance(y, tuple):
31
- sample_rate, data = y
32
- file = tempfile.NamedTemporaryFile(
33
- suffix=".wav", dir=self.temp_dir, delete=False
34
- )
35
- gr_processing_utils.audio_to_file(sample_rate, data, file.name)
36
- else:
37
- file = gr_processing_utils.create_tmp_copy_of_file(y, dir=self.temp_dir)
38
-
39
- return gr_processing_utils.encode_url_or_file_to_base64(file.name)
40
-
41
 
42
  language_marks = {
 
43
  "日本語": "[JA]",
44
  "简体中文": "[ZH]",
45
  "English": "[EN]",
46
  "Mix": "",
47
  }
48
 
49
- gr.Audio.postprocess = audio_postprocess
50
-
51
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
52
  def create_tts_fn(model, hps, speaker_ids):
53
  def tts_fn(text, speaker, language, speed, is_symbol):
@@ -94,10 +77,10 @@ def create_vc_fn(model, hps, speaker_ids):
94
  y = y.unsqueeze(0)
95
  spec = spectrogram_torch(y, hps.data.filter_length,
96
  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
97
- center=False).to(device)
98
- spec_lengths = LongTensor([spec.size(-1)]).to(device)
99
- sid_src = LongTensor([original_speaker_id]).to(device)
100
- sid_tgt = LongTensor([target_speaker_id]).to(device)
101
  audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
102
  0, 0].data.cpu().float().numpy()
103
  del y, spec, spec_lengths, sid_src, sid_tgt
@@ -119,30 +102,12 @@ def create_to_symbol_fn(hps):
119
 
120
  return to_symbol_fn
121
 
122
- download_audio_js = """
123
- () =>{{
124
- let root = document.querySelector("body > gradio-app");
125
- if (root.shadowRoot != null)
126
- root = root.shadowRoot;
127
- let audio = root.querySelector("#{audio_id}").querySelector("audio");
128
- if (audio == undefined)
129
- return;
130
- audio = audio.src;
131
- let oA = document.createElement("a");
132
- oA.download = Math.floor(Math.random()*100000000)+'.wav';
133
- oA.href = audio;
134
- document.body.appendChild(oA);
135
- oA.click();
136
- oA.remove();
137
- }}
138
- """
139
-
140
  models_tts = []
141
  models_vc = []
142
  models_info = [
143
  {
144
  "title": "Japanese",
145
- "languages": ["日本語"],
146
  "description": "",
147
  "model_path": "./pretrained_models/G_1153000.pth",
148
  "config_path": "./configs/uma87.json",
@@ -151,10 +116,11 @@ models_info = [
151
  ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', 'Grass Wonder', '日本語', 1, False],
152
  ['授業中に出しだら,学校生活終わるですわ。', 'Mejiro Mcqueen', '日本語', 1, False],
153
  ['お帰りなさい,お兄様!', 'Rice Shower', '日本語', 1, False],
154
- ['私の処女をもらっでください!', 'Rice Shower', '日本語', 1, False]]
 
155
  },
156
  {
157
- "title": "Japanese",
158
  "languages": ['日本語', '简体中文', 'English', 'Mix'],
159
  "description": "",
160
  "model_path": "./pretrained_models/G_1396000.pth",
@@ -162,6 +128,7 @@ models_info = [
162
  "examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
163
  ['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English', 1, False],
164
  ['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False]]
 
165
  }
166
  ]
167
 
@@ -177,18 +144,27 @@ if __name__ == "__main__":
177
  examples = info['examples']
178
  config_path = info['config_path']
179
  model_path = info['model_path']
 
180
  hps = utils.get_hparams_from_file(config_path)
181
- model = SynthesizerTrn(
182
- len(hps.symbols),
183
- hps.data.filter_length // 2 + 1,
184
- hps.train.segment_size // hps.data.hop_length,
185
- n_speakers=hps.data.n_speakers,
186
- **hps.model)
 
 
 
 
 
 
 
 
187
  utils.load_checkpoint(model_path, model, None)
188
  model.eval()
189
  speaker_ids = hps.speakers
190
  speakers = list(hps.speakers.keys())
191
- models_tts.append((name, speakers, lang, example,
192
  hps.symbols, create_tts_fn(model, hps, speaker_ids),
193
  create_to_symbol_fn(hps)))
194
  models_vc.append((name, speakers, create_vc_fn(model, hps, speaker_ids)))
@@ -250,10 +226,8 @@ if __name__ == "__main__":
250
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
251
  btn = gr.Button("Generate!")
252
 
253
- download = gr.Button("Download Audio")
254
- download.click(None, [], [], _js=download_audio_js.format(audio_id="tts-audio"))
255
  if len(lang) == 1:
256
- btn.click(tts_fn, inputs=[textbox, char_dropdown, None, duration_slider, symbol_input],
257
  outputs=[text_output, audio_output])
258
  else:
259
  btn.click(tts_fn, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, symbol_input],
 
14
  import gradio as gr
15
  import gradio.utils as gr_utils
16
  import gradio.processing_utils as gr_processing_utils
17
+ import ONNXVITS_infer
18
+ import models
19
  from text import text_to_sequence, _clean_text
20
  from text.symbols import symbols
21
  from mel_processing import spectrogram_torch
22
  import psutil
23
  from datetime import datetime
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  language_marks = {
27
+ "Japanese": "",
28
  "日本語": "[JA]",
29
  "简体中文": "[ZH]",
30
  "English": "[EN]",
31
  "Mix": "",
32
  }
33
 
 
 
34
  limitation = os.getenv("SYSTEM") == "spaces" # limit text and audio length in huggingface spaces
35
  def create_tts_fn(model, hps, speaker_ids):
36
  def tts_fn(text, speaker, language, speed, is_symbol):
 
77
  y = y.unsqueeze(0)
78
  spec = spectrogram_torch(y, hps.data.filter_length,
79
  hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
80
+ center=False)
81
+ spec_lengths = LongTensor([spec.size(-1)])
82
+ sid_src = LongTensor([original_speaker_id])
83
+ sid_tgt = LongTensor([target_speaker_id])
84
  audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
85
  0, 0].data.cpu().float().numpy()
86
  del y, spec, spec_lengths, sid_src, sid_tgt
 
102
 
103
  return to_symbol_fn
104
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  models_tts = []
106
  models_vc = []
107
  models_info = [
108
  {
109
  "title": "Japanese",
110
+ "languages": ["Japanese"],
111
  "description": "",
112
  "model_path": "./pretrained_models/G_1153000.pth",
113
  "config_path": "./configs/uma87.json",
 
116
  ['何でこんなに慣れでんのよ,私のほが先に好きだっだのに。', 'Grass Wonder', '日本語', 1, False],
117
  ['授業中に出しだら,学校生活終わるですわ。', 'Mejiro Mcqueen', '日本語', 1, False],
118
  ['お帰りなさい,お兄様!', 'Rice Shower', '日本語', 1, False],
119
+ ['私の処女をもらっでください!', 'Rice Shower', '日本語', 1, False]],
120
+ "type": "onnx"
121
  },
122
  {
123
+ "title": "Trilingual",
124
  "languages": ['日本語', '简体中文', 'English', 'Mix'],
125
  "description": "",
126
  "model_path": "./pretrained_models/G_1396000.pth",
 
128
  "examples": [['你好,训练员先生,很高兴见到你。', '草上飞 Grass Wonder (Umamusume Pretty Derby)', '简体中文', 1, False],
129
  ['To be honest, I have no idea what to say as examples.', '派蒙 Paimon (Genshin Impact)', 'English', 1, False],
130
  ['授業中に出しだら,学校生活終わるですわ。', '綾地 寧々 Ayachi Nene (Sanoba Witch)', '日本語', 1, False]],
131
+ "type": "torch"
132
  }
133
  ]
134
 
 
144
  examples = info['examples']
145
  config_path = info['config_path']
146
  model_path = info['model_path']
147
+ type = info['type']
148
  hps = utils.get_hparams_from_file(config_path)
149
+ if type == "onnx":
150
+ model = ONNXVITS_infer.SynthesizerTrn(
151
+ len(hps.symbols),
152
+ hps.data.filter_length // 2 + 1,
153
+ hps.train.segment_size // hps.data.hop_length,
154
+ n_speakers=hps.data.n_speakers,
155
+ **hps.model)
156
+ else:
157
+ model = models.SynthesizerTrn(
158
+ len(hps.symbols),
159
+ hps.data.filter_length // 2 + 1,
160
+ hps.train.segment_size // hps.data.hop_length,
161
+ n_speakers=hps.data.n_speakers,
162
+ **hps.model)
163
  utils.load_checkpoint(model_path, model, None)
164
  model.eval()
165
  speaker_ids = hps.speakers
166
  speakers = list(hps.speakers.keys())
167
+ models_tts.append((name, speakers, lang, examples,
168
  hps.symbols, create_tts_fn(model, hps, speaker_ids),
169
  create_to_symbol_fn(hps)))
170
  models_vc.append((name, speakers, create_vc_fn(model, hps, speaker_ids)))
 
226
  audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
227
  btn = gr.Button("Generate!")
228
 
 
 
229
  if len(lang) == 1:
230
+ btn.click(tts_fn, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, symbol_input],
231
  outputs=[text_output, audio_output])
232
  else:
233
  btn.click(tts_fn, inputs=[textbox, char_dropdown, language_dropdown, duration_slider, symbol_input],