xj committed on
Commit 700e801
1 Parent(s): 1fdb3a8

[feat] modify layout

Files changed (1)
  1. app.py +43 -89
app.py CHANGED
@@ -7,7 +7,7 @@ logging.getLogger('numba').setLevel(logging.WARNING)
 logging.basicConfig(
     format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S",
-    level=os.environ.get("LOGLEVEL", "DEBUG").upper(),
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),
     stream=sys.stdout,
 )
 logger = logging.getLogger("APP")
@@ -43,15 +43,16 @@ def get_text(text, hps):
     return text_norm, clean_text
 
 def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale):
+    print(text, language, speaker_id, noise_scale, noise_scale_w, length_scale)
     start = time.perf_counter()
     if not len(text):
         return "输入文本不能为空!", None, None
     text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
     if len(text) > 200 and limitation:
         return f"输入文字过长!{len(text)}>100", None, None
-    if language == 0:
+    if language == "中文":
         text = f"[ZH]{text}[ZH]"
-    elif language == 1:
+    elif language == "日文":
         text = f"[JA]{text}[JA]"
     else:
         text = f"{text}"
@@ -63,10 +64,7 @@ def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale):
     audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=speaker_id, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                            length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
     logger.info("gen: " + text[:100])
-    with os.popen('free') as f:
-        logger.info(f"\n{f.read()}")
-
-    return "生成成功!", (22050, audio), f"生成耗时 {round(time.perf_counter()-start, 2)} s"
+    return (22050, audio)
 
 def search_speaker(search_value):
     for s in speakers:
@@ -76,88 +74,44 @@ def search_speaker(search_value):
         if search_value in s:
             return s
 
-def change_lang(language):
-    if language == 0:
-        return 0.6, 0.668, 1.2
-    else:
-        return 0.6, 0.668, 1.1
 
-download_audio_js = """
-() =>{{
-    let root = document.querySelector("body > gradio-app");
-    if (root.shadowRoot != null)
-        root = root.shadowRoot;
-    let audio = root.querySelector("#tts-audio").querySelector("audio");
-    let text = root.querySelector("#input-text").querySelector("textarea");
-    if (audio == undefined)
-        return;
-    text = text.value;
-    if (text == undefined)
-        text = Math.floor(Math.random()*100000000);
-    audio = audio.src;
-    let oA = document.createElement("a");
-    oA.download = text.substr(0, 20)+'.wav';
-    oA.href = audio;
-    document.body.appendChild(oA);
-    oA.click();
-    oA.remove();
-}}
-"""
+parser = argparse.ArgumentParser()
+parser.add_argument('--device', type=str, default='cpu')
+args = parser.parse_args()
+device = torch.device(args.device)
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--device', type=str, default='cpu')
-    parser.add_argument('--api', action="store_true", default=True)
-    parser.add_argument("--share", action="store_true", default=False, help="share gradio app")
-    parser.add_argument("--colab", action="store_true", default=False, help="share gradio app")
-    args = parser.parse_args()
-    device = torch.device(args.device)
-
-    hps_ms = utils.get_hparams_from_file(r'./model/config.json')
-    net_g_ms = SynthesizerTrn(
-        len(hps_ms.symbols),
-        hps_ms.data.filter_length // 2 + 1,
-        hps_ms.train.segment_size // hps_ms.data.hop_length,
-        n_speakers=hps_ms.data.n_speakers,
-        **hps_ms.model)
-    _ = net_g_ms.eval().to(device)
-    speakers = hps_ms.speakers
-    speakers = [f"{i}.{s}" for i, s in enumerate(speakers)]
-    model, optimizer, learning_rate, epochs = utils.load_checkpoint(r'./model/G_953000.pth', net_g_ms, None)
-
-    with gr.Blocks() as app:
-        gr.Markdown(
-            "# <center> VITS语音在线合成\n"
-        )
+hps_ms = utils.get_hparams_from_file(r'./model/config.json')
+net_g_ms = SynthesizerTrn(
+    len(hps_ms.symbols),
+    hps_ms.data.filter_length // 2 + 1,
+    hps_ms.train.segment_size // hps_ms.data.hop_length,
+    n_speakers=hps_ms.data.n_speakers,
+    **hps_ms.model)
+_ = net_g_ms.eval().to(device)
+speakers = hps_ms.speakers
+speakers = [f"{i}.{s}" for i, s in enumerate(speakers)]
+model, optimizer, learning_rate, epochs = utils.load_checkpoint(r'./model/G_953000.pth', net_g_ms, None)
+
+demo = gr.Interface(
+    fn=vits,
+    inputs=[
+        gr.Textbox(label="Text (200 words limitation)", lines=5, value="可莉不知道哦!", elem_id=f"input-text"),
+        gr.Radio(label="language", choices=["中文", "日语", "中日混合(中文用[ZH][ZH]包裹起来,日文用[JA][JA]包裹起来)"], value="中文"),
+        gr.Dropdown(label="Speaker", choices=speakers, type="index", value=speakers[329]),
+        gr.Slider(label="noise_scale (控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.1, interactive=True),
+        gr.Slider(label="noise_scale_w (控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.7, interactive=True),
+        gr.Slider(label="length_scale (控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True),
+    ],
+    outputs=gr.Audio(label="Output Audio", elem_id=f"tts-audio"),
+    examples=[
+        ["可莉不知道哦!", "中文", speakers[329], 0.1, 0.6, 1.2],
+        ["该做什么好呢?", "中文", speakers[104], 0.1, 0.8, 1.2],
+        ["我给你讲个故事吧!", "中文", speakers[122], 0.1, 0.8, 1.2],
+    ],
+    title="VITS Genshin",
+    description="",
+)
 
-        with gr.Tabs():
-            with gr.TabItem("vits"):
-                with gr.Row():
-                    with gr.Column():
-                        input_text = gr.Textbox(label="Text (200 words limitation) " if limitation else "Text", lines=5, value="可莉不知道喔。", elem_id=f"input-text")
-                        btn = gr.Button(value="Submit")
-                        with gr.Row():
-                            lang = gr.Dropdown(label="Language", choices=["中文", "日语", "中日混合(中文用[ZH][ZH]包裹起来,日文用[JA][JA]包裹起来)"],
-                                               type="index", value="中文")
-                            sid = gr.Dropdown(label="Speaker", choices=speakers, type="index", value=speakers[329])
-                        with gr.Row():
-                            ns = gr.Slider(label="noise_scale(控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.1, interactive=True)
-                            nsw = gr.Slider(label="noise_scale_w(控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.668, interactive=True)
-                            ls = gr.Slider(label="length_scale(控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True)
-                        with gr.Row():
-                            search = gr.Textbox(label="Search Speaker", lines=1)
-                            btn2 = gr.Button(value="Search")
-                    with gr.Column():
-                        o1 = gr.Textbox(label="Output Message")
-                        o2 = gr.Audio(label="Output Audio", elem_id=f"tts-audio")
-                        o3 = gr.Textbox(label="Extra Info")
-                        download = gr.Button("Download Audio")
-                btn.click(vits, inputs=[input_text, lang, sid, ns, nsw, ls], outputs=[o1, o2, o3])
-                download.click(None, [], [], _js=download_audio_js.format())
-                btn2.click(search_speaker, inputs=[search], outputs=[sid])
-                lang.change(change_lang, inputs=[lang], outputs=[ns, nsw, ls])
-            with gr.TabItem("可用人物一览"):
-                gr.Radio(label="Speaker", choices=speakers, interactive=False, type="index")
-    if args.colab:
-        webbrowser.open("http://127.0.0.1:7860")
-    app.queue(concurrency_count=1, api_open=args.api).launch(share=args.share)
+if __name__ == "__main__":
+    demo.queue(concurrency_count=1)
+    demo.launch()
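
A minimal smoke-test sketch of the reworked vits() signature after this commit. It assumes app.py's globals (net_g_ms, hps_ms, speakers, device) have been initialised exactly as in the new module-level code above; the inputs simply mirror the defaults of the new gr.Interface (speaker index 329, noise_scale 0.1, noise_scale_w 0.7, length_scale 1.2).

    # Hypothetical local check; relies on app.py's model and speaker list being loaded as above.
    sr, wav = vits("可莉不知道哦!", "中文", 329, 0.1, 0.7, 1.2)
    assert sr == 22050           # vits() now returns a single (sample_rate, waveform) tuple
    print(wav.shape, wav.dtype)  # 1-D float32 numpy array, passed straight to the gr.Audio output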