Spaces:
Runtime error
Runtime error
xj
committed on
Commit
•
8f95475
1
Parent(s):
700e801
[bug] 修复了bugs
Browse files
README.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
---
|
2 |
title: Vits Genshin
|
3 |
-
emoji:
|
4 |
colorFrom: gray
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.28.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
1 |
---
|
2 |
title: Vits Genshin
|
3 |
+
emoji: 🐰
|
4 |
colorFrom: gray
|
5 |
colorTo: indigo
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.28.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
app.py
CHANGED
@@ -43,13 +43,12 @@ def get_text(text, hps):
|
|
43 |
return text_norm, clean_text
|
44 |
|
45 |
def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale):
|
46 |
-
print(text, language, speaker_id, noise_scale, noise_scale_w, length_scale)
|
47 |
start = time.perf_counter()
|
48 |
if not len(text):
|
49 |
return "输入文本不能为空!", None, None
|
50 |
text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
|
51 |
if len(text) > 200 and limitation:
|
52 |
-
return f"输入文字过长!{len(text)}>
|
53 |
if language == "中文":
|
54 |
text = f"[ZH]{text}[ZH]"
|
55 |
elif language == "日文":
|
@@ -63,7 +62,7 @@ def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale):
|
|
63 |
speaker_id = LongTensor([speaker_id]).to(device)
|
64 |
audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=speaker_id, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
|
65 |
length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
66 |
-
logger.info("gen:
|
67 |
return (22050, audio)
|
68 |
|
69 |
def search_speaker(search_value):
|
@@ -75,43 +74,43 @@ def search_speaker(search_value):
|
|
75 |
return s
|
76 |
|
77 |
|
78 |
-
|
79 |
-
parser
|
80 |
-
|
81 |
-
|
|
|
82 |
|
83 |
-
hps_ms = utils.get_hparams_from_file(r'./model/config.json')
|
84 |
-
net_g_ms = SynthesizerTrn(
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
_ = net_g_ms.eval().to(device)
|
91 |
-
speakers = hps_ms.speakers
|
92 |
-
speakers = [f"{i}.{s}" for i, s in enumerate(speakers)]
|
93 |
-
model, optimizer, learning_rate, epochs = utils.load_checkpoint(r'./model/G_953000.pth', net_g_ms, None)
|
94 |
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
)
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
demo.launch()
|
|
|
43 |
return text_norm, clean_text
|
44 |
|
45 |
def vits(text, language, speaker_id, noise_scale, noise_scale_w, length_scale):
|
|
|
46 |
start = time.perf_counter()
|
47 |
if not len(text):
|
48 |
return "输入文本不能为空!", None, None
|
49 |
text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
|
50 |
if len(text) > 200 and limitation:
|
51 |
+
return f"输入文字过长!{len(text)}>200", None, None
|
52 |
if language == "中文":
|
53 |
text = f"[ZH]{text}[ZH]"
|
54 |
elif language == "日文":
|
|
|
62 |
speaker_id = LongTensor([speaker_id]).to(device)
|
63 |
audio = net_g_ms.infer(x_tst, x_tst_lengths, sid=speaker_id, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
|
64 |
length_scale=length_scale)[0][0, 0].data.cpu().float().numpy()
|
65 |
+
logger.info(f"gen: {(text[:100], language, speaker_id, noise_scale, noise_scale_w, length_scale)}")
|
66 |
return (22050, audio)
|
67 |
|
68 |
def search_speaker(search_value):
|
|
|
74 |
return s
|
75 |
|
76 |
|
77 |
+
if __name__ == "__main__":
|
78 |
+
parser = argparse.ArgumentParser()
|
79 |
+
parser.add_argument('--device', type=str, default='cpu')
|
80 |
+
args = parser.parse_args()
|
81 |
+
device = torch.device(args.device)
|
82 |
|
83 |
+
hps_ms = utils.get_hparams_from_file(r'./model/config.json')
|
84 |
+
net_g_ms = SynthesizerTrn(
|
85 |
+
len(hps_ms.symbols),
|
86 |
+
hps_ms.data.filter_length // 2 + 1,
|
87 |
+
hps_ms.train.segment_size // hps_ms.data.hop_length,
|
88 |
+
n_speakers=hps_ms.data.n_speakers,
|
89 |
+
**hps_ms.model)
|
90 |
+
_ = net_g_ms.eval().to(device)
|
91 |
+
speakers = hps_ms.speakers
|
92 |
+
speakers = [f"{i}.{s}" for i, s in enumerate(speakers)]
|
93 |
+
model, optimizer, learning_rate, epochs = utils.load_checkpoint(r'./model/G_953000.pth', net_g_ms, None)
|
94 |
|
95 |
+
app = gr.Interface(
|
96 |
+
fn=vits,
|
97 |
+
inputs=[
|
98 |
+
gr.Textbox(label="Text (200 words limitation)", lines=5, value="可莉不知道哦!", elem_id=f"input-text"),
|
99 |
+
gr.Radio(label="language", choices=["中文", "日语", "中日混合(中文用[ZH][ZH]包裹起来,日文用[JA][JA]包裹起来)"], value="中文"),
|
100 |
+
gr.Dropdown(label="Speaker", choices=speakers, type="index", value=speakers[329]),
|
101 |
+
gr.Slider(label="noise_scale (控制感情变化程度)", minimum=0.1, maximum=1.0, step=0.1, value=0.1, interactive=True),
|
102 |
+
gr.Slider(label="noise_scale_w (控制音素发音长度)", minimum=0.1, maximum=1.0, step=0.1, value=0.7, interactive=True),
|
103 |
+
gr.Slider(label="length_scale (控制整体语速)", minimum=0.1, maximum=2.0, step=0.1, value=1.2, interactive=True),
|
104 |
+
],
|
105 |
+
outputs=gr.Audio(label="Output Audio", elem_id=f"tts-audio"),
|
106 |
+
examples=[
|
107 |
+
["可莉不知道哦!", "中文", speakers[329], 0.1, 0.6, 1.2],
|
108 |
+
["该做什么好呢?", "中文", speakers[104], 0.1, 0.8, 1.2],
|
109 |
+
["我给你讲个故事吧!", "中文", speakers[122], 0.1, 0.8, 1.2],
|
110 |
+
],
|
111 |
+
title="VITS Genshin",
|
112 |
+
description="",
|
113 |
+
)
|
114 |
|
115 |
+
app.queue(concurrency_count=1)
|
116 |
+
app.launch()
|
|