update app.py
Browse files
app.py
CHANGED
@@ -11,7 +11,16 @@ from espnet2.bin.svs_inference import SingingGenerate
|
|
11 |
|
12 |
|
13 |
singer_embeddings = {
|
14 |
-
"singer1 (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
}
|
16 |
|
17 |
langs = {
|
@@ -19,12 +28,13 @@ langs = {
|
|
19 |
"jp": 1,
|
20 |
}
|
21 |
|
22 |
-
def gen_song(lang,
|
23 |
fs = 44100
|
24 |
-
|
|
|
25 |
# pretrain_downloaded = {
|
26 |
-
# "train_config": "/data7/tyx/
|
27 |
-
# "model_file": "/data7/tyx/
|
28 |
# }
|
29 |
if texts is None:
|
30 |
return (fs, np.array([0.0])), "Error: No Text provided!"
|
@@ -90,14 +100,14 @@ def gen_song(lang, tempo, texts, durs, pitchs, spk):
|
|
90 |
),
|
91 |
"text": phns_str,
|
92 |
}
|
93 |
-
print(batch)
|
94 |
-
return (fs, np.array([0.0])), "success!"
|
95 |
|
96 |
# Infer
|
97 |
device = "cpu"
|
98 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
99 |
-
|
100 |
-
|
101 |
svs = SingingGenerate(
|
102 |
train_config = pretrain_downloaded["train_config"],
|
103 |
model_file = pretrain_downloaded["model_file"],
|
@@ -118,13 +128,28 @@ description = """
|
|
118 |
<div style="font-size: 20px;">
|
119 |
<p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
|
120 |
<p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
|
121 |
-
Music score contains information about
|
122 |
|
123 |
<p>How to use:</p>
|
124 |
<ol>
|
125 |
<li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
|
126 |
-
<li> <b>Input
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
<li> <b>Choose one singer</b> </li>
|
129 |
<li> <b>Click submit button</b> </li>
|
130 |
</ol>
|
@@ -138,7 +163,7 @@ article = """
|
|
138 |
|
139 |
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
|
140 |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
|
141 |
-
<a href="https://huggingface.co/espnet/
|
142 |
|
143 |
<pre>
|
144 |
@inproceedings{wu2024muskits,
|
@@ -155,26 +180,35 @@ article = """
|
|
155 |
|
156 |
# SP: silence, AP: aspirate.
|
157 |
examples = [
|
158 |
-
["zh",
|
|
|
159 |
# ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (male)"],
|
160 |
# ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
|
161 |
-
["jp",
|
162 |
]
|
163 |
|
164 |
app = gr.Interface(
|
165 |
fn=gen_song,
|
166 |
inputs=[
|
167 |
gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
|
168 |
-
gr.Textbox(label="
|
169 |
-
gr.Textbox(label="Text"),
|
170 |
gr.Textbox(label="Duration"),
|
171 |
gr.Textbox(label="Pitch"),
|
172 |
gr.Radio(
|
173 |
label="Singer",
|
174 |
choices=[
|
175 |
-
"singer1 (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
],
|
177 |
-
value="singer1 (
|
178 |
),
|
179 |
],
|
180 |
outputs=[
|
|
|
11 |
|
12 |
|
13 |
singer_embeddings = {
|
14 |
+
"singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
|
15 |
+
"singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
|
16 |
+
"singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
|
17 |
+
"singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
|
18 |
+
"singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
|
19 |
+
"singer6 (female)": "resource/singer/singer_embedding_itako.npy",
|
20 |
+
"singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
|
21 |
+
"singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
|
22 |
+
"singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
|
23 |
+
"singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
|
24 |
}
|
25 |
|
26 |
langs = {
|
|
|
28 |
"jp": 1,
|
29 |
}
|
30 |
|
31 |
+
def gen_song(lang, texts, durs, pitchs, spk):
|
32 |
fs = 44100
|
33 |
+
tempo = 120
|
34 |
+
PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"
|
35 |
# pretrain_downloaded = {
|
36 |
+
# "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml",
|
37 |
+
# "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth",
|
38 |
# }
|
39 |
if texts is None:
|
40 |
return (fs, np.array([0.0])), "Error: No Text provided!"
|
|
|
100 |
),
|
101 |
"text": phns_str,
|
102 |
}
|
103 |
+
# print(batch)
|
104 |
+
# return (fs, np.array([0.0])), "success!"
|
105 |
|
106 |
# Infer
|
107 |
device = "cpu"
|
108 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
109 |
+
d = ModelDownloader()
|
110 |
+
pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL)
|
111 |
svs = SingingGenerate(
|
112 |
train_config = pretrain_downloaded["train_config"],
|
113 |
model_file = pretrain_downloaded["model_file"],
|
|
|
128 |
<div style="font-size: 20px;">
|
129 |
<p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
|
130 |
<p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
|
131 |
+
The music score contains information about the lyrics, as well as the duration and pitch of each word in the lyrics.</p>
|
132 |
|
133 |
<p>How to use:</p>
|
134 |
<ol>
|
135 |
<li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
|
136 |
+
<li> <b>Input lyrics</b>:
|
137 |
+
<ul>
|
138 |
+
<li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
139 |
+
</ul>
|
140 |
+
</li>
|
141 |
+
<li> <b>Input durations</b>:
|
142 |
+
<ul>
|
143 |
+
<li> The length of the duration sequence should <b>be the same as that of the lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
|
144 |
+
<li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
145 |
+
</ul>
|
146 |
+
</li>
|
147 |
+
<li> <b>Input pitches</b>:
|
148 |
+
<ul>
|
149 |
+
<li> The length of the pitch sequence should <b>be the same as that of the lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
|
150 |
+
<li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
|
151 |
+
</ul>
|
152 |
+
</li>
|
153 |
<li> <b>Choose one singer</b> </li>
|
154 |
<li> <b>Click submit button</b> </li>
|
155 |
</ol>
|
|
|
163 |
|
164 |
<p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
|
165 |
<a href="https://github.com/espnet/espnet">espnet GitHub</a> |
|
166 |
+
<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>
|
167 |
|
168 |
<pre>
|
169 |
@inproceedings{wu2024muskits,
|
|
|
180 |
|
181 |
# SP: silence, AP: aspirate.
|
182 |
examples = [
|
183 |
+
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
|
184 |
+
["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
|
185 |
# ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (male)"],
|
186 |
# ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
|
187 |
+
["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
|
188 |
]
|
189 |
|
190 |
app = gr.Interface(
|
191 |
fn=gen_song,
|
192 |
inputs=[
|
193 |
gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
|
194 |
+
gr.Textbox(label="Lyrics"),
|
|
|
195 |
gr.Textbox(label="Duration"),
|
196 |
gr.Textbox(label="Pitch"),
|
197 |
gr.Radio(
|
198 |
label="Singer",
|
199 |
choices=[
|
200 |
+
"singer1 (male)",
|
201 |
+
"singer2 (female)",
|
202 |
+
"singer3 (male)",
|
203 |
+
"singer4 (female)",
|
204 |
+
"singer4 (male)",
|
205 |
+
"singer6 (female)",
|
206 |
+
"singer7 (male)",
|
207 |
+
"singer8 (female)",
|
208 |
+
"singer9 (male)",
|
209 |
+
"singer10 (female)",
|
210 |
],
|
211 |
+
value="singer1 (male)",
|
212 |
),
|
213 |
],
|
214 |
outputs=[
|