TangRain committed
Commit 903962c · 1 Parent(s): a4611df

update app.py

Files changed (1)
  1. app.py +53 -19
app.py CHANGED
@@ -11,7 +11,16 @@ from espnet2.bin.svs_inference import SingingGenerate
 
 
 singer_embeddings = {
-    "singer1 (female)": "resource/singer/singer_embedding_opencpop.npy",
+    "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
+    "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
+    "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
+    "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
+    "singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
+    "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
+    "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
+    "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
+    "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
+    "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
 }
 
 langs = {
@@ -19,12 +28,13 @@ langs = {
     "jp": 1,
 }
 
-def gen_song(lang, tempo, texts, durs, pitchs, spk):
+def gen_song(lang, texts, durs, pitchs, spk):
     fs = 44100
-    # PRETRAIN_MODEL = "espnet/aceopencpop_svs_visinger2_40singer_pretrain"
+    tempo = 120
+    PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"
     # pretrain_downloaded = {
-    #     "train_config": "/data7/tyx/espnet/egs2/mixed/svs1/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix_all/config.yaml",
-    #     "model_file": "/data7/tyx/espnet/egs2/mixed/svs1/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix_all/500epoch.pth",
+    #     "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml",
+    #     "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth",
     # }
     if texts is None:
         return (fs, np.array([0.0])), "Error: No Text provided!"
@@ -90,14 +100,14 @@ def gen_song(lang, tempo, texts, durs, pitchs, spk):
         ),
         "text": phns_str,
     }
-    print(batch)
-    return (fs, np.array([0.0])), "success!"
+    # print(batch)
+    # return (fs, np.array([0.0])), "success!"
 
     # Infer
     device = "cpu"
     # device = "cuda" if torch.cuda.is_available() else "cpu"
-    # d = ModelDownloader()
-    # pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL)
+    d = ModelDownloader()
+    pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL)
     svs = SingingGenerate(
         train_config = pretrain_downloaded["train_config"],
         model_file = pretrain_downloaded["model_file"],
@@ -118,13 +128,28 @@ description = """
 <div style="font-size: 20px;">
   <p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
   <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
-  Music score contains information about tempo (singing speed), lyrics, as well as duration and pitch of each word in lyrics.</p>
+  Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
 
   <p>How to use:</p>
   <ol>
     <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
-    <li> <b>Input tempo</b>: tempo is a positive integer, typically ranging between 60 and 180, representing the number of beats per minute. </li>
-    <li> <b>Input lyrics, duration, pitch in equal length</b>: lyrics should match the language setting. Each word of input should be split with " " (blankspace) or "\\n" (newline) without quotes. Examples are provided.</li>
+    <li> <b>Input lyrics</b>:
+      <ul>
+        <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
+      </ul>
+    </li>
+    <li> <b>Input durations</b>:
+      <ul>
+        <li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
+        <li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
+      </ul>
+    </li>
+    <li> <b>Input pitches</b>:
+      <ul>
+        <li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
+        <li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
+      </ul>
+    </li>
     <li> <b>Choose one singer</b> </li>
     <li> <b>Click submit button</b> </li>
   </ol>
@@ -138,7 +163,7 @@ article = """
 
 <p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
 <a href="https://github.com/espnet/espnet">espnet GitHub</a> |
-<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>
+<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>
 
 <pre>
 @inproceedings{wu2024muskits,
@@ -155,26 +180,35 @@ article = """
 
 # SP: silence, AP: aspirate.
 examples = [
-    ["zh", 89, "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21 0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0 58 58 0 58 58 63 0", "singer1 (female)"],
+    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
+    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
     # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (male)"],
     # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
-    ["jp", 152, "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer1 (female)"],
+    ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
 ]
 
 app = gr.Interface(
     fn=gen_song,
     inputs=[
         gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
-        gr.Textbox(label="Tempo"),
-        gr.Textbox(label="Text"),
+        gr.Textbox(label="Lyrics"),
         gr.Textbox(label="Duration"),
         gr.Textbox(label="Pitch"),
         gr.Radio(
             label="Singer",
            choices=[
-                "singer1 (female)",
+                "singer1 (male)",
+                "singer2 (female)",
+                "singer3 (male)",
+                "singer4 (female)",
+                "singer4 (male)",
+                "singer6 (female)",
+                "singer7 (male)",
+                "singer8 (female)",
+                "singer9 (male)",
+                "singer10 (female)",
            ],
-            value="singer1 (female)"
+            value="singer1 (male)",
        ),
    ],
    outputs=[
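
For context on the interface after this commit (this sketch is not part of app.py): gen_song now takes (lang, texts, durs, pitchs, spk), tempo is fixed at 120 inside the function, and the Lyrics, Duration, and Pitch textboxes each carry a space- or newline-separated sequence of equal length, as the updated description explains. The snippet below uses a hypothetical parse_score helper to illustrate that parsing and length-checking contract with the first bundled example.

# Hypothetical helper (not in app.py): split the three textbox strings the way
# the demo description specifies and verify the sequences are aligned.
def parse_score(texts: str, durs: str, pitchs: str):
    lyrics = texts.split()                    # str.split() handles both ' ' and '\n'
    durations = [float(d) for d in durs.split()]
    pitches = pitchs.split()                  # MIDI numbers ("60") or note names ("C4")
    if not (len(lyrics) == len(durations) == len(pitches)):
        raise ValueError(
            f"lengths differ: {len(lyrics)} lyrics, "
            f"{len(durations)} durations, {len(pitches)} pitches"
        )
    return lyrics, durations, pitches

# First bundled example, matching the new gen_song(lang, texts, durs, pitchs, spk) inputs:
lyrics, durations, pitches = parse_score(
    "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP",
    "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34",
    "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0",
)
print(len(lyrics), len(durations), len(pitches))  # -> 15 15 15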