Spaces:

TangRain
/

muskits-espnet-svs-demo

Running

App Files Community

TangRain committed on Oct 24, 2024

Commit

903962c

1 Parent(s): a4611df

update app.py

Browse files

Files changed (1) hide show

app.py +53 -19

app.py CHANGED Viewed

@@ -11,7 +11,16 @@ from espnet2.bin.svs_inference import SingingGenerate
 singer_embeddings = {
-    "singer1 (female)": "resource/singer/singer_embedding_opencpop.npy",
 }
 langs = {
@@ -19,12 +28,13 @@ langs = {
     "jp": 1,
 }
-def gen_song(lang, tempo, texts, durs, pitchs, spk):
     fs = 44100
-    # PRETRAIN_MODEL = "espnet/aceopencpop_svs_visinger2_40singer_pretrain"
     # pretrain_downloaded = {
-    #     "train_config": "/data7/tyx/espnet/egs2/mixed/svs1/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix_all/config.yaml",
-    #     "model_file": "/data7/tyx/espnet/egs2/mixed/svs1/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix_all/500epoch.pth",
     # }
     if texts is None:
         return (fs, np.array([0.0])), "Error: No Text provided!"
@@ -90,14 +100,14 @@ def gen_song(lang, tempo, texts, durs, pitchs, spk):
         ),
         "text": phns_str,
     }
-    print(batch)
-    return (fs, np.array([0.0])), "success!"
     # Infer
     device = "cpu"
     # device = "cuda" if torch.cuda.is_available() else "cpu"
-    # d = ModelDownloader()
-    # pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL)
     svs = SingingGenerate(
         train_config = pretrain_downloaded["train_config"],
         model_file = pretrain_downloaded["model_file"],
@@ -118,13 +128,28 @@ description = """
 <div style="font-size: 20px;">
   <p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
   <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
-  Music score contains information about tempo (singing speed), lyrics, as well as duration and pitch of each word in lyrics.</p>
   <p>How to use:</p>
   <ol>
     <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
-    <li> <b>Input tempo</b>: tempo is a positive integer, typically ranging between 60 and 180, representing the number of beats per minute. </li>
-    <li> <b>Input lyrics, duration, pitch in equal length</b>: lyrics should match the language setting. Each word of input should be split with " " (blankspace) or "\\n" (newline) without quotes. Examples are provided.</li>
     <li> <b>Choose one singer</b> </li>
     <li> <b>Click submit button</b> </li>
   </ol>
@@ -138,7 +163,7 @@ article = """
 <p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
 <a href="https://github.com/espnet/espnet">espnet GitHub</a> |
-<a href="https://huggingface.co/espnet/aceopencpop_svs_visinger2_40singer_pretrain">pretrained model</a></p>
 <pre>
 @inproceedings{wu2024muskits,
@@ -155,26 +180,35 @@ article = """
 # SP: silence, AP: aspirate.
 examples = [
-    ["zh", 89, "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21 0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0 58 58 0 58 58 63 0", "singer1 (female)"],
     # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (male)"],
     # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
-    ["jp", 152, "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer1 (female)"],
 ]
 app = gr.Interface(
     fn=gen_song,
     inputs=[
         gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
-        gr.Textbox(label="Tempo"),
-        gr.Textbox(label="Text"),
         gr.Textbox(label="Duration"),
         gr.Textbox(label="Pitch"),
         gr.Radio(
             label="Singer",
             choices=[
-                "singer1 (female)",
             ],
-            value="singer1 (female)"
         ),
     ],
     outputs=[

 singer_embeddings = {
+    "singer1 (male)": "resource/singer/singer_embedding_ace-1.npy",
+    "singer2 (female)": "resource/singer/singer_embedding_ace-2.npy",
+    "singer3 (male)": "resource/singer/singer_embedding_ace-3.npy",
+    "singer4 (female)": "resource/singer/singer_embedding_ace-8.npy",
+    "singer4 (male)": "resource/singer/singer_embedding_ace-7.npy",
+    "singer6 (female)": "resource/singer/singer_embedding_itako.npy",
+    "singer7 (male)": "resource/singer/singer_embedding_ofuton.npy",
+    "singer8 (female)": "resource/singer/singer_embedding_kising_orange.npy",
+    "singer9 (male)": "resource/singer/singer_embedding_m4singer_Tenor-1.npy",
+    "singer10 (female)": "resource/singer/singer_embedding_m4singer_Alto-4.npy",
 }
 langs = {
     "jp": 1,
 }
+def gen_song(lang, texts, durs, pitchs, spk):
     fs = 44100
+    tempo = 120
+    PRETRAIN_MODEL = "TangRain/mixdata_svs_visinger2_spkembed_lang_pretrained"
     # pretrain_downloaded = {
+    #     "train_config": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/config.yaml",
+    #     "model_file": "/data7/tyx/pretrained_model/mixdata_svs_visinger2_spkembed_lang_pretrained/exp/svs_train_visinger2_spk_embed_lang_raw_phn_None_mix/500epoch.pth",
     # }
     if texts is None:
         return (fs, np.array([0.0])), "Error: No Text provided!"
         ),
         "text": phns_str,
     }
+    # print(batch)
+    # return (fs, np.array([0.0])), "success!"
     # Infer
     device = "cpu"
     # device = "cuda" if torch.cuda.is_available() else "cpu"
+    d = ModelDownloader()
+    pretrain_downloaded = d.download_and_unpack(PRETRAIN_MODEL)
     svs = SingingGenerate(
         train_config = pretrain_downloaded["train_config"],
         model_file = pretrain_downloaded["model_file"],
 <div style="font-size: 20px;">
   <p>This is the demo page of our toolkit <b>Muskits-ESPnet: A Comprehensive Toolkit for Singing Voice Synthesis in New Paradigm</b>.</p>
   <p>Singing Voice Synthesis (SVS) takes a music score as input and generates singing vocal with the voice of a specific singer.
+  Music score contains information about lyrics, as well as duration and pitch of each word in lyrics.</p>
   <p>How to use:</p>
   <ol>
     <li> <b>Choose language ID</b>: "zh" indicates lyrics input in Chinese, and "jp" indicates lyrics input in Japanese. </li>
+    <li> <b>Input lyrics</b>:
+        <ul>
+            <li> Lyrics sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
+        </ul>
+    </li>
+    <li> <b>Input durations</b>:
+        <ul>
+            <li> Length of duration sequence should <b>be same as lyric sequence</b>, with each duration corresponding to the respective lyric. </li>
+            <li> Durations sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
+        </ul>
+    </li>
+    <li> <b>Input pitches</b>:
+        <ul>
+            <li> Length of pitch sequence should <b>be same as lyric sequence</b>, with each pitch corresponding to the respective lyric. </li>
+            <li> Pitches sequence should be separated by either a space (' ') or a newline ('\\n'), without the quotation marks. </li>
+        </ul>
+    </li>
     <li> <b>Choose one singer</b> </li>
     <li> <b>Click submit button</b> </li>
   </ol>
 <p>References: <a href="https://arxiv.org/abs/2409.07226">Muskits-ESPnet paper</a> |
 <a href="https://github.com/espnet/espnet">espnet GitHub</a> |
+<a href="https://huggingface.co/espnet/mixdata_svs_visinger2_spkembed_lang_pretrained">pretrained model</a></p>
 <pre>
 @inproceedings{wu2024muskits,
 # SP: silence, AP: aspirate.
 examples = [
+    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "60 62 62 62 0 62 58 0\n58 58 0 58 58 63 0", "singer1 (male)"],
+    ["zh", "雨 淋 湿 了 SP 天 空 AP\n毁 的 SP 很 讲 究 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.5 0.21\n0.3 0.12 0.12 0.25 0.5 0.48 0.34", "C4 D4 D4 D4 rest D4 A#3 rest\nA#3 A#3 rest A#3 A#3 D#4 rest", "singer1 (male)"],
     # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 A#3 rest", "singer2 (male)"],
     # ["zh", 89, "雨 淋 湿 了 SP 天 空 AP", "0.23 0.16 0.36 0.16 0.07 0.28 0.50 0.21", "C4 D4 D4 D4 rest D4 Bb3 rest", "singer3 (female)"],
+    ["jp", "きっ と と べ ば そ ら ま で と ど く AP", "0.39 2.76 0.2 0.2 0.39 0.39 0.2 0.2 0.39 0.2 0.2 0.59 1.08", "64 71 68 69 71 71 69 68 66 68 69 68 0", "singer2 (female)"],
 ]
 app = gr.Interface(
     fn=gen_song,
     inputs=[
         gr.Radio(label="language", choices=["zh", "jp"], value="zh"),
+        gr.Textbox(label="Lyrics"),
         gr.Textbox(label="Duration"),
         gr.Textbox(label="Pitch"),
         gr.Radio(
             label="Singer",
             choices=[
+                "singer1 (male)",
+                "singer2 (female)",
+                "singer3 (male)",
+                "singer4 (female)",
+                "singer4 (male)",
+                "singer6 (female)",
+                "singer7 (male)",
+                "singer8 (female)",
+                "singer9 (male)",
+                "singer10 (female)",
             ],
+            value="singer1 (male)",
         ),
     ],
     outputs=[